In [1]:
import pandas as pd
from shapely.geometry import Point
import geopandas as gpd

# Add grid IDs and merge NaN rows

In [2]:
# Load the raw travel chain; every later cell works on this frame.
df = pd.read_csv(filepath_or_buffer='../data/one_travel_chain_origin.csv')

In [3]:
# Build one shapely point per trip origin / destination from the
# (lambda, phi) coordinate columns. The columns stay plain object columns on df.
df['geometry_o'] = list(map(Point, zip(df['lambda_o'], df['phi_o'])))
df['geometry_d'] = list(map(Point, zip(df['lambda_d'], df['phi_d'])))

# Grid cells that the trip endpoints are matched against.
grid_gdf = gpd.read_file("../data/shenzhen_grid/shenzhen_grid.shp")

# One GeoDataFrame per endpoint, each tagged with the grid's CRS.
# NOTE(review): set_crs only labels the points with that CRS, it does not
# reproject them -- assumes lambda/phi are already expressed in the grid's
# CRS; TODO confirm.
geo_df_o = gpd.GeoDataFrame(df, geometry='geometry_o').set_crs(grid_gdf.crs)
geo_df_d = gpd.GeoDataFrame(df, geometry='geometry_d').set_crs(grid_gdf.crs)

# Left join keeps every trip; endpoints that are within no grid cell get NaN.
result_o = geo_df_o.sjoin(grid_gdf, how="left", predicate="within")
result_d = geo_df_d.sjoin(grid_gdf, how="left", predicate="within")

# Copy the matched grid id ('fnid') back onto the trip table, aligned by index.
df['grid_id_o'] = result_o['fnid']
df['grid_id_d'] = result_d['fnid']

  arr = construct_1d_object_array_from_listlike(values)
  arr = construct_1d_object_array_from_listlike(values)


## Merging trajectory breakpoints caused by leaving the city

In [4]:
# Finding rows with at least one NaN value, out of city
columns_to_check = ["lambda_o","phi_o","lambda_d","phi_d","grid_id_o","grid_id_d"]
rows_with_nan = df[columns_to_check].isna().any(axis=1)
nan_indexes = df[rows_with_nan].index
nan_indexes

Index([  51,   52,   77,   78,   96,   97,  154,  155,  168,  169,  515,  516,
        798,  799,  805,  806, 1017, 1018, 1171, 1172, 1312, 1313, 1314, 1315],
      dtype='int64')

In [5]:
# Merge each pair of adjacent NaN rows into a single trip.
# The rows in nan_indexes are consumed two at a time: the first row of a pair
# is kept and the second is folded into it, then dropped.
first_rows = nan_indexes[::2]
second_rows = nan_indexes[1::2]

# The merge below is only meaningful when the NaN rows really come in adjacent
# (i, i + 1) pairs; fail loudly instead of silently merging unrelated rows.
if len(first_rows) != len(second_rows) or any(
    second != first + 1 for first, second in zip(first_rows, second_rows)
):
    raise ValueError(
        "Rows with NaN values are expected to come in adjacent (i, i + 1) pairs; "
        f"got index labels {list(nan_indexes)}"
    )

for first, second in zip(first_rows, second_rows):
    # The merged trip ends when the second half ends.
    df.at[first, 'etime'] = df.at[second, 'etime']
    # Keep the first row's values and fill its gaps from the second row.
    # Label-based .loc is used throughout so it agrees with .at and .drop
    # (mixing in positional .iloc is only correct on a default RangeIndex).
    df.loc[first] = df.loc[first].combine_first(df.loc[second])

# The second half of every pair is now redundant.
df.drop(second_rows, inplace=True)

In [6]:
# Replace every remaining missing value with 0, then renumber the rows 0..n-1
# so that positions and labels coincide again after the drop above.
df = df.fillna(0).reset_index(drop=True)

## Merging trajectory breakpoints caused by midnight

In [7]:
# A trip chain is continuous when each row starts where the previous one ended.
# Collect the position of every row whose origin longitude differs from the
# previous row's destination longitude.
previous_dest = df['lambda_d'].iloc[:-1]
next_origin = df['lambda_o'].iloc[1:]

discontinuous_rows = [
    pos + 1  # position of the *following* row
    for pos, (dest, origin) in enumerate(zip(previous_dest, next_origin))
    if dest != origin
]
print(discontinuous_rows)

[69, 113, 216, 338, 465, 783, 980, 1143, 1163, 1303, 1304, 1317, 1486]


In [8]:
# For every discontinuous row, overwrite its origin-side fields with the
# destination-side fields of the row directly before it.
# Mapping: column written on the row -> column read from the previous row.
origin_from_previous_dest = {
    'poi_o': 'poi_d',
    'org_chess_x': 'dst_chess_x',
    'org_chess_y': 'dst_chess_y',
    'lambda_o': 'lambda_d',
    'phi_o': 'phi_d',
    'pre_chess_x': 'post_chess_x',
    'pre_chess_y': 'post_chess_y',
    'grid_id_o': 'grid_id_d',
}

for idx in discontinuous_rows:
    for target_col, source_col in origin_from_previous_dest.items():
        df.loc[idx, target_col] = df.loc[idx - 1, source_col]

In [9]:
# Sanity check after the patch: list every row whose origin grid id still
# differs from the previous row's destination grid id.
previous_dest_grid = df['grid_id_d'].iloc[:-1]
next_origin_grid = df['grid_id_o'].iloc[1:]

discontinuous_rows = [
    pos + 1  # position of the *following* row
    for pos, (dest, origin) in enumerate(zip(previous_dest_grid, next_origin_grid))
    if dest != origin
]
print(discontinuous_rows)

[]


# Drop columns and save to feature CSV

In [10]:
# Columns that are not carried over into the saved travel chain.
columns_to_drop = [
    # point-of-interest labels
    'poi_o', 'poi_d',
    # chess-board coordinates
    'org_chess_x', 'org_chess_y',
    'dst_chess_x', 'dst_chess_y',
    'pre_chess_x', 'pre_chess_y',
    'post_chess_x', 'post_chess_y',
    # shapely points built for the spatial join
    'geometry_o', 'geometry_d',
    # distance columns
    'home_distance', 'trip_distance',
]

df = df.drop(labels=columns_to_drop, axis='columns')

In [11]:
# Persist the cleaned travel chain without the row index.
df.to_csv(path_or_buf='../data/one_travel_chain.csv', index=False)

In [12]:
# Split the trips around the 'migrt' date of each row.
# Both comparisons are strict, so rows whose 'date' equals 'migrt' end up in
# neither subset.
is_before_migrt = df['date'] < df['migrt']
is_after_migrt = df['date'] > df['migrt']

df_before_migrt = df.loc[is_before_migrt]
df_after_migrt = df.loc[is_after_migrt]

print("DataFrame with 'date' before 'migrt':")
print(len(df_before_migrt))
print("DataFrame with 'date' after 'migrt':")
print(len(df_after_migrt))

DataFrame with 'date' before 'migrt':
627
DataFrame with 'date' after 'migrt':
1043


In [13]:
# Write the two subsets next to the full travel chain, without the row index.
for subset, csv_path in (
    (df_before_migrt, '../data/one_travel_before.csv'),
    (df_after_migrt, '../data/one_travel_after.csv'),
):
    subset.to_csv(csv_path, index=False)

In [27]:
def mergeFeatureDataframes(df):
    """Build one feature row per grid cell from a trip table.

    Two per-grid tables are extracted from ``df`` and outer-merged on the
    grid id (returned as ``fnid``):

    * destination side: first row seen for each ``grid_id_d``, carrying
      ``pre_home_distance``, ``post_home_distance`` and the ``LU_*`` columns;
    * origin side: first row seen for each ``grid_id_o``, carrying only the
      ``LU_*`` columns.

    Missing values produced by the outer merge are replaced with 0, and each
    ``LU_*`` column of the result is the sum of its destination-side and
    origin-side value.

    NOTE(review): a grid that occurs both as origin and as destination gets
    the two ``LU_*`` values added together -- confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``grid_id_o``, ``grid_id_d``, ``pre_home_distance``,
        ``post_home_distance`` and all ``LU_*`` columns listed below.

    Returns
    -------
    pandas.DataFrame
        Columns ``fnid``, ``pre_home_distance``, ``post_home_distance``
        followed by the ``LU_*`` columns; contains no NaN.
    """
    land_use_columns = [
        'LU_Business', 'LU_City_Road', 'LU_Consumption', 'LU_Culture',
        'LU_Industry', 'LU_Medical', 'LU_Park_&_Scenery', 'LU_Public',
        'LU_Residence', 'LU_Science_&_Education', 'LU_Special',
        'LU_Transportation', 'LU_Wild',
    ]
    dest_columns = ['grid_id_d', 'pre_home_distance', 'post_home_distance'] + land_use_columns
    origin_columns = ['grid_id_o'] + land_use_columns

    # First occurrence of every destination / origin grid, keyed by 'fnid'.
    dest_features = (
        df[dest_columns]
        .drop_duplicates(subset=['grid_id_d'])
        .rename(columns={'grid_id_d': 'fnid'})
    )
    origin_features = (
        df[origin_columns]
        .drop_duplicates(subset=['grid_id_o'])
        .rename(columns={'grid_id_o': 'fnid'})
    )

    # The shared LU_* columns come out of the merge as '<name>_x' (destination
    # side) and '<name>_y' (origin side).
    merged = dest_features.merge(origin_features, on='fnid', how='outer').fillna(0)

    # Collapse each '_x' / '_y' pair into a single summed column.
    for lu_name in land_use_columns:
        merged[lu_name] = merged.pop(lu_name + '_x') + merged.pop(lu_name + '_y')

    return merged.fillna(0)

In [28]:
# Per-grid feature tables for the trips before the migration date, after it,
# and for the whole travel chain.
before_feature = mergeFeatureDataframes(df_before_migrt)
after_feature = mergeFeatureDataframes(df_after_migrt)
all_feature = mergeFeatureDataframes(df)

# Save each table without the row index.
for feature_table, csv_path in (
    (before_feature, '../data/before_migrt_feature.csv'),
    (after_feature, '../data/after_migrt_feature.csv'),
    (all_feature, '../data/all_traj_feature.csv'),
):
    feature_table.to_csv(csv_path, index=False)