In [3]:
import pandas as pd
import numpy as np
import json
from collections import namedtuple

In [18]:
df = pd.read_csv('../data/one_travel_chain.csv')
# 提取所有不同的 grid IDs
unique_grid_ids = pd.unique(df[['grid_id_o', 'grid_id_d']].values.ravel('K'))

# 使用 factorize 方法为所有不同的 grid IDs 分配新的编码
new_codes, unique = pd.factorize(unique_grid_ids, sort=True)

# 创建一个字典来映射旧的 grid IDs 到新的编码
grid_id_mapping = dict(zip(unique, new_codes))

df['grid_new_o'] = df['grid_id_o'].map(grid_id_mapping)
df['grid_new_d'] = df['grid_id_d'].map(grid_id_mapping)
df.head()

Unnamed: 0,who,date,seiqd,mode,poi_o,poi_d,lambda_o,phi_o,lambda_d,phi_d,migrt,home_distance,trip_distance,weekend,grid_id_o,grid_id_d,grid_new_o,grid_new_d
0,77678983,20190101,1,1,2,12,114.101082,22.577668,114.101268,22.556263,20190701,16.03122,9.0,False,20850.0,17637.0,15,152
1,77678983,20190101,2,1,12,1,114.101268,22.556263,114.106437,22.591168,20190701,1.414214,15.132746,False,17637.0,22994.0,152,112
2,77678983,20190102,1,1,2,0,114.101082,22.577668,113.928502,22.521493,20190701,78.790862,75.272837,False,20850.0,11854.0,15,120
3,77678983,20190102,2,1,0,2,113.928502,22.521493,114.101082,22.577668,20190701,7.071068,75.272837,False,11854.0,20850.0,120,15
4,77678983,20190103,1,1,19,0,113.960863,22.545302,113.928502,22.521493,20190701,78.790862,17.029386,False,15794.0,11854.0,129,120


In [11]:
def build_chain(group):
    chains = []
    current_chain = [group['grid_id_o'].iloc[0]]
    
    for i in range(len(group) - 1):
        current_chain.append(group['grid_id_d'].iloc[i])
        # Check if the next 'o' is different from the current 'd'
        if group['grid_id_d'].iloc[i] != group['grid_id_o'].iloc[i + 1]:
            chains.append(current_chain)
            current_chain = [group['grid_id_o'].iloc[i + 1]]
    
    # Append the last destination and the final chain
    current_chain.append(group['grid_id_d'].iloc[-1])
    chains.append(current_chain)
    
    return chains

def build_id_chain(group):
    chains = []
    current_chain = [group['grid_new_o'].iloc[0]]
    
    for i in range(len(group) - 1):
        current_chain.append(group['grid_new_d'].iloc[i])
        # Check if the next 'o' is different from the current 'd'
        if group['grid_new_d'].iloc[i] != group['grid_new_o'].iloc[i + 1]:
            chains.append(current_chain)
            current_chain = [group['grid_new_o'].iloc[i + 1]]
    
    # Append the last destination and the final chain
    current_chain.append(group['grid_new_d'].iloc[-1])
    chains.append(current_chain)
    
    return chains

def build_both_chains(group):
    travel_chain = build_chain(group)
    id_chain = build_id_chain(group)
    return pd.Series({'travel_chain': travel_chain, 'id_chain': id_chain})

In [12]:
# Group by 'date' and apply the chain-building function
grouped1 = df[df['date']<df['migrt']]
grouped2 = df[df['date']>df['migrt']]
grouped1 = grouped1.groupby('date').apply(build_both_chains).reset_index()
grouped2 = grouped2.groupby('date').apply(build_both_chains).reset_index()

In [5]:
# Define the namedtuple type
TravelData = namedtuple('TravelChain', ['date', 'travel_chain','id_chain'])

In [14]:
# Convert each row to a namedtuple
namedtuples_list1 = [TravelData(row.date, row.travel_chain[0], row.id_chain[0]) for _, row in grouped1.iterrows()]
namedtuples_list2 = [TravelData(row.date, row.travel_chain[0], row.id_chain[0]) for _, row in grouped2.iterrows()]
namedtuples_all = namedtuples_list1+namedtuples_list2

In [15]:
def int64_converter(obj):
    if isinstance(obj, np.int64):
        return int(obj)
    raise TypeError

dicts_list1 = [nt._asdict() for nt in namedtuples_list1]
with open('../data/before_migrt.json', 'w') as file:
    json.dump(dicts_list1, file, indent=4,default=int64_converter)

dicts_list2 = [nt._asdict() for nt in namedtuples_list2]
with open('../data/after_migrt.json', 'w') as file:
    json.dump(dicts_list2, file, indent=4,default=int64_converter)

dicts_list_all = [nt._asdict() for nt in namedtuples_all]
with open('../data/all_traj.json', 'w') as file:
    json.dump(dicts_list_all, file, indent=4,default=int64_converter)

In [10]:
with open('../data/all_traj.json', 'r') as file:
    loaded_dicts_all = json.load(file)
loaded_namedtuples_all = [TravelData(**d) for d in loaded_dicts_all]

with open('../data/before_migrt.json', 'r') as file:
    loaded_dicts_all = json.load(file)
loaded_namedtuples_before = [TravelData(**d) for d in loaded_dicts_all]

In [21]:
loaded_namedtuples_all

[TravelChain(date=20190101, travel_chain=[20850.0, 17637.0, 22994.0], id_chain=[15, 152, 112]),
 TravelChain(date=20190102, travel_chain=[20850.0, 11854.0, 20850.0], id_chain=[15, 120, 15]),
 TravelChain(date=20190103, travel_chain=[15794.0, 11854.0, 16557.0, 22994.0], id_chain=[129, 120, 169, 112]),
 TravelChain(date=20190104, travel_chain=[22994.0, 11854.0, 15483.0, 20850.0], id_chain=[112, 120, 39, 15]),
 TravelChain(date=20190106, travel_chain=[22994.0, 15103.0, 16201.0], id_chain=[112, 87, 88]),
 TravelChain(date=20190107, travel_chain=[20850.0, 11854.0, 12966.0, 11854.0, 10472.0], id_chain=[15, 120, 27, 120, 56]),
 TravelChain(date=20190108, travel_chain=[18359.0, 11854.0, 18359.0, 20850.0], id_chain=[122, 120, 122, 15]),
 TravelChain(date=20190109, travel_chain=[22994.0, 11854.0, 16492.0, 11854.0, 14719.0, 18359.0], id_chain=[112, 120, 126, 120, 7, 122]),
 TravelChain(date=20190110, travel_chain=[20850.0, 11854.0, 20850.0], id_chain=[15, 120, 15]),
 TravelChain(date=20190111, tr

# find travel chain fnid

In [19]:
unique_travel_chain_all = set()

for tc in loaded_namedtuples_all:
    unique_travel_chain_all.update(tc.travel_chain)

In [13]:
unique_travel_chain_before = set()

for tc in loaded_namedtuples_before:
    unique_travel_chain_before.update(tc.travel_chain)

In [None]:
df = pd.read_excel("../data/SZ_tfidf.xlsx")

In [17]:

filtered_df = df[df['fnid'].isin(unique_travel_chain_before)]
filtered_df.to_csv('../data/before_migrt_fnid.csv',index=False)


In [20]:
filtered_df = df[df['fnid'].isin(unique_travel_chain_all)]
filtered_df.to_csv('../data/all_traj_fnid.csv',index=False)