In [1]:
# re-build linked trips from leg-based table, Yun Ma, 5/1/25

# initial approaches:

# 1. Sort the leg-based trip table by person_id and trip_num.
# 2. Loop by person_id; a linked trip ends at the first leg whose destination purpose is not ‘change mode’;
# 3. Use the loop’s first leg’s origin-purpose, depart-time, o-lat, o-lon as the linked trip’s origin’s attributes;
# 4. Use the loop’s last leg’s dest-purpose, arrive-time, d-lat, d-lon as the linked trip’s destination’s attributes;
# 5. For the mode_type, a mode type priority table based on vta's model was set up; The mode type with the biggest priority among all legs within a loop was picked as the linked trip’s mode type.
# 6. Similarly, a mode_x priority table was set up for mode_1,mode_2,mode_3, and mode_4 based on vta's model. The mode_x with the biggest priority among all legs within a loop was picked as the linked trip’s mode_x.
# 7. Similarly, a transit access/egress priority table was set up. The transit access or egress with the biggest priority among all legs within a loop was picked as the linked trip’s transit access/egress.
# 8. The weight of the linked trip used the average of all legs as a simple approach. Please modify to meet your needs.

# The approach above worked well in most cases, but we did notice two main issues in some cases:

# By using the first non-change-mode as a separator, a linked trip might last more than a day;
# Within a linked trip, the weights are not always the same among all legs: some legs have zero weights, while others have non-zero weights. In this case, we are not comfortable using the average of all leg weights as the linked-trip weight.

# By further investigation, we noticed that these issues happened because their dest-purpose were not imputed correctly: They were imputed as “change-mode”, which was probably a mistake as their reported destination purpose did have a real purpose other than change mode. It also explained why these linked trips last for days, as a real purpose to stop the loop was missed.
# Eventually, we modified the approach in step 2: Loop by person_id, and when the first destination purpose is not ‘change mode’, or the travel day (day_num) has changed, then a linked trip is separated, and the dest_purpose was updated from the original “change-mode” with the real reported purpose.

# With this fix (forcing trips to end within a day) and this cleanup (using the reported destination purpose to correct a wrongly imputed one), no linked trip crosses days anymore, and most weights are consistent within a linked trip.

In [2]:
import os
import pandas as pd
import numpy as np

import warnings
# silence FutureWarning messages for the rest of the notebook
warnings.filterwarnings(action='ignore', category=FutureWarning)

# switch off pandas' chained-assignment (SettingWithCopy) warning for the whole session
pd.options.mode.chained_assignment = None

In [3]:
# record the interpreter and pandas versions this notebook was run with
! python --version
print(pd.__version__)    # 1.2.5

Python 3.8.10
1.2.5


In [4]:
def exist(var,df):
    """Print 'exist' when `var` is a column label of `df`, otherwise 'Not exist'."""
    message = 'exist' if var in df.columns else "Not exist"
    print(message)

def col_contain(str,df):
    """Return the column labels of `df` that contain the pattern `str`.

    Matching is delegated to pandas' `Index.str.contains`, so `str` is
    interpreted the way that method interprets its pattern argument.
    """
    labels = df.columns
    return labels[labels.str.contains(str)]

def remove_list(old_list,remove):
    """Return a new list holding the elements of `old_list` that are not in `remove` (order kept)."""
    kept = []
    for element in old_list:
        if element in remove:
            continue
        kept.append(element)
    return kept

def remove_subs(s,sub):
    """Return `s` with every occurrence of `sub` replaced by the empty string."""
    nothing = ''
    return s.replace(sub, nothing)

In [5]:
# 0.1 read correspondence (lookup) tables

# Paths are assembled with os.path.join so the notebook resolves its inputs on
# any operating system; on Windows the resulting strings are the same
# backslash-separated relative paths that were hard-coded before.
CORRES_DIR = os.path.join('..', 'corres', 'trip')
DATASET_DIR = os.path.join('..', 'dataset', 'WeightedDataset_02212025')


def read_lookup(file_name):
    """Read one correspondence CSV located in CORRES_DIR and return it as a DataFrame."""
    return pd.read_csv(os.path.join(CORRES_DIR, file_name))


lookup_mode_type = read_lookup('lookup_mode_type.csv')
lookup_mode1 = read_lookup('lookup_mode_1.csv')
lookup_mode2 = read_lookup('lookup_mode_2.csv')
lookup_mode3 = read_lookup('lookup_mode_3.csv')
lookup_mode4 = read_lookup('lookup_mode_4.csv')

lookup_o_purpose_reported = read_lookup('lookup_o_purpose_reported.csv')
lookup_d_purpose_reported = read_lookup('lookup_d_purpose_reported.csv')

lookup_o_purpose = read_lookup('lookup_o_purpose.csv')
lookup_d_purpose = read_lookup('lookup_d_purpose.csv')

lookup_o_purpose_category = read_lookup('lookup_o_purpose_category.csv')
lookup_d_purpose_category = read_lookup('lookup_d_purpose_category.csv')

lookup_transit_access = read_lookup('lookup_transit_access.csv')
lookup_transit_egress = read_lookup('lookup_transit_egress.csv')

lookup_link_transit_access = read_lookup('lookup_link_transit_access.csv')
lookup_link_transit_egress = read_lookup('lookup_link_transit_egress.csv')

lookup_modex_vta = read_lookup('lookup_mode_x_vta.csv')

# 0.2 read raw dataset from latest delivery

trip = pd.read_csv(os.path.join(DATASET_DIR, 'trip.csv'))               # 365830, 107

In [6]:
# 1. merge/add new attributes

def add_lookup(frame, lookup, columns):
    """Left-join `lookup[columns]` onto `frame` and return the merged frame.

    The first entry of `columns` is the join key; the remaining entries are the
    attributes to attach. Naming the key explicitly keeps the join from silently
    switching to a multi-column key if `frame` ever gains a column that is also
    in `columns`.

    Raises
    ------
    ValueError
        If the merge changes the number of rows, i.e. the lookup holds more than
        one row for a key that occurs in `frame`.
    """
    key = columns[0]
    merged = frame.merge(lookup[columns], how='left', on=key)
    if len(merged) != len(frame):
        raise ValueError(
            "lookup on '%s' changed the row count from %d to %d; "
            "check the lookup table for duplicate keys" % (key, len(frame), len(merged)))
    return merged


# 1.1 lookup mode_type
trip = add_lookup(trip, lookup_mode_type, ['mode_type', 'mode_type_priority'])

trip = add_lookup(trip, lookup_mode1, ['mode_1','mode_1_des','mode_1_vta','mode_1_vta_des'])
trip = add_lookup(trip, lookup_mode2, ['mode_2','mode_2_des','mode_2_vta','mode_2_vta_des'])
trip = add_lookup(trip, lookup_mode3, ['mode_3','mode_3_des','mode_3_vta','mode_3_vta_des'])
trip = add_lookup(trip, lookup_mode4, ['mode_4','mode_4_des','mode_4_vta','mode_4_vta_des'])

# 1.2 lookup access/egress
trip = add_lookup(trip, lookup_transit_access,
                  ['transit_access','transit_access_des','transit_access_vta','transit_access_vta_des'])
trip = add_lookup(trip, lookup_transit_egress,
                  ['transit_egress','transit_egress_des','transit_egress_vta','transit_egress_vta_des'])

# 1.3 lookup purpose
trip = add_lookup(trip, lookup_d_purpose, ['d_purpose','d_purpose_des'])
trip = add_lookup(trip, lookup_d_purpose_category, ['d_purpose_category','d_purpose_category_des'])
trip = add_lookup(trip, lookup_d_purpose_reported,
                  ['d_purpose_reported','d_purpose_reported_des',
                   'd_purpose_reported_priority','d_purpose_reported_priority_des'])

trip = add_lookup(trip, lookup_o_purpose, ['o_purpose','o_purpose_des'])
trip = add_lookup(trip, lookup_o_purpose_category, ['o_purpose_category','o_purpose_category_des'])
trip = add_lookup(trip, lookup_o_purpose_reported,
                  ['o_purpose_reported','o_purpose_reported_des',
                   'o_purpose_reported_priority','o_purpose_reported_priority_des'])

In [7]:
# 1.4 combine the separate date / hour / minute / second columns into timestamps

def _combine_datetime(frame, date_col, hour_col, minute_col, second_col):
    """Join the four columns as 'date hour:minute:second' text and parse it to datetimes."""
    stamp_text = (frame[date_col].astype(str) + ' ' +
                  frame[hour_col].astype(str) + ':' +
                  frame[minute_col].astype(str) + ':' +
                  frame[second_col].astype(str))
    return pd.to_datetime(stamp_text, format='%Y-%m-%d %H:%M:%S')


# the departure seconds column is named 'depart_seconds', the arrival one 'arrive_second'
trip['depart_datetime'] = _combine_datetime(trip, 'depart_date', 'depart_hour',
                                            'depart_minute', 'depart_seconds')
trip['arrive_datetime'] = _combine_datetime(trip, 'arrive_date', 'arrive_hour',
                                            'arrive_minute', 'arrive_second')

trip.shape

(365830, 138)

In [8]:
# 2. split trips into 2 parts to save linkage processing time

# a leg touches a transfer when its origin or destination purpose category is 11
touches_transfer = (trip['o_purpose_category'] == 11) | (trip['d_purpose_category'] == 11)

# legs without any transfer purpose: 306147, each is an individual trip with a real purpose
trip_s0 = trip[~touches_transfer]
print(trip_s0.shape)

# legs with a transfer purpose: 59683, these need to be linked
trip_s1 = trip[touches_transfer]
print(trip_s1.shape)

(306147, 138)
(59683, 138)


In [9]:
# 3. process trip_s1: 59683 transfer legs need to be linked

In [10]:
# 3.0 define the minimal set of key variables used in the linkage process

# identifiers
C_id = ['hh_id','person_id','trip_id','person_num','trip_num','day_num']

# attribute groups copied / aggregated from the legs
C_purp = ['o_purpose_category','d_purpose_category',
          'o_purpose_reported_priority','d_purpose_reported_priority',
          'o_purpose','d_purpose']
C_time = ['depart_datetime','arrive_datetime']
C_mode = ['mode_type_priority','mode_1_vta','mode_2_vta','mode_3_vta','mode_4_vta']
C_access = ['transit_access_vta','transit_egress_vta']

C_wt =   ['trip_weight','trip_weight_rmove_only']

C_od =   ['o_lat','o_lon','d_lat','d_lon','o_county', 'd_county']
C_oth =  ['distance_miles','duration_minutes','dwell_mins']

# every leg attribute that gets a link_* counterpart
C_key = [*C_purp, *C_time, *C_mode, *C_access, *C_od, *C_wt, *C_oth]
C_key_link = [f"link_{key}" for key in C_key]

# bookkeeping columns written by the linking loop
C_flag = ['link_day','link_num','leg_num','leg_delete']

# attributes taken from the last leg of a link
C_last = ['d_purpose_category','d_purpose','arrive_datetime','d_lat','d_lon','d_county']
C_last_link = [f"link_{key}" for key in C_last]                                   # will replace C_key_link

# attributes accumulated over the legs of a link
C_other = [*C_wt, *C_oth]
C_other_sum = [f"sum_{key}" for key in C_other]

# leg column name -> derived column name
dictKey = {key: f"link_{key}" for key in C_key}
dictLast = {key: f"link_{key}" for key in C_last}
dictSum = {key: f"sum_{key}" for key in C_other}

In [11]:
# 3.1 fill missing values with 0 and sort the legs (the linking loop requires sorted legs!)

sort_keys = ['hh_id','person_id','trip_num']
trip_s1 = trip_s1.fillna(0).sort_values(by=sort_keys)

In [12]:
# 3.2 merge transfer legs into linked trips; a link never continues past a change of day_num
#
# trip_s1 must already be sorted by person and trip order (step 3.1).
# Row i is the first leg of a link and collects all link_* attributes; the
# following legs of the same link (rows i+1, i+2, ...) are flagged leg_delete = 1.

# initialise the link_* result columns and the bookkeeping flags
for item in C_key_link:
    trip_s1.loc[:,item] = 0.0

for item in C_flag:
    trip_s1.loc[:,item] = 0

# resolve the column positions once instead of calling get_loc on every cell access
col_pos = {name: trip_s1.columns.get_loc(name)
           for name in ['person_id', 'day_num'] + C_key + C_key_link + C_flag}

# attributes for which the smallest value among all legs of a link is kept
# NOTE(review): step 3.1 fills missing values with 0, so a leg with a missing code
# contributes 0 and always wins the '<' comparison - confirm 0 is meant to rank first.
C_priority = C_mode + C_access + ['o_purpose_reported_priority', 'd_purpose_reported_priority']

n_rows = len(trip_s1)
i = 0
link_pid = 0
link_day = 0
link_num = 0

# loop over the first leg of every link
while i < n_rows:

    person_id = trip_s1.iloc[i, col_pos['person_id']]
    day_num = trip_s1.iloc[i, col_pos['day_num']]

    if person_id == link_pid:
        link_num += 1
    else:
        link_num = 1
        link_pid = person_id
        # NOTE(review): link_day is refreshed only when the person changes, so all
        # links of a person carry the day_num of that person's first row. Kept as in
        # the original; move this assignment out of the else-branch if link_day
        # should follow each link's own day_num.
        link_day = day_num

    # start every link (single-leg trips included) from the first leg's own attributes
    for item in dictKey:
        trip_s1.iloc[i, col_pos[dictKey[item]]] = trip_s1.iloc[i, col_pos[item]]
    trip_s1.iloc[i, col_pos['link_num']] = link_num
    trip_s1.iloc[i, col_pos['link_day']] = link_day

    # destination is not a transfer (category 11): this leg is a complete trip
    if trip_s1.iloc[i, col_pos['d_purpose_category']] != 11:
        i += 1
        continue

    j = 1
    # running totals over the legs of this link (a dict replaces exec-built sum_* variables)
    link_sum = {item: trip_s1.iloc[i, col_pos[item]] for item in C_other}

    # The bounds check comes first: without it the loop reads past the last row
    # when the final leg of the table still ends in a transfer.
    while ((i + j) < n_rows
           and trip_s1.iloc[i + j, col_pos['person_id']] == person_id
           and trip_s1.iloc[i + j, col_pos['day_num']] == day_num):

        leg = i + j

        # use the latest leg's 'd_purpose_category','d_purpose'... as the link's destination
        for item in dictLast:
            trip_s1.iloc[i, col_pos[dictLast[item]]] = trip_s1.iloc[leg, col_pos[item]]

        # mode type, mode_x, transit access/egress and reported-purpose priorities:
        # keep the smallest value seen so far. Each comparison is made against the
        # running link_* value, so an earlier, better leg is never overwritten.
        # link_o/d_purpose_reported_priority later supplies the real purpose when
        # link_o/d_purpose_category == 11.
        for item in C_priority:
            leg_value = trip_s1.iloc[leg, col_pos[item]]
            if leg_value < trip_s1.iloc[i, col_pos['link_' + item]]:
                trip_s1.iloc[i, col_pos['link_' + item]] = leg_value

        # accumulate weights, distance, duration and dwell time
        for item in C_other:
            link_sum[item] = link_sum[item] + trip_s1.iloc[leg, col_pos[item]]

        # weights: average over the j+1 legs seen so far; other measures: running sum
        for item in C_wt:
            trip_s1.iloc[i, col_pos['link_' + item]] = link_sum[item] / (j + 1)
        for item in C_oth:
            trip_s1.iloc[i, col_pos['link_' + item]] = link_sum[item]

        # add flags
        trip_s1.iloc[leg, col_pos['leg_delete']] = 1
        trip_s1.iloc[leg, col_pos['link_num']] = link_num
        trip_s1.iloc[leg, col_pos['leg_num']] = j
        trip_s1.iloc[leg, col_pos['link_day']] = link_day

        # a leg whose destination is a real purpose closes the link
        closes_link = trip_s1.iloc[leg, col_pos['d_purpose_category']] != 11
        j += 1
        if closes_link:
            break

    # continue with the first row after this link
    i = i + j



In [13]:
# 3.3.1 translate link_mode_type_priority into link_mode_type / link_mode_type_des
mode_type_cols = ['link_mode_type_priority','link_mode_type','link_mode_type_des']
trip_s1_m = trip_s1.merge(lookup_mode_type[mode_type_cols], how='left')

# 3.3.2 attach the descriptions of the linked transit access / egress codes
access_cols = ['link_transit_access_vta','link_transit_access_vta_des']
egress_cols = ['link_transit_egress_vta','link_transit_egress_vta_des']
trip_s1_m = (trip_s1_m
             .merge(lookup_link_transit_access[access_cols], how='left')
             .merge(lookup_link_transit_egress[egress_cols], how='left'))

# 3.3.3 modify link_x_purpose, link_x_purpose_category
# if link_x_purpose_category == 11, use link_x_purpose_new based on link_reported_priority as new real purpose
# 3.3.3.1 bring in the *_new replacement values, origin side first, then destination side
for side, reported_lookup in (('o', lookup_o_purpose_reported),
                              ('d', lookup_d_purpose_reported)):
    reported_cols = ['link_' + side + '_purpose_reported_priority',
                     'link_' + side + '_purpose_category_new',
                     'link_' + side + '_purpose_new',
                     'link_' + side + '_purpose_reported']
    trip_s1_m = trip_s1_m.merge(reported_lookup[reported_cols], how='left')

# 3.3.3.2 where a link still carries the transfer code (category 11 / purpose 60),
#         replace it with the matching *_new value
for side in ('o', 'd'):
    for column, transfer_code in (('link_' + side + '_purpose_category', 11),
                                  ('link_' + side + '_purpose', 60)):
        still_transfer = trip_s1_m[column] == transfer_code
        trip_s1_m[column] = np.where(still_transfer,
                                     trip_s1_m[column + '_new'],
                                     trip_s1_m[column])

In [14]:
# 4.0 define the columns kept in the outputs

# purpose columns, origin and destination side
C_o_purp = ['o_purpose_category','o_purpose','o_purpose_reported']
C_d_purp = ['d_purpose_category','d_purpose','d_purpose_reported']
C_od_purp = [*C_o_purp, *C_d_purp]

# mode columns
C_mode_type = ['mode_type']
C_mode_x = ['mode_1','mode_2','mode_3','mode_4']
C_mode_x_vta = [mode + '_vta' for mode in C_mode_x]
C_mode_all = [*C_mode_type, *C_mode_x_vta]

C_access_egress = ['transit_access_vta','transit_egress_vta']

# leg-level names, their link_* counterparts, and the mapping back from link_* to leg-level
C_filter_unlink = [*C_time, *C_od_purp, *C_mode_all, *C_access_egress, *C_wt, *C_od, *C_oth]
C_filter_link = [f"link_{key}" for key in C_filter_unlink]
C_filter_link_unlink = {f"link_{key}": key for key in C_filter_unlink}

# final column selection: ids, link_* attributes, flags
C_filter = [*C_id, *C_filter_link, *C_flag]

In [15]:
# 4.1 output linked trip_s1 with all legs and flags, reduced to the C_filter columns (36)

# os.path.join keeps the output location valid on every operating system;
# on Windows it yields the same backslash-separated relative path as before
out_all_legs = os.path.join('..', 'output', '0501',
                            'Combine_from_legs_to_links_with_all_legs_n_flags_filter_59683x36.csv')

trip_s1 = trip_s1_m.filter(items=C_filter)
trip_s1.to_csv(out_all_legs, index=False)
trip_s1.shape

(59683, 36)

In [16]:
# 4.2 create distinct trip_s1_linked by dropping the legs flagged leg_delete == 1

# os.path.join keeps the output location valid on every operating system.
# NOTE(review): the '17735x37' in the file name does not match the shape printed
# below this cell; the name is kept unchanged so existing consumers still find the file.
out_linked = os.path.join('..', 'output', '0501',
                          'linked_trips_only_built_from_legs_17735x37.csv')

# take an explicit copy so the new column is added to an independent frame
# rather than to a slice of trip_s1
trip_s1_linked = trip_s1[trip_s1['leg_delete'] == 0].copy()
trip_s1_linked.loc[:,'link_flag'] = "Y"
trip_s1_linked.to_csv(out_linked, index=False)
trip_s1_linked.shape

(17958, 37)

In [17]:
# 4.3 rename the link_* columns back to the leg-level column names

# os.path.join keeps the output location valid on every operating system
out_linked_rename = os.path.join('..', 'output', '0501',
                                 'linked_trips_only_built_from_legs_rename_17735x37.csv')

# rename works on a copy, trip_s1_linked keeps its link_* column names
trip_s1_linked_rename = trip_s1_linked.copy()
trip_s1_linked_rename = trip_s1_linked_rename.rename(columns=C_filter_link_unlink)

trip_s1_linked_rename.to_csv(out_linked_rename, index=False)
trip_s1_linked_rename.shape

(17958, 37)

In [18]:
# 5.0 reduce trip_s0 (trips that needed no linking) to the columns of the linked table

# keep only the columns that also exist in the renamed linked trips; names that
# trip_s0 does not have are ignored by filter
C_s1_columns = list(trip_s1_linked_rename.columns)

# NOTE(review): link_day is not assigned here, so it will be missing for these
# rows after the concat in 5.1 unless trip_s0 already has such a column - confirm.
trip_s0_filter = trip_s0.filter(items=C_s1_columns).assign(
    link_flag="N",        # not built from several legs
    link_num=0,
    leg_num=999,
    leg_delete="NA",
)

trip_s0_filter.shape

(306147, 36)

In [19]:
# 5.1 append the linked trips (trip_s1) and the individual trips (trip_s0)

# os.path.join keeps the output location valid on every operating system
out_final = os.path.join('..', 'output', '0501',
                         'final_output_with_all_linked_n_individual_trips_324105x38.csv')

trip_s01 = pd.concat([trip_s1_linked_rename, trip_s0_filter])
trip_s01.to_csv(out_final, index=False)
trip_s01.shape

(324105, 37)

In [28]:
### end of trip-linkage ###