"SuperShuttle" and "SFO AirTrain" are recorded as transfer legs in the raw survey data. However, they should be treated as access/egress modes instead: "SuperShuttle" as "TNC/taxi" and "SFO AirTrain" as "Walk". This script modifies the input survey data to remove them as transfer legs and updates the access/egress modes accordingly.

In [1]:
import pandas as pd

#### Golden Gate Transit (2018) survey contains 'SuperShuttle' as a transfer leg

In [2]:
# Load the Golden Gate Transit (2018) on-board survey and report its size.
# NOTE(review): the saved output of this cell carries a truncated warning
# fragment -- presumably a pandas DtypeWarning about mixed-type columns;
# confirm, and consider passing explicit dtypes to read_csv.
ggtransit_in_csv = r'M:\Data\OnBoard\Data and Reports\Golden Gate Transit\2018\As CSV\20180907_OD_GoldenGate_allDays_addCols_NO POUND OR SINGLE QUOTE.csv'
df_ggtransit = pd.read_csv(ggtransit_in_csv)

ggtransit_rows_read = len(df_ggtransit)
ggtransit_ids_read = len(df_ggtransit['id'].unique())
print('read {} rows of Golden Gate Transit survey data with {} unique IDs'.format(
    ggtransit_rows_read, ggtransit_ids_read))

read 4103 rows of Golden Gate Transit survey data with 4103 unique IDs


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Report every transfer-leg column (before and after the surveyed route)
# in which the value 'Supershuttle' occurs at least once.
ggtransit_leg_cols = [
    'final_first_route_before_survey_board_comb',
    'final_second_route_before_survey_board_comb',
    'final_third_route_before_survey_board',
    'final_first_route_after_survey_alight_comb',
    'final_second_route_after_survey_alight_comb',
    'final_third_route_after_survey_alight_comb',
]
for leg_col in ggtransit_leg_cols:
    has_supershuttle = df_ggtransit[leg_col].isin(['Supershuttle']).any()
    if has_supershuttle:
        print(leg_col)

final_first_route_before_survey_board_comb


In [4]:
# Re-code 'Supershuttle' from a "before" transfer leg to a TNC-style access mode.
#
# The original version only copied the second leg into the first slot and
# hard-coded the transfer count to 0. That is right only when the affected
# rows have no other "before" legs (true for the single row in the saved
# output). Here the whole leg chain is shifted left and the count is
# re-derived, so the cell stays correct if the input ever contains a
# Supershuttle leg followed by a real transit leg.
gg_before_leg_cols = ['final_first_route_before_survey_board_comb',
                      'final_second_route_before_survey_board_comb',
                      'final_third_route_before_survey_board']
gg_check_cols = ['id', 'final_access_mode'] + gg_before_leg_cols + ['final_prev_transfers_code']

# check the relevant fields
df_chg_idx = df_ggtransit.final_first_route_before_survey_board_comb == 'Supershuttle'
display(df_ggtransit.loc[df_chg_idx, gg_check_cols].transpose())

# modify access_mode
df_ggtransit.loc[df_chg_idx, 'final_access_mode'] = 'Was dropped off using Uber, Lyft, or similar service'

# drop the Supershuttle leg: shift 2nd -> 1st, 3rd -> 2nd, then blank the 3rd slot
# (order matters: each slot is read before it is overwritten)
df_ggtransit.loc[df_chg_idx, gg_before_leg_cols[0]] = df_ggtransit.loc[df_chg_idx, gg_before_leg_cols[1]]
df_ggtransit.loc[df_chg_idx, gg_before_leg_cols[1]] = df_ggtransit.loc[df_chg_idx, gg_before_leg_cols[2]]
df_ggtransit.loc[df_chg_idx, gg_before_leg_cols[2]] = float('nan')

# re-derive the number of previous transfers from the legs that remain
# NOTE(review): assumes final_prev_transfers_code is the count of listed
# "before" legs -- consistent with the row shown in the saved output
# (1 leg -> code 1); confirm against the survey codebook.
df_ggtransit.loc[df_chg_idx, 'final_prev_transfers_code'] = (
    df_ggtransit.loc[df_chg_idx, gg_before_leg_cols].notna().sum(axis=1))

# double check (df_chg_idx was computed before the edit, so it still selects the modified rows)
df_ggtransit.loc[df_chg_idx, gg_check_cols].transpose()



Unnamed: 0,3685
id,37200
final_access_mode,Walked all the way
final_first_route_before_survey_board_comb,Supershuttle
final_second_route_before_survey_board_comb,
final_third_route_before_survey_board,
final_prev_transfers_code,1


Unnamed: 0,3685
id,37200
final_access_mode,"Was dropped off using Uber, Lyft, or similar s..."
final_first_route_before_survey_board_comb,
final_second_route_before_survey_board_comb,
final_third_route_before_survey_board,
final_prev_transfers_code,0


In [5]:
# export to M: write the modified Golden Gate Transit survey next to the input,
# under a new file name, after reporting its size
ggtransit_out_csv = r'M:\Data\OnBoard\Data and Reports\Golden Gate Transit\2018\As CSV\20180907_OD_GoldenGate_allDays_addCols_modifyTransfer_NO POUND OR SINGLE QUOTE.csv'

ggtransit_rows_out = len(df_ggtransit)
ggtransit_ids_out = len(df_ggtransit['id'].unique())
print('export {} rows of Golden Gate Transit survey data with {} unique IDs'.format(
    ggtransit_rows_out, ggtransit_ids_out))

df_ggtransit.to_csv(ggtransit_out_csv, index=False)

export 4103 rows of Golden Gate Transit survey data with 4103 unique IDs


#### BART (2015) survey contains 'AirTrain (SFO)' as a transfer leg

In [6]:
# Load the BART (2015) on-board survey and report its size.
# NOTE(review): the saved output of this cell carries a truncated warning
# fragment -- presumably a pandas DtypeWarning about mixed-type columns;
# confirm, and consider passing explicit dtypes to read_csv.
bart_in_csv = r'M:\Data\OnBoard\Data and Reports\BART\As CSV\BART_Final_Database_Mar18_SUBMITTED_with_station_xy_with_first_board_last_alight_fixColname NO POUND OR SINGLE QUOTE.csv'
df_bart = pd.read_csv(bart_in_csv)

bart_rows_read = len(df_bart)
bart_ids_read = len(df_bart['ID'].unique())
print('read {} rows of BART survey data with {} unique IDs'.format(
    bart_rows_read, bart_ids_read))

  interactivity=interactivity, compiler=compiler, result=result)


read 46948 rows of BART survey data with 46948 unique IDs


In [7]:
# Report every access/egress transfer column in which the value
# 'AirTrain (SF Airport)' occurs at least once.
# (Column names are spelled exactly as they appear in the survey file.)
bart_leg_cols = [
    'ACCESSTRNSFR_LIST1', 'ACCESSTRNSFR_LIST2', 'ACCESSTRSNFR_LIST3',
    'EGRESS_TRNSFR_LIST1', 'EGRESS_TRNSFR_LIST2', 'EGRESSTRNSFR_LIST3',
]
for leg_col in bart_leg_cols:
    has_airtrain = df_bart[leg_col].isin(['AirTrain (SF Airport)']).any()
    if has_airtrain:
        print(leg_col)

ACCESSTRNSFR_LIST1
EGRESS_TRNSFR_LIST1


In [8]:
# fix 'AirTrain' in before transfers
#
# AirTrain is re-coded from the first access transfer leg to a walk access mode.
# The original version only copied LIST2 into LIST1 and left LIST2/LIST3
# untouched, which would duplicate a real second transfer. The saved output
# shows LIST2/LIST3 empty for all affected rows, so results on that data are
# unchanged; the full shift below keeps the cell correct for other inputs.
bart_access_leg_cols = ['ACCESSTRNSFR_LIST1', 'ACCESSTRNSFR_LIST2', 'ACCESSTRSNFR_LIST3']
bart_access_check_cols = ['ACCESS_MODE'] + bart_access_leg_cols

bart_idx_1 = df_bart.ACCESSTRNSFR_LIST1 == 'AirTrain (SF Airport)'
print('{} responses have AirTain (SFO) as the first before transfer'.format(bart_idx_1.sum()))

display(df_bart.loc[bart_idx_1, bart_access_check_cols].drop_duplicates())

# update access_mode to be 'walk' (this also overwrites non-walk / missing access modes on these rows)
df_bart.loc[bart_idx_1, 'ACCESS_MODE'] = 'Walked (includes wheelchair, skateboard)'

# remove the AirTrain leg: shift 2nd -> 1st, 3rd -> 2nd, then blank the 3rd slot
# (order matters: each slot is read before it is overwritten)
df_bart.loc[bart_idx_1, bart_access_leg_cols[0]] = df_bart.loc[bart_idx_1, bart_access_leg_cols[1]]
df_bart.loc[bart_idx_1, bart_access_leg_cols[1]] = df_bart.loc[bart_idx_1, bart_access_leg_cols[2]]
df_bart.loc[bart_idx_1, bart_access_leg_cols[2]] = float('nan')
# NOTE(review): no transfer-count column is updated here (the original did not
# touch one either) -- confirm whether the BART file carries such a field.

# double check (bart_idx_1 was computed before the edit, so it still selects the modified rows)
display(df_bart.loc[bart_idx_1, bart_access_check_cols].drop_duplicates())

215 responses have AirTain (SFO) as the first before transfer


Unnamed: 0,ACCESS_MODE,ACCESSTRNSFR_LIST1,ACCESSTRNSFR_LIST2,ACCESSTRSNFR_LIST3
785,"Walked (includes wheelchair, skateboard)",AirTrain (SF Airport),,
41776,Missing - Question Not Asked,AirTrain (SF Airport),,


Unnamed: 0,ACCESS_MODE,ACCESSTRNSFR_LIST1,ACCESSTRNSFR_LIST2,ACCESSTRSNFR_LIST3
785,"Walked (includes wheelchair, skateboard)",,,


In [9]:
# fix 'AirTrain' in after transfers
#
# AirTrain is re-coded from the first egress transfer leg to a walk egress mode.
# The original version only copied LIST2 into LIST1 and left LIST2/LIST3
# untouched, which would duplicate a real second transfer. The saved output
# shows LIST2/LIST3 empty for all affected rows, so results on that data are
# unchanged; the full shift below keeps the cell correct for other inputs.
bart_egress_leg_cols = ['EGRESS_TRNSFR_LIST1', 'EGRESS_TRNSFR_LIST2', 'EGRESSTRNSFR_LIST3']
bart_egress_check_cols = bart_egress_leg_cols + ['EGRESS_MODE']

bart_idx_2 = df_bart.EGRESS_TRNSFR_LIST1 == 'AirTrain (SF Airport)'
print('{} responses have AirTain (SFO) as the first after transfer'.format(bart_idx_2.sum()))

display(df_bart.loc[bart_idx_2, bart_egress_check_cols].drop_duplicates())

# update egress_mode to be 'walk' (label spelled as it appears in the EGRESS_MODE column,
# which differs from the 'Walked ...' label used by ACCESS_MODE)
df_bart.loc[bart_idx_2, 'EGRESS_MODE'] = 'Walk (includes wheelchair, skateboard)'

# remove the AirTrain leg: shift 2nd -> 1st, 3rd -> 2nd, then blank the 3rd slot
# (order matters: each slot is read before it is overwritten)
df_bart.loc[bart_idx_2, bart_egress_leg_cols[0]] = df_bart.loc[bart_idx_2, bart_egress_leg_cols[1]]
df_bart.loc[bart_idx_2, bart_egress_leg_cols[1]] = df_bart.loc[bart_idx_2, bart_egress_leg_cols[2]]
df_bart.loc[bart_idx_2, bart_egress_leg_cols[2]] = float('nan')
# NOTE(review): no transfer-count column is updated here (the original did not
# touch one either) -- confirm whether the BART file carries such a field.

# double check (bart_idx_2 was computed before the edit, so it still selects the modified rows)
display(df_bart.loc[bart_idx_2, bart_egress_check_cols].drop_duplicates())

541 responses have AirTain (SFO) as the first after transfer


Unnamed: 0,EGRESS_TRNSFR_LIST1,EGRESS_TRNSFR_LIST2,EGRESSTRNSFR_LIST3,EGRESS_MODE
21,AirTrain (SF Airport),,,"Walk (includes wheelchair, skateboard)"
367,AirTrain (SF Airport),,,"Bus, train, or other public transit (includes ..."
7792,AirTrain (SF Airport),,,A bicycle


Unnamed: 0,EGRESS_TRNSFR_LIST1,EGRESS_TRNSFR_LIST2,EGRESSTRNSFR_LIST3,EGRESS_MODE
21,,,,"Walk (includes wheelchair, skateboard)"


In [10]:
# export to M: write the modified BART survey next to the input,
# under a new file name, after reporting its size
bart_out_csv = r'M:\Data\OnBoard\Data and Reports\BART\As CSV\BART_Final_Database_Mar18_SUBMITTED_with_station_xy_with_first_board_last_alight_fixColname_modifyTransfer_NO POUND OR SINGLE QUOTE.csv'

bart_rows_out = len(df_bart)
bart_ids_out = len(df_bart['ID'].unique())
print('export {} rows of BART survey data with {} unique IDs'.format(
    bart_rows_out, bart_ids_out))

df_bart.to_csv(bart_out_csv, index=False)

export 46948 rows of BART survey data with 46948 unique IDs


#### Caltrain (2014) survey contains 'AirTrain (SFO)' as a transfer leg

In [11]:
# Load the Caltrain (2014) on-board survey and report its size.
# NOTE(review): the saved output of this cell carries a truncated warning
# fragment -- presumably a pandas DtypeWarning about mixed-type columns;
# confirm, and consider passing explicit dtypes to read_csv.
ct_in_csv = r'M:\Data\OnBoard\Data and Reports\Caltrain\As CSV\Caltrain_Final_Submitted_1_5_2015_TYPE_WEIGHT_DATE NO POUND OR SINGLE QUOTE.csv'
df_ct = pd.read_csv(ct_in_csv)

ct_rows_read = len(df_ct)
ct_ids_read = len(df_ct['ID'].unique())
print('read {} rows of Caltrain survey data with {} unique IDs'.format(
    ct_rows_read, ct_ids_read))

read 6032 rows of Caltrain survey data with 6032 unique IDs


  interactivity=interactivity, compiler=compiler, result=result)


In [12]:
# Report every transfer column (from / to Caltrain) in which the value
# 'AirTrain (SF Airport)' occurs at least once.
ct_leg_cols = [
    'TRANSFER_FROM_1ST', 'TRANSFER_FROM_2ND', 'TRANSFER_FROM_3RD',
    'TRANSFER_TO_1ST', 'TRANSFER_TO_2ND', 'TRANSFER_TO_3RD',
]
for leg_col in ct_leg_cols:
    has_airtrain = df_ct[leg_col].isin(['AirTrain (SF Airport)']).any()
    if has_airtrain:
        print(leg_col)

TRANSFER_FROM_1ST


In [13]:
# fix 'AirTrain' in before transfers
#
# AirTrain is re-coded from the first "transfer from" leg to a walk access mode.
# The original version left TRANSFER_FROM_3RD in place after the shift (which
# would duplicate a real third leg) and hard-coded TRANSFERS_FROM_CODE to 1
# (right only for rows that started with exactly two legs, like the single
# row in the saved output). Here the third slot is cleared and the count is
# re-derived, so the result on the saved data is unchanged.
ct_from_leg_cols = ['TRANSFER_FROM_1ST', 'TRANSFER_FROM_2ND', 'TRANSFER_FROM_3RD']
ct_check_cols = ['ACCESS_MODE_CODE', 'TRANSFERS_FROM_CODE'] + ct_from_leg_cols

ct_idx = df_ct.TRANSFER_FROM_1ST == 'AirTrain (SF Airport)'
print('{} responses have AirTain (SFO) as the first before transfer'.format(ct_idx.sum()))

display(df_ct.loc[ct_idx, ct_check_cols].drop_duplicates())
## note: access_mode 1 = walk

# update access_mode to be 'walk'
df_ct.loc[ct_idx, 'ACCESS_MODE_CODE'] = 1

# remove the AirTrain leg: shift 2nd -> 1st, 3rd -> 2nd, then blank the 3rd slot
# (order matters: each slot is read before it is overwritten)
df_ct.loc[ct_idx, ct_from_leg_cols[0]] = df_ct.loc[ct_idx, ct_from_leg_cols[1]]
df_ct.loc[ct_idx, ct_from_leg_cols[1]] = df_ct.loc[ct_idx, ct_from_leg_cols[2]]
df_ct.loc[ct_idx, ct_from_leg_cols[2]] = float('nan')

# re-derive the number of "from" transfers from the legs that remain
# NOTE(review): assumes TRANSFERS_FROM_CODE is the count of listed "from" legs --
# consistent with the row shown in the saved output (2 legs -> code 2.0);
# confirm against the survey codebook.
df_ct.loc[ct_idx, 'TRANSFERS_FROM_CODE'] = (
    df_ct.loc[ct_idx, ct_from_leg_cols].notna().sum(axis=1))

# double check (ct_idx was computed before the edit, so it still selects the modified rows)
display(df_ct.loc[ct_idx, ct_check_cols].drop_duplicates())

1 responses have AirTain (SFO) as the first before transfer


Unnamed: 0,ACCESS_MODE_CODE,TRANSFERS_FROM_CODE,TRANSFER_FROM_1ST,TRANSFER_FROM_2ND,TRANSFER_FROM_3RD
2818,1.0,2.0,AirTrain (SF Airport),BART RICH/MILL Richmond to Millbrae,


Unnamed: 0,ACCESS_MODE_CODE,TRANSFERS_FROM_CODE,TRANSFER_FROM_1ST,TRANSFER_FROM_2ND,TRANSFER_FROM_3RD
2818,1.0,1.0,BART RICH/MILL Richmond to Millbrae,,


In [14]:
# fix encoding issues with station names
# Four boarding-station names carry a garbled character where a space belongs;
# map each garbled spelling to its clean form. Counts are printed before and
# after so the change can be checked by eye.
enter_station_fixes = {
    'College�Park': 'College Park',
    'Mountain�View': 'Mountain View',
    'Santa�Clara': 'Santa Clara',
    'San�Antonio': 'San Antonio',
}

print(df_ct.ENTER_STATION.value_counts())

for garbled_name, clean_name in enter_station_fixes.items():
    df_ct.loc[df_ct.ENTER_STATION == garbled_name, 'ENTER_STATION'] = clean_name

print(df_ct.ENTER_STATION.value_counts())

San Francisco        1417
San Jose              708
Palo Alto             554
Mountain�View         408
Millbrae              298
Redwood City          292
Hillsdale             262
Sunnyvale             260
22nd Street           200
San Mateo             200
Menlo Park            168
Santa�Clara           154
San Carlos            133
Burlingame            130
California Ave        127
Lawrence              106
San Bruno             103
So. San Francisco     103
Belmont                92
San�Antonio            90
Tamien                 81
Bayshore               47
Hayward Park           46
College�Park           15
Blossom Hill           10
Morgan Hill             9
Gilroy                  7
Capitol                 6
San Martin              6
Name: ENTER_STATION, dtype: int64
San Francisco        1417
San Jose              708
Palo Alto             554
Mountain View         408
Millbrae              298
Redwood City          292
Hillsdale             262
Sunnyvale             260
22nd

In [15]:
# Same station-name clean-up for the alighting station: four names carry a
# garbled character where a space belongs. Counts are printed before and
# after so the change can be checked by eye.
exit_station_fixes = {
    'College�Park': 'College Park',
    'Mountain�View': 'Mountain View',
    'Santa�Clara': 'Santa Clara',
    'San�Antonio': 'San Antonio',
}

print(df_ct.EXIT_STATION.value_counts())

for garbled_name, clean_name in exit_station_fixes.items():
    df_ct.loc[df_ct.EXIT_STATION == garbled_name, 'EXIT_STATION'] = clean_name

print(df_ct.EXIT_STATION.value_counts())

San Francisco        1207
Palo Alto             678
San Jose              637
Mountain�View         404
Redwood City          334
Millbrae              301
Sunnyvale             259
Hillsdale             251
Menlo Park            218
Santa�Clara           181
San Mateo             180
San Carlos            174
22nd Street           149
California Ave        148
Burlingame            129
Lawrence              111
So. San Francisco     108
Belmont               105
San Bruno             103
Tamien                 88
San�Antonio            81
Bayshore               57
Hayward Park           50
Morgan Hill            19
Gilroy                 16
Blossom Hill           16
College�Park           13
San Martin              9
Capitol                 6
Name: EXIT_STATION, dtype: int64
San Francisco        1207
Palo Alto             678
San Jose              637
Mountain View         404
Redwood City          334
Millbrae              301
Sunnyvale             259
Hillsdale             251
Menlo

In [16]:
# export to M: write the modified Caltrain survey next to the input,
# under a new file name, after reporting its size
ct_out_csv = r'M:\Data\OnBoard\Data and Reports\Caltrain\As CSV\Caltrain_Final_Submitted_1_5_2015_TYPE_WEIGHT_DATE_modifyTransfer_NO POUND OR SINGLE QUOTE.csv'

ct_rows_out = len(df_ct)
ct_ids_out = len(df_ct['ID'].unique())
print('export {} rows of Caltrain survey data with {} unique IDs'.format(
    ct_rows_out, ct_ids_out))

df_ct.to_csv(ct_out_csv, index=False)

export 6032 rows of Caltrain survey data with 6032 unique IDs
