Pre-processing Caltrain (2014) data in Python caused some issues in station name and transfer route due to encoding error when reading the .csv file into Python, e.g. 'Mountain View' became 'Mountain�View'. I haven't found a way to avoid this string change. So, this script changes the strings back. It also updates the relevant route names in 'canonical_route_crosswalk.csv'.

The same error occurred in 'ENTER_STATION' and 'EXIT_STATION'. It is fixed in 'SuperShuttle_SFOAirTrain_as_Access_Egress_Modes.ipynb'

In [1]:
import pandas as pd
import numpy as np

#### Fix Caltrain 2014 survey data

In [2]:
# Read .csv data of Caltrain survey
# (the data has been processed by 'SuperShuttle_SFOAirTrain_as_Access_Egress_Modes.ipynb')

df_ct = pd.read_csv('M:\\Data\\OnBoard\\Data and Reports\\Caltrain\\As CSV\\Caltrain_Final_Submitted_1_5_2015_TYPE_WEIGHT_DATE_modifyTransfer_NO POUND OR SINGLE QUOTE.csv')
print('read {} rows of Caltrain survey data with {} unique IDs'.format(
    df_ct.shape[0],
    len(df_ct.ID.unique())))

read 6032 rows of Caltrain survey data with 6032 unique IDs


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# fix string errors in transfer routes

transfer_list = [] # create a list to track the routes, which will be compared with 'canonical_route_crosswalk.csv' later

for i in ['TRANSFER_TO_1ST', 'TRANSFER_FROM_1ST',
          'TRANSFER_TO_2ND', 'TRANSFER_FROM_2ND',
          'TRANSFER_TO_3RD', 'TRANSFER_FROM_3RD']:
    print(i)
    print(df_ct[i].unique())
    df_ct[i] = df_ct[i].apply(lambda x: str(x).replace('Ca�ada', 'Canada'))
    df_ct[i] = df_ct[i].apply(lambda x: x.replace('�', ''))
    df_ct.loc[df_ct[i] == 'nan', i] = np.nan
    print(df_ct[i].unique())
    
    transfer_list.extend(list(df_ct[i].unique()))

transfer_list = list(set(transfer_list)) # remove duplicated routes

TRANSFER_TO_1ST
[nan 'BART MILL/RICH Millbrae to Richmond'
 'Muni N Light Rail: Judah - Metro' 'Muni Cable Car - Powell/Mason'
 'Stanford Marguerite Line X Palo Alto Transit Center, Stanford Shopping Center'
 'Muni KT Light Rail: Ingleside/Third Street - Metro'
 'VTA Route 68 GILROY - SAN JOSE DIRIDON'
 'Caltrain-�Shuttles Campus Drive'
 'SamTrans Route ECR Palo Alto - Daly City'
 'Caltrain-�Shuttles Redwood Shores (Bridge Park)'
 'BART RICH/MILL Richmond to Millbrae' 'Caltrain-�Shuttles Seaport Centre'
 'SamTrans Route 275 Woodside/Fernside � Redwood City Transit Center'
 'Muni Route 47 Van Ness' 'VTA Route 35 STANFORD SHOP CTR - DNTN MTN VIEW'
 'Caltrain-�Shuttles Marsh Road' 'Muni Route 83X Mid-Market Express'
 'Muni Route NX Judah Express - Peak direction only'
 'Caltrain-�Shuttles Mary - Moffett' 'VTA Route 522 EASTRIDGE - PALO ALTO'
 'Muni Route 82X Levi Plaza Express'
 'Santa Clara VTA 902 Light Rail: MOUNTAIN VIEW - WINCHESTER'
 'Santa Clara VTA 901 Light Rail: ALUM ROCK-SANTA 

In [4]:
# export the survey data with transfer route names fixed
df_ct.to_csv('M:\\Data\\OnBoard\\Data and Reports\\Caltrain\\As CSV\\Caltrain_Final_Submitted_1_5_2015_TYPE_WEIGHT_DATE_modifyTransfer_fixRouteNames_NO POUND OR SINGLE QUOTE.csv',
             index=False)

#### Fix 'canonical_route_crosswalk.csv'

In [5]:
# read crosswalk
route_df = pd.read_csv(r'C:\Users\ywang\Documents\GitHub\onboard-surveys\make-uniform\production\canonical_route_crosswalk.csv')

# fix string errors in 'survey_name' field which is the transfer route names used in the survey
print(route_df.loc[route_df.survey == 'Caltrain'].survey_name.unique())

route_df.loc[route_df.survey == 'Caltrain',
             'survey_name'] = route_df['survey_name'].apply(lambda x: str(x).replace('Ca�ada', 'Canada'))
route_df.loc[route_df.survey == 'Caltrain',
             'survey_name'] = route_df['survey_name'].apply(lambda x: x.replace('�', ''))
route_df.loc[route_df.survey == 'Caltrain',
             'survey_name'] = route_df['survey_name'].apply(lambda x: x.replace('?', ' '))

# print to double check it's all fixed
print(route_df.loc[route_df.survey == 'Caltrain'].survey_name.unique())

['AC Transit Route 18 University Village Albany to Montclair'
 'AC Transit Route 20 Dimond District Oakland to downtown Oakland'
 'AC Transit Route 26 Emery Bay Public Market to Lakeshore Ave.'
 'AC Transit Route 51B Rockridge BART to Berkeley Amtrak'
 'AC Transit Route 52 University Village to UC Campus (Berkeley BART).'
 'AC Transit Route 72 Hilltop Mall to Oakland Amtrak'
 'AC Transit Route 98 Coliseum BART Edgewater Dr.'
 'Dumbarton Express Route DB Union City BART to Stanford Oval'
 'AC Transit Route M Hayward BART to Oracle'
 'AC Transit Route NX3 Marlow Dr. & Foothill Way Oakland' 'Amtrak Shuttle'
 'BART DALY/DUBLIN Daly City To Dublin/Pleasanton'
 'BART DALY/FREMONT Daly City To Fremont'
 'BART DUBLIN/DALY Dublin/Pleasanton To Daly City'
 'BART FREMONT/DALY Fremont To Daly City'
 'BART FREMONT/RICH Fremont To Richmond'
 'BART MILL/RICH Millbrae to Richmond'
 'BART BAY PT/SFIA Pittsburg/Bay Point to San Francisco International Airport'
 'BART RICH/FREMONT Richmond To Fremont'
 '

['AC Transit Route 18 University Village Albany to Montclair'
 'AC Transit Route 20 Dimond District Oakland to downtown Oakland'
 'AC Transit Route 26 Emery Bay Public Market to Lakeshore Ave.'
 'AC Transit Route 51B Rockridge BART to Berkeley Amtrak'
 'AC Transit Route 52 University Village to UC Campus (Berkeley BART).'
 'AC Transit Route 72 Hilltop Mall to Oakland Amtrak'
 'AC Transit Route 98 Coliseum BART Edgewater Dr.'
 'Dumbarton Express Route DB Union City BART to Stanford Oval'
 'AC Transit Route M Hayward BART to Oracle'
 'AC Transit Route NX3 Marlow Dr. & Foothill Way Oakland' 'Amtrak Shuttle'
 'BART DALY/DUBLIN Daly City To Dublin/Pleasanton'
 'BART DALY/FREMONT Daly City To Fremont'
 'BART DUBLIN/DALY Dublin/Pleasanton To Daly City'
 'BART FREMONT/DALY Fremont To Daly City'
 'BART FREMONT/RICH Fremont To Richmond'
 'BART MILL/RICH Millbrae to Richmond'
 'BART BAY PT/SFIA Pittsburg/Bay Point to San Francisco International Airport'
 'BART RICH/FREMONT Richmond To Fremont'
 '

In [6]:
# also fix the canonical names
route_df.loc[route_df.survey == 'Caltrain',
             'canonical_name'] = route_df['canonical_name'].apply(lambda x: str(x).replace('Ca�ada', 'Canada'))
route_df.loc[route_df.survey == 'Caltrain',
             'canonical_name'] = route_df['canonical_name'].apply(lambda x: x.replace('�', ''))
route_df.loc[route_df.survey == 'Caltrain',
             'canonical_name'] = route_df['canonical_name'].apply(lambda x: x.replace('?', ' '))

In [7]:
# drop duplicates - duplicate might be introduced, e.g. when replacing 'Ca�ada' with 'Canada'
print('crosswalk previously has {} rows'.format(route_df.shape[0]))
route_df.drop_duplicates(inplace=True)
print('after fixing the strings, crosswalk has {} rows'.format(route_df.shape[0]))

crosswalk previously has 8997 rows
after fixing the strings, crosswalk has 8997 rows


In [8]:
# check no transfer route is missing from the crosswalk

d = list(route_df.loc[route_df.survey == 'Caltrain']['survey_name'])

print('dictionary is missing')
for i in transfer_list:
    if not i in d:
        print(i)   

dictionary is missing
nan


In [9]:
# export the crosswalk
route_df.to_csv(r'C:\Users\ywang\Documents\GitHub\onboard-surveys\make-uniform\production\canonical_route_crosswalk.csv',
             index=False)