In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd

### Table of Contents

* [modify survey data](#modify_survey_data)
    * [read raw data](#read_raw_data)
    * [build before_transfers and 'access_mode' from raw access mode fields](#build_access_mode)
    * [build after_transfers and 'egress_mode' from raw access mode fields](#build_egress_mode)
    * [deal with 'hispanic' and 'ethnicity/race'](#race)
    * [deal with trip purpose](#trip_purp)
    * [code home lat/lon based on zipcode](#home_lat_lon)
    * [code board/alight station name and lat/lon](#station_lat_lon)
    * [update 'weight'](#weight)
    * [impute year_born from 'age group'](#age)
    * [export survey data](#survey_export)

* [build standard dictionary](#standard_dict)
    * [read raw variable dictionary 'Field Guide'](#raw_dict)
    * [add rows to the dictionary for the new fields added to the survey data](#add_row)
    * [add default fields in the standard dictionary](#add_fields)
    * [check consistency between values in survey data and in the dictionary](#check)
    * [export raw standard dictionary](#export_dict)
    
* [build canonical route crosswalk](#canonical_route)

## modify survey data <a class="anchor" id="modify_survey_data"></a>

### read raw data  <a class="anchor" id="read_raw_data"></a>

In [2]:
df_raw = pd.read_excel(r'M:\Data\OnBoard\Data and Reports\Capitol Corridor\OD Survey 2019\CAPCO19 Data-For MTC.xlsx',
                       sheet_name='Data')
print('read {} records, with {} unique CCGID'.format(df_raw.shape[0], len(df_raw.CCGID.unique())))

df = df_raw.copy()
print(list(df))

read 2406 records, with 2406 unique CCGID
['RESPNUM', 'CCGID', 'TRAIN', 'INTDAY', 'INTDATE', 'PERIOD', 'LANGUAGE', 'Q1A', 'BOARD', 'Q1B', 'ALIGHT', 'Q2A_1', 'Q2A_2', 'Q2A_3', 'Q2A_4', 'Q2A_5', 'Q2A_6', 'Q2B_1', 'Q2B_2', 'Q2B_3', 'Q2B_4', 'Q2B_5', 'Q2B_6', 'Q3', 'Q4', 'Q5_1', 'Q5_2', 'Q5_3', 'Q5_4', 'Q6_1', 'Q6_2', 'Q6_3', 'Q6_4', 'Q7', 'Q8_1', 'Q8_2', 'Q8_3', 'Q8_4', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13_1', 'Q13_2', 'Q13_3', 'Q13_4', 'Q15_1', 'Q15_2', 'Q15_3', 'Q15_4', 'Q16_1', 'Q16_2', 'Q16_3', 'Q16_4', 'Q17', 'Q18', 'Q19', 'Unnamed: 57', 'Q20_1', 'Q20_2', 'Q20_3', 'Q20_4', 'Q21', 'Q22', 'CITY', 'CITY_CODE', 'COUNTY', 'COUNTY_CODE ', 'STATE', 'STATE_CODE ', 'COUNTRY', 'WEIGHT']


### build before_transfers and 'access_mode' from raw access mode fields Q2A_1-Q2A_6 <a class="anchor" id="build_access_mode"></a>

In [3]:
# row dictionary of Q2A_1-Q2A_6 / Q2B_1-Q2B_6:

mode_dict = {1: 'Dropped off/Picked up', 
            2: 'Drove alone', 
            3: 'Carpool', 
            4: 'Taxi/Uber/Lyft', 
            5: 'BART', 
            6: 'Caltrain', 
            7: 'Light rail (VTA, Sacramento RT)', 
            8: 'Amtrak thruway bus', 
            9: 'Amtrak long distance train', 
            10: 'Bus transit', 
            11: 'Walked', 
            12: 'Bike', 
            13: 'Electric Scooter/Scooter', 
            14: 'Other (Specify)' 
            }

mode_cat_dict = {'Dropped off/Picked up': 'knr',
                 'Drove alone': 'pnr',
                 'Carpool': 'carpool',
                 'Taxi/Uber/Lyft': 'tnc',
                 'BART': 'transit',
                 'Caltrain': 'transit', 
                 'Light rail (VTA, Sacramento RT)': 'transit',
                 'Amtrak thruway bus': 'transit', 
                 'Amtrak long distance train': 'transit', 
                 'Bus transit': 'transit', 
                 'Walked': 'walk',
                 'Bike': 'bike',
                 'Electric Scooter/Scooter': 'scooter', 
                 'Other (Specify)': 'other'}

## create fields

for colname in ['access_mode', 'egress_mode',
                'first_route_before_survey_board', 'second_route_before_survey_board', 'third_route_before_survey_board',
                'first_route_after_survey_alight', 'second_route_after_survey_alight', 'third_route_after_survey_alight']:
    df[colname] = np.nan
    
# first, error in CCGID 63 Q2A_3, update to nan
display(df.loc[df.CCGID == 63][['CCGID', 'Q2A_1', 'Q2A_2', 'Q2A_3', 'Q2A_4', 'Q2A_5', 'Q2A_6']])
df.loc[df.CCGID == 63, 'Q2A_3'] = np.nan

Unnamed: 0,CCGID,Q2A_1,Q2A_2,Q2A_3,Q2A_4,Q2A_5,Q2A_6
62,63,4,,11+H13:H32,,,


In [4]:
# create a column to show the names for Q2A_1-6, Q2B_1-6, and count number of access_modes/egress_modes
for i in ['Q2A_1', 'Q2A_2', 'Q2A_3', 'Q2A_4', 'Q2A_5', 'Q2A_6', 'Q2B_1', 'Q2B_2', 'Q2B_3', 'Q2B_4', 'Q2B_5', 'Q2B_6']:
    df[i+'_name'] = df[i].map(mode_dict)
    df[i+'_cat'] = df[i+'_name'].map(mode_cat_dict)

df['access_count'] = df[['Q2A_1', 'Q2A_2', 'Q2A_3', 'Q2A_4', 'Q2A_5', 'Q2A_6']].count(axis=1)
df['egress_count'] = df[['Q2B_1', 'Q2B_2', 'Q2B_3', 'Q2B_4', 'Q2B_5', 'Q2B_6']].count(axis=1) 

# create a column as a placeholder to concatenate access_mode_categories and egress_mode_categories
df['access_concat'] = ''
df['egress_concat'] = ''

In [5]:
## one access mode in raw data
# 1. for responses with one access_mode (access_count==1) and the mode is not public transit,
# set Q2A_1 as access mode, and no before-transfer
idx_1 = ((df.access_count == 1) & (df.Q2A_1_cat != 'transit'))
print('access_mode idx_1 find {} rows, {:.1%} of total'.format(idx_1.sum(), idx_1.sum()/df.shape[0]))
df.loc[idx_1, 'access_mode'] = df['Q2A_1']

# 2. for responses with one access_mode (access_count==1) and the mode is public transit,
# set access mode as "Missing", and Q2A_1 as before-transfer
idx_2 = ((df.access_count == 1) & (df.Q2A_1_cat == 'transit'))
print('access_mode idx_2 find {} rows, {:.1%} of total'.format(idx_2.sum(), idx_2.sum()/df.shape[0]))
df.loc[idx_2, 'access_mode'] = 'Missing'
df.loc[idx_2, 'first_route_before_survey_board'] = df['Q2A_1']

## two access modes in raw data
df.loc[df.access_count == 2, 'access_concat'] = df['Q2A_1_cat'] + '_' + df['Q2A_2_cat']
print('\n responses with 2 access modes have the following unique combinations of mode categories:\n{}\n'.format(df.loc[df.access_count == 2].access_concat.unique()))

# 3. for responses with two access_mode (access_count==2) and both are public transit,
# set access mode as "Missing", and before-transfer as "transit_pending"
idx_3 = ((df.access_count == 2) & (df.access_concat == 'transit_transit'))
print('access_mode idx_3 find {} rows, {:.1%} of total'.format(idx_3.sum(), idx_3.sum()/df.shape[0]))
df.loc[idx_3, 'access_mode'] = 'Missing'
df.loc[idx_3, 'first_route_before_survey_board'] = 'transit_pending'
df.loc[idx_3, 'second_route_before_survey_board'] = 'transit_pending'

# 4/5. for responses with two access_mode (access_count==2) and only one value public transit,
# set access mode as the non-transit value, and the transit-value as before-transfer

idx_4 = ((df.access_count == 2) & (
              (df.access_concat == 'transit_walk') | (
               df.access_concat == 'transit_bike') | (
               df.access_concat == 'transit_pnr')))            # Q2A_1 is public transit
print('access_mode idx_4 find {} rows, {:.1%} of total'.format(idx_4.sum(), idx_4.sum()/df.shape[0]))
df.loc[idx_4, 'access_mode'] = df['Q2A_2']
df.loc[idx_4, 'first_route_before_survey_board'] = df['Q2A_1']

idx_5 = ((df.access_count == 2) & (
              (df.access_concat == 'knr_transit') | (
               df.access_concat == 'tnc_transit') | (
               df.access_concat == 'pnr_transit') | (
               df.access_concat == 'walk_transit')))           # Q2A_2 is public transit
print('access_mode idx_5 find {} rows, {:.1%} of total'.format(idx_5.sum(), idx_5.sum()/df.shape[0]))
df.loc[idx_5, 'access_mode'] = df['Q2A_1']
df.loc[idx_5, 'first_route_before_survey_board'] = df['Q2A_2']

# 6. for responses with two access_mode (access_count==2) and both are not public transit,
# set access mode as "Missing", and no before-transfer

idx_6 = ((df.access_count == 2) & (df.access_concat.str.contains('transit') == False))
print('access_mode idx_6 find {} rows, {:.1%} of total'.format(idx_6.sum(), idx_6.sum()/df.shape[0]))
df.loc[idx_6, 'access_mode'] = 'Missing'


## three access modes in raw data
df.loc[df.access_count == 3, 'access_concat'] = df['Q2A_1_cat'] + '_' + df['Q2A_2_cat'] + '_' + df['Q2A_3_cat']
print('\n responses with 3 access modes have the following unique combinations of mode categories:\n{}\n'.format(df.loc[df.access_count == 3].access_concat.unique()))

# 7. for responses with three access_mode (access_count==3) and all three are transit,
# then access mode is "Missing", before-transfers are pending
idx_7 = ((df.access_count == 3) & (df.access_concat == 'transit_transit_transit'))
print('access_mode idx_7 find {} rows, {:.1%} of total'.format(idx_7.sum(), idx_7.sum()/df.shape[0]))
df.loc[idx_7, 'access_mode'] = 'Missing'
df.loc[idx_7, 'first_route_before_survey_board'] = 'transit_pending'
df.loc[idx_7, 'second_route_before_survey_board'] = 'transit_pending'
df.loc[idx_7, 'third_route_before_survey_board'] = 'transit_pending'

# 8/9/10. for responses with three access_mode (access_count==3) and only one value is public transit,
# then access mode is "Missing", the transit value is before-transfer
idx_8 = ((df.access_count == 3) & (
              (df.access_concat == 'transit_walk_bike') | (
               df.access_concat == 'transit_walk_scooter') | (
               df.access_concat == 'transit_knr_pnr')))         # Q2A_1 is public transit
print('access_mode idx_8 find {} rows, {:.1%} of total'.format(idx_8.sum(), idx_8.sum()/df.shape[0]))
df.loc[idx_8, 'access_mode'] = 'Missing'
df.loc[idx_8, 'first_route_before_survey_board'] = df['Q2A_1']

idx_9 = ((df.access_count == 3) & (
              (df.access_concat == 'knr_transit_tnc') | (
               df.access_concat == 'knr_transit_pnr') | (
               df.access_concat == 'pnr_transit_walk') | (
               df.access_concat == 'tnc_transit_bike') | (
               df.access_concat == 'knr_transit_walk')))        # Q2A_2 is public transit
print('access_mode idx_9 find {} rows, {:.1%} of total'.format(idx_9.sum(), idx_9.sum()/df.shape[0]))
df.loc[idx_9, 'access_mode'] = 'Missing'
df.loc[idx_9, 'first_route_before_survey_board'] = df['Q2A_2']

idx_10 = ((df.access_count == 3) & (
              (df.access_concat == 'pnr_knr_transit') | (
               df.access_concat == 'knr_pnr_transit')))        # Q1A_3 is public transit
print('access_mode idx_10 find {} rows, {:.1%} of total'.format(idx_10.sum(), idx_10.sum()/df.shape[0]))
df.loc[idx_10, 'access_mode'] = 'Missing'
df.loc[idx_10, 'first_route_before_survey_board'] = df['Q2A_3']

# 11/12. for responses with three access_mode (access_count==3) and two are transits,
# then access mode is the non-transit value, before-transfers are pending
idx_11 = ((df.access_count == 3) & (df.access_concat == 'knr_transit_transit'))  # Q2A_1 not transit
print('access_mode idx_11 find {} rows, {:.1%} of total'.format(idx_11.sum(), idx_11.sum()/df.shape[0]))
df.loc[idx_11, 'access_mode'] = df['Q2A_1']
df.loc[idx_11, 'first_route_before_survey_board'] = 'transit_pending'
df.loc[idx_11, 'second_route_before_survey_board'] = 'transit_pending'

idx_12 = ((df.access_count == 3) & (df.access_concat == 'transit_walk_transit'))  # Q2A_2 not transit
print('access_mode idx_12 find {} rows, {:.1%} of total'.format(idx_12.sum(), idx_12.sum()/df.shape[0]))
df.loc[idx_12, 'access_mode'] = df['Q2A_2']
df.loc[idx_12, 'first_route_before_survey_board'] = 'transit_pending'
df.loc[idx_12, 'second_route_before_survey_board'] = 'transit_pending'

# 13. for responses with three access_mode (access_count==3) and all are non-transit,
# set access mode as "Missing", and no before-transfer

idx_13 = ((df.access_count == 3) & (df.access_concat.str.contains('transit') == False))
print('access_mode idx_13 find {} rows, {:.1%} of total'.format(idx_13.sum(), idx_13.sum()/df.shape[0]))
df.loc[idx_13, 'access_mode'] = 'Missing'

# examine the remaining access_mode.isnull()
display(df.loc[df.access_mode.isnull()][['CCGID', 'access_count', 'Q2A_1_cat', 'Q2A_2_cat',
                                         'Q2A_3_cat', 'Q2A_4_cat',
                                         'Q2A_5_cat', 'Q2A_6_cat']].dropna(how='all', axis=1))

access_mode idx_1 find 1923 rows, 79.9% of total
access_mode idx_2 find 303 rows, 12.6% of total

 responses with 2 access modes have the following unique combinations of mode categories:
['knr_transit' 'pnr_tnc' 'transit_walk' 'knr_tnc' 'carpool_tnc'
 'knr_carpool' 'transit_transit' 'transit_bike' 'knr_pnr' 'knr_walk'
 'tnc_transit' 'pnr_walk' 'pnr_bike' 'pnr_transit' 'walk_knr'
 'pnr_carpool' 'tnc_walk' 'bike_scooter' 'walk_bike' 'knr_bike'
 'pnr_scooter' 'tnc_scooter' 'walk_transit' 'transit_pnr' 'bike_pnr'
 'walk_scooter' 'pnr_knr' 'tnc_bike']

access_mode idx_3 find 4 rows, 0.2% of total
access_mode idx_4 find 18 rows, 0.7% of total
access_mode idx_5 find 27 rows, 1.1% of total
access_mode idx_6 find 87 rows, 3.6% of total

 responses with 3 access modes have the following unique combinations of mode categories:
['knr_tnc_walk' 'knr_transit_transit' 'pnr_knr_transit' 'knr_transit_tnc'
 'walk_knr_tnc' 'knr_pnr_bike' 'knr_transit_pnr' 'pnr_walk_bike'
 'pnr_scooter_bike' 'pnr_transit

Unnamed: 0,CCGID,access_count,Q2A_1_cat,Q2A_2_cat,Q2A_3_cat,Q2A_4_cat,Q2A_5_cat,Q2A_6_cat
120,122,6,knr,carpool,transit,walk,bike,scooter
157,159,6,knr,tnc,transit,transit,transit,walk
441,443,5,tnc,transit,transit,walk,pnr,
466,468,5,knr,tnc,transit,transit,walk,
876,878,4,knr,transit,tnc,transit,,
924,926,4,knr,transit,transit,walk,,
965,967,4,bike,knr,pnr,tnc,,
967,969,4,transit,tnc,transit,transit,,
992,994,4,knr,pnr,tnc,transit,,
1016,1018,6,pnr,transit,transit,transit,walk,scooter


In [6]:
## cases with one transit mode and multiple non-transit modes - set the transit mode as before-transfer, access_mode is 'Missing'
# CCGID 122 (knr-carpool-transit-walk-bike-scooter): Q2A_3 before-transfer
# CCGID 1566 (knr-pnr-transit-walk-bike-tnc): Q2A_3 before-transfer
# CCGID 994 (knr-pnr-tnc-transit): Q2A_4 before-transfer
# CCGID 2163 (pnr-transit-walk-scooter): Q2A_2 before-transfer
# CCGID 1122 (transit-knr-tnc-bike): Q2A_1 before-transfer
df.loc[(df.CCGID==122) | (df.CCGID==1566) | (df.CCGID==994) | (df.CCGID==2163) | (df.CCGID==1122),
       'access_mode'] = 'Missing'
df.loc[(df.CCGID==122) | (df.CCGID==1566), 'first_route_before_survey_board'] = df['Q2A_3']
df.loc[df.CCGID==994, 'first_route_before_survey_board'] = df['Q2A_4']
df.loc[df.CCGID==2163, 'first_route_before_survey_board'] = df['Q2A_2']
df.loc[df.CCGID==1122, 'first_route_before_survey_board'] = df['Q2A_1']

## cases with two transit modes and multiple non-transit modes - before-transfers as 'pending', access_mode 'Missing'
# CCGID 443 (tnc-transit-transit-walk-pnr)
# CCGID 468 (knr-tnc-transit-transit-walk)
# CCGID 878 (knr-transit-tnc-transit)
# CCGID 926 (knr-transit-transit-walk)
# CCGID 2010 (knr-tnc-transit-transit)
df.loc[(df.CCGID==443) | (df.CCGID==468) | (df.CCGID==878) | (df.CCGID==926) | (df.CCGID==2010),
       'access_mode'] = 'Missing'
df.loc[(df.CCGID==443) | (df.CCGID==468) | (df.CCGID==878) | (df.CCGID==926) | (df.CCGID==2010),
       'first_route_before_survey_board'] = 'transit_pending'
df.loc[(df.CCGID==443) | (df.CCGID==468) | (df.CCGID==878) | (df.CCGID==926) | (df.CCGID==2010),
       'second_route_before_survey_board'] = 'transit_pending'

## cases with three transit modes and one non-transit mode - before-transfer as 'pending', access_mode is the non-transit mode
# CCGID 969 (transit-tnc-transit-transit)
df.loc[df.CCGID==969, 'access_mode'] = df['Q2A_2']
df.loc[df.CCGID==969, 'first_route_before_survey_board'] = 'transit_pending'
df.loc[df.CCGID==969, 'second_route_before_survey_board'] = 'transit_pending'
df.loc[df.CCGID==969, 'third_route_before_survey_board'] = 'transit_pending'

## cases with three transit modes and multiple non-transit modes - before-transfers as 'pending', access_mode 'Missing'
# CCGID 159 (knr-tnc-transit-transit-transit-walk)
# CCGID 1018 (pnr-transit-transit-transit-walk-scooter)
# CCGID 1714 (knr-pnr-transit-transit-transit)
df.loc[(df.CCGID==159) | (df.CCGID==1018) | (df.CCGID==1714), 'access_mode'] = 'Missing'
df.loc[(df.CCGID==159) | (df.CCGID==1018) | (df.CCGID==1714), 'first_route_before_survey_board'] = 'transit_pending'
df.loc[(df.CCGID==159) | (df.CCGID==1018) | (df.CCGID==1714), 'second_route_before_survey_board'] = 'transit_pending'
df.loc[(df.CCGID==159) | (df.CCGID==1018) | (df.CCGID==1714), 'third_route_before_survey_board'] = 'transit_pending'

## cases with no transit modes and multiple non-transit modes - access_mode 'Missing', no before-transfer
# CCGID 967 (bike-knr-pnr-tnc)
# CCGID 1134 (bike-knr-pnr-carpool)
# CCGID 2073 (knr-pnr-carpool-tnc)
df.loc[(df.CCGID==967) | (df.CCGID==1134) | (df.CCGID==2073), 'access_mode'] = 'Missing'

# finally, check there is no row with access_mode.isnull()
print('{} rows is missing access_mode'.format(df.access_mode.isnull().sum()))

0 rows is missing access_mode


### build after_transfers and 'egress_mode' from raw access mode fields Q2B_1-Q2B_6  <a class="anchor" id="build_egress_mode"></a>

In [7]:
# 1. for responses with one egress_mode (egress_count==1) and the mode is not public transit,
# set Q2B_1 as egress mode, and no after-transfer
idx_1 = ((df.egress_count == 1) & (df.Q2B_1_cat != 'transit'))
print('egress_mode idx_1 find {} rows, {:.1%} of total'.format(idx_1.sum(), idx_1.sum()/df.shape[0]))
df.loc[idx_1, 'egress_mode'] = df['Q2B_1']

# 2. for responses with one egress_mode (egress_count==1) and the mode is public transit,
# set egress mode as "Missing", and Q2B_1 as after-transfer
idx_2 = ((df.egress_count == 1) & (df.Q2B_1_cat == 'transit'))
print('egress_mode idx_2 find {} rows, {:.1%} of total'.format(idx_2.sum(), idx_2.sum()/df.shape[0]))
df.loc[idx_2, 'egress_mode'] = 'Missing'
df.loc[idx_2, 'first_route_after_survey_alight'] = df['Q2B_1']


## two egress modes in raw data
df.loc[df.egress_count == 2, 'egress_concat'] = df['Q2B_1_cat'] + '_' + df['Q2B_2_cat']
print('\n responses with 2 egress modes have the following unique combinations of mode categories:\n{}\n'.format(df.loc[df.egress_count == 2].egress_concat.unique()))

# 3. for responses with two egress_mode (egress_count==2) and both are public transit,
# set egress mode as "Missing", and after-transfer as "transit_pending"
idx_3 = ((df.egress_count == 2) & (df.egress_concat == 'transit_transit'))
print('egress_mode idx_3 find {} rows, {:.1%} of total'.format(idx_3.sum(), idx_3.sum()/df.shape[0]))
df.loc[idx_3, 'egress_mode'] = 'Missing'
df.loc[idx_3, 'first_route_after_survey_alight'] = 'transit_pending'
df.loc[idx_3, 'second_route_after_survey_alight'] = 'transit_pending'

# 4/5. for responses with two egress_mode (egress_count==2) and only one value public transit,
# set egress mode as the non-transit value, and the transit-value as after-transfer

idx_4 = ((df.egress_count == 2) & (
              (df.egress_concat == 'transit_walk') | (
               df.egress_concat == 'transit_bike') | (
               df.egress_concat == 'transit_scooter') | (
               df.egress_concat == 'transit_carpool')))            # Q2B_1 is public transit
print('egress_mode idx_4 find {} rows, {:.1%} of total'.format(idx_4.sum(), idx_4.sum()/df.shape[0]))
df.loc[idx_4, 'egress_mode'] = df['Q2B_2']
df.loc[idx_4, 'first_route_after_survey_alight'] = df['Q2B_1']


idx_5 = ((df.egress_count == 2) & (
              (df.egress_concat == 'walk_transit') | (
               df.egress_concat == 'tnc_transit') | (
               df.egress_concat == 'carpool_transit') | (
               df.egress_concat == 'knr_transit') | (
               df.egress_concat == 'pnr_transit')))             # Q2B_2 is public transit
print('egress_mode idx_5 find {} rows, {:.1%} of total'.format(idx_5.sum(), idx_5.sum()/df.shape[0]))
df.loc[idx_5, 'egress_mode'] = df['Q2B_1']
df.loc[idx_5, 'first_route_after_survey_alight'] = df['Q2B_2']

# 6. for responses with two egress_mode (egress_count==2) and both are not public transit,
# set egress mode as "Missing", and no after-transfer

idx_6 = ((df.egress_count == 2) & (df.egress_concat.str.contains('transit') == False))
print('egress_mode idx_6 find {} rows, {:.1%} of total'.format(idx_6.sum(), idx_6.sum()/df.shape[0]))
df.loc[idx_6, 'egress_mode'] = 'Missing'


## three egress modes in raw data
df.loc[df.egress_count == 3, 'egress_concat'] = df['Q2B_1_cat'] + '_' + df['Q2B_2_cat'] + '_' + df['Q2B_3_cat']
print('\n responses with 3 egress modes have the following unique combinations of mode categories:\n{}\n'.format(df.loc[df.egress_count == 3].egress_concat.unique()))

# 7. for responses with three egress_mode (egress_count==3) and all three are transit,
# then egress mode is "Missing", after-transfers are pending

idx_7 = ((df.egress_count == 3) & (df.egress_concat == 'transit_transit_transit'))
print('egress_mode idx_7 find {} rows, {:.1%} of total'.format(idx_7.sum(), idx_7.sum()/df.shape[0]))
df.loc[idx_7, 'egress_mode'] = 'Missing'
df.loc[idx_7, 'first_route_after_survey_alight'] = 'transit_pending'
df.loc[idx_7, 'second_route_after_survey_alight'] = 'transit_pending'
df.loc[idx_7, 'third_route_after_survey_alight'] = 'transit_pending'

# 8/9/10. for responses with three egress_mode (egress_count==3) and only one value is public transit,
# then egress mode is "Missing", the transit value is after-transfer
idx_8 = ((df.egress_count == 3) & (
              (df.egress_concat == 'transit_walk_bike') | (
               df.egress_concat == 'transit_bike_walk') | (
               df.egress_concat == 'transit_knr_pnr') | (
               df.egress_concat == 'transit_walk_scooter') | (
               df.egress_concat == 'transit_knr_transit')))         # Q2B_1 is public transit
print('egress_mode idx_8 find {} rows, {:.1%} of total'.format(idx_8.sum(), idx_8.sum()/df.shape[0]))
df.loc[idx_8, 'egress_mode'] = 'Missing'
df.loc[idx_8, 'first_route_after_survey_alight'] = df['Q2B_1']

idx_9 = ((df.egress_count == 3) & (
              (df.egress_concat == 'tnc_transit_walk') | (
               df.egress_concat == 'knr_transit_walk') | (
               df.egress_concat == 'pnr_transit_walk')))        # Q2B_2 is public transit
print('egress_mode idx_9 find {} rows, {:.1%} of total'.format(idx_9.sum(), idx_9.sum()/df.shape[0]))
df.loc[idx_9, 'egress_mode'] = 'Missing'
df.loc[idx_9, 'first_route_after_survey_alight'] = df['Q2B_2']

idx_10 = ((df.egress_count == 3) & (
              (df.egress_concat == 'knr_pnr_transit') | (
               df.egress_concat == 'knr_tnc_transit')))        # Q2B_3 is public transit
print('egress_mode idx_10 find {} rows, {:.1%} of total'.format(idx_10.sum(), idx_10.sum()/df.shape[0]))
df.loc[idx_10, 'egress_mode'] = 'Missing'
df.loc[idx_10, 'first_route_after_survey_alight'] = df['Q2B_3']

# 11/12. for responses with three egress_mode (egress_count==3) and two are transits,
# then egress mode is the non-transit value, after-transfers are pending
idx_11 = ((df.egress_count == 3) & (
            (df.egress_concat == 'transit_walk_transit') | (
             df.egress_concat == 'transit_knr_transit')))     # Q2B_2 not transit
print('egress_mode idx_11 find {} rows, {:.1%} of total'.format(idx_11.sum(), idx_11.sum()/df.shape[0]))
df.loc[idx_11, 'egress_mode'] = df['Q2B_2']
df.loc[idx_11, 'first_route_after_survey_alight'] = 'transit_pending'
df.loc[idx_11, 'second_route_after_survey_alight'] = 'transit_pending'

idx_12 = ((df.egress_count == 3) & (df.egress_concat == 'transit_transit_walk'))  # Q2B_3 not transit
print('egress_mode idx_12 find {} rows, {:.1%} of total'.format(idx_12.sum(), idx_12.sum()/df.shape[0]))
df.loc[idx_12, 'egress_mode'] = df['Q2B_3']
df.loc[idx_12, 'first_route_after_survey_alight'] = 'transit_pending'
df.loc[idx_12, 'second_route_after_survey_alight'] = 'transit_pending'

# 13. for responses with three egress_mode (egress_count==3) and all are non-transit,
# set egress mode as "Missing", and no after-transfer

idx_13 = ((df.egress_count == 3) & (df.egress_concat.str.contains('transit') == False))
print('egress_mode idx_13 find {} rows, {:.1%} of total'.format(idx_13.sum(), idx_13.sum()/df.shape[0]))
df.loc[idx_13, 'egress_mode'] = 'Missing'


# examine the remaining egress_mode.isnull()
display(df.loc[df.egress_mode.isnull()][['CCGID', 'Q2B_1_cat', 'Q2B_2_cat',
                                         'Q2B_3_cat', 'Q2B_4_cat',
                                         'Q2B_5_cat', 'Q2B_6_cat']].dropna(how='all', axis=1))

egress_mode idx_1 find 1886 rows, 78.4% of total
egress_mode idx_2 find 329 rows, 13.7% of total

 responses with 2 egress modes have the following unique combinations of mode categories:
['transit_transit' 'transit_walk' 'transit_bike' 'walk_transit' 'knr_tnc'
 'tnc_transit' 'pnr_walk' 'knr_walk' 'knr_pnr' 'walk_bike'
 'carpool_transit' 'knr_transit' 'pnr_scooter' 'pnr_transit' 'scooter_tnc'
 'walk_tnc' 'pnr_carpool' 'walk_knr' 'bike_scooter' 'knr_carpool'
 'carpool_tnc' 'transit_scooter' 'pnr_tnc' 'tnc_walk' nan 'walk_pnr'
 'transit_carpool' 'bike_walk' 'tnc_bike' 'carpool_pnr' 'walk_scooter'
 'pnr_bike']

egress_mode idx_3 find 6 rows, 0.2% of total
egress_mode idx_4 find 47 rows, 2.0% of total
egress_mode idx_5 find 28 rows, 1.2% of total
egress_mode idx_6 find 64 rows, 2.7% of total

 responses with 3 egress modes have the following unique combinations of mode categories:
['knr_tnc_walk' 'tnc_transit_walk' 'knr_pnr_transit' 'knr_transit_walk'
 'knr_tnc_transit' 'walk_bike_scooter'

Unnamed: 0,CCGID,Q2B_1_cat,Q2B_2_cat,Q2B_3_cat,Q2B_4_cat,Q2B_5_cat,Q2B_6_cat
120,122,pnr,tnc,transit,transit,transit,transit
157,159,walk,transit,transit,transit,tnc,knr
441,443,transit,transit,walk,pnr,,
570,572,transit,walk,bike,scooter,,
924,926,knr,transit,transit,walk,,
1016,1018,knr,carpool,tnc,transit,transit,transit
1053,1055,transit,,,,,
1564,1566,tnc,transit,transit,walk,,
1712,1714,carpool,tnc,scooter,other,,
1772,1774,pnr,carpool,tnc,transit,transit,transit


In [8]:
## cases with one transit mode and multiple non-transit modes - set the transit mode as after-transfer, egress_mode is 'Missing'
# CCGID 572 (transit-walk-bike-scooter): Q2B_1 after-transfer
# CCGID 1055 (transit): Q2B_1 after-transfer
# CCGID 2073 (carpool-tnc-transit-walk): Q2B_3 after-transfer
# CCGID 2147 (knr-carpool-tnc-transit): Q2B_4 after-transfer
df.loc[(df.CCGID==572) | (df.CCGID==1055) | (df.CCGID==2073) | (df.CCGID==2147),
       'egress_mode'] = 'Missing'
df.loc[(df.CCGID==572) | (df.CCGID==1055), 'first_route_after_survey_alight'] = df['Q2B_1']
df.loc[df.CCGID==2073, 'first_route_after_survey_alight'] = df['Q2B_3']
df.loc[df.CCGID==2147, 'first_route_after_survey_alight'] = df['Q2B_4']

# cases with two transit modes and multiple non-transit modes - set after-transfers as pending, egress_mode as 'Missing'
# CCGID 443 (transit-transit-walk-pnr)
# CCGID 926 (knr-transit-transit-walk)
# CCGID 1566 (tnc-transit-transit-walk)
df.loc[(df.CCGID==443) | (df.CCGID==926) | (df.CCGID==1566),
       'egress_mode'] = 'Missing'
df.loc[(df.CCGID==443) | (df.CCGID==926) | (df.CCGID==1566),
       'first_route_after_survey_alight'] = 'transit_pending'
df.loc[(df.CCGID==443) | (df.CCGID==926) | (df.CCGID==1566),
       'second_route_after_survey_alight'] = 'transit_pending'

# cases with three or more transit modes and multiple non-transit modes - set after-transfers as pending, egress_mode as 'Missing'
# CCGID 122 (pnr-tnc-transit-transit-transit-transit)
# CCGID 159 (walk-transit-transit-transit-tnc-knr)
# CCGID 1018 (knr-carpool-tnc-transit-transit-transit)
# CCGID 1774 (pnr-carpool-tnc-transit-transit-transit)
df.loc[(df.CCGID==122) | (df.CCGID==159) | (df.CCGID==1018) | (df.CCGID==1774),
       'egress_mode'] = 'Missing'
df.loc[(df.CCGID==122) | (df.CCGID==159) | (df.CCGID==1018) | (df.CCGID==1774),
       'first_route_after_survey_alight'] = 'transit_pending'
df.loc[(df.CCGID==122) | (df.CCGID==159) | (df.CCGID==1018) | (df.CCGID==1774),
       'second_route_after_survey_alight'] = 'transit_pending'
df.loc[(df.CCGID==122) | (df.CCGID==159) | (df.CCGID==1018) | (df.CCGID==1774),
       'third_route_after_survey_alight'] = 'transit_pending'

# cases with no transit mode and multiple non-transit modes - egree_mode is 'Missing', no after-transfer
# CCGID 1714 (carpool-tnc-scooter-other)
df.loc[df.CCGID==1714, 'egress_mode'] = 'Missing'


# finally, check there is no row with egress_mode.isnull()
print('{} rows is missing egress_mode'.format(df.egress_mode.isnull().sum()))

0 rows is missing egress_mode


In [9]:
# export records with 'transit_pending' for further investigation
df_pending = df.loc[(df.first_route_before_survey_board == 'transit_pending') | (
                     df.first_route_after_survey_alight == 'transit_pending')]
print('export {} rows with pending before/after transfers'.format(df_pending.shape[0]))
df_pending.to_csv(r'M:\Data\OnBoard\Data and Reports\Capitol Corridor\OD Survey 2019\pending_transfers.csv', index=False)


# drop interim fields
df.drop(columns = [x + '_name' for x in ['Q2A_1', 'Q2A_2', 'Q2A_3', 'Q2A_4', 'Q2A_5', 'Q2A_6', 
                                         'Q2B_1', 'Q2B_2', 'Q2B_3', 'Q2B_4', 'Q2B_5', 'Q2B_6']] + \
                  [x + '_cat' for x in ['Q2A_1', 'Q2A_2', 'Q2A_3', 'Q2A_4', 'Q2A_5', 'Q2A_6', 
                                        'Q2B_1', 'Q2B_2', 'Q2B_3', 'Q2B_4', 'Q2B_5', 'Q2B_6']] + \
                  ['access_count', 'egress_count', 'access_concat', 'egress_concat'], inplace=True)

export 30 rows with pending before/after transfers


### deal with 'hispanic' and 'ethnicity/race'  <a class="anchor" id="race"></a>

In [10]:
race_dict = {1: 'White',
             2: 'Black/African American',
             3: 'Asian/Pacific Islander',
             4: 'Hispanic/Latino',
             5: 'American Indian/Alaskan Native',
             6: 'Other',
             7: 'Other',
             8: 'Other',
             9: 'Mixed',
             10: 'Middle Eastern',
             11: 'East Indian/Pakistani',
             0: 'NA'}

for i in ['Q20_1', 'Q20_2', 'Q20_3', 'Q20_4']:
    df[i] = df[i].fillna(10)
    df[i].replace(to_replace = ' ', value = 10, inplace=True)
    df[i] = df[i].apply(lambda x: int(x))
    df[i+'_temp'] = df[i].map(race_dict)
    
df['race_concat'] = df['Q20_1_temp'] + '_' + df['Q20_2_temp'] + '_' + df['Q20_3_temp'] + '_' + df['Q20_4_temp']
print(list(df['race_concat'].unique()))

['Hispanic/Latino_Middle Eastern_Middle Eastern_Middle Eastern', 'White_Middle Eastern_Middle Eastern_Middle Eastern', 'Asian/Pacific Islander_Middle Eastern_Middle Eastern_Middle Eastern', 'Black/African American_Middle Eastern_Middle Eastern_Middle Eastern', 'Other_Middle Eastern_Middle Eastern_Middle Eastern', 'White_Hispanic/Latino_Middle Eastern_Middle Eastern', 'Mixed_Middle Eastern_Middle Eastern_Middle Eastern', 'White_Black/African American_Middle Eastern_Middle Eastern', 'White_Hispanic/Latino_American Indian/Alaskan Native_Middle Eastern', 'NA_Middle Eastern_Middle Eastern_Middle Eastern', 'White_Black/African American_Asian/Pacific Islander_Middle Eastern', 'American Indian/Alaskan Native_Middle Eastern_Middle Eastern_Middle Eastern', 'White_Black/African American_American Indian/Alaskan Native_Middle Eastern', 'Asian/Pacific Islander_Hispanic/Latino_Middle Eastern_Middle Eastern', 'White_Black/African American_Asian/Pacific Islander_Hispanic/Latino', 'Black/African America

In [11]:
# create 'hispanic' field
df['hispanic'] = 'NO'
df.loc[df.race_concat.str.contains('Hispanic',na=False), 'hispanic'] = 'YES'
df.hispanic.value_counts()

NO     2142
YES     264
Name: hispanic, dtype: int64

In [12]:
# create race_dmy_xx

df['race_dmy_ind'] = 0
df.loc[df.race_concat.str.contains('American Indian/Alaskan Native',na=False), 'race_dmy_ind'] = 1

df['race_dmy_hwi'] = 0
df.loc[df.race_concat.str.contains('Native Hawaiian/Pacific Islander',na=False), 'race_dmy_hwi'] = 1

df['race_dmy_blk'] = 0
df.loc[df.race_concat.str.contains('Black/African American',na=False), 'race_dmy_blk'] = 1

df['race_dmy_wht'] = 0
df.loc[df.race_concat.str.contains('White',na=False), 'race_dmy_wht'] = 1

df['race_dmy_asn'] = 0
df.loc[df.race_concat.str.contains('Asian',na=False) | df.race_concat.str.contains('East Indian/Pakistani',na=False), 'race_dmy_asn'] = 1

df['race_dmy_mdl_estn'] = 0
df.loc[df.race_concat.str.contains('Middle Eastern',na=False), 'race_dmy_mdl_estn'] = 1

# drop temp fields
df.drop(columns = ['Q20_1_temp', 'Q20_2_temp', 'Q20_3_temp', 'Q20_4_temp', 'race_concat'], inplace=True)

### deal with trip purpose <a class="anchor" id="trip_purp"></a>

In [13]:
trip_purp_dict = {
    '1': 'Commute to/from work', 
    '2': 'Business travel', 
    '3': 'Travel to/from school', 
    '4': 'Leisure/Recreation', 
    '5': 'Visit family/friends', 
    '6': 'Vacation', 
    '7': 'Other', 
    '8': 'Personal / Family business', 
    '9': 'Travel to or from school', 
    '10': 'Other (specify)', 
    '11': 'School/ Group Trip', 
    '12': 'Church/volunteering/political', 
    '13': 'Just to enjoy the train/Outing to ride train', 
    '14': 'Moving/traveling between homes', 
    '15': 'Going Home', 
    '16': 'Airport trip'}

for colname in ['Q8_1','Q8_2','Q8_3','Q8_4']:
    df[colname].fillna(0, inplace=True)
    df[colname] = df[colname].apply(lambda x: str(int(x)))
    df[colname] = df[colname].map(trip_purp_dict)

# print out all possible combinations in the data
display(df[['Q8_1','Q8_2','Q8_3','Q8_4']].dropna(how='all', axis=1).drop_duplicates())

# "work" takes precedent when non-work is in front of work
df.loc[(df.Q8_1 == 'Leisure/Recreation') & (df.Q8_2 == 'Commute to/from work'), 'trip_purp'] = df['Q8_2']

# otherwise, Q8_1 seems the proper trip_purp
df['trip_purp'] = df['Q8_1']
df.loc[df.trip_purp.isnull(), 'trip_purp'] = 'missing' 

Unnamed: 0,Q8_1,Q8_2,Q8_3,Q8_4
0,Visit family/friends,,,
1,Leisure/Recreation,,,
2,Travel to/from school,,,
3,Leisure/Recreation,Visit family/friends,,
4,,,,
6,Vacation,,,
8,Other,,,
14,Commute to/from work,,,
23,Business travel,,,
34,Visit family/friends,Vacation,,


### code home lat/lon based on zipcode <a class="anchor" id="home_lat_lon"></a>

In [14]:
# read zipcode spatial data
zip_shp = gpd.read_file(r'M:\Data\GIS layers\zip_code_sr\zip_code_sr.shp')

# get lat/lon
def getXY(pt):
    return (pt.x, pt.y)
centroidseries = zip_shp['geometry'].centroid
x,y = [list(t) for t in zip(*map(getXY, centroidseries))]

zip_shp['lat'] = y
zip_shp['lon'] = x

zip_shp['postcode'] = zip_shp['postcode'].apply(lambda x: int(x))

# merge into the survey data
df = df.merge(zip_shp[['postcode', 'lat', 'lon']], left_on='Q22', right_on='postcode', how='left')

no_latlon = df.loc[df.lat.isnull() | df.lon.isnull()].shape[0]
print('{} records are missing home lat/lon, accounting for {:.1%} of all'.format(no_latlon, no_latlon/df.shape[0]))

# rename
df.rename(columns = {'lat': 'home_lat',
                     'lon': 'home_lon'}, inplace=True)

629 records are missing home lat/lon, accounting for 26.1% of all


### code board/alight station name and lat/lon <a class="anchor" id="station_lat_lon"></a>

In [15]:
# read x/y data
station_xy = pd.read_csv(r'M:\Data\OnBoard\Data and Reports\Capitol Corridor\OD Survey 2019\passenger_rail_stations.csv',
                         usecols = ['routename', 'station_na', 'x', 'y'])

station_xy_cc  = station_xy.loc[station_xy.routename == 'Capitol Corridor']

# rename 'Santa Clara' to 'Santa Clara Great America' and add one row for 'Santa Clara University'
station_xy_cc.loc[station_xy_cc.station_na == 'Santa Clara', 'station_na'] = 'Santa Clara Great America'

add_station = {'routename': 'Capitol Corridor', 'station_na': 'Santa Clara University', 'x': -121.9396494, 'y': 37.3517273} 
station_xy_cc = station_xy_cc.append(add_station, ignore_index = True)


# build dictionary for Boarding/Alighting Station
# for survey responses with value 21 Fairfield (Unspecified), 
# 22 Oakland (Unspecified), 23 Santa Clara (Unspecified),
# need to re-assign to the station with more ridership within the same city

cc_station_dict = {'1': 'Auburn',
                   '2': 'Berkeley',
                   '3': 'Colfax',
                   '4': 'Davis',
                   '5': 'Emeryville',
                   '6': 'Suisun-fairfield',
                   '7': 'Fairfield/Vacaville Station',
                   '8': 'Fremont',
                   '9': 'Hayward',
                   '10': 'Martinez',
                   '11': 'Jack London Square',
                   '12': 'Oakland Coliseum',
                   '13': 'Richmond',
                   '14': 'Rocklin',
                   '15': 'Roseville',
                   '16': 'Sacramento',
                   '17': 'San Jose',
                   '18': 'Santa Clara Great America',
                   '19': 'Santa Clara University',
                   '20': 'Other',
                   '21': 'Suisun-fairfield',
                   '22': 'Jack London Square',
                   '23': 'Santa Clara Great America'}

# merge station names into the survey data
for colname in ['Q1A', 'Q1B']:
    df[colname] = df[colname].fillna(0)
    df[colname] = df[colname].apply(lambda x: str(x))
    df[colname] = df[colname].map(cc_station_dict)
    
# merge lat/lon into the survey data
df_board = df[['CCGID', 'Q1A']].merge(station_xy_cc, left_on='Q1A', right_on='station_na', how='left')
df_board = df_board[['CCGID', 'x', 'y']].rename(columns = {'x': 'survey_board_lon',
                                                        'y': 'survey_board_lat'})

df_alight = df[['CCGID', 'Q1B']].merge(station_xy_cc, left_on='Q1B', right_on='station_na', how='left')
df_alight = df_alight[['CCGID', 'x', 'y']].rename(columns = {'x': 'survey_alight_lon',
                                                          'y': 'survey_alight_lat'})

df = df.merge(df_board, on='CCGID', how='left').merge(df_alight, on='CCGID', how='left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


### update 'weight' <a class="anchor" id="weight"></a>

In [16]:
weight_df = pd.read_csv(r'M:\Data\OnBoard\Data and Reports\Capitol Corridor\OD Survey 2019\Weighting\Capitol_Corridor_Weights.csv')
print('read {} rows of updated weights: \n{}'.format(weight_df.shape[0], weight_df.head()))

df = df.merge(weight_df, on='CCGID', how='left')
print('total weights: {}'.format(df.weight.sum()))

# drop the raw 'weight' column
df.drop(columns = ['WEIGHT'], inplace=True)

read 2406 rows of updated weights: 
   CCGID  weight
0      1     0.0
1      2     0.0
2      3     0.0
3      4     0.0
4      5     0.0
total weights: 5762.2030651341


### impute year_born from 'age group' <a class="anchor" id="age"></a>

In [17]:
year_born_dict = {1: 2004,   # Under 18, 2019-15=2004
                  2: 1998,   # 18-24, 2019-21=1998
                  3: 1989,   # 25-34, 2019-30=1989
                  4: 1979,   # 35-44, 2019-40=1979
                  5: 1969,   # 45-54, 2019-50=1969
                  6: 1959,   # 55-64, 2019-60=1959
                  7: 1949,   # 65 and older, 2019-70=1949
                 }

df['year_born_four_digit'] = df['Q18'].map(year_born_dict)
df.year_born_four_digit.value_counts()

1989.0    520
1979.0    497
1969.0    421
1959.0    415
1949.0    209
1998.0    202
2004.0     36
Name: year_born_four_digit, dtype: int64

### export survey data <a class="anchor" id="survey_export"></a>

In [18]:
df.rename(columns = {'CCGID': 'ID'}, inplace=True)

final_fname = 'CAPCO19 Data-For MTC_NO POUND OR SINGLE QUOTE.csv'
print('export {} rows of data to {}'.format(df.shape[0], final_fname))

df.to_csv(r'M:\Data\OnBoard\Data and Reports\Capitol Corridor\OD Survey 2019\As CSV\CAPCO19 Data-For MTC_NO POUND OR SINGLE QUOTE.csv', index=False)

export 2406 rows of data to CAPCO19 Data-For MTC_NO POUND OR SINGLE QUOTE.csv


## build standard dictionary <a class="anchor" id="standard_dict"></a>

### read raw variable dictionary 'Field Guide'  <a class="anchor" id="raw_dict"></a>

In [19]:
# read raw survey dictionary
survey_dict = pd.read_excel(r'M:\Data\OnBoard\Data and Reports\Capitol Corridor\OD Survey 2019\CAPCO19 Data-For MTC.xlsx',
                            sheet_name='Field Guide')

# back fill name in 'Field' column
survey_dict.loc[(survey_dict.Field == '       ') | (survey_dict.Field == '  ') | (survey_dict.Field == '        '),
                'Field'] = np.nan
survey_dict['Field'].fillna(method='ffill', inplace=True)

# rename to correctly reflect the info
survey_dict.rename(columns={'Field': 'Survey_Variable',
                            'Question/Description': 'Survey_Response',
                            'Unnamed: 2': 'Generic_Response_old'}, inplace=True)

# only keep needed columns
var_dict = survey_dict[['Survey_Variable', 'Survey_Response', 'Generic_Response_old']].drop_duplicates()

In [20]:
var_dict.head(30)

Unnamed: 0,Survey_Variable,Survey_Response,Generic_Response_old
0,RespNum,Software added ID number,
1,CCGID,CCG ID Number,
2,TRAIN,Train Number,
3,INTDATE,Interview Date,
4,PERIOD,STRATA,
5,PERIOD,1,Weekday
6,PERIOD,2,Weekend
7,Language,Language of survey,
8,Language,1,English
9,Language,2,Spanish


### add rows to the dictionary for the new fields added to the survey data <a class="anchor" id="add_row"></a>

In [21]:
# add rows for 'access_mode'
access_mode_dict = var_dict.loc[(var_dict.Survey_Variable == 'Q2B_1-Q2B_6') & (var_dict.Generic_Response_old.notnull())]
access_mode_dict.loc[access_mode_dict.Survey_Variable == 'Q2B_1-Q2B_6', 'Survey_Variable'] = 'access_mode'
# display(access_mode_dict)
var_dict = var_dict.append(access_mode_dict, ignore_index=True)

# replace 'Q2A_1-Q2A_6' with new field name 'egress_mode'
var_dict.loc[(var_dict.Survey_Variable == 'Q2B_1-Q2B_6') & (var_dict.Generic_Response_old.notnull()), 'Survey_Variable'] = 'egress_mode'

# drop the categoric dictionary part of Q1B
var_dict = var_dict.loc[(var_dict.Survey_Variable != 'Q1B') | (
                             (var_dict.Survey_Variable == 'Q1B') & (var_dict.Survey_Response == 'Alighting Station'))]

# add race and hispanic fields to the dictionary
race_dict = pd.DataFrame(np.array([['hispanic', 'YES', 'YES'],
                                   ['hispanic', 'NO', 'NO'],
                                   ['race_dmy_ind', 1, 1],
                                   ['race_dmy_hwi', 1, 1],
                                   ['race_dmy_blk', 1, 1],
                                   ['race_dmy_wht', 1, 1],
                                   ['race_dmy_asn', 1, 1],
                                   ['race_dmy_mdl_estn', 1, 1],
                                   ['race_dmy_ind', 0, 0],
                                   ['race_dmy_hwi', 0, 0],
                                   ['race_dmy_blk', 0, 0],
                                   ['race_dmy_wht', 0, 0],
                                   ['race_dmy_asn', 0, 0],
                                   ['race_dmy_mdl_estn', 0, 0]]),
                         columns=['Survey_Variable', 'Survey_Response', 'Generic_Response_old'])
# display(race_dict)
var_dict = var_dict.append(race_dict, ignore_index=True)


# add trip_purp rows
trip_purpose_dict = df[['trip_purp']].drop_duplicates()
trip_purpose_dict.columns = ['Survey_Response']
trip_purpose_dict['Survey_Variable'] = 'trip_purp'
trip_purpose_dict['Generic_Response_old'] = trip_purpose_dict['Survey_Response']
# trip_purpose_dict[['Survey_Variable', 'Survey_Response', 'Generic_Response_old']]
var_dict = var_dict.append(trip_purpose_dict, ignore_index=True)


# add home lat/lon rows
home_dict = pd.DataFrame(np.array([['home_lat', 'NONCATEGORICAL', np.nan],
                                   ['home_lon', 'NONCATEGORICAL', np.nan]]),
                         columns=['Survey_Variable', 'Survey_Response', 'Generic_Response_old'])
var_dict = var_dict.append(home_dict, ignore_index=True)


# add survey_board/alight lat/lon rows
board_alight_dict = pd.DataFrame(np.array([['survey_board_lon', 'NONCATEGORICAL', np.nan],
                                           ['survey_board_lat', 'NONCATEGORICAL', np.nan],
                                           ['survey_alight_lon', 'NONCATEGORICAL', np.nan],
                                           ['survey_alight_lat', 'NONCATEGORICAL', np.nan]]),
                                 columns=['Survey_Variable', 'Survey_Response', 'Generic_Response_old'])
var_dict = var_dict.append(board_alight_dict, ignore_index=True)


# add transfer routes rows
trans_routes_dict = pd.DataFrame(np.array([['first_route_before_survey_board', 'NONCATEGORICAL', np.nan],
                                           ['second_route_before_survey_board', 'NONCATEGORICAL', np.nan],
                                           ['third_route_before_survey_board', 'NONCATEGORICAL', np.nan],
                                           ['first_route_after_survey_alight', 'NONCATEGORICAL', np.nan],
                                           ['second_route_after_survey_alight', 'NONCATEGORICAL', np.nan],
                                           ['third_route_after_survey_alight', 'NONCATEGORICAL', np.nan]]),
                                 columns=['Survey_Variable', 'Survey_Response', 'Generic_Response_old'])
var_dict = var_dict.append(trans_routes_dict, ignore_index=True)

# add new weight row
var_dict.loc[len(var_dict.index)] = ['weight', 'NONCATEGORICAL', np.nan]

# add year_born row
var_dict.loc[len(var_dict.index)] = ['year_born_four_digit', 'NONCATEGORICAL', np.nan]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


### add default fields in the standard dictionary <a class="anchor" id="add_fields"></a>

In [22]:
# add columns 'Generic_variable'
var_dict['Generic_Variable'] = ''
var_dict.loc[var_dict.Survey_Variable == 'CCGID', 'Generic_Variable'] = 'ID'
var_dict.loc[var_dict.Survey_Variable == 'CCGID', 'Survey_Variable'] = 'ID'

var_dict.loc[var_dict.Survey_Variable == 'TRAIN', 'Generic_Variable'] = 'route'
var_dict.loc[var_dict.Survey_Variable == 'INTDATE', 'Generic_Variable'] = 'date_string'
var_dict.loc[var_dict.Survey_Variable == 'PERIOD', 'Generic_Variable'] = 'weekpart'
var_dict.loc[var_dict.Survey_Variable == 'Language', 'Generic_Variable'] = 'interview_language'
var_dict.loc[var_dict.Survey_Variable == 'Language', 'Survey_Variable'] = 'LANGUAGE'

var_dict.loc[var_dict.Survey_Variable == 'Q1A', 'Generic_Variable'] = 'onoff_enter_station'
var_dict.loc[var_dict.Survey_Variable == 'Q1B', 'Generic_Variable'] = 'onoff_exit_station'

var_dict.loc[var_dict.Survey_Variable == 'Q9', 'Generic_Variable'] = 'fare_category'
var_dict.loc[var_dict.Survey_Variable == 'Q10', 'Generic_Variable'] = 'fare_medium'

var_dict.loc[var_dict.Survey_Variable == 'Q17', 'Generic_Variable'] = 'gender'
var_dict.loc[var_dict.Survey_Variable == 'Q19', 'Generic_Variable'] = 'household_income'

var_dict.loc[var_dict.Survey_Variable == 'Q21', 'Generic_Variable'] = 'persons'

# newly created fields:
var_dict.loc[var_dict.Survey_Variable.str.contains('race_dmy_',na=False), 'Generic_Variable'] = var_dict['Survey_Variable']

for varname in ['access_mode', 'egress_mode', 'trip_purp', 'home_lat', 'home_lon', 'weight', 'hispanic',
                'survey_board_lon', 'survey_board_lat', 'survey_alight_lon', 'survey_alight_lat',
                'first_route_before_survey_board', 'second_route_before_survey_board',
                'third_route_before_survey_board', 'first_route_after_survey_alight',
                'second_route_after_survey_alight', 'third_route_after_survey_alight']:
    var_dict.loc[var_dict.Survey_Variable == varname,  'Generic_Variable'] = varname

In [23]:
# add values for 'Survey_Response' and 'Generic_Response' for noncanonical variables
var_dict['Generic_Response'] = ''

for varname in ['ID', 'TRAIN', 'INTDATE', 'Q1A', 'Q1B', 'Q22', 'weight']:
    var_dict.loc[var_dict.Survey_Variable == varname, 'Survey_Response'] = 'NONCATEGORICAL'
    
for varname in ['ID', 'TRAIN', 'INTDATE', 'Q1A', 'Q1B', 'Q22', 'weight', 'year_born_four_digit',
                'home_lat', 'home_lon',
                'survey_board_lon', 'survey_board_lat', 'survey_alight_lon', 'survey_alight_lat',
                'first_route_before_survey_board', 'second_route_before_survey_board',
                'third_route_before_survey_board', 'first_route_after_survey_alight',
                'second_route_after_survey_alight', 'third_route_after_survey_alight']:
    var_dict.loc[var_dict.Survey_Variable == varname, 'Generic_Response'] = 'NONCATEGORICAL'


In [24]:
var_dict['operator'] = 'Capitol Corridor'
var_dict['Survey_year'] = 2019

### check consistency between values in survey data and in the dictionary <a class="anchor" id="check"></a>

In [25]:
# check if all values in the survey data are represented in the dictionary

var_dict_cat = var_dict.loc[(var_dict.Survey_Response != 'NONCATEGORICAL') & (
                             var_dict.Generic_Response_old.notnull()) & (
                             var_dict.Generic_Variable != '')]

for fieldname in list(var_dict_cat.Survey_Variable.unique()):
#     fieldname = fieldname.upper()
    print(fieldname)
    values = var_dict.loc[var_dict.Survey_Variable == fieldname]
    
    if fieldname in ['race_dmy_ind', 'race_dmy_hwi', 'race_dmy_blk', 'race_dmy_wht', 'race_dmy_asn', 'race_dmy_mdl_estn']:
        values.Survey_Response = values.Survey_Response.apply(lambda x: int(x))
        
    comp = df.merge(values, left_on = fieldname, right_on = 'Survey_Response', how='left')
    comp_diff = comp.loc[comp[fieldname].isnull()]
    if comp_diff.shape[0] > 0:
        print(comp_diff[fieldname].unique())

PERIOD
LANGUAGE
egress_mode
Q9
Q10
Q17
Q19
Q21
access_mode
hispanic
race_dmy_ind
race_dmy_hwi
race_dmy_blk
race_dmy_wht
race_dmy_asn
race_dmy_mdl_estn
trip_purp


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


### export raw standard dictionary <a class="anchor" id="export_dict"></a>

In [26]:
# will manually add 'Generic_Response' for canonical variables in the exported file

# only keep rows for needed variables
var_dict = var_dict.loc[(var_dict.Generic_Variable != '') & (
    (var_dict.Generic_Response == 'NONCATEGORICAL') | var_dict.Generic_Response_old.notnull())]

print('export raw dictionary with {} variables:\n{}'.format(len(var_dict.Survey_Variable.unique()),
                                                          var_dict.Survey_Variable.unique()))

var_dict.to_csv(r'M:\Data\OnBoard\Data and Reports\Capitol Corridor\OD Survey 2019\var_dict_raw.csv', index=False)

export raw dictionary with 35 variables:
['ID' 'TRAIN' 'INTDATE' 'PERIOD' 'LANGUAGE' 'Q1A' 'Q1B' 'egress_mode' 'Q9'
 'Q10' 'Q17' 'Q19' 'Q21' 'access_mode' 'hispanic' 'race_dmy_ind'
 'race_dmy_hwi' 'race_dmy_blk' 'race_dmy_wht' 'race_dmy_asn'
 'race_dmy_mdl_estn' 'trip_purp' 'home_lat' 'home_lon' 'survey_board_lon'
 'survey_board_lat' 'survey_alight_lon' 'survey_alight_lat'
 'first_route_before_survey_board' 'second_route_before_survey_board'
 'third_route_before_survey_board' 'first_route_after_survey_alight'
 'second_route_after_survey_alight' 'third_route_after_survey_alight'
 'weight']


## build canonical route crosswalk <a class="anchor" id="canonical_route"></a>

In [27]:
canonical_routes = pd.DataFrame(np.array([[5, 'BART___BART',             'BART',     'heavy rail'],
                                          [6, 'CALTRAIN___CALTRAIN',     'Caltrain', 'commuter rail'],
                                          [7, 'Missing___missing',       'Missing',  'light rail'],
                                          [8, 'AMTRAK___Amtrak Shuttle', 'AMTRAK',   'local bus'],
                                          [9, 'AMTRAK___AMTRAK',         'AMTRAK',   'commuter rail'],
                                          [10,'Missing___missing',       'Missing',  'local_bus']]),
                                columns=['survey_name','canonical_name','canonical_operator','technology'])

canonical_routes['survey'] = 'Capitol Corridor'
canonical_routes['survey_year'] = 2019
canonical_routes.to_csv(r'M:\Data\OnBoard\Data and Reports\Capitol Corridor\OD Survey 2019\routes_canonical.csv', index=False)