In [1]:
import pandas as pd

In [2]:
df_raw = pd.read_excel(r'M:\Data\OnBoard\Data and Reports\SMART\As CSV\SMART Standardized Final Data NO POUND NO SINGLE QUOTE.xlsx')
print('read {} records, with {} unique RespNum'.format(df_raw.shape[0], len(df_raw.RespNum.unique())))

read 411 records, with 411 unique RespNum


In [3]:
# remove space in field names
df = df_raw.copy()
df.rename(columns = {'RespNum': 'ID'}, inplace=True)
cols = [x.strip().replace(' ','_') for x in list(df)]
df.columns = cols

In [4]:
# create survey route based on board and alight station names
df['survey_route'] = df['Board_Station'] + ' - ' + df['Alight_station']

In [5]:
# aggregate access_mode and egress_mode
df['access_mode_final'] = df['access_mode_Leg1']
df.loc[df.access_mode_final == 'PUBLIC TRANSIT', 'access_mode_final'] = df['access_mode_Leg2']

df['egress_mode_final'] = df['egress_mode_Leg1']
df.loc[df.egress_mode_final == 'PUBLIC TRANSIT', 'egress_mode_final'] = df['egress_mode_Leg2']

In [6]:
# aggregate transfer agency and route info
df['first_route_before_survey_board'] = df['1_system_before'] + '___' + df['1_route_before'].astype(str)
df['second_route_before_survey_board'] = df['2_system_before'] + '___' + df['2_route_before'].astype(str)
df['third_route_before_survey_board'] = df['3_system_before'] + '___' + df['3_route_before'].astype(str)
df['first_route_after_survey_alight'] = df['1_after_system'] + '___' + df['1_route_after_system'].astype(str)
df['second_route_after_survey_alight'] = df['2_after_system'] + '___' + df['2_route_after_system'].astype(str)
df['third_route_after_survey_alight'] = df['3_after_system'] + '___' + df['3_route_after_system'].astype(str)

# use interview start time as the "time_string"
df['time_string'] = df['interview_start_time']

df.to_csv(r'M:\Data\OnBoard\Data and Reports\SMART\As CSV\SMART Standardized Final Data_addRouteCols_NO POUND NO SINGLE QUOTE.csv', index=False)

In [7]:
routes = pd.DataFrame(columns = ['survey_name'])
for i in ['first_route_before_survey_board', 'second_route_before_survey_board', 'third_route_before_survey_board',
          'first_route_after_survey_alight', 'second_route_after_survey_alight', 'third_route_after_survey_alight']:
    route_unique = df[[i]]
    route_unique.columns = ['survey_name']
    routes = pd.concat([routes, route_unique])

routes_clean = routes.loc[routes.survey_name.notnull()]
routes_clean.drop_duplicates(inplace=True)
routes_clean['survey'] = 'Sonoma-Marin Area Rail Transit'
routes_clean['survey_year'] = 2018

print(routes_clean.shape)
#routes_clean[['survey','survey_year','survey_name']].to_csv(r'M:\Data\OnBoard\Data and Reports\SMART\As CSV\all_routes_raw.csv', index=False)

(52, 3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [8]:
# bring in standard dictionary to check field consistency

# dictionary for Smart survey
var = pd.read_csv(r'M:\Data\OnBoard\Data and Reports\SMART\As CSV\variables_dictionary.csv',
                  encoding = "ISO-8859-1", engine='python')

# standard dictionary
var_standard = pd.read_csv(r'C:\Users\ywang\Documents\GitHub\onboard-surveys\make-uniform\production\Dictionary for Standard Database.csv')
var_standard.columns = [x+'_s' for x in list(var_standard)]

# merge
var_merge = var.merge(var_standard, left_on='Generic_Variable', right_on='Generic_Variable_s', how='outer')

# check if 'Generic_Variable' in Smart dictionary matches the standard 'Generic_Variable'. chk1 should be empty
chk1 = var_merge.loc[(var_merge.Generic_Variable.notnull()) & (var_merge.Generic_Variable_s.isnull())]
print('Generic_Variable that should not exit:')
print(chk1.Generic_Variable.unique())
print()

# check if columns names in survey data matches 'Survey_Variable' in Smart dictionary.
# the following loops should not include variables that are needed for standardization

for i in var.loc[var.Generic_Variable.notnull()]['Survey_Variable']:
    if i not in list(df):
        print(i)
        
for i in list(df):
    if i not in list(var.Survey_Variable):
        print(i)

Generic_Variable that should not exit:
[]

CCGID
RunID
WC
sch
school_name
college_name
access_mode
access_mode_Leg1
access_mode_Leg2
access_mode_Leg3
access_mode_Leg4
egress_mode
egress_mode_Leg1
egress_mode_Leg2
egress_mode_Leg3
egress_mode_Leg4
1_system_before
1_route_before
2_system_before
2_route_before
3_system_before
3_route_before
route
Board_Station
Alight_station
1_after_system
1_route_after_system
2_after_system
2_route_after_system
3_after_system
3_route_after_system
age
race_other
livebay
ST


In [9]:
# check if all the values in the survey data are included in Smart dictionary
# look at non-categorical variables; "diff" should be empty or only contains nan

var_clean = var[['operator', 'Survey_year', 'Survey_Variable', 'Survey_Response', 
                 'Generic_Variable', 'Generic_Response']].drop_duplicates()
var_clean = var_clean.loc[var_clean.Generic_Variable.notnull()]

for i in var_clean.loc[var_clean.Survey_Response != 'NONCATEGORICAL']['Survey_Variable'].unique():
    print(i)
    df_sub = df[['ID', i]]
    var_sub = var_clean.loc[var_clean.Survey_Variable == i]

    if i in ['race_dmy_ind', 'race_dmy_hwi', 'race_dmy_blk', 'race_dmy_wht', 'race_dmy_asn', 'WC', 
             'xfers_after', 'xfers_before']:
        var_sub.Survey_Response = var_sub.Survey_Response.apply(lambda x: int(x))

    if i in ['depart_hour', 'return_hour']:
        df_sub[i] = df_sub[i].apply(lambda x: str(x))

    compare = df_sub.merge(var_sub, left_on=i, right_on='Survey_Response', how='left')
    diff = compare.loc[compare.Generic_Response.isnull()]
    if diff.shape[0] > 0:
        print(diff[i].unique())

access_mode_final
at_school_after_dest_purp
[nan]
at_school_prior_to_orig_purp
[nan]
at_work_after_dest_purp
[nan]
at_work_prior_to_orig_purp
[nan]
cars
day_of_week
depart_hour
['nan']
dest_purp
DIRECTION
DTYPE
egress_mode_final
engspk
[nan]
fare
farecat
gender
hh
hhwork
hisp
income
language_at_home_binary
language_at_home_detail
[nan]
Mode
orig_purp
race_dmy_asn
race_dmy_blk
race_dmy_hwi
race_dmy_ind
race_dmy_wht
return_hour
['nan']
sch.1
STRATA
work_status
xfers_after
[nan]
xfers_before
[nan]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [10]:
# finally, check all necessary fields are included, and export
print(var_clean.Generic_Variable.unique())
var_clean.to_csv(r'M:\Data\OnBoard\Data and Reports\SMART\As CSV\vars_for_standard_dictionary.csv', index=False)

['route' 'access_mode' 'at_school_after_dest_purp'
 'at_school_prior_to_orig_purp' 'at_work_after_dest_purp'
 'at_work_prior_to_orig_purp' 'year_born_four_digit' 'vehicles'
 'date_string' 'day_of_week' 'depart_hour' 'dest_purp' 'direction'
 'weekpart' 'egress_mode' 'dest_lat' 'dest_lon' 'eng_proficient'
 'fare_medium' 'fare_category' 'first_board_lat' 'first_board_lon'
 'gender' 'persons' 'workers' 'hispanic' 'home_lat' 'home_lon' 'ID'
 'household_income' 'interview_end_time' 'interview_start_time'
 'language_at_home_binary' 'language_at_home_detail' 'last_alight_lat'
 'last_alight_lon' 'survey_type' 'orig_purp' 'orig_lat' 'orig_lon'
 'race_other_string' 'race_dmy_asn' 'race_dmy_blk' 'race_dmy_hwi'
 'race_dmy_ind' 'race_dmy_wht' 'return_hour' 'student_status' 'school_lat'
 'school_lon' 'time_period' 'time_string' 'tweight' 'weight'
 'workplace_lat' 'workplace_lon' 'work_status'
 'number_transfers_alight_dest' 'number_transfers_orig_board'
 'first_route_before_survey_board' 'second_rout