In [1]:
import pandas as pd
import numpy as np

In [2]:
df_raw = pd.read_excel(r'M:\Data\OnBoard\Data and Reports\Marin Transit\Final Data\marin transit_data file_finalreweighted043018_01222021.xlsx', sheet_name='MarinTransit_Data File')
print('dataset has {} records, with {} unique RespNum and {} unique CCGID'.format(df_raw.shape[0],
                                                                                  len(df_raw.sys_RespNum.unique()),
                                                                                  len(df_raw.CCGID.unique())))
print('sum of WEIGHT of all records: {}'.format(df_raw.WEIGHT.sum()))
print('sum of TWEIGHT of all records: {}'.format(df_raw.TWEIGHT.sum()))

dataset has 1216 records, with 1216 unique RespNum and 1216 unique CCGID
sum of WEIGHT of all records: 18016.96681668181
sum of TWEIGHT of all records: 53322.704040519806


In [3]:
df = df_raw.copy()

# remove space in columns names
cols = [x.strip().replace(' ','_') for x in list(df)]
df.columns = cols

# use sys_RespNum as ID
df['ID'] = df['sys_RespNum']

# the data doesn't have time_string
df['time_string'] = np.nan

# convert the data type of the 'DATE' variable from datetime to string and then remove the hour/min/sec part
df['DATE'] = df['DATE'].apply(lambda x: str(x)[:10])
print(df.DATE.unique())

# there is a '2027-05-24' which should be '2017-05-24'
df.loc[df.DATE == '2027-05-24','DATE'] = '2017-05-24'
print(df.DATE.unique())

['2017-03-28' '2017-03-29' '2017-03-31' '2017-04-01' '2017-04-03'
 '2017-04-08' '2017-03-30' '2017-04-06' '2017-04-09' '2017-04-17'
 '2017-04-21' '2017-04-23' '2017-04-24' '2017-04-28' '2017-04-30'
 '2017-04-07' '2017-04-04' '2017-04-05' '2017-04-18' '2017-04-19'
 '2017-04-20' '2017-04-22' '2017-04-27' '2017-04-29' '2017-05-01'
 '2017-05-02' '2017-05-06' '2017-05-09' '2017-05-10' '2017-05-11'
 '2017-05-24' '2017-04-25' '2017-04-26' '2017-05-03' '2017-05-05'
 '2017-05-08' '2017-04-02' '4/26/2017' '2017-05-04' '2017-05-07'
 '2017-03-09' '2017-03-16' '2017-03-01' '2017-03-13' '2017-03-08'
 '4/18/2017' '4/23/2017' '4/20/2017' '4/24/2017' '4/21/2017' '4/22/2017'
 '4/25/2017' '4/3/2017' '4/17/2017' '4/28/2017' '4/27/2017' '4/7/2017'
 '3/31/2017' '3/1/2017' '3/30/2017' '3/3/2017' '3/16/2017' '3/8/2017'
 '3/13/2017' '4/19/2017' '2017-05-18' '2027-05-24']
['2017-03-28' '2017-03-29' '2017-03-31' '2017-04-01' '2017-04-03'
 '2017-04-08' '2017-03-30' '2017-04-06' '2017-04-09' '2017-04-17'
 '2017-04

In [4]:
# depending on the answer to the "Morebus" question (if has transfer), 
# access_mode could be 'Gettolotsbus_c1' or 'Getto1bus_c1'
# egress_mode could be 'Gettolotsbus_c2' or 'Getto1bus_c2'
# therefore, create new columns to consolidate these fields

for i in ['Gettolotsbus_c1', 'Getto1bus_c1', 'Gettolotsbus_c2', 'Getto1bus_c2']:
    df[i].replace(to_replace = ' ', value = np.nan, inplace=True)
    
df['access_mode'] = df['Gettolotsbus_c1']
df.loc[df.access_mode.isnull(), 'access_mode'] = df['Getto1bus_c1']

df['egress_mode'] = df['Gettolotsbus_c2']
df.loc[df.egress_mode.isnull(), 'egress_mode'] = df['Getto1bus_c2']

In [5]:
# create new field to aggregate survey route number and direction
df['survey_route'] = df['ROUTE'].astype(str) + df['DIR'].apply(lambda x: x[:1])

In [6]:
# code transfer system and create columns for 'system + route' which will be used for canonical_route mapping

system_dict = {1: 'Marin Transit',
               2: 'Golden Gate Transit',
               3: 'Sonoma County Transit',        
               4: 'Santa Rosa CityBus',          
               5: 'Golden Gate Ferry',   
               6: 'Napa Vine',
               7: 'SolTrans (Solano County Transit)',
               8: 'FAST (Fairfield-Suisun Transit)',       
               9: 'Vacaville City Coach', 
               10: 'BART',
               11: 'Muni',
               12: 'AC Transit',
               13: 'SamTrans',
               14: 'Caltrain',
               15: 'VTA',
               16: 'Tri Delta Transit',
               17: 'WestCat',
               18: 'County Connection',
               19: 'WHEELS', 
               20: 'Other (not specified)'}

for i in ['sys1', 'sys2', 'sys3', 'sys4']:
    #print(i)
    #print(df[i])
#     df[i].replace(to_replace = ' ', value = 0, inplace=True)
    df[i] = df[i].fillna(0)
    df[i] = df[i].apply(lambda x: int(x))
    df[i+'_temp'] = df[i].map(system_dict)

# for i in ['businfo1', 'businfo2', 'businfo3', 'businfo4']:
#     df[i].replace(to_replace = ' ', value = np.nan, inplace=True)  

df['route1'] = df['sys1_temp'] + '___' + df['businfo1'].astype(str)
df['route2'] = df['sys2_temp'] + '___' + df['businfo2'].astype(str)
df['route3'] = df['sys3_temp'] + '___' + df['businfo3'].astype(str)
df['route4'] = df['sys4_temp'] + '___' + df['businfo4'].astype(str)

In [7]:
# route1/2/3/4 include the current survey route, so need to identify before transfers and after transfers

df['first_route_before_survey_board'] = ''
df['second_route_before_survey_board'] = ''
df['third_route_before_survey_board'] = ''
df['first_route_after_survey_alight'] = ''
df['second_route_after_survey_alight'] = ''
df['third_route_after_survey_alight'] = ''

# add 'Marin Transit' to survey route name in order to compare
df['survey_route_temp'] = 'Marin Transit___' + df['survey_route']

# if the first route is the survey route, then the later routes are after transfers
df['first_route_idx'] = df.survey_route_temp == df.route1
df.loc[df.first_route_idx == True, 'first_route_after_survey_alight'] = df['route2']
df.loc[df.first_route_idx == True, 'second_route_after_survey_alight'] = df['route3']
df.loc[df.first_route_idx == True, 'third_route_after_survey_alight'] = df['route4']

# if the second route is the survey route, then route1 is before transfer and route3/route4 are after transfers
df['second_route_idx'] = df.survey_route_temp == df.route2
df.loc[df.second_route_idx == True, 'first_route_before_survey_board'] = df['route1']
df.loc[df.second_route_idx == True, 'first_route_after_survey_alight'] = df['route3']
df.loc[df.second_route_idx == True, 'second_route_after_survey_alight'] = df['route4']

# if the third route is the survey route, then route1/route2 are before transfers and route4 is after transfer
df['third_route_idx'] = df.survey_route_temp == df.route3
df.loc[df.third_route_idx == True, 'first_route_before_survey_board'] = df['route1']
df.loc[df.third_route_idx == True, 'second_route_before_survey_board'] = df['route2']
df.loc[df.third_route_idx == True, 'first_route_after_survey_alight'] = df['route4']

# if the fourth route is the survey route, then route1/route2/route3 are before transfers
df['fourth_route_idx'] = df.survey_route_temp == df.route4
df.loc[df.fourth_route_idx == True, 'first_route_before_survey_board'] = df['route1']
df.loc[df.fourth_route_idx == True, 'second_route_before_survey_board'] = df['route2']
df.loc[df.fourth_route_idx == True, 'third_route_before_survey_board'] = df['route3']

In [8]:
# derive first_board_lat/lon and last_alight_lat/lon

# first_board_lat/lon = ONBUS-LAT1/LONG1
df['first_board_lat'] = df['ONBUS-LAT1']
df['first_board_lon'] = df['ONBUS-LONG1']

# last_alight_lat/lon = the last available OFFBUS-LAT/LONG
df['last_alight_lat'] = df['OFFBUS-LAT4']
df.loc[df.last_alight_lat.isnull(), 'last_alight_lat'] = df['OFFBUS-LAT3']
df.loc[df.last_alight_lat.isnull(), 'last_alight_lat'] = df['OFFBUS-LAT2']
df.loc[df.last_alight_lat.isnull(), 'last_alight_lat'] = df['OFFBUS-LAT1']

df['last_alight_lon'] = df['OFFBUS-LONG4']
df.loc[df.last_alight_lon.isnull(), 'last_alight_lon'] = df['OFFBUS-LONG3']
df.loc[df.last_alight_lon.isnull(), 'last_alight_lon'] = df['OFFBUS-LONG2']
df.loc[df.last_alight_lon.isnull(), 'last_alight_lon'] = df['OFFBUS-LONG1']

In [9]:
# create race dmy variables based on Race1 through Race5

race_dict = {1: 'American Indian/Alaska Native',
             2: 'Native Hawaiian/Pacific Islander',
             3: 'Black/African American',
             4: 'White/Caucasian',
             5: 'Asian',
             6: 'Hispanic',
             7: 'Mixed',
             8: 'Persian/Arab/North African/Middle Eastern',
             0: 'Refused',
             10: 'NA'}

for i in ['race_1', 'race_2', 'race_3', 'race_4', 'race_5']:
    df[i] = df[i].fillna(10)
    df[i].replace(to_replace = ' ', value = 10, inplace=True)
    df[i] = df[i].apply(lambda x: int(x))
    df[i+'_temp'] = df[i].map(race_dict)
    
df['race_concat'] = df['race_1_temp'] + '_' + df['race_2_temp'] + '_' + df['race_3_temp'] + '_' + df['race_4_temp'] + '_' + df['race_5_temp']
print(list(df['race_concat'].unique()))

['White/Caucasian_NA_NA_NA_NA', 'Hispanic_NA_NA_NA_NA', 'Refused_NA_NA_NA_NA', 'White/Caucasian_Hispanic_NA_NA_NA', 'Black/African American_NA_NA_NA_NA', 'Asian_NA_NA_NA_NA', 'American Indian/Alaska Native_Native Hawaiian/Pacific Islander_Black/African American_White/Caucasian_Asian', 'American Indian/Alaska Native_NA_NA_NA_NA', 'Native Hawaiian/Pacific Islander_White/Caucasian_NA_NA_NA', 'American Indian/Alaska Native_White/Caucasian_NA_NA_NA', 'Persian/Arab/North African/Middle Eastern_NA_NA_NA_NA', 'White/Caucasian_Asian_NA_NA_NA', 'Black/African American_White/Caucasian_NA_NA_NA', 'Native Hawaiian/Pacific Islander_NA_NA_NA_NA', 'Native Hawaiian/Pacific Islander_Black/African American_NA_NA_NA', 'Native Hawaiian/Pacific Islander_Asian_NA_NA_NA', 'Mixed_NA_NA_NA_NA', 'Black/African American_White/Caucasian_Asian_NA_NA', 'Black/African American_Asian_NA_NA_NA', 'Black/African American_Hispanic_NA_NA_NA', 'Asian_Hispanic_NA_NA_NA', 'Hispanic_Mixed_NA_NA_NA', 'American Indian/Alaska Nat

In [10]:
# estimate if hispanic based on race1 through race5
df['hisp_from_race'] = 'non-hisp'
df.loc[df.race_concat.str.contains('Hispanic',na=False), 'hisp_from_race'] = 'hisp'

# check consistency between 'hisp' derived from race1 through race5 and the original 'hisp' variable and make corrections

# records where race1 through race5 indicates 'hispanic' but 'hisp' is 1 (not hispanic)
display(df.loc[(df.hisp_from_race == 'hisp') & (df.hisp != 2)][['ID','hisp','hisp_from_race',
                                                                'race_1', 'race_2', 'race_3', 'race_4', 'race_5']])
# make hisp=2 for records where hisp_from_ETH == 'hisp'
df.loc[df.hisp_from_race == 'hisp', 'hisp'] = 2

# records where race1-race5 doesn't indicate 'hispanic' but 'hisp' is 2 (is hispanic)
# for these, keep the '2' value because the surveyed may leave 'race-hispanic' out when they feel
# they have already provided the information in 'hisp'
display(df.loc[(df.hisp_from_race != 'hisp') & (df.hisp == 2)][['ID','hisp','hisp_from_race',
                                                                'race_1', 'race_2', 'race_3', 'race_4', 'race_5']])


Unnamed: 0,ID,hisp,hisp_from_race,race_1,race_2,race_3,race_4,race_5
413,303525.0,1,hisp,4,6,10,10,10
414,303526.0,0,hisp,6,10,10,10,10
459,303572.0,1,hisp,3,6,10,10,10
556,313923.0,1,hisp,1,6,10,10,10
572,313940.0,1,hisp,3,6,10,10,10
717,335500.0,0,hisp,6,10,10,10,10


Unnamed: 0,ID,hisp,hisp_from_race,race_1,race_2,race_3,race_4,race_5
0,3.0,2,non-hisp,4,10,10,10,10
3,9.0,2,non-hisp,0,10,10,10,10
7,25.0,2,non-hisp,4,10,10,10,10
9,29.0,2,non-hisp,4,10,10,10,10
10,30.0,2,non-hisp,0,10,10,10,10
...,...,...,...,...,...,...,...,...
1205,336161.0,2,non-hisp,0,10,10,10,10
1207,336163.0,2,non-hisp,0,10,10,10,10
1210,336166.0,2,non-hisp,4,10,10,10,10
1214,336170.0,2,non-hisp,0,10,10,10,10


In [11]:
# create race_dmy_xx

df['race_dmy_ind'] = 0
df.loc[df.race_concat.str.contains('American Indian/Alaska Native',na=False), 'race_dmy_ind'] = 1

df['race_dmy_hwi'] = 0
df.loc[df.race_concat.str.contains('Native Hawaiian/Pacific Islander',na=False), 'race_dmy_hwi'] = 1

df['race_dmy_blk'] = 0
df.loc[df.race_concat.str.contains('Black/African American',na=False), 'race_dmy_blk'] = 1

df['race_dmy_wht'] = 0
df.loc[df.race_concat.str.contains('White/Caucasian',na=False), 'race_dmy_wht'] = 1

df['race_dmy_asn'] = 0
df.loc[df.race_concat.str.contains('Asian',na=False), 'race_dmy_asn'] = 1

df['race_dmy_mdl_estn'] = 0
df.loc[df.race_concat.str.contains('Persian/Arab/North African/Middle Eastern',na=False), 'race_dmy_mdl_estn'] = 1

In [12]:
# create 'language_at_home_binary' variable based on "langhh"（What language do you primarily speak in your household?）
df['language_at_home_binary'] = 'OTHER'
df.loc[df.langhh == 1, 'language_at_home_binary'] = 'ENGLISH ONLY'
display(df.langhh.value_counts())
display(df.language_at_home_binary.value_counts())

1     711
2     455
4      10
0       6
5       6
10      5
6       4
3       3
8       3
9       2
14      2
18      1
11      1
7       1
12      1
13      1
15      1
16      1
17      1
19      1
Name: langhh, dtype: int64

ENGLISH ONLY    711
OTHER           505
Name: language_at_home_binary, dtype: int64

In [13]:
# drop unnecessary columns and export

df.drop(columns = ['survey_route_temp',
                   'sys1_temp', 'sys2_temp', 'sys3_temp', 'sys4_temp', 
                   'route1', 'route2', 'route3', 'route4',
                   'first_route_idx', 'second_route_idx', 'third_route_idx', 'fourth_route_idx',
                   'race_1_temp', 'race_2_temp','race_3_temp','race_4_temp','race_5_temp',
                   'race_concat', 'hisp_from_race'], inplace=True)

print(list(df))

df.to_csv(r'M:\Data\OnBoard\Data and Reports\Marin Transit\Final Data\marin transit_data file_final01222021_NO POUND OR SINGLE QUOTE.csv', index=False)

['sys_RespNum', 'sys_StartTime', 'sys_EndTime', 'sys_LastQuestion', 'CCGID', 'RUNID', 'ROUTE', 'DIR', 'DAY', 'DATE', 'STRATA', 'wcode', 'WEIGHT', 'TWEIGHT', 'LANG_1', 'FromTo_c1', 'FromTo_c2', 'k12hs1', 'uni1', 'STARTLAT', 'STARTLONG', 'k12hs2', 'uni2', 'ENDLAT', 'ENDLONG', 'morebus', 'Gettolotsbus_c1', 'Gettolotsbus_c2', 'Getto1bus_c1', 'Getto1bus_c2', 'Route1', 'ONBUS-LAT1', 'ONBUS-LONG1', 'OFFBUS-LAT1', 'OFFBUS-LONG1', 'Route2', 'ONBUS-LAT2', 'ONBUS-LONG2', 'OFFBUS-LAT2', 'OFFBUS-LONG2', 'Route3', 'ONBUS-LAT3', 'ONBUS-LONG3', 'OFFBUS-LAT3', 'OFFBUS-LONG3', 'Route4', 'ONBUS-LAT4', 'ONBUS-LONG4', 'OFFBUS-LAT4', 'OFFBUS-LONG4', 'totalbus', 'businfo1', 'businfo2', 'businfo3', 'businfo4', 'sys1', 'sys2', 'sys3', 'sys4', 'fare', 'farecat', 'RideFreq', 'Sat', 'IntAccess_1', 'IntAccess_2', 'IntAccess_3', 'IntAccess_4', 'cars', 'hh', 'hhwork', 'age', 'birthyear', 'hisp', 'race_1', 'race_2', 'race_3', 'race_4', 'race_5', 'income', 'langhh', 'engspk', 'livebay', 'HOMELAT', 'HOMELONG', 'sch', '

In [14]:
# gather all transfer routes
routes = pd.DataFrame(columns = ['survey_name'])
for i in ['first_route_before_survey_board', 'second_route_before_survey_board',
          'third_route_before_survey_board', 'first_route_after_survey_alight', 
          'second_route_after_survey_alight', 'third_route_after_survey_alight']:
    route_unique = df[[i]]
    route_unique.columns = ['survey_name']
    routes = pd.concat([routes, route_unique])

routes_clean = routes.loc[(routes.survey_name.notnull()) & (routes.survey_name != '')]
routes_clean.drop_duplicates(inplace=True)
routes_clean['survey'] = 'Marin Transit'
routes_clean['survey_year'] = 2017

print(routes_clean.shape)
routes_clean[['survey','survey_year','survey_name']].to_csv(r'M:\Data\OnBoard\Data and Reports\Marin Transit\Final Data\all_routes_raw.csv', index=False)

(102, 3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [15]:
# bring in standard dictionary to check field consistency

# dictionary for Marin Transit survey
var = pd.read_csv(r'M:\Data\OnBoard\Data and Reports\Marin Transit\Final Data\var_dict_raw.csv',
                  encoding = "ISO-8859-1", engine='python')

# standard dictionary
var_standard = pd.read_csv(r'C:\Users\ywang\Documents\GitHub\onboard-surveys\make-uniform\production\Dictionary for Standard Database.csv')
var_standard.columns = [x+'_s' for x in list(var_standard)]

# merge
var_merge = var.merge(var_standard, left_on='Generic_Variable', right_on='Generic_Variable_s', how='outer')

# check if 'Generic_Variable' in Marin Transit dictionary matches the standard 'Generic_Variable'. chk1 should be empty
chk1 = var_merge.loc[(var_merge.Generic_Variable.notnull()) & (var_merge.Generic_Variable_s.isnull())]
print('Generic_Variable that shouldn not exit:')
print(chk1.Generic_Variable.unique())
print()

# check if columns names in survey data matches 'Survey_Variable' in Marin Transit dictionary.
# the following loops should not include variables that are needed for standardization

print('variables not in standard dictionary')
for i in var.loc[var.Generic_Variable.notnull()]['Survey_Variable']:
    if i not in list(df):
        print(i)

print('variables in df but not in the current Marin Transit dictionary')
for i in list(df):
    if i not in list(var.Survey_Variable):
        print(i)

Generic_Variable that shouldn not exit:
[]

variables not in standard dictionary
variables in df but not in the current Marin Transit dictionary
sys_RespNum
sys_StartTime
sys_EndTime
sys_LastQuestion
CCGID
RUNID
ROUTE
wcode
k12hs1
uni1
k12hs2
uni2
morebus
Gettolotsbus_c1
Gettolotsbus_c2
Getto1bus_c1
Getto1bus_c2
Route1
ONBUS-LAT1
ONBUS-LONG1
OFFBUS-LAT1
OFFBUS-LONG1
Route2
ONBUS-LAT2
ONBUS-LONG2
OFFBUS-LAT2
OFFBUS-LONG2
Route3
ONBUS-LAT3
ONBUS-LONG3
OFFBUS-LAT3
OFFBUS-LONG3
Route4
ONBUS-LAT4
ONBUS-LONG4
OFFBUS-LAT4
OFFBUS-LONG4
totalbus
businfo1
businfo2
businfo3
businfo4
sys1
sys2
sys3
sys4
RideFreq
Sat
IntAccess_1
IntAccess_2
IntAccess_3
IntAccess_4
age
race_1
race_2
race_3
race_4
race_5
livebay
k12hs3
uni3
School_Name
COMMENTS
break
on17n
off17n
on17s
off17s
on22n
off22n
on22s
off22s
on23e
off23e
on23w
off23w
on23xe
off23xe
on23xw
off23xw
on29e
off29
on29w
off29w
on35n
off35n
on35s
off35s
on36n
off36n
on36s
off36s
on49n
off49n
on49s
off49s
on61e
off61
on61w
off61w
on68e
off68e
on68w

In [16]:
# check if all the values in the survey data are included in Napa Vine dictionary
# look at non-categorical variables; "diff" should be empty or only contains nan

var_clean = var[['operator', 'Survey_year', 'Survey_Variable', 'Survey_Response', 
                 'Generic_Variable', 'Generic_Response']].drop_duplicates()
var_clean = var_clean.loc[var_clean.Generic_Variable.notnull()]

for i in var_clean.loc[var_clean.Survey_Response != 'NONCATEGORICAL']['Survey_Variable'].unique():
    print(i)
    df_sub = df[['ID', i]]
    var_sub = var_clean.loc[var_clean.Survey_Variable == i]

    if i in ['LANG_1', 'FromTo_c1', 'FromTo_c2', 'access_mode', 'egress_mode', 'fare', 'farecat', 
             'engspk', 'langhh', 'sch', 'work', 'workafter', 'workbefore', 'schafter', 'schbefore',
             'hometime_c1', 'gender', 'Mode',
             'race_dmy_ind', 'race_dmy_hwi', 'race_dmy_blk', 'race_dmy_wht', 'race_dmy_asn', 'race_dmy_mdl_estn']:
        var_sub.Survey_Response = var_sub.Survey_Response.apply(lambda x: int(x))

    if i in ['hh', 'cars', 'hhwork', 'hisp', 'income']:
        df_sub[i] = df_sub[i].apply(lambda x: str(x))    
    
    compare = df_sub.merge(var_sub, left_on=i, right_on='Survey_Response', how='left')
    diff = compare.loc[compare.Generic_Response.isnull()]
    if diff.shape[0] > 0:
        print(diff[i].unique())

DIR
DAY
LANG_1
FromTo_c1
FromTo_c2
access_mode
egress_mode
fare
farecat
cars
hh
hhwork
hisp
race_dmy_ind
race_dmy_hwi
race_dmy_blk
race_dmy_wht
race_dmy_asn

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



race_dmy_mdl_estn
income
langhh
language_at_home_binary
engspk
[nan]
sch
[nan]
work
[nan]
workafter
[nan]
workbefore
[nan]
schafter
[nan]
schbefore
[nan]
hometime_c1
[nan]
gender
Mode


In [17]:
# finally, check all necessary fields are included, and export
print(var_clean.Generic_Variable.unique())
var_clean.to_csv(r'M:\Data\OnBoard\Data and Reports\Marin Transit\Final Data\vars_for_standard_dictionary.csv', index=False)

['ID' 'time_string' 'direction' 'route' 'weekpart' 'date_string'
 'time_period' 'weight' 'tweight' 'interview_language' 'orig_purp'
 'dest_purp' 'orig_lat' 'orig_lon' 'dest_lat' 'dest_lon' 'access_mode'
 'egress_mode' 'first_route_before_survey_board'
 'second_route_before_survey_board' 'third_route_before_survey_board'
 'first_route_after_survey_alight' 'second_route_after_survey_alight'
 'third_route_after_survey_alight' 'first_board_lat' 'first_board_lon'
 'last_alight_lat' 'last_alight_lon' 'fare_medium' 'fare_category'
 'vehicles' 'persons' 'workers' 'year_born_four_digit' 'hispanic'
 'race_dmy_ind' 'race_dmy_hwi' 'race_dmy_blk' 'race_dmy_wht'
 'race_dmy_asn' 'race_dmy_mdl_estn' 'household_income'
 'language_at_home_detail' 'language_at_home_binary' 'eng_proficient'
 'home_lat' 'home_lon' 'student_status' 'school_lat' 'school_lon'
 'work_status' 'workplace_lat' 'workplace_lon' 'at_work_after_dest_purp'
 'at_work_prior_to_orig_purp' 'at_school_after_dest_purp'
 'at_school_prior_to_