# TSDC Data Cleaning

This notebook is set up to intake the files from the TSDC records and process them according to the data cleaning outlined in our paper

MISSING 1 user and 1,272 trips! Not sure where they're getting missed

In [24]:
import numpy as np
import pandas as pd
from collections import defaultdict
import scaffolding

In [25]:
# Loading mapping dictionaries from mapping_dictionaries notebook
%store -r df_ei
%store -r dic_re
%store -r dic_pur
%store -r dic_fuel

# convert a dictionary to a defaultdict
dic_re = defaultdict(lambda: 'Other',dic_re)
dic_pur = defaultdict(lambda: 'Other',dic_pur)
dic_fuel = defaultdict(lambda: 'Other',dic_fuel)

## Mini Pilot Data

In [3]:
mini_confirmed_trips = pd.read_csv('mini_pilot/data/analysis_confirmed_trip.csv')

In [4]:
print(len(mini_confirmed_trips))
print(mini_confirmed_trips.perno.nunique())
# mini_confirmed_trips.columns

#this is one more user and about 1,000 more trips than we had in our minipilot dataset 
## - but we haven't removed no labels?

3492
13


In [5]:
## remove trips with no label and count again
labeled_mini = mini_confirmed_trips[mini_confirmed_trips.data_user_input_mode_confirm.notna()]
labeled_mini = mini_confirmed_trips[mini_confirmed_trips.data_user_input_purpose_confirm.notna()]
# labeled_mini = mini_confirmed_trips[mini_confirmed_trips.data_user_input_purpose_confirm.notna()]

print(len(labeled_mini)) #only 25 over data used in paper
print(labeled_mini.perno.nunique())#same as data used in paper

2403
12


In [6]:
mini_data = labeled_mini.copy()

#first, add the cleaned mode
mini_data['Mode_confirm']= mini_data['data_user_input_mode_confirm'].map(dic_re)

#second, add the cleaned replaced mode ASSUMES PROGRAM
mini_data['Replaced_mode']= mini_data['data_user_input_replaced_mode'].map(dic_re)

#third, add the cleaned purpose
mini_data['Trip_purpose']= mini_data['data_user_input_purpose_confirm'].map(dic_pur)

In [7]:
# Combine variable categories
mini_data = mini_data.replace('Gas Car, drove alone', 'Car')
mini_data = mini_data.replace('Gas Car, with others', 'Shared Car')
mini_data = mini_data.replace('Bikeshare', 'Shared Micromobility')
mini_data = mini_data.replace('Scooter share', 'Shared Micromobility')
mini_data = mini_data.replace('Regular Bike', 'Personal Micromobility')
mini_data = mini_data.replace('Skate board', 'Personal Micromobility')
mini_data = mini_data.replace('Train', 'Transit')
mini_data = mini_data.replace('Free Shuttle', 'Transit')
mini_data = mini_data.replace('Bus', 'Transit')
mini_data = mini_data.replace('Walk', 'Walk')
mini_data = mini_data.replace('Taxi/Uber/Lyft', 'Ridehail')
mini_data = mini_data.replace('Pilot ebike', 'E-Bike')

#filter out 'not a trip' trips
mini_data = mini_data[~mini_data['Mode_confirm'].isin(['Not a Trip'])]
mini_data = mini_data[~mini_data['Replaced_mode'].isin(['Not a Trip'])]
mini_data = mini_data[~mini_data['Trip_purpose'].isin(['not_a_trip'])]

print(len(mini_data))

2354


In [8]:
mini_data.loc[mini_data['Mode_confirm']=='Personal Micromobility', 'Mode_confirm'] = 'Other'
mini_data.loc[mini_data['Mode_confirm']=='Shared Micromobility', 'Mode_confirm'] = 'Other'

t1 = mini_data.groupby(['Mode_confirm'], as_index=False).count()[['Mode_confirm','data_distance']]
t1['proportion'] = t1['data_distance'] / np.sum(t1.data_distance)
t1['trip_type'] = 'All Trips'

t2 = mini_data[mini_data['Trip_purpose']=='Work'].copy()
t2 = t2.groupby(['Mode_confirm'], as_index=False).count()[['Mode_confirm','data_distance']]
t2['proportion'] = t2['data_distance'] / np.sum(t2.data_distance)
t2['trip_type'] = 'Work Trips'
t2.loc[1.5] = 'Other', 0, 0, 'Work Trips'
t2 = t2.sort_index().reset_index(drop=True)

mini_data = pd.concat([t1,t2])
mini_data['Dataset'] = 'Minipilot'
mini_data.columns = ['Mode','Count','Proportion','Trip Type', "Dataset"]

In [9]:
mini_data #trip breakdown is really close to data used in paper!

Unnamed: 0,Mode,Count,Proportion,Trip Type,Dataset
0,Car,477,0.202634,All Trips,Minipilot
1,E-bike,776,0.329652,All Trips,Minipilot
2,Other,28,0.011895,All Trips,Minipilot
3,Ridehail,65,0.027613,All Trips,Minipilot
4,Shared Car,685,0.290994,All Trips,Minipilot
5,Transit,155,0.065845,All Trips,Minipilot
6,Walk,168,0.071368,All Trips,Minipilot
0,Car,110,0.295699,Work Trips,Minipilot
1,E-bike,134,0.360215,Work Trips,Minipilot
2,Other,0,0.0,Work Trips,Minipilot


### matching minis to survey data

In [10]:
mini_trips = pd.read_csv('mini_pilot/data/analysis_confirmed_trip.csv')
# mini_trips = labeled_mini.copy()
mini_surveys = pd.read_csv('mini_pilot/data/survey_household.csv')

print(len(mini_trips))
print(len(mini_surveys)) #15 surveys
print(mini_trips.perno.nunique()) #13 unique users

3492
15
13


In [11]:
socio_data = mini_surveys[~mini_surveys.perno.isnull()]
print(len(socio_data))
# socio_data.columns

15


In [12]:
# Deal with people who have multiple responses by using most recent
socio_data = socio_data.sort_values(by=['perno', 'timestamp'])
socio_data.drop_duplicates(subset=['perno'], keep='last', inplace=True)
socio_data['user_id_socio'] = socio_data.perno
socio_data.user_id_socio = [i.replace('-','') for i in socio_data.user_id_socio] # remove all dashes from strings
socio_data = socio_data.drop(labels='perno', axis=1)

print(len(socio_data)) #same as number of users

13


In [13]:
# Lose some trips due to people with no survey responses
mini_trips['user_id_socio'] = mini_trips.perno.astype(str)
mini_trips.user_id_socio = [i.replace('-','') for i in mini_trips.user_id_socio] # remove all dashes from strings
mini_trips = mini_trips.merge(socio_data, on='user_id_socio')

print(mini_trips.user_id_socio.nunique()) #lost one person that has no survey record -- down to 12 people
print(len(mini_trips))

12
3662


In [14]:
# mini_trips.head()

## Full Pilot Data

In [60]:
##alternate survey prep - from the whole dataset

# surveys = pd.read_csv('abby_ceo/survey_household_all.csv')
# # surveys.unique_user_id_autofilled_do_not_edit.unique()

# surveys = pd.read_csv('abby_ceo/survey_household_all.csv')
# print(len(surveys), 'total surveys')

# surveys = surveys[~surveys['unique_user_id_autofilled_do_not_edit'].isnull()]
# print(len(surveys), 'surveys after dropping null ids')

# surveys = surveys.sort_values(by=['unique_user_id_autofilled_do_not_edit', 'timestamp'])
# surveys.drop_duplicates(subset=['unique_user_id_autofilled_do_not_edit'], keep='last', inplace=True)
# print(len(surveys),'surveys', surveys['unique_user_id_autofilled_do_not_edit'].nunique(), 'users after dropping duplicates')

# #prepare survey ids for merging
# surveys['user_id_socio'] = surveys['unique_user_id_autofilled_do_not_edit'].astype(str)
# surveys['user_id_socio'] = surveys['user_id_socio'].str.strip() #remove leading or trailing whitespace!!
# surveys = surveys.drop(labels='unique_user_id_autofilled_do_not_edit', axis=1)

# print(surveys.user_id_socio.unique())

In [61]:
#loop over
programs = ['4c', 'cc', 'fc', 'pc', 'sc', 'vail_22']
datasets = []

for program in programs:
    print('starting with ', program)
    
    #create dataset with surveys and trips
    trips = pd.read_csv('abby_ceo/' + program + '/analysis_confirmed_trip.csv')
    print(len(trips), 'trips')
    print(trips.perno.nunique(), 'people')

    surveys = pd.read_csv('abby_ceo/' + program + '/' + program + '_survey_household.csv')
    print(len(surveys), 'surveys')

    #drop any null ids
    socio_data = surveys[~surveys['unique_user_id_autofilled_do_not_edit'].isnull()]
    print(len(socio_data), 'surveys after dropping null ids')

    #drop duplicates
    socio_data = socio_data.sort_values(by=['unique_user_id_autofilled_do_not_edit', 'timestamp'])
    socio_data.drop_duplicates(subset=['unique_user_id_autofilled_do_not_edit'], keep='last', inplace=True)
    print(len(socio_data),'surveys', socio_data['unique_user_id_autofilled_do_not_edit'].nunique(), 'users after dropping duplicates')

    #prepare survey ids for merging
    socio_data['user_id_socio'] = socio_data['unique_user_id_autofilled_do_not_edit'].astype(str)
    socio_data['user_id_socio'] = socio_data['user_id_socio'].str.strip() #remove leading or trailing whitespace!!
    socio_data['user_id_socio'] = socio_data['user_id_socio']
    socio_data = socio_data.drop(labels='unique_user_id_autofilled_do_not_edit', axis=1)

    #prepare trip ids for merging
    trips['user_id_socio'] = trips.perno.astype(str)
    trips['user_id_socio'] = trips['user_id_socio'].str.strip() #remove leading or trailing whitespace!!
    trips.user_id_socio = [i.replace('-','') for i in trips.user_id_socio] # remove all dashes from strings

    #merge the data
    data = trips.merge(socio_data, on='user_id_socio')
    print(len(data), 'trips after merging')
    print(data.user_id_socio.nunique(), 'people after merging')
    
    data['program'] = program.split('_')[0]
    
    #add to list of datasets
    datasets.append(data)

starting with  4c
10121 trips
14 people
28 surveys
28 surveys after dropping null ids
15 surveys 15 users after dropping duplicates
8874 trips after merging
13 people after merging
starting with  cc


  trips = pd.read_csv('abby_ceo/' + program + '/analysis_confirmed_trip.csv')


75199 trips
64 people
72 surveys
72 surveys after dropping null ids
50 surveys 50 users after dropping duplicates
72260 trips after merging
47 people after merging
starting with  fc
32442 trips
30 people
47 surveys
47 surveys after dropping null ids
30 surveys 30 users after dropping duplicates
32341 trips after merging
29 people after merging
starting with  pc
51196 trips
39 people
65 surveys
65 surveys after dropping null ids
39 surveys 39 users after dropping duplicates
50693 trips after merging
38 people after merging
starting with  sc
17989 trips
22 people
29 surveys
29 surveys after dropping null ids
15 surveys 15 users after dropping duplicates
15565 trips after merging
14 people after merging
starting with  vail_22
9133 trips
12 people
11 surveys
11 surveys after dropping null ids
9 surveys 9 users after dropping duplicates
7447 trips after merging
9 people after merging


In [62]:
#merge them all together
full_data = pd.concat(datasets)
print(len(full_data), 'trips')
print(full_data.perno.nunique(), 'users')

# data = full_data.merge(surveys, on='user_id_socio')
# print(len(data), 'trips after merging with surveys')
# print(data.user_id_socio.nunique(), 'people after merging with surveys')

187180 trips
150 users


In [63]:
# trip_ids = full_data.user_id_socio.unique()
# survey_ids = surveys.user_id_socio.unique()
# for id in survey_ids:
#     if id not in trip_ids:
#         print(id)

## some surveys don't have trips, some trips don't have surveys...

In [35]:
#filter out unlabeled trips
labeled_data = full_data[full_data.data_user_input_mode_confirm.notna()]
labeled_data = labeled_data[labeled_data.data_user_input_purpose_confirm.notna()]

print(len(labeled_data), 'labeled trips')
print(labeled_data.user_id_socio.nunique(), 'users who labeled')         

73993 labeled trips
143 users who labeled


so far so good, we're looking for at least 122 users and at least 61,496 trips after ALL cleaning

In [36]:
labeled_data.rename(columns={'user_id_socio':'user_id',
                          'please_identify_which_category_represents_your_total_household_':'HHINC',
                          'how_many_motor_vehicles_are_owned_leased_or_available_for_regul':'VEH',
                            ' how_many_motor_vehicles_are_owned_leased_or_available_for_regul':'VEH',
                             'how_many_motor_vehicles_are_owned_leased_or_available_for_regul ':'VEH',
                           'in_which_year_were_you_born?':'AGE',
                          'including_yourself_how_many_people_live_in_your_home?':'HHSIZE',
                          'how_many_children_under_age_18_live_in_your_home?':'CHILDREN',
                          'what_is_your_gender?':'GENDER',
                          'if_you_were_unable_to_use_your_household_vehicles_which_of_the_':'available_modes',
                          'are_you_a_student?':'STUDENT',
                         'data_duration':'duration', 
                         'data_distance':'distance'}, inplace=True)

In [37]:
data = labeled_data.copy()

#first, add the cleaned mode
data['Mode_confirm']= data['data_user_input_mode_confirm'].map(dic_re)

#second, add the cleaned replaced mode ASSUMES PROGRAM
data['Replaced_mode']= data['data_user_input_replaced_mode'].map(dic_re)

#third, add the cleaned purpose
data['Trip_purpose']= data['data_user_input_purpose_confirm'].map(dic_pur)

# Get timestamp from known year/month/day aggregated to days
data.rename(columns={'data_start_local_dt_year':'year','data_start_local_dt_month':'month','data_start_local_dt_day':'day'}, inplace=True)
data['date_time'] = pd.to_datetime(data[['year','month','day']])

# Fix age (birth year to age)
data['AGE'] = 2022 - data['AGE']

# Number of workers (size of HH - kids)
data['WORKERS'] = data['HHSIZE'] - data['CHILDREN']

# Duration in minutes (hours to minutes)
data['duration'] = data['duration'] / 60

# duration in miles (meters to miles)
data['distance'] = data['distance'] / 1609.34

# E-bike/not E-Bike variable
data['is_ebike'] = "E-Bike Trips"
data.loc[data['Mode_confirm']!="E-bike", 'is_ebike'] = "Non E-Bike Trips"

In [38]:
#separating programs
four_corners = data[data.program == "4c"]
community_cycles = data[data.program == "cc"]
fort_collins = data[data.program == "fc"]
pueblo = data[data.program == "pc"]
smart_commute = data[data.program == "sc"]
vail = data[data.program == "vail"]

print(four_corners['user_id'].nunique())
print(community_cycles['user_id'].nunique())
print(fort_collins['user_id'].nunique())
print(pueblo['user_id'].nunique())
print(smart_commute['user_id'].nunique())
print(vail['user_id'].nunique())

print(len(four_corners))
print(len(community_cycles))
print(len(fort_collins))
print(len(pueblo))
print(len(smart_commute))
print(len(vail))

13
44
27
36
14
9
3496
27673
11701
17535
8385
5203


In [40]:
print(len(data))
print(data.user_id.nunique())

#records that had ’prefer not to say’ as a response for household income, household vehicles, and other available modes
data = data[~data['HHINC'].isin(['Prefer not to say'])]
data = data[~data['VEH'].isin(['Prefer not to say / Prefiero no decir.'])]
data = data[~data['available_modes'].isin(['None', 'Prefer not to say'])]

print(len(data))
print(data.user_id.nunique())

data['HHINC_NUM'] = data.HHINC.replace(['Less than $24,999',
                                       '$25,000-$49,999',
                                       '$50,000-$99,999',
                                       '$100,000 -$149,999',
                                       '$150,000-$199,999',
                                       '$200,000 or more'], [12500,37500,75000,125000,175000,250000])

# Calculate average income per adult in the household
data['PINC'] = data['HHINC_NUM'] / data['WORKERS']

print(len(data))
print(data.user_id.nunique())

# Combine variable categories
data = data.replace('Gas Car, drove alone', 'Car')
data = data.replace('Gas Car, with others', 'Shared Car')
data = data.replace('Bikeshare', 'Shared Micromobility')
data = data.replace('Scooter share', 'Shared Micromobility')
data = data.replace('Regular Bike', 'Personal Micromobility')
data = data.replace('Skate board', 'Personal Micromobility')
data = data.replace('Train', 'Transit')
data = data.replace('Free Shuttle', 'Transit')
data = data.replace('Bus', 'Transit')
data = data.replace('Walk', 'Walk')
data = data.replace('Taxi/Uber/Lyft', 'Ridehail')
data = data.replace('Pilot ebike', 'E-Bike')

print(len(data))
print(data.user_id.nunique())

# Categorical type will include all days/modes in groupby even if there is no data for a particular tabulation
data.user_id = pd.Categorical(data.user_id)
data.date_time = pd.Categorical(data.date_time)
data.mode_confirm = pd.Categorical(data.data_user_input_mode_confirm, ordered=True, categories=np.unique(list(dic_re.keys())))

print(len(data))
print(data.user_id.nunique())

# Add order to categorical variables
data.HHINC = pd.Categorical(data.HHINC, ordered=True)
data['Mode'] = pd.Categorical(data.Mode_confirm, ordered=True, categories=[
    'E-bike',
    'Car',
    'Shared Car',
    'Walk',
    'Transit',
    'Personal Micromobility',
    'Shared Micromobility',
    'Ridehail',
    'Other'])
data.VEH = data.VEH.astype(str)
data.VEH = pd.Categorical(data.VEH, ordered=True, categories=['0','1','2','3','4+'])
data['PINC_NUM'] = data['PINC']
data.PINC = pd.cut(data.PINC, bins=[0,10000,20000,30000,40000,50000,60000,70000,999999],
                  labels=["$0-9",
                         "$10-19",
                         "$20-29",
                         "$30-39",
                         "$40-49",
                         "$50-59",
                         "$60-69",
                         "$70+"])

print(len(data))
print(data.user_id.nunique())

73993
143
66837
132
66837
132
66837
132
66837
132
66837
132


  data.mode_confirm = pd.Categorical(data.data_user_input_mode_confirm, ordered=True, categories=np.unique(list(dic_re.keys())))


In [41]:
#separating programs
four_corners = data[data.program == "4c"]
community_cycles = data[data.program == "cc"]
fort_collins = data[data.program == "fc"]
pueblo = data[data.program == "pc"]
smart_commute = data[data.program == "sc"]
vail = data[data.program == "vail"]

print(four_corners['user_id'].nunique())
print(community_cycles['user_id'].nunique())
print(fort_collins['user_id'].nunique())
print(pueblo['user_id'].nunique())
print(smart_commute['user_id'].nunique())
print(vail['user_id'].nunique())

print(len(four_corners))
print(len(community_cycles))
print(len(fort_collins))
print(len(pueblo))
print(len(smart_commute))
print(len(vail))

11
41
25
33
14
8
2752
25822
11618
13622
8385
4638


In [43]:
#filtered out ages that were greater than 100
data = data[data['AGE'] < 100]
print(len(data))
print(data.user_id.nunique())

#filter out durations longer than 8 hours
data = data[data['duration']<480]
print(len(data))
print(data.user_id.nunique())

#distances more than 50 miles 
data = data[data['distance']<50]
print(len(data))
print(data.user_id.nunique())

#filter household sizes smaller than the number of kids
data = data[data['HHSIZE']>data['CHILDREN']]
print(len(data))
print(data.user_id.nunique())

#filter out households greater than 10
data = data[data['HHSIZE']<10]
print(len(data))
print(data.user_id.nunique())

66837
132
66689
132
65864
132
64648
130
64479
129


In [45]:
# Vehicles per driver
data = data[data['VEH'].notna()] #vails VEH nums were not strings?
data['VEH_num'] = data['VEH'].replace(['1','2','3','4+'],[1,2,3,4]).astype(int)
data['DRIVERS'] = data["including_yourself_how_many_people_have_a_driver's_license_in_y"]
data['DRIVERS_num'] = data['DRIVERS'].replace
data['veh_per_driver'] = (data['VEH_num'] / data['DRIVERS']).fillna(0)
data.loc[data['veh_per_driver']==np.inf, 'veh_per_driver'] = 0

#filter out 'not a trip' trips
data = data[~data['Mode_confirm'].isin(['Not a Trip'])]
data = data[~data['Replaced_mode'].isin(['Not a Trip'])]
data = data[~data['Trip_purpose'].isin(['not_a_trip'])]

print(len(data), 'trips after filtering') #around 63,000
print(data.user_id.nunique(), 'users after filtering') #132 it sounds like

62699 trips after filtering
129 users after filtering


# filtering out trips before first e-bike

In [47]:
data.program.unique()

array(['4c', 'cc', 'fc', 'pc', 'sc', 'vail'], dtype=object)

In [48]:
data.rename(columns = {'data_start_ts':'start_ts'}, inplace=True)

In [49]:
#separating programs
four_corners = data[data.program == "4c"]
community_cycles = data[data.program == "cc"]
fort_collins = data[data.program == "fc"]
pueblo = data[data.program == "pc"]
smart_commute = data[data.program == "sc"]
vail = data[data.program == "vail"]

print(four_corners['user_id'].nunique())
print(community_cycles['user_id'].nunique())
print(fort_collins['user_id'].nunique())
print(pueblo['user_id'].nunique())
print(smart_commute['user_id'].nunique())
print(vail['user_id'].nunique())

print(len(four_corners))
print(len(community_cycles))
print(len(fort_collins))
print(len(pueblo))
print(len(smart_commute))
print(len(vail))

10
41
25
32
13
8
2506
25071
10925
13020
6860
4317


In [50]:
#filtering each of them
from datetime import datetime

In [51]:
#smart commute filtering

#timestamp conversion
smart_commute['start_ts']= pd.to_datetime(smart_commute['start_ts'], utc=True, unit='s')

#grouping, counting unique users
trip_sep=smart_commute.groupby(['user_id','Mode_confirm']).apply(lambda x:x[x.start_ts==min(x.start_ts)])
print(trip_sep['user_id'].nunique())

#consider only trips with E-bike (to get first e-bike trip)
sc_ebike_first=trip_sep[trip_sep['Mode_confirm']=='E-bike']

#get all the trips by ysers who ever had an e-bike trip
sc_ebike_user_list= sc_ebike_first['user_id'].tolist()
smart_commute_incl_ebike = smart_commute[smart_commute['user_id'].isin(sc_ebike_user_list)]
print(smart_commute_incl_ebike['user_id'].nunique())

#filter to the earliest ebike trip
for unique_id in sc_ebike_first['user_id']:
    for date in sc_ebike_first['start_ts']:
        smart_commute_ebike_first=smart_commute_incl_ebike[(smart_commute_incl_ebike['start_ts'] >= date)]

sc_unique_ebikefirst=smart_commute_ebike_first['user_id'].unique()
print(smart_commute_ebike_first['user_id'].nunique()) #11

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smart_commute['start_ts']= pd.to_datetime(smart_commute['start_ts'], utc=True, unit='s')


13
11
11


In [52]:
#filter four corners
four_corners['start_ts']= pd.to_datetime(four_corners['start_ts'], utc=True, unit='s')

trip_sep_fc=four_corners.groupby(['user_id','Mode_confirm']).apply(lambda x:x[x.start_ts==min(x.start_ts)])
print(trip_sep_fc['user_id'].nunique())

fc_ebike_first=trip_sep_fc[trip_sep_fc['Mode_confirm']=='E-bike']

fc_ebike_user_list= fc_ebike_first['user_id'].tolist()
four_corners_incl_ebike = four_corners[four_corners['user_id'].isin(fc_ebike_user_list)]
print(four_corners_incl_ebike['user_id'].nunique())

for unique_id in fc_ebike_first['user_id']:
    for date in fc_ebike_first['start_ts']:
        four_corners_ebike_first=four_corners_incl_ebike[(four_corners_incl_ebike['start_ts'] >= date)]
        
fc_unique_ebikefirst=four_corners_ebike_first['user_id'].unique()
print(four_corners_ebike_first['user_id'].nunique())

10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  four_corners['start_ts']= pd.to_datetime(four_corners['start_ts'], utc=True, unit='s')


10
10


In [53]:
#filtering community cycles
community_cycles['start_ts']= pd.to_datetime(community_cycles['start_ts'], utc=True, unit='s')

trip_sep_cc=community_cycles.groupby(['user_id','Mode_confirm']).apply(lambda x:x[x.start_ts==min(x.start_ts)])
print(trip_sep_cc['user_id'].nunique())

cc_ebike_first=trip_sep_cc[trip_sep_cc['Mode_confirm']=='E-bike']

cc_ebike_user_list= cc_ebike_first['user_id'].tolist()
community_cycles_incl_ebike = community_cycles[community_cycles['user_id'].isin(cc_ebike_user_list)]
print(community_cycles_incl_ebike['user_id'].nunique())

for unique_id in cc_ebike_first['user_id']:
    for date in cc_ebike_first['start_ts']:
        community_cycles_ebike_first=community_cycles_incl_ebike[(community_cycles_incl_ebike['start_ts'] >= date)]

cc_unique_ebikefirst=community_cycles_ebike_first['user_id'].unique()
print(community_cycles_ebike_first['user_id'].nunique())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  community_cycles['start_ts']= pd.to_datetime(community_cycles['start_ts'], utc=True, unit='s')


41
41
41


In [54]:
#filtering fort collins
fort_collins['start_ts']= pd.to_datetime(fort_collins['start_ts'], utc=True, unit='s')

trip_sep_fc=fort_collins.groupby(['user_id','Mode_confirm']).apply(lambda x:x[x.start_ts==min(x.start_ts)])
print(trip_sep_fc['user_id'].nunique())

fc_ebike_first=trip_sep_fc[trip_sep_fc['Mode_confirm']=='E-bike']

fc_ebike_user_list= fc_ebike_first['user_id'].tolist()
fort_collins_incl_ebike = fort_collins[fort_collins['user_id'].isin(fc_ebike_user_list)]
print(fort_collins_incl_ebike['user_id'].nunique())

for unique_id in fc_ebike_first['user_id']:
    for date in fc_ebike_first['start_ts']:
        fort_collins_ebike_first=fort_collins_incl_ebike[(fort_collins_incl_ebike['start_ts'] >= date)]
        
fc_unique_ebikefirst=fort_collins_ebike_first['user_id'].unique()
print(fort_collins_ebike_first['user_id'].nunique())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fort_collins['start_ts']= pd.to_datetime(fort_collins['start_ts'], utc=True, unit='s')


25
22
22


In [55]:
#filtering pueblo
pueblo['start_ts']= pd.to_datetime(pueblo['start_ts'], utc=True, unit='s')

trip_sep_pu=pueblo.groupby(['user_id','Mode_confirm']).apply(lambda x:x[x.start_ts==min(x.start_ts)])
print(trip_sep_pu['user_id'].nunique())

pu_ebike_first=trip_sep_pu[trip_sep_pu['Mode_confirm']=='E-bike']

pu_ebike_user_list= pu_ebike_first['user_id'].tolist()
pueblo_incl_ebike = pueblo[pueblo['user_id'].isin(pu_ebike_user_list)]
print(pueblo_incl_ebike['user_id'].nunique())

for unique_id in pu_ebike_first['user_id']:
    for date in pu_ebike_first['start_ts']:
        pueblo_ebike_first=pueblo_incl_ebike[(pueblo_incl_ebike['start_ts'] >= date)]
        
pu_unique_ebikefirst=pueblo_ebike_first['user_id'].unique()
print(pueblo_ebike_first['user_id'].nunique())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pueblo['start_ts']= pd.to_datetime(pueblo['start_ts'], utc=True, unit='s')


32
29
29


In [56]:
#filtering vail
vail['start_ts']= pd.to_datetime(vail['start_ts'], utc=True, unit='s')

trip_sep_va=vail.groupby(['user_id','Mode_confirm']).apply(lambda x:x[x.start_ts==min(x.start_ts)])
print(trip_sep_va['user_id'].nunique())

va_ebike_first=trip_sep_va[trip_sep_va['Mode_confirm']=='E-bike']

va_ebike_user_list= va_ebike_first['user_id'].tolist()
vail_incl_ebike = vail[vail['user_id'].isin(va_ebike_user_list)]
print(vail_incl_ebike['user_id'].nunique())

for unique_id in va_ebike_first['user_id']:
    for date in va_ebike_first['start_ts']:
        vail_ebike_first=vail_incl_ebike[(vail_incl_ebike['start_ts'] >= date)]
        
va_unique_ebikefirst=vail_ebike_first['user_id'].unique()
print(vail_ebike_first['user_id'].nunique())

8
8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vail['start_ts']= pd.to_datetime(vail['start_ts'], utc=True, unit='s')


8


In [57]:
#checking num users and num trips in each program
## num users is perfect
print('4c', four_corners_ebike_first['user_id'].nunique(), len(four_corners_ebike_first))
print('cc', community_cycles_ebike_first['user_id'].nunique(), len(community_cycles_ebike_first))
print('fc', fort_collins_ebike_first['user_id'].nunique(), len(fort_collins_ebike_first))
print('pc', pueblo_ebike_first['user_id'].nunique(), len(pueblo_ebike_first))
print('sc', smart_commute_ebike_first['user_id'].nunique(), len(smart_commute_ebike_first))
print('vail', vail_ebike_first['user_id'].nunique(), len(vail_ebike_first))

## num trips needs some work -- 4c is under and the rest are over ...

4c 10 1398
cc 41 24820
fc 22 10635
pc 29 12570
sc 11 6491
vail 8 4310


In [58]:
#combining the filtered datasets
filtered_merged = pd.concat([four_corners_ebike_first, community_cycles_ebike_first, fort_collins_ebike_first, 
                             pueblo_ebike_first, smart_commute_ebike_first,vail_ebike_first], axis=0)
print(len(filtered_merged)) #
print(filtered_merged['user_id'].nunique()) #

60224
121


In [59]:
# Summary statistics table
print(len(pd.unique(filtered_merged.user_id)))
stat_data = filtered_merged[['duration']]
stat_data.describe()

121


Unnamed: 0,duration
count,60224.0
mean,24.049814
std,30.644487
min,6.8e-05
25%,9.121165
50%,15.148001
75%,27.941898
max,479.495935


In [None]:
#save as a csv, to be used as input to analysis!
filtered_merged.to_csv("tsdc_filtered_merged_trips.csv")