In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import pylab
import glob, os
import scipy.stats as stats
from scipy.stats import gaussian_kde
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import chi2
from sklearn.metrics import r2_score
import timeit
%matplotlib inline

In [2]:
# Format to remove scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# options of samples

In [3]:
Sample = pd.read_csv(r'R:/Angela/fast_trips/internal dataset/SF_2012_Sep_20th.csv')

In [4]:
Sample.shape

(94460, 18)

#my_data

In [5]:
my_data = Sample

In [6]:
#Or try a small sample if needed
'''
msk = np.random.rand(len(Sample)) < 0.95
my_data = Sample[~msk]
'''

'\nmsk = np.random.rand(len(Sample)) < 0.95\nmy_data = Sample[~msk]\n'

In [7]:
def data_content(data):
    data = data[['ON', 'OFF', 'VEHNO', 'ANAME', 'STOPA', 'YR', 'HR', 'MIN', 'SEC', 'DHR', 'DMIN', 'DSEC', 'ROUTE', 'LOAD', 'date_id']]
    return data

In [8]:
test = data_content(my_data)
test.shape

(94460, 15)

In [9]:
test.columns

Index([u'ON', u'OFF', u'VEHNO', u'ANAME', u'STOPA', u'YR', u'HR', u'MIN',
       u'SEC', u'DHR', u'DMIN', u'DSEC', u'ROUTE', u'LOAD', u'date_id'],
      dtype='object')

#Prepare bus info

In [10]:
vehicles = pd.read_csv(r'R:\Angela\fast_trips\Vehicles.csv')
fleet = pd.read_csv(r'R:\Angela\fast_trips\Copy of Fleet.csv')

# Artic
vehicles.Artic = vehicles.Length.map({"60'" : 1, "40'" : 0, "30'" : 0})
vehicles.loc[:,'Artic'] = pd.Series(vehicles.Artic, index=vehicles.index)
df_artic = vehicles.set_index('Equip_Type').to_dict()['Artic']
fleet['Artic'] = fleet['Equip_Type'].map(df_artic)
df_vehnum_artic = fleet.set_index('VehNum').to_dict()['Artic']

# Low Floor
vehicles.Floor = vehicles['Low Floor'].map({'Y': 1, 'N' : 0})
vehicles.loc[:,'Floor'] = pd.Series(vehicles.Floor, index=vehicles.index)
df_floor = vehicles.set_index('Equip_Type').to_dict()['Floor']
fleet['Floor'] = fleet['Equip_Type'].map(df_floor)
df_vehnum_floor = fleet.set_index('VehNum').to_dict()['Floor']

# Door
df_doors = vehicles.set_index('Equip_Type').to_dict()['Doors']
fleet['Doors'] = fleet['Equip_Type'].map(df_doors)
df_vehnum_doors = fleet.set_index('VehNum').to_dict()['Doors']

# Capacity
vehicles.loc[:,'Total Capacity'] = pd.Series(vehicles['Total Capacity'], index=vehicles.index)
df_capacity = vehicles.set_index('Equip_Type').to_dict()['Total Capacity']
fleet['capacity'] = fleet['Equip_Type'].map(df_capacity)
df_vehnum_capacity = fleet.set_index('VehNum').to_dict()['capacity']

# Prepare route type

In [11]:
route_type = pd.read_csv(r'R:\Angela\fast_trips\MuniRouteTypes.csv')
route_type = route_type.dropna()
dict_route_type = {}
dict_route_type = route_type.set_index('APC Route ID')['Type'].to_dict()

#Step1: Prepare basic variables

In [12]:
def get_x_y(data):
    start = timeit.default_timer()
    # Before cleaning records, get the load data from the previous stop
    data['pre_load'] = data['LOAD'].shift()

    # Get rid of rows where certain fields has null/nan values
    data = data.dropna(subset = ['ON', 'OFF', 'VEHNO'])
    data = data[data['ON'] + data['OFF'] != 0]

    # COMPUTE TIMESTOP=((HR * 3600) + (MIN * 60) + SEC)
    start = timeit.default_timer()
    data['COMPUTE_TIMESTOP'] = data['HR']*3600 + data['MIN']*60 + data['SEC']
    # COMPUTE DOORCLOSE=(( DHR * 3600) + (DMIN * 60) + DSEC)
    data['COMPUTE_DOORCOLSE'] = data['DHR']*3600 + data['DMIN']*60 + data['DSEC']
    # COMPUTE DOORDWELL=DOORCLOSE - TIMESTOP
    data['COMPUTE_DOORDWELL'] = data['COMPUTE_DOORCOLSE'] - data['COMPUTE_TIMESTOP']
    # Appling door dwell time less than 120 secs
    data = data.loc[data['COMPUTE_DOORDWELL'] <= 90]
    data = data[data['COMPUTE_DOORDWELL'] != 0]
    stop = timeit.default_timer()
    print 'compute dwell time:', stop - start

    # Keep rows that satisfy a query:
    start = timeit.default_timer()
    data['Doors'] = data['VEHNO'].map(df_vehnum_doors) 
    data['Artic'] = data['VEHNO'].map(df_vehnum_artic)
    data['Floor'] = data['VEHNO'].map(df_vehnum_floor)
    data['capacity'] = data['VEHNO'].map(df_vehnum_capacity)
    data['two_doors'] = data['Doors'].map({2: 1, 3: 0})
    data['three_doors'] = data['Doors'].map({2: 0, 3: 1})
    #data['all_door_boarding']= data.apply(lambda x: x['mo'] > 6, axis=1).map({False: 0, True: 1})
    
    # Create dummie variables for route id
    data['Route Type'] = data['ROUTE'].map(dict_route_type)
    just_dummies_route = pd.get_dummies(data['Route Type'])
    step_1 = pd.concat([data, just_dummies_route], axis=1)
    step_1.drop(['Local'], inplace=True, axis=1)
    data = step_1
    stop = timeit.default_timer()
    print 'add veh&route info:', stop - start

    # Create interaction variables
    start = timeit.default_timer()
    data['on_threedoors'] = data['ON']*data['three_doors']
    data['off_threedoors'] = data['OFF']*data['three_doors']
    data['on_floor'] = data['ON']*data['Floor']
    data['off_floor'] = data['OFF']*data['Floor']
    data['floor_threedoors'] = data['Floor']*data['three_doors']
    data['floor_twodoors'] = data['Floor']*data['two_doors']
    #data['on_all_door_boarding'] = data['ON']*data['all_door_boarding']
    #data['off_all_door_boarding'] = data['OFF']*data['all_door_boarding']
    data['on_express'] = data['ON']*data['Express']
    data['off_express'] = data['OFF']*data['Express']
    data['on_rapid'] = data['ON']*data['Rapid']
    data['off_rapid'] = data['OFF']*data['Rapid']
    data['on_owl'] = data['ON']*data['OWL']
    data['off_owl'] = data['OFF']*data['OWL']
    stop = timeit.default_timer()
    print 'add interaction variables:', stop - start

    return data

## Step2: Adding more passenger activity variables

In [13]:
def passenger_var(data):
    start = timeit.default_timer()
    data['max_pasg'] = data[['ON', 'OFF']].max(axis=1)
    print 'data shape:', data.shape
    data['abs_pasg'] = (data['ON'] - data['OFF']).abs()
    print 'data shape:', data.shape
    
    # A passenger friction factor was constructed to account for passenger activity on buses with standees. 
    # It was posited that heavily loaded buses have greater dwell times. 
    # STANDEES are the number of passengers when LOAD minus 60% of bus capacity is positive. 
    data['pre_standees']= data['pre_load'] - 0.60 * data['capacity']
    data['pre_crowding']= data.apply(lambda x: x['pre_standees'] > 0, axis=1).map({False: 0, True: 1})
    # A proxy variable was constructed by adding ONS, OFFS, and STANDEES.
    data['friction'] = ((data['ON'] + data['OFF'] + (data['pre_standees']).abs()) * data['pre_crowding']).abs()
    print 'data shape:', data.shape
    stop = timeit.default_timer()
    print 'add passenger activity variables:', stop - start

    # Remove the corner data, which is the first and last stop data
    start = timeit.default_timer()
    # COMPUTE EOL = RINDEX(ANAME,' - EOL') 
    data['eol'] = data.apply(lambda x: '- EOL' in x['ANAME'], axis=1).map({False: 1, True: 0})
    # Remove the last stop
    data = data.loc[data['eol'] == 1]
    # Remove the first stop
    data = data.loc[data['STOPA'] != 1]
    stop = timeit.default_timer()
    print 'remove corner data:', stop - start
    
    return data

In [14]:
step1 = get_x_y(test)

compute dwell time: 0.0587139951669
add veh&route info: 0.0807351013807
add interaction variables: 0.00731280022264


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [15]:
step2 = passenger_var(step1)

data shape: (47150, 42)
data shape: (47150, 43)
data shape: (47150, 46)
add passenger activity variables: 0.693607646098
remove corner data: 0.696235509173


# Step3: Prepare vehicle ID variables

In [16]:
# Create dummie variables for bus id 
def create_vehID_day(data):
    start = timeit.default_timer()
    data['vehno_date'] = data.VEHNO.astype(str) + '_' + data.date_id.astype(str)
    #person_table['person_id'] = person_table.hh_id.astype(str) + '_' + person_table.pno.astype(str) 
    just_dummies_veh = pd.get_dummies(data['vehno_date'])
    data = pd.concat([data, just_dummies_veh], axis=1)
    #get rid of one dummy variable to avoid the dummy variable trap
    #step_1.drop([8515], inplace=True, axis=1)
    stop = timeit.default_timer()
    print 'data shape:', data.shape
    print 'add vehid&day variables:', stop - start
    
    return data

In [17]:
step3 = create_vehID_day(step2)

data shape: (46568, 224)
add vehid&day variables: 0.375615084767


In [18]:
step3.columns

Index([u'ON', u'OFF', u'VEHNO', u'ANAME', u'STOPA', u'YR', u'HR', u'MIN',
       u'SEC', u'DHR',
       ...
       u'8414_3', u'8415_3', u'8416_3', u'8451_3', u'8503_3', u'8504_3',
       u'8505_3', u'8506_3', u'8508_3', u'8520_3'],
      dtype='object', length=224)

# Step4: Delete a vehicle variable in each route type

In [19]:
# Get three dataframes for three bus type
test = step3
df_art = test.loc[test.Artic == 1]
df_std_low = test.loc[(test.Artic == 0) & (test.Floor == 1)]
df_std_high = test.loc[(test.Artic == 0) & (test.Floor == 0)]

In [20]:
# Get the bus IDs, who runs one route type in a day. 
def delet_dict_1(df):
    del_dict = {}
    df = df.dropna()
    print len(df)
    for veh_day in np.unique(df.vehno_date):
        df_veh = df.loc[df.vehno_date == veh_day]
        if (len(np.unique(df_veh['Route Type'])) == 1):
            rte_type = np.unique(df_veh['Route Type'])
            rte_type = str(rte_type)
            del_dict[rte_type] = veh_day
    print del_dict
    return del_dict

In [21]:
# Get the bus IDs, who runs three route types in a day. 
def delet_dict_3(df):
    del_dict = {}
    df = df.dropna()
    print len(df)
    for veh_day in np.unique(df.vehno_date):
        df_veh = df.loc[df.vehno_date == veh_day]
        if (len(np.unique(df_veh['Route Type'])) == 3):
            rte_type = np.unique(df_veh['Route Type'])
            rte_type = str(rte_type)
            del_dict[rte_type] = veh_day
    print del_dict
    return del_dict

In [22]:
print np.unique(df_art['Route Type'])
print np.unique(df_std_low['Route Type'])
print np.unique(df_std_high['Route Type'])

[nan 'Express' 'Local' 'Rapid']
[nan 'Express' 'Local' 'OWL' 'Rapid']
[nan 'Express' 'Local' 'Rapid']


  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


In [23]:
print 'Articulated bus:'
del_art_1 = delet_dict_1(df_art)
print 'Standard bus with low floor:'
del_std_low_1 = delet_dict_1(df_std_low)
del_std_low_3 = delet_dict_3(df_std_low)
print 'Standard bus with high floor:'
del_std_high_1 = delet_dict_1(df_std_high)
del_std_high_3 = delet_dict_3(df_std_high)

Articulated bus:
12123
{"['Express']": '6403_3', "['Local']": '7123_3', "['Rapid']": '7117_3'}
Standard bus with low floor:
5528
{"['Local']": '8520_3'}
5528
{"['Local' 'OWL' 'Rapid']": '8415_3'}
Standard bus with high floor:
28737
{"['Express']": '8234_3', "['Local']": '8228_3'}
28737
{"['Express' 'Local' 'Rapid']": '8235_3'}


In [27]:
# Create a dictionary that including the bus_day_id that need to be dropped
del_bus = {}
# Articulated bus
del_bus['Express1'] = '6403_3'
del_bus['Local1'] = '7123_3'
del_bus['Rapid1'] = '7117_3'
#Standard bus with low floor
del_bus['Local2'] = '8415_3' #Including Rapid2, OWL2, no Express in this type
#Standard bus with high floor
del_bus['Express3'] = '8235_3' #including Local3 and Rapid3, no OWL in this type
print del_bus.values()

['8415_3', '7123_3', '6403_3', '8235_3', '7117_3']


In [28]:
print step3.shape
step4 = step3.drop(del_bus.values(), 1)
print step4.shape

(46568, 224)
(46568, 219)


In [26]:
#Delete data (dwell_time = 0) if needed
'''
step4 = step4.loc[step4['COMPUTE_DOORDWELL'] > 0]
step4.shape
'''

"\nstep4 = step4.loc[step4['COMPUTE_DOORDWELL'] > 0]\nstep4.shape\n"

In [29]:
# you may need drop some columns
#data = step4.drop(['Unnamed: 0'], axis = 1)
print 'step4 shape', step4.shape
data = step4
#drop the first row, since the NaN value
data = data[1:]
print 'data shape', data.shape

step4 shape (46568, 219)
data shape (46567, 219)


# Save this one day as validation dataset 

In [30]:
data.to_csv(r'R:/Angela/fast_trips/internal dataset/SF_2012_Sep_20th_validation.csv')

In [None]:
print 'Congrats! go for a qucik run at Twin Peak!'