"""classical_models.ipynb
by: Archie Gertsman (arkadiy2@illinois.edu)
Project director: Richard Sowers
r-sowers@illinois.edu https://publish.illinois.edu/r-sowers/
Copyright 2019 University of Illinois Board of Trustees. All Rights Reserved. Licensed under the MIT license
"""

In [1]:
import sys
sys.path.append('../src/')
sys.path.append('../data/')
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from modeling_helpers import *

In [2]:
# Statistics requested per feature column; this dict is later handed to
# `downsample`, and its keys double as the list of feature columns to keep.
agg_dict = {
    'xtrack_dist': ['mean', 'std'],
    'avg_surr_speed': ['mean', 'std'],
    'lanes': ['mean'],
    'len': ['mean'],
    'speed': ['mean', 'std'],
    'lon_acc': ['mean', 'std'],
    'lat_acc': ['mean', 'std'],
}
feature_cols = list(agg_dict)

# Load the pickled frame and keep only the feature columns plus the label.
df = pd.read_pickle('../data/block4_updated.pkl')
df = df[feature_cols + ['type']]

# Rebuild the index as two levels: the first two original levels joined with
# an underscore become 'id', and the third original level becomes 'road'.
vehicle_ids = df.index.map(lambda key: f'{key[0]}_{key[1]}')
road_ids = df.index.get_level_values(2)
df.index = pd.MultiIndex.from_arrays([vehicle_ids, road_ids], names=['id', 'road'])

# Drop every (id, road) group that has fewer than 50 rows.
# NOTE(review): 50 equals the `window` passed to `downsample` below --
# presumably so each kept group can fill at least one window; confirm.
per_trajectory = df.groupby(['id', 'road'])
df = per_trajectory.filter(lambda rows: len(rows) >= 50)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,xtrack_dist,avg_surr_speed,lanes,len,speed,lon_acc,lat_acc,type
id,road,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4_1_1,250699362_250699984,-1.883401,10.464171,5.4,97.581,11.9046,-0.1145,0.0138,Taxi
4_1_1,250699362_250699984,-1.980795,10.457843,5.4,97.581,11.8975,-0.1007,0.0147,Taxi
4_1_1,250699362_250699984,-1.937041,10.452857,5.4,97.581,11.8919,-0.0918,0.0157,Taxi
4_1_1,250699362_250699984,-1.893287,10.448586,5.4,97.581,11.8871,-0.0869,0.0167,Taxi
4_1_1,250699362_250699984,-1.941984,10.444986,5.4,97.581,11.8831,-0.0784,0.0176,Taxi
...,...,...,...,...,...,...,...,...,...
1_3_2116,300400248_8446047162,0.658316,26.005850,6.5,106.837,25.3482,-0.3011,0.3234,Car
1_3_2116,300400248_8446047162,0.764478,26.008150,6.5,106.837,25.3217,-0.3032,0.3216,Car
1_3_2116,300400248_8446047162,0.736857,26.010500,6.5,106.837,25.2946,-0.3012,0.3180,Car
1_3_2116,300400248_8446047162,0.783032,26.012950,6.5,106.837,25.2667,-0.3054,0.3124,Car


In [3]:
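# Disabled car-vs-car baseline: take the 'Car' rows, sample half of the
# vehicle ids at random and relabel those vehicles as 'Taxi'. The resulting
# `df_car` is what the "car, car accuracy" runs further down expect, so this
# cell must be re-enabled before running them.
# df_car = df[df.type == 'Car'].copy()
# samp = df_car.reset_index()['id'].drop_duplicates().sample(frac = 0.5)
# df_car.loc[df_car.reset_index('road', drop=True).index.isin(samp),'type']='Taxi'
# df_car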

In [7]:
# Turn the per-row frame into aggregated feature rows using the statistics
# listed in `agg_dict`.
# NOTE(review): the exact meaning of `window`, `overlap` and
# `min_speed_ratio` is defined by `downsample` (presumably from
# modeling_helpers) -- confirm there before changing these values.
df_agg = downsample(
    df,
    window=50,
    overlap=0.3,
    min_speed_ratio=0.75,
    agg_dict=agg_dict,
)
df_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,xtrack_dist_mean,xtrack_dist_std,avg_surr_speed_mean,avg_surr_speed_std,lanes_mean,len_mean,speed_mean,speed_std,lon_acc_mean,lon_acc_std,lat_acc_mean,lat_acc_std,type
id,road,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1_2_1013,250691795_250699359,-3.140617,0.078220,35.321736,2.017878,13.0,171.188,38.236730,1.958387,0.961714,0.260654,0.233384,0.041331,Car
1_2_1013,250691795_250699359,-3.520525,0.252957,34.856470,1.595792,13.0,171.188,41.317574,0.636626,0.288190,0.326024,0.204148,0.048433,Car
1_2_1013,250691795_250699359,-4.265499,0.337519,37.134833,2.682016,13.0,171.188,41.828390,0.106399,0.029634,0.056158,-0.105316,0.200778,Car
1_2_1013,250691795_250699359,-4.852599,0.154550,39.273232,1.084950,13.0,171.188,41.278516,0.785931,-0.323640,0.282848,-0.149556,0.125057,Car
1_2_1013,250691795_250699359,-5.148796,0.107915,37.126010,1.979998,13.0,171.188,38.470946,1.490240,-0.712118,0.093721,-0.035446,0.024488,Car
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4_3_994,250706958_388172075,-2.149278,0.045035,24.210812,1.955157,6.0,87.080,23.724252,1.541217,0.686184,0.212786,0.278212,0.137730,Car
4_3_994,250706958_388172075,-2.033916,0.088273,25.925756,0.374007,6.0,87.080,25.925756,0.374007,0.227184,0.139680,-0.017088,0.091710,Car
4_3_994,250706958_388172075,-1.900893,0.040787,26.713990,0.276544,6.0,87.080,26.713990,0.276544,0.086160,0.155832,-0.008224,0.075023,Car
4_3_994,250706958_388172075,-1.768401,0.098736,26.676008,0.151060,6.0,87.080,26.676008,0.151060,-0.075006,0.064038,0.133912,0.035031,Car


In [8]:
def workflow(df, test_ratio=0.2):
    """Fit a scaled gradient-boosting classifier on one split and score it.

    The frame is split with ``train_test_split_vehicles``, a fresh
    ``StandardScaler`` + ``GradientBoostingClassifier`` pipeline is fitted on
    the training part, and the held-out part is scored with ``accuracy``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split and train on. Pass the aggregated frame (e.g.
        ``df_agg``); previously this argument was ignored and the global
        ``df_agg`` was always used.
        NOTE(review): the expected layout is whatever
        ``train_test_split_vehicles`` requires -- confirm in modeling_helpers.
    test_ratio : float, optional
        Forwarded unchanged to ``train_test_split_vehicles``; presumably the
        fraction held out for testing. Defaults to 0.2, the value used by the
        explicit calls in this notebook.

    Returns
    -------
    The value returned by ``accuracy(model, X_test, y_test)``.
    """
    # Split the frame that was actually passed in, not a notebook global.
    (X_train, y_train), (X_test, y_test) = train_test_split_vehicles(df, test_ratio)

    # A new pipeline per call, so repeated runs never share fitted state.
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('gbm', GradientBoostingClassifier()),
    ])
    model.fit(X_train, y_train)

    return accuracy(model, X_test, y_test)

In [9]:
# Repeat the train/evaluate workflow on independent splits and report the
# spread of the resulting accuracies.
n_runs = 10
accs = np.zeros(n_runs)

for i in range(n_runs):
    # Pass the aggregated frame explicitly: that is the data the model is
    # meant to be trained on, rather than the raw per-row `df`.
    accs[i] = workflow(df_agg, test_ratio=0.2)
    print(accs[i])

print('car v taxi:', accs.mean(), accs.std())

0.6054421768707483
0.6530612244897959
0.6462585034013606
0.6938775510204082
0.6190476190476191
0.6326530612244898
0.6700680272108843
0.6326530612244898
0.6666666666666666
0.6054421768707483
car v taxi: 0.6425170068027211 0.02756781213326819


In [45]:
# Compare the real car-vs-taxi task against a car-vs-car baseline in which
# the labels carry no information.
# NOTE(review): `df_car` is only created by the commented-out relabeling
# cell (In [3]); re-enable that cell before running this one.
n_runs = 10
accs = np.zeros(n_runs)
base = np.zeros(n_runs)

# Aggregate the baseline frame with the same settings used for `df_agg`, so
# both tasks are evaluated on identically prepared features.
df_car_agg = downsample(
    df_car,
    window=50,
    overlap=0.3,
    min_speed_ratio=0.75,
    agg_dict=agg_dict,
)

for i in range(n_runs):
    # `test_ratio` is passed explicitly because `workflow` requires it.
    # NOTE(review): 0.2 mirrors the other evaluation loop -- confirm it is
    # the ratio originally used for these numbers.
    accs[i] = workflow(df_agg, test_ratio=0.2)
    print("car, taxi accuracy:", accs[i])

    base[i] = workflow(df_car_agg, test_ratio=0.2)
    print("car, car accuracy:", base[i], '\n')

print('car v taxi:', accs.mean(), accs.std())
print('car v car:', base.mean(), base.std())

car, taxi accuracy: 0.6091549295774648
car, car accuracy: 0.5030120481927711 

car, taxi accuracy: 0.6148409893992933
car, car accuracy: 0.4570552147239264 

car, taxi accuracy: 0.5907473309608541
car, car accuracy: 0.524390243902439 

car, taxi accuracy: 0.648936170212766
car, car accuracy: 0.5398773006134969 

car, taxi accuracy: 0.6334519572953736
car, car accuracy: 0.46646341463414637 

car, taxi accuracy: 0.5950704225352113
car, car accuracy: 0.5152439024390244 

car, taxi accuracy: 0.5964912280701754
car, car accuracy: 0.5104477611940299 

car, taxi accuracy: 0.6321428571428571
car, car accuracy: 0.4652567975830816 

car, taxi accuracy: 0.5644599303135889
car, car accuracy: 0.5197568389057751 

car, taxi accuracy: 0.6341463414634146
car, car accuracy: 0.47575757575757577 

with filtering:
car v taxi: 0.6119442156970999 0.024445170423738383
car v car: 0.49772610979462656 0.02763703514547632
