"""classical_models.ipynb
by: Archie Gertsman (arkadiy2@illinois.edu)
Project director: Richard Sowers
r-sowers@illinois.edu, https://publish.illinois.edu/r-sowers/
Copyright 2019 University of Illinois Board of Trustees. All Rights Reserved. Licensed under the MIT license
"""

In [1]:
import sys
sys.path.append('../src/')
sys.path.append('../data/')
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from modeling_helpers import *

In [37]:
# Per-column statistics to compute when the trajectory samples are aggregated.
# The keys double as the list of feature columns kept from the raw frame.
agg_dict = {
    'xtrack_dist': ['std'],
    'avg_surr_speed': ['mean', 'std'],
    'lanes': ['mean'],
    'len': ['mean'],
    'speed': ['mean', 'std'],
    'speed_bool': ['mean'],
    'vehicle_density': ['mean'],
    'lon_acc': ['mean', 'std'],
    'lat_acc': ['mean', 'std'],
}
kept_columns = [*agg_dict, 'type']

# NOTE: read_pickle executes arbitrary code from the file; only load trusted data.
df = pd.read_pickle('../data/block4_updated.pkl')

# speed_bool is 1 where the sample has positive speed and 0 otherwise.
df = df.assign(speed_bool=df.speed.gt(0).astype(int))[kept_columns]

# Collapse the first two index levels into a single string id and keep the
# third level as the road key, giving a two-level (id, road) index.
df.index = pd.MultiIndex.from_arrays(
    [
        df.index.map(lambda key: f'{key[0]}_{key[1]}'),
        df.index.get_level_values(2),
    ],
    names=['id', 'road'],
)

# Discard (id, road) groups that have fewer than 50 rows.
min_rows = 50
df = df.groupby(['id', 'road']).filter(lambda rows: len(rows) >= min_rows)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,xtrack_dist,avg_surr_speed,lanes,len,speed,speed_bool,vehicle_density,lon_acc,lat_acc,type
id,road,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
4_1_1,250699362_250699984,-1.883401,10.464171,5.4,97.581,11.9046,1,7,-0.1145,0.0138,Taxi
4_1_1,250699362_250699984,-1.980795,10.457843,5.4,97.581,11.8975,1,7,-0.1007,0.0147,Taxi
4_1_1,250699362_250699984,-1.937041,10.452857,5.4,97.581,11.8919,1,7,-0.0918,0.0157,Taxi
4_1_1,250699362_250699984,-1.893287,10.448586,5.4,97.581,11.8871,1,7,-0.0869,0.0167,Taxi
4_1_1,250699362_250699984,-1.941984,10.444986,5.4,97.581,11.8831,1,7,-0.0784,0.0176,Taxi
...,...,...,...,...,...,...,...,...,...,...,...
1_3_2116,300400248_8446047162,0.658316,26.005850,6.5,106.837,25.3482,1,2,-0.3011,0.3234,Car
1_3_2116,300400248_8446047162,0.764478,26.008150,6.5,106.837,25.3217,1,2,-0.3032,0.3216,Car
1_3_2116,300400248_8446047162,0.736857,26.010500,6.5,106.837,25.2946,1,2,-0.3012,0.3180,Car
1_3_2116,300400248_8446047162,0.783032,26.012950,6.5,106.837,25.2667,1,2,-0.3054,0.3124,Car


In [38]:
# Aggregate the per-sample frame with the project helper `downsample`.
# NOTE(review): 50 and 0.3 are passed positionally; presumably window length
# and overlap fraction -- confirm against modeling_helpers.
df_agg = downsample(df, 50, 0.3, agg_dict)

# Keep rows whose mean speed_bool is at least 0.75 (i.e. speed > 0 in at least
# 75% of the aggregated samples), then remove that helper column.
# Chaining a non-mutating drop avoids modifying a filtered slice in place,
# which is what triggers pandas' SettingWithCopyWarning.
df_agg = df_agg[df_agg.speed_bool_mean >= 0.75].drop(columns='speed_bool_mean')
df_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,xtrack_dist_std,avg_surr_speed_mean,avg_surr_speed_std,lanes_mean,len_mean,speed_mean,speed_std,vehicle_density_mean,lon_acc_mean,lon_acc_std,lat_acc_mean,lat_acc_std,type
id,road,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1_2_1013,250691795_250699359,0.078220,35.321736,2.017878,13.0,171.188,38.236730,1.958387,4.60,0.961714,0.260654,0.233384,0.041331,Car
1_2_1013,250691795_250699359,0.252957,34.856470,1.595792,13.0,171.188,41.317574,0.636626,4.76,0.288190,0.326024,0.204148,0.048433,Car
1_2_1013,250691795_250699359,0.337519,37.134833,2.682016,13.0,171.188,41.828390,0.106399,4.42,0.029634,0.056158,-0.105316,0.200778,Car
1_2_1013,250691795_250699359,0.154550,39.273232,1.084950,13.0,171.188,41.278516,0.785931,3.88,-0.323640,0.282848,-0.149556,0.125057,Car
1_2_1013,250691795_250699359,0.107915,37.126010,1.979998,13.0,171.188,38.470946,1.490240,4.04,-0.712118,0.093721,-0.035446,0.024488,Car
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4_3_994,250706958_388172075,0.045035,24.210812,1.955157,6.0,87.080,23.724252,1.541217,1.08,0.686184,0.212786,0.278212,0.137730,Car
4_3_994,250706958_388172075,0.088273,25.925756,0.374007,6.0,87.080,25.925756,0.374007,1.00,0.227184,0.139680,-0.017088,0.091710,Car
4_3_994,250706958_388172075,0.040787,26.713990,0.276544,6.0,87.080,26.713990,0.276544,1.00,0.086160,0.155832,-0.008224,0.075023,Car
4_3_994,250706958_388172075,0.098736,26.676008,0.151060,6.0,87.080,26.676008,0.151060,1.00,-0.075006,0.064038,0.133912,0.035031,Car


In [39]:
# Supervised baseline: standardise the features and fit a gradient-boosted
# classifier on the training split, then report accuracy on the test split.
# NOTE(review): train_test_split_vehicles is a project helper; if it samples
# randomly it needs its own seed for the score below to be fully repeatable.
(X_train, y_train), (X_test, y_test) = train_test_split_vehicles(df_agg, 150)

model = Pipeline([
    ('scaler', StandardScaler()),
    # Fixed seed so refitting on the same split gives the same model.
    ('gbm', GradientBoostingClassifier(random_state=0)),
])
model.fit(X_train, y_train)

accuracy(model, X_test, y_test)

0.5333333333333333

### KMeans

In [40]:
# Unsupervised baseline: cluster the training features with k-means, call the
# cluster with the highest share of cars "Car" and the one with the lowest
# share "Taxi", and score only the test rows that fall in one of those two.
(X_train, y_train), (X_test, y_test) = train_test_split_vehicles(df_agg, 150)

k = 3

# Fixed seed so the centroid initialisation (and therefore the clusters) is
# repeatable across kernel restarts.
kmeans = KMeans(n_clusters=k, random_state=0).fit(X_train)
# assign() returns a new frame, so the frame returned by the split helper is
# not modified in place.
X_train = X_train.assign(cluster=kmeans.labels_)

# Share of cars among the Car/Taxi rows of each cluster. A cluster containing
# neither label keeps NaN and is ignored by the nan-aware argmax/argmin below.
car_ratios = np.full(k, np.nan)
for c in range(k):
    counts = y_train[kmeans.labels_ == c].value_counts()
    # .get(..., 0) avoids a KeyError when a cluster has no rows of one class.
    n_car = counts.get('Car', 0)
    n_taxi = counts.get('Taxi', 0)
    if n_car + n_taxi > 0:
        car_ratios[c] = n_car / (n_car + n_taxi)

c_car = np.nanargmax(car_ratios)
c_taxi = np.nanargmin(car_ratios)

c_hat = kmeans.predict(X_test)
X_test = X_test.assign(cluster=c_hat)
# Rows assigned to any other cluster are treated as undecided and not scored.
idx_confident = np.isin(c_hat, [c_car, c_taxi])

y_test = y_test[idx_confident]
y_hat = pd.Series(c_hat[idx_confident], index=y_test.index).map(
    {c_car: 'Car', c_taxi: 'Taxi'}
)


def _majority(votes):
    """Return the most frequent label; mode() is sorted, so ties pick the first."""
    return votes.mode()[0]


# Majority vote within each (id, road) group, then across the roads of each id.
y_hat = y_hat.groupby(['id', 'road']).agg(_majority)
y_hat = y_hat.groupby('id').agg(_majority)

# One true label per id (takes the first; assumes the label is constant per id).
y_test = y_test.groupby('id').first()

accuracy_score(y_test, y_hat)

0.5241379310344828

In [41]:
# Class balance of the test labels after they were reduced to one label per id.
y_test.value_counts()

Taxi    146
Car     144
Name: type, dtype: int64