"""classical_models.ipynb
by: Archie Gertsman (arkadiy2@illinois.edu)
Project director: Richard Sowers
r-sowers@illinois.eduhttps://publish.illinois.edu/r-sowers/
Copyright 2019 University of Illinois Board of Trustees. All Rights Reserved. Licensed under the MIT license
"""

In [9]:
import sys
sys.path.append('../src/')
sys.path.append('../data/')
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from modeling_helpers import *

In [10]:
df = pd.read_pickle('../data/block4_updated.pkl')

agg_dict={
    'xtrack_dist': ['mean','std'],
    'avg_surr_speed': ['mean','std'],
    'lanes':['mean'],
    'len':['mean'],
    'speed':['mean','std'],
    'lon_acc': ['mean','std'],
    'lat_acc': ['mean','std']
}

df = df[list(agg_dict.keys()) + ['type']]
df.index = [df.index.map(lambda idx: f'{idx[0]}_{idx[1]}'),  
            df.index.get_level_values(2)]
df.index.names = ['id','road']

df = df.groupby(['id','road']).filter(lambda grp: len(grp) >= 50)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,xtrack_dist,avg_surr_speed,lanes,len,speed,lon_acc,lat_acc,type
id,road,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4_1_1,250699362_250699984,-1.883401,10.464171,5.4,97.581,11.9046,-0.1145,0.0138,Taxi
4_1_1,250699362_250699984,-1.980795,10.457843,5.4,97.581,11.8975,-0.1007,0.0147,Taxi
4_1_1,250699362_250699984,-1.937041,10.452857,5.4,97.581,11.8919,-0.0918,0.0157,Taxi
4_1_1,250699362_250699984,-1.893287,10.448586,5.4,97.581,11.8871,-0.0869,0.0167,Taxi
4_1_1,250699362_250699984,-1.941984,10.444986,5.4,97.581,11.8831,-0.0784,0.0176,Taxi
...,...,...,...,...,...,...,...,...,...
1_3_2116,300400248_8446047162,0.658316,26.005850,6.5,106.837,25.3482,-0.3011,0.3234,Car
1_3_2116,300400248_8446047162,0.764478,26.008150,6.5,106.837,25.3217,-0.3032,0.3216,Car
1_3_2116,300400248_8446047162,0.736857,26.010500,6.5,106.837,25.2946,-0.3012,0.3180,Car
1_3_2116,300400248_8446047162,0.783032,26.012950,6.5,106.837,25.2667,-0.3054,0.3124,Car


In [18]:
df_car = df[df.type == 'Car'].copy()
samp = df_car.reset_index()['id'].drop_duplicates().sample(frac = 0.5)
df_car.loc[df_car.reset_index('road', drop=True).index.isin(samp),'type']='Taxi'
df_car

Unnamed: 0_level_0,Unnamed: 1_level_0,xtrack_dist,avg_surr_speed,lanes,len,speed,lon_acc,lat_acc,type
id,road,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4_1_29,250706958_388172075,-1.091186,21.98620,6.0,87.080,21.9862,0.3065,0.1114,Taxi
4_1_29,250706958_388172075,-1.120755,22.03120,6.0,87.080,22.0312,0.3189,0.1173,Taxi
4_1_29,250706958_388172075,-1.076202,22.07790,6.0,87.080,22.0779,0.3297,0.1233,Taxi
4_1_29,250706958_388172075,-1.165108,22.12600,6.0,87.080,22.1260,0.3386,0.1293,Taxi
4_1_29,250706958_388172075,-1.120555,22.17620,6.0,87.080,22.1762,0.3579,0.1354,Taxi
...,...,...,...,...,...,...,...,...,...
1_3_2116,300400248_8446047162,0.658316,26.00585,6.5,106.837,25.3482,-0.3011,0.3234,Taxi
1_3_2116,300400248_8446047162,0.764478,26.00815,6.5,106.837,25.3217,-0.3032,0.3216,Taxi
1_3_2116,300400248_8446047162,0.736857,26.01050,6.5,106.837,25.2946,-0.3012,0.3180,Taxi
1_3_2116,300400248_8446047162,0.783032,26.01295,6.5,106.837,25.2667,-0.3054,0.3124,Taxi


In [5]:
def filter_by_percentile(df,percentile):
    # remove top and bottom 'percentile' of data from dataframe
    top_le = 1-(percentile/100)
    bottom_le = percentile/100
    df_top = df.quantile(top_le).reset_index()
    df_top['cond'] ='('+df_top['index']+" <= "+df_top[top_le].astype(str)+')'
    df_bottom = df.quantile(bottom_le).reset_index()
    df_bottom['cond'] ='('+df_bottom['index']+" >= "+df_bottom[bottom_le].astype(str)+')'
    df = df.query(df_top.cond.str.cat(sep=' & '))
    df = df.query(df_bottom.cond.str.cat(sep=' & '))
    
    return df 

In [6]:
df_agg = filter_by_percentile(df_agg,1)
df_agg_car = filter_by_percentile(df_agg_car,1)

In [21]:
def workflow(df):
    model = Pipeline([('scaler', StandardScaler()), ('gbm', GradientBoostingClassifier())])

    df_train,df_test = train_test_split_vehicles(df, test_ratio)

    X_train,y_train = get_xy(df_train, window, overlap, agg_dict, min_speed_ratio, balance_roads=True)
    X_test,y_test = get_xy(df_test, window, overlap, agg_dict, min_speed_ratio)

    model.fit(X_train, y_train)

    return accuracy(model, X_test, y_test)

In [22]:
for i in range(0,10):  
    print("car, taxi accuracy:", workflow(df))
    print("car, car accuracy:", workflow(df_car))

car, taxi accuracy: 0.6182432432432432
car, car accuracy: 0.4897959183673469
car, taxi accuracy: 0.6033898305084746
car, car accuracy: 0.502906976744186
car, taxi accuracy: 0.6216216216216216
car, car accuracy: 0.5072463768115942
car, taxi accuracy: 0.5966101694915255
car, car accuracy: 0.4492753623188406
car, taxi accuracy: 0.5952380952380952
car, car accuracy: 0.527536231884058
car, taxi accuracy: 0.6081081081081081
car, car accuracy: 0.48985507246376814
car, taxi accuracy: 0.6406779661016949
car, car accuracy: 0.5565217391304348
car, taxi accuracy: 0.6428571428571429
car, car accuracy: 0.5072886297376094
car, taxi accuracy: 0.6542372881355932
car, car accuracy: 0.49854227405247814
car, taxi accuracy: 0.6232876712328768
car, car accuracy: 0.5217391304347826


### KMeans

In [10]:
# for i in range(0,3):
#     for k in range(17,20):

#         (X_train,y_train), (X_test,y_test) = train_test_split_vehicles(df_agg, 150)

#         kmeans = KMeans(k).fit(X_train)
#         X_train['cluster'] = kmeans.labels_

#         car_ratios = np.zeros(k)
#         for c in range(k):
#             counts = y_train[X_train.cluster==c].value_counts()
#             ratio = counts['Car'] / (counts['Car'] + counts['Taxi'])
#             car_ratios[c] = ratio


#         c_car = car_ratios.argmax()
#         c_taxi = car_ratios.argmin()

#         c_hat = kmeans.predict(X_test)
#         X_test['cluster'] = c_hat
#         idx_confident = np.isin(X_test.cluster, [c_car,c_taxi])

#         y_test = y_test[idx_confident]
#         y_hat = pd.Series(c_hat[idx_confident]).map({c_car:'Car', c_taxi:'Taxi'})
#         y_hat.index = y_test.index

#         y_hat = y_hat.groupby(['id','road']).agg(lambda x: x.mode()[0])
#         y_hat = y_hat.groupby('id').agg(lambda x: x.mode()[0])

#         y_test = y_test.groupby('id').first()

#         print("",accuracy_score(y_test, y_hat))

#     print("\n")

 0.5034965034965035
 0.5616438356164384
 0.5929203539823009


 0.5259259259259259
 0.4925373134328358
 0.5


 0.4803921568627451
 0.5234899328859061
 0.34615384615384615


