"""classical_models.ipynb
by: Archie Gertsman (arkadiy2@illinois.edu)
Project director: Richard Sowers
r-sowers@illinois.edu, https://publish.illinois.edu/r-sowers/
Copyright 2019 University of Illinois Board of Trustees. All Rights Reserved. Licensed under the MIT license
"""

In [1]:
import sys
sys.path.append('../src/')
sys.path.append('../data/')
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from modeling_helpers import *
import optuna

In [2]:
# Load the per-row trajectory table.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this project.
df = pd.read_pickle('../data/block4_updated.pkl')

# Feature column -> list of statistics requested for it. The same mapping is
# passed to downsample() further down in the notebook.
agg_dict = dict(
    xtrack_dist=['mean', 'std'],
    avg_surr_speed=['mean', 'std'],
    lanes=['mean'],
    len=['mean'],
    speed=['mean', 'std'],
    lon_acc=['mean', 'std'],
    lat_acc=['mean', 'std'],
)

# Keep only the feature columns plus the 'type' column.
df = df[[*agg_dict, 'type']]

# Rebuild the index as two levels: the first two original levels joined into
# one string ("<level0>_<level1>") named 'id', and the third original level
# carried over unchanged as 'road'.
df.index = pd.MultiIndex.from_arrays(
    [
        df.index.map(lambda key: f'{key[0]}_{key[1]}'),
        df.index.get_level_values(2),
    ],
    names=['id', 'road'],
)

# Drop every (id, road) group that has fewer than 50 rows.
df = df.groupby(['id', 'road']).filter(lambda rows: len(rows) >= 50)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,xtrack_dist,avg_surr_speed,lanes,len,speed,lon_acc,lat_acc,type
id,road,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4_1_1,250699362_250699984,-1.883401,10.464171,5.4,97.581,11.9046,-0.1145,0.0138,Taxi
4_1_1,250699362_250699984,-1.980795,10.457843,5.4,97.581,11.8975,-0.1007,0.0147,Taxi
4_1_1,250699362_250699984,-1.937041,10.452857,5.4,97.581,11.8919,-0.0918,0.0157,Taxi
4_1_1,250699362_250699984,-1.893287,10.448586,5.4,97.581,11.8871,-0.0869,0.0167,Taxi
4_1_1,250699362_250699984,-1.941984,10.444986,5.4,97.581,11.8831,-0.0784,0.0176,Taxi
...,...,...,...,...,...,...,...,...,...
1_3_2116,300400248_8446047162,0.658316,26.005850,6.5,106.837,25.3482,-0.3011,0.3234,Car
1_3_2116,300400248_8446047162,0.764478,26.008150,6.5,106.837,25.3217,-0.3032,0.3216,Car
1_3_2116,300400248_8446047162,0.736857,26.010500,6.5,106.837,25.2946,-0.3012,0.3180,Car
1_3_2116,300400248_8446047162,0.783032,26.012950,6.5,106.837,25.2667,-0.3054,0.3124,Car


In [3]:
# Build the aggregated feature table from the filtered frame.
# downsample() is not defined in this notebook (presumably it comes from the
# modeling_helpers star import); the meaning of window / overlap /
# min_speed_ratio is defined there -- confirm against the helper.
df_agg = downsample(
    df,
    window=50,
    overlap=0.3,
    min_speed_ratio=0.75,
    agg_dict=agg_dict,
)

In [5]:
%%time

print('car vs. taxi:')
accs_mean,accs_std = workflow(
    df_agg        = df_agg, 
    model         = Pipeline([('scaler', StandardScaler()), ('gbm', GradientBoostingClassifier())]), 
#     splitter_obj  = StratifiedKFold(shuffle=True), 
    splitter_obj  = RepeatedStratifiedKFold(5,3),
#     metric        = confusion_matrix, 
    metric        = f1_score,
    metric_kwargs = {'pos_label':'Car'},
    balance_train = 'overall',
    balance_test  = False,
    parallel      = True
)
accs_mean,accs_std

car vs. taxi:




CPU times: user 155 ms, sys: 95.3 ms, total: 251 ms
Wall time: 55.7 s


(0.7282081257878964, 0.019652612113353868)