"""classical_models.ipynb
by: Archie Gertsman (arkadiy2@illinois.edu)
Project director: Richard Sowers
r-sowers@illinois.edu
https://publish.illinois.edu/r-sowers/
Copyright 2019 University of Illinois Board of Trustees. All Rights Reserved. Licensed under the MIT license
"""

In [1]:
import warnings; warnings.simplefilter('ignore')

In [346]:
import sys
sys.path.append('../../Lib/')
import pandas as pd
import numpy as np
from feature_eng import split_trajectories
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from time import time

In [347]:
# Load the pickled frame, append 'edge_id' as an extra (last) index level,
# then swap the last two levels so 'edge_id' sits before the level that was
# previously last.
df = (
    pd.read_pickle('../../Data/block4_concat_lane.pkl')
    .set_index('edge_id', append=True)
    .reorder_levels((0, 1, 3, 2))
)

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,lat,lon,speed,lon_acc,lat_acc,type,traveled_d,avg_speed,bearing,nearest_edge_start_node,...,dir,xtrack_dist,time_stamp,edge_progress_intervals,len,lanes,node_veh_dist,edge_seg,vehicle_density,avg_surr_speed
file_name,id,edge_id,time,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
4_1,1,250699362_250699984,42.0,37.982746,23.732961,11.9046,-0.1145,0.0138,Taxi,182.37,9.740748,1.570795,250699362,...,0,-1.883401,42.0,0.3,97.581,5.4,29.81433,1.0,7,10.464171
4_1,1,250699362_250699984,42.04,37.982746,23.732963,11.8975,-0.1007,0.0147,Taxi,182.37,9.740748,0.168572,250699362,...,0,-1.980795,42.04,0.3,97.581,5.4,29.67483,1.0,7,10.457843
4_1,1,250699362_250699984,42.08,37.982747,23.732964,11.8919,-0.0918,0.0157,Taxi,182.37,9.740748,0.168573,250699362,...,0,-1.937041,42.08,0.3,97.581,5.4,29.537753,1.0,7,10.452857
4_1,1,250699362_250699984,42.12,37.982748,23.732965,11.8871,-0.0869,0.0167,Taxi,182.37,9.740748,1.570796,250699362,...,0,-1.893287,42.12,0.3,97.581,5.4,29.400718,1.0,7,10.448586
4_1,1,250699362_250699984,42.16,37.982748,23.732966,11.8831,-0.0784,0.0176,Taxi,182.37,9.740748,0.32808,250699362,...,0,-1.941984,42.16,0.3,97.581,5.4,29.330986,1.0,7,10.444986


In [348]:
def rolling_agg(df, agg_dict, window_size=100, step=25):
    """Aggregate rolling windows within each group and attach the group's 'type'.

    Rows are grouped by every index level except the last. Inside each group
    a rolling window of ``window_size`` rows is aggregated with ``agg_dict``,
    rows containing NaN are dropped, the result is thinned by keeping every
    ``step``-th row per group, and the first 'type' value of each original
    group is joined on.

    Args:
        df: frame to aggregate; must have a 'type' column (read as
            ``df.type``) and an index whose level names can be grouped on.
        agg_dict: mapping handed to ``.agg()``. The column flattening below
            joins each column label with '_', so it expects tuple labels,
            i.e. list-valued entries such as ``{'speed': ['mean', 'std']}``.
        window_size: window length handed to ``.rolling()``.
        step: stride used when thinning the rolling results (``x[::step]``).

    Returns:
        The thinned aggregate frame with flattened column names
        (``'<column>_<agg>'``) joined with a 'type' column.

    NOTE(review): the thinning step groups by *all* of ``df_agg``'s index
    level names, not by ``df.index.names[:-1]``. Whether that yields one
    group per trajectory (so that ``step`` has an effect) depends on the
    index layout produced by ``groupby().rolling()`` in the installed
    pandas -- TODO confirm.
    """
    # Rolling aggregation evaluated at every row (stride 1) inside each group;
    # dropna() discards rows with missing values -- presumably the leading
    # windows that are not yet full.
    df_agg = df.groupby(df.index.names[:-1]) \
                .rolling(window_size) \
                .agg(agg_dict) \
                .dropna()
    
    # Keep every `step`-th row of each group to emulate a custom stride.
    df_agg = df_agg.groupby(df_agg.index.names, 
                            as_index=False, 
                            group_keys=False) \
                .apply(lambda x: x[::step])
    
    # Flatten (column, aggregation) labels into 'column_aggregation' strings.
    df_agg.columns = ['_'.join(col) for col in df_agg.columns]
    
    # Add the 'type' column: first 'type' value of each original group.
    vehicle_types = df.type.groupby(df.index.names[:-1]).first()
    return df_agg.join(vehicle_types)
    

In [367]:
# Thresholds for the trajectory filter applied when building df_train.
min_speed_ratio = 0.75  # minimum value of speed_ratio() a trajectory must reach
min_traj_len = 300      # minimum number of rows a trajectory must contain

def speed_ratio(grp, min_speed=0):
    """Return the fraction of rows in ``grp`` whose ``speed`` exceeds ``min_speed``.

    Args:
        grp: frame exposing a ``speed`` column.
        min_speed: strict lower bound a row's speed must exceed to be counted.

    Returns:
        Number of rows with ``speed > min_speed`` divided by the total number
        of rows, as a float.

    Raises:
        ZeroDivisionError: if ``grp`` has no rows.
    """
    above_threshold = grp.speed > min_speed
    return int(above_threshold.sum()) / len(grp)


# Keep only trajectories (groups over every index level except the last) that
# have at least min_traj_len rows and whose speed_ratio reaches
# min_speed_ratio. Plain `and` is used because both operands are scalar bools;
# it also skips the speed_ratio computation for trajectories that are too short.
df_train = df.groupby(df.index.names[:-1]).filter(
    lambda grp: len(grp) >= min_traj_len
    and speed_ratio(grp) >= min_speed_ratio
)


# Non-overlapping windows: the stride equals the window length.
df_train_agg = rolling_agg(df_train, window_size=150, step=150, agg_dict={
    'xtrack_dist': ['std'],
    'avg_surr_speed': ['mean', 'std'],
#     'lanes':['median'],
    'len': ['median'],
    'speed': ['mean', 'std'],
    'vehicle_density': ['median'],
    'lon_acc': ['mean', 'std'],
    'lat_acc': ['mean', 'std'],
})

# Balance the classes by down-sampling every vehicle type to the size of the
# rarest one. The target size is the same for every group, so it is computed
# once here instead of inside each apply() call.
# NOTE: sample() is called without random_state, so the rows drawn differ
# between runs.
g = df_train_agg.groupby('type', group_keys=False)
n_per_class = g.size().min()
df_train_agg = g.apply(lambda grp: grp.sample(n_per_class))

# Features are every aggregated column; the label is the vehicle type.
X, y = df_train_agg.drop('type', axis=1), df_train_agg.type
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,xtrack_dist_std,avg_surr_speed_mean,avg_surr_speed_std,len_median,speed_mean,speed_std,vehicle_density_median,lon_acc_mean,lon_acc_std,lat_acc_mean,lat_acc_std
file_name,id,edge_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2_4,1724,250700083_250700084,0.201693,29.512269,3.120714,115.003,28.708367,3.337739,2.0,-2.455560e-01,0.883798,4.269467e-02,0.080279
2_1,413,250700083_388172056,0.213179,8.562221,2.550699,77.404,12.171530,5.046953,1.0,-7.388033e-01,0.270407,1.848800e-02,0.094524
4_3,610,250699613_250699614,0.632994,23.962951,6.181683,111.016,24.386715,4.743964,2.0,-3.618980e-01,1.135285,-6.187200e-02,0.218619
4_1,768,250699362_250699984,0.086513,8.545655,4.210971,97.581,7.044725,5.863809,4.0,7.496853e-01,0.719642,1.331200e-02,0.012717
1_2,464,250691795_250699359,0.132861,28.248498,4.057700,171.188,27.020399,1.886277,5.0,1.225987e-01,0.490441,8.456000e-03,0.138126
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4_2,2,250699362_250708641,0.000000,0.000067,0.000024,112.555,0.000134,0.000048,2.0,4.736952e-17,0.000000,-3.053113e-18,0.000000
1_2,1072,250700084_250700085,0.032849,0.352043,0.418820,77.438,0.505264,0.730617,3.0,-5.642000e-03,0.329546,1.131933e-02,0.024324
2_5,250,250699362_250708641,0.067254,2.919351,1.596806,112.555,4.120228,3.455727,4.0,2.643327e-01,0.689821,-2.864200e-02,0.088096
2_4,1398,250699613_250699614,0.178047,3.553470,0.681385,111.016,4.111007,0.655643,2.0,-5.397467e-02,0.242352,-6.207333e-03,0.049777


In [369]:
kf = StratifiedKFold(n_splits=5, shuffle=True)
model = GradientBoostingClassifier()

# One accuracy per fold. The array is sized from the splitter itself so that
# changing n_splits cannot cause an IndexError or leave unused zeros that
# would distort the mean/std reported below.
accs = np.zeros(kf.get_n_splits())

# NOTE(review): the split is made over rows of X (individual windows), while
# the score below is computed per index key after a majority vote. Windows
# that share an index key can therefore land in both the training and the
# test part of the same fold -- consider a group-aware splitter.
for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)

    # Per-window predictions, indexed like the test rows they belong to.
    y_hat = model.predict(X_test)
    y_hat = pd.DataFrame(index=X_test.index, data=y_hat, columns=['type'])

    # Collapse to one prediction per index key: the most frequent predicted
    # label (on a tie, the first value returned by mode()). The true labels
    # are collapsed over the same keys, so both results line up row by row.
    y_hat = y_hat.groupby(y_hat.index.names).agg(lambda x: x.mode()[0])
    y_test = y_test.groupby(y_test.index.names).agg('first')

    accs[i] = accuracy_score(y_test, y_hat)
    print(accs[i])

accs.mean(), accs.std()

0.5560747663551402
0.543778801843318
0.5537383177570093
0.5879629629629629
0.5487528344671202


(0.55806153667711, 0.015539505933190784)