"""classical_models.ipynb
by: Archie Gertsman (arkadiy2@illinois.edu)
Project director: Richard Sowers
r-sowers@illinois.edu, https://publish.illinois.edu/r-sowers/
Copyright 2019 University of Illinois Board of Trustees. All Rights Reserved. Licensed under the MIT license
"""

In [1]:
import warnings

# Silence every warning for the rest of the notebook session.
warnings.simplefilter('ignore')

In [84]:
import sys
sys.path.append('../../src/')
sys.path.append('../../data/')
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score
from time import time

### load data

In [85]:
# Load the per-timestep table.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted location.
df = pd.read_pickle('../../data/block4_updated.pkl')

# Raw column -> statistics that are kept as features for each window.
agg_dict = {
    'xtrack_dist': ['std'],
    'avg_surr_speed': ['mean', 'std'],
    'lanes': ['mean'],
    'len': ['mean'],
    'speed': ['mean', 'std'],
    'vehicle_density': ['mean'],
    'lon_acc': ['mean', 'std'],
    'lat_acc': ['mean', 'std'],
}

# Keep only the feature source columns plus the class label.
raw_columns = list(agg_dict)
df = df[raw_columns + ['type']]

# Rebuild the index: the first two levels are fused into one string id,
# levels 2 and 3 are carried over unchanged.
vehicle_id = df.index.map(lambda idx: f'{idx[0]}_{idx[1]}')
road_level = df.index.get_level_values(2)
time_level = df.index.get_level_values(3)
df.index = [vehicle_id, road_level, time_level]
df.index.names = ['id', 'road', 'time']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,xtrack_dist,avg_surr_speed,lanes,len,speed,vehicle_density,lon_acc,lat_acc,type
id,road,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
4_1_1,250699362_250699984,42.00,-1.883401,10.464171,5.4,97.581,11.9046,7,-0.1145,0.0138,Taxi
4_1_1,250699362_250699984,42.04,-1.980795,10.457843,5.4,97.581,11.8975,7,-0.1007,0.0147,Taxi
4_1_1,250699362_250699984,42.08,-1.937041,10.452857,5.4,97.581,11.8919,7,-0.0918,0.0157,Taxi
4_1_1,250699362_250699984,42.12,-1.893287,10.448586,5.4,97.581,11.8871,7,-0.0869,0.0167,Taxi
4_1_1,250699362_250699984,42.16,-1.941984,10.444986,5.4,97.581,11.8831,7,-0.0784,0.0176,Taxi
...,...,...,...,...,...,...,...,...,...,...,...
1_3_2116,300400248_8446047162,921.64,0.658316,26.005850,6.5,106.837,25.3482,2,-0.3011,0.3234,Car
1_3_2116,300400248_8446047162,921.68,0.764478,26.008150,6.5,106.837,25.3217,2,-0.3032,0.3216,Car
1_3_2116,300400248_8446047162,921.72,0.736857,26.010500,6.5,106.837,25.2946,2,-0.3012,0.3180,Car
1_3_2116,300400248_8446047162,921.76,0.783032,26.012950,6.5,106.837,25.2667,2,-0.3054,0.3124,Car


In [86]:
# One '<column>_<statistic>' name per entry of agg_dict, in agg_dict order.
feature_list = [
    f'{key}_{val}'
    for key, vals in agg_dict.items()
    for val in vals
]
feature_list

['xtrack_dist_std',
 'avg_surr_speed_mean',
 'avg_surr_speed_std',
 'lanes_mean',
 'len_mean',
 'speed_mean',
 'speed_std',
 'vehicle_density_mean',
 'lon_acc_mean',
 'lon_acc_std',
 'lat_acc_mean',
 'lat_acc_std']

### create df_agg

In [609]:
%%time

window = 60     # number of consecutive rows in one window
overlap = 0.3   # fraction of a window shared with the following window

# Distance (in rows) between the starts of two consecutive windows.
step = int((1-overlap)*window)

def f(grp):
    """Summarise one group of rows with sliding-window means and stds.

    The group is cut into windows of ``window`` rows whose starts are
    ``step`` rows apart. For each window the column-wise mean and the
    column-wise standard deviation (NumPy default, i.e. population std)
    are computed.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows of a single group; all columns are aggregated.

    Returns
    -------
    pandas.DataFrame or None
        One row per window. The first ``grp.shape[1]`` columns hold the
        means, the remaining ones the stds, both in the column order of
        ``grp``. ``None`` when the group is shorter than one window.
    """
    # "+ 1" keeps the window that ends exactly on the last row; without it a
    # group of exactly `window` rows produced no window at all.
    starts = range(0, len(grp) - window + 1, step)
    lst = [grp.iloc[i:i+window].to_numpy() for i in starts]
    if not lst:
        return None
    # Shape: (n_windows, window, n_columns).
    a = np.stack(lst)
    a = np.concatenate((a.mean(axis=1), a.std(axis=1)), axis=1)
    return pd.DataFrame(a)

# Window every (id, road) group; the label column is excluded from the
# aggregation.
features = df.drop(columns='type')
df_agg = features.groupby(['id', 'road']).apply(f)

# f returns all means first and all stds second, each in column order.
df_agg.columns = [
    f'{col}_{agg}'
    for agg in ('mean', 'std')
    for col in features.columns
]
df_agg = df_agg[feature_list]

# Attach the label of each (id, road) group after dropping the innermost
# (per-window) index level.
vehicle_types = df['type'].groupby(['id', 'road']).first()
df_agg = df_agg.reset_index(-1, drop=True).join(vehicle_types)

df_agg

CPU times: user 14.3 s, sys: 483 ms, total: 14.8 s
Wall time: 14.8 s


Unnamed: 0_level_0,Unnamed: 1_level_0,xtrack_dist_std,avg_surr_speed_mean,avg_surr_speed_std,lanes_mean,len_mean,speed_mean,speed_std,vehicle_density_mean,lon_acc_mean,lon_acc_std,lat_acc_mean,lat_acc_std,type
id,road,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1_2_1013,250691795_250699359,0.122811,35.059769,1.933372,13.0,171.188,38.784775,2.168499,4.516667,0.866777,0.324213,0.233228,0.038109,Car
1_2_1013,250691795_250699359,0.370038,35.278806,2.120265,13.0,171.188,41.595718,0.342866,4.550000,0.166950,0.255385,0.111577,0.166141,Car
1_2_1013,250691795_250699359,0.289755,38.755763,1.861978,13.0,171.188,41.682442,0.374225,3.800000,-0.139460,0.263612,-0.174677,0.113386,Car
1_2_1013,250691795_250699359,0.131347,37.777329,2.013567,13.0,171.188,39.320633,1.649412,3.850000,-0.655498,0.125555,-0.048875,0.041278,Car
1_2_1013,250691795_250699359,0.072858,34.481129,1.851485,13.0,171.188,34.691568,2.136059,4.183333,-0.851812,0.046481,-0.043438,0.014847,Car
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4_3_994,250706958_388172075,0.363527,20.812448,2.786976,6.0,87.080,20.406982,1.959130,1.066667,0.809315,0.064972,0.639498,0.265865,Car
4_3_994,250706958_388172075,0.068874,25.045568,1.526220,6.0,87.080,24.640102,1.353915,1.066667,0.506415,0.322240,0.163750,0.166080,Car
4_3_994,250706958_388172075,0.069196,26.424028,0.420449,6.0,87.080,26.424028,0.420449,1.000000,0.135133,0.093416,-0.041227,0.052561,Car
4_3_994,250706958_388172075,0.088929,26.748915,0.167431,6.0,87.080,26.748915,0.167431,1.000000,-0.032203,0.089135,0.102667,0.063771,Car


### create df_agg_test by selecting subset from df_agg

In [610]:
test_size = 150  # number of ids drawn per class

# Index by id only, so that selecting an id pulls in all of its rows.
df_reset = df_agg.reset_index(level=1)

df_list = []
for t in ('Car', 'Taxi'):
    # Distinct ids whose rows carry this class label.
    ids_of_class = df_reset.loc[df_reset.type == t].index.unique().to_series()
    idx = ids_of_class.sample(test_size).values
    df_list.append(df_reset.loc[idx])

# Restore the (id, road) index used by df_agg.
df_agg_test = pd.concat(df_list).set_index('road', append=True)
df_agg_test

Unnamed: 0_level_0,Unnamed: 1_level_0,xtrack_dist_std,avg_surr_speed_mean,avg_surr_speed_std,lanes_mean,len_mean,speed_mean,speed_std,vehicle_density_mean,lon_acc_mean,lon_acc_std,lat_acc_mean,lat_acc_std,type
id,road,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
4_2_548,250691795_250699359,1.298570e-01,30.477133,3.468300,13.0,171.188,34.428525,0.744232,1.800000,0.317982,0.274318,-0.132857,0.062013,Car
4_2_548,250691795_250699359,7.046338e-02,32.611561,2.735729,13.0,171.188,34.312080,0.988924,1.450000,-0.271460,0.311266,0.045858,0.067521,Car
4_2_548,250691795_250699359,1.475961e-01,33.134488,0.434294,13.0,171.188,32.833403,0.199601,1.283333,-0.090353,0.198205,0.024790,0.051165,Car
4_2_548,250691795_250699359,1.369450e-01,33.176370,3.214610,13.0,171.188,34.414890,1.933998,1.633333,0.714038,0.635446,-0.022787,0.007811,Car
4_2_548,250691795_250699359,1.156415e-01,34.626345,4.662314,13.0,171.188,40.029502,2.139049,2.050000,0.754143,0.663672,-0.082842,0.055219,Car
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2_2_1104,250699362_250699984,8.881784e-16,0.001784,0.005182,5.4,97.581,0.000000,0.000000,3.000000,0.000003,0.000018,0.000000,0.000000,Taxi
2_2_1104,250699362_250699984,8.881784e-16,0.772122,0.923498,5.4,97.581,0.224767,0.516831,3.000000,0.303548,0.578945,-0.001758,0.005176,Taxi
2_2_1104,250699362_250699984,6.365605e-02,5.072059,2.376686,5.4,97.581,5.407220,3.791094,2.683333,1.383558,0.569641,-0.067495,0.054407,Taxi
2_2_1104,250699362_250699984,7.148317e-02,10.003212,2.493481,5.4,97.581,12.545518,2.071278,1.933333,0.860790,0.194831,-0.019510,0.050690,Taxi


In [611]:
# Number of held-out ids per class.
df_agg_test.groupby('id')['type'].first().value_counts()

Car     150
Taxi    150
Name: type, dtype: int64

In [612]:
# Number of ids per class in the full data set.
df.groupby('id')['type'].first().value_counts()

Car     1794
Taxi     756
Name: type, dtype: int64

### create df_agg_train by selecting df_agg - df_agg_test

In [613]:
# Training pool: every row of df_agg whose (id, road) label is not part of
# the held-out set.
df_agg_train = df_agg.drop(index=df_agg_test.index)

def balance_road(road):
    """Oversample the rarest class within one road.

    Classes are counted per id (one vote per id, taken from its first row).
    The difference between the largest and the smallest class count is the
    number of ids drawn, with replacement, from the smallest class; all rows
    of every drawn id are appended to the data.

    Parameters
    ----------
    road : pandas.DataFrame
        Rows of a single road, indexed by ('id', 'road'), with a 'type'
        column. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The input rows followed by the resampled rows, indexed by 'id' only.
    """
    class_counts = road.groupby('id').first().type.value_counts()
    n_resample = class_counts.max() - class_counts.min()
    minority = class_counts.idxmin()

    # Re-index on a new object instead of in place: the group handed in by
    # groupby.apply must not be mutated by the applied function.
    by_id = road.reset_index('road', drop=True)

    minority_ids = by_id[by_id.type == minority].index.unique().to_series()
    idx_resample = minority_ids.sample(n_resample, replace=True).values
    resample = by_id.loc[idx_resample]
    return pd.concat([by_id, resample])

# Balance the classes separately within each road.
per_road = df_agg_train.groupby(['road'])
df_agg_train = per_road.apply(balance_road)
df_agg_train

# class_counts = df_agg_train.groupby(['road']).first().type.value_counts()
# n_resample = class_counts['Car'] - class_counts['Taxi']

# df_agg_train.reset_index((0,2), inplace=True)

# idx_resample = df_agg_train[df_agg_train.type=='Taxi'] \
#     .index \
#     .unique() \
#     .to_series() \
#     .sample(n_resample, replace=True) \
#     .values

# resample = df_agg_train.loc[idx_resample]
# # resample.index = [ resample.index.map(lambda idx: f'{idx[0]}r'), resample.index.get_level_values(1) ]
# df_agg_train = pd.concat([df_agg_train, resample]).set_index(['id','level_2'], append=True)
# df_agg_train

Unnamed: 0_level_0,Unnamed: 1_level_0,xtrack_dist_std,avg_surr_speed_mean,avg_surr_speed_std,lanes_mean,len_mean,speed_mean,speed_std,vehicle_density_mean,lon_acc_mean,lon_acc_std,lat_acc_mean,lat_acc_std,type
road,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
250691795_250699359,1_2_1013,0.122811,35.059769,1.933372,13.0,171.188,38.784775,2.168499,4.516667,0.866777,0.324213,0.233228,0.038109,Car
250691795_250699359,1_2_1013,0.370038,35.278806,2.120265,13.0,171.188,41.595718,0.342866,4.550000,0.166950,0.255385,0.111577,0.166141,Car
250691795_250699359,1_2_1013,0.289755,38.755763,1.861978,13.0,171.188,41.682442,0.374225,3.800000,-0.139460,0.263612,-0.174677,0.113386,Car
250691795_250699359,1_2_1013,0.131347,37.777329,2.013567,13.0,171.188,39.320633,1.649412,3.850000,-0.655498,0.125555,-0.048875,0.041278,Car
250691795_250699359,1_2_1013,0.072858,34.481129,1.851485,13.0,171.188,34.691568,2.136059,4.183333,-0.851812,0.046481,-0.043438,0.014847,Car
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300400248_8446047162,2_5_1817,0.063600,1.735426,0.296432,6.5,106.837,1.968463,0.394792,3.000000,0.146953,0.095425,-0.003718,0.185823,Taxi
300400248_8446047162,2_5_1817,0.035743,2.181723,0.158817,6.5,106.837,2.834837,0.346734,3.000000,0.165045,0.112049,0.019433,0.109077,Taxi
300400248_8446047162,2_5_1817,0.043693,2.875334,0.385075,6.5,106.837,3.639413,0.261834,2.516667,0.081893,0.149164,0.059712,0.077071,Taxi
300400248_8446047162,2_5_1817,0.036555,3.747191,0.489107,6.5,106.837,3.935720,0.514547,2.000000,0.228673,0.427553,-0.016013,0.053153,Taxi


In [614]:
# NOTE(review): in the visible notebook, class_counts is only assigned as a
# local inside balance_road and in commented-out code, so this cell appears to
# rely on leftover kernel state and would raise NameError on a fresh
# "Restart & Run All" - confirm, then define it explicitly or remove the cell.
class_counts

Car     13
Taxi     5
Name: type, dtype: int64

In [615]:
# df_agg_train.groupby(['id','road']).first().type.value_counts()

### define accuracy score by two levels of voting

In [616]:
def accuracy(y, y_hat):
    """Score predictions per id after two rounds of majority voting.

    Predictions are first reduced to one label per (id, road) group and then
    to one label per id; in both rounds the label is ``mode()[0]`` of the
    group's values. The true label of an id is taken from its first row.

    Parameters
    ----------
    y : pandas.Series
        True labels, indexed so that grouping by 'id' is possible.
    y_hat : pandas.DataFrame
        Predicted labels, indexed so that grouping by 'id' and 'road' is
        possible.

    Returns
    -------
    The value of ``accuracy_score`` computed on the per-id labels.
    """
    def majority(votes):
        # First entry of the mode; also decides ties.
        return votes.mode()[0]

    per_segment = y_hat.groupby(['id', 'road']).agg(majority)
    per_vehicle = per_segment.groupby('id').agg(majority)

    truth = y.groupby('id').first()

    return accuracy_score(truth, per_vehicle)

### train

In [617]:
# baseline
# n_cars, n_taxis = df_agg_train.groupby('id').first().type.value_counts()
# n_cars / (n_cars+n_taxis)

In [618]:
# train_test_split is used below but is not imported anywhere else in the
# notebook; without this import the cell raises NameError on a fresh kernel.
from sklearn.model_selection import train_test_split

model = GradientBoostingClassifier()
X,y = df_agg_train.drop('type', axis=1), df_agg_train.type

# NOTE(review): the split is made row by row, so rows of the same id -
# including the duplicated rows appended by balance_road - can land in both
# parts. The validation score printed below may therefore be optimistic;
# consider splitting by id instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)

model.fit(X_train,y_train)

# Wrap the predictions in a frame carrying the index of X_test so that
# accuracy() can group them by id and road.
y_hat = model.predict(X_test)
y_hat = pd.DataFrame(index=X_test.index, data=y_hat, columns=['type'])

accuracy(y_test, y_hat)

0.6257180733539549

### test

In [619]:
# Evaluate the fitted model on the held-out ids.
X = df_agg_test.drop(columns='type')
y = df_agg_test['type']

# Keep the index of X on the predictions so accuracy() can vote per id/road.
y_hat = pd.DataFrame(model.predict(X), index=X.index, columns=['type'])

accuracy(y, y_hat)

0.6066666666666667