In [None]:
"""ensemble_classifier.ipynb
by: Archie Gertsman (arkadiy2@illinois.edu)
Project director: Richard Sowers
r-sowers@illinois.eduhttps://publish.illinois.edu/r-sowers/
Copyright 2019 University of Illinois Board of Trustees. All Rights Reserved. Licensed under the MIT license
"""

In [50]:
import sys
sys.path.append('../Data/')
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

In [4]:
# df = pd.concat([pd.read_pickle('../Data/block4_29_{}_feat.pkl'.format(i)) for i in [800,830,930]])
df = pd.read_pickle('../Data/block4_29_comp_feat.pkl')

In [24]:
df_agg = df[np.isin(df['type'], ['Car','Taxi'])] \
    .groupby(['id','traj']).agg({
        'xtrack_dist': 'std',
        'avg_surr_speed': ['mean','std'],
        'lon_acc': ['mean','std'],
        'lat_acc': ['mean','std'],
        'type': 'first'
    }) \
    .reset_index(drop=True)

df_agg.columns = ['_'.join(col) for col in df_agg.columns]
df_agg.rename(columns={'type_first':'type'}, inplace=True)
len(df_agg[df_agg['type'] == 'Car']), len(df_agg[df_agg['type'] == 'Taxi'])

(1789, 653)

In [53]:
n_taxi_copies = 2
taxi = df_agg[df_agg['type']=='Taxi']
taxi = pd.concat([taxi for i in range(n_taxi_copies)])
df2 = pd.concat([df_agg, taxi])
g = df2.groupby('type')
df2 = g.apply(lambda x: x.sample(g.size().min())).reset_index(drop=True)
X,y = df2.drop('type', axis=1), df2['type']

k = 10
kf = KFold(n_splits=k, shuffle=True)
accs = np.zeros(k)

models = {
    'Random Forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'SVM': SVC(),
    'Log Regression': LogisticRegression()
}

df_acc = pd.DataFrame(index=models.keys(), columns=['mean','std'])

for name, model in models.items():
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_hat = model.predict(X_test)

        a = y_hat==y_test
        accs[i] = len(a[a==True]) / len(y_test)

    df_acc.loc[name]['mean'] = round(100*accs.mean(), 3)
    df_acc.loc[name]['std'] = round(100*accs.std(), 3)

    print(name, 'complete.')

df_acc

Random Forest complete.
AdaBoost complete.
SVM complete.
Log Regression complete.


Unnamed: 0,mean,std
Random Forest,86.948,1.905
AdaBoost,70.375,2.413
SVM,66.07,1.93
Log Regression,64.143,2.696
