classical_models.ipynb

by: Archie Gertsman (arkadiy2@illinois.edu) and Lloyd Fernandes (lloydf2@illinois.edu)

Project director: Richard Sowers

r-sowers@illinois.edu · https://publish.illinois.edu/r-sowers/

Copyright 2019 University of Illinois Board of Trustees. All Rights Reserved. Licensed under the MIT license

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
import sys

import pandas as pd
import numpy as np
from feature_eng import split_trajectories
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from time import time
from sklearn.preprocessing import PolynomialFeatures
from IPython.display import display
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from model_functions import *
import seaborn as sns

In [38]:
#initial parameters

# Un-tuned pipelines, one per classifier; every estimator sits behind a StandardScaler.
# The keys of this dict are also used to build the row index of df_acc below.
models = {
    'Random Forest': Pipeline([('scaler', StandardScaler()), ('rf', RandomForestClassifier())]),
    'AdaBoost': Pipeline([('scaler', StandardScaler()), ('abc', AdaBoostClassifier())]),
    'SVM': Pipeline([('scaler', StandardScaler()), ('svc', SVC(max_iter=10000, probability=True))]),
    'GBM': Pipeline([('scaler', StandardScaler()), ('gbm', GradientBoostingClassifier())]),
    'MLP': Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPClassifier(hidden_layer_sizes=(250, 100, 25), max_iter=1000,
                              learning_rate='adaptive', early_stopping=True,
                              n_iter_no_change=10)),
    ]),
}

# Per-column aggregations handed to get_xy() (as agg_dict=...) when trajectories are aggregated.
agg_dict = {
    'xtrack_diff': ['mean', 'std'],
    'xtrack_dist': ['mean', 'std'],
    'avg_surr_speed': ['mean', 'std'],
    'lanes': ['mean'],
    'len': ['mean'],
    'speed': ['mean', 'std'],
    'speed_bool': ['count', 'sum'],
    'acc_edge': ['mean', 'std'],
    'acc_per_edge': ['mean', 'std'],
}

features_to_select = 10
overlap = 0.3            # forwarded to get_xy(overlap=...)
min_movement_limit = 1
speed_limit = 0          # rows with speed > speed_limit get speed_bool = 1
k = 5
test_ratio = 0.2
validation_ratio = 0.2
# NOTE(review): shuffle=True without random_state makes the folds differ between runs.
kf = StratifiedKFold(n_splits=k, shuffle=True)
accs = np.zeros(k)
f1 = np.zeros(k)

# Trajectory lengths to evaluate. The stop value is exclusive, so this yields 50, 100, 150, 200.
# NOTE(review): the saved output table at the end of the notebook also has a 250 column,
# which this range cannot produce - the stored output looks stale; confirm the intended stop.
traj_lens = np.arange(50, 250, step=50)

# Result table: rows are (model, metric, 'mean'); columns are (vehicle_density, traj_len, split).
# (An earlier, index-only df_acc that was immediately overwritten by this one has been removed.)
df_acc = pd.DataFrame(
    columns=pd.MultiIndex.from_product(
        [[1], traj_lens, ['train', 'val', 'test_voting_mean', 'test_voting_model']]),
    index=pd.MultiIndex.from_product(
        [models.keys(), ['accuracy', 'accuracy_baseline'], ['mean']]),
)

# `ensemble` is not defined in this notebook; presumably it comes from
# `from model_functions import *` - TODO confirm and import it explicitly.
ensemble_models = {
    'ensemble_2': ensemble(2, 'val'),
    'ensemble_3': ensemble(3, 'val'),
}

pca = PCA(n_components=5)
is_pca = False           # when True the training cell projects the feature matrices with `pca`
vehicle_density = 1      # first level of the df_acc column index
is_log_model_voting = True
feature_list = ['xtrack_diff', 'xtrack_dist', 'avg_surr_speed', 'speed', 'acc_edge', 'acc_per_edge']
col_factor = 20
window = '0.2S'
window_len = 5


In [2]:
#gridsearch Model Parameters
#
# One entry per classifier name:
#   'model'  -> the pipeline that GridSearchCV will clone and fit
#   'params' -> the grid, keyed as '<pipeline step>__<estimator argument>'
model_params = {
    'SVM': {
        'model': Pipeline([
            ('scaler', StandardScaler()),
            ('svc', SVC(max_iter=10000, probability=True)),
        ]),
        'params': {
            'svc__C': [1, 10, 100, 1000],
            'svc__gamma': ['scale', 'auto'],
            'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        },
    },
    'Random Forest': {
        'model': Pipeline([
            ('scaler', StandardScaler()),
            ('rf', RandomForestClassifier()),
        ]),
        'params': {
            'rf__n_estimators': [50, 100, 150, 250, 500],
            'rf__max_depth': [2, 4, 7],
            'rf__min_samples_split': [2, 4],
            'rf__min_samples_leaf': [1, 3],
            'rf__max_features': [4, 5, 6],
        },
    },
    'AdaBoost': {
        'model': Pipeline([
            ('scaler', StandardScaler()),
            ('abc', AdaBoostClassifier()),
        ]),
        'params': {
            'abc__n_estimators': [50, 100, 150, 250, 500],
        },
    },
    'GBM': {
        'model': Pipeline([
            ('scaler', StandardScaler()),
            ('gbm', GradientBoostingClassifier()),
        ]),
        'params': {
            'gbm__learning_rate': [0.15, 0.1, 0.05],
            'gbm__n_estimators': [50, 100, 150, 250, 500],
            'gbm__max_depth': [2, 4, 7],
            'gbm__min_samples_split': [2, 4],
            'gbm__min_samples_leaf': [1, 3],
            'gbm__max_features': [4, 5, 6],
        },
    },
    'Log Regression': {
        # Polynomial expansion happens before scaling; its degree is part of the grid.
        'model': Pipeline([
            ('pf', PolynomialFeatures()),
            ('scaler', StandardScaler()),
            ('lr', LogisticRegression(max_iter=10000, solver='liblinear')),
        ]),
        'params': {
            'lr__penalty': ['l1', 'l2'],
            'lr__C': [1, 10, 100, 1000],
            'pf__degree': [1, 3, 4],
        },
    },
    'MLP': {
        'model': Pipeline([
            ('scaler', StandardScaler()),
            ('mlp', MLPClassifier(max_iter=10000, early_stopping=True, n_iter_no_change=10,
                                  activation='relu', solver='adam')),
        ]),
        'params': {
            'mlp__hidden_layer_sizes': [(250, 100, 25), (100, 25), (300, 200, 100, 25), (300, 200, 100, 5)],
            'mlp__alpha': [0.0001, 0.01, 0.05],
        },
    },
}

In [28]:

#function adds data to accuracy dataframe df
def fill_no_of_vehicle(df,X,vehicle,vehicle_density,traj_len,column):
    """Write the size and class shares of a label vector into the results frame.

    Three cells of ``df`` are set, all in column
    ``(vehicle_density, traj_len, column)``:

    * row ``('traj_len', 'Car_' + vehicle, 'total')``               -> ``len(X)``
    * row ``('traj_len', 'Car_' + vehicle + '_percent', 'Car')``    -> fraction of ``X`` equal to ``'Car'``
    * row ``('traj_len', 'Car_' + vehicle + '_percent', vehicle)``  -> fraction of ``X`` equal to ``vehicle``

    Parameters
    ----------
    df : pandas.DataFrame
        Results frame addressed with 3-tuples on both axes; modified in place.
    X : array-like of labels
        Must support ``len`` and element-wise ``==`` (e.g. a pandas Series).
    vehicle : str
        Label of the second class.
    vehicle_density, traj_len, column
        Components of the column key.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    An empty ``X`` used to raise ``ZeroDivisionError``; the two shares are now
    stored as NaN in that case and the total as 0.
    """
    column_key = (vehicle_density, traj_len, column)
    total = len(X)

    if total:
        # Vectorised counts instead of Python's built-in sum over a boolean vector.
        car_share = (X == 'Car').sum() / total
        vehicle_share = (X == vehicle).sum() / total
    else:
        # A share of an empty set is undefined.
        car_share = vehicle_share = float('nan')

    df.loc[('traj_len', 'Car_' + vehicle, 'total'), column_key] = total
    df.loc[('traj_len', 'Car_' + vehicle + '_percent', 'Car'), column_key] = car_share
    df.loc[('traj_len', 'Car_' + vehicle + '_percent', vehicle), column_key] = vehicle_share
    return df
  

In [None]:
# Car and Taxi classification with trajectory splitting and aggregation
#
# Two experiments are run with the same machinery:
#   'Taxi'  - classify Car vs Taxi on the full data set        (metric row 'accuracy')
#   'Car_1' - baseline: only cars, half of them relabelled     (metric row 'accuracy_baseline')
#
# NOTE(review): `df` is not defined in any cell of this notebook that is visible here; it has to
# exist in the kernel already (or come from the star import). TODO: load it explicitly so the
# notebook survives Restart & Run All. The helpers split_train_test, get_xy, basic_accuracy,
# voting_accuracy and voting_model are presumably provided by `from model_functions import *`.

# Fitted grid searches, addressed as model_gridsearch_save[vehicle][model name].
model_gridsearch_save = {}
for vehicle in ['Taxi','Car_1']:
    model_gridsearch_save[vehicle] = {}
    if vehicle == 'Car_1':
        df_type = df[df.type == 'Car']
        accuracy_metric = 'accuracy_baseline'
    else:
        df_type = df.copy()
        accuracy_metric = 'accuracy'

    for traj_len in traj_lens:
        col_factor = traj_len

        # Group on every index level except the last and drop groups with fewer than traj_len rows.
        df_filtered = df_type.groupby(df_type.index.names[:-1]) \
                .filter(lambda grp: (len(grp) >= traj_len) )

        df_filtered['speed_bool'] = (df_filtered['speed']>speed_limit).astype(int)

        if vehicle == 'Car_1':
            #sample 50% of cars and label them as car_1
            df_index = df_filtered.reset_index()[['file_name','id']].drop_duplicates()
            sampled_ids = df_index.sample(frac = 0.5).set_index(['file_name','id']).index
            row_ids = df_filtered.reset_index(['edge_id', 'time'],drop = True).index
            df_filtered.loc[row_ids.isin(sampled_ids),'type'] = vehicle

        # Hold out the test set first, then carve the validation set out of the remainder.
        # (The two ratio names were previously passed the other way round; both are 0.2 in the
        # config cell, so the numerical split is unchanged.)
        df_train_val,df_test = split_train_test(df_filtered,test_ratio)
        df_train,df_val = split_train_test(df_train_val,validation_ratio)

        #aggregate trajectories
        #to train models
        X_train,y_train = get_xy(df_train,overlap = overlap,traj_len = traj_len,agg_dict = agg_dict,outlier_limit = 1,balance = 'by_edge')
        #to pick better performing models
        X_val,y_val = get_xy(df_val,overlap = overlap,traj_len = traj_len,agg_dict = agg_dict,balance = 'by_type')
        #to train voting model
        X_val_voting,y_val_voting = get_xy(df_val,overlap = overlap,traj_len = traj_len,agg_dict = agg_dict)
        #to test ensemble and voting model
        X_test,y_test = get_xy(df_test,overlap = overlap,traj_len = traj_len,agg_dict = agg_dict)

        #pca to downsample aggregate features
        if is_pca:
            pca.fit(X_train)
            # Fixed: this line used the undefined name X_test_voting (NameError as soon as
            # is_pca is switched on); the frame that has to be projected is X_val_voting.
            X_val_voting = pd.DataFrame(data = pca.transform(X_val_voting),index = X_val_voting.index)
            X_train = pd.DataFrame(data = pca.transform(X_train),index = X_train.index)
            X_test = pd.DataFrame(data = pca.transform(X_test),index = X_test.index)
            X_val = pd.DataFrame(data = pca.transform(X_val),index = X_val.index)

        print("with traj_len = ",traj_len)
        print("No of trajectories: ",len(X_train))
        print("No of Car trajectories: ",sum(y_train == 'Car'))
        print("No of "+vehicle+" trajectories: ",sum(y_train == vehicle))
        print("\n")

        #fill number of cars and taxis/cars and car_1's in the result dataframe (df_acc)
        # Fixed: the counts now describe the set each column's accuracy is computed on.
        # 'val' accuracies come from (X_val, y_val) per trajectory; 'test_voting_*' accuracies
        # come from (X_test, y_test) voted per vehicle id. Previously the two label vectors
        # were swapped.
        # NOTE(review): assumes y_test carries the same index levels as y_val (both come from
        # get_xy) - confirm.
        df_acc = fill_no_of_vehicle(df_acc,y_val,vehicle,vehicle_density,traj_len,'val')

        # one row per vehicle id: drop the edge level, then de-duplicate
        id_list = y_test.reset_index(['edge_id'],drop = True).reset_index().drop_duplicates()
        df_acc = fill_no_of_vehicle(df_acc,id_list.type,vehicle,vehicle_density,traj_len,'test_voting_mean')
        df_acc = fill_no_of_vehicle(df_acc,id_list.type,vehicle,vehicle_density,traj_len,'test_voting_model')

        # fitted grid searches of this (vehicle, traj_len) run, consumed by the ensembles below
        model_dict = {}

        # fit different models
        # Only the names of `models` are needed here; the pipeline and grid come from model_params.
        for name in models:

            mp = model_params[name]

            #fit the model on training set
            model = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
            model.fit(X_train,y_train)
            # Fixed: keyed by model name (was keyed by the GridSearchCV object itself).
            # Overwritten on every traj_len, so the last trajectory length wins.
            model_gridsearch_save[vehicle][name] = model
            # 'train' holds the cross-validated score of the best grid point, in percent
            df_acc.loc[(name, accuracy_metric,'mean'),  (vehicle_density,traj_len,'train')] = round(100*model.best_score_, 3)

            #accuracy on the validation trajectories, stored as 'val' (this estimate is used to find the ensemble)
            val_accs,_ = basic_accuracy(X_val,y_val,model)
            df_acc.loc[(name, accuracy_metric,'mean'),  (vehicle_density,traj_len,'val')] = round(100*val_accs, 3)

            #find accuracy of the model on test set by voting among trajectories in an id using mean
            test_accs,_ = voting_accuracy(X_test,y_test, model,predict_proba = True)
            df_acc.loc[(name, accuracy_metric,'mean'), (vehicle_density,traj_len,'test_voting_mean')] = round(100*test_accs, 3)

            #train voting model for voting among trajectories using the unbalanced validation set
            voting_m = voting_model(model,X_val_voting,y_val_voting)
            #find the accuracy of the voting model on the test set
            test_accs,_ = voting_m.accuracy(X_test,y_test)
            df_acc.loc[(name, accuracy_metric,'mean'), (vehicle_density,traj_len,'test_voting_model')] = round(100*test_accs, 3)

            #save model in dictionary for ensemble
            model_dict[name] = model

        for name,ensemble_model in ensemble_models.items():

            #build the ensembles configured in ensemble_models from the fitted models
            ensemble_model.find_ensemble(df_acc,traj_len,vehicle_density,True)
            ensemble_model.fit(X_train,y_train,model_dict)

            #test accuracy of ensembles on test set
            test_accs,_ = voting_accuracy(X_test,y_test, ensemble_model)
            df_acc.loc[(name, accuracy_metric,'mean'), (vehicle_density,traj_len,'test_voting_mean')] = round(100*test_accs, 3)

            #test accuracy of ensembles on the test set using voting_model (trained on validation set)
            voting_m = voting_model(ensemble_model,X_val_voting,y_val_voting)
            test_accs,_ = voting_m.accuracy(X_test,y_test)
            df_acc.loc[(name, accuracy_metric,'mean'), (vehicle_density,traj_len,'test_voting_model')] = round(100*test_accs, 3)

# Cells that were never written are reported as 0.
df_acc = df_acc.fillna(0)


In [14]:
df_acc.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,50,50,50,50,100,100,100,100,150,150,150,150,200,200,200,200,250,250,250,250
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,train,val,test_voting_mean,test_voting_model,train,val,test_voting_mean,test_voting_model,train,val,test_voting_mean,test_voting_model,train,val,test_voting_mean,test_voting_model,train,val,test_voting_mean,test_voting_model
AdaBoost,accuracy,mean,52.589,56.391,54.054,48.986,54.147,56.131,52.113,53.169,53.308,53.988,54.104,51.866,54.865,48.521,52.4,52.4,52.774,59.273,50.22,48.899
AdaBoost,accuracy_baseline,mean,52.124,50.959,48.696,48.986,50.434,49.276,50.157,46.082,51.06,46.043,54.698,46.644,51.065,50.598,48.897,49.265,52.594,53.774,50.0,53.252
GBM,accuracy,mean,56.438,59.529,65.203,65.541,56.47,56.482,60.915,52.113,54.648,52.821,56.716,57.09,58.139,53.107,57.6,52.0,54.287,55.091,57.709,53.304
GBM,accuracy_baseline,mean,52.671,49.088,51.594,49.565,52.512,49.05,49.53,52.038,52.895,44.749,53.691,50.0,53.591,51.196,48.897,47.794,54.182,52.201,53.252,49.593
Log Regression,accuracy,mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Log Regression,accuracy_baseline,mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MLP,accuracy,mean,54.063,57.96,61.824,60.811,53.553,57.035,56.338,51.408,53.882,56.712,57.09,52.985,54.475,54.586,57.6,52.8,52.497,59.818,57.269,51.542
MLP,accuracy_baseline,mean,51.094,48.55,49.855,51.304,50.335,48.281,50.157,49.843,51.299,52.435,50.336,48.658,53.156,51.196,45.221,51.471,51.588,52.83,52.033,41.057
Random Forest,accuracy,mean,55.809,59.252,63.176,58.784,56.23,58.744,58.803,56.338,55.286,53.113,58.582,54.478,57.912,54.586,56.8,58.0,54.24,52.727,58.15,51.982
Random Forest,accuracy_baseline,mean,52.006,49.018,48.406,51.594,51.801,50.543,52.038,50.157,52.109,44.292,53.02,51.678,52.801,54.067,48.529,48.897,52.539,51.73,52.439,55.285
