classical_models.ipynb

by: Archie Gertsman (arkadiy2@illinois.edu)
Lloyd Fernandes (lloydf2@illinois.edu)

Project director: Richard Sowers

r-sowers@illinois.eduhttps://publish.illinois.edu/r-sowers/

Copyright 2019 University of Illinois Board of Trustees. All Rights Reserved. Licensed under the MIT license


In [1]:
import warnings; warnings.simplefilter('ignore')

In [2]:
import sys
sys.path.append('../../Lib/')
import pandas as pd
import numpy as np
from feature_eng import split_trajectories
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from time import time
from sklearn.preprocessing import PolynomialFeatures
from IPython.display import display
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from model_functions import *
import seaborn as sns

In [3]:
df = pd.read_pickle('block4_updated.pkl') 
    
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,lat,lon,speed,lon_acc,lat_acc,type,traveled_d,avg_speed,bearing,nearest_edge_start_node,...,vehicle_density,avg_surr_speed,edge_bearing,acc_edge,acc_per_edge,xtrack_diff,xtrack_diff_sq,acc_edge_sq,acc_per_edge_sq,vehicle_density_by_lane
file_name,id,edge_id,time,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
4_1,1,250699362_250699984,42.0,37.982746,23.732961,11.9046,-0.1145,0.0138,Taxi,182.37,9.740748,1.570795,250699362,...,7,10.464171,-2.83013,0.11322,0.021953,0.0,0.0,0.012819,0.000482,1.296296
4_1,1,250699362_250699984,42.04,37.982746,23.732963,11.8975,-0.1007,0.0147,Taxi,182.37,9.740748,0.168572,250699362,...,7,10.457843,-2.83013,0.10036,0.016867,0.0,0.0,0.010072,0.000284,1.296296
4_1,1,250699362_250699984,42.08,37.982747,23.732964,11.8919,-0.0918,0.0157,Taxi,182.37,9.740748,0.168573,250699362,...,7,10.452857,-2.83013,0.092194,0.013188,0.0,0.0,0.0085,0.000174,1.296296
4_1,1,250699362_250699984,42.12,37.982748,23.732965,11.8871,-0.0869,0.0167,Taxi,182.37,9.740748,1.570796,250699362,...,7,10.448586,-2.83013,0.087837,0.010734,0.0,0.0,0.007715,0.000115,1.296296
4_1,1,250699362_250699984,42.16,37.982748,23.732966,11.8831,-0.0784,0.0176,Taxi,182.37,9.740748,0.32808,250699362,...,7,10.444986,-2.83013,0.080021,0.007273,0.0,0.0,0.006403,5.3e-05,1.296296


In [None]:
# def __xtrack_dist_diff(df):
#     """splits a vehicle trajectory into smaller trajectories of fixed size and removes
#     the last (len(df) mod size) rows
#     """

#     df['xtrack_diff'] = df.xtrack_dist \
#     .groupby(df.index.names[-1]) \
#     .apply(lambda x: (x - x.shift(-1)).fillna(0))
    
#     return df

# df = df.groupby(['file_name','id','edge_id'], as_index=False, group_keys=False) \
#             .apply(__xtrack_dist_diff)


In [4]:
#initial parameters

models = {
        'Random Forest': Pipeline([('scaler', StandardScaler()), ('rf', RandomForestClassifier())]),
        'AdaBoost':Pipeline([('scaler', StandardScaler()), ('abc', AdaBoostClassifier())]) ,
        'SVM': Pipeline([('scaler', StandardScaler()), ('svc', SVC(max_iter=10000,probability = True))]) ,
        'Log Regression': Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression(max_iter=10000))]) ,
        'GBM': Pipeline([('scaler', StandardScaler()), ('gbm', GradientBoostingClassifier())]),
        'MLP': Pipeline([('scaler', StandardScaler()), ('mlp', MLPClassifier(hidden_layer_sizes = (250,100,25),max_iter=1000,\
                                                                             learning_rate = 'adaptive',early_stopping = True,n_iter_no_change = 10))])                 
        }

agg_dict = {
            'xtrack_diff': ['mean','std'],
            'xtrack_dist': ['mean','std'],
            'avg_surr_speed': ['mean','std'],
            'lanes':['mean'],
            'len':['mean'],
            'speed':['mean','std'],
            'speed_bool': ['count','sum'],
            'acc_edge': ['mean','std'],
            'acc_per_edge': ['mean','std']
            }

features_to_select = 10
df_acc = pd.DataFrame(index=pd.MultiIndex.from_product([models.keys(),['f1_score','accuracy'], ['mean']]))
overlap = 0.7
min_movement_limit = 1
speed_limit = 0
k = 5
test_ratio = 0.2
validation_ratio = 0.2
kf = StratifiedKFold(n_splits=k, shuffle=True)
accs = np.zeros(k)
f1 = np.zeros(k)

traj_lens = np.arange(50,300, step=50)
df_acc = pd.DataFrame(columns = pd.MultiIndex.from_product([[1],traj_lens,['val','test_voting_mean','test_voting_model']]), \
                      index=pd.MultiIndex.from_product([models.keys(),['accuracy','accuracy_baseline'], ['mean']]))

ensemble_models = {
                    'ensemble_2': ensemble(2,'val'),
                   'ensemble_3': ensemble(3,'val'),
                   'ensemble_5': ensemble(5,'val')
                  }


validation_ratio = 0.2
test_ratio = 0.2
pca = PCA(n_components=5)
is_pca = False
vehicle_density = 1
is_log_model_voting = True



df['xtrack_diff_sq'] = df['xtrack_diff']**2
df['acc_edge_sq'] = df['acc_edge']**2
df['acc_per_edge_sq'] = df['acc_per_edge']**2
df['vehicle_density_by_lane'] = df['vehicle_density']/df['lanes']

# agg_dict = {'xtrack_diff': ['mean','std','skew',pd.DataFrame.kurt],
#             'xtrack_dist': ['mean','std','skew',pd.DataFrame.kurt],
#             'avg_surr_speed': ['mean','std','skew',pd.DataFrame.kurt],
#             'lanes':['mean'],
#             'len':['mean'],
#             'speed':['mean','std','skew',pd.DataFrame.kurt],
#             'acc_edge': ['mean','std','skew',pd.DataFrame.kurt],
#             'acc_per_edge': ['mean','std','skew',pd.DataFrame.kurt],
#             'xtrack_diff_sq': ['mean','std','skew',pd.DataFrame.kurt],
#             'acc_edge_sq': ['mean','std','skew',pd.DataFrame.kurt],
#             'acc_per_edge_sq': ['mean','std','skew',pd.DataFrame.kurt],
#             'vehicle_density_by_lane':['mean','std','skew',pd.DataFrame.kurt] }


In [None]:
df.to_pickle('block4_updated.pkl')

In [5]:
def fill_no_of_vehicle(df,X,vehicle,vehicle_density,traj_len,column):
    
    df.loc[('traj_len','Car_'+vehicle,'total'), (vehicle_density,traj_len,column)] = len(X)
    df.loc[('traj_len','Car_'+vehicle+'_percent','Car'), (vehicle_density,traj_len,column)] = sum(X == 'Car')/len(X)
    df.loc[('traj_len','Car_'+vehicle+'_percent',vehicle), (vehicle_density,traj_len,column)] = sum(X == vehicle)/ len(X)
    return df


In [None]:
# Car and Taxi classification

for vehicle in ['Taxi','Car_1']:
    
    if vehicle == 'Car_1':
        df_type = df[df.type == 'Car']
        accuracy_metric = 'accuracy_baseline'
        
    else : 
        df_type = df.copy()
        accuracy_metric = 'accuracy'
        
    for traj_len in traj_lens:

        df_filtered = df_type.groupby(df_type.index.names[:-1]) \
                .filter(lambda grp: (len(grp) >= traj_len) )
        
        df_filtered['speed_bool'] = (df_filtered['speed']>speed_limit).astype(int)
        
        if vehicle == 'Car_1':
            #sample 50% of cars and label them as car_1
            df_index = df_filtered.reset_index()[['file_name','id']].drop_duplicates()
            df_filtered.loc[df_filtered.reset_index(['edge_id', 'time'],drop = True).index.isin(df_index.sample(frac = 0.5).set_index(['file_name','id']).index),'type']=vehicle
 
        df_train_val,df_test = split_train_test(df_filtered,validation_ratio)
        df_train,df_val = split_train_test(df_train_val,test_ratio)

        #aggregate trajectories
        #to train models
        X_train,y_train = get_xy(df_train,overlap = overlap,traj_len = traj_len,agg_dict = agg_dict,outlier_limit = 1,balance = 'by_edge')
        #to pick better performing models
        X_val,y_val = get_xy(df_val,overlap = overlap,traj_len = traj_len,agg_dict = agg_dict,balance = 'by_type')
        #to train voting model
        X_val_voting,y_val_voting = get_xy(df_val,overlap = overlap,traj_len = traj_len,agg_dict = agg_dict)
        #to test ensemble and voting model
        X_test,y_test = get_xy(df_test,overlap = overlap,traj_len = traj_len,agg_dict = agg_dict)

        #pca to downsample aggregate features
        if is_pca:
            pca.fit(X_train)
            X_test_voting = pd.DataFrame(data = pca.transform(X_test_voting),index = X_test_voting.index)
            X_train = pd.DataFrame(data = pca.transform(X_train),index = X_train.index)
            X_test = pd.DataFrame(data = pca.transform(X_test),index = X_test.index)
            X_val = pd.DataFrame(data = pca.transform(X_val),index = X_val.index)

        print("with traj_len = ",traj_len)
        print("No of trajectories: ",len(X_train))
        print("No of Car trajectories: ",sum(y_train == 'Car'))
        print("No of "+vehicle+" trajectories: ",sum(y_train == vehicle))
        print("\n")
        
        #fill number of cars and taxis/cars and car_1's in the result dataframe (df_acc)
        df_acc = fill_no_of_vehicle(df_acc,y_test,vehicle,vehicle_density,traj_len,'val')
        
        id_list = y_val.reset_index(['edge_id'],drop = True).reset_index().drop_duplicates()
        df_acc = fill_no_of_vehicle(df_acc,id_list.type,vehicle,vehicle_density,traj_len,'test_voting_mean')
        df_acc = fill_no_of_vehicle(df_acc,id_list.type,vehicle,vehicle_density,traj_len,'test_voting_model')
        
        model_dict = {}
        
        # fit different models
        for name, model in models.items():

            #fit the model on training set
            model.fit(X_train,y_train)

            #test the model on validation set consisting of trajectories and save accuracy estimate as test (this accuracy estimate will be used to find ensemble) 
            val_accs,_ = basic_accuracy(X_val,y_val,model)                                   
            df_acc.loc[(name, accuracy_metric,'mean'),  (vehicle_density,traj_len,'val')] = round(100*val_accs, 3)
            
            #find accuracy of the model on test set by voting among trajectories in an id using mean
            test_accs,_ = voting_accuracy(X_test,y_test, model,predict_proba = True)
            df_acc.loc[(name, accuracy_metric,'mean'), (vehicle_density,traj_len,'test_voting_mean')] = round(100*test_accs, 3)
            #plt.savefig("traj_len"+str(traj_len)+name+".png")
            
            #train voting model for voting among trajectories using the validation set with equal number of vehicle id's
            voting_m = voting_model(model,X_val_voting,y_val_voting)
            #find the accuracy of the model on validation set with voting using logistic regression
            val_accs,_ = voting_m.accuracy(X_test,y_test)#, voting_m, predict_proba = False)
            df_acc.loc[(name, accuracy_metric,'mean'), (vehicle_density,traj_len,'test_voting_model')] = round(100*val_accs, 3)

            #save model in dictionary for ensemble
            model_dict[name] = model

        for name,ensemble_model in ensemble_models.items():
            #generate ensembles with 2,3 and 5 models
            ensemble_model.find_ensemble(df_acc,traj_len,vehicle_density,True)
            ensemble_model.fit(X_train,y_train,model_dict)

            #test accuracy of ensembles on test set
            test_accs,_ = voting_accuracy(X_test,y_test, ensemble_model)
            df_acc.loc[(name, accuracy_metric,'mean'), (vehicle_density,traj_len,'test_voting_mean')] = round(100*test_accs, 3)

            #test accuracy of ensembles on validation using voting_model (trained on validation set)
            voting_m = voting_model(ensemble_model,X_val_voting,y_val_voting)
            test_accs,_ = voting_m.accuracy(X_test,y_test)
            df_acc.loc[(name, accuracy_metric,'mean'), (vehicle_density,traj_len,'test_voting_model')] = round(100*test_accs, 3)

df_acc.fillna(0,inplace = True)

df_acc

with traj_len =  50
No of trajectories:  41805
No of Car trajectories:  20914
No of Taxi trajectories:  20891


with traj_len =  100
No of trajectories:  17510
No of Car trajectories:  8761
No of Taxi trajectories:  8749


with traj_len =  150
No of trajectories:  9920
No of Car trajectories:  4964
No of Taxi trajectories:  4956


with traj_len =  200
No of trajectories:  6294
No of Car trajectories:  3147
No of Taxi trajectories:  3147


with traj_len =  250
No of trajectories:  4257
No of Car trajectories:  2173
No of Taxi trajectories:  2084


with traj_len =  50
No of trajectories:  36964
No of Car trajectories:  18482
No of Car_1 trajectories:  18482




In [8]:
#df_acc.sort_index()#.to_csv("accuracy_block4_100_traj_len.csv")
__accuracy(y_test,y_test)

NameError: name '__accuracy' is not defined