In [None]:
"""classical_models.ipynb
by: Archie Gertsman (arkadiy2@illinois.edu)
Project director: Richard Sowers
r-sowers@illinois.eduhttps://publish.illinois.edu/r-sowers/
Copyright 2019 University of Illinois Board of Trustees. All Rights Reserved. Licensed under the MIT license
"""

In [112]:

import pandas as pd
import numpy as np
from feature_eng import split_trajectories
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [28]:
df = pd.read_pickle('block4_concat_lane.pkl')
#df = pd.read_pickle('block4_edge_filter.pkl')
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lat,lon,speed,lon_acc,lat_acc,type,traveled_d,avg_speed,bearing,nearest_edge_start_node,...,xtrack_dist,time_stamp,edge_progress_intervals,edge_id,len,lanes,node_veh_dist,edge_seg,vehicle_density,avg_surr_speed
file_name,id,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
4_1,1,42.0,37.982746,23.732961,11.9046,-0.1145,0.0138,Taxi,182.37,9.740748,1.570795,250699362,...,-1.883401,42.0,0.3,250699362_250699984,97.581,5.4,29.81433,1.0,7,10.464171
4_1,1,42.04,37.982746,23.732963,11.8975,-0.1007,0.0147,Taxi,182.37,9.740748,0.168572,250699362,...,-1.980795,42.04,0.3,250699362_250699984,97.581,5.4,29.67483,1.0,7,10.457843
4_1,1,42.08,37.982747,23.732964,11.8919,-0.0918,0.0157,Taxi,182.37,9.740748,0.168573,250699362,...,-1.937041,42.08,0.3,250699362_250699984,97.581,5.4,29.537753,1.0,7,10.452857
4_1,1,42.12,37.982748,23.732965,11.8871,-0.0869,0.0167,Taxi,182.37,9.740748,1.570796,250699362,...,-1.893287,42.12,0.3,250699362_250699984,97.581,5.4,29.400718,1.0,7,10.448586
4_1,1,42.16,37.982748,23.732966,11.8831,-0.0784,0.0176,Taxi,182.37,9.740748,0.32808,250699362,...,-1.941984,42.16,0.3,250699362_250699984,97.581,5.4,29.330986,1.0,7,10.444986


In [118]:
#def lane_changes(df,xtrack_dist = 1):
def __xtrack_dist(df):
    """splits a vehicle trajectory into smaller trajectories of fixed size and removes
    the last (len(df) mod size) riws
    """
    
    df["xtrack_diff"] = df.loc[:,['xtrack_dist']]- df.loc[:,['xtrack_dist']].shift(-1)
    df["xtrack_diff"]=df['xtrack_diff'].fillna(0)
    return df


df = df.groupby(['file_name','id'], as_index=False, group_keys=False) \
            .apply(__xtrack_dist)

In [29]:
def reset_traj_and_split(df, traj_len):
    #df.index = df.index.droplevel(1)
    return split_trajectories(df, traj_len)

In [140]:
def agg(df):
    df['xtrack_diff_sq'] = df['xtrack_diff']**2
    df_agg = df[np.isin(df['type'], ['Car','Taxi'])] \
        .groupby(['id', 'traj']).agg({
            'xtrack_diff_sq': ['mean','std','skew','max','min',pd.DataFrame.kurt,'sum'],
            'xtrack_diff': ['mean','std','skew',pd.DataFrame.kurt],
            'xtrack_dist': ['mean','std','skew',pd.DataFrame.kurt],
            'avg_surr_speed': ['mean','std','skew',pd.DataFrame.kurt],
            'lanes':['mean'],
            'len':['mean'],
            'speed':['mean','std','skew',pd.DataFrame.kurt,'sum'],
            'vehicle_density': ['mean','std','skew',pd.DataFrame.kurt],
            'lon_acc': ['mean','std','max','min','skew', pd.DataFrame.kurt],
            'lat_acc': ['mean','std','max','min', 'skew', pd.DataFrame.kurt],
            'type': 'first'
        }) \
        .reset_index(drop=True)
    #df_agg1 = df_agg.copy()
    
    df_agg.columns = ['_'.join(col) for col in df_agg.columns]
    df_agg.speed_sum = df_agg.speed_sum*0.04
    df_agg.rename(columns={'type_first':'type'}, inplace=True)
    g = df_agg.groupby('type')
    df_agg = g.apply(lambda group: group.sample(g.size().min())).reset_index(drop=True)
    X,y = df_agg.drop('type', axis=1), df_agg['type']
    
    return X,y


In [143]:
def train_and_accuracy(X, y, model):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    #print(y_hat)
    a = y_hat==y_test
    
    f = f1_score((y_test == 'Car').astype(int),(y_hat == 'Car').astype(int))
    return len(a[a==True]) / len(y_test),f


In [144]:
#traj_lens = [349]
traj_lens = np.arange(250,500, step=50)
models = {
        'Random Forest': Pipeline([('scaler', StandardScaler()), ('rf', RandomForestClassifier())]),
        'AdaBoost':Pipeline([('scaler', StandardScaler()), ('abc', AdaBoostClassifier())]) ,
        'SVM': Pipeline([('scaler', StandardScaler()), ('svc', SVC(max_iter=10000))]) ,
        'Log Regression': Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression(max_iter=10000))]) 
    }
    
df_acc = pd.DataFrame(index=pd.MultiIndex.from_product([models.keys(),['f1_score','accuracy'], ['mean','std']]), columns=traj_lens)

k = 5
kf = StratifiedKFold(n_splits=k, shuffle=True)
accs = np.zeros(k)
f1 = np.zeros(k)
    
for traj_len in traj_lens:

    df_traj_list = df.groupby(['id','file_name']).count()['lat'].reset_index()
    df_traj_list = df_traj_list[df_traj_list.lat >= traj_len]
    df2 = df[df.index.droplevel('time').isin(df_traj_list.set_index(['file_name','id']).index.to_list())].copy()
    
    df2 = reset_traj_and_split(df2, traj_len-1)
    
    df3 = df2.reset_index()[["id","file_name","traj","speed"]]
    df3["speed_bool"]= df3["speed"]>0.001
    df3 = df3.groupby(["file_name","id","traj"]).sum(["speed_bool"])
    df2 = df2[df2.index.droplevel(3).isin(df3[df3.speed_bool >= traj_len*0.90].index.to_list())]

    X,y = agg(df2)
    print("No of trajectories: ",len(X))
    df_acc.loc[('traj_len','traj_len','total'), traj_len] = len(X)
    df_acc.loc[('traj_len','traj_len','Car'), traj_len] = sum(y == 'Car')
    df_acc.loc[('traj_len','traj_len','Taxi'), traj_len] = sum(y == 'Taxi')
    #print(df_acc)
    for name, model in models.items():
        for i, (train_index, test_index) in enumerate(kf.split(X,y)):
            accs[i],f1[i] = train_and_accuracy(X, y, model)
            

        df_acc.loc[(name, 'accuracy','mean'), traj_len] = round(100*accs.mean(), 3)
        df_acc.loc[(name, 'accuracy','std'), traj_len] = round(100*accs.std(), 3)
        df_acc.loc[(name, 'f1_score','mean'), traj_len] = round(100*f1.mean(), 3)
        df_acc.loc[(name, 'f1_score','std'), traj_len] = round(100*f1.std(), 3)

        print(name, 'complete.')
        
    print('trajectory length', traj_len, 'complete.')
df_acc

No of trajectories:  2428


  result = self._run_cell(
  result = self._run_cell(


Random Forest complete.


  result = self._run_cell(


AdaBoost complete.


  result = self._run_cell(


SVM complete.


  result = self._run_cell(


Log Regression complete.
trajectory length 250 complete.
No of trajectories:  1878


  result = self._run_cell(
  result = self._run_cell(


Random Forest complete.


  result = self._run_cell(


AdaBoost complete.


  result = self._run_cell(


SVM complete.


  result = self._run_cell(


Log Regression complete.
trajectory length 300 complete.
No of trajectories:  1438


  result = self._run_cell(
  result = self._run_cell(


Random Forest complete.


  result = self._run_cell(


AdaBoost complete.


  result = self._run_cell(


SVM complete.


  result = self._run_cell(


Log Regression complete.
trajectory length 350 complete.
No of trajectories:  1120


  result = self._run_cell(
  result = self._run_cell(


Random Forest complete.


  result = self._run_cell(


AdaBoost complete.


  result = self._run_cell(


SVM complete.


  result = self._run_cell(


Log Regression complete.
trajectory length 400 complete.
No of trajectories:  872


  result = self._run_cell(
  result = self._run_cell(


Random Forest complete.


  result = self._run_cell(


AdaBoost complete.


  result = self._run_cell(


SVM complete.
Log Regression complete.
trajectory length 450 complete.


  result = self._run_cell(


Unnamed: 0,Unnamed: 1,Unnamed: 2,250,300,350,400,450
Random Forest,f1_score,mean,57.685,58.442,57.886,57.282,58.37
Random Forest,f1_score,std,1.766,3.119,3.863,2.739,3.736
Random Forest,accuracy,mean,58.567,59.583,59.46,58.393,60.668
Random Forest,accuracy,std,1.638,2.28,3.215,2.301,3.035
AdaBoost,f1_score,mean,55.62,54.572,55.625,54.132,56.215
AdaBoost,f1_score,std,2.416,2.107,3.333,3.328,3.853
AdaBoost,accuracy,mean,57.455,56.549,55.913,55.089,57.456
AdaBoost,accuracy,std,3.281,2.31,2.889,2.124,3.332
SVM,f1_score,mean,53.007,55.001,54.965,53.714,54.182
SVM,f1_score,std,4.779,2.247,2.846,4.181,6.924


In [59]:
df_acc.to_pickle("acc_50p_0_lane_filter_f1.pkl")