"""classical_models.ipynb
by: Archie Gertsman (arkadiy2@illinois.edu)
Project director: Richard Sowers
r-sowers@illinois.edu, https://publish.illinois.edu/r-sowers/
Copyright 2019 University of Illinois Board of Trustees. All Rights Reserved. Licensed under the MIT license
"""

In [1]:
import warnings; warnings.simplefilter('ignore')

In [2]:

import pandas as pd
import numpy as np
from feature_eng import split_trajectories
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
# Load the trajectory table from a pickle file and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
df = pd.read_pickle('block4_concat_lane.pkl')
#df = pd.read_pickle('block4_edge_filter.pkl')
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lat,lon,speed,lon_acc,lat_acc,type,traveled_d,avg_speed,bearing,nearest_edge_start_node,...,xtrack_dist,time_stamp,edge_progress_intervals,edge_id,len,lanes,node_veh_dist,edge_seg,vehicle_density,avg_surr_speed
file_name,id,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
4_1,1,42.0,37.982746,23.732961,11.9046,-0.1145,0.0138,Taxi,182.37,9.740748,1.570795,250699362,...,-1.883401,42.0,0.3,250699362_250699984,97.581,5.4,29.81433,1.0,7,10.464171
4_1,1,42.04,37.982746,23.732963,11.8975,-0.1007,0.0147,Taxi,182.37,9.740748,0.168572,250699362,...,-1.980795,42.04,0.3,250699362_250699984,97.581,5.4,29.67483,1.0,7,10.457843
4_1,1,42.08,37.982747,23.732964,11.8919,-0.0918,0.0157,Taxi,182.37,9.740748,0.168573,250699362,...,-1.937041,42.08,0.3,250699362_250699984,97.581,5.4,29.537753,1.0,7,10.452857
4_1,1,42.12,37.982748,23.732965,11.8871,-0.0869,0.0167,Taxi,182.37,9.740748,1.570796,250699362,...,-1.893287,42.12,0.3,250699362_250699984,97.581,5.4,29.400718,1.0,7,10.448586
4_1,1,42.16,37.982748,23.732966,11.8831,-0.0784,0.0176,Taxi,182.37,9.740748,0.32808,250699362,...,-1.941984,42.16,0.3,250699362_250699984,97.581,5.4,29.330986,1.0,7,10.444986


In [4]:
# Promote edge_id from a regular column to an index level, placed just before
# `time`, then remove the now-redundant column. The frame is updated in place.
df.index = (
    df.set_index('edge_id', append=True)
      .index
      .reorder_levels(['file_name', 'id', 'edge_id', 'time'])
)
df.drop(columns=['edge_id'], inplace=True)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,lat,lon,speed,lon_acc,lat_acc,type,traveled_d,avg_speed,bearing,nearest_edge_start_node,...,dir,xtrack_dist,time_stamp,edge_progress_intervals,len,lanes,node_veh_dist,edge_seg,vehicle_density,avg_surr_speed
file_name,id,edge_id,time,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
4_1,1,250699362_250699984,42.0,37.982746,23.732961,11.9046,-0.1145,0.0138,Taxi,182.37,9.740748,1.570795,250699362,...,0,-1.883401,42.0,0.3,97.581,5.4,29.81433,1.0,7,10.464171
4_1,1,250699362_250699984,42.04,37.982746,23.732963,11.8975,-0.1007,0.0147,Taxi,182.37,9.740748,0.168572,250699362,...,0,-1.980795,42.04,0.3,97.581,5.4,29.67483,1.0,7,10.457843
4_1,1,250699362_250699984,42.08,37.982747,23.732964,11.8919,-0.0918,0.0157,Taxi,182.37,9.740748,0.168573,250699362,...,0,-1.937041,42.08,0.3,97.581,5.4,29.537753,1.0,7,10.452857
4_1,1,250699362_250699984,42.12,37.982748,23.732965,11.8871,-0.0869,0.0167,Taxi,182.37,9.740748,1.570796,250699362,...,0,-1.893287,42.12,0.3,97.581,5.4,29.400718,1.0,7,10.448586
4_1,1,250699362_250699984,42.16,37.982748,23.732966,11.8831,-0.0784,0.0176,Taxi,182.37,9.740748,0.32808,250699362,...,0,-1.941984,42.16,0.3,97.581,5.4,29.330986,1.0,7,10.444986


In [5]:
#finding difference between cross track distance between adjacent rows
def __xtrack_dist_diff(df):
    """Add an ``xtrack_diff`` column: change in cross-track distance to the next row.

    For every row the value is that row's ``xtrack_dist`` minus the
    ``xtrack_dist`` of the following row. The last row has no successor and
    gets 0.

    Args:
        df: frame with an ``xtrack_dist`` column (a single group when used
            through ``groupby(...).apply``).

    Returns:
        A new frame with the additional ``xtrack_diff`` column. The input
        frame is not modified, so the function does not mutate the group
        object handed to it by ``groupby.apply``.
    """
    dist = df['xtrack_dist']
    # shift(-1) aligns each row with its successor; the trailing NaN becomes 0.
    return df.assign(xtrack_diff=(dist - dist.shift(-1)).fillna(0))


# Compute xtrack_diff separately for every (file, vehicle, edge) group so that
# differences never span two different trajectories.
df = (
    df.groupby(['file_name', 'id', 'edge_id'], as_index=False, group_keys=False)
      .apply(__xtrack_dist_diff)
)

In [6]:
#Split trajectories with overlap
def split_trajectories_overlap(df, size, overlap):
    """Cut every (id, file_name, edge_id) trajectory into windows of `size` rows
    using several shifted passes, so that windows of different passes overlap.

    The first pass cuts each group into consecutive windows of `size` rows
    (incomplete trailing windows are dropped by `__split_vehicle`). Pass
    ``i >= 1`` first discards the first ``int(i * (1 - overlap) * size)`` rows
    of every group and then cuts windows the same way; its window labels
    (index level 3) are converted to strings and suffixed with ``'_' + str(i)``.
    There are ``int(1 / (1 - overlap))`` passes in total.

    When ``1 - overlap`` equals 0 the call is delegated to
    ``split_trajectories(df, size)``.

    Example usage:
        df = split_trajectories_overlap(df, 300, 0.5)
        # pass 0 covers rows 0-300, 300-600, ...; pass 1 covers 150-450, 450-750, ...
    """
    by = ['id', 'file_name', "edge_id"]
    # fraction of `size` between the starting rows of two successive passes
    stride = 1 - overlap
    if stride == 0:
        return split_trajectories(df, size)

    def _cut_windows(frame):
        # label and truncate every group independently
        return frame.groupby(by, as_index=False, group_keys=False) \
                    .apply(__split_vehicle, size)

    pieces = _cut_windows(df)

    for i in range(1, int(1 / stride)):
        # drop the leading rows of each group, then window what is left
        shifted = df.groupby(by, as_index=False, group_keys=False) \
                    .apply(__remove_initial_rows, int(i * stride * size))
        shifted = _cut_windows(shifted)

        # make the window labels of this pass distinct from the other passes
        relabelled = shifted.index.levels[3].astype(str) + '_' + str(i)
        shifted.index = shifted.index.set_levels(relabelled, level=3)

        # stack this pass underneath the previous ones
        pieces = pd.concat([pieces, shifted], axis=0)

    return pieces

def __remove_initial_rows(df,ele):
    """Return `df` without its first `ele` rows, i.e. the slice ``df[ele:]``.

    If taking the slice raises ``TypeError`` the function returns ``None``
    instead of propagating the error.
    """
    try:
        return df[ele:]
    except TypeError:
        return None

def __split_vehicle(df, size):
    """Label one trajectory with consecutive fixed-size window numbers.

    Rows are numbered ``0, 0, ..., 1, 1, ...`` in blocks of `size`. The
    trailing ``len(df) mod size`` rows, which do not fill a complete window,
    are dropped. The label is appended to the index as level ``traj`` and then
    moved in front of the last original index level.

    Args:
        df: rows of a single trajectory. Its index must have exactly four
            levels, because the final reorder is hard-coded for that layout.
        size: number of rows per window.

    Returns:
        A new frame (``df`` is not modified) whose index levels are the
        original levels 0, 1, 2, then ``traj``, then the original level 3.
    """
    # Truncate first so every remaining row belongs to a complete window.
    usable = size * (len(df) // size)
    df2 = df.iloc[:usable].copy()

    # Integer division yields the window number of each row directly. The
    # earlier ``df2['traj'].ffill(inplace=True)`` was a chained in-place call,
    # which does not write back to the frame under pandas copy-on-write.
    df2['traj'] = np.arange(usable) // size

    df2.set_index('traj', append=True, inplace=True)
    return df2.reorder_levels([0,1,2,4,3])


def __truncate_to_multiple(n, m):
    """Return the largest multiple of `m` reached by flooring ``n / m``."""
    whole_chunks, _ = divmod(n, m)
    return whole_chunks * m

def __truncate_trajectory(traj, size):
    """Drop trailing rows of `traj` so its length is a whole multiple of `size`."""
    return traj[:__truncate_to_multiple(len(traj), size)]


In [13]:
def agg(df,is_validation_set = False):
    """Collapse every Car/Taxi sub-trajectory into one row of summary features.

    Rows whose ``type`` is not 'Car' or 'Taxi' are discarded. The remaining
    rows are grouped by (file_name, id, edge_id, traj) and each group is
    reduced to summary statistics of its kinematic and context columns. The
    resulting columns are named ``<column>_<statistic>``.

    Args:
        df: windowed trajectory frame in which file_name, id, edge_id and traj
            are available for grouping and which has the columns aggregated
            below. It is not modified.
        is_validation_set: when False (default) the classes are balanced by
            randomly undersampling every class to the size of the smallest one
            and the index is reset. When True all rows are kept together with
            their (file_name, id, edge_id, traj) index.

    Returns:
        ``(X, y)``: the feature frame without the label, and the ``type`` label.
    """
    # Series.kurt is the right callable here: each group handed to the
    # aggregation is a Series, not a DataFrame.
    kurt = pd.Series.kurt

    # Build the squared difference on a filtered copy so the caller's frame
    # does not silently gain a column.
    cars_taxis = df[df['type'].isin(['Car', 'Taxi'])]
    cars_taxis = cars_taxis.assign(xtrack_diff_sq=cars_taxis['xtrack_diff'] ** 2)

    df_agg = cars_taxis.groupby(['file_name', 'id', 'edge_id', 'traj']).agg({
        'xtrack_diff_sq': ['mean', 'std', 'skew', 'max', 'min', kurt, 'sum'],
        'xtrack_diff': ['mean', 'std', 'skew', kurt],
        'xtrack_dist': ['mean', 'std', 'skew', kurt],
        'avg_surr_speed': ['mean', 'std', 'skew', kurt],
        'lanes': ['mean'],
        'len': ['mean'],
        'speed': ['mean', 'std', 'skew', kurt, 'sum'],
        'vehicle_density': ['mean', 'std', 'skew', kurt],
        'lon_acc': ['mean', 'std', 'max', 'min', 'skew', kurt],
        'lat_acc': ['mean', 'std', 'max', 'min', 'skew', kurt],
        'type': 'first',
    })

    # flatten the (column, statistic) header into single names
    df_agg.columns = ['_'.join(col) for col in df_agg.columns]
    # NOTE(review): 0.04 looks like the sampling interval in seconds (the time
    # stamps advance by 0.04), which would turn the speed sum into a distance
    # - confirm.
    df_agg['speed_sum'] = df_agg['speed_sum'] * 0.04
    df_agg = df_agg.rename(columns={'type_first': 'type'})

    if not is_validation_set:
        # Undersample every class to the size of the rarest class. The target
        # size is computed once instead of once per group.
        n_min = int(df_agg.groupby('type').size().min())
        df_agg = pd.concat(
            [rows.sample(n_min) for _, rows in df_agg.groupby('type')]
        ).reset_index(drop=True)

    return df_agg.drop('type', axis=1), df_agg['type']


In [38]:
def train_and_accuracy(X_train,y_train,X_test,y_test, model):
    """Fit `model` on the training split and score it on the test split.

    Returns:
        ``(accuracy, f1)`` where accuracy is the fraction of test rows whose
        predicted label equals the true label, and f1 is the F1 score with
        'Car' as the positive class.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    hits = predicted == y_test

    # binarise both label vectors: 1 for 'Car', 0 for everything else
    truth_is_car = (y_test == 'Car').astype(int)
    pred_is_car = (predicted == 'Car').astype(int)
    car_f1 = f1_score(truth_is_car, pred_is_car)

    accuracy = len(hits[hits]) / len(y_test)
    return accuracy, car_f1

def val_voting_accuracy(X_train,y_train,X_val,y_val, model,by_edge = False):
    """Fit `model`, then score it on the validation set by majority vote.

    Every validation row (one sub-trajectory) is classified. The predictions
    are then pooled per vehicle, or per vehicle and edge, and the most frequent
    predicted label becomes the prediction for the whole pool.

    Args:
        X_train, y_train: training features and labels.
        X_val, y_val: validation features and labels. ``y_val`` must carry the
            index levels file_name and id (plus edge_id when `by_edge`).
        model: estimator exposing ``fit`` and ``predict``.
        by_edge: pool votes per (file_name, id, edge_id) instead of per
            (file_name, id).

    Returns:
        ``(accuracy, f1)`` over the pooled predictions, with 'Car' as the
        positive class for the F1 score.
    """
    model.fit(X_train, y_train)
    votes = pd.DataFrame(index=y_val.index, data=model.predict(X_val), columns=['type'])

    vote_keys = ['file_name', 'id', 'edge_id'] if by_edge else ['file_name', 'id']

    # Majority vote per pool; mode()[0] picks the first of the returned modes
    # when several labels are equally frequent.
    y_hat = votes.groupby(vote_keys)['type'].agg(lambda labels: labels.mode()[0])
    # All rows of a pool share one true label, so the first one is enough.
    # first() is called without arguments: it takes flags, not a column list.
    y_test = y_val.groupby(vote_keys).first()

    a = y_hat == y_test

    f = f1_score((y_test == 'Car').astype(int), (y_hat == 'Car').astype(int))
    return int(a.sum()) / len(y_test), f


In [50]:

def validation_set(df,test_size):
    """Split `df` into a training part and a class-balanced validation part.

    The split is made on the distinct (file_name, id, type) combinations rather
    than on individual rows, stratified by type and with a fixed
    ``random_state`` of 4. The held-out vehicles are then undersampled so that
    every type appears equally often, and rows of `df` are assigned to either
    part according to their (file_name, id).

    Returns:
        ``(df_train, df_test)``
    """
    vehicles = df.reset_index()[["file_name", 'id', 'type']].drop_duplicates()
    vehicle_keys = vehicles[["file_name", "id"]]
    vehicle_types = vehicles['type']
    X_train, X_test, _, y_test = train_test_split(
        vehicle_keys, vehicle_types,
        test_size=test_size, random_state=4, stratify=vehicle_types,
    )

    train_ids = X_train.set_index(['file_name', 'id']).index
    df_train = df[df.index.droplevel(['time', 'edge_id']).isin(train_ids)]

    # balance the held-out vehicles: sample each type down to the rarest type
    X_test['type'] = y_test
    by_type = X_test.groupby('type')
    X_test = by_type.apply(
        lambda group: group.sample(by_type.size().min())
    ).reset_index(drop = True)

    test_ids = X_test.set_index(['file_name', 'id']).index
    df_test = df[df.index.droplevel(['time', 'edge_id']).isin(test_ids)]
    return df_train, df_test

def process_traj(df2,traj_len,overlap,speed_limit,min_movement_limit,vehicle_density_limit = None):
    """Window the trajectories and drop windows that are mostly stationary or
    (optionally) recorded in sparse traffic.

    Args:
        df2: trajectory frame accepted by ``split_trajectories_overlap``.
        traj_len: number of rows per window.
        overlap: overlap setting forwarded to ``split_trajectories_overlap``.
        speed_limit: a row counts as "moving" when its speed exceeds this value.
        min_movement_limit: minimum fraction of `traj_len` rows that must be
            moving for a window to be kept.
        vehicle_density_limit: when not None, additionally keep only windows
            whose mean ``vehicle_density`` is at least this value.

    Returns:
        The windowed frame restricted to the windows that pass the filters.
    """
    traj_keys = ["file_name", "id", "edge_id", "traj"]
    df2 = split_trajectories_overlap(df2, traj_len, overlap)

    # Count the moving rows of every window. The column is selected explicitly
    # rather than passing a column list into sum(), whose first positional
    # parameter is the numeric_only flag.
    speeds = df2.reset_index()[traj_keys + ["speed"]]
    speeds = speeds.assign(speed_bool=speeds["speed"] > speed_limit)
    moving_rows = speeds.groupby(traj_keys)["speed_bool"].sum()
    keep = moving_rows[moving_rows >= traj_len * min_movement_limit].index.to_list()
    # level 4 is dropped so the remaining index matches the per-window keys
    df2 = df2[df2.index.droplevel(4).isin(keep)]

    if vehicle_density_limit is None:
        return df2

    # keep windows whose mean vehicle density reaches the requested minimum
    density = df2.reset_index()[traj_keys + ["vehicle_density"]]
    mean_density = density.groupby(traj_keys)["vehicle_density"].mean()
    keep = mean_density[mean_density >= vehicle_density_limit].index.to_list()
    return df2[df2.index.droplevel(4).isin(keep)]
    
    

In [48]:

# Experiment 1: sweep the window length with a fixed overlap and no vehicle
# density filter. For every length and model, record k-fold accuracy/F1 on the
# balanced training windows and voting accuracy/F1 on the held-out vehicles.
traj_lens = np.arange(300,450, step=50)
models = {
        'Random Forest': Pipeline([('scaler', StandardScaler()), ('rf', RandomForestClassifier())]),
        'AdaBoost':Pipeline([('scaler', StandardScaler()), ('abc', AdaBoostClassifier())]) ,
        'SVM': Pipeline([('scaler', StandardScaler()), ('svc', SVC(max_iter=10000))]) ,
        'Log Regression': Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression(max_iter=10000))]) 
    }

df_acc = pd.DataFrame(index=pd.MultiIndex.from_product([models.keys(),['f1_score','accuracy'], ['mean','std']]))
overlap = 0.6
min_movement_limit = 0.75
speed_limit = 0
k = 5
validation_ratio = 0.2
kf = StratifiedKFold(n_splits=k, shuffle=True)
accs = np.zeros(k)
f1 = np.zeros(k)

# The number of rows per (file, vehicle, edge) does not depend on traj_len, so
# it is counted once here instead of once per loop iteration.
edge_row_counts = df.groupby(['file_name','id','edge_id',]).count()['lat']

for traj_len in traj_lens:
    kfold_col = str(traj_len)+'_kfold'
    woedge_col = str(traj_len)+'_val_woedge'
    by_edge_col = str(traj_len)+'_val_by_edge'

    #limit trajectory length to greater than traj_len
    df_traj_list = edge_row_counts[edge_row_counts >= traj_len]
    df2 = df[df.index.droplevel('time').isin(df_traj_list.index.to_list())].copy()
    df_train,df_val = validation_set(df2,validation_ratio)

    #process trajectories further
    df_train = process_traj(df_train,traj_len,overlap,speed_limit,min_movement_limit)
    df_val = process_traj(df_val,traj_len,overlap,speed_limit,min_movement_limit)

    #aggregate trajectories
    X,y = agg(df_train)
    #include index for validation dataframes (needed for voting method)
    X_val,y_val = agg(df_val,True)

    print("No of trajectories: ",len(X))

    # distinct vehicles, and distinct (vehicle, edge) pairs, in the validation set
    woedge_count = y_val.reset_index(['traj','edge_id'],drop = True).reset_index().drop_duplicates()
    by_edge_count = y_val.reset_index('traj',drop = True).reset_index().drop_duplicates()

    #store sample counts and percent cars and taxis for each evaluation setting
    for col, labels in ((kfold_col, y),
                        (woedge_col, woedge_count['type']),
                        (by_edge_col, by_edge_count['type'])):
        df_acc.loc[('traj_len','traj_len','total'), col] = len(labels)
        df_acc.loc[('traj_len','percent','Car'), col] = (labels == 'Car').sum()/len(labels)
        df_acc.loc[('traj_len','percent','Taxi'), col] = (labels == 'Taxi').sum()/len(labels)

    # fit different models
    for name, model in models.items():
        # accs / f1 are fully overwritten for every model (k entries each)
        for i, (train_index, test_index) in enumerate(kf.split(X,y)):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            accs[i],f1[i] = train_and_accuracy(X_train, y_train,X_test,y_test, model)

        df_acc.loc[(name, 'accuracy','mean'), kfold_col] = round(100*accs.mean(), 3)
        df_acc.loc[(name, 'accuracy','std'), kfold_col] = round(100*accs.std(), 3)
        df_acc.loc[(name, 'f1_score','mean'), kfold_col] = round(100*f1.mean(), 3)
        df_acc.loc[(name, 'f1_score','std'), kfold_col] = round(100*f1.std(), 3)

        # voting on the held-out vehicles: first per vehicle, then per (vehicle, edge)
        for col, per_edge in ((woedge_col, False), (by_edge_col, True)):
            val_accs,val_f1 = val_voting_accuracy(X, y,X_val,y_val, model,by_edge = per_edge)
            df_acc.loc[(name, 'accuracy','mean'), col] = round(100*val_accs, 3)
            df_acc.loc[(name, 'f1_score','mean'), col] = round(100*val_f1, 3)

        print(name, 'complete.')

    print('trajectory length', traj_len, 'complete.')

df_acc

No of Cars in validation set 105, No of Taxis in validation set 105
No of trajectories:  2834
Random Forest complete.
AdaBoost complete.
SVM complete.
Log Regression complete.
trajectory length 300 complete.
No of Cars in validation set 95, No of Taxis in validation set 95
No of trajectories:  2058
Random Forest complete.
AdaBoost complete.
SVM complete.
Log Regression complete.
trajectory length 350 complete.
No of Cars in validation set 85, No of Taxis in validation set 85
No of trajectories:  1536
Random Forest complete.
AdaBoost complete.
SVM complete.
Log Regression complete.
trajectory length 400 complete.


Unnamed: 0,Unnamed: 1,Unnamed: 2,300_kfold,300_val_woedge,300_val_by_edge,350_kfold,350_val_woedge,350_val_by_edge,400_kfold,400_val_woedge,400_val_by_edge
Random Forest,f1_score,mean,61.249,61.611,58.824,62.608,57.923,54.98,58.631,56.757,53.333
Random Forest,f1_score,std,2.346,,,2.72,,,4.567,,
Random Forest,accuracy,mean,62.173,60.099,56.818,64.189,56.497,54.98,60.413,56.463,54.95
Random Forest,accuracy,std,1.546,,,1.723,,,3.886,,
AdaBoost,f1_score,mean,58.278,61.165,60.256,61.518,58.14,56.432,56.08,51.471,52.747
AdaBoost,f1_score,std,0.946,,,1.31,,,2.463,,
AdaBoost,accuracy,mean,60.339,60.591,59.74,63.314,59.322,58.167,58.141,55.102,57.426
AdaBoost,accuracy,std,1.317,,,1.027,,,3.052,,
SVM,f1_score,mean,55.334,54.737,54.417,57.992,52.5,47.847,52.883,53.226,51.534
SVM,f1_score,std,2.823,,,3.813,,,4.682,,


In [52]:
from IPython.display import display
# Experiment 2: repeat the window-length sweep for increasing minimum vehicle
# density (j = 0..5). df_acc is created once, so every density pass overwrites
# the cells of the previous pass before the table is displayed.
traj_lens = np.arange(200,450, step=50)
df_acc = pd.DataFrame(index=pd.MultiIndex.from_product([models.keys(),['f1_score','accuracy'], ['mean','std']]))
overlap = 0.7
min_movement_limit = 0.75
speed_limit = 0
k = 5
validation_ratio = 0.2
kf = StratifiedKFold(n_splits=k, shuffle=True)
accs = np.zeros(k)
f1 = np.zeros(k)

# The number of rows per (file, vehicle, edge) depends on neither j nor
# traj_len, so it is counted once instead of once per inner iteration.
edge_row_counts = df.groupby(['file_name','id','edge_id',]).count()['lat']

for j in range(0,6):

    for traj_len in traj_lens:
        kfold_col = str(traj_len)+'_kfold'
        woedge_col = str(traj_len)+'_val_woedge'
        by_edge_col = str(traj_len)+'_val_by_edge'

        #limit trajectory length to greater than traj_len
        df_traj_list = edge_row_counts[edge_row_counts >= traj_len]
        df2 = df[df.index.droplevel('time').isin(df_traj_list.index.to_list())].copy()
        df_train,df_val = validation_set(df2,validation_ratio)

        #process trajectories further, keeping windows with mean vehicle density >= j
        df_train = process_traj(df_train,traj_len,overlap,speed_limit,min_movement_limit,j)
        df_val = process_traj(df_val,traj_len,overlap,speed_limit,min_movement_limit,j)

        #aggregate trajectories
        X,y = agg(df_train)
        #include index for validation dataframes (needed for voting method)
        X_val,y_val = agg(df_val,True)

        # distinct vehicles, and distinct (vehicle, edge) pairs, in the validation set
        woedge_count = y_val.reset_index(['traj','edge_id'],drop = True).reset_index().drop_duplicates()
        by_edge_count = y_val.reset_index('traj',drop = True).reset_index().drop_duplicates()

        #store sample counts and percent cars and taxis for each evaluation setting
        for col, labels in ((kfold_col, y),
                            (woedge_col, woedge_count['type']),
                            (by_edge_col, by_edge_count['type'])):
            df_acc.loc[('traj_len','traj_len','total'), col] = len(labels)
            df_acc.loc[('traj_len','percent','Car'), col] = (labels == 'Car').sum()/len(labels)
            df_acc.loc[('traj_len','percent','Taxi'), col] = (labels == 'Taxi').sum()/len(labels)

        # fit different models
        for name, model in models.items():
            # accs / f1 are fully overwritten for every model (k entries each)
            for i, (train_index, test_index) in enumerate(kf.split(X,y)):
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                accs[i],f1[i] = train_and_accuracy(X_train, y_train,X_test,y_test, model)

            df_acc.loc[(name, 'accuracy','mean'), kfold_col] = round(100*accs.mean(), 3)
            df_acc.loc[(name, 'accuracy','std'), kfold_col] = round(100*accs.std(), 3)
            df_acc.loc[(name, 'f1_score','mean'), kfold_col] = round(100*f1.mean(), 3)
            df_acc.loc[(name, 'f1_score','std'), kfold_col] = round(100*f1.std(), 3)

            # voting on the held-out vehicles: first per vehicle, then per (vehicle, edge)
            for col, per_edge in ((woedge_col, False), (by_edge_col, True)):
                val_accs,val_f1 = val_voting_accuracy(X, y,X_val,y_val, model,by_edge = per_edge)
                df_acc.loc[(name, 'accuracy','mean'), col] = round(100*val_accs, 3)
                df_acc.loc[(name, 'f1_score','mean'), col] = round(100*val_f1, 3)

        print('model fitting for ', traj_len, ' complete.')

    print("\n vehicle density >= ",j)
    display(df_acc)

model fitting for  200  complete.
model fitting for  250  complete.
model fitting for  300  complete.
model fitting for  350  complete.
model fitting for  400  complete.

 vehicle density >=  0


Unnamed: 0,Unnamed: 1,Unnamed: 2,200_kfold,200_val_woedge,200_val_by_edge,250_kfold,250_val_woedge,250_val_by_edge,300_kfold,300_val_woedge,300_val_by_edge,350_kfold,350_val_woedge,350_val_by_edge,400_kfold,400_val_woedge,400_val_by_edge
Random Forest,f1_score,mean,65.233,65.942,61.538,64.306,59.289,61.749,64.647,60.194,60.328,65.761,56.995,56.589,63.236,60.759,60.274
Random Forest,f1_score,std,0.722,,,2.206,,,0.631,,,2.32,,,2.083,,
Random Forest,accuracy,mean,66.531,62.249,59.397,65.474,54.425,58.084,66.11,59.406,59.259,67.192,52.841,55.556,65.078,59.74,59.155
Random Forest,accuracy,std,0.54,,,2.257,,,0.731,,,2.215,,,1.791,,
AdaBoost,f1_score,mean,59.76,61.654,60.137,60.094,59.504,58.382,59.323,58.128,55.862,59.863,58.824,56.652,57.491,60.0,56.716
AdaBoost,f1_score,std,1.44,,,1.161,,,2.745,,,1.961,,,1.248,,
AdaBoost,accuracy,mean,62.035,59.036,59.397,61.465,56.637,56.886,61.897,57.921,56.902,62.134,60.227,59.921,59.627,61.039,59.155
AdaBoost,accuracy,std,1.037,,,1.049,,,2.131,,,1.623,,,1.262,,
SVM,f1_score,mean,58.221,61.355,57.416,58.121,62.447,62.275,59.488,53.886,53.091,58.913,55.0,51.376,55.927,57.143,53.631
SVM,f1_score,std,1.194,,,1.616,,,0.854,,,1.899,,,2.172,,


model fitting for  200  complete.
model fitting for  250  complete.
model fitting for  300  complete.
model fitting for  350  complete.
model fitting for  400  complete.

 vehicle density >=  1


Unnamed: 0,Unnamed: 1,Unnamed: 2,200_kfold,200_val_woedge,200_val_by_edge,250_kfold,250_val_woedge,250_val_by_edge,300_kfold,300_val_woedge,300_val_by_edge,350_kfold,350_val_woedge,350_val_by_edge,400_kfold,400_val_woedge,400_val_by_edge
Random Forest,f1_score,mean,64.648,65.152,62.896,64.975,61.905,60.641,63.857,64.815,67.524,63.83,58.696,56.031,62.492,54.667,53.0
Random Forest,f1_score,std,1.018,,,1.516,,,1.33,,,1.307,,,1.509,,
Random Forest,accuracy,mean,66.174,62.449,61.682,65.98,56.951,58.589,65.442,62.376,65.878,66.166,56.818,55.686,64.146,54.967,54.369
Random Forest,accuracy,std,0.908,,,1.207,,,0.854,,,1.023,,,0.953,,
AdaBoost,f1_score,mean,58.234,59.35,56.872,59.111,63.415,60.694,59.921,61.165,59.31,58.319,56.977,58.065,57.222,60.131,57.0
AdaBoost,f1_score,std,0.98,,,0.8,,,1.243,,,1.57,,,2.939,,
AdaBoost,accuracy,mean,61.22,59.184,57.477,61.24,59.641,58.282,62.333,60.396,60.135,60.3,57.955,59.216,58.839,59.603,58.252
AdaBoost,accuracy,std,0.853,,,0.55,,,0.451,,,1.205,,,2.801,,
SVM,f1_score,mean,57.815,59.504,56.235,59.077,63.559,60.923,57.004,61.307,60.993,55.858,52.98,51.163,56.427,57.353,56.0
SVM,f1_score,std,1.24,,,2.024,,,0.702,,,1.852,,,1.165,,


model fitting for  200  complete.
model fitting for  250  complete.
model fitting for  300  complete.
model fitting for  350  complete.
model fitting for  400  complete.

 vehicle density >=  2


Unnamed: 0,Unnamed: 1,Unnamed: 2,200_kfold,200_val_woedge,200_val_by_edge,250_kfold,250_val_woedge,250_val_by_edge,300_kfold,300_val_woedge,300_val_by_edge,350_kfold,350_val_woedge,350_val_by_edge,400_kfold,400_val_woedge,400_val_by_edge
Random Forest,f1_score,mean,66.535,62.626,56.705,65.942,63.102,58.871,65.14,57.516,58.768,63.751,57.746,59.375,60.016,58.065,52.229
Random Forest,f1_score,std,1.909,,,1.496,,,1.664,,,1.247,,,2.476,,
Random Forest,accuracy,mean,67.4,58.192,55.336,66.143,57.407,55.263,66.437,56.081,58.768,65.3,55.224,57.609,61.369,58.4,53.988
Random Forest,accuracy,std,1.791,,,1.657,,,1.153,,,1.38,,,1.296,,
AdaBoost,f1_score,mean,59.786,58.242,56.574,61.059,60.541,57.613,61.146,58.065,56.075,58.698,53.623,53.191,55.097,55.385,54.545
AdaBoost,f1_score,std,1.87,,,1.39,,,0.782,,,2.595,,,5.6,,
AdaBoost,accuracy,mean,61.391,57.062,56.917,61.95,54.938,54.825,62.046,56.081,55.45,60.248,52.239,52.174,56.528,53.6,53.988
AdaBoost,accuracy,std,1.582,,,1.171,,,1.187,,,1.366,,,5.011,,
SVM,f1_score,mean,58.421,58.427,55.319,58.624,59.756,59.193,56.662,52.174,50.777,55.769,55.118,52.761,56.109,54.054,52.555
SVM,f1_score,std,1.489,,,2.481,,,3.173,,,1.55,,,2.478,,


model fitting for  200  complete.
model fitting for  250  complete.
model fitting for  300  complete.
model fitting for  350  complete.
model fitting for  400  complete.

 vehicle density >=  3


Unnamed: 0,Unnamed: 1,Unnamed: 2,200_kfold,200_val_woedge,200_val_by_edge,250_kfold,250_val_woedge,250_val_by_edge,300_kfold,300_val_woedge,300_val_by_edge,350_kfold,350_val_woedge,350_val_by_edge,400_kfold,400_val_woedge,400_val_by_edge
Random Forest,f1_score,mean,65.098,57.778,57.627,64.307,62.903,56.552,64.455,58.407,54.839,62.295,51.923,50.769,61.157,54.054,54.118
Random Forest,f1_score,std,1.087,,,3.263,,,2.764,,,0.634,,,2.818,,
Random Forest,accuracy,mean,64.61,54.4,52.532,63.85,57.407,52.632,63.699,55.238,53.719,60.787,48.98,45.763,59.891,50.725,51.852
Random Forest,accuracy,std,0.827,,,3.114,,,2.602,,,1.95,,,3.529,,
AdaBoost,f1_score,mean,60.617,58.156,56.18,59.903,54.386,54.015,57.842,58.182,58.065,55.98,58.252,57.377,53.054,52.174,48.718
AdaBoost,f1_score,std,1.481,,,1.856,,,3.429,,,3.256,,,2.206,,
AdaBoost,accuracy,mean,60.773,52.8,50.633,60.111,51.852,52.632,58.208,56.19,57.025,56.6,56.122,55.932,53.478,52.174,50.617
AdaBoost,accuracy,std,1.738,,,1.747,,,2.841,,,2.306,,,2.079,,
SVM,f1_score,mean,59.414,58.647,56.098,57.963,55.172,52.941,58.908,52.336,47.458,60.561,54.545,52.632,55.978,48.387,45.455
SVM,f1_score,std,1.918,,,1.952,,,1.585,,,4.513,,,1.627,,


model fitting for  200  complete.
model fitting for  250  complete.
model fitting for  300  complete.
model fitting for  350  complete.
model fitting for  400  complete.

 vehicle density >=  4


Unnamed: 0,Unnamed: 1,Unnamed: 2,200_kfold,200_val_woedge,200_val_by_edge,250_kfold,250_val_woedge,250_val_by_edge,300_kfold,300_val_woedge,300_val_by_edge,350_kfold,350_val_woedge,350_val_by_edge,400_kfold,400_val_woedge,400_val_by_edge
Random Forest,f1_score,mean,65.611,53.846,57.447,67.948,59.524,57.447,64.614,57.971,58.667,62.637,64.286,59.649,61.723,51.163,53.061
Random Forest,f1_score,std,1.626,,,1.428,,,2.268,,,4.153,,,6.746,,
Random Forest,accuracy,mean,65.213,48.571,51.22,67.088,49.254,47.368,63.011,50.847,50.0,61.669,57.447,55.769,60.937,53.333,54.0
Random Forest,accuracy,std,1.296,,,0.461,,,1.087,,,3.82,,,6.437,,
AdaBoost,f1_score,mean,62.771,48.649,50.0,65.126,56.098,55.556,57.041,58.824,59.155,54.559,54.902,54.545,58.342,51.282,50.0
AdaBoost,f1_score,std,2.196,,,2.525,,,3.003,,,5.965,,,4.098,,
AdaBoost,accuracy,mean,63.201,45.714,46.341,64.649,46.269,47.368,57.586,52.542,53.226,55.239,51.064,51.923,59.164,57.778,56.0
AdaBoost,accuracy,std,2.591,,,1.892,,,3.173,,,5.302,,,3.503,,
SVM,f1_score,mean,61.464,53.333,49.412,62.645,55.0,53.933,60.116,58.824,60.0,58.565,57.692,58.182,55.63,51.064,50.98
SVM,f1_score,std,3.391,,,2.424,,,4.173,,,1.99,,,4.461,,


model fitting for  200  complete.
model fitting for  250  complete.
model fitting for  300  complete.
model fitting for  350  complete.
model fitting for  400  complete.

 vehicle density >=  5


Unnamed: 0,Unnamed: 1,Unnamed: 2,200_kfold,200_val_woedge,200_val_by_edge,250_kfold,250_val_woedge,250_val_by_edge,300_kfold,300_val_woedge,300_val_by_edge,350_kfold,350_val_woedge,350_val_by_edge,400_kfold,400_val_woedge,400_val_by_edge
Random Forest,f1_score,mean,67.244,51.282,46.512,66.267,65.217,58.333,64.102,60.0,59.459,62.394,62.857,58.824,55.289,45.455,44.444
Random Forest,f1_score,std,0.863,,,4.524,,,3.507,,,4.98,,,7.466,,
Random Forest,accuracy,mean,66.396,44.118,41.026,65.197,52.941,42.857,62.267,50.0,53.125,60.606,51.852,51.724,55.038,52.0,42.308
Random Forest,accuracy,std,1.407,,,4.171,,,3.58,,,5.669,,,4.631,,
AdaBoost,f1_score,mean,64.668,54.054,48.78,61.345,68.182,68.182,62.253,55.556,55.556,58.698,50.0,50.0,54.888,33.333,38.462
AdaBoost,f1_score,std,3.619,,,5.375,,,5.143,,,6.634,,,4.731,,
AdaBoost,accuracy,mean,64.215,50.0,46.154,62.158,58.824,60.0,63.592,50.0,50.0,58.788,48.148,51.724,55.392,36.0,38.462
AdaBoost,accuracy,std,3.51,,,4.231,,,3.864,,,5.016,,,3.866,,
SVM,f1_score,mean,61.519,55.556,50.0,64.525,65.217,63.83,63.16,57.895,57.895,61.483,64.516,62.5,54.769,60.0,63.636
SVM,f1_score,std,2.889,,,5.61,,,3.872,,,2.321,,,2.978,,
