classical_models.ipynb

by: Archie Gertsman (arkadiy2@illinois.edu)
Lloyd Fernandes (lloydf2@illinois.edu)

Project director: Richard Sowers

r-sowers@illinois.eduhttps://publish.illinois.edu/r-sowers/

Copyright 2019 University of Illinois Board of Trustees. All Rights Reserved. Licensed under the MIT license


In [1]:
import warnings; warnings.simplefilter('ignore')

In [2]:
import sys
sys.path.append('../../Lib/')
import pandas as pd
import numpy as np
from feature_eng import split_trajectories
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from time import time
from sklearn.preprocessing import PolynomialFeatures
from IPython.display import display
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
import seaborn as sns

In [3]:
df = pd.read_pickle('block4_concat_lane.pkl')  \
    .set_index('edge_id', append=True) \
    .reorder_levels((0,1,3,2))

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,lat,lon,speed,lon_acc,lat_acc,type,traveled_d,avg_speed,bearing,nearest_edge_start_node,...,edge_progress_intervals,len,lanes,node_veh_dist,edge_seg,vehicle_density,avg_surr_speed,edge_bearing,acc_edge,acc_per_edge
file_name,id,edge_id,time,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
4_1,1,250699362_250699984,42.0,37.982746,23.732961,11.9046,-0.1145,0.0138,Taxi,182.37,9.740748,1.570795,250699362,...,0.3,97.581,5.4,29.81433,1.0,7,10.464171,-2.83013,0.11322,0.021953
4_1,1,250699362_250699984,42.04,37.982746,23.732963,11.8975,-0.1007,0.0147,Taxi,182.37,9.740748,0.168572,250699362,...,0.3,97.581,5.4,29.67483,1.0,7,10.457843,-2.83013,0.10036,0.016867
4_1,1,250699362_250699984,42.08,37.982747,23.732964,11.8919,-0.0918,0.0157,Taxi,182.37,9.740748,0.168573,250699362,...,0.3,97.581,5.4,29.537753,1.0,7,10.452857,-2.83013,0.092194,0.013188
4_1,1,250699362_250699984,42.12,37.982748,23.732965,11.8871,-0.0869,0.0167,Taxi,182.37,9.740748,1.570796,250699362,...,0.3,97.581,5.4,29.400718,1.0,7,10.448586,-2.83013,0.087837,0.010734
4_1,1,250699362_250699984,42.16,37.982748,23.732966,11.8831,-0.0784,0.0176,Taxi,182.37,9.740748,0.32808,250699362,...,0.3,97.581,5.4,29.330986,1.0,7,10.444986,-2.83013,0.080021,0.007273


In [60]:
def rolling_agg(df, agg_dict, window_size=100, step=25):
    # rolling agg with step size = 1
    df_agg = df.groupby(df.index.names[:-1]) \
                .rolling(window_size) \
                .agg(agg_dict) \
                .dropna()
    #print(df_agg)
    # select a subset of above computations to achieve custom step size
    df_agg = df_agg.groupby(df_agg.index.names, 
                            as_index=False, 
                            group_keys=False) \
                .apply(lambda x: x[::step])
    #print(df_agg)
    df_agg.columns = ['_'.join(col) for col in df_agg.columns]
    
    
    # add 'type' column
    vehicle_types = df.type.groupby(df.index.names[:-1]).first()
    #print(vehicle_types)
    return df_agg.join(vehicle_types)
  
def speed_ratio(grp, min_speed=0):
   
    return len(grp[grp.speed > min_speed]) / len(grp)

def validation_set(df,test_size):
    """dataframe is split based on their vehicle id's"""
    df_val = df.reset_index()[["file_name",'id','type']].drop_duplicates()
    X,y = df_val[["file_name","id"]],df_val['type']
    X_train,X_test,_,y_test = train_test_split(X, y, test_size=test_size, random_state=4, stratify=y) 
    df_train = df[df.index.droplevel(['time','edge_id']).isin(X_train.set_index(['file_name','id']).index)]
    X_test['type'] = y_test
    g = X_test.groupby('type')
    X_test = g.apply(lambda group: group.sample(g.size().min())).reset_index(drop = True)
    df_test = df[df.index.droplevel(['time','edge_id']).isin(X_test.set_index(['file_name','id']).index)]
    return df_train,df_test

def train_and_accuracy(X_test,y_test, model):

    #model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    a = y_hat==y_test
    
    f = f1_score((y_test == 'Car').astype(int),(y_hat == 'Car').astype(int))
    return len(a[a==True]) / len(y_test),f

def val_voting_accuracy(X_val,y_val, model,by_edge = False,predict_proba = False,display = False):

    if predict_proba == True:
        
        y_hat = pd.DataFrame(index = y_val.index,data = model.predict_proba(X_val),columns = model.classes_)
        y_hat_orig = y_hat.copy()
        #print(y_hat_orig)
        if by_edge == False:
            
            #predicted value for the entire trajectory would be the mode of the predicted labels
            y_hat = y_hat.groupby(['file_name','id']).mean()
            y_hat = y_hat.idxmax(axis=1)#.to_numpy()
            y_test = y_val.groupby(['file_name','id']).first(['type'])
             
        else:

            #predicted value for the entire trajectory would be the mode of the predicted labels
            y_hat = y_hat.groupby(['file_name','id','edge_id']).mean()
            y_hat = y_hat.idxmax(axis=1)#.to_numpy()
            y_test = y_val.groupby(['file_name','id','edge_id']).first(['type'])

    else:
        y_hat = model.predict(X_val)
        y_hat = pd.DataFrame(index = y_val.index,data = y_hat,columns = ['type'])
        y_hat_orig = y_hat.copy()
        if by_edge == False:
            
            #predicted value for the entire trajectory would be the mode of the predicted labels
            y_hat = y_hat.groupby(['file_name','id']).apply(lambda group: pd.Series.mode(group['type'])[0])
            y_test = y_val.groupby(['file_name','id']).first(['type'])
        else:
            
            #predicted value for the entire trajectory would be the mode of the predicted labels
            y_hat = y_hat.groupby(['file_name','id','edge_id']).apply(lambda group: pd.Series.mode(group['type'])[0])
            y_test = y_val.groupby(['file_name','id','edge_id']).first(['type'])

    if display:
        y_hat_orig['id_traj'] = list(range(len(y_hat_orig)))
        #y_hat_orig.set_index(['id_traj'], inplace = True,append = True)
        
        x_plot_num = 5
        y_plot_num = int(sum(y_hat!=y_test)/x_plot_num) +1
        fig, axes = plt.subplots(y_plot_num,x_plot_num, sharey = True, figsize=(5*x_plot_num,5*(y_plot_num)))
        axes = axes.ravel()
        i = 0
        
        for file_name,idx in X_val.index.droplevel((2)).unique():
            if str(y_hat.loc[(file_name,idx)]) == str(y_test.loc[(file_name,idx)]):
                continue
            
            axes[i].set(ylim=(0,1))
            type_str = "predicted: "+str(y_hat.loc[(file_name,idx)]) + ", actual: "+str(y_test.loc[(file_name,idx)])
            sns.barplot(y = 'Car',x = 'id_traj',data = y_hat_orig.loc[(file_name,idx)],ax = axes[i]).set_title("file_name: "+str(file_name)+", id "+str(idx)+" \n "+type_str)
            i+=1
            
        fig.tight_layout(h_pad=2)
        #plt.show()
        
        
    a = y_hat==y_test
   
    f = f1_score((y_test == 'Car').astype(int),(y_hat == 'Car').astype(int))
   
    return len(a[a==True]) / len(y_test),f
            
#val_voting_accuracy(X_val,y_val, model,predict_proba = True, display = True)       
#plt.savefig('fig.png',dpi = 100)       

def get_xy(df,overlap = None,traj_len = None,agg_dict = None,min_movement_limit = 0.75,outlier_limit=None,balance = None,downsample_feature_list = None,col_factor = None,window = None):
    
    if agg_dict is not None:
        df_agg =rolling_agg(df, window_size=traj_len, step=int((1 - overlap)*traj_len),agg_dict = agg_dict)
        df_agg = df_agg[df_agg.speed_bool_count*min_movement_limit <= df_agg.speed_bool_sum]
        df_agg.drop(['speed_bool_count','speed_bool_sum'],inplace= True,axis = 1)
        if outlier_limit is not None:
            df_agg = filter_by_percentile(df_agg,outlier_limit)
        if balance == 'by_edge':

            df_agg['type_count'] = df_agg['type']
            g_count = df_agg.groupby(['edge_id','type'], group_keys=False).count()['type_count']
            g = df_agg.groupby(['type','edge_id'], group_keys=False)
            df_agg = g.apply(lambda grp: grp.sample(min(g_count.loc[(grp.index.get_level_values(2)[0],slice(None))])))
            df_agg.drop('type_count',inplace = True,axis = 1)

        if balance == 'by_type':
            g = df_agg.groupby('type', group_keys=False)
            df_agg = g.apply(lambda grp: grp.sample(g.size().min()))
            
    elif downsample_feature_list is not None:
        
        df_agg = downsample(df,downsample_feature_list,col_factor,window)
        
        if outlier_limit is not None:
            df_agg = filter_by_percentile(df_agg,outlier_limit)
        if balance == 'by_edge':

            df_agg['type_count'] = df_agg['type']
            g_count = df_agg.groupby(['edge_id','type'], group_keys=False).count()['type_count']
            g = df_agg.groupby(['type','edge_id'], group_keys=False)
            df_agg = g.apply(lambda grp: grp.sample(min(g_count.loc[(grp.index.get_level_values(2)[0],slice(None))])))
            df_agg.drop('type_count',inplace = True,axis = 1)

        if balance == 'by_type':
            g = df_agg.groupby('type', group_keys=False)
            df_agg = g.apply(lambda grp: grp.sample(g.size().min()))
        
    X,y = df_agg.drop('type', axis=1), df_agg.type
    return X,y
  
def filter_by_percentile(df,percentile):
    
    top_le = 1-(percentile/100)
    bottom_le = percentile/100
    df_top = df.quantile(top_le).reset_index()
    df_top['cond'] ='('+df_top['index']+" <= "+df_top[top_le].astype(str)+')'
    df_bottom = df.quantile(bottom_le).reset_index()
    df_bottom['cond'] ='('+df_bottom['index']+" >= "+df_bottom[bottom_le].astype(str)+')'
    df = df.query(df_top.cond.str.cat(sep=' & '))
    df = df.query(df_bottom.cond.str.cat(sep=' & '))
    
    return df  

def __xtrack_dist_diff(df):
    """splits a vehicle trajectory into smaller trajectories of fixed size and removes
    the last (len(df) mod size) riws
    """
    
    #df["xtrack_diff"] = df.loc[:,['xtrack_dist']]- df.loc[:,['xtrack_dist']].shift(-1)
    #df["xtrack_diff"]=df['xtrack_diff'].fillna(0)
    df['xtrack_diff'] = df.xtrack_dist \
    .groupby(df.index.names[-1]) \
    .apply(lambda x: (x - x.shift(-1)).fillna(0))
    
    return df

def pivot(A, col_factor):
    c = A.shape[1]
    if A.size < col_factor*c:
        return None
    r_new = A.size // (col_factor*c)
    A = A[:col_factor*r_new]
    return A.to_numpy().reshape(r_new, col_factor*c)

def f(grp,window,col_factor):
    grp = grp.reset_index(level=(0,1,2), drop=True)
    grp.index = pd.TimedeltaIndex(grp.index,unit='s')
    grp = grp.resample(window).mean().reset_index(drop=True)
    return pd.DataFrame(pivot(grp,col_factor))


def downsample(df,feature_list,col_factor,window):
    df_lane_len = df[['len','lanes','type']].droplevel(3).reset_index().drop_duplicates().set_index(df.index.names[:-1])
    df = df[feature_list] \
        .groupby(df[feature_list].index.names[:-1]) \
        .apply(lambda grp: f(grp,window,col_factor)) \
        .dropna() \
        .reset_index(level=-1, drop=True)
   # print(df)
    df.columns = [feature+'_'+str(i) for i in range(col_factor) for feature in feature_list]
   
    df = df.join(df_lane_len)
    
    return df



In [35]:

df = __xtrack_dist_diff(df)
#df['xtrack_diff_sq'] = df['xtrack_diff']**2
#df['acc_edge_sq'] = df['acc_edge']**2
#df['acc_per_edge_sq'] = df['acc_per_edge']**2
#df['vehicle_density_by_lane'] = df['vehicle_density']/df['lanes']

In [36]:
class voting_model():
    def __init__(self,model,X,y):
        self.model = model
        self.voting_model = self.fit(X,y)
        
        
    def fit(self,X,y):
        """fit quadratic weighted function on model output using X,y"""
        
        model_output = self.generate_op_df(X)
        X_log = model_output.groupby(['file_name','id']).mean()
        
        Y_log = y.groupby(['file_name','id']).first(['type']).apply(lambda x: 1 if (self.model.classes_[0] == x) else -1 )
        
        model = LogisticRegression(penalty = 'none')
        return model.fit(X_log,Y_log)
    
    
    def generate_op_df(self,X):
        model_output = pd.DataFrame(data = self.model.predict_proba(X)[:,0],index = X.index,columns = ['x_1'])
        
        model_output['x_2'] = model_output['x_1']**2
        model_output['x_3'] = model_output['x_1']**3
        model_output['const'] = 1
        model_output['x_4'] = model_output['x_1']**4
        
        return model_output
    
    def predict(self,X):
        model_output = self.generate_op_df(X)
        X_test = model_output.groupby(['file_name','id']).mean()
        model_output = self.voting_model.predict(X_test)
        model_output = np.vectorize(lambda x: self.model.classes_[0] if (x>=0) else self.model.classes_[1])(model_output)
        
        return model_output
    
    def accuracy(self,X,y):
        y_test = y.groupby(['file_name','id']).first(['type'])
        y_hat = self.predict(X)
        y_hat = pd.DataFrame(index = y_test.index,data = y_hat,columns = ['type'])
        
        #predicted value for the entire trajectory would be the mode of the predicted labels
        #y_hat = y_hat.groupby(['file_name','id']).apply(lambda group: pd.Series.mode(group['type'])[0])
        #y_test = y_val.groupby(['file_name','id']).first(['type'])
        
        a = y_hat['type']==y_test
   
        f = f1_score((y_test == 'Car').astype(int),(y_hat == 'Car').astype(int))
        return len(a[a==True]) / len(y_test),f
        


In [37]:
class ensemble():
    def __init__(self,model_num,accuracy_measure,model_list = None):
        self.model_num = model_num
        self.accuracy_measure = accuracy_measure
        self.model_list = model_list
        
        
    def find_ensemble(self,df_acc,traj_len,vehicle_density,predict_proba = False):
        self.is_predict_proba = predict_proba
        self.model_list = df_acc.loc[(slice(None),'accuracy','mean'),(vehicle_density,traj_len,self.accuracy_measure)].sort_values(ascending = False).index.get_level_values(0)[:self.model_num].to_list()
      
    def fit(self,X,y,model_dict=None):
        self.model_dict = model_dict
        
        if model_dict == None:
            self.model_dict = {}
            for model in self.model_list:
                self.model_dict[model] = model.fit(X,y)
                
        values_view = model_dict.values()
        value_iterator = iter(values_view)
        self.classes_ = next(value_iterator).classes_  
                
    
    def predict(self,X):
        label_list = []
        df_model = pd.DataFrame(columns = self.model_list)
        
        if self.is_predict_proba == False:
            for model in self.model_list:
                df_model[model] = self.model_dict[model].predict(X)
            return df_model.apply(lambda x : x.mode(),axis = 1)[0].to_numpy()
            
        else:
            return self.predict_proba(X,get_label = True)
    
    def predict_proba(self,X,get_label = False):
        label_list = []
        model = list(self.model_dict.values())[0]
        df_model = pd.DataFrame(columns = pd.MultiIndex.from_product([self.model_list,model.classes_]))#,index = np.arange(0,len(X)))
        #df_model.loc[:,('MLP',model.classes_)] =  model.predict_proba(X)
        for name in self.model_list:
            model = self.model_dict[name]
            
            df_model.loc[:,(name,model.classes_)] = model.predict_proba(X)
            
        df_model = df_model.mean(axis=1, level=[1])
        
        if get_label == True:
            return df_model.idxmax(axis=1).to_numpy()
        else:
            return df_model.to_numpy()
        

In [42]:
#initial parameters
features_to_select = 10
models = {
        'Random Forest': Pipeline([('scaler', StandardScaler()), ('rf', RandomForestClassifier())]),
        'AdaBoost':Pipeline([('scaler', StandardScaler()), ('abc', AdaBoostClassifier())]) ,
        'SVM': Pipeline([('scaler', StandardScaler()), ('svc', SVC(max_iter=10000,probability = True))]) ,
        'Log Regression': Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression(max_iter=10000))]) ,
        'GBM': Pipeline([('scaler', StandardScaler()), ('gbm', GradientBoostingClassifier())]),
        'MLP': Pipeline([('scaler', StandardScaler()), ('mlp', MLPClassifier(hidden_layer_sizes = (250,100,25),max_iter=1000,\
                                                                             learning_rate = 'adaptive',early_stopping = True,n_iter_no_change = 10))])
                        
    }


df_acc = pd.DataFrame(index=pd.MultiIndex.from_product([models.keys(),['f1_score','accuracy'], ['mean']]))
overlap = 0.7
min_movement_limit = 0.75
speed_limit = 0
k = 5
validation_ratio = 0.2
kf = StratifiedKFold(n_splits=k, shuffle=True)
accs = np.zeros(k)
f1 = np.zeros(k)

agg_dict = {'xtrack_diff': ['mean','std'],
            'xtrack_dist': ['mean','std'],
            'avg_surr_speed': ['mean','std'],
            'lanes':['mean'],
            'len':['mean'],
            'speed':['mean','std'],
            #'speed_bool': ['count','sum'],
            'acc_edge': ['mean','std'],
            'acc_per_edge': ['mean','std']
            }
            
feature_list = ['xtrack_diff','xtrack_dist','avg_surr_speed','speed','acc_edge','acc_per_edge']



# factor by which the number of columns will increase after pivoting
col_factor = 2

# size of aggregating window in seconds
window = '5S'


# agg_dict = {'xtrack_diff': ['mean','std','skew',pd.DataFrame.kurt],
#             'xtrack_dist': ['mean','std','skew',pd.DataFrame.kurt],
#             'avg_surr_speed': ['mean','std','skew',pd.DataFrame.kurt],
#             'lanes':['mean'],
#             'len':['mean'],
#             'speed':['mean','std','skew',pd.DataFrame.kurt],
#             'acc_edge': ['mean','std','skew',pd.DataFrame.kurt],
#             'acc_per_edge': ['mean','std','skew',pd.DataFrame.kurt],
#             'xtrack_diff_sq': ['mean','std','skew',pd.DataFrame.kurt],
#             'acc_edge_sq': ['mean','std','skew',pd.DataFrame.kurt],
#             'acc_per_edge_sq': ['mean','std','skew',pd.DataFrame.kurt],
#             'vehicle_density_by_lane':['mean','std','skew',pd.DataFrame.kurt] }


In [76]:
# Car and Taxi classification
#plt.ioff()
col_factors = np.arange(10,60, step=10)
df_acc = pd.DataFrame(columns = pd.MultiIndex.from_product([[1],col_factors,['test','val_mean','val_log_voting']]),index=pd.MultiIndex.from_product([models.keys(),['accuracy','accuracy_baseline'], ['mean']]))
ensemble_2 = ensemble(2,'test')
ensemble_3 = ensemble(3,'test')
ensemble_5 = ensemble(5,'test')
validation_ratio = 0.2
test_ratio = 0.2
pca = PCA(n_components=5)
is_pca = False
vehicle_density = 1
is_log_model_voting = True
feature_generation_method = 'downsample'
col_factor = 30
window = '0.12S'
min_movement_limit = 1
for vehicle in ['Taxi','Car_1']:
    
    if vehicle == 'Car_1':
        df_type = df[df.type == 'Car']
        accuracy_metric = 'accuracy_baseline'
    else : 
        df_type = df.copy()
        accuracy_metric = 'accuracy'
        
    for col_factor in col_factors:
        traj_len = col_factor
        df_filtered = df_type.groupby(df_type.index.names[:-1]) \
                .filter(lambda grp: (len(grp) >= col_factor*3)&(speed_ratio(grp,speed_limit) >=min_movement_limit)) 
        
        #df_filtered['speed_bool'] = (df_filtered['speed']>speed_limit).astype(int)
        
        if vehicle == 'Car_1':
            #sample 50% of cars and label them as car_1
            df_index = df_filtered.reset_index()[['file_name','id']].drop_duplicates()
            df_filtered.loc[df_filtered.reset_index(['edge_id', 'time'],drop = True).index.isin(df_index.sample(frac = 0.5).set_index(['file_name','id']).index),'type']=vehicle
 
        df_train_test,df_val = validation_set(df_filtered,validation_ratio)
        df_train,df_test = validation_set(df_train_test,test_ratio)

        #aggregate trajectories
        #X,y = get_xy(df_train_test,overlap,traj_len,agg_dict,1)
        X_train,y_train = get_xy(df_train,outlier_limit = 1,balance = 'by_edge',downsample_feature_list = feature_list,col_factor = col_factor, window =window)
        X_test,y_test = get_xy(df_test,balance = 'by_type',downsample_feature_list = feature_list,col_factor =col_factor, window = window)
        X_test_voting,y_test_voting = get_xy(df_test,downsample_feature_list = feature_list,col_factor = col_factor, window = window)
        X_val,y_val = get_xy(df_val,downsample_feature_list = feature_list,col_factor = col_factor, window = window)

        if is_pca:
            pca.fit(X_train)
            X_test_voting = pd.DataFrame(data = pca.transform(X_test_voting),index = X_test_voting.index)
            X_train = pd.DataFrame(data = pca.transform(X_train),index = X_train.index)
            X_test = pd.DataFrame(data = pca.transform(X_test),index = X_test.index)
            X_val = pd.DataFrame(data = pca.transform(X_val),index = X_val.index)

        #store percent cars and taxis
        print("No of trajectories: ",len(X_train))
        print("No of Car trajectories: ",sum(y_train == 'Car'))
        print("No of "+vehicle+" trajectories: ",sum(y_train == vehicle))
        
        df_acc.loc[('traj_len','Car_'+vehicle,'total'), (vehicle_density,traj_len,'test')] = len(X_test)
        df_acc.loc[('traj_len','Car_'+vehicle+'_percent','Car'), (vehicle_density,traj_len,'test')] = sum(y_test == 'Car')/len(X_test)
        df_acc.loc[('traj_len','Car_'+vehicle+'_percent',vehicle), (vehicle_density,traj_len,'test')] = sum(y_test == vehicle)/ len(X_test)

        woedge_count = y_val.reset_index(['edge_id'],drop = True).reset_index().drop_duplicates()
        df_acc.loc[('traj_len','Car_'+vehicle,'total'), (vehicle_density,traj_len,'val_mean')] = len(woedge_count)
        df_acc.loc[('traj_len','Car_'+vehicle+'_percent','Car'), (vehicle_density,traj_len,'val_mean')] = sum(woedge_count.type == 'Car')/len(woedge_count)
        df_acc.loc[('traj_len','Car_'+vehicle+'_percent',vehicle), (vehicle_density,traj_len,'val_mean')] =sum(woedge_count.type == vehicle)/len(woedge_count)

        #by_edge_count = y_val.reset_index().drop_duplicates()
        df_acc.loc[('traj_len','Car_'+vehicle,'total'), (vehicle_density,traj_len,'val_log_voting')] = len(woedge_count)
        df_acc.loc[('traj_len','Car_'+vehicle+'_percent','Car'), (vehicle_density,traj_len,'val_log_voting')] = sum(woedge_count.type == 'Car')/len(woedge_count)
        df_acc.loc[('traj_len','Car_'+vehicle+'_percent',vehicle), (vehicle_density,traj_len,'val_log_voting')] = sum(woedge_count.type == vehicle)/len(woedge_count)

        model_dict = {}
        
        # fit different models
        for name, model in models.items():

            #fit the model on training set
            model.fit(X_train,y_train)

            #test the model on testing set and save accuracy estimate as test (this accuracy estimate will be used to find ensemble) 
            val_accs,val_f1 = train_and_accuracy(X_test,y_test,model)                                   
            df_acc.loc[(name, accuracy_metric,'mean'),  (vehicle_density,traj_len,'test')] = round(100*val_accs, 3)
            
            #find accuracy of the model on validation set with voting using mean
            val_accs,val_f1 = val_voting_accuracy(X_val,y_val, model,predict_proba = True)
            df_acc.loc[(name, accuracy_metric,'mean'), (vehicle_density,traj_len,'val_mean')] = round(100*val_accs, 3)
            #plt.savefig("traj_len"+str(traj_len)+name+".png")
            
            if is_log_model_voting:
            #train logistic regression for voting using the test set and training model
                voting_m = voting_model(model,X_test_voting,y_test_voting)
            #find the accuracy of the model on validation set with voting using logistic regression
                val_accs,val_f1 = voting_m.accuracy(X_val,y_val)#, voting_m, predict_proba = False)
                df_acc.loc[(name, accuracy_metric,'mean'), (vehicle_density,traj_len,'val_log_voting')] = round(100*val_accs, 3)
    
            #save model in dictionary for ensemble
            model_dict[name] = model

        #generate ensembles with 2,3 and 5 models
        ensemble_2.find_ensemble(df_acc,traj_len,vehicle_density,True)
        ensemble_2.fit(X_train,y_train,model_dict)
        ensemble_3.find_ensemble(df_acc,traj_len,vehicle_density,True)
        ensemble_3.fit(X_train,y_train,model_dict)
        ensemble_5.find_ensemble(df_acc,traj_len,vehicle_density,True)
        ensemble_5.fit(X_train,y_train,model_dict)

        #test accuracy of ensembles on validation set with mean 
        val_accs,val_f1 = val_voting_accuracy(X_val,y_val, ensemble_2)
        df_acc.loc[('ensemble_2', accuracy_metric,'mean'), (vehicle_density,traj_len,'val_mean')] = round(100*val_accs, 3)
        val_accs,val_f1 = val_voting_accuracy(X_val,y_val, ensemble_3)
        df_acc.loc[('ensemble_3', accuracy_metric,'mean'), (vehicle_density,traj_len,'val_mean')] = round(100*val_accs, 3)
        val_accs,val_f1 = val_voting_accuracy(X_val,y_val, ensemble_5)
        df_acc.loc[('ensemble_5', accuracy_metric,'mean'), (vehicle_density,traj_len,'val_mean')] = round(100*val_accs, 3)

        if is_log_model_voting:
            #test accuracy of ensembles on validation using logistic voting (trained on testing set)
            voting_m = voting_model(ensemble_2,X_test_voting,y_test_voting)
            val_accs,val_f1 = voting_m.accuracy(X_val,y_val)
            df_acc.loc[('ensemble_2', accuracy_metric,'mean'), (vehicle_density,traj_len,'val_log_voting')] = round(100*val_accs, 3)
            voting_m = voting_model(ensemble_3,X_test_voting,y_test_voting)
            val_accs,val_f1 = voting_m.accuracy(X_val,y_val)
            df_acc.loc[('ensemble_3', accuracy_metric,'mean'), (vehicle_density,traj_len,'val_log_voting')] = round(100*val_accs, 3)
            voting_m = voting_model(ensemble_5,X_test_voting,y_test_voting)
            val_accs,val_f1 = voting_m.accuracy(X_val,y_val)
            df_acc.loc[('ensemble_5', accuracy_metric,'mean'), (vehicle_density,traj_len,'val_log_voting')] = round(100*val_accs, 3)




No of trajectories:  10261
No of Car trajectories:  5136
No of Taxi trajectories:  5125
No of trajectories:  4481
No of Car trajectories:  2246
No of Taxi trajectories:  2235
No of trajectories:  2672
No of Car trajectories:  1336
No of Taxi trajectories:  1336
No of trajectories:  1877
No of Car trajectories:  939
No of Taxi trajectories:  938
No of trajectories:  1320
No of Car trajectories:  661
No of Taxi trajectories:  659
No of trajectories:  9437
No of Car trajectories:  4711
No of Car_1 trajectories:  4726
No of trajectories:  4249
No of Car trajectories:  2119
No of Car_1 trajectories:  2130
No of trajectories:  2403
No of Car trajectories:  1199
No of Car_1 trajectories:  1204
No of trajectories:  1565
No of Car trajectories:  783
No of Car_1 trajectories:  782
No of trajectories:  1115
No of Car trajectories:  557
No of Car_1 trajectories:  558


In [79]:
df_acc.sort_index()#.to_csv("accuracy_block4_downsample.csv")


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,10,10,10,20,20,20,30,30,30,40,40,40,50,50,50
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,test,val_mean,val_log_voting,test,val_mean,val_log_voting,test,val_mean,val_log_voting,test,val_mean,val_log_voting,test,val_mean,val_log_voting
AdaBoost,accuracy,mean,53.737,55.2,52.0,54.758,59.426,55.738,52.524,56.034,56.466,51.976,56.364,55.455,52.273,52.381,50.476
AdaBoost,accuracy_baseline,mean,49.339,51.014,48.311,48.918,47.902,43.007,52.273,48.872,47.368,50.526,52.033,45.935,54.762,46.087,50.435
GBM,accuracy,mean,59.942,60.0,63.2,55.514,58.607,59.836,57.571,61.207,60.776,56.917,57.273,53.636,59.091,58.095,48.571
GBM,accuracy_baseline,mean,49.617,50.676,48.311,48.532,50.699,48.951,51.136,47.744,51.504,50.175,50.407,51.626,52.381,45.652,52.174
Log Regression,accuracy,mean,50.581,50.4,52.8,50.982,53.689,52.869,52.681,49.569,51.724,51.383,51.818,54.091,53.788,53.333,53.81
Log Regression,accuracy_baseline,mean,50.104,50.676,46.284,52.628,48.601,45.455,51.136,50.752,48.872,47.895,52.846,50.813,51.667,49.565,46.522
MLP,accuracy,mean,56.495,58.8,58.8,55.891,60.246,61.885,54.732,61.207,58.621,55.731,56.364,62.273,54.04,54.762,55.714
MLP,accuracy_baseline,mean,50.0,50.338,51.689,51.468,53.497,53.846,51.515,51.88,43.233,49.298,53.252,46.748,43.81,50.0,51.739
Random Forest,accuracy,mean,57.293,58.8,56.0,56.269,58.197,58.197,55.994,60.345,56.466,56.126,64.545,59.091,56.566,59.048,58.095
Random Forest,accuracy_baseline,mean,49.826,47.635,44.932,51.7,51.399,46.503,52.525,49.624,52.256,49.298,45.935,54.065,50.714,44.783,46.522
