classical_models.ipynb

By: Archie Gertsman (arkadiy2@illinois.edu) and Lloyd Fernandes (lloydf2@illinois.edu)

Project director: Richard Sowers

r-sowers@illinois.edu | https://publish.illinois.edu/r-sowers/

Copyright 2019 University of Illinois Board of Trustees. All Rights Reserved. Licensed under the MIT license

In [None]:
####Simple_workflow

In [3]:

import sys
sys.path.append('../src/')
sys.path.append('../data/')

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from feature_eng import split_trajectories
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from model_functions import *
import seaborn as sns

In [5]:
# Load the pickled trajectory table and preview its first rows.
# NOTE(review): unpickling executes code embedded in the file; only load pickles from a trusted source.
block4_pickle = '../data/block4_updated.pkl'
df = pd.read_pickle(block4_pickle)

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,lat,lon,speed,lon_acc,lat_acc,type,traveled_d,avg_speed,bearing,nearest_edge_start_node,...,vehicle_density,avg_surr_speed,edge_bearing,acc_edge,acc_per_edge,xtrack_diff,xtrack_diff_sq,acc_edge_sq,acc_per_edge_sq,vehicle_density_by_lane
file_name,id,edge_id,time,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
4_1,1,250699362_250699984,42.0,37.982746,23.732961,11.9046,-0.1145,0.0138,Taxi,182.37,9.740748,1.570795,250699362,...,7,10.464171,-2.83013,0.11322,0.021953,0.0,0.0,0.012819,0.000482,1.296296
4_1,1,250699362_250699984,42.04,37.982746,23.732963,11.8975,-0.1007,0.0147,Taxi,182.37,9.740748,0.168572,250699362,...,7,10.457843,-2.83013,0.10036,0.016867,0.0,0.0,0.010072,0.000284,1.296296
4_1,1,250699362_250699984,42.08,37.982747,23.732964,11.8919,-0.0918,0.0157,Taxi,182.37,9.740748,0.168573,250699362,...,7,10.452857,-2.83013,0.092194,0.013188,0.0,0.0,0.0085,0.000174,1.296296
4_1,1,250699362_250699984,42.12,37.982748,23.732965,11.8871,-0.0869,0.0167,Taxi,182.37,9.740748,1.570796,250699362,...,7,10.448586,-2.83013,0.087837,0.010734,0.0,0.0,0.007715,0.000115,1.296296
4_1,1,250699362_250699984,42.16,37.982748,23.732966,11.8831,-0.0784,0.0176,Taxi,182.37,9.740748,0.32808,250699362,...,7,10.444986,-2.83013,0.080021,0.007273,0.0,0.0,0.006403,5.3e-05,1.296296


In [8]:
# Initial parameters shared by the modelling cells.

# Column name -> aggregation functions; handed to get_xy as `agg_dict`.
agg_dict = {
    'xtrack_diff':     ['mean', 'std'],
    'xtrack_dist':     ['mean', 'std'],
    'avg_surr_speed':  ['mean', 'std'],
    'vehicle_density': ['mean', 'std'],
    'lanes':           ['mean'],
    'len':             ['mean'],
    'speed':           ['mean', 'std'],
    'speed_bool':      ['count', 'sum'],
    'lat_acc':         ['mean', 'std'],
    'lon_acc':         ['mean', 'std'],
}

# Handed to get_xy as `overlap`.
overlap = 0.3
# Rows whose speed is strictly greater than this get speed_bool = 1.
speed_limit = 0
# Second argument of split_train_test.
validation_ratio = 0.2

# The remaining settings are not referenced by the cells visible in this notebook export.
min_movement_limit = 1
k = 5
traj_lens = np.arange(50, 250, 50)  # 50, 100, 150, 200
vehicle_density = 1


In [9]:
# Repeated hold-out evaluation of two classification tasks that share one
# feature pipeline:
#   1. Car vs Taxi  - the labels come from the `type` column as loaded.
#   2. Car vs Car   - only cars are kept and a random 50% of the vehicles is
#                     relabelled 'Car_1' (presumably a chance-level baseline).
# Each run re-splits the data, re-aggregates the trajectories and fits a fresh
# StandardScaler + GradientBoostingClassifier pipeline.

n_runs = 10     # number of split / fit / score repetitions
traj_len = 50   # minimum rows per trajectory group; also passed to get_xy as `traj_len`

# Seed NumPy's global RNG once: DataFrame.sample and scikit-learn estimators
# fall back to it when no random_state is given, so a full re-run repeats the
# same numbers while every run inside the loop still draws differently.
# NOTE(review): split_train_test / get_xy are defined outside this notebook;
# if they use a different random source it has to be seeded separately.
np.random.seed(0)


def _keep_long_trajectories(frame, min_len):
    """Keep only groups with at least `min_len` rows and add the `speed_bool` column.

    Groups are formed on every index level except the last one.
    `speed_bool` is 1 where `speed` is strictly greater than the notebook-level
    `speed_limit`, otherwise 0. A new DataFrame is returned; `frame` is untouched.
    """
    group_levels = frame.index.names[:-1]
    kept = frame.groupby(group_levels).filter(lambda grp: len(grp) >= min_len)
    # assign() builds a new frame instead of writing into the filter() result.
    return kept.assign(speed_bool=(kept['speed'] > speed_limit).astype(int))


def _relabel_half_as_car_1(cars):
    """Return a copy of `cars` in which a random half of the vehicles has type 'Car_1'.

    A vehicle is identified by the (`file_name`, `id`) index levels, so all rows
    of a selected vehicle are relabelled together.
    """
    relabelled = cars.copy()
    # Vehicle key of every row, taken from the index without copying the frame.
    row_vehicle = relabelled.index.droplevel(['edge_id', 'time'])
    vehicles = row_vehicle.unique().to_frame(index=False)
    chosen = vehicles.sample(frac=0.5).set_index(['file_name', 'id']).index
    relabelled.loc[row_vehicle.isin(chosen), 'type'] = 'Car_1'
    return relabelled


def _fit_and_score(labelled):
    """Split `labelled`, aggregate both parts, fit a fresh pipeline and return its voting accuracy.

    Uses the notebook-level settings `validation_ratio`, `overlap`, `traj_len`
    and `agg_dict`. Only the first value returned by voting_accuracy is kept.
    """
    df_train, df_test = split_train_test(labelled, validation_ratio)

    # Training aggregates use outlier_limit=0 and balance='by_edge';
    # the test aggregates use get_xy's defaults for both.
    X_train, y_train = get_xy(df_train, overlap=overlap, traj_len=traj_len,
                              agg_dict=agg_dict, outlier_limit=0, balance='by_edge')
    X_test, y_test = get_xy(df_test, overlap=overlap, traj_len=traj_len, agg_dict=agg_dict)

    model = Pipeline([('scaler', StandardScaler()), ('gbm', GradientBoostingClassifier())])
    model.fit(X_train, y_train)

    accuracy, _ = voting_accuracy(X_test, y_test, model, predict_proba=True)
    return accuracy


# The length filter and speed_bool do not depend on the run, so compute them once.
# NOTE(review): this assumes split_train_test does not modify the frame it is given - confirm.
df_filtered = _keep_long_trajectories(df, traj_len)
df_car_filtered = _keep_long_trajectories(df[df.type == 'Car'], traj_len)

run_records = []
for run in range(n_runs):
    acc_car_taxi = _fit_and_score(df_filtered)
    print("voting accuracy Car vs Taxi: ", acc_car_taxi)

    # A new random half of the cars is relabelled in every run.
    acc_car_car = _fit_and_score(_relabel_half_as_car_1(df_car_filtered))
    print("voting accuracy Car vs Car: ", acc_car_car)
    print("\n")

    run_records.append({'run': run, 'car_vs_taxi': acc_car_taxi, 'car_vs_car': acc_car_car})

# Keep the per-run accuracies instead of only printing them, and summarise them.
df_run_acc = pd.DataFrame(run_records).set_index('run')
df_run_acc.agg(['mean', 'std'])


voting accuracy Car vs Taxi:  0.6508474576271186
voting accuracy Car vs Car:  0.5276967930029155


voting accuracy Car vs Taxi:  0.6576271186440678
voting accuracy Car vs Car:  0.5101449275362319


voting accuracy Car vs Taxi:  0.6348122866894198
voting accuracy Car vs Car:  0.48695652173913045


voting accuracy Car vs Taxi:  0.5966101694915255
voting accuracy Car vs Car:  0.4883720930232558


voting accuracy Car vs Taxi:  0.6326530612244898
voting accuracy Car vs Car:  0.4738372093023256


voting accuracy Car vs Taxi:  0.6203389830508474
voting accuracy Car vs Car:  0.5130434782608696


voting accuracy Car vs Taxi:  0.6790540540540541
voting accuracy Car vs Car:  0.4839650145772595


voting accuracy Car vs Taxi:  0.6452702702702703
voting accuracy Car vs Car:  0.42565597667638483


voting accuracy Car vs Taxi:  0.6394557823129252
voting accuracy Car vs Car:  0.4985507246376812


voting accuracy Car vs Taxi:  0.6224489795918368
voting accuracy Car vs Car:  0.52046783625731




In [22]:
# Write `df_acc`, ordered by its index, to a CSV file and a pickle file in the working directory.
# NOTE(review): this cell does not create `df_acc`; it has to exist in the kernel
# already (presumably a table of accuracy results - confirm where it is built).
df_acc_sorted = df_acc.sort_index()
df_acc_sorted.to_csv('agg_wo_pca.csv')
df_acc_sorted.to_pickle('agg_wo_pca.pkl')