In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_cleaning import *

In [2]:
df = pd.read_csv("./dataset_revs/Merged_File_05092022.csv")

df = clean_data(dataframe=df)

In [3]:
df.head()

Unnamed: 0,casenum,region,regionname,urbanicity,urbanicityname,makename,mak_modname,body_typ,body_typname,mod_yearname,...,age_im,sex_im,sex_imname,veh_no,vspd_lim,vspd_limname,trav_sp,trav_spname,hour_binned,speeding_status
0,202002121240,4,West,2,Rural,Toyota,Toyota Camry,4,"4-door sedan, hardtop",2018,...,61.0,2.0,Female,1,98,Not Reported,25,025 MPH,Morning,unknown
1,202002121829,3,South,1,Urban,BMW,BMW 3-series,4,"4-door sedan, hardtop",2013,...,23.0,1.0,Male,1,25,25 MPH,45,045 MPH,Night,speeding
2,202002123484,4,West,2,Rural,Subaru,Subaru XV Crosstrek,14,"Compact Utility (Utility Vehicle Categories ""S...",2015,...,20.0,1.0,Male,1,80,80 MPH,65,065 MPH,Afternoon,not speeding
3,202002123576,1,Northeast,2,Rural,Subaru,Subaru Legacy/Outback(prior to 2003 only; see ...,6,Station Wagon (excluding van and truck based),2004,...,23.0,1.0,Male,1,50,50 MPH,998,Not Reported,Evening,unknown
4,202002125146,2,Midwest,2,Rural,Ford,Ford F-Series pickup,34,Light Pickup,2003,...,57.0,2.0,Female,1,55,55 MPH,998,Not Reported,Evening,unknown


In [5]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

import pickle

In [6]:
def gs_to_clean_df(search_results, task="classification", sort_metric=None, sort_ascending=True):
    
    gs_df = pd.DataFrame(search_results)
    
    start_column_names = gs_df.columns
    
    # Remove the columns that give statistics on time or specific cv fold splits
    filtered_column_names = [name for name in start_column_names if "time" not in name]
    filtered_column_names = [name for name in filtered_column_names if "split" not in name]
    
    # Remove columns not in the filtered list above
    columns_to_remove = [name for name in start_column_names if name not in filtered_column_names]
    gs_df.drop(columns=columns_to_remove, inplace=True)
    
    # Columns we want to keep. Remainder of the function just fixes up these columns.
    column_names = gs_df.columns
    modified_column_names = [name.split("__")[-1] for name in column_names]
    modified_column_names = [name.split("param_")[-1] for name in modified_column_names]
    
    # For any negative metrics, take the absolute value and remove negative from the name.
    for col_name in modified_column_names:
        if "_neg" in col_name:
            gs_df.loc[:, col_name] = gs_df.loc[:, col_name].abs()
    modified_column_names = [name.replace("_neg", "") for name in modified_column_names]
    
    # Shorten some names for easier readability
    if task == "regression":
        shortened_names = [("_root_mean_squared_error", "_RMSE"), ("_mean_squared_error", "_MSE"), ("_mean_absolute_error","_MAE")]
        for long_name, short_name in shortened_names:
            modified_column_names = [name.replace(long_name, short_name) for name in modified_column_names]
    
    
    # Perform the final renaming
    renaming_dict = {old_name:new_name for old_name, new_name in zip(column_names, modified_column_names)}
    gs_df.rename(columns=renaming_dict, inplace=True)

    if sort_metric is None:

        if task == "regression":
            gs_df.sort_values(by="mean_test_RMSE", inplace=True)
        else:
            gs_df.sort_values(by="mean_test_accuracy", ascending=False, inplace=True)

    else:
        gs_df.sort_values(by=sort_metric, ascending=sort_ascending, inplace=True)

    return(gs_df)

In [7]:
def create_column_transformer(numeric_features, nominal_features, ordinal_features=None, ohe_bin_map=None, 
                              remainder='passthrough', sparse_threshold=0.3, n_jobs=5, encoder_handle_unknown="infrequent_if_exist"):
    
    binned_nominal_features = [feature for sublist in ohe_bin_map.values() for feature in sublist]
    non_binned_nominal_features = [feature for feature in nominal_features if feature not in binned_nominal_features]
    
    # Create the list of one-hot encoders that have binning requirements. Format each in a tuple that specifies
    # (name, transformer, columns), which is the format expected by ColumnTransformer
    binned_ohes = [(f"max_{key}_ohe", OneHotEncoder(max_categories=key, 
                                                    handle_unknown=encoder_handle_unknown), value) for key, value in ohe_bin_map.items()]
    
    # Regular one-hot encoder for all the other nominal features
    regular_ohe = [("one_hot_encoder", OneHotEncoder(handle_unknown=encoder_handle_unknown), non_binned_nominal_features)]
    
    # Set up standard scaler for the numeric features
    std_scaler = [("standard_scaler", StandardScaler(), numeric_features)]
    
    # Concatenate the lists together
    all_transformers = binned_ohes + regular_ohe + std_scaler
    
    # Instantiate the ColumnTransformer
    preprocess = ColumnTransformer(transformers=all_transformers, 
                                   remainder=remainder, 
                                   sparse_threshold=sparse_threshold, 
                                   n_jobs=n_jobs)
    
    return preprocess

In [8]:
features = ['vtrafconname', 'vsurcondname', 'vtrafwayname', "age_im", 'regionname', 'urbanicityname', 
            'sex_imname', 'speeding_status', 'hour_binned', 'alchl_imname', 'weathername',
            'lgtcon_imname','maxsev_imname']

model_df = df.loc[:, features].copy(deep=True)
model_df.head()

Unnamed: 0,vtrafconname,vsurcondname,vtrafwayname,age_im,regionname,urbanicityname,sex_imname,speeding_status,hour_binned,alchl_imname,weathername,lgtcon_imname,maxsev_imname
0,Traffic control signal(on colors) not known wh...,Snow,"Two-Way, Not Divided",61.0,West,Rural,Female,unknown,Morning,No Alcohol Involved,Cloudy,Daylight,No Apparent Injury
1,No Controls,Dry,"Two-Way, Not Divided",23.0,South,Urban,Male,speeding,Night,No Alcohol Involved,Clear,Dark - Not Lighted,Possible Injury
2,No Controls,Snow,"Two-Way, Divided, Positive Median Barrier",20.0,West,Rural,Male,not speeding,Afternoon,No Alcohol Involved,Snow,Daylight,No Apparent Injury
3,Warning Sign,Snow,Not Reported,23.0,Northeast,Rural,Male,unknown,Evening,No Alcohol Involved,Snow,Dark - Not Lighted,No Apparent Injury
4,No Controls,Dry,Not Reported,57.0,Midwest,Rural,Female,unknown,Evening,No Alcohol Involved,Clear,Dark - Not Lighted,Possible Injury


In [9]:
ohe_map = {6:['vtrafconname'], 
           7:['vsurcondname'], 
           8:['vtrafwayname', 'vsurcondname'], 
           10:['weathername']}

preprocessing = create_column_transformer(numeric_features=["age_im"], 
                                          nominal_features=['regionname', 'urbanicityname', 'sex_imname', 'speeding_status', 
                                                            'hour_binned', 'alchl_imname', 'lgtcon_imname'], 
                                          ohe_bin_map=ohe_map)

full_pipe = Pipeline(steps=[("preprocess", preprocessing), 
                            ("svm", SVC())])

In [10]:
full_pipe

In [11]:
X = model_df.drop(columns="maxsev_imname")
y = model_df["maxsev_imname"].to_numpy()

param_grid = {'svm__kernel':['rbf', 'poly', 'sigmoid', 'linear'], 
              'svm__C':[0.25, 0.5, 1.0, 1.5, 2], 
              'svm__gamma':['scale', 'auto']}

#gs = GridSearchCV(estimator=full_pipe,
#                  param_grid=param_grid,
#                  scoring=["accuracy"],
#                  refit="accuracy",
#                  n_jobs=-1, 
#                  cv=3,
#                  verbose=1, 
#                  error_score='raise',
#                  return_train_score=True)

#gs.fit(X,y)

#with open("./models/gs_svm_052522.pkl", 'wb') as file:
#    pickle.dump(gs, file)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


In [16]:
search_results = gs.cv_results_

# Display the gridsearch results
gs_df = gs_to_clean_df(search_results, sort_metric="mean_test_accuracy", sort_ascending=False)
gs_df.head()

Unnamed: 0,C,gamma,kernel,params,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,mean_train_accuracy,std_train_accuracy
16,1.0,scale,rbf,"{'svm__C': 1.0, 'svm__gamma': 'scale', 'svm__k...",0.488855,0.011932,1,0.525142,0.001539
9,0.5,scale,poly,"{'svm__C': 0.5, 'svm__gamma': 'scale', 'svm__k...",0.488689,0.012104,2,0.522809,0.002318
1,0.25,scale,poly,"{'svm__C': 0.25, 'svm__gamma': 'scale', 'svm__...",0.487694,0.013645,3,0.512763,0.002978
24,1.5,scale,rbf,"{'svm__C': 1.5, 'svm__gamma': 'scale', 'svm__k...",0.487652,0.010618,4,0.535831,0.002012
0,0.25,scale,rbf,"{'svm__C': 0.25, 'svm__gamma': 'scale', 'svm__...",0.487196,0.01306,5,0.505205,0.004199


### Here is an example of loading the pickle file and using the best model

In [19]:
PATH = "./models/gs_svm_052522.pkl"

with open(PATH, 'rb') as file:
    gs_results = pickle.load(file)

In [None]:
gs_results.best_estimator_.predict(X)