In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import pandas as pd
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)

import numpy as np


from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV





In [2]:
data = pd.read_csv('final_wec_data.csv')

  data = pd.read_csv('final_wec_data.csv')


In [3]:
data.columns

Index(['Unnamed: 0', 'car_number', 'driver_number', 'lap_number', 'lap_time',
       'lap_improvement', 'crossing_finish_line_in_pit', 's1',
       's1_improvement', 's2', 's2_improvement', 's3', 's3_improvement',
       'lap_avg_kph', 'elapsed', 'hour', 's1_large', 's2_large', 's3_large',
       'lap_max_kph', 'driver_name', 'pit_time', 'class', 'group', 'team',
       'manufacturer', 'season', 'year', 'circuit', 'round', 'vehicle',
       'team_no', 'lap_time_ms', 'lap_time_s', 'engine', 'driver_stint_no',
       'driver_stint', 'team_stint_no', 'team_stint', 'elapsed_ms', 'position',
       'class_position', 'interval_ms', 'interval', 'gap', 'class_interval',
       'class_gap', 'elapsed_s', 'rating', 'rolling_5l_avg'],
      dtype='object')

In [4]:
# data = data.drop('Unnamed: 0', axis=1)
# #  Dropping the extra index col

# data = data[data['year'] >= 2017].reset_index(drop=True)
# # # Dropping data from prior to 2017 as driver_ratings data was not available

# data = data.drop(['lap_number', 'car_number', 'lap_number', 'driver_number', 'lap_time', 'elapsed', 'hour', 's1_large', 's2_large', 's3_large', 'driver_name', 'pit_time', 'group', 'team', 'manufacturer', 'season', 'vehicle', 'team_no','lap_time_ms', 'engine', 'driver_stint',
#            'team_stint', 'team_stint_no', 'interval_ms', 'interval', 'elapsed_ms', 'position', 'gap', 'elapsed_s', ], axis=1)
# # Initial columns dropped because not needed/not useable for modeling. Remaining columns will be further assessed 

In [5]:
def df_for_ml(df):

    df = df.drop('Unnamed: 0', axis=1)
    df = data[data['year'] >= 2017].reset_index(drop=True)
    # Dropping data from prior to 2017 as driver_ratings data was not available
    df = df.drop(['lap_number', 'car_number', 'lap_number', 'driver_number', 'lap_time', 'elapsed', 'hour', 's1_large', 's2_large', 's3_large', 'driver_name', 'pit_time', 'group', 'team', 'manufacturer', 'season', 'vehicle', 'team_no','lap_time_ms', 'engine', 'driver_stint',
           'team_stint', 'team_stint_no', 'interval_ms', 'interval', 'elapsed_ms', 'position', 'gap', 'elapsed_s'], axis=1)
    # Initial columns dropped because not needed/not useable for modeling. Remaining columns will be further assessed 
    
    return df




In [6]:
data = df_for_ml(data)

In [7]:

in_lap_check = data[data['crossing_finish_line_in_pit'] == 1]

percent_in_lap_check = (in_lap_check.shape[0] / data.shape[0]) * 100
percent_in_lap_check

# Checking percentage of records that have 'crossing_finish_line_in_pit' == 1
#These values can be dropped based on the low %

6.093703744098721

In [8]:
data.isna().sum()/len(data)*100

#small percentages of nan values can be safely dropped without significant effect. 
#12.68% of laps were run by a driver for which we have no rating info. These laps will be dropped

Unnamed: 0                      0.000000
lap_improvement                 0.000000
crossing_finish_line_in_pit     0.000000
s1                              0.007790
s1_improvement                  0.000000
s2                              0.047522
s2_improvement                  0.000000
s3                              0.039731
s3_improvement                  0.000000
lap_avg_kph                     0.000000
lap_max_kph                     0.249295
class                           0.000000
year                            0.000000
circuit                         0.000000
round                           0.000000
lap_time_s                      0.000000
driver_stint_no                 0.000000
class_position                  0.000000
class_interval                  0.000000
class_gap                       0.000000
rating                         12.800517
rolling_5l_avg                  0.000000
dtype: float64

In [9]:
data['class'].unique()

array(['LMGTE Am', 'LMGTE Pro', 'LMP1', 'LMP2', 'HYPERCAR',
       'INNOVATIVE CAR'], dtype=object)

In [10]:
data['circuit'].unique()

array(['silverstone', 'spa', 'le mans', 'nurburgring', 'mexico', 'cota',
       'fuji', 'shanghai', 'bahrain', 'sebring', 'portimao', 'monza'],
      dtype=object)

In [11]:
def fixing_kept_cols(df):

    df = df[df['class']!='INNOVATIVE CAR']
#     removing the innovative class to eliminate outlier data
    
    df = df[df['crossing_finish_line_in_pit']==0]
#     Filtering out rows where the lap ended in the pit lane (commonly known as an in-lap). 
# In theory this data could be useful for the model, under normal circumstances an in-lap would be run on old tires. A driver who can run a faster in-lap than their rivals
# may be able to gain positions or time on a competitor and thus suggested a more skilled driver. However a car that has had an issue on track will have an abnormally slow in-lap as it limps back to the pitlane.
# The in-lap data complicates the process of trimming outliers from the lap time data. Since only ~6% of records were in-laps we will simply drop these records
    
    df = df.dropna().reset_index(drop=True)
#    Dropping any rows with missing values in remaining data

    return df


In [12]:
data = fixing_kept_cols(data)

In [13]:
data.head()

Unnamed: 0.1,Unnamed: 0,lap_improvement,crossing_finish_line_in_pit,s1,s1_improvement,s2,s2_improvement,s3,s3_improvement,lap_avg_kph,lap_max_kph,class,year,circuit,round,lap_time_s,driver_stint_no,class_position,class_interval,class_gap,rating,rolling_5l_avg
0,246956,0,0,41.322,0,51.911,0,33.424,0,167.7,242.7,LMGTE Am,2017,silverstone,1,126.657,1,1.0,0.0,0.0,S,126.657
1,246957,0,0,41.504,0,52.321,0,33.244,0,167.2,249.4,LMGTE Am,2017,silverstone,1,127.069,1,2.0,0.412,0.412,P,124.8002
2,246958,0,0,42.036,0,52.346,0,33.114,0,166.6,246.0,LMGTE Am,2017,silverstone,1,127.496,1,3.0,0.427,0.839,S,135.6296
3,246959,0,0,42.815,0,52.29,0,32.766,0,166.1,248.3,LMGTE Am,2017,silverstone,1,127.871,1,4.0,0.375,1.214,G,125.1642
4,246960,0,0,42.827,0,54.325,0,34.207,0,161.7,242.7,LMGTE Am,2017,silverstone,1,131.359,1,5.0,3.488,4.702,B,124.4692


In [14]:
data.dtypes

Unnamed: 0                       int64
lap_improvement                  int64
crossing_finish_line_in_pit      int64
s1                             float64
s1_improvement                   int64
s2                             float64
s2_improvement                   int64
s3                             float64
s3_improvement                   int64
lap_avg_kph                    float64
lap_max_kph                    float64
class                           object
year                             int64
circuit                         object
round                            int64
lap_time_s                     float64
driver_stint_no                  int64
class_position                 float64
class_interval                 float64
class_gap                      float64
rating                          object
rolling_5l_avg                 float64
dtype: object

In [15]:
data.describe()

Unnamed: 0.1,Unnamed: 0,lap_improvement,crossing_finish_line_in_pit,s1,s1_improvement,s2,s2_improvement,s3,s3_improvement,lap_avg_kph,lap_max_kph,year,round,lap_time_s,driver_stint_no,class_position,class_interval,class_gap,rolling_5l_avg
count,209612.0,209612.0,209612.0,209612.0,209612.0,209612.0,209612.0,209612.0,209612.0,209612.0,209612.0,209612.0,209612.0,209612.0,209612.0,209612.0,209612.0,209612.0,209612.0
mean,371755.643594,0.010205,0.0,44.076848,0.017881,65.846589,0.018119,69.228965,0.01936,183.019491,272.51013,2019.224849,4.301619,179.271765,3.553437,6.320196,115.739313,394.96587,183.289629
std,74905.388276,0.144144,0.0,86.157872,0.165005,28.969278,0.16639,53.97882,0.172087,32.722641,48.526147,1.715857,2.324363,122.64022,2.987287,4.889323,654.140649,989.016427,106.66395
min,246956.0,0.0,0.0,18.809,0.0,25.517,0.0,18.052,0.0,3.1,19.4,2017.0,1.0,85.603,0.0,1.0,0.0,0.0,86.1162
25%,306017.5,0.0,0.0,33.866,0.0,42.71975,0.0,36.107,0.0,163.3,255.9,2018.0,2.0,116.553,1.0,3.0,2.301,15.298,119.4538
50%,368087.5,0.0,0.0,36.572,0.0,61.4645,0.0,46.778,0.0,180.6,281.2,2019.0,4.0,140.0865,2.0,5.0,16.541,87.41,148.1409
75%,433498.0,0.0,0.0,38.583,0.0,84.93125,0.0,102.71575,0.0,208.1,298.0,2021.0,6.0,232.456,5.0,9.0,69.40925,349.1655,236.2399
max,503679.0,3.0,0.0,7255.675,3.0,634.217,3.0,3157.259,3.0,248.6,350.1,2022.0,9.0,9906.273,19.0,27.0,21529.212,24219.34,9906.273


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209612 entries, 0 to 209611
Data columns (total 22 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Unnamed: 0                   209612 non-null  int64  
 1   lap_improvement              209612 non-null  int64  
 2   crossing_finish_line_in_pit  209612 non-null  int64  
 3   s1                           209612 non-null  float64
 4   s1_improvement               209612 non-null  int64  
 5   s2                           209612 non-null  float64
 6   s2_improvement               209612 non-null  int64  
 7   s3                           209612 non-null  float64
 8   s3_improvement               209612 non-null  int64  
 9   lap_avg_kph                  209612 non-null  float64
 10  lap_max_kph                  209612 non-null  float64
 11  class                        209612 non-null  object 
 12  year                         209612 non-null  int64  
 13 

In [17]:
data['rating'].value_counts()

P    95850
G    44561
S    40878
B    28323
Name: rating, dtype: int64

In [18]:
shortest_lap = data.loc[data['lap_time_s'].idxmin()]
shortest_lap
#Checking the record for the fastest lap time in our data. If the fastest lap was an errant value further trimming and cleaning of the data would be needed.
# 85.603 seconds or 01:25.603 is a real lap time at Fuji, it is assumed based on this that no false data exists at the bottom end of the spectrum


Unnamed: 0                      334782
lap_improvement                      3
crossing_finish_line_in_pit          0
s1                              19.341
s1_improvement                       0
s2                               25.58
s2_improvement                       0
s3                              40.682
s3_improvement                       0
lap_avg_kph                      191.9
lap_max_kph                      257.1
class                             LMP1
year                              2018
circuit                           fuji
round                                4
lap_time_s                      85.603
driver_stint_no                      1
class_position                     1.0
class_interval                     0.0
class_gap                          0.0
rating                               P
rolling_5l_avg                 86.8282
Name: 76352, dtype: object

In [19]:
len(data)

209612

In [20]:

def trim_outliers(df):
    grouped = df.groupby(['class', 'circuit'])
    for key, group in grouped:
        IQR = np.percentile(group['lap_time_s'], 75) - np.percentile(group['lap_time_s'], 25)
        u_limit = np.percentile(group['lap_time_s'], 75) + 1*IQR
#         l_limit = np.percentile(group['lap_time_s'], 25) - 1.5*IQR
#         print(key, 'IQR: ', IQR, 'u_lim: ', u_limit, 'l_lim: ', l_limit)
        group = group[group['lap_time_s'] < u_limit]
        if 'new_df' in locals():
            new_df = pd.concat([new_df, group])
        else:
            new_df = group.reset_index(drop=True)
        
    return new_df

# Common practice is to remove records based on both the upper and lower limits. Given the type of data we're dealing with it makes sense to only trim upper-limit outliers
# This will remove laps where a car crashed, broke down, or was otherwise delayed on track. This will also serve to remove laps run under slowzones, FCY, or Safety car procedures since we do not have data to indicate those situations.
#The removed print statement would show the IQR and limit values for each circuit/class pairing for checking

In [21]:
data = trim_outliers(data)
len(data)

176248

In [22]:
longtest_lap = data.loc[data['lap_time_s'].idxmax()]
longtest_lap
# Checking the longest lap remaining after the outliers are trimmed to find an optimal value for the upper limit without removing too many valid laps.
# 261 seconds or a laptime of 4:35.000 for a GTE at Le Mans is a slow lap but not one that indicates an artificial delay (such as a significant accident).

Unnamed: 0                       412804
lap_improvement                       0
crossing_finish_line_in_pit           0
s1                               42.229
s1_improvement                        0
s2                               99.308
s2_improvement                        0
s3                              119.478
s3_improvement                        0
lap_avg_kph                       187.9
lap_max_kph                       286.9
class                          LMGTE Am
year                               2020
circuit                         le mans
round                                 7
lap_time_s                      261.015
driver_stint_no                       4
class_position                     13.0
class_interval                   35.735
class_gap                      1404.954
rating                                B
rolling_5l_avg                 267.3292
Name: 139979, dtype: object

In [23]:
# obs = data.select_dtypes(object).drop('rating', axis=1)

# class_one_hot = pd.get_dummies(obs['class'], prefix='class')

# circuit_one_hot = pd.get_dummies(obs['circuit'], prefix='circuit')

# ob_feats = pd.concat([class_one_hot, circuit_one_hot], axis=1)

# ob_feats

In [24]:
# cont_nums = data.select_dtypes('number')
# cont_nums = cont_nums.drop(['lap_improvement', 'crossing_finish_line_in_pit', 's1_improvement', 's2_improvement', 's3_improvement', 'year', 'round', 'driver_stint_no', 'class_position'], axis=1)

# disc_nums = data[['lap_improvement', 'crossing_finish_line_in_pit', 's1_improvement', 's2_improvement', 's3_improvement', 'year', 'round', 'driver_stint_no', 'class_position']]
# disc_nums

In [25]:
def separate_df(df):
    cont_nums = df.select_dtypes('number')
    cont_nums = cont_nums.drop(['lap_improvement', 'crossing_finish_line_in_pit', 's1_improvement', 's2_improvement', 's3_improvement', 'year', 'round', 'driver_stint_no', 'class_position'], axis=1)
    
    disc_nums = df[['lap_improvement', 'crossing_finish_line_in_pit', 's1_improvement', 's2_improvement', 's3_improvement', 'year', 'round', 'driver_stint_no', 'class_position']]
    
    obs = data.select_dtypes(object).drop('rating', axis=1)

    target = df['rating']
    
    return cont_nums, disc_nums, obs, target

# Splitting the dataframe for encoding and scaling operations

In [26]:
cont_nums, disc_nums, obs, target = separate_df(data)

In [27]:
obs

Unnamed: 0,class,circuit
0,HYPERCAR,bahrain
1,HYPERCAR,bahrain
2,HYPERCAR,bahrain
3,HYPERCAR,bahrain
4,HYPERCAR,bahrain
...,...,...
191869,LMP2,spa
191872,LMP2,spa
191873,LMP2,spa
191874,LMP2,spa


In [28]:
def encode_obs(obs):
    class_one_hot = pd.get_dummies(obs['class'], prefix='class')
    circuit_one_hot = pd.get_dummies(obs['circuit'], prefix='circuit')
    
    ob_feats = pd.concat([class_one_hot, circuit_one_hot], axis=1)
    
    return ob_feats
# Encoding the object features for model use

In [29]:
ob_feats = encode_obs(obs)

In [30]:
ob_feats.head()

Unnamed: 0,class_HYPERCAR,class_LMGTE Am,class_LMGTE Pro,class_LMP1,class_LMP2,circuit_bahrain,circuit_cota,circuit_fuji,circuit_le mans,circuit_mexico,circuit_monza,circuit_nurburgring,circuit_portimao,circuit_sebring,circuit_shanghai,circuit_silverstone,circuit_spa
0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [31]:
disc_nums.head()

Unnamed: 0,lap_improvement,crossing_finish_line_in_pit,s1_improvement,s2_improvement,s3_improvement,year,round,driver_stint_no,class_position
0,3,0,2,2,0,2021,5,1,1.0
1,2,0,0,3,0,2021,5,1,2.0
2,2,0,2,0,3,2021,5,1,3.0
3,0,0,0,0,0,2021,5,1,1.0
4,0,0,3,0,0,2021,5,1,2.0


In [32]:
# s_scaler = StandardScaler()
# num_s_scal = s_scaler.fit_transform(nums)
# num_s_scal = pd.DataFrame(num_s_scal)
# num_s_scal

In [33]:
# mm_scaler = MinMaxScaler()
# num_mm_scal = mm_scaler.fit_transform(nums)
# num_mm_scal = pd.DataFrame(num_mm_scal)
# num_mm_scal

In [34]:
# abs_scaler = MaxAbsScaler()
# num_abs_scal = abs_scaler.fit_transform(nums)
# num_abs_scal = pd.DataFrame(num_abs_scal)
# num_abs_scal

In [35]:
def make_scaled_sets(df):
    from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
    
    scales = []
    
    s_scaler = StandardScaler()
    num_s_scal = s_scaler.fit_transform(df)
    num_s_scal = pd.DataFrame(num_s_scal)
    num_s_scal.columns = num_s_scal.columns.astype(str)
    num_s_scal.name = 'StandardScaler'
    scales.append(num_s_scal)
    
    mm_scaler = MinMaxScaler()
    num_mm_scal = mm_scaler.fit_transform(df)
    num_mm_scal = pd.DataFrame(num_mm_scal)
    num_mm_scal.columns = num_mm_scal.columns.astype(str)
    num_mm_scal.name = 'MinMaxScaler'
    scales.append(num_mm_scal)
    
    abs_scaler = MaxAbsScaler()
    num_abs_scal = abs_scaler.fit_transform(df)
    num_abs_scal = pd.DataFrame(num_abs_scal)
    num_abs_scal.columns = num_abs_scal.columns.astype(str)
    num_abs_scal.name = 'MaxAbsScaler'
    scales.append(num_abs_scal)
    
    
    return scales




In [36]:
scales = make_scaled_sets(cont_nums)


In [37]:
# num_s_scal.head()

In [38]:
# num_mm_scal.head()

In [39]:
# num_abs_scal.head()

In [40]:

disc_nums = disc_nums.reset_index(drop=True)
ob_feats = ob_feats.reset_index(drop=True)

features = pd.concat([scales[0], disc_nums, ob_feats], axis=1)

X_train, X_test, y_train, y_test = train_test_split(features, target, random_state = 55)
# # Making a split for initial testing purposes.

In [41]:
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Training/fitting an initial run of the RandomForest Classifiier for baseline testing and for parameter tuning

# Initial model run resulted in poor accuracy. Hyperparameter tuning can be used to improve the accuracy result.

Accuracy: 0.8933548182107031


In [42]:
# Using the GridSearchCV function will allow us to test multiple iterations of the model each with a different set of parameters. The function compares the accuracy score of each iteration
# and will finally report the best score and the parameter settings used to acheive this. 


# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [5, 10, 15],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['sqrt', 'log2'] }

# rfc = RandomForestClassifier()
# grid_search = GridSearchCV(rfc, param_grid=param_grid, cv=5)
# grid_search.fit(X_train, y_train)

# # print the best parameters and score
# print("Best parameters: ", grid_search.best_params_)
# print("Best score: ", grid_search.best_score_)



# Best parameters:  {'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
# Best score:  0.785742018337664

# ***After running the grid search function through the model hyperparameters we have acheived a best score of ~78% accuracy. The output best parameters wil be used for this model from here on out. 


In [58]:
def hyper_tuning_rfc(X_train, y_train):
    param_grid_rfc = {'n_estimators': [100, 200, 300], 'max_depth': [5, 10, 15, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'max_features': ['sqrt', 'log2', None] }
    rfc = RandomForestClassifier()
    grid_search = GridSearchCV(rfc, param_grid=param_grid_rfc, cv=5)
    grid_search.fit(X_train, y_train)
    rfc_best_params = grid_search.best_params_
    
    
    #print the best parameters and score
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    return rfc_best_params

# # Running multiple loops of the model to find the optimal set of model parameters to maximize model accuracy.
# # Using the GridSearchCV function will allow us to test multiple iterations of the model each with a different set of parameters. The function compares the accuracy score of each iteration
# # and will finally report the best score and the parameter settings used to acheive this. 





In [59]:
 rfc_best_params = hyper_tuning_rfc(X_train, y_train)

KeyboardInterrupt: 

In [45]:

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# # An initial run of the KNN Classifier on the same training data as the RandomForest model reports much higher accuracy

# # Hyper-paramter tuning will be run for this model as well in an attempt to maximize our models accuracy.

Accuracy: 0.8300349507512143


In [46]:
def hyper_tuning_knn(X_train, y_train):
    param_grid_knn = {'n_neighbors': (1,10, 1), 'leaf_size': (20,40,1), 'p': (1,2), 'weights': ('uniform', 'distance'), 'metric': ('minkowski', 'chebyshev')}
    knn = KNeighborsClassifier()
    grid_search = GridSearchCV(knn, param_grid=param_grid_knn, cv=5)
    grid_search.fit(X_train, y_train)
    knn_best_params = grid_search.best_params_
    
    
    #print the best parameters and score
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    return knn_best_params

# # Running multiple loops of the model to find the optimal set of model parameters to maximize model accuracy

# Best parameters:  {'leaf_size': 20, 'metric': 'minkowski', 'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
# Best score:  0.8498700833258669

# # ***After running the grid search function through the model hyperparameters we have acheived a best score of ~84% accuracy. The output best parameters wil be used for this model from here on out. 


In [47]:
knn_best_params = hyper_tuning_knn(X_train, y_train)

Best parameters:  {'leaf_size': 20, 'metric': 'minkowski', 'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
Best score:  0.8907070247316268


In [49]:
# def test_scales_rfc(scales):
#     for i in range(len(scales)):
#         name = scales[i].name
#         features = pd.concat([scales[i], disc_nums, ob_feats], axis=1)
#         X_train, X_test, y_train, y_test = train_test_split(features, target, random_state = 55)
#         rf = RandomForestClassifier(max_depth=15, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators=200)
#         rf.fit(X_train,y_train)
#         y_pred = rf.predict(X_test)
#         accuracy = accuracy_score(y_test, y_pred)
#         print(name, "Accuracy:", accuracy)
        
        
        
# ###This function will train an iteration of the rfc model (with the best_params determined by the GridSearch) for each of our scaler options (StandardScaler, MinMaxScaler, and MaxABSscaler)
# # And print an accuracy score for each iteration of the model. Allowing us to find best possible accuracy from our scaling options.



In [50]:
# test_scales_rfc(scales)

In [51]:
# def test_scales_knn(scales, knn_best_params):
#     for i in range(len(scales)):
#         name = scales[i].name
#         features = pd.concat([scales[i], disc_nums, ob_feats], axis=1)
#         X_train, X_test, y_train, y_test = train_test_split(features, target, random_state = 55)
#         knn = KNeighborsClassifier(**knn_best_params)
#         knn.fit(X_train,y_train)
#         y_pred = knn.predict(X_test)
#         accuracy = accuracy_score(y_test, y_pred)
#         print(name, "Accuracy:", accuracy)
        
        
# ###This function will train an iteration of the knn model (with the best_params determined by the GridSearch) for each of our scaler options (StandardScaler, MinMaxScaler, and MaxABSscaler)
# # And print an accuracy score for each iteration of the model. Allowing us to find best possible accuracy from our scaling options.

In [52]:
# test_scales_knn(scales, knn_best_params)

In [53]:
models = [RandomForestClassifier(), KNeighborsClassifier()]

In [54]:
# def test_models_scales(scales, models, knn_best_params, rfc_best_params):
#     for model in range(len(models)):
#         print(f"{type(model).__name__}:)
#         for i in range(len(scales)):
#             name = scales[i].name
#             features = pd.concat([scales[i], disc_nums, ob_feats], axis=1)
#             X_train, X_test, y_train, y_test = train_test_split(features, target, random_state = 55)
#             model = model(**best_params_item matching current model)
#             knn.fit(X_train,y_train)
#             y_pred = knn.predict(X_test)
#             accuracy = accuracy_score(y_test, y_pred)
#             print(name, "Accuracy:", accuracy)
            
        
         
        

In [55]:
def test_models_scales(scales, disc_nums, ob_feats, models, knn_best_params, rfc_best_params, target):
    for model in models:
        model_name = type(model).__name__
        for scale in scales:
            scale_name = scale.name
            features = pd.concat([scale, disc_nums, ob_feats], axis=1)
            X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=55)
            if model_name == 'KNeighborsClassifier':
                clf = KNeighborsClassifier(**knn_best_params)
            elif model_name == 'RandomForestClassifier':
                clf = RandomForestClassifier(**rfc_best_params)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            print(f"{model_name} : {scale_name} : {accuracy}")


In [56]:
test_models_scales(scales, disc_nums, ob_feats, models, knn_best_params, rfc_best_params, target)

RandomForestClassifier : StandardScaler : 0.8122418410421679
RandomForestClassifier : MinMaxScaler : 0.814102855067859
RandomForestClassifier : MaxAbsScaler : 0.8126503563161
KNeighborsClassifier : StandardScaler : 0.9021152013072489
KNeighborsClassifier : MinMaxScaler : 0.8873632608596977
KNeighborsClassifier : MaxAbsScaler : 0.8872270891017203


In [None]:
# With the reults from the test_models_scales function we now have the optimal model type, scaler type, and best parameters for the model.
# Now we can fit the model one last time to export for future use.

driver_ratings_model = KNeighborsClassifier(**knn_best_params)
features = pd.concat([scales[0], disc_nums, ob_feats], axis=1)
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=55)
driver_ratings_model.fit(X_train, y_train)

import joblib
   
# Save the model as a pickle in a file
joblib.dump(driver_ratings_model, 'driver_ratings_model.pkl')
  
