# Test Models

In [6]:
import pandas as pd
import numpy as np
import joblib
import F1_Preprocessing #Internal package to preprocess data
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)

### Model 1 >>> Predictions at the beginning of the race, no data given during the race is used.


#### Load and format data

In [7]:
#Load Data for the model
path = 'C:/Users/gabri/Dropbox/Gaby/Proyectos/My_Portafolio/F1/Data/'

ResultsDF=pd.read_csv(path+"ResultsDF.csv")
ResultsDF=ResultsDF[ResultsDF["season"]>=2017]
ResultsDF=F1_Preprocessing.preprocess_F1results(ResultsDF,OneHotEncoder=True,HandleNulls=True)

# Group features for further analysis:
selected_features_circuit_focused=ResultsDF.loc[:, ResultsDF.columns.str.startswith("circuit.circuitid_")].columns.to_list()
selected_features_constructor_focused=ResultsDF.loc[:, ResultsDF.columns.str.startswith("constructor.constructorid_")].columns.to_list()
selected_features_driver_focused=ResultsDF.loc[:, ResultsDF.columns.str.startswith("driverid_")].columns.to_list()
selected_features_status_focused=ResultsDF.loc[:, ResultsDF.columns.str.startswith("final_status_grouped_")].columns.to_list()
selected_features_general=[col for col in ResultsDF.columns if col not in selected_features_circuit_focused+
                           selected_features_constructor_focused+selected_features_driver_focused+selected_features_status_focused]

# Not all the features in these dataframe are available before hand. 
# Below we remove features we don't know before the race starts (apart from dependent variables).
# Here we add features to each observation to have a view of what happened the races before to the same driver
features_to_add_from_the_past=['driverid',"season-round",'final_position','final_grid','fastestlap.rank','race_time_millis_to_max_ratio'] #'final_points'

features_to_add_from_the_past+=selected_features_status_focused

print(len(features_to_add_from_the_past))

# Add certain features to see how the driver performed in previous races
ResultsDF2=F1_Preprocessing.get_past_rows(DF=ResultsDF,
                                          N=5,
                                          iterator_feature='driverid',
                                          grouper_feature='season-round',
                                          features_added=features_to_add_from_the_past)

ResultsDF2.drop(columns=ResultsDF2.loc[:, ResultsDF2.columns.str.startswith("driverid-")].columns.to_list(),inplace=True)
ResultsDF2[list(set(ResultsDF2.columns)-set(ResultsDF.columns))]=ResultsDF2[list(set(ResultsDF2.columns)-set(ResultsDF.columns))].astype(float)

#Features to remove (not available during the race, text, already encoded features)

selected_features_seasonround_focused=ResultsDF2.loc[:, ResultsDF2.columns.str.startswith("season-round-")].columns.to_list()

features_to_remove=['driver_number', 'final_positionText', 'final_points','final_laps','driver.givenname', 'driver.familyname',
       'driver.dateofbirth', 'driver.nationality', 'constructor.url','constructor.name', 'constructor.nationality',
       'fastestlap.rank', 'fastestlap.lap', 'fastestlap.time.time','fastestlap.averagespeed.speed','racename',
       'circuit.circuitname', 'circuit.location.country', 'date','fastestlap.time.in_milliseconds',
       'season-round-driverid','race_time_millis_max_round_season','race_time_millis_min_round_season',
       'race_time_millis_avg_round_season', 'race_time_millis_to_max_ratio','race_time_millis_to_min_ratio', 'race_time_millis_to_avg_ratio',
       'final_status_grouped','race_time.millis']
features_to_remove=features_to_remove+selected_features_status_focused+selected_features_seasonround_focused

ResultsDF2=ResultsDF2.drop(columns=features_to_remove)


13


#### Test

In [8]:
TEST_SEGMENTS=[ResultsDF2["season-round"].max()]
TEST_DF=ResultsDF2[ResultsDF2["season-round"].isin(TEST_SEGMENTS)]

print("Test DF:",len(TEST_DF))
TARGET='final_position'

X_test = TEST_DF.drop(columns=[TARGET,'circuit.circuitid','constructor.constructorid','driverid'])
y_test = TEST_DF[TARGET].astype(int)-1

# Load model
loaded_model=joblib.load('ModelsSaved/m1_best_xgboost.joblib')

# Test
y_pred = loaded_model.predict(X_test)
print(accuracy_score(y_test+1, y_pred+1))
print(f1_score(y_test+1, y_pred+1, average='macro'))

TEST_DF["Prediction"]=y_pred+1
TEST_DF[['circuit.circuitid','constructor.constructorid','driverid',TARGET,'Prediction']].sort_values(by=TARGET)

Test DF: 19
0.15789473684210525
0.11666666666666665


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TEST_DF["Prediction"]=y_pred+1


Unnamed: 0,circuit.circuitid,constructor.constructorid,driverid,final_position,Prediction
121,marina_bay,mclaren,norris,1.0,1
162,marina_bay,red_bull,max_verstappen,2.0,1
39,marina_bay,mclaren,piastri,3.0,5
121,marina_bay,mercedes,russell,4.0,4
142,marina_bay,ferrari,leclerc,5.0,5
161,marina_bay,mercedes,hamilton,6.0,3
161,marina_bay,ferrari,sainz,7.0,3
123,marina_bay,aston_martin,alonso,8.0,9
106,marina_bay,haas,hulkenberg,9.0,6
160,marina_bay,red_bull,perez,10.0,8
