In [None]:
# imports
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.utils import shuffle
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras import models, layers, regularizers
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
# Load the raw dataset from the CSV file that sits next to the notebook
DATASET_PATH = r'f1dataset2.csv'
df = pd.read_csv(DATASET_PATH, encoding='utf-8')

# Randomly permute all rows; the fixed random_state makes the row order
# identical on every run
shuffled_data = df.sample(frac=1.0, random_state=42)

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from tensorflow.keras import models, layers, regularizers
import numpy as np
import matplotlib.pyplot as plt

# Seed the random number generators used by this notebook.
# Seeding NumPy alone is not enough: TensorFlow keeps its own global seed,
# which governs operations such as Keras weight initialisation.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Hold out complete races for the final test: the first 10 distinct race_id
# values (in shuffled row order) become the test races.
test_races = shuffled_data['race_id'].unique()[:10]  # 10 test races
excluded_races = test_races.tolist()

# Split the rows by race membership so no test race contributes training rows
is_test_race = shuffled_data['race_id'].isin(test_races)
train_data = shuffled_data[~is_test_race]
test_data = shuffled_data[is_test_race]

# Number of folds for cross-validation
n_folds = 10

# Stratified splitter: shuffles before splitting, with a fixed seed so the
# fold assignment is the same on every run
stratified_kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=SEED)

# Per-fold validation metrics (one entry appended per fold)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Per-fold precision/recall arrays for the precision-recall plot
precision_per_fold = []
recall_per_fold = []
# Select the input features (X) and the target (y) for the train and test sets.
# .copy() yields independent frames, so the in-place scaling below does not
# write into a slice of train_data/test_data (pandas SettingWithCopyWarning).
feature_columns = ['race_progress', 'remaining_pit_stops', 'location',
                   'fulfilled_second_compound', 'number_of_available_compounds']
X_train = train_data[feature_columns].copy()
y_train = train_data['relativecompound']

X_test = test_data[feature_columns].copy()
y_test = test_data['relativecompound']

# Separate categorical and numerical features
cat_features = ['remaining_pit_stops', 'location', 'fulfilled_second_compound', 'number_of_available_compounds']
num_features = ['race_progress']

# Standardise the numerical features; the scaler is fitted on the training
# rows only and then re-used for the test rows
scaler = StandardScaler()
X_train[num_features] = scaler.fit_transform(X_train[num_features])
X_test[num_features] = scaler.transform(X_test[num_features])

# One-hot encode the categorical features and pass the remaining (numerical)
# column through unchanged. sparse_threshold=0 forces a dense ndarray, so row
# indexing and the Keras input do not depend on scipy sparse-matrix support.
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)],
    remainder='passthrough',
    sparse_threshold=0,
)

# Fit the preprocessor on the training data and transform both training and test data
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Encode the target variable as integer class ids
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Derive the network dimensions from the data instead of hard-coding them:
# the encoded width depends on how many categories the encoder found.
input_dim = X_train_encoded.shape[1]
n_classes = len(label_encoder.classes_)

# Training settings shared by every fit call in this cell
BATCH_SIZE = 32
EPOCHS = 10


def build_model(input_dim, n_classes=3):
    """Create and compile a fresh, untrained classifier.

    Args:
        input_dim: Number of columns in the encoded feature matrix.
        n_classes: Number of units in the softmax output layer.

    Returns:
        A compiled Sequential model with one L2-regularised hidden layer of
        210 ReLU units followed by a softmax output layer.
    """
    net = models.Sequential()
    net.add(layers.Dense(210, activation='relu', input_shape=(input_dim,),
                         kernel_regularizer=regularizers.l2(0.001)))
    net.add(layers.Dense(n_classes, activation='softmax'))
    net.compile(optimizer=keras.optimizers.RMSprop(learning_rate=0.0009218034891406539),
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
    return net


# NOTE(review): the scaler and encoder above are fitted on all training rows
# before the folds are formed, and the folds are stratified by label but not
# grouped by race_id - confirm whether that is acceptable for the CV estimate.

# Iterate over the folds
for train_index, val_index in stratified_kfold.split(X_train_encoded, y_train_encoded):
    # Get the training and validation subsets for the current fold
    X_train_fold = X_train_encoded[train_index]
    y_train_fold = y_train_encoded[train_index]
    X_val_fold = X_train_encoded[val_index]
    y_val_fold = y_train_encoded[val_index]

    # Start every fold from freshly initialised weights. Re-using one model
    # across folds would carry over weights already trained on rows that are
    # in this fold's validation set.
    fold_model = build_model(input_dim, n_classes)
    history = fold_model.fit(X_train_fold, y_train_fold,
                             validation_data=(X_val_fold, y_val_fold),
                             batch_size=BATCH_SIZE, epochs=EPOCHS)

    # Predict class probabilities on the validation set and take the most
    # probable class as the predicted label
    y_val_pred_prob = fold_model.predict(X_val_fold)
    y_val_pred = np.argmax(y_val_pred_prob, axis=1)

    # Macro-averaged metrics for this fold; zero_division=0 scores a class
    # with no predictions/samples as 0 without emitting a warning
    val_accuracy = accuracy_score(y_val_fold, y_val_pred)
    val_precision = precision_score(y_val_fold, y_val_pred, average='macro', zero_division=0)
    val_recall = recall_score(y_val_fold, y_val_pred, average='macro', zero_division=0)
    val_f1 = f1_score(y_val_fold, y_val_pred, average='macro', zero_division=0)

    # Precision-recall curve for encoded class 1 against all other classes,
    # using the predicted probability of class 1 as the score
    precision, recall, _ = precision_recall_curve(y_val_fold, y_val_pred_prob[:, 1], pos_label=1)
    precision_per_fold.append(precision)
    recall_per_fold.append(recall)

    # Store evaluation metrics for the validation set
    accuracy_scores.append(val_accuracy)
    precision_scores.append(val_precision)
    recall_scores.append(val_recall)
    f1_scores.append(val_f1)

# Train the final model on the complete training set with the same settings;
# this is the model evaluated on the held-out test races.
model = build_model(input_dim, n_classes)
final_history = model.fit(X_train_encoded, y_train_encoded,
                          batch_size=BATCH_SIZE, epochs=EPOCHS)

# Predict class probabilities for the held-out test races and take the most
# probable class as the predicted label
y_test_pred_prob = model.predict(X_test_encoded)
y_test_pred = np.argmax(y_test_pred_prob, axis=1)

# Macro-averaged metrics on the test set; zero_division=0 scores a class with
# no predictions/samples as 0 without emitting a warning
test_accuracy = accuracy_score(y_test_encoded, y_test_pred)
test_precision = precision_score(y_test_encoded, y_test_pred, average='macro', zero_division=0)
test_recall = recall_score(y_test_encoded, y_test_pred, average='macro', zero_division=0)
test_f1 = f1_score(y_test_encoded, y_test_pred, average='macro', zero_division=0)

# Report cross-validation and test results separately. The test scores are
# deliberately NOT appended to the per-fold lists: mixing them in would make
# the printed averages a blend of validation and test performance.
print('Mean Validation Accuracy:', np.mean(accuracy_scores))
print('Mean Validation Precision:', np.mean(precision_scores))
print('Mean Validation Recall:', np.mean(recall_scores))
print('Mean Validation F1 Score:', np.mean(f1_scores))

print('Test Accuracy:', test_accuracy)
print('Test Precision:', test_precision)
print('Test Recall:', test_recall)
print('Test F1 Score:', test_f1)

# Overlay the precision-recall curves of all folds on a single set of axes
fig, ax = plt.subplots()
for fold_number in range(1, n_folds + 1):
    fold_recall = recall_per_fold[fold_number - 1]
    fold_precision = precision_per_fold[fold_number - 1]
    ax.plot(fold_recall, fold_precision, label=f'Fold {fold_number}')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve for 10 Folds')
ax.legend()
plt.show()
