## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.utils import resample, shuffle
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, History
from tensorflow.keras.layers import Dense, Input, Reshape, SimpleRNN, concatenate
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.models import Model, Sequential

## Reading the csv file and handling NaNs

In [None]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv(r'f1dataset1.csv', encoding='utf-8')

# Replace missing values with 0.0 in each of the columns below; np.where keeps
# the existing value wherever the entry is not NaN.
for nan_column in ('laptime', 'race_progress', 'tyreageprogress'):
    is_missing = df[nan_column].isna()
    df[nan_column] = np.where(is_missing, 0.0, df[nan_column])

## Evaluation pipeline with preprocessing

In [None]:
# Randomly permute every row of the frame; the fixed random_state makes the
# permutation repeatable from run to run.
shuffled_data = df.sample(
    frac=1.0,
    random_state=42,
)

In [None]:
# Seed both NumPy and TensorFlow: the Keras layers built later draw their
# initial weights from TensorFlow's RNG, which np.random.seed does not affect.
np.random.seed(42)
tf.random.set_seed(42)

# Hold out the first 10 distinct race ids (in shuffled order) as test races.
test_races = shuffled_data['race_id'].unique()[:10]
excluded_races = test_races.tolist()

# Split by race id so that no lap of a test race ends up in the training data.
is_test_row = shuffled_data['race_id'].isin(test_races)
train_data = shuffled_data[~is_test_row]
test_data = shuffled_data[is_test_row]

# Categorical features are one-hot encoded, numerical features are standardised.
cat_features = ['is_leader', 'relativecompound', 'racetrackcat', 'fcystatus', 'remaining_pit_stops', 'pursuer_tyre_change', 'close_ahead']
num_features = ['race_progress', 'tyreageprogress']

# Input features (X) and target (y) for the training set.
# .copy() makes X_train an independent frame, so writing the scaled columns
# back below is not a chained assignment into a slice of train_data
# (which pandas reports as SettingWithCopyWarning).
X_train = train_data[num_features + cat_features].copy()
y_train = train_data['pitstop']

# Standardise the numerical features; the scaler is fitted on training rows only.
scaler = StandardScaler()
X_train[num_features] = scaler.fit_transform(X_train[num_features])

# One-hot encode the categorical features into a dense array. Categories that
# were not seen during fitting are encoded as all zeros (handle_unknown='ignore').
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # Older scikit-learn releases only accept the previous keyword name `sparse`.
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[cat_features])

# Final training matrix: one-hot columns first, then the scaled numerical columns.
X_train_processed = np.concatenate((X_train_encoded, X_train[num_features].values), axis=1)

# Encode the target labels as integers.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Balance the classes by randomly undersampling the training data.
rus = RandomUnderSampler(random_state=42)
X_train_processed_undersampled, y_train_encoded_undersampled = rus.fit_resample(X_train_processed, y_train_encoded)

# Stratified k-fold cross-validation over the undersampled training data.
n_folds = 10
stratified_kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Per-fold evaluation results on the held-out test races.
test_accuracy_scores = []
test_precision_scores = []
test_recall_scores = []
test_f1_scores = []

precision_per_fold = []
recall_per_fold = []
auc_scores = []
roc_curves = []


# The held-out test set is the same for every fold, so it is preprocessed once
# here (with the scaler/encoder fitted on the training data) instead of being
# rebuilt inside the loop. .copy() avoids writing into a slice of test_data.
X_test = test_data[num_features + cat_features].copy()
X_test[num_features] = scaler.transform(X_test[num_features])
X_test_encoded = encoder.transform(X_test[cat_features])
X_test_processed = np.concatenate((X_test_encoded, X_test[num_features].values), axis=1)
X_test_reshaped = X_test_processed.reshape(X_test_processed.shape[0], X_test_processed.shape[1], 1)

# Encode the test labels with the same mapping that was fitted on the training
# labels, so predictions and ground truth share one label space.
y_test = label_encoder.transform(test_data['pitstop'])

# Train one model per fold and evaluate each of them on the same test set.
for train_index, val_index in stratified_kfold.split(X_train_processed_undersampled, y_train_encoded_undersampled):
    # Training and validation subsets for the current fold
    X_train_fold = X_train_processed_undersampled[train_index]
    y_train_fold = y_train_encoded_undersampled[train_index]

    X_val_fold = X_train_processed_undersampled[val_index]
    y_val_fold = y_train_encoded_undersampled[val_index]

    # Add a trailing axis of size 1: each row becomes (n_columns, 1), i.e. the
    # SimpleRNN layer receives the feature columns as a sequence of scalars.
    X_train_reshaped = X_train_fold.reshape(X_train_fold.shape[0], X_train_fold.shape[1], 1)
    X_val_reshaped = X_val_fold.reshape(X_val_fold.shape[0], X_val_fold.shape[1], 1)

    # Stop early (patience of 3 epochs) and restore the best weights seen.
    early_stopping = EarlyStopping(patience=3, restore_best_weights=True)

    # A fresh model for every fold: SimpleRNN -> Dense(relu) -> sigmoid output,
    # with L1 and L2 kernel regularisation on the first two layers.
    rnn_model = Sequential()
    rnn_model.add(SimpleRNN(64, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2]),
                           kernel_regularizer=regularizers.l1_l2(l1=0.09, l2=0.01)))
    rnn_model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=0.09, l2=0.01)))
    rnn_model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    rnn_model.compile(optimizer='nadam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model, validating on the fold's held-out part
    rnn_model.fit(X_train_reshaped, y_train_fold, validation_data=(X_val_reshaped, y_val_fold),
                  batch_size=32,
                  epochs=10,
                  callbacks=[early_stopping])

    # Predict once per fold. The (n, 1) sigmoid output is flattened so the
    # sklearn metrics receive 1-D arrays; hard labels come from the same
    # probabilities (strictly above 0.5 -> 1, which matches np.round here
    # because np.round rounds exactly 0.5 down to 0).
    y_test_prob = rnn_model.predict(X_test_reshaped).ravel()
    y_test_pred = (y_test_prob > 0.5).astype(int)

    # Evaluation metrics for the test data; zero_division=0 is applied to all
    # three ratio metrics so an undefined ratio yields 0 without a warning.
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, average='weighted', zero_division=0)
    test_recall = recall_score(y_test, y_test_pred, average='weighted', zero_division=0)
    test_f1 = f1_score(y_test, y_test_pred, average='weighted', zero_division=0)

    # Store evaluation metrics for the test data
    test_accuracy_scores.append(test_accuracy)
    test_precision_scores.append(test_precision)
    test_recall_scores.append(test_recall)
    test_f1_scores.append(test_f1)

    # Precision-recall curve for this fold
    precision, recall, _ = precision_recall_curve(y_test, y_test_prob)
    precision_per_fold.append(precision)
    recall_per_fold.append(recall)

    # AUC for this fold
    test_auc = roc_auc_score(y_test, y_test_prob)
    auc_scores.append(test_auc)

    # ROC curve for this fold
    fpr, tpr, _ = roc_curve(y_test, y_test_prob)
    roc_curves.append((fpr, tpr))

# Report the mean of every per-fold test metric, one line per metric.
for metric_label, fold_scores in (
    ('Average Test Accuracy:', test_accuracy_scores),
    ('Average Test Precision:', test_precision_scores),
    ('Average Test Recall:', test_recall_scores),
    ('Average Test F1 Score:', test_f1_scores),
):
    print(metric_label, np.mean(fold_scores))

# Plot one Precision-Recall curve per recorded fold.
# NOTE(review): `plt` is not imported in the visible part of this file --
# confirm that `import matplotlib.pyplot as plt` is executed before this cell.
plt.figure(figsize=(8, 6))
# Iterate over the curves that were actually recorded instead of indexing by
# range(n_folds), so a partially completed run still plots without IndexError.
for fold_number, (fold_recall, fold_precision) in enumerate(zip(recall_per_fold, precision_per_fold), start=1):
    plt.plot(fold_recall, fold_precision, lw=2, label=f'Fold {fold_number}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(title='Folds', loc='lower left')
plt.grid(True)
plt.show()

# Report the mean AUC over all folds for the test data.
print('Average Test AUC:', np.mean(auc_scores))

# Plot one ROC curve per recorded fold.
# NOTE(review): `plt` is not imported in the visible part of this file --
# confirm that `import matplotlib.pyplot as plt` is executed before this cell.
plt.figure(figsize=(8, 6))
# Iterate over the curves that were actually recorded instead of indexing by
# range(n_folds), so a partially completed run still plots without IndexError.
for fold_number, (fpr, tpr) in enumerate(roc_curves, start=1):
    plt.plot(fpr, tpr, lw=2, label=f'Fold {fold_number}')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='black', lw=1, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(title='Folds', loc='lower right')
plt.grid(True)
plt.show()