## imports

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from keras.models import Sequential
from keras.layers import Dense
from keras.regularizers import l1, l2
from keras.callbacks import EarlyStopping, History
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve

## Reading the csv file and handling NaNs

In [None]:
# Load the dataset from the CSV file.
df = pd.read_csv(r'f1dataset1.csv', encoding='utf-8')

# Replace missing values with 0.0 in each column that is NaN-filled here.
for nan_column in ('laptime', 'race_progress', 'tyreageprogress'):
    df[nan_column] = np.where(df[nan_column].isna(), 0.0, df[nan_column])

# Randomly permute all rows; the fixed random_state keeps the order repeatable.
shuffled_data = df.sample(frac=1, random_state=42)

## Evaluation pipeline with preprocessing

In [None]:
# Seed NumPy's global RNG (used by the random undersampling in the fold loop)
np.random.seed(42)

# Hold out the first 10 distinct race ids, in shuffled order, as test races
test_races = shuffled_data['race_id'].unique()[:10]
excluded_races = test_races.tolist()

# Rows belonging to a held-out race form the test set; the rest is for training
is_test_row = shuffled_data['race_id'].isin(test_races)
train_data = shuffled_data[~is_test_row]
test_data = shuffled_data[is_test_row]

# Cross-validation setup: shuffled, stratified folds with a fixed seed
n_folds = 10
stratified_kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Per-fold classification metrics measured on the test races
test_accuracy_scores, test_precision_scores = [], []
test_recall_scores, test_f1_scores = [], []

# Per-fold points of the precision-recall curve
precision_per_fold, recall_per_fold = [], []

# Per-fold ROC AUC values and (fpr, tpr) curve points
auc_scores, roc_curves = [], []

# Feature columns fed to the network (identical for every fold, so defined once)
cat_features = ['is_leader', 'relativecompound', 'racetrackcat', 'fcystatus', 'remaining_pit_stops', 'pursuer_tyre_change', 'close_ahead']
num_features = ['race_progress', 'tyreageprogress']
feature_columns = num_features + cat_features


def build_ffnn_model(n_inputs, l1=0.08, l2=0.0005):
    """Build and compile a fresh, untrained feed-forward binary classifier.

    Args:
        n_inputs: Number of input features (columns of the processed matrix).
        l1: L1 factor of the kernel regularizer on each hidden layer.
        l2: L2 factor of the kernel regularizer on each hidden layer.

    Returns:
        A compiled ``tf.keras`` Sequential model with three 64-unit ReLU
        hidden layers (dropout of 0.2 after the first two) and a single
        sigmoid output unit.
    """
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(64, activation='relu', input_shape=(n_inputs,),
                                    kernel_regularizer=tf.keras.regularizers.l1_l2(l1=l1, l2=l2)))
    model.add(tf.keras.layers.Dropout(0.2))  # Dropout layer for regularization
    model.add(tf.keras.layers.Dense(64, activation='relu',
                                    kernel_regularizer=tf.keras.regularizers.l1_l2(l1=l1, l2=l2)))
    model.add(tf.keras.layers.Dropout(0.2))  # Dropout layer for regularization
    model.add(tf.keras.layers.Dense(64, activation='relu',
                                    kernel_regularizer=tf.keras.regularizers.l1_l2(l1=l1, l2=l2)))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))  # Output layer
    model.compile(optimizer='nadam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


# Iterate over the folds
for train_index, val_index in stratified_kfold.split(train_data, train_data['pitstop']):
    # NOTE: patience (10) equals the epoch budget used in fit() below, so this
    # callback can never end training early; raise `epochs` if that is wanted.
    early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

    # Get the training and validation subsets for the current fold
    train_subset = train_data.iloc[train_index]
    val_subset = train_data.iloc[val_index]

    # Separate input features (X) and target variable (y).
    # .copy() gives independent frames so the scaled values can be written
    # back without chained-assignment warnings or touching train_data.
    X_train = train_subset[feature_columns].copy()
    y_train = train_subset['pitstop']

    X_val = val_subset[feature_columns].copy()
    y_val = val_subset['pitstop']

    # Standardize numerical features; statistics come from the training fold only
    scaler = StandardScaler()
    X_train[num_features] = scaler.fit_transform(X_train[num_features])
    X_val[num_features] = scaler.transform(X_val[num_features])

    # One-hot encode categorical features; categories unseen during fit
    # are encoded as all zeros (handle_unknown='ignore')
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    X_train_encoded = encoder.fit_transform(X_train[cat_features])
    X_val_encoded = encoder.transform(X_val[cat_features])

    # Combine preprocessed categorical and numerical features
    X_train_processed = np.concatenate((X_train_encoded, X_train[num_features]), axis=1)
    X_val_processed = np.concatenate((X_val_encoded, X_val[num_features]), axis=1)

    # Encode the target labels into integer values
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_val_encoded = label_encoder.transform(y_val)

    # Undersample class 0 down to the size of class 1 so the training set is balanced
    majority_class_indices = np.where(y_train_encoded == 0)[0]
    minority_class_indices = np.where(y_train_encoded == 1)[0]
    undersampled_majority_indices = np.random.choice(majority_class_indices, size=len(minority_class_indices), replace=False)
    undersampled_indices = np.concatenate((undersampled_majority_indices, minority_class_indices))
    np.random.shuffle(undersampled_indices)

    X_train_processed_undersampled = X_train_processed[undersampled_indices]
    y_train_encoded_undersampled = y_train_encoded[undersampled_indices]

    # Build a fresh model for every fold. Reusing one model across folds would
    # carry weights over, so later folds would be validated on rows the model
    # had already been trained on. The input width is taken from the data
    # because it depends on the categories the encoder saw in this fold.
    ffnn_model = build_ffnn_model(X_train_processed_undersampled.shape[1])

    # Fit the model
    history = ffnn_model.fit(X_train_processed_undersampled, y_train_encoded_undersampled,
                             validation_data=(X_val_processed, y_val_encoded),
                             batch_size=64, epochs=10, callbacks=[early_stopping],
                             verbose=1)

    # Preprocess the test races with this fold's fitted scaler and encoder
    X_test = test_data[feature_columns].copy()
    y_test = test_data['pitstop']
    X_test[num_features] = scaler.transform(X_test[num_features])
    X_test_encoded = encoder.transform(X_test[cat_features])
    X_test_processed = np.concatenate((X_test_encoded, X_test[num_features]), axis=1)

    # Encode test labels the same way as the train/validation labels
    y_test_encoded = label_encoder.transform(y_test)

    # Predict on the test data; ravel() turns the (n, 1) output into a 1-D vector
    y_test_prob = ffnn_model.predict(X_test_processed).ravel()
    y_test_pred = (y_test_prob > 0.5).astype(int)  # Convert probabilities to class labels

    # Calculate evaluation metrics for the test data
    test_accuracy = accuracy_score(y_test_encoded, y_test_pred)
    test_precision = precision_score(y_test_encoded, y_test_pred, zero_division=1)  # Set zero_division to 1
    test_recall = recall_score(y_test_encoded, y_test_pred)
    test_f1 = f1_score(y_test_encoded, y_test_pred)

    # Store evaluation metrics for the test data
    test_accuracy_scores.append(test_accuracy)
    test_precision_scores.append(test_precision)
    test_recall_scores.append(test_recall)
    test_f1_scores.append(test_f1)

    # Calculate precision and recall values for the fold
    precision, recall, _ = precision_recall_curve(y_test_encoded, y_test_prob)
    precision_per_fold.append(precision)
    recall_per_fold.append(recall)

    # Calculate AUC for the test data
    test_auc = roc_auc_score(y_test_encoded, y_test_prob)
    auc_scores.append(test_auc)

    # Calculate ROC curve for the fold
    fpr, tpr, _ = roc_curve(y_test_encoded, y_test_prob)
    roc_curves.append((fpr, tpr))

# Print the mean of every test metric over all folds
for summary_label, fold_scores in (
    ('Average Test Accuracy:', test_accuracy_scores),
    ('Average Test Precision:', test_precision_scores),
    ('Average Test Recall:', test_recall_scores),
    ('Average Test F1 Score:', test_f1_scores),
):
    print(summary_label, np.mean(fold_scores))

# One precision-recall curve per fold, drawn into a single figure
plt.figure(figsize=(8, 6))
for fold_no in range(1, n_folds + 1):
    plt.plot(recall_per_fold[fold_no - 1], precision_per_fold[fold_no - 1],
             lw=2, label=f'Fold {fold_no}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(title='Folds', loc='lower left')
plt.grid(True)
plt.show()

# Mean ROC AUC over all folds
print('Average Test AUC:', np.mean(auc_scores))

# One ROC curve per fold, plus the diagonal of a random classifier
plt.figure(figsize=(8, 6))
for fold_no in range(1, n_folds + 1):
    fpr, tpr = roc_curves[fold_no - 1]
    plt.plot(fpr, tpr, lw=2, label=f'Fold {fold_no}')
plt.plot([0, 1], [0, 1], color='black', lw=1, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(title='Folds', loc='lower right')
plt.grid(True)
plt.show()

In [None]:
import optuna

# Define the objective function for Optuna
def _build_ffnn(n_inputs, l1, l2):
    """Build and compile a fresh FFNN with l1_l2-regularized hidden layers."""
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(64, activation='relu', input_shape=(n_inputs,),
                                    kernel_regularizer=tf.keras.regularizers.l1_l2(l1=l1, l2=l2)))
    model.add(tf.keras.layers.Dropout(0.2))  # Dropout layer for regularization
    model.add(tf.keras.layers.Dense(64, activation='relu',
                                    kernel_regularizer=tf.keras.regularizers.l1_l2(l1=l1, l2=l2)))
    model.add(tf.keras.layers.Dropout(0.2))  # Dropout layer for regularization
    model.add(tf.keras.layers.Dense(64, activation='relu',
                                    kernel_regularizer=tf.keras.regularizers.l1_l2(l1=l1, l2=l2)))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))  # Output layer
    model.compile(optimizer='nadam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


def objective(trial):
    """Optuna objective: mean validation F1 of the FFNN over stratified folds.

    For the regularization strengths ``l1`` and ``l2`` suggested by ``trial``
    (each in [0, 0.1]), a fresh model is trained on every fold of
    ``train_data`` (class 0 undersampled to the size of class 1) and scored
    with the F1 measure on that fold's validation rows.

    The score is computed on the validation folds rather than on
    ``test_data``: selecting hyperparameters by test performance would leak
    the held-out races into model selection and bias the final test metrics.

    Args:
        trial: The Optuna trial used to sample ``l1`` and ``l2``.

    Returns:
        The mean validation F1 score across all folds, as a float.
    """
    # Set the seed value so the undersampling is identical for every trial
    np.random.seed(42)

    # Sample each hyperparameter once per trial; all hidden layers and all
    # folds share these two values.
    l1 = trial.suggest_float("l1", 0, 0.1)
    l2 = trial.suggest_float("l2", 0, 0.1)

    # Feature columns fed to the network
    cat_features = ['is_leader', 'relativecompound', 'racetrackcat', 'fcystatus', 'remaining_pit_stops', 'pursuer_tyre_change', 'close_ahead']
    num_features = ['race_progress', 'tyreageprogress']
    feature_columns = num_features + cat_features

    # Shuffled, stratified cross-validation with a fixed seed
    n_folds = 10
    stratified_kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

    val_f1_scores = []

    # Iterate over the folds
    for train_index, val_index in stratified_kfold.split(train_data, train_data['pitstop']):
        # Release the previous fold's Keras state; a study builds
        # n_trials * n_folds models, which otherwise accumulate in memory.
        tf.keras.backend.clear_session()

        # NOTE: patience (10) equals the epoch budget used in fit() below, so
        # this callback can never end training early.
        early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

        # Get the training and validation subsets for the current fold
        train_subset = train_data.iloc[train_index]
        val_subset = train_data.iloc[val_index]

        # .copy() gives independent frames so the scaled values can be written
        # back without chained-assignment warnings or touching train_data.
        X_train = train_subset[feature_columns].copy()
        y_train = train_subset['pitstop']

        X_val = val_subset[feature_columns].copy()
        y_val = val_subset['pitstop']

        # Standardize numerical features; statistics come from the training fold only
        scaler = StandardScaler()
        X_train[num_features] = scaler.fit_transform(X_train[num_features])
        X_val[num_features] = scaler.transform(X_val[num_features])

        # One-hot encode categorical features
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        X_train_encoded = encoder.fit_transform(X_train[cat_features])
        X_val_encoded = encoder.transform(X_val[cat_features])

        # Combine preprocessed categorical and numerical features
        X_train_processed = np.concatenate((X_train_encoded, X_train[num_features]), axis=1)
        X_val_processed = np.concatenate((X_val_encoded, X_val[num_features]), axis=1)

        # Encode the target labels into integer values
        label_encoder = LabelEncoder()
        y_train_encoded = label_encoder.fit_transform(y_train)
        y_val_encoded = label_encoder.transform(y_val)

        # Undersample class 0 down to the size of class 1
        majority_class_indices = np.where(y_train_encoded == 0)[0]
        minority_class_indices = np.where(y_train_encoded == 1)[0]
        undersampled_majority_indices = np.random.choice(majority_class_indices, size=len(minority_class_indices), replace=False)
        undersampled_indices = np.concatenate((undersampled_majority_indices, minority_class_indices))
        np.random.shuffle(undersampled_indices)

        X_train_processed_undersampled = X_train_processed[undersampled_indices]
        y_train_encoded_undersampled = y_train_encoded[undersampled_indices]

        # Fresh model per fold; the input width follows the encoded data
        # instead of a hard-coded feature count.
        ffnn_model = _build_ffnn(X_train_processed_undersampled.shape[1], l1, l2)

        # Fit the model
        ffnn_model.fit(X_train_processed_undersampled, y_train_encoded_undersampled,
                       validation_data=(X_val_processed, y_val_encoded),
                       batch_size=64, epochs=10, callbacks=[early_stopping],
                       verbose=1)

        # Evaluate on the validation set
        val_loss, _val_accuracy = ffnn_model.evaluate(X_val_processed, y_val_encoded, verbose=0)
        print('Validation Loss:', val_loss)

        # Score the fold by F1 on its validation rows (threshold 0.5)
        y_val_prob = ffnn_model.predict(X_val_processed).ravel()
        y_val_pred = (y_val_prob > 0.5).astype(int)
        val_f1_scores.append(f1_score(y_val_encoded, y_val_pred))

    # Return the average validation F1 score as the objective value
    return float(np.mean(val_f1_scores))

# Run a 50-trial search that maximizes the value returned by `objective`
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Report the best trial's hyperparameters and objective value
best_params, best_value = study.best_params, study.best_value
print('Best Hyperparameters:', best_params)
print('Best Objective Value:', best_value)

# Regularization strengths of the best trial
best_l1, best_l2 = best_params['l1'], best_params['l2']