In [99]:
!pip install pytorch_forecasting
!pip install pytorch_lightning
!pip install optuna



In [100]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, f1_score, accuracy_score
from sklearn.metrics import make_scorer, f1_score, accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV
from pytorch_forecasting import TemporalFusionTransformer, TimeSeriesDataSet, NBeats
from pytorch_forecasting.data import GroupNormalizer
from pytorch_lightning import Trainer
import torch
import torchmetrics
from pytorch_lightning.core.module import LightningModule
import matplotlib.pyplot as plt
import h5py
import optuna

In [101]:
# Load datasets
def load_data():
    df_20_21 = pd.read_csv('/content/drive/MyDrive/CollabData/Player_Prediction/df_20_21.csv')
    df_21_22 = pd.read_csv('/content/drive/MyDrive/CollabData/Player_Prediction/df_21_22.csv')
    df_22_23 = pd.read_csv('/content/drive/MyDrive/CollabData/Player_Prediction/df_22_23.csv')
    df_23_24 = pd.read_csv('/content/drive/MyDrive/CollabData/Player_Prediction/df_23_24.csv')
    df_24_25 = pd.read_csv('/content/drive/MyDrive/CollabData/Player_Prediction/df_24_25.csv')
    return df_20_21,df_21_22,df_22_23, df_23_24, df_24_25

In [102]:
# Preprocess data for TFT
def preprocess_data_tft(df):
    # Add time_idx for temporal ordering
    df = df.reset_index()  # Reset index to ensure uniqueness
    df['time_idx'] = pd.factorize(df['MP'])[0]

    # Feature Engineering
    # Fill NaN values in 'Gls' and 'Ast' with 0 before calculating 'G+A'
    df['Gls'] = df['Gls'].fillna(0)
    df['Ast'] = df['Ast'].fillna(0)
    df['G+A'] = df['Gls'] + df['Ast']

    df['xG+xAG'] = df['xG'] + df['xAG']

    df['Performance_Index'] = (
        df['G+A'] * 0.4 +
        df['xG+xAG'] * 0.3 +
        (df['PrgC'] + df['PrgP'] + df['PrgR']) * 0.2 +
        (df['Tkl'] + df['Int'] + df['Blocks']) * 0.1
    )

    df['Future_Potential'] = (
        (1 / (df['Age'] + 1)) * df['MP'] +
        df.groupby('Player')['G+A'].transform(lambda x: x.diff().fillna(0))
    )

    # Verify uniqueness of the index
    if not df.index.is_unique:
        raise ValueError("Data index must be unique.")

    return df

In [103]:
# Create TimeSeriesDataSet
def create_tft_dataset(df):
    df = preprocess_data_tft(df)

    max_prediction_length = 1  # predict one season ahead
    max_encoder_length = 3  # use data from the last three seasons

    # Calculate the minimum required length
    min_data_length = max_encoder_length + max_prediction_length

    # Filter out players with insufficient data
    df = df.groupby('Player').filter(lambda x: len(x) >= min_data_length)

    # Reset index after filtering
    df = df.reset_index(drop=True)
    df['time_idx'] = pd.factorize(df['MP'])[0]

    # Ensure 'Player' column is a string
    df['Player'] = df['Player'].astype(str)

    training = TimeSeriesDataSet(
        df,
        time_idx="time_idx",
        target="G+A",
        group_ids=["Player"],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        static_categoricals=["Player"],  # Only "Player" should be categorical
        static_reals=["Age"],  # "Age" should be treated as numerical
        time_varying_known_reals=["time_idx"],
        time_varying_unknown_reals=["Gls", "Ast", "xG", "xAG", "PrgC", "PrgP", "PrgR", "Tkl", "Int", "Blocks", "Performance_Index", "Future_Potential"],
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
        allow_missing_timesteps=True
    )

    return training

In [104]:
class NBeatsLightningModule(LightningModule):
    def __init__(self, nbeats_model):
        super().__init__()
        self.model = nbeats_model

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.model(x)  # N-BEATS output is directly the prediction
        if isinstance(y, tuple):
            y = y[0]
        y = y.squeeze()
        # Ensure y_hat and y have the same shape
        y_hat = y_hat.view(y.shape)
        loss = torchmetrics.functional.mean_squared_error(y_hat, y)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.model(x)
        if isinstance(y, tuple):
            y = y[0]
        y = y.squeeze()
        y_hat = y_hat.view(y.shape)
        loss = torchmetrics.functional.mean_squared_error(y_hat, y)
        self.log("val_loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.03)

In [105]:
# Train N-BEATS model
def train_nbeats(training, model=None):
    trainer = Trainer(accelerator="cpu", max_epochs=10)

    if model is None:
        nbeats = NBeats.from_dataset(
            training,
            learning_rate=0.03,
            # ... other N-BEATS hyperparameters ...
            loss=torchmetrics.MeanSquaredError(),
        )
        model = NBeatsLightningModule(nbeats)
    else:
        model.model.train()

    trainer.fit(model, train_dataloaders=training.to_dataloader(train=True, batch_size=64))
    return model

In [106]:
# Evaluate N-BEATS model
def evaluate_nbeats(model, test_dataloader, df_test):
    raw_predictions = model.model.predict(test_dataloader, mode="prediction", return_x=True)
    predictions = raw_predictions.cpu().detach().numpy()  # N-BEATS output is directly the prediction

    # Reshape if necessary
    predictions = predictions.reshape(-1)

    if len(predictions) != len(df_test['G+A']):
        predictions = np.repeat(predictions, len(df_test['G+A']) // len(predictions) + 1)
        predictions = predictions[:len(df_test['G+A'])]
    return predictions

In [107]:
def display_metrics(predictions, actual):
    mse = mean_squared_error(actual, predictions)
    r2 = r2_score(actual, predictions)
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"R-squared (R2): {r2:.4f}")
    best_threshold = 0.5

    predicted_labels = (predictions > best_threshold).astype(int)
    actual_labels = (actual > best_threshold).astype(int)

    f1 = f1_score(actual_labels, predicted_labels)
    testing_accuracy = accuracy_score(actual_labels, predicted_labels)

    print(f"F1 Score: {f1:.4f}")
    # print(f"Training Accuracy: {training_accuracy:.4f}")
    print(f"Testing Accuracy: {testing_accuracy:.4f}")

In [108]:
# Define a function for hyperparameter optimization
def objective(trial):
    # Define hyperparameter search space for N-BEATS
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
        # ... other N-BEATS hyperparameters ...
    }

    # Create and train the model with the current hyperparameters
    nbeats = NBeats.from_dataset(
        training_initial,
        **params,
        loss=torchmetrics.MeanSquaredError(),
    )
    model = NBeatsLightningModule(nbeats)
    trainer = Trainer(accelerator="cpu", max_epochs=10)
    trainer.fit(
        model,
        train_dataloaders=training_initial.to_dataloader(train=True, batch_size=64),
        val_dataloaders=validation_data.to_dataloader(train=False, batch_size=64)
    )

    # Get validation loss
    validation_loss = trainer.callback_metrics["val_loss"].item()

    return validation_loss

In [109]:
# Main workflow
def main():
    df_20_21, df_21_22, df_22_23, df_23_24, df_24_25 = load_data()

    df_train = pd.concat([df_20_21, df_21_22, df_22_23, df_23_24])
    df_test = df_24_25

    # Identify players not in training data but in test data
    players_in_test_not_in_train = set(df_test['Player'].unique()) - set(df_train['Player'].unique())

    # Move data for these players from test to training
    df_train = pd.concat([df_train, df_test[df_test['Player'].isin(players_in_test_not_in_train)]])
    df_test = df_test[~df_test['Player'].isin(players_in_test_not_in_train)]  # Remove from test set

    global training_initial  # Make training_initial accessible to objective function
    training_initial = create_tft_dataset(df_train)

    # Create validation data using subset instead of split method
    global validation_data  # Make validation_data accessible to objective function
    split_index = int(len(training_initial.data["time"]) * 0.8)  # Use the length of the 'time' data

    # Split the dataset using the underlying PyTorch tensors
    training_initial_data = training_initial.data
    validation_data_data = {}
    for key in training_initial_data:
        # Check if the value is a PyTorch tensor before cloning
        if isinstance(training_initial_data[key], torch.Tensor):
            validation_data_data[key] = training_initial_data[key][split_index:].clone()
            training_initial_data[key] = training_initial_data[key][:split_index].clone()
        # If it's a list, create a copy instead of cloning
        elif isinstance(training_initial_data[key], list):
            validation_data_data[key] = training_initial_data[key][split_index:]
            training_initial_data[key] = training_initial_data[key][:split_index]
        else:
            # Handle other data types if necessary
            validation_data_data[key] = training_initial_data[key]

    # Update the data attributes of training_initial and validation_data
    training_initial.data = training_initial_data
    validation_data = TimeSeriesDataSet.from_dataset(training_initial, data=validation_data_data)

    # Hyperparameter optimization using Optuna
    study = optuna.create_study(direction="minimize")  # Minimize validation loss
    study.optimize(objective, n_trials=50)  # Run 50 trials

    # Get the best hyperparameters
    best_params = study.best_params

    # Create and train the final N-BEATS model with the best hyperparameters
    nbeats = NBeats.from_dataset(
        training_initial,
        **best_params,
        loss=torchmetrics.MeanSquaredError(),
    )
    model = NBeatsLightningModule(nbeats)
    trainer = Trainer(accelerator="cpu", max_epochs=20)  # Increase epochs for final training
    trainer.fit(model, train_dataloaders=training_initial.to_dataloader(train=True, batch_size=64))

    # Now loop through remaining players in the test set for evaluation and retraining
    for player_name in df_test['Player'].unique():
        df_player_test = df_test[df_test['Player'] == player_name]

        # Create a TimeSeriesDataSet for the current player
        testing_player = create_tft_dataset(df_player_test)
        test_dataloader_player = testing_player.to_dataloader(train=False, batch_size=64)

        # Make predictions for the current player using evaluate_nbeats
        predictions = evaluate_nbeats(model, test_dataloader_player, df_player_test)
        actual = df_player_test['G+A'].values

        display_metrics(predictions, actual)

        # Retrain the model with the new data (optional - you might remove this)
        # ... (Code for retraining if desired) ...

        print("-" * 20)  # Separator for each player's results

    # Save the final model
    torch.save(model.model.state_dict(), "nbeats_model_final.pth")  # Save N-BEATS model

    print("Final model saved as nbeats_model_final.pth")

In [110]:
# Execute the workflow
if __name__ == "__main__":
    main()



KeyError: 'time_idx'