In [None]:
!pip install pytorch_forecasting
!pip install pytorch_lightning

Collecting pytorch_forecasting
  Downloading pytorch_forecasting-1.2.0-py3-none-any.whl.metadata (13 kB)
Collecting lightning<3.0.0,>=2.0.0 (from pytorch_forecasting)
  Downloading lightning-2.5.0.post0-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning<3.0.0,>=2.0.0->pytorch_forecasting)
  Downloading lightning_utilities-0.12.0-py3-none-any.whl.metadata (5.6 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning<3.0.0,>=2.0.0->pytorch_forecasting)
  Downloading torchmetrics-1.6.1-py3-none-any.whl.metadata (21 kB)
Collecting pytorch-lightning (from lightning<3.0.0,>=2.0.0->pytorch_forecasting)
  Downloading pytorch_lightning-2.5.0.post0-py3-none-any.whl.metadata (21 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch!=2.0.1,<3.0.0,>=2.0.0->pytorch_forecasting)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-ma

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, f1_score, accuracy_score
from sklearn.metrics import make_scorer, f1_score, accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV
from pytorch_forecasting import TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_lightning import Trainer
import torch
import torchmetrics
from pytorch_lightning.core.module import LightningModule
import matplotlib.pyplot as plt
import h5py

In [None]:
class TFTLightningModule(LightningModule):
    def __init__(self, tft_model):
        super().__init__()
        self.model = tft_model

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.model(x)["prediction"]
        # Select the first output from the 7 outputs, to align with the target shape
        y_hat = y_hat[:, 0]
        # Check if y is a tuple and extract the target tensor if necessary
        if isinstance(y, tuple):
            y = y[0]  # Assuming the target is the first element of the tuple
        # y should have shape [64] to match y_hat, assuming y has shape [64, 1] originally
        y = y.squeeze()

        # Reshape y_hat to match y. The output of the TFT is [batch_size, prediction_length]
        # and in this case prediction_length is 7 (set in the TFT model definition).
        # We only need the first prediction, so we take y_hat[:, 0].
        # Then, we ensure that y_hat has shape [batch_size]
        y_hat = y_hat[:, 0].view(y.shape)

        loss = torchmetrics.functional.mean_squared_error(y_hat, y)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.03)

In [None]:
# Load datasets
def load_data():
    df_20_21 = pd.read_csv('/content/drive/MyDrive/CollabData/Player_Prediction/df_20_21.csv')
    df_21_22 = pd.read_csv('/content/drive/MyDrive/CollabData/Player_Prediction/df_21_22.csv')
    df_22_23 = pd.read_csv('/content/drive/MyDrive/CollabData/Player_Prediction/df_22_23.csv')
    df_23_24 = pd.read_csv('/content/drive/MyDrive/CollabData/Player_Prediction/df_23_24.csv')
    df_24_25 = pd.read_csv('/content/drive/MyDrive/CollabData/Player_Prediction/df_24_25.csv')
    return df_20_21,df_21_22,df_22_23, df_23_24, df_24_25

In [1]:
# Preprocess data for TFT
def preprocess_data_tft(df):
    # Add time_idx for temporal ordering
    df = df.reset_index()  # Reset index to ensure uniqueness
    df['time_idx'] = pd.factorize(df['MP'])[0]

    # Fill NaN values in relevant columns
    df.fillna(0, inplace=True)

    # Feature Engineering
    df['G+A'] = df['Gls'] + df['Ast']
    df['G-PK'] = df['Gls'] - df['PK']
    df['G+A-PK'] = df['G+A'] - df['PK']

    df['xG+xAG'] = df['xG'] + df['xAG']
    df['npxG+xAG'] = df['npxG'] + df['xAG']

    # Define a weighted Performance Index
    df['Performance_Index'] = (
        df['G+A-PK'] * 0.35 +    # Emphasis on actual goal contributions
        df['xG+xAG'] * 0.25 +    # Expected goal contributions
        df['PrgC'] * 0.15 +      # Progressive carries
        df['PrgP'] * 0.15 +      # Progressive passes
        df['PrgR'] * 0.1         # Progressive receptions
    )

    # Future Performance Potential based on trends
    df['Future_Potential'] = (
        (df['Min'] / (df['MP'] + 1)) * 0.4 +  # Playing time influence
        df.groupby('Player')['Performance_Index'].transform(lambda x: x.diff().fillna(0)) * 0.6  # Performance trends
    )

    # Verify uniqueness of the index
    if not df.index.is_unique:
        raise ValueError("Data index must be unique.")

    return df

In [2]:
# Create TimeSeriesDataSet
def create_tft_dataset(df):
    df = preprocess_data_tft(df)  # Apply preprocessing

    max_prediction_length = 1  # Predict one season ahead
    max_encoder_length = 3  # Use data from the last three seasons

    training = TimeSeriesDataSet(
        df,
        time_idx="time_idx",
        target="Performance_Index",  # Predict overall performance instead of just G+A
        group_ids=["Player"],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        static_categoricals=["Player"],  # "Player" as categorical ID
        static_reals=["Age"],  # "Age" as numerical static feature
        time_varying_known_reals=["time_idx"],  # Known time index
        time_varying_unknown_reals=[
            "MP", "Min", "Gls", "Ast", "G+A-PK", "xG", "xAG", "xG+xAG",
            "npxG", "npxG+xAG", "PrgC", "PrgP", "PrgR", "Tkl", "Int", "Blocks",
            "Performance_Index", "Future_Potential"
        ],
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
        allow_missing_timesteps=True
    )

    return training

In [3]:
# Train TFT model
def train_tft(training, model=None):
    trainer = Trainer(accelerator="cpu", max_epochs=20)  # Increased epochs for better learning

    if model is None:
        tft = TemporalFusionTransformer.from_dataset(
            training,
            learning_rate=0.02,  # Slightly reduced learning rate for stability
            hidden_size=64,  # Increased for better pattern learning
            attention_head_size=8,  # More attention heads for improved forecasting
            dropout=0.2,  # Increased dropout to prevent overfitting
            hidden_continuous_size=32,  # More capacity for continuous features
            output_size=1,  # Predicting a single value (Performance_Index)
            loss=RMSE(),  # Root Mean Squared Error as the loss function
        )
        model = TFTLightningModule(tft)
    else:
        model.model.train()  # Ensure the existing model is in training mode

    trainer.fit(model, train_dataloaders=training.to_dataloader(train=True, batch_size=32))  # Smaller batch size for stability

    return model

In [None]:
# Evaluate TFT model
def evaluate_tft(model, test_dataloader, df_test):

    raw_predictions = model.model.predict(test_dataloader, mode="raw", return_x=True)
    predictions = raw_predictions[0]["prediction"].cpu().detach().numpy()

    predictions = predictions[:, 0, 0]  # Get the first prediction for each sample

    if len(predictions) != len(df_test['G+A']):
        predictions = np.repeat(predictions, len(df_test['G+A']) // len(predictions) + 1)
        predictions = predictions[:len(df_test['G+A'])]  # Truncate if necessary
    return predictions

In [None]:
def display_metrics(predictions, actual):
    mse = mean_squared_error(actual, predictions)
    r2 = r2_score(actual, predictions)
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"R-squared (R2): {r2:.4f}")
    best_threshold = 0.5

    predicted_labels = (predictions > best_threshold).astype(int)
    actual_labels = (actual > best_threshold).astype(int)

    f1 = f1_score(actual_labels, predicted_labels)
    testing_accuracy = accuracy_score(actual_labels, predicted_labels)

    print(f"F1 Score: {f1:.4f}")
    # print(f"Training Accuracy: {training_accuracy:.4f}")
    print(f"Testing Accuracy: {testing_accuracy:.4f}")

In [None]:
# Main workflow
def main():
    df_20_21,df_21_22,df_22_23, df_23_24, df_24_25 = load_data()

    df_train = pd.concat([df_20_21,df_21_22,df_22_23, df_23_24])
    df_test = df_24_25

    training = create_tft_dataset(df_train)
    testing = create_tft_dataset(df_test)

    model = train_tft(training)

    test_dataloader = testing.to_dataloader(train=False, batch_size=64)
    # Pass df_test to evaluate_tft
    predictions = evaluate_tft(model, test_dataloader, df_test)

    actual = df_test['G+A'].values

    display_metrics(predictions, actual)

    print(predictions)

In [None]:
# Execute the workflow
if __name__ == "__main__":
    main()

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/lightning/pytorch/utilities/parsing.py:209: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
/usr/local/lib/python3.11/dist-packages/lightning/pytorch/utilities/parsing.py:209: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
  super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs)
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                      | Params | Mode 
--------------------------------------------

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


Mean Squared Error (MSE): 1.8252
R-squared (R2): -0.4600
F1 Score: 0.3353
Testing Accuracy: 0.2014
[1.7072477 1.7072477 1.7072477 ... 1.7035608 1.7035608 1.7035608]
