In [1]:
!pip install pytorch_forecasting
!pip install pytorch_lightning

Collecting pytorch_forecasting
  Downloading pytorch_forecasting-1.2.0-py3-none-any.whl.metadata (13 kB)
Collecting lightning<3.0.0,>=2.0.0 (from pytorch_forecasting)
  Downloading lightning-2.5.0.post0-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning<3.0.0,>=2.0.0->pytorch_forecasting)
  Downloading lightning_utilities-0.12.0-py3-none-any.whl.metadata (5.6 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning<3.0.0,>=2.0.0->pytorch_forecasting)
  Downloading torchmetrics-1.6.1-py3-none-any.whl.metadata (21 kB)
Collecting pytorch-lightning (from lightning<3.0.0,>=2.0.0->pytorch_forecasting)
  Downloading pytorch_lightning-2.5.0.post0-py3-none-any.whl.metadata (21 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch!=2.0.1,<3.0.0,>=2.0.0->pytorch_forecasting)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-ma

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, f1_score, accuracy_score
from sklearn.metrics import make_scorer, f1_score, accuracy_score, mean_absolute_error  # Import mean_absolute_error
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
from pytorch_lightning import Trainer
import torch
import torchmetrics
from pytorch_lightning.core.module import LightningModule
import matplotlib.pyplot as plt
import h5py
import torch.nn as nn

In [13]:
class LSTMModel(LightningModule):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Set initial hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size)

        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        y_hat = y_hat.squeeze(1)
        loss = torchmetrics.functional.mean_squared_error(y_hat, y)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.03)

In [5]:
# Load datasets
def load_data():
    df_20_21 = pd.read_csv('/content/drive/MyDrive/CollabData/Player_Prediction/df_20_21.csv')
    df_21_22 = pd.read_csv('/content/drive/MyDrive/CollabData/Player_Prediction/df_21_22.csv')
    df_22_23 = pd.read_csv('/content/drive/MyDrive/CollabData/Player_Prediction/df_22_23.csv')
    df_23_24 = pd.read_csv('/content/drive/MyDrive/CollabData/Player_Prediction/df_23_24.csv')
    df_24_25 = pd.read_csv('/content/drive/MyDrive/CollabData/Player_Prediction/df_24_25.csv')
    return df_20_21,df_21_22,df_22_23, df_23_24, df_24_25

In [6]:
# Preprocess data for LSTM
def preprocess_data_lstm(df):
    # Add time_idx for temporal ordering
    df = df.reset_index()  # Reset index to ensure uniqueness
    df['time_idx'] = pd.factorize(df['MP'])[0]

    # Fill NaN values in relevant columns
    df.fillna(0, inplace=True)

    # Feature Engineering
    df['G+A'] = df['Gls'] + df['Ast']
    df['G-PK'] = df['Gls'] - df['PK']
    df['G+A-PK'] = df['G+A'] - df['PK']

    df['xG+xAG'] = df['xG'] + df['xAG']
    df['npxG+xAG'] = df['npxG'] + df['xAG']

    # Define a weighted Performance Index
    df['Performance_Index'] = (
        df['G+A-PK'] * 0.35 +    # Emphasis on actual goal contributions
        df['xG+xAG'] * 0.25 +    # Expected goal contributions
        df['PrgC'] * 0.15 +      # Progressive carries
        df['PrgP'] * 0.15 +      # Progressive passes
        df['PrgR'] * 0.1         # Progressive receptions
    )

    # Future Performance Potential based on trends (without 'Min')
    df['Future_Potential'] = (
        (df['MP'] / (df['MP'].mean() + 1)) * 0.4 +  # Playing time influence using 'MP'
        df.groupby('Player')['Performance_Index'].transform(lambda x: x.diff().fillna(0)) * 0.6  # Performance trends
    )

    features = [
        "MP", "Gls", "Ast", "G+A-PK", "xG", "xAG", "xG+xAG",
        "npxG", "npxG+xAG", "PrgC", "PrgP", "PrgR", "Tkl", "Int", "Blocks",
        "Performance_Index", "Future_Potential"
    ]

    # Verify uniqueness of the index
    if not df.index.is_unique:
        raise ValueError("Data index must be unique.")

    return df[features], df["Performance_Index"]

In [7]:
# Create dataset for LSTM
def create_lstm_dataset(df):
    X, y = preprocess_data_lstm(df)

    # Convert to PyTorch tensors
    X = torch.tensor(X.values, dtype=torch.float32)
    y = torch.tensor(y.values, dtype=torch.float32)

    # Reshape for LSTM input (batch_size, sequence_length, input_size)
    # Assuming sequence_length = 1 for now (you might need to adjust)
    X = X.unsqueeze(1)

    return X, y

In [8]:
# Train LSTM model
def train_lstm(X_train, y_train):
    input_size = X_train.shape[2]  # Number of features
    hidden_size = 64  # Adjust as needed
    num_layers = 2  # Adjust as needed
    output_size = 1

    model = LSTMModel(input_size, hidden_size, num_layers, output_size)
    trainer = Trainer(accelerator="cpu", max_epochs=50)

    # Create a TensorDataset and DataLoader
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

    trainer.fit(model, train_dataloaders=train_loader)

    return model

In [18]:
def evaluate_lstm(model, X_test, y_test, df_test): # Pass df_test to the function
    model.eval()
    with torch.no_grad():
        predictions = model(X_test)

    raw_predictions = predictions
    predictions = raw_predictions.cpu().numpy() # Remove the index and prediction key to get the raw predictions

    num_test_samples = len(df_test['G+A'])  # Assuming df_test is defined globally

    if len(predictions) < num_test_samples:
        predictions = np.pad(predictions, (0, num_test_samples - len(predictions)), mode='edge')
    elif len(predictions) > num_test_samples:
        predictions = predictions[:num_test_samples]  # Truncate excess values
    return predictions

In [10]:
def display_metrics(predictions, actual):
    # Regression Metrics
    mse = mean_squared_error(actual, predictions)
    rmse = np.sqrt(mse)  # More interpretable than MSE
    mae = mean_absolute_error(actual, predictions)
    r2 = r2_score(actual, predictions)

    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R-squared (R2): {r2:.4f}")

    # Classification Metrics (Only if necessary)
    if np.unique(actual).size > 1:  # Avoid classification issues when only one class exists
        best_threshold = np.mean(actual)  # Dynamically set threshold based on average actual value

        predicted_labels = (predictions > best_threshold).astype(int)
        actual_labels = (actual > best_threshold).astype(int)

        f1 = f1_score(actual_labels, predicted_labels)
        testing_accuracy = accuracy_score(actual_labels, predicted_labels)

        print(f"F1 Score: {f1:.4f}")
        print(f"Testing Accuracy: {testing_accuracy:.4f}")
    else:
        print("Skipping classification metrics due to single-class data.")

In [19]:
def main():
    df_20_21, df_21_22, df_22_23, df_23_24, df_24_25 = load_data()

    df_train = pd.concat([df_20_21, df_21_22, df_22_23, df_23_24])
    df_test = df_24_25

    X_train, y_train = create_lstm_dataset(df_train)
    X_test, y_test = create_lstm_dataset(df_test)

    model = train_lstm(X_train, y_train)
    predictions = evaluate_lstm(model, X_test, y_test, df_test)

    actual = df_test['G+A'].values

    display_metrics(predictions, actual)

    print(predictions)

In [20]:
# Execute the workflow
if __name__ == "__main__":
    main()

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name | Type   | Params | Mode 
----------------------------------------
0 | lstm | LSTM   | 54.5 K | train
1 | fc   | Linear | 65     | train
----------------------------------------
54.6 K    Trainable params
0         Non-trainable params
54.6 K    Total params
0.218     Total estimated model params size (MB)
2         Modules in train mode
0         Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=20` reached.


Mean Squared Error (MSE): 857.6709
Root Mean Squared Error (RMSE): 29.2860
Mean Absolute Error (MAE): 18.0340
R-squared (R2): -685.0740
F1 Score: 0.3354
Testing Accuracy: 0.2018
[[ 0.94194126]
 [16.87988   ]
 [16.311325  ]
 ...
 [19.856031  ]
 [ 3.3228674 ]
 [ 3.3253498 ]]
