In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler

In [2]:
DATA_PATH  = "/home/achyuth/DL_Project/Predicting-Football-Injuries-using-LSTM/Cleaned_Dataset/dataset.csv"
OUTPUT_DIR = "/home/achyuth/DL_Project/Predicting-Football-Injuries-using-LSTM/LSTM_Sequences/"

In [3]:
MAX_SEQ_LEN = 15

FEATURE_COLS = [
    "injury_code", "age_at_injury", "height", "position_code",
    "total_prior_injuries", "days_since_last_injury",
    "same_site_reinjury", "days_since_same_site_injury",
]
CONTINUOUS_COLS = [
    "age_at_injury", "height",
    "days_since_last_injury", "days_since_same_site_injury",
]
TARGET_COL = "days_missed"

df = pd.read_csv(DATA_PATH)

Applying log transform to 'Days Missed' since they have a right skew. Most injuries are minor and heal in a few days. But severe injuries take a long time.

In [4]:
df[TARGET_COL] = np.log1p(df[TARGET_COL])

Scaling the data

In [5]:
scaler = MinMaxScaler()
df[CONTINUOUS_COLS] = scaler.fit_transform(df[CONTINUOUS_COLS])
with open(f"{OUTPUT_DIR}scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

Creating sequences for the LSTM. Grouping by player_id

In [6]:
sequences, targets = [], []
for _, group in df.groupby("player_id"):
    features = group[FEATURE_COLS].values
    labels   = group[TARGET_COL].values
    for t in range(len(group)):
        seq = features[: t + 1]
        pad_len = MAX_SEQ_LEN - len(seq)
        if pad_len > 0:
            seq = np.vstack([np.zeros((pad_len, len(FEATURE_COLS))), seq])
        else:
            seq = seq[-MAX_SEQ_LEN:]
        sequences.append(seq)
        targets.append(labels[t])

X = np.array(sequences, dtype=np.float32)
y = np.array(targets,   dtype=np.float32)

Creating training, validation and test datasets

In [7]:
n         = len(X)
train_end = int(n * 0.80)
val_end   = int(n * 0.90)

X_train, y_train = X[:train_end],        y[:train_end]
X_val,   y_val   = X[train_end:val_end], y[train_end:val_end]
X_test,  y_test  = X[val_end:],          y[val_end:]

np.save(f"{OUTPUT_DIR}X_train.npy", X_train)
np.save(f"{OUTPUT_DIR}y_train.npy", y_train)
np.save(f"{OUTPUT_DIR}X_val.npy",   X_val)
np.save(f"{OUTPUT_DIR}y_val.npy",   y_val)
np.save(f"{OUTPUT_DIR}X_test.npy",  X_test)
np.save(f"{OUTPUT_DIR}y_test.npy",  y_test)

print(f"X: {X.shape}, y: {y.shape}")
print(f"Train: {len(X_train):,} | Val: {len(X_val):,} | Test: {len(X_test):,}")

X: (96968, 15, 8), y: (96968,)
Train: 77,574 | Val: 9,697 | Test: 9,697
