In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sqlalchemy import create_engine
from torch.utils.data import DataLoader, Dataset

In [10]:
# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
DB_NAME = "nba_data.db"
DB_URI = f"sqlite:///{DB_NAME}"

# DB_NAME is a relative path, so it is resolved against the kernel's current
# working directory. Connecting to a SQLite file that does not exist creates
# a new empty database instead of failing, and the problem then only shows up
# later as "no such table". Fail fast here with the resolved path instead.
if not Path(DB_NAME).is_file():
    raise FileNotFoundError(
        f"SQLite database not found at {Path(DB_NAME).resolve()}; "
        "run the notebook from the directory that contains it or update DB_NAME."
    )

# echo=False keeps SQLAlchemy from logging every emitted SQL statement.
engine = create_engine(DB_URI, echo=False)

In [14]:
# ------------------------------------------------------------
# Load Data & Sort
# ------------------------------------------------------------
df = pd.read_sql("SELECT * FROM player_game_features", engine)

# Parse the dates once, up front. Sorting on the raw column would order
# non-ISO date strings lexicographically rather than chronologically, and the
# sequence builder relies on each player's rows being in game order.
game_dates = pd.to_datetime(df["game_date"])

# Calendar year of each game, used to split folds by year.
# NOTE(review): this is the calendar year, not a season label; confirm that
# is the intended split unit.
df["game_year"] = game_dates.dt.year

# Sort by player, then by the parsed date; the helper column is dropped again
# so the frame keeps exactly the original columns plus `game_year`.
df = (
    df.assign(_game_dt=game_dates)
    .sort_values(by=["player_id", "_game_dt"])
    .drop(columns="_game_dt")
)

# Features and target
features = ["player_id", "pts", "min", "fgm", "fga", "pts_per_min", "fg_pct"]
target = "pts"

# Drop rows missing any model input or the target. The target is only
# appended when it is not already one of the feature columns.
required_columns = features if target in features else features + [target]
df = df.dropna(subset=required_columns)

X = df[features]
y = df[target]

# ------------------------------------------------------------
# Helper Function: Create Sequences
# ------------------------------------------------------------
class PlayerSequenceDataset(Dataset):
    """Dataset of per-player game histories paired with the next game's target.

    For every player and every game index ``i >= 1`` one sample is produced:
    the feature rows of games ``0 .. i-1`` as the input sequence and the
    target value of game ``i`` as the label. All sequences are left-padded
    with zeros to the length of the longest one, so they stack into a single
    dense array of shape ``(num_samples, max_len, num_features)``.

    Parameters
    ----------
    features : pandas.DataFrame
        One row per game. Rows of a given player must already be in
        chronological order; they are used in the order they appear.
    targets : pandas.Series
        Target values, sharing the index labels of ``features``.
    player_column : str or any key accepted by ``DataFrame.groupby``
        Grouping key identifying the player. When this is a column label of
        ``features``, that column is part of every row and therefore also
        part of the feature vectors handed to the model.
    """

    def __init__(self, features, targets, player_column):
        self.X, self.y = self.create_sequences(features, targets, player_column)

    @staticmethod
    def create_sequences(data, target, player_column):
        """Build zero-padded history sequences and their next-game targets.

        Returns
        -------
        tuple of numpy.ndarray
            ``(X, y)`` where ``X`` has shape ``(num_samples, max_len,
            data.shape[1])`` and ``y`` has shape ``(num_samples,)``. When no
            group has at least two rows, ``X`` has shape
            ``(0, 0, data.shape[1])`` and ``y`` is empty.
        """
        sequences, labels = [], []
        for _, group in data.groupby(player_column):
            player_features = group.values  # All games for the player
            # Align targets to this player's rows by index label.
            player_target = target.loc[group.index].values

            # Start at 1: the first game has no history to predict from.
            for i in range(1, len(player_features)):
                sequences.append(player_features[:i])  # All past games
                labels.append(player_target[i])        # Current game's target

        if not sequences:
            # No player has two or more games, so there is nothing to pad;
            # max() over an empty list would raise.
            return np.empty((0, 0, data.shape[1])), np.array(labels)

        max_len = max(len(seq) for seq in sequences)
        # Left-pad so the most recent games sit at the end of each sequence.
        # NOTE(review): padding rows are all-zero and are not masked downstream.
        padded = np.array([
            np.pad(seq, ((max_len - len(seq), 0), (0, 0)), mode='constant')
            for seq in sequences
        ])
        return padded, np.array(labels)

    def __len__(self):
        """Number of (sequence, target) samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a pair of float32 tensors ``(sequence, target)``."""
        return torch.tensor(self.X[idx], dtype=torch.float32), torch.tensor(self.y[idx], dtype=torch.float32)

OperationalError: (sqlite3.OperationalError) no such table: player_game_features
[SQL: SELECT * FROM player_game_features]
(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [None]:
# ------------------------------------------------------------
# LSTM Model
# ------------------------------------------------------------
class LSTMModel(nn.Module):
    """Single-layer LSTM followed by a two-layer fully connected head.

    The final hidden state of the LSTM is passed through a 32-unit linear
    layer with ReLU and then projected to ``output_size`` values.
    """

    def __init__(self, input_size, hidden_size=64, output_size=1):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, 32)
        self.fc2 = nn.Linear(32, output_size)

    def forward(self, x):
        """Map a batch of sequences to a ``(batch, output_size)`` prediction."""
        _outputs, (final_hidden, _final_cell) = self.lstm(x)
        # Hidden state of the last LSTM layer at the final time step.
        last_layer_state = final_hidden[-1]
        activated = torch.relu(self.fc1(last_layer_state))
        return self.fc2(activated)
# ------------------------------------------------------------
# Training & Validation Loop
# ------------------------------------------------------------
def train_model(model, dataloader, criterion, optimizer, device):
    """Run one training epoch and return the mean per-batch loss.

    Parameters
    ----------
    model : torch.nn.Module
        Model to optimise; put into training mode here.
    dataloader : iterable of (X_batch, y_batch)
        Must support ``len()``; each item is a pair of tensors.
    criterion : callable
        Loss function called as ``criterion(outputs, y_batch)``.
    optimizer : torch.optim.Optimizer
        Optimizer stepping the model's parameters.
    device : torch.device
        Device the batches are moved to before the forward pass.

    Returns
    -------
    float
        Sum of the batch losses divided by the number of batches.
    """
    model.train()
    total_loss = 0.0
    for X_batch, y_batch in dataloader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        # Squeeze only the trailing output dimension. A bare squeeze() would
        # also drop the batch dimension for a batch of size 1, and the loss
        # would then broadcast a 0-d prediction against a (1,) target.
        outputs = model(X_batch).squeeze(-1)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, device):
    """Collect model predictions and the matching targets over a dataloader.

    The model is switched to eval mode and run without gradient tracking.

    Parameters
    ----------
    model : torch.nn.Module
        Model to evaluate.
    dataloader : iterable of (X_batch, y_batch)
        Batches of input and target tensors.
    device : torch.device
        Device the batches are moved to before the forward pass.

    Returns
    -------
    tuple of numpy.ndarray
        ``(predictions, targets)``, one entry per sample in iteration order.
    """
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            # Squeeze only the trailing output dimension. A bare squeeze()
            # turns a batch of size 1 into a 0-d tensor, which cannot be
            # iterated by list.extend below.
            outputs = model(X_batch).squeeze(-1)
            predictions.extend(outputs.cpu().numpy())
            targets.extend(y_batch.cpu().numpy())
    return np.array(predictions), np.array(targets)

In [None]:
# ------------------------------------------------------------
# Main Workflow
# ------------------------------------------------------------
# Walk-forward validation: for each year, fit on the preceding
# `training_window` years and validate on that year's games.
SEED = 42
BATCH_SIZE = 32
NUM_EPOCHS = 50
LEARNING_RATE = 0.001

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
training_window = 4
mae_scores, rmse_scores, years_tested = [], [], []
available_years = sorted(df['game_year'].unique())

# Columns that are actually fed to the model. The player identifier is only a
# grouping key; selecting by name avoids relying on its position in `features`.
feature_cols = [col for col in features if col != "player_id"]

for validate_year in available_years:
    start_train_year = validate_year - training_window
    if start_train_year < available_years[0]:
        # Not enough earlier years to fill the training window.
        continue

    train_mask = (df['game_year'] >= start_train_year) & (df['game_year'] < validate_year)
    val_mask = (df['game_year'] == validate_year)

    train_data = df[train_mask]
    val_data = df[val_mask]

    if len(train_data) == 0 or len(val_data) == 0:
        continue

    # A sequence needs at least two games from the same player; skip folds
    # where either split cannot produce a single sequence.
    if not (train_data["player_id"].duplicated().any()
            and val_data["player_id"].duplicated().any()):
        continue

    # Re-seed per fold so weight init and batch shuffling are reproducible.
    torch.manual_seed(SEED)

    # Fit the scaler on the training split only, then reuse it for validation.
    scaler = MinMaxScaler()
    train_scaled = pd.DataFrame(
        scaler.fit_transform(train_data[feature_cols]),
        index=train_data.index,
        columns=feature_cols,
    )
    val_scaled = pd.DataFrame(
        scaler.transform(val_data[feature_cols]),
        index=val_data.index,
        columns=feature_cols,
    )

    # Group by an index-aligned Series instead of a column of the feature
    # frame, so the raw, unscaled player_id is not fed to the LSTM as an input.
    train_dataset = PlayerSequenceDataset(train_scaled, train_data[target], train_data["player_id"])
    val_dataset = PlayerSequenceDataset(val_scaled, val_data[target], val_data["player_id"])

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Feature width of one time step, read from the first sample.
    input_size = train_dataset[0][0].shape[1]
    model = LSTMModel(input_size=input_size).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    for epoch in range(NUM_EPOCHS):
        train_loss = train_model(model, train_loader, criterion, optimizer, device)
        y_pred, y_val = evaluate_model(model, val_loader, device)

        mae = mean_absolute_error(y_val, y_pred)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))

        print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, MAE: {mae:.2f}, RMSE: {rmse:.2f}")

    # Record the final-epoch metrics for this validation year.
    mae_scores.append(mae)
    rmse_scores.append(rmse)
    years_tested.append(validate_year)


In [None]:
# ------------------------------------------------------------
# Inspect Results
# ------------------------------------------------------------
# y_val / y_pred are whatever the training loop assigned last, i.e. the
# final-epoch outputs for the last validation year that was processed.
# Nothing is written to disk here.
results = pd.DataFrame({
    'Actual': y_val,
    'Predicted': y_pred
})
# Bare last expression so the notebook renders the frame with its rich repr.
results.head()