In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Masking, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [2]:
# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
# File name of the SQLite database that holds the feature table.
DB_NAME = "nba_data.db"

# SQLAlchemy connection URL built from that file name.
DB_URI = "sqlite:///" + DB_NAME

# Single engine reused by every query in the notebook; echo=False turns off
# SQLAlchemy's statement logging.
engine = create_engine(DB_URI, echo=False)

In [3]:
# ------------------------------------------------------------
# Load Data & Sort
# ------------------------------------------------------------
df = pd.read_sql("SELECT * FROM player_game_features", engine)

# Ensure data is sorted by player and date.
# The sequence builder treats "all earlier rows of the same player" as that
# player's history, so the order must be truly chronological. 'game_date' is
# parsed for the comparison only (the stored column is left untouched) so the
# ordering does not depend on how dates are spelled in the database.
df = df.sort_values(
    by=["player_id", "game_date"],
    key=lambda col: pd.to_datetime(col) if col.name == "game_date" else col,
)

# Extract the year from 'game_date' (drives the year-based train/validation split)
df['game_year'] = pd.to_datetime(df['game_date']).dt.year

# Features and target
# NOTE: "player_id" must stay first; later cells label the scaled columns with
# features[1:].
features = ["player_id", "pts", "min", "fgm", "fga", "pts_per_min", "fg_pct"]
target = "pts"

# Drop rows that are missing any model input or the target. The target is taken
# from the variable (not a repeated literal) and duplicates are removed, since
# the target may also be one of the features.
df = df.dropna(subset=list(dict.fromkeys(features + [target])))

In [4]:
# ------------------------------------------------------------
# Helper Function: Create Sequences (Fixed Max Length)
# ------------------------------------------------------------
def create_player_sequences_fixed_length(data, target, player_column, max_length):
    """
    Build one sample per (player, game) from all of that player's earlier games.

    For a player whose rows are g_0 .. g_{n-1} (in the row order of ``data``),
    sample ``i`` (1 <= i < n) uses the feature rows g_0 .. g_{i-1} as input and
    the target value of g_i as label. A player with a single row yields no
    samples.

    Every input is stored in a fixed window of ``max_length`` timesteps:
    shorter histories are left-padded with all-zero rows, longer histories keep
    only their most recent ``max_length`` rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus ``player_column``. Rows are consumed in their
        existing order within each player, so the caller must sort them
        chronologically beforehand.
    target : pandas.Series
        Label values, looked up by the index labels of ``data``.
    player_column : str
        Column of ``data`` identifying the player; it is grouped on and
        excluded from the features.
    max_length : int
        Number of timesteps in every returned sequence (must be >= 0).

    Returns
    -------
    X : numpy.ndarray
        float32 array of shape (n_samples, max_length, n_features).
    y : numpy.ndarray
        Array of shape (n_samples,) with the label of each sample.

    Notes
    -----
    Padding rows are all zeros, so a real row whose features are all exactly
    0.0 cannot be told apart from padding by a consumer that masks on 0.0.
    """
    # Take the feature count from the frame itself so that an input producing
    # no samples still returns a correctly shaped (0, max_length, n_features)
    # tensor instead of one with zero features.
    num_features = int((data.columns != player_column).sum())

    X_list, y_list = [], []

    for _, group in data.groupby(player_column):
        player_features = group.drop(columns=[player_column]).values
        # .loc makes the lookup explicitly label-based, matching rows by index.
        player_target = target.loc[group.index].values

        # Prefix [:i] = every game strictly before game i (no look-ahead).
        for i in range(1, len(player_features)):
            X_list.append(player_features[:i])
            y_list.append(player_target[i])

    X_padded = np.zeros((len(X_list), max_length, num_features), dtype=np.float32)

    for i, seq in enumerate(X_list):
        # Use an explicit start offset rather than seq[-max_length:], because
        # seq[-0:] is the *whole* sequence and would break max_length == 0.
        recent = seq[max(len(seq) - max_length, 0):]
        if len(recent):
            # Right-align the history so the latest game is the last timestep.
            X_padded[i, max_length - len(recent):, :] = recent

    return X_padded, np.array(y_list)

# ------------------------------------------------------------
# Helper Function: Build Model
# ------------------------------------------------------------
def build_lstm_model(input_shape):
    """
    Assemble and compile the sequence regressor.

    Layer stack, in order:
      1. Masking with mask_value 0.0, receiving ``input_shape``.
      2. LSTM with 64 units, tanh activation, emitting only its final output.
      3. Dense with 32 units and ReLU activation.
      4. Dense with a single linear unit (the prediction).

    The model is compiled with the Adam optimizer, mean-squared-error loss and
    mean-absolute-error as an extra metric.

    Args:
        input_shape: Shape tuple forwarded to the first layer's ``input_shape``
            argument (the notebook passes ``(timesteps, num_features)``).

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = (
        Masking(mask_value=0.0, input_shape=input_shape),
        LSTM(64, activation='tanh', return_sequences=False),
        Dense(32, activation='relu'),
        Dense(1),
    )

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

In [5]:
# ------------------------------------------------------------
# Fixed Period Train/Validation Split
# ------------------------------------------------------------
# Calendar-year boundaries (inclusive) of the training and validation periods
TRAIN_START_YEAR = 2015
TRAIN_END_YEAR = 2022
VALIDATION_YEAR = 2023

# Boolean row selectors for each period
train_mask = df['game_year'].between(TRAIN_START_YEAR, TRAIN_END_YEAR)
val_mask = df['game_year'].eq(VALIDATION_YEAR)

train_data = df.loc[train_mask]
val_data = df.loc[val_mask]

print(f"Training data size: {len(train_data)} games")
print(f"Validation data size: {len(val_data)} games")

# Scale features (excluding player_id).
# The scaler is fitted on the training period only and then reused unchanged
# on the validation period.
scaler = MinMaxScaler()
train_inputs = train_data[features].drop(columns=["player_id"])
val_inputs = val_data[features].drop(columns=["player_id"])
scaled_features_train = scaler.fit_transform(train_inputs)
scaled_features_val = scaler.transform(val_inputs)

# Rebuild DataFrames around the scaled arrays, keeping the original row index
# and appending player_id as the last column.
train_scaled = pd.DataFrame(
    scaled_features_train, index=train_data.index, columns=features[1:]
).assign(player_id=train_data["player_id"].values)

val_scaled = pd.DataFrame(
    scaled_features_val, index=val_data.index, columns=features[1:]
).assign(player_id=val_data["player_id"].values)

# Find maximum sequence length across both datasets
def find_player_longest_sequence(data_df, id_col="player_id"):
    """
    Return the longest history length any single player can have.

    A player with ``k`` rows has at most ``k - 1`` earlier rows, so the result
    is the largest group size minus one. Returns 0 when there are no groups.
    """
    history_lengths = (len(rows) - 1 for _, rows in data_df.groupby(id_col))
    return max(history_lengths, default=0)

# The longest history found in either split fixes the padded length that both
# tensors share.
max_len_train, max_len_val = (
    find_player_longest_sequence(split_frame, "player_id")
    for split_frame in (train_scaled, val_scaled)
)
max_len_both = max(max_len_train, max_len_val)

print(f"Maximum sequence length: {max_len_both}")

# Create sequences for LSTM: inputs come from the scaled frames, labels from
# the unscaled target column of the matching split.
X_train, y_train = create_player_sequences_fixed_length(
    data=train_scaled,
    target=train_data[target],
    player_column="player_id",
    max_length=max_len_both,
)
X_val, y_val = create_player_sequences_fixed_length(
    data=val_scaled,
    target=val_data[target],
    player_column="player_id",
    max_length=max_len_both,
)

# Build and train the model
# Seed NumPy and TensorFlow before any weights are created so that a fresh
# kernel run starts from the same random state each time. (Some GPU kernels
# can still be non-deterministic.)
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Model input is (timesteps, features), taken from the padded training tensor.
num_features = X_train.shape[2]
input_shape = (max_len_both, num_features)

model = build_lstm_model(input_shape=input_shape)

# Stop when val_loss has not improved for 5 consecutive epochs and roll the
# model back to the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50, batch_size=32,
    verbose=1, callbacks=[early_stop]
)

# Evaluate on validation set
# Collapse the prediction array to 1-D so it lines up with y_val.
y_pred = model.predict(X_val).flatten()

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)

# Emit the summary in one call; the text is the same as line-by-line prints.
report_lines = (
    "\nValidation Results:",
    f"Training Period: {TRAIN_START_YEAR}-{TRAIN_END_YEAR}",
    f"Validation Year: {VALIDATION_YEAR}",
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse:.2f}",
)
print(*report_lines, sep="\n")

Training data size: 187537 games
Validation data size: 14268 games
Maximum sequence length: 578
Epoch 1/50
  25/5823 [..............................] - ETA: 6:01:50 - loss: 144.2296 - mae: 8.9315  

KeyboardInterrupt: 