## CNNs (Convolutional Neural Networks) - Rolling Window

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Masking, Conv1D, GlobalAveragePooling1D, Dense)
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
# SQLite database that holds the `player_game_features` table. The path is
# relative, so it is resolved against the directory the kernel was started in.
DB_NAME = "../../nba_data.db"
DB_URI = f"sqlite:///{DB_NAME}"
engine = create_engine(DB_URI, echo=False)

# Seed NumPy and TensorFlow up front so that weight initialisation and data
# shuffling start from the same state on every Restart Kernel -> Run All.
# (Some GPU kernels are non-deterministic, so tiny differences may remain.)
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# ------------------------------------------------------------
# Load Data & Sort
# ------------------------------------------------------------
df = pd.read_sql("SELECT * FROM player_game_features", engine)

# Parse the dates once; the chronological sort and the year column both use
# the parsed values.
game_dates = pd.to_datetime(df["game_date"])

# Sort by player, then chronologically. The sort key is the parsed timestamp
# rather than the raw column, so the order stays correct even if 'game_date'
# is stored as text in a format that does not sort lexicographically. The
# helper column is dropped again, leaving 'game_date' itself untouched.
#
# 'game_year' is the *calendar* year of the game (`.dt.year`), not a season
# label. NOTE(review): if the data follows seasons that span two calendar
# years, a "year" here mixes the end of one season with the start of the
# next -- confirm this is the intended split unit.
df = (
    df.assign(game_year=game_dates.dt.year, _game_ts=game_dates)
      .sort_values(by=["player_id", "_game_ts"])
      .drop(columns="_game_ts")
)

# Features and target. 'pts' appears in both: past values of 'pts' are model
# inputs, the current game's 'pts' is what is predicted.
features = ["player_id", "pts", "min", "fgm", "fga", "pts_per_min", "fg_pct"]
target = "pts"

# Drop rows with a missing feature or target; de-duplicate the column list
# because the target is already one of the features.
required_columns = list(dict.fromkeys(features + [target]))
df = df.dropna(subset=required_columns)

# Flat (non-sequential) views of the inputs and target.
X = df[features]
y = df[target]

In [4]:
# ------------------------------------------------------------
# Helper Function: Create Sequences (Fixed Max Length)
# ------------------------------------------------------------
def create_player_sequences_fixed_length(data, target, player_column, max_length):
    """
    Build one fixed-length, left-padded history window per (player, game) pair.

    For every player and every game position ``i >= 1`` (in the row order of
    ``data``), one sample is produced: the player's preceding games ``[0, i)``
    as input and ``target`` at game ``i`` as label. Windows shorter than
    ``max_length`` are zero-padded at the front so the most recent game always
    sits in the last timestep; longer histories keep only the most recent
    ``max_length`` games. A player's first game has no history and yields no
    sample.

    Args:
        data: DataFrame containing ``player_column`` plus the feature columns.
            Rows of each player must already be in chronological order; the
            function does not sort.
        target: Series aligned with ``data`` by index label.
        player_column: Name of the grouping column. It is excluded from the
            feature matrix.
        max_length: Number of timesteps in every output window (must be >= 1).

    Returns:
        X_padded: np.float32 array of shape [num_sequences, max_length, num_features]
        y_list:   np.array of shape [num_sequences]
        When no sequence can be formed, ``X_padded`` still has three
        dimensions (``[0, max_length, num_features]``) and ``y_list`` is empty.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        # Without this guard a zero length reaches `seq[-0:]`, which is the
        # *whole* sequence, and fails later with an opaque broadcast error.
        raise ValueError(f"max_length must be >= 1, got {max_length}")

    windows, labels = [], []

    for _, group in data.groupby(player_column):
        player_features = group.drop(columns=[player_column]).to_numpy()
        # Explicit label-based lookup keeps features and targets aligned by index.
        player_target = target.loc[group.index].to_numpy()

        for i in range(1, len(player_features)):
            # Past games only (game i itself is excluded), capped to the most
            # recent `max_length` entries. Slices are views, so no copy here.
            windows.append(player_features[max(0, i - max_length):i])
            labels.append(player_target[i])

    if windows:
        num_features = windows[0].shape[1]
    else:
        # No player has two or more games: derive the width from the columns
        # so the empty result still has the documented rank.
        num_features = sum(1 for col in data.columns if col != player_column)

    X_padded = np.zeros((len(windows), max_length, num_features), dtype=np.float32)
    for row, window in enumerate(windows):
        # Right-align the window; leading timesteps stay zero (padding).
        X_padded[row, max_length - len(window):, :] = window

    return X_padded, np.array(labels)

In [5]:
# ------------------------------------------------------------
# Helper Function: Build CNN Model
# ------------------------------------------------------------
def build_cnn_model(input_shape):
    """
    Build and compile a small 1D CNN regressor for padded game sequences.

    Architecture:
      - Input of shape ``input_shape`` = (timesteps, num_features)
      - Masking(mask_value=0.0)
      - Conv1D(32 filters, kernel_size=3, relu, padding='same')
      - GlobalAveragePooling1D
      - Dense(32, relu)
      - Dense(1) linear output

    Note on masking: the Masking layer flags a timestep only when *every*
    feature in it equals 0.0. Conv1D is not a mask-aware Keras layer, so the
    mask is dropped at the convolution and never reaches the pooling layer;
    zero-padded timesteps therefore still take part in the convolution and in
    the average. The layer is kept so the architecture (and results) stay as
    they were. NOTE(review): if padded steps must really be ignored, the mask
    has to be applied explicitly at the pooling step.

    Args:
        input_shape: Tuple ``(timesteps, num_features)`` of one sample.

    Returns:
        A compiled Sequential model (Adam optimizer, MSE loss, MAE metric).
    """
    model = Sequential([
        # Declare the input explicitly instead of passing `input_shape=` to the
        # first layer, which newer Keras releases warn about.
        tf.keras.Input(shape=input_shape),

        Masking(mask_value=0.0),

        # Convolution over the time dimension; 'same' padding keeps the
        # number of timesteps unchanged.
        Conv1D(filters=32, kernel_size=3, activation='relu', padding='same'),

        # Collapse the time axis to one vector per sample.
        # GlobalMaxPooling1D or Flatten are drop-in alternatives here.
        GlobalAveragePooling1D(),

        Dense(32, activation='relu'),
        Dense(1)  # output layer: predicted points
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

In [6]:
# ------------------------------------------------------------
# Rolling Window Validation
# ------------------------------------------------------------
# For every year that has `training_window` consecutive earlier years in the
# data: fit a fresh model on those earlier years and score it on that year.
# The window has a fixed size, so it rolls forward rather than expanding.

def find_player_longest_sequence(data_df, id_col="player_id"):
    """
    Return the longest history window any single player can produce.

    A player with ``n`` rows yields sequences of length 1 .. n-1, so the
    result is ``max(n) - 1`` over all players, or 0 when there are no players.
    """
    games_per_player = data_df.groupby(id_col).size()
    if len(games_per_player) == 0:
        return 0
    return int(games_per_player.max()) - 1


# Plain Python ints keep the printed lists readable regardless of NumPy version.
available_years = sorted(int(year) for year in df['game_year'].dropna().unique())
print("Available Years in Data:", available_years)

training_window = 4

# Model inputs: every feature except the grouping key. Derived by name so the
# position of "player_id" inside `features` does not matter.
feature_cols = [col for col in features if col != "player_id"]

mae_scores = []
rmse_scores = []
years_tested = []

for validate_year in available_years:
    start_train_year = validate_year - training_window
    if start_train_year < available_years[0]:
        # Not enough data prior to 'validate_year'
        continue
    # Ensure all intermediate years are also in the dataset
    if not all(y_ in available_years for y_ in range(start_train_year, validate_year)):
        continue

    # Build train/val splits
    train_mask = (df['game_year'] >= start_train_year) & (df['game_year'] < validate_year)
    val_mask   = (df['game_year'] == validate_year)

    train_data = df[train_mask]
    val_data   = df[val_mask]

    if len(train_data) == 0 or len(val_data) == 0:
        continue

    # Fit the scaler on the training years only and reuse it for validation,
    # so validation values may fall outside [0, 1]. "player_id" is not scaled;
    # it is re-attached afterwards purely as the grouping key.
    scaler = MinMaxScaler()
    train_scaled = pd.DataFrame(
        scaler.fit_transform(train_data[feature_cols]),
        index=train_data.index,
        columns=feature_cols,
    )
    train_scaled["player_id"] = train_data["player_id"].values

    val_scaled = pd.DataFrame(
        scaler.transform(val_data[feature_cols]),
        index=val_data.index,
        columns=feature_cols,
    )
    val_scaled["player_id"] = val_data["player_id"].values

    # Pad train and validation sequences to one common length so both fit
    # the same model input shape.
    max_len_train = find_player_longest_sequence(train_scaled, "player_id")
    max_len_val   = find_player_longest_sequence(val_scaled, "player_id")
    max_len_both  = max(max_len_train, max_len_val)
    if max_len_both < 1:
        # Not enough data to form sequences
        continue

    # Create sequences for CNN. Targets are the unscaled points, so MAE/RMSE
    # below are in points.
    X_train, y_train = create_player_sequences_fixed_length(
        train_scaled, train_data[target], "player_id", max_len_both
    )
    X_val, y_val = create_player_sequences_fixed_length(
        val_scaled, val_data[target], "player_id", max_len_both
    )

    if len(X_train) == 0 or len(X_val) == 0:
        continue

    # Drop Keras state left over from the previous fold's model so memory
    # does not keep growing while a new model is built for every year.
    tf.keras.backend.clear_session()

    # Build the CNN model
    num_features = X_train.shape[2]  # e.g., 6 for [pts, min, fgm, fga, pts_per_min, fg_pct]
    input_shape = (max_len_both, num_features)

    model = build_cnn_model(input_shape=input_shape)

    # NOTE(review): early stopping monitors the loss on the same validation
    # year that is scored below, so the reported MAE/RMSE are chosen on the
    # evaluation data and are likely somewhat optimistic.
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train the model (verbose=2: one summary line per epoch, no progress bars)
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=50, batch_size=32,
        verbose=2, callbacks=[early_stop]
    )

    # Predict on the validation set
    y_pred = model.predict(X_val).flatten()

    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)

    mae_scores.append(mae)
    rmse_scores.append(rmse)
    years_tested.append(validate_year)

    print(f"Validation Year: {validate_year}")
    print(f"Train Years: {start_train_year} to {validate_year-1}")
    print(f"MAE:  {mae:.2f}")
    print(f"RMSE: {rmse:.2f}\n")

Available Years in Data: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Validation Year: 2019
Train Years: 2015 to 2018
MAE:  4.81
RMSE: 6.23

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epo

KeyboardInterrupt: 

In [None]:
# Report the per-fold results collected by the validation loop:
# the years that were scored and the matching MAE / RMSE values.
for label, values in (
    ("Years Tested:", years_tested),
    ("MAE Scores:", mae_scores),
    ("RMSE Scores:", rmse_scores),
):
    print(label, values)