In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Masking, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

In [3]:
# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
DB_NAME = "nba_data.db"
DB_URI = f"sqlite:///{DB_NAME}"
engine = create_engine(DB_URI, echo=False)

In [5]:
# ------------------------------------------------------------
# Load Data & Sort
# ------------------------------------------------------------
df = pd.read_sql("SELECT * FROM player_game_features", engine)

# Ensure data is sorted by player and date
df = df.sort_values(by=["player_id", "game_date"])

# Extract the season or year from 'game_date'
df['game_year'] = pd.to_datetime(df['game_date']).dt.year

# Features and target
features = ["player_id", "pts", "min", "fgm", "fga", "pts_per_min", "fg_pct"]

target = "pts"

df = df.dropna(subset=features + ["pts"])

X = df[features]
y = df["pts"]

In [15]:
# ------------------------------------------------------------
# Helper Function: Create Sequences
# ------------------------------------------------------------

def create_player_sequences(data, target, player_column):
    """
    Create sequences of all past games for each player.

    Parameters:
        data (pd.DataFrame): Input features as a DataFrame.
        target (pd.Series): Target values.
        player_column (str): Column name for player IDs.

    Returns:
        np.array: Feature sequences for all players.
        np.array: Target values aligned with the sequences.
    """
    X, y = [], []
    for player_id, group in data.groupby(player_column):
        player_features = group.values  # All games for the player
        player_target = target[group.index].values
        
        for i in range(1, len(player_features)):
            # Include all games up to the current game as the sequence
            X.append(player_features[:i])  # Sequence of all past games
            y.append(player_target[i])     # Target for the current game

    # Pad sequences to the same length
    max_len = max(len(seq) for seq in X)
    padded_X = np.array([
        np.pad(seq, ((max_len - len(seq), 0), (0, 0)), mode='constant')
        for seq in X
    ])
    return np.array(padded_X), np.array(y)

# ------------------------------------------------------------
# Helper Function: Build Model
# ------------------------------------------------------------

# Build LSTM model with masking
def build_lstm_model(input_shape):
    model = Sequential([
        # Masking layer to ignore padded values
        Masking(mask_value=0.0, input_shape=input_shape),
        # LSTM layer with ReLU activation
        LSTM(64, activation='relu', return_sequences=False),
        # Fully connected (Dense) layer with ReLU
        Dense(32, activation='relu'),
        # Output layer for regression
        Dense(1)  # No activation for the output layer
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model



In [17]:
# ------------------------------------------------------------
# Rolling/Expanding Window Validation
# ------------------------------------------------------------
available_years = sorted(df['game_year'].unique())
print("Available Years in Data:", available_years)


training_window = 4
mae_scores = []
rmse_scores = []
years_tested = []


for validate_year in available_years:
    start_train_year = validate_year - training_window
    if start_train_year < available_years[0]:
        continue
    if not all(y in available_years for y in range(start_train_year, validate_year)):
        continue

    train_mask = (df['game_year'] >= start_train_year) & (df['game_year'] < validate_year)
    val_mask = (df['game_year'] == validate_year)

    train_data = df[train_mask]
    val_data = df[val_mask]

    if len(train_data) == 0 or len(val_data) == 0:
        continue

    # Scale only the feature columns
    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(train_data[features].drop(columns=["player_id"]))
    scaled_features_val = scaler.transform(val_data[features].drop(columns=["player_id"]))

    # Add back the 'player_id' column to the scaled data
    train_scaled = pd.DataFrame(scaled_features, index=train_data.index, columns=features[1:])
    train_scaled["player_id"] = train_data["player_id"].values

    val_scaled = pd.DataFrame(scaled_features_val, index=val_data.index, columns=features[1:])
    val_scaled["player_id"] = val_data["player_id"].values

    # Create sequences for LSTM (grouped by player_id)
    X_train, y_train = create_player_sequences(
        train_scaled,
        train_data[target],
        "player_id"
    )
    X_val, y_val = create_player_sequences(
        val_scaled,
        val_data[target],
        "player_id"
    )

    if len(X_train) == 0 or len(X_val) == 0:
        continue

    # Build the LSTM model
    
    # Define input shape
    max_len = X_train.shape[1]  # Length of padded sequences
    num_features = X_train.shape[2]  # Number of features per timestep
    input_shape = (max_len, num_features)
    
    model = build_lstm_model(input_shape=input_shape)
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train the model
    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=50, batch_size=16,
        verbose=1, callbacks=[early_stop]
    )

    # Predict on the validation set
    y_pred = model.predict(X_val).flatten()

    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)

    mae_scores.append(mae)
    rmse_scores.append(rmse)
    years_tested.append(validate_year)

    print(f"Validation Year: {validate_year}")
    print(f"Train Years: {start_train_year} to {validate_year-1}")
    print(f"MAE:  {mae:.2f}")
    print(f"RMSE: {rmse:.2f}\n")

Available Years in Data: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]


  super().__init__(**kwargs)


Epoch 1/50
[1m3718/3718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 66ms/step - loss: 64860993880064.0000 - mae: 253928.4844 - val_loss: 183.3690 - val_mae: 10.5718
Epoch 2/50
[1m 696/3718[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m3:15[0m 65ms/step - loss: 167.4792 - mae: 10.0583

KeyboardInterrupt: 

In [None]:
# Convert to DataFrame
results = pd.DataFrame({
    'Player Name': df['player_name'].iloc[sequence_length:].values,  # Align with sequences
    'Game Date': df['game_date'].iloc[sequence_length:].values,      # Align with sequences
    'Actual': y_val,
    'Predicted': y_pred
})

# Display the first 10 rows
print(results.head(10))

# Optionally, save to CSV
results.to_csv('predicted_vs_actual.csv', index=False)
print("Results saved to 'predicted_vs_actual_with_metadata.csv'.")