In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# ------------------------------------------------------------
# 1) Configuration
# ------------------------------------------------------------
DB_NAME = "../../copy_nba_box_scores.db"
DB_URI = f"sqlite:///{DB_NAME}"
engine = create_engine(DB_URI, echo=False)

In [3]:
# ------------------------------------------------------------
# 2) Load Data & Sort
# ------------------------------------------------------------
df = pd.read_sql("SELECT * FROM combined_player_game_data", engine)

# Ensure data is sorted by player and date
df = df.sort_values(by=["PLAYER_ID", "GAME_DATE_EST"])

# Extract the part after the colon and convert to int
df['MIN'] = df['MIN'].apply(lambda x: int(x.split(':')[1]))

features = [
    "PLAYER_ID",
    "PTS",
    "TEAM_ID",
    "MIN",       # Minutes played
    "FGA",       # Field Goal Attempts
    "FGM",       # Field Goals Made
    "FG_PCT",    # Field Goal Percentage
    "FG3A",      # 3-Point Attempts
    "FG3M",      # 3-Point Makes
    "FG3_PCT",   # 3-Point Percentage
    "FTA",       # Free Throw Attempts
    "FTM",       # Free Throws Made
    "FT_PCT",    # Free Throw Percentage
    "OREB",
    "DREB",
    "AST",
    "USG_PCT",
    "TS_PCT",
    "STL",
    "BLK",
    "TO",
    "PLUS_MINUS",
    "E_OFF_RATING", 
    "OFF_RATING", 
    "E_DEF_RATING", 
    "DEF_RATING", 
    "E_NET_RATING", 
    "NET_RATING", 
    "AST_PCT", 
    "AST_TOV", 
    "AST_RATIO", 
    "OREB_PCT", 
    "DREB_PCT", 
    "REB_PCT", 
    "TM_TOV_PCT", 
    "EFG_PCT", 
    "TS_PCT", 
    "USG_PCT", 
    "E_USG_PCT", 
    "E_PACE", 
    "PACE", 
    "PACE_PER40", 
    "POSS", 
    "PIE"
]
target = "PTS"

# Drop rows with missing values in features, target, or PLAYER_NAME
df = df.dropna(subset=features + [target, "PLAYER_NAME"])

# Convert SEASON to integer if not already
if df["SEASON"].dtype != int:
    df["SEASON"] = df["SEASON"].astype(int)

X = df[features]
y = df[target]

In [13]:
# ------------------------------------------------------------
# 3) Helper Function: Create Sequences
# ------------------------------------------------------------
def create_player_sequences_fixed_length(
    data, 
    target, 
    player_column, 
    max_length, 
    history_data=None, 
    min_length=10,
    max_cap=100,
    drop_columns=None  # New parameter to specify additional columns to drop
):
    """
    Creates fixed-length sequences per player. If a sequence is shorter 
    than max_length, it is zero-padded at the front. If it is longer, 
    we only take the most recent max_length steps.

    Parameters:
      data (DataFrame): The dataset containing features for sequence creation.
      target (Series): The target variable corresponding to each row in data.
      player_column (str): The column name for player identifiers.
      max_length (int): The maximum length of sequences.
      history_data (DataFrame, optional): Additional historical data to include.
      min_length (int): Minimum number of actual data points required in a sequence.
      max_cap (int): Maximum number of games to consider per player.
      drop_columns (list, optional): Additional columns to drop.

    Returns:
      X_padded: shape (num_sequences, max_length, num_features)
      y_list: target values (shape = (num_sequences,))
      idx_list: indices into the original DataFrame for each target
    """
    max_length = min(max_length, max_cap)
    X_list, y_list, idx_list = [], [], []
    
    # If history_data is provided, prepend it to data for each player
    if history_data is not None:
        combined_data = pd.concat([history_data, data])
    else:
        combined_data = data.copy()
    
    for p_id, group in combined_data.groupby(player_column):
        # Drop PLAYER_ID and any other specified columns (e.g., PLAYER_NAME)
        if drop_columns is None:
            drop_columns = []
        player_features = group.drop(columns=[player_column] + drop_columns).values
        player_target = target[group.index].values
        group_indices = group.index  # Original DataFrame indices

        for i in range(1, len(group)):
            # Determine the sequence up to (but not including) the current game
            seq = player_features[:i]
            actual_length = len(seq)

            # Skip sequences that do not meet the minimum length
            if actual_length < min_length:
                continue

            # Fetch the target value for the current game
            y = player_target[i]

            # Skip if the target is NaN (i.e., belongs to history_data)
            if pd.isna(y):
                continue

            X_list.append(seq)
            y_list.append(y)
            idx_list.append(group_indices[i])

    if not X_list:
        return np.array([]), np.array([]), np.array([])

    num_features = X_list[0].shape[1]

    X_padded = np.zeros((len(X_list), max_length, num_features), dtype=np.float32)
    
    for i, seq in enumerate(X_list):
        seq_len = len(seq)
        if seq_len <= max_length:
            X_padded[i, max_length - seq_len:, :] = seq
        else:
            X_padded[i, :, :] = seq[-max_length:]

    return X_padded, np.array(y_list), np.array(idx_list)

In [10]:
# ------------------------------------------------------------
# 4) LSTM Model Builder
# ------------------------------------------------------------
def build_lstm_model(timesteps, num_features, lstm_units=64):
    """
    Build a simple LSTM Network:
      - LSTM -> Dense(1)
    """
    model = Sequential()
    model.add(
        LSTM(
            units=lstm_units,
            input_shape=(timesteps, num_features),  # (max_length, num_features)
            return_sequences=False
        )
    )
    model.add(Dense(1))

    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

In [14]:
# ------------------------------------------------------------
# 5) Rolling/Expanding Window Validation with Enhancements
# ------------------------------------------------------------
df["SEASON"] = df["SEASON"].astype(int)
available_years = sorted(df['SEASON'].unique())
print("Available Years in Data:", available_years)

training_window = 3
mae_scores = []
rmse_scores = []
years_tested = []

# Initialize list to collect all predictions
all_predictions = []

# Define minimum sequence length
MIN_SEQUENCE_LENGTH = 10  # You can adjust this value as needed

for validate_year in available_years:
    start_train_year = validate_year - training_window
    if start_train_year < available_years[0]:
        print(f"Skipping {validate_year}: Not enough training years.")
        continue
    if not all(y_ in available_years for y_ in range(start_train_year, validate_year)):
        print(f"Skipping {validate_year}: Missing intermediate years in training window.")
        continue

    train_mask = (df['SEASON'] >= start_train_year) & (df['SEASON'] < validate_year)
    val_mask   = (df['SEASON'] == validate_year)

    train_data = df[train_mask]
    val_data   = df[val_mask]

    print(f"\nValidate Year: {validate_year}, Train Years: {start_train_year} to {validate_year-1}")
    print(f"Training Data Length: {len(train_data)}, Validation Data Length: {len(val_data)}")

    if len(train_data) == 0 or len(val_data) == 0:
        print(f"Skipping {validate_year}: Insufficient training or validation data.")
        continue

    # Identify numerical features to scale (exclude PLAYER_ID)
    numerical_features = [feat for feat in features if feat not in ["PLAYER_ID"]]

    # Fit scaler on training data only
    scaler = MinMaxScaler()
    scaled_features_train = scaler.fit_transform(train_data[numerical_features])

    # Combine training and validation data for sequence creation
    combined_data = pd.concat([train_data, val_data])

    # Scale the combined data using the scaler fitted on training data
    scaled_features_combined = scaler.transform(combined_data[numerical_features])

    # Reconstruct scaled DataFrame
    combined_scaled = pd.DataFrame(scaled_features_combined, index=combined_data.index, columns=numerical_features)
    combined_scaled["PLAYER_ID"] = combined_data["PLAYER_ID"].values
    combined_scaled["PLAYER_NAME"] = combined_data["PLAYER_NAME"].values  # Ensure PLAYER_NAME is included

    # Split back into scaled training and validation data
    scaled_train = combined_scaled.loc[train_data.index]
    scaled_val = combined_scaled.loc[val_data.index]

    # Create a combined target series with NaN for training data and actual PTS for validation data
    target_combined = pd.concat([
        pd.Series([np.nan] * len(train_data), index=train_data.index),
        val_data[target]
    ])

    # Create training sequences from training data
    X_train_3D, y_train, train_idx = create_player_sequences_fixed_length(
        scaled_train, 
        train_data[target], 
        "PLAYER_ID", 
        max_length=100, 
        history_data=None, 
        min_length=MIN_SEQUENCE_LENGTH,
        drop_columns=["PLAYER_NAME"]  # Exclude PLAYER_NAME
    )

    # Create validation sequences using combined data (training + validation up to each target game)
    # Pass the combined target series to include both training (NaN) and validation targets
    X_val_3D, y_val, val_idx = create_player_sequences_fixed_length(
        scaled_val, 
        target_combined,  # Use the combined target
        "PLAYER_ID", 
        max_length=100, 
        history_data=scaled_train, 
        min_length=MIN_SEQUENCE_LENGTH,
        drop_columns=["PLAYER_NAME"]  # Exclude PLAYER_NAME
    )

    print(f"Train Sequences Shape: {X_train_3D.shape}, Validation Sequences Shape: {X_val_3D.shape}")
    # Shapes are (num_sequences, timesteps, num_features)

    if X_train_3D.size == 0 or X_val_3D.size == 0:
        print(f"Skipping {validate_year}: No valid sequences after applying minimum sequence length.")
        continue

    timesteps   = X_train_3D.shape[1]
    num_features = X_train_3D.shape[2]

    # Build and train LSTM model
    model = build_lstm_model(timesteps, num_features, lstm_units=64)

    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    history = model.fit(
        X_train_3D, y_train,
        validation_data=(X_val_3D, y_val),
        epochs=50, batch_size=32,
        verbose=1,
        callbacks=[early_stop]
    )

    # Predict on the validation set
    y_pred = model.predict(X_val_3D).flatten()

    # Compute metrics
    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)

    mae_scores.append(mae)
    rmse_scores.append(rmse)
    years_tested.append(validate_year)

    print(f"Validation Year: {validate_year}")
    print(f"Train Years: {start_train_year} to {validate_year-1}")
    print(f"MAE:  {mae:.2f}")
    print(f"RMSE: {rmse:.2f}\n")

    # --------------------------------------------
    # Collect Predictions with Player Info
    # --------------------------------------------
    val_rows = df.loc[val_idx]

    # Ensure PLAYER_NAME is in val_rows
    if 'PLAYER_NAME' not in val_rows.columns:
        print("PLAYER_NAME column not found in the DataFrame. Please ensure it exists.")
        continue

    predictions_df = pd.DataFrame({
        'PLAYER_ID': val_rows['PLAYER_ID'].values,
        'PLAYER_NAME': val_rows['PLAYER_NAME'].values,
        'Actual_PTS': y_val,
        'Predicted_PTS': y_pred,
        'Season': validate_year
    }, index=val_idx)

    all_predictions.append(predictions_df)

Available Years in Data: [2019, 2020, 2021, 2022]
Skipping 2019: Not enough training years.
Skipping 2020: Not enough training years.
Skipping 2021: Not enough training years.

Validate Year: 2022, Train Years: 2019 to 2021
Training Data Length: 195593, Validation Data Length: 51796
Train Sequences Shape: (188137, 100, 43), Validation Sequences Shape: (50960, 100, 43)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Validation Year: 2022
Train Years: 2019 to 2021
MAE:  2.55
RMSE: 4.34



In [15]:
# ------------------------------------------------------------
# 6) Export All Predictions
# ------------------------------------------------------------
if all_predictions:
    final_predictions = pd.concat(all_predictions)
    
    # Optionally, sort by Season and PLAYER_ID
    final_predictions = final_predictions.sort_values(by=['Season', 'PLAYER_ID'])
    
    # Reset index if needed
    final_predictions.reset_index(drop=True, inplace=True)
    
    # Export to CSV
    final_predictions.to_csv("player_predictions.csv", index=False)
    
    print("Predictions have been exported to 'player_predictions.csv'.")
else:
    print("No predictions were collected.")

# Optionally, print average MAE and RMSE across all seasons
if mae_scores and rmse_scores:
    avg_mae = np.mean(mae_scores)
    avg_rmse = np.mean(rmse_scores)
    print(f"Average MAE across tested years: {avg_mae:.2f}")
    print(f"Average RMSE across tested years: {avg_rmse:.2f}")

Predictions have been exported to 'player_predictions.csv'.
Average MAE across tested years: 2.55
Average RMSE across tested years: 4.34


In [16]:
# Check if PLAYER_ID 203999 exists in the dataset
player_id = 203999
player_data = df[df['PLAYER_ID'] == player_id]

if player_data.empty:
    print(f"No data found for PLAYER_ID {player_id}. Ensure that the player has historical game data.")
else:
    print(f"Found {len(player_data)} games for PLAYER_ID {player_id}.")


Found 760 games for PLAYER_ID 203999.


In [17]:
# Combine all features for scaling
features_to_scale = [feat for feat in features if feat not in ["PLAYER_ID", "PLAYER_NAME"]]

# Fit the scaler on the entire dataset
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(df[features_to_scale])

# Create a scaled DataFrame
scaled_df = pd.DataFrame(scaled_features, columns=features_to_scale)
scaled_df["PLAYER_ID"] = df["PLAYER_ID"].values
scaled_df["PLAYER_NAME"] = df["PLAYER_NAME"].values
scaled_df["SEASON"] = df["SEASON"].values  # Ensure SEASON is included if needed


In [18]:
def get_latest_sequence(scaled_data, player_id, max_length=100, min_length=10, drop_columns=None):
    """
    Retrieves the latest sequence of features for a specific player.

    Parameters:
      scaled_data (DataFrame): Scaled features including PLAYER_ID.
      player_id (int): The PLAYER_ID to retrieve data for.
      max_length (int): Maximum sequence length.
      min_length (int): Minimum required sequence length.
      drop_columns (list, optional): Columns to drop from the sequence.

    Returns:
      np.array: Padded sequence of shape (max_length, num_features)
    """
    player_scaled = scaled_data[scaled_data['PLAYER_ID'] == player_id].sort_values(by="GAME_DATE_EST")
    
    if drop_columns is None:
        drop_columns = []
    
    # Drop unnecessary columns
    player_features = player_scaled.drop(columns=["PLAYER_ID", "PLAYER_NAME"] + drop_columns).values
    
    if len(player_features) < min_length:
        raise ValueError(f"Not enough historical data for PLAYER_ID {player_id}. Minimum required: {min_length}")
    
    # Take the last 'max_length' games
    seq = player_features[-max_length:]
    
    # Pad sequences if necessary
    if len(seq) < max_length:
        padding = np.zeros((max_length - len(seq), seq.shape[1]))
        seq = np.vstack((padding, seq))
    
    return seq


In [None]:
try:
    latest_sequence = get_latest_sequence(
        scaled_df, 
        player_id=203999, 
        max_length=100, 
        min_length=10, 
        drop_columns=["PLAYER_NAME"]
    )
    # Reshape to (1, timesteps, num_features) for prediction
    latest_sequence = latest_sequence.reshape(1, latest_sequence.shape[0], latest_sequence.shape[1])
except ValueError as e:
    print(e)
    # Handle insufficient data scenario


In [20]:
# Define training data (all data)
train_data_final = scaled_df.copy()

# Create sequences
X_final, y_final, _ = create_player_sequences_fixed_length(
    data=train_data_final, 
    target=df[target], 
    player_column="PLAYER_ID", 
    max_length=100, 
    history_data=None, 
    min_length=MIN_SEQUENCE_LENGTH,
    drop_columns=["PLAYER_NAME"]
)

print(f"Final Training Sequences Shape: {X_final.shape}")


Final Training Sequences Shape: (239097, 100, 44)


In [21]:
# Define model parameters
timesteps_final = X_final.shape[1]
num_features_final = X_final.shape[2]

# Build the model
final_model = build_lstm_model(timesteps_final, num_features_final, lstm_units=64)

# Define Early Stopping
early_stop_final = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)

# Train the model on all data
final_model.fit(
    X_final, y_final,
    epochs=50, 
    batch_size=32,
    verbose=1,
    callbacks=[early_stop_final]
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
 567/7472 [=>............................] - ETA: 35s - loss: 74.2119 - mae: 6.8758

KeyboardInterrupt: 

In [None]:
# Predict
predicted_pts = final_model.predict(latest_sequence).flatten()[0]

print(f"Predicted Points for PLAYER_ID {player_id}: {predicted_pts:.2f}")