In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# ------------------------------------------------------------
# 1) Configuration
# ------------------------------------------------------------
DB_NAME = "../../copy_nba_box_scores.db"
DB_URI = f"sqlite:///{DB_NAME}"
engine = create_engine(DB_URI, echo=False)

In [3]:
# ------------------------------------------------------------
# 2) Load Data & Sort
# ------------------------------------------------------------
df = pd.read_sql("SELECT * FROM combined_player_game_data", engine)

# Ensure data is sorted by player and date
df = df.sort_values(by=["PLAYER_ID", "GAME_DATE_EST"])

# Extract the part after the colon and convert to int
df['MIN'] = df['MIN'].apply(lambda x: int(x.split(':')[1]))

# ------------------------------------------------------------
# 2a) Create Rolling Averages (Example: 3-game and 5-game windows)
# ------------------------------------------------------------
# We demonstrate using PTS, MIN, USG_PCT as sample columns to create rolling averages.
# You can add more columns or different window sizes as you like.
columns_to_roll = ["PTS", "MIN", "USG_PCT"]
window_sizes = [3, 5]

for col in columns_to_roll:
    for w in window_sizes:
        df[f"{col}_rolling_{w}"] = (
            df.groupby("PLAYER_ID")[col]
              .transform(lambda x: x.shift(1).rolling(w).mean())
        )

# Fill any NaN values (early games won't have enough history for rolling windows)
df.fillna(0, inplace=True)


# ------------------------------------------------------------
# 3) Feature & Target Definitions
# ------------------------------------------------------------
features = [
    "PLAYER_ID",
    "TEAM_ID",
    # "PTS",
    "MIN",       # Minutes played
    "FGA",       # Field Goal Attempts
    "FGM",       # Field Goals Made
    "FG_PCT",    # Field Goal Percentage
    "FG3A",      # 3-Point Attempts
    "FG3M",      # 3-Point Makes
    "FG3_PCT",   # 3-Point Percentage
    "FTA",       # Free Throw Attempts
    "FTM",       # Free Throws Made
    "FT_PCT",    # Free Throw Percentage
    "OREB",
    "DREB",
    "AST",
    "USG_PCT",
    "TS_PCT",
    "STL",
    "BLK",
    "TO",
    "PLUS_MINUS",
    "E_OFF_RATING", 
    "OFF_RATING", 
    "E_DEF_RATING", 
    "DEF_RATING", 
    "E_NET_RATING", 
    "NET_RATING", 
    "AST_PCT", 
    "AST_TOV", 
    "AST_RATIO", 
    "OREB_PCT", 
    "DREB_PCT", 
    "REB_PCT", 
    "TM_TOV_PCT", 
    "EFG_PCT", 
    "TS_PCT", 
    "USG_PCT", 
    "E_USG_PCT", 
    "E_PACE", 
    "PACE", 
    "PACE_PER40", 
    "POSS", 
    "PIE",
    "HOME_TEAM_ID",
    "VISITOR_TEAM_ID",
    # New rolling features:
    "PTS_rolling_3",
    "PTS_rolling_5",
    "MIN_rolling_3",
    "MIN_rolling_5",
    "USG_PCT_rolling_3",
    "USG_PCT_rolling_5",
]
target = "PTS"

df = df.dropna(subset=features + [target, "PLAYER_NAME"])

X = df[features]
y = df[target]

In [4]:
# def create_fixed_window_sequences(
#     data: pd.DataFrame, 
#     feature_cols: list, 
#     target_col: pd.Series, 
#     player_col: str, 
#     window_size: int
# ):
#     """
#     For each player, generate sliding windows of length `window_size`.
#     If i < window_size, we skip that row (not enough history).
    
#     Returns:
#       X: shape (num_samples, window_size, num_features)
#       y: shape (num_samples,)
#       idx_list: indices of the original rows used for the target
#     """
#     X_list, y_list, idx_list = [], [], []

#     # Sort data by date if not already sorted
#     # (Assuming it is sorted in your main script, you can skip here.)
    
#     for p_id, group in data.groupby(player_col):
#         # We'll pick out only the columns we need
#         group_features = group[feature_cols].values
#         group_target = target_col[group.index].values
#         group_indices = group.index

#         # For each row i, the input is the window_size rows that come immediately before i
#         # So we skip the first 'window_size' rows because there's not enough data for a full window
#         for i in range(window_size, len(group_features)):
#             X_window = group_features[i - window_size : i]  # last 'window_size' samples
#             y_value = group_target[i]                       # the target for the i-th row (game)
#             X_list.append(X_window)
#             y_list.append(y_value)
#             idx_list.append(group_indices[i])
    
#     X_array = np.array(X_list, dtype=np.float32)
#     y_array = np.array(y_list, dtype=np.float32)
#     idx_array = np.array(idx_list)
    
#     return X_array, y_array, idx_array

In [10]:
# ------------------------------------------------------------
# 3) Helper Function: Create Sequences
# ------------------------------------------------------------
def create_player_sequences_fixed_length(data, target, player_column, max_length, max_cap=100):
    """
    Creates fixed-length sequences per player. If a sequence is shorter 
    than max_length, it is zero-padded at the front. If it is longer, 
    we only take the most recent max_length steps.

    Returns:
      X_padded: shape (num_sequences, max_length, num_features)
      y_list: target values (shape = (num_sequences,))
      idx_list: indices into the original DataFrame for each target
    """
    max_length = min(max_length, max_cap)
    X_list, y_list, idx_list = [], [], []
    
    for p_id, group in data.groupby(player_column):
        player_features = group.drop(columns=[player_column]).values
        player_target = target[group.index].values
        group_indices = group.index  # these are the row indices in the original DataFrame

        # NOTE: If you want each example to be the preceding steps 
        #       that predict the next row, you might start from i=1:
        for i in range(1, len(player_features)):
            seq = player_features[:i]
            X_list.append(seq)
            y_list.append(player_target[i])
            idx_list.append(group_indices[i])

    num_features = X_list[0].shape[1] if X_list else 0

    X_padded = np.zeros((len(X_list), max_length, num_features), dtype=np.float32)
    
    for i, seq in enumerate(X_list):
        seq_len = len(seq)
        if seq_len <= max_length:
            X_padded[i, max_length - seq_len:, :] = seq
        else:
            X_padded[i, :, :] = seq[-max_length:]

    return X_padded, np.array(y_list), np.array(idx_list)

In [11]:
# ------------------------------------------------------------
# 4) LSTM Model Builder
# ------------------------------------------------------------
def build_lstm_model(timesteps, num_features, lstm_units=64):
    """
    Build a simple LSTM Network:
      - LSTM -> Dense(1)
    """
    model = Sequential()
    model.add(
        LSTM(
            units=lstm_units,
            input_shape=(timesteps, num_features),  # (max_length, num_features)
            return_sequences=False
        )
    )
    model.add(Dense(1))

    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model


In [7]:
# # ------------------------------------------------------------
# # 6) Rolling/Expanding Window Validation (by Season)
# # ------------------------------------------------------------
# df["SEASON"] = df["SEASON"].astype(int)
# available_years = sorted(df['SEASON'].unique())
# print("Available Years in Data:", available_years)

# training_window = 3    # how many years to include in training
# window_size = 10       # how many games in each fixed-size input window

# mae_scores = []
# rmse_scores = []
# years_tested = []

# # For storing predictions
# all_predictions = []

# for validate_year in available_years:
#     start_train_year = validate_year - training_window
#     if start_train_year < available_years[0]:
#         print(f"Skipping {validate_year}: Not enough training years.")
#         continue
#     if not all(y_ in available_years for y_ in range(start_train_year, validate_year)):
#         print(f"Skipping {validate_year}: Missing intermediate years in training window.")
#         continue

#     # Split data
#     train_mask = (df['SEASON'] >= start_train_year) & (df['SEASON'] < validate_year)
#     val_mask   = (df['SEASON'] == validate_year)

#     train_data = df[train_mask].copy()
#     val_data   = df[val_mask].copy()

#     print(f"Validate Year: {validate_year}, Train Years: {start_train_year} to {validate_year-1}")
#     print(f"Training Data Length: {len(train_data)}, Validation Data Length: {len(val_data)}")

#     if len(train_data) == 0 or len(val_data) == 0:
#         print(f"Skipping {validate_year}: Insufficient training or validation data.")
#         continue

#     # ------------------------
#     # Scale the features
#     # ------------------------
#     # Exclude PLAYER_ID from scaling
#     numerical_features = [feat for feat in features if feat != "PLAYER_ID"]

#     scaler = MinMaxScaler()
#     train_data_scaled = train_data.copy()
#     val_data_scaled   = val_data.copy()

#     # Fit scaler on the training portion only
#     train_data_scaled[numerical_features] = scaler.fit_transform(train_data[numerical_features])
#     val_data_scaled[numerical_features]   = scaler.transform(val_data[numerical_features])

#     # ------------------------
#     # Create X, y using fixed window
#     # ------------------------
#     X_train, y_train, train_idx = create_fixed_window_sequences(
#         train_data_scaled, 
#         feature_cols=numerical_features, 
#         target_col=train_data_scaled[target], 
#         player_col="PLAYER_ID", 
#         window_size=window_size
#     )

#     X_val, y_val, val_idx = create_fixed_window_sequences(
#         val_data_scaled, 
#         feature_cols=numerical_features, 
#         target_col=val_data_scaled[target], 
#         player_col="PLAYER_ID", 
#         window_size=window_size
#     )

#     print(f"Train Sequences Shape: {X_train.shape}, Validation Sequences Shape: {X_val.shape}")
#     # Shapes: (num_samples, window_size, num_features)

#     if X_train.shape[0] == 0 or X_val.shape[0] == 0:
#         print(f"Skipping {validate_year}: Not enough sequences after windowing.")
#         continue

#     timesteps   = X_train.shape[1]
#     num_features = X_train.shape[2]

#     # ------------------------
#     # Build LSTM model
#     # ------------------------
#     model = build_lstm_model(timesteps, num_features, lstm_units=64)

#     early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

#     history = model.fit(
#         X_train, y_train,
#         validation_data=(X_val, y_val),
#         epochs=50, batch_size=32,
#         verbose=1,
#         callbacks=[early_stop]
#     )

#     # Predict
#     y_pred = model.predict(X_val).flatten()

#     mae = mean_absolute_error(y_val, y_pred)
#     mse = mean_squared_error(y_val, y_pred)
#     rmse = np.sqrt(mse)

#     mae_scores.append(mae)
#     rmse_scores.append(rmse)
#     years_tested.append(validate_year)

#     print(f"Validation Year: {validate_year}")
#     print(f"Train Years: {start_train_year} to {validate_year-1}")
#     print(f"MAE:  {mae:.2f}")
#     print(f"RMSE: {rmse:.2f}\n")

#     # --------------------------------------------
#     # Collect predictions with Player Info
#     # --------------------------------------------
#     # We need the original (unscaled) val_data for merging names, or you can do it here:
#     val_rows = val_data.loc[val_idx]  # get corresponding rows from original DataFrame

#     if 'PLAYER_NAME' not in val_rows.columns:
#         print("PLAYER_NAME column not found in the DataFrame. Please ensure it exists.")
#         continue

#     predictions_df = pd.DataFrame({
#         'PLAYER_ID': val_rows['PLAYER_ID'].values,
#         'PLAYER_NAME': val_rows['PLAYER_NAME'].values,
#         'Actual_PTS': y_val,
#         'Predicted_PTS': y_pred,
#         'Season': validate_year
#     }, index=val_idx)

#     all_predictions.append(predictions_df)

Available Years in Data: [2019, 2020, 2021, 2022]
Skipping 2019: Not enough training years.
Skipping 2020: Not enough training years.
Skipping 2021: Not enough training years.
Validate Year: 2022, Train Years: 2019 to 2021
Training Data Length: 149103, Validation Data Length: 38916
Train Sequences Shape: (142501, 10, 50), Validation Sequences Shape: (34242, 10, 50)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Validation Year: 2022
Train Years: 2019 to 2021
MAE:  2.73
RMSE: 4.63



In [12]:
# ------------------------------------------------------------
# 5) Rolling/Expanding Window Validation
# ------------------------------------------------------------
df["SEASON"] = df["SEASON"].astype(int)
available_years = sorted(df['SEASON'].unique())
print("Available Years in Data:", available_years)

training_window = 3
mae_scores = []
rmse_scores = []
years_tested = []

# Initialize list to collect all predictions
all_predictions = []

for validate_year in available_years:
    start_train_year = validate_year - training_window
    if start_train_year < available_years[0]:
        print(f"Skipping {validate_year}: Not enough training years.")
        continue
    if not all(y_ in available_years for y_ in range(start_train_year, validate_year)):
        print(f"Skipping {validate_year}: Missing intermediate years in training window.")
        continue

    train_mask = (df['SEASON'] >= start_train_year) & (df['SEASON'] < validate_year)
    val_mask   = (df['SEASON'] == validate_year)

    train_data = df[train_mask]
    val_data   = df[val_mask]

    print(f"Validate Year: {validate_year}, Train Years: {start_train_year} to {validate_year-1}")
    print(f"Training Data Length: {len(train_data)}, Validation Data Length: {len(val_data)}")

    if len(train_data) == 0 or len(val_data) == 0:
        print(f"Skipping {validate_year}: Insufficient training or validation data.")
        continue

    # Identify numerical features to scale (exclude PLAYER_ID)
    numerical_features = [feat for feat in features if feat not in ["PLAYER_ID"]]
    
    scaler = MinMaxScaler()
    scaled_features_train = scaler.fit_transform(train_data[numerical_features])
    scaled_features_val   = scaler.transform(val_data[numerical_features])
    
    # Reconstruct scaled DataFrame
    train_scaled = pd.DataFrame(scaled_features_train, index=train_data.index, columns=numerical_features)
    train_scaled["PLAYER_ID"] = train_data["PLAYER_ID"].values
    
    val_scaled = pd.DataFrame(scaled_features_val, index=val_data.index, columns=numerical_features)
    val_scaled["PLAYER_ID"] = val_data["PLAYER_ID"].values

    max_len_train = len(train_scaled)  # use entire training set as a cap if you want
    max_len_val   = len(val_scaled)

    X_train_3D, y_train, train_idx = create_player_sequences_fixed_length(
        train_scaled, train_data[target], "PLAYER_ID", max_len_train
    )
    
    X_val_3D, y_val, val_idx = create_player_sequences_fixed_length(
        val_scaled, val_data[target], "PLAYER_ID", max_len_val
    )

    print(f"Train Sequences Shape: {X_train_3D.shape}, Validation Sequences Shape: {X_val_3D.shape}")
    # Shapes are (num_sequences, timesteps, num_features)

    timesteps   = X_train_3D.shape[1]
    num_features = X_train_3D.shape[2]

    # Build and train LSTM model
    model = build_lstm_model(timesteps, num_features, lstm_units=64)

    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    history = model.fit(
        X_train_3D, y_train,
        validation_data=(X_val_3D, y_val),
        epochs=50, batch_size=32,
        verbose=1,
        callbacks=[early_stop]
    )

    # Predict on the validation set
    y_pred = model.predict(X_val_3D).flatten()

    # Compute metrics
    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)

    mae_scores.append(mae)
    rmse_scores.append(rmse)
    years_tested.append(validate_year)

    print(f"Validation Year: {validate_year}")
    print(f"Train Years: {start_train_year} to {validate_year-1}")
    print(f"MAE:  {mae:.2f}")
    print(f"RMSE: {rmse:.2f}\n")

    # --------------------------------------------
    # Collect Predictions with Player Info
    # --------------------------------------------
    val_rows = val_data.loc[val_idx]

    # If PLAYER_NAME is not in val_rows, you might need to merge from another source
    if 'PLAYER_NAME' not in val_rows.columns:
        print("PLAYER_NAME column not found in the DataFrame. Please ensure it exists.")
        continue

    predictions_df = pd.DataFrame({
        'PLAYER_ID': val_rows['PLAYER_ID'].values,
        'PLAYER_NAME': val_rows['PLAYER_NAME'].values,
        'Actual_PTS': y_val,
        'Predicted_PTS': y_pred,
        'Season': validate_year
    }, index=val_idx)

    all_predictions.append(predictions_df)

Available Years in Data: [2019, 2020, 2021, 2022]
Skipping 2019: Not enough training years.
Skipping 2020: Not enough training years.
Skipping 2021: Not enough training years.
Validate Year: 2022, Train Years: 2019 to 2021
Training Data Length: 149103, Validation Data Length: 38916
Train Sequences Shape: (148413, 100, 50), Validation Sequences Shape: (38404, 100, 50)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Validation Year: 2022
Train Years: 2019 to 2021
MAE:  2.78
RMSE: 4.57



In [9]:
# ------------------------------------------------------------
# 6) Export All Predictions
# ------------------------------------------------------------
if all_predictions:
    final_predictions = pd.concat(all_predictions)
    
    # Optionally, sort by Season and PLAYER_ID
    final_predictions = final_predictions.sort_values(by=['Season', 'PLAYER_ID'])
    
    # Reset index if needed
    final_predictions.reset_index(drop=True, inplace=True)
    
    # Export to CSV
    final_predictions.to_csv("player_predictions.csv", index=False)
    
    print("Predictions have been exported to 'player_predictions.csv'.")
else:
    print("No predictions were collected.")

# Optionally, print average MAE and RMSE across all seasons
if mae_scores and rmse_scores:
    avg_mae = np.mean(mae_scores)
    avg_rmse = np.mean(rmse_scores)
    print(f"Average MAE across tested years: {avg_mae:.2f}")
    print(f"Average RMSE across tested years: {avg_rmse:.2f}")

Predictions have been exported to 'player_predictions.csv'.
Average MAE across tested years: 2.73
Average RMSE across tested years: 4.63
