## Embeddings for Categorical Variables

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Embedding, LSTM, Dense, Flatten, Concatenate
)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# ------------------------------------------------------------
# 1) Configuration and Data Loading
# ------------------------------------------------------------
DB_NAME = "../../copy_nba_box_scores.db"
DB_URI = f"sqlite:///{DB_NAME}"
engine = create_engine(DB_URI, echo=False)

# Example: Another table or CSV that has team-level defensive stats
# E.g. columns: TEAM_ID, TEAM_DEF_RATING, TEAM_NAME, etc.
# For demonstration, we'll assume we have a "team_stats" table in the same DB.
team_stats_df = pd.read_sql("SELECT * FROM team_stats", engine)

# Main DataFrame: for each player's game-level data
df = pd.read_sql("SELECT * FROM combined_player_game_data", engine)

# Ensure chronological order by player
df = df.sort_values(by=["PLAYER_ID", "GAME_DATE_EST"])

# Example: convert MIN from "XX:YY" -> integer representing "YY"
df["MIN"] = df["MIN"].apply(lambda x: int(x.split(":")[1]))


In [10]:
# ------------------------------------------------------------
# 2) Load Data & Sort
# ------------------------------------------------------------
df = pd.read_sql("SELECT * FROM combined_player_game_data", engine)

# Ensure data is sorted by player and date
df = df.sort_values(by=["PLAYER_ID", "GAME_DATE_EST"])

# Extract the part after the colon and convert to int
df['MIN'] = df['MIN'].apply(lambda x: int(x.split(':')[1]))

# ------------------------------------------------------------
# 2a) Create Rolling Averages (Example: 3-game and 5-game windows)
# ------------------------------------------------------------
# We demonstrate using PTS, MIN, USG_PCT as sample columns to create rolling averages.
# You can add more columns or different window sizes as you like.
columns_to_roll = ["PTS", "MIN", "USG_PCT"]
window_sizes = [3, 5]

for col in columns_to_roll:
    for w in window_sizes:
        df[f"{col}_rolling_{w}"] = (
            df.groupby("PLAYER_ID")[col]
              .transform(lambda x: x.shift(1).rolling(w).mean())
        )

# Fill any NaN values (early games won't have enough history for rolling windows)
df.fillna(0, inplace=True)


# ------------------------------------------------------------
# 3) Feature & Target Definitions
# ------------------------------------------------------------
numeric_features = [
    "PLAYER_ID",
    "TEAM_ID",
    # "PTS",
    "MIN",       # Minutes played
    "FGA",       # Field Goal Attempts
    "FGM",       # Field Goals Made
    "FG_PCT",    # Field Goal Percentage
    "FG3A",      # 3-Point Attempts
    "FG3M",      # 3-Point Makes
    "FG3_PCT",   # 3-Point Percentage
    "FTA",       # Free Throw Attempts
    "FTM",       # Free Throws Made
    "FT_PCT",    # Free Throw Percentage
    "OREB",
    "DREB",
    "AST",
    "USG_PCT",
    "TS_PCT",
    "STL",
    "BLK",
    "TO",
    "PLUS_MINUS",
    "E_OFF_RATING", 
    "OFF_RATING", 
    "E_DEF_RATING", 
    "DEF_RATING", 
    "E_NET_RATING", 
    "NET_RATING", 
    "AST_PCT", 
    "AST_TOV", 
    "AST_RATIO", 
    "OREB_PCT", 
    "DREB_PCT", 
    "REB_PCT", 
    "TM_TOV_PCT", 
    "EFG_PCT", 
    "TS_PCT", 
    "USG_PCT", 
    "E_USG_PCT", 
    "E_PACE", 
    "PACE", 
    "PACE_PER40", 
    "POSS", 
    "PIE",
    "HOME_TEAM_ID",
    "VISITOR_TEAM_ID",
    # New rolling features:
    "PTS_rolling_3",
    "PTS_rolling_5",
    "MIN_rolling_3",
    "MIN_rolling_5",
    "USG_PCT_rolling_3",
    "USG_PCT_rolling_5",
]
target_col = "PTS"

df = df.dropna(subset=features + [target, "PLAYER_NAME"])

X = df[features]
y = df[target]

In [None]:
# ------------------------------------------------------------
# 6) Create a Fixed-Window Sequence Function
#    (for time-series numeric input)
# ------------------------------------------------------------
def create_fixed_window_sequences(
    data: pd.DataFrame,
    numeric_cols: list,
    target_col: str,
    player_col: str,
    window_size: int
):
    """
    For each player, generate sliding windows of length `window_size`.
    If i < window_size, we skip that row.
    
    Returns:
      X_numeric: shape (num_samples, window_size, num_numeric_features)
      y:         shape (num_samples,)
      idx_list:  indices of the original rows used for the target
    """
    X_list, y_list, idx_list = [], [], []
    
    for p_id, group in data.groupby(player_col):
        group_features = group[numeric_cols].values  # (num_games, num_features)
        group_target = group[target_col].values
        group_idx = group.index
        
        # Slide over the group
        for i in range(window_size, len(group_features)):
            X_window = group_features[i - window_size : i]
            y_value = group_target[i]
            idx_val = group_idx[i]
            
            X_list.append(X_window)
            y_list.append(y_value)
            idx_list.append(idx_val)
    
    X_numeric = np.array(X_list, dtype=np.float32)
    y_array = np.array(y_list, dtype=np.float32)
    idx_array = np.array(idx_list)
    
    return X_numeric, y_array, idx_array


# ------------------------------------------------------------
# 7) Train / Validation Split
#    (Simple example, not season-based. We'll do 80/20.)
# ------------------------------------------------------------
train_df, val_df = train_test_split(df, test_size=0.2, shuffle=False)

# We will scale numeric features. Fit only on train set, then transform val.
scaler = MinMaxScaler()
train_df_scaled = train_df.copy()
val_df_scaled = val_df.copy()

train_df_scaled[numeric_features] = scaler.fit_transform(train_df[numeric_features])
val_df_scaled[numeric_features]   = scaler.transform(val_df[numeric_features])

# Choose a window size
window_size = 10

# Create time-series numeric arrays
X_train_numeric, y_train, train_idx = create_fixed_window_sequences(
    train_df_scaled, 
    numeric_cols=numeric_features,
    target_col=target_col,
    player_col="PLAYER_ID",
    window_size=window_size
)
X_val_numeric, y_val, val_idx = create_fixed_window_sequences(
    val_df_scaled,
    numeric_cols=numeric_features,
    target_col=target_col,
    player_col="PLAYER_ID",
    window_size=window_size
)

print("X_train_numeric shape:", X_train_numeric.shape)  # (num_samples, window_size, num_features)
print("y_train shape:", y_train.shape)
print("X_val_numeric shape:", X_val_numeric.shape)
print("y_val shape:", y_val.shape)


# ------------------------------------------------------------
# 8) Prepare Embedding Inputs
# ------------------------------------------------------------
# We need integer ID arrays for:
#   - PLAYER_ID
#   - TEAM_ID
#   - OPP_TEAM_ID
# But note, each row in the final X_train_numeric corresponds to some row in the original df.
# We'll gather IDs from the original data at those indices.
#
# Because each "sample" in X_train_numeric is a window of 10 rows from the same player, we
# only need the "final row's" IDs. So let's do that simply:

def gather_ids_for_samples(df_original, idx_array):
    """
    Given indices from the final row in the time series,
    return arrays for player_id, team_id, opp_team_id.
    """
    player_ids = df_original.loc[idx_array, "PLAYER_ID"].values
    team_ids   = df_original.loc[idx_array, "TEAM_ID"].values
    opp_ids    = df_original.loc[idx_array, "VISITOR_TEAM_ID"].values
    return player_ids.astype(int), team_ids.astype(int), opp_ids.astype(int)

train_player_id, train_team_id, train_opp_id = gather_ids_for_samples(train_df, train_idx)
val_player_id, val_team_id, val_opp_id       = gather_ids_for_samples(val_df, val_idx)

# For embeddings, we usually want them as shape (num_samples, 1)
train_player_id = train_player_id.reshape(-1, 1)
train_team_id   = train_team_id.reshape(-1, 1)
train_opp_id    = train_opp_id.reshape(-1, 1)

val_player_id = val_player_id.reshape(-1, 1)
val_team_id   = val_team_id.reshape(-1, 1)
val_opp_id    = val_opp_id.reshape(-1, 1)

print("train_player_id shape:", train_player_id.shape)
print("train_team_id shape:", train_team_id.shape)
print("train_opp_id shape:", train_opp_id.shape)


# ------------------------------------------------------------
# 9) Build a Multi-Input Model with Embeddings
# ------------------------------------------------------------
NUM_PLAYERS = df["PLAYER_ID"].nunique()
NUM_TEAMS   = df["TEAM_ID"].nunique()

# Decide on embedding dimensions
EMBED_DIM_PLAYER = 32
EMBED_DIM_TEAM   = 16

# 1) Time-series numeric input: shape = (window_size, len(numeric_features))
input_numeric = Input(shape=(window_size, len(numeric_features)), name="numeric_input")
x = LSTM(64, return_sequences=False)(input_numeric)  
# shape now (batch, 64)

# 2) Player ID input: shape = (None, 1)
input_player = Input(shape=(1,), name="player_id_input")
embed_player = Embedding(
    input_dim=NUM_PLAYERS + 1,  # +1 in case IDs go up to N
    output_dim=EMBED_DIM_PLAYER,
    name="player_embedding"
)(input_player)
embed_player = Flatten()(embed_player)  # shape (batch, EMBED_DIM_PLAYER)

# 3) Team ID input
input_team = Input(shape=(1,), name="team_id_input")
embed_team = Embedding(
    input_dim=NUM_TEAMS + 1,
    output_dim=EMBED_DIM_TEAM,
    name="team_embedding"
)(input_team)
embed_team = Flatten()(embed_team)  # shape (batch, EMBED_DIM_TEAM)

# 4) Opponent Team ID input
input_opp = Input(shape=(1,), name="opp_team_id_input")
embed_opp = Embedding(
    input_dim=NUM_TEAMS + 1,
    output_dim=EMBED_DIM_TEAM,
    name="opp_team_embedding"
)(input_opp)
embed_opp = Flatten()(embed_opp)

# 5) Concatenate all
merged = Concatenate(name="concat_layer")([x, embed_player, embed_team, embed_opp])

# 6) Dense layers -> output
hidden = Dense(64, activation="relu")(merged)
output = Dense(1, activation="linear", name="pred_pts")(hidden)

model = Model(
    inputs=[input_numeric, input_player, input_team, input_opp],
    outputs=output
)

model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()


# ------------------------------------------------------------
# 10) Train the Model
# ------------------------------------------------------------
early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

history = model.fit(
    x={
       "numeric_input": X_train_numeric,
       "player_id_input": train_player_id,
       "team_id_input": train_team_id,
       "opp_team_id_input": train_opp_id
    },
    y=y_train,
    validation_data=(
        {
            "numeric_input": X_val_numeric,
            "player_id_input": val_player_id,
            "team_id_input": val_team_id,
            "opp_team_id_input": val_opp_id
        },
        y_val
    ),
    epochs=50,
    batch_size=32,
    verbose=1,
    callbacks=[early_stop]
)


# ------------------------------------------------------------
# 11) Evaluate
# ------------------------------------------------------------
y_pred = model.predict({
    "numeric_input": X_val_numeric,
    "player_id_input": val_player_id,
    "team_id_input": val_team_id,
    "opp_team_id_input": val_opp_id
}).flatten()

mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

print(f"Validation MAE:  {mae:.3f}")
print(f"Validation RMSE: {rmse:.3f}")


# ------------------------------------------------------------
# 12) Optional: Build a DataFrame of Predictions
# ------------------------------------------------------------
val_results_df = val_df.loc[val_idx, ["PLAYER_ID", "PLAYER_NAME", target_col]].copy()
val_results_df["Predicted_PTS"] = y_pred
val_results_df.rename(columns={target_col: "Actual_PTS"}, inplace=True)

# Sort or manipulate as desired
val_results_df.sort_values(by="Predicted_PTS", ascending=False, inplace=True)

print(val_results_df.head(20))

X_train_numeric shape: (145838, 10, 51)
y_train shape: (145838,)
X_val_numeric shape: (34883, 10, 51)
y_val shape: (34883,)
train_player_id shape: (145838, 1)
train_team_id shape: (145838, 1)
train_opp_id shape: (145838, 1)
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 player_id_input (InputLayer)   [(None, 1)]          0           []                               
                                                                                                  
 team_id_input (InputLayer)     [(None, 1)]          0           []                               
                                                                                                  
 opp_team_id_input (InputLayer)  [(None, 1)]         0           []                               
                                                                    