## Radial Basis Function Model - Added More Data

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# ------------------------------------------------------------
# 1) Custom RBF Layer (as shown above)
# ------------------------------------------------------------
class RBFLayer(tf.keras.layers.Layer):
    def __init__(self, units, gamma=1.0, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.gamma = gamma

    def build(self, input_shape):
        features = input_shape[-1]
        self.centers = self.add_weight(name='centers',
                                       shape=(self.units, features),
                                       initializer='glorot_uniform',
                                       trainable=True)
        self.betas = self.add_weight(name='betas',
                                     shape=(self.units,),
                                     initializer='ones',
                                     trainable=True)
        super().build(input_shape)

    def call(self, inputs):
        expanded_inputs = tf.expand_dims(inputs, axis=1)
        expanded_centers = tf.expand_dims(self.centers, axis=0)
        distances = tf.reduce_sum(tf.square(expanded_inputs - expanded_centers), axis=-1)
        rbfs = tf.exp(-self.gamma * tf.expand_dims(self.betas, 0) * distances)
        return rbfs

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.units)

In [3]:
# ------------------------------------------------------------
# 2) Configuration
# ------------------------------------------------------------
DB_NAME = "../../copy_nba_box_scores.db"
DB_URI = f"sqlite:///{DB_NAME}"
engine = create_engine(DB_URI, echo=False)

In [4]:
# ------------------------------------------------------------
# 3) Load Data & Sort
# ------------------------------------------------------------
df = pd.read_sql("SELECT * FROM combined_player_game_data", engine)

# Ensure data is sorted by player and date
df = df.sort_values(by=["PLAYER_ID", "GAME_DATE_EST"])

# Extract the part after the colon and convert to int
df['MIN'] = df['MIN'].apply(lambda x: int(x.split(':')[1]))

features = [
    "PLAYER_ID",
    "PTS",
    "TEAM_ID",
    "MIN",       # Minutes played
    "FGA",       # Field Goal Attempts
    "FGM",       # Field Goals Made
    "FG_PCT",    # Field Goal Percentage
    "FG3A",      # 3-Point Attempts
    "FG3M",      # 3-Point Makes
    "FG3_PCT",   # 3-Point Percentage
    "FTA",       # Free Throw Attempts
    "FTM",       # Free Throws Made
    "FT_PCT",    # Free Throw Percentage
    "OREB",       # Total Rebounds (sometimes helpful)
    "DREB",
    "AST",       # Assists (also sometimes correlated with scoring load)
    "USG_PCT",   # Usage Percentage
    "TS_PCT",    # True Shooting Percentage
    "STL",
    "BLK",
    "TO",
    "PLUS_MINUS",
    "E_OFF_RATING", 
    "OFF_RATING", 
    "E_DEF_RATING", 
    "DEF_RATING", 
    "E_NET_RATING", 
    "NET_RATING", 
    "AST_PCT", 
    "AST_TOV", 
    "AST_RATIO", 
    "OREB_PCT", 
    "DREB_PCT", 
    "REB_PCT", 
    "TM_TOV_PCT", 
    "EFG_PCT", 
    "TS_PCT", 
    "USG_PCT", 
    "E_USG_PCT", 
    "E_PACE", 
    "PACE", 
    "PACE_PER40", 
    "POSS", 
    "PIE"
]
target = "PTS"

df = df.dropna(subset=features + [target, "PLAYER_NAME"])

X = df[features]
y = df[target]
# Initialize lists to collect predictions and associated player info
all_predictions = []


In [5]:
# ------------------------------------------------------------
# 4) Helper Function: Create Sequences (Same as Before)
# ------------------------------------------------------------
def create_player_sequences_fixed_length(data, target, player_column, max_length, max_cap=100):
    max_length = min(max_length, max_cap)
    X_list, y_list, idx_list = [], [], []
    
    for p_id, group in data.groupby(player_column):
        player_features = group.drop(columns=[player_column]).values
        player_target = target[group.index].values
        group_indices = group.index  # these are the row indices in the original DataFrame

        for i in range(1, len(player_features)):
            seq = player_features[:i]
            X_list.append(seq)
            y_list.append(player_target[i])
            idx_list.append(group_indices[i])  # store the row index of that target

    num_features = X_list[0].shape[1] if X_list else 0
    X_padded = np.zeros((len(X_list), max_length, num_features), dtype=np.float32)
    
    for i, seq in enumerate(X_list):
        seq_len = len(seq)
        if seq_len <= max_length:
            X_padded[i, max_length - seq_len:, :] = seq
        else:
            X_padded[i, :, :] = seq[-max_length:]

    return X_padded, np.array(y_list), np.array(idx_list)

# ------------------------------------------------------------
# 5) RBF Model Builder
# ------------------------------------------------------------
def build_rbf_model(input_dim, rbf_units=20, gamma=1.0):
    """
    Build a simple RBF Network:
      - Flattened input -> RBF layer -> Dense(1)
    """
    model = Sequential()
    model.add(RBFLayer(units=rbf_units, gamma=gamma, input_shape=(input_dim,)))
    # Output
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

In [None]:
# ------------------------------------------------------------
# 6) Rolling/Expanding Window Validation
# ------------------------------------------------------------
df["SEASON"] = df["SEASON"].astype(int)
available_years = sorted(df['SEASON'].unique())
print("Available Years in Data:", available_years)

training_window = 3
mae_scores = []
rmse_scores = []
years_tested = []

# Initialize list to collect all predictions
all_predictions = []

for validate_year in available_years:
    start_train_year = validate_year - training_window
    if start_train_year < available_years[0]:
        print(f"Skipping {validate_year}: Not enough training years.")
        continue
    if not all(y_ in available_years for y_ in range(start_train_year, validate_year)):
        print(f"Skipping {validate_year}: Missing intermediate years in training window.")
        continue

    train_mask = (df['SEASON'] >= start_train_year) & (df['SEASON'] < validate_year)
    val_mask   = (df['SEASON'] == validate_year)

    train_data = df[train_mask]
    val_data   = df[val_mask]

    print(f"Validate Year: {validate_year}, Train Years: {start_train_year} to {validate_year-1}")
    print(f"Training Data Length: {len(train_data)}, Validation Data Length: {len(val_data)}")

    if len(train_data) == 0 or len(val_data) == 0:
        print(f"Skipping {validate_year}: Insufficient training or validation data.")
        continue

    # Identify numerical features to scale (exclude PLAYER_ID and PLAYER_NAME)
    numerical_features = [feat for feat in features if feat not in ["PLAYER_ID"]]
    
    scaler = MinMaxScaler()
    scaled_features_train = scaler.fit_transform(train_data[numerical_features])
    scaled_features_val   = scaler.transform(val_data[numerical_features])
    
    # Reconstruct the scaled DataFrame without PLAYER_NAME
    train_scaled = pd.DataFrame(scaled_features_train, index=train_data.index, columns=numerical_features)
    train_scaled["PLAYER_ID"] = train_data["PLAYER_ID"].values
    
    val_scaled = pd.DataFrame(scaled_features_val, index=val_data.index, columns=numerical_features)
    val_scaled["PLAYER_ID"] = val_data["PLAYER_ID"].values


    max_len_train = len(train_scaled)
    max_len_val   = len(val_scaled)

    X_train_3D, y_train, train_idx = create_player_sequences_fixed_length(
        train_scaled, train_data[target], "PLAYER_ID", max_len_train
    )
    
    X_val_3D, y_val, val_idx = create_player_sequences_fixed_length(
        val_scaled, val_data[target], "PLAYER_ID", max_len_val
    )

    print(f"Train Sequences Shape: {X_train_3D.shape}, Validation Sequences Shape: {X_val_3D.shape}")

    X_train = X_train_3D.reshape((X_train_3D.shape[0], -1))
    X_val   = X_val_3D.reshape((X_val_3D.shape[0], -1))

    input_dim = X_train.shape[1]

    model = build_rbf_model(input_dim=input_dim, rbf_units=30, gamma=0.1)

    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=50, batch_size=32,
        verbose=1,
        callbacks=[early_stop]
    )

    y_pred = model.predict(X_val).flatten()

    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)

    mae_scores.append(mae)
    rmse_scores.append(rmse)
    years_tested.append(validate_year)

    print(f"Validation Year: {validate_year}")
    print(f"Train Years: {start_train_year} to {validate_year-1}")
    print(f"MAE:  {mae:.2f}")
    print(f"RMSE: {rmse:.2f}\n")

    # --------------------------------------------
    # Collect Predictions with Player Info
    # --------------------------------------------
    # Retrieve the relevant rows from the validation data
    val_rows = val_data.loc[val_idx]

    # Check if PLAYER_NAME exists
    if 'PLAYER_NAME' not in val_rows.columns:
        print("PLAYER_NAME column not found in the DataFrame. Please ensure it exists.")
        # If PLAYER_NAME is not available, you might need to join with another table
        # For example:
        # player_names = pd.read_sql("SELECT PLAYER_ID, PLAYER_NAME FROM players_table", engine)
        # val_rows = val_rows.merge(player_names, on='PLAYER_ID', how='left')
        continue

    # Create a DataFrame with predictions and player info
    predictions_df = pd.DataFrame({
        'PLAYER_ID': val_rows['PLAYER_ID'].values,
        'PLAYER_NAME': val_rows['PLAYER_NAME'].values,
        'Actual_PTS': y_val,
        'Predicted_PTS': y_pred,
        'Season': validate_year
    }, index=val_idx)

    # Append to the all_predictions list
    all_predictions.append(predictions_df)

Available Years in Data: [2019, 2020, 2021, 2022]
Skipping 2019: Not enough training years.
Skipping 2020: Not enough training years.
Skipping 2021: Not enough training years.
Validate Year: 2022, Train Years: 2019 to 2021
Training Data Length: 195593, Validation Data Length: 51796
Train Sequences Shape: (194818, 100, 43), Validation Sequences Shape: (51257, 100, 43)


In [None]:
# ------------------------------------------------------------
# 7) Export All Predictions
# ------------------------------------------------------------
# Concatenate all predictions into a single DataFrame
if all_predictions:
    final_predictions = pd.concat(all_predictions)
    
    # Optionally, sort by Season and PLAYER_ID
    final_predictions = final_predictions.sort_values(by=['Season', 'PLAYER_ID'])
    
    # Reset index if needed
    final_predictions.reset_index(drop=True, inplace=True)
    
    # Export to CSV
    final_predictions.to_csv("player_predictions.csv", index=False)
    
    print("Predictions have been exported to 'player_predictions.csv'.")
else:
    print("No predictions were collected.")

# Optionally, print average MAE and RMSE across all seasons
if mae_scores and rmse_scores:
    avg_mae = np.mean(mae_scores)
    avg_rmse = np.mean(rmse_scores)
    print(f"Average MAE across tested years: {avg_mae:.2f}")
    print(f"Average RMSE across tested years: {avg_rmse:.2f}")