## Radial Basis Function Model - Added More Data

In [38]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

In [39]:
# ------------------------------------------------------------
# 1) Custom RBF Layer (as shown above)
# ------------------------------------------------------------
class RBFLayer(tf.keras.layers.Layer):
    def __init__(self, units, gamma=1.0, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.gamma = gamma

    def build(self, input_shape):
        features = input_shape[-1]
        self.centers = self.add_weight(name='centers',
                                       shape=(self.units, features),
                                       initializer='glorot_uniform',
                                       trainable=True)
        self.betas = self.add_weight(name='betas',
                                     shape=(self.units,),
                                     initializer='ones',
                                     trainable=True)
        super().build(input_shape)

    def call(self, inputs):
        expanded_inputs = tf.expand_dims(inputs, axis=1)
        expanded_centers = tf.expand_dims(self.centers, axis=0)
        distances = tf.reduce_sum(tf.square(expanded_inputs - expanded_centers), axis=-1)
        rbfs = tf.exp(-self.gamma * tf.expand_dims(self.betas, 0) * distances)
        return rbfs

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.units)

In [40]:
# ------------------------------------------------------------
# 2) Configuration
# ------------------------------------------------------------
DB_NAME = "../../copy_nba_box_scores.db"
DB_URI = f"sqlite:///{DB_NAME}"
engine = create_engine(DB_URI, echo=False)

In [41]:
# Inspect the column
print(df["MIN"].unique())

# Clean the "MIN" column (example assumes it's time-like or mixed format)
def clean_minutes(value):
    try:
        # Extract numeric part or handle as float
        return float(value.split(":")[0])  # Keep only the minute part if colon exists
    except (ValueError, AttributeError):
        return np.nan  # Handle invalid entries

df["MIN"] = df["MIN"].apply(clean_minutes)

# Handle missing/invalid values
df["MIN"] = df["MIN"].fillna(0)  # Replace NaNs with 0 (or any desired value)

# Convert to integer
df["MIN"] = df["MIN"].astype(int)

# Verify the result
print(df["MIN"].head())


['10.000000:14' '11.000000:54' '10.000000:43' ... '44.000000:29'
 '48.000000:00' '46.000000:56']
23724     10
55951     10
172096    10
23468     11
55695     11
Name: MIN, dtype: int32


In [45]:
# ------------------------------------------------------------
# 3) Load Data & Sort
# ------------------------------------------------------------
df = pd.read_sql("SELECT * FROM combined_player_game_data", engine)

# Ensure data is sorted by player and date
df = df.sort_values(by=["PLAYER_ID", "GAME_DATE_EST"])

# Extract the season or year from 'game_date'
# df['GAME_DATE_EST'] = pd.to_datetime(df['GAME_DATE_EST']).dt.year

features = [
    "PLAYER_ID",
    "PTS",
    "TEAM_ID",
               # Minutes played
    "FGA",       # Field Goal Attempts
    "FGM",       # Field Goals Made
    "FG_PCT",    # Field Goal Percentage
    "FG3A",      # 3-Point Attempts
    "FG3M",      # 3-Point Makes
    "FG3_PCT",   # 3-Point Percentage
    "FTA",       # Free Throw Attempts
    "FTM",       # Free Throws Made
    "FT_PCT",    # Free Throw Percentage
    "OREB",       # Total Rebounds (sometimes helpful)
    "DREB",
    "AST",       # Assists (also sometimes correlated with scoring load)
    "USG_PCT",   # Usage Percentage
    "TS_PCT",    # True Shooting Percentage
    "STL",
    "BLK",
    "TO",
    "PLUS_MINUS",
    "E_OFF_RATING", 
    "OFF_RATING", 
    "E_DEF_RATING", 
    "DEF_RATING", 
    "E_NET_RATING", 
    "NET_RATING", 
    "AST_PCT", 
    "AST_TOV", 
    "AST_RATIO", 
    "OREB_PCT", 
    "DREB_PCT", 
    "REB_PCT", 
    "TM_TOV_PCT", 
    "EFG_PCT", 
    "TS_PCT", 
    "USG_PCT", 
    "E_USG_PCT", 
    "E_PACE", 
    "PACE", 
    "PACE_PER40", 
    "POSS", 
    "PIE"
]
target = "PTS"

df = df.dropna(subset=features + [target])

X = df[features]
y = df[target]

In [48]:
# ------------------------------------------------------------
# 4) Helper Function: Create Sequences (Same as Before)
# ------------------------------------------------------------
def create_player_sequences_fixed_length(data, target, player_column, max_length, max_cap=100):
    """
    Create sequences of all past games for each player, then pad them to 'max_length'.
    Cap 'max_length' to avoid excessive memory usage.
    """
    max_length = min(max_length, max_cap)  # Apply cap
    X_list, y_list = [], []
    
    for p_id, group in data.groupby(player_column):
        player_features = group.drop(columns=[player_column]).values
        player_target = target[group.index].values

        for i in range(1, len(player_features)):
            seq = player_features[:i]  # up to i-1
            X_list.append(seq)
            y_list.append(player_target[i])  # target at i

    num_features = X_list[0].shape[1] if X_list else 0
    print(f"Creating padded array: {len(X_list)} sequences, max length: {max_length}, features: {num_features}")
    X_padded = np.zeros((len(X_list), max_length, num_features), dtype=np.float32)

    for i, seq in enumerate(X_list):
        seq_len = len(seq)
        if seq_len <= max_length:
            X_padded[i, max_length - seq_len:, :] = seq
        else:
            X_padded[i, :, :] = seq[-max_length:]

    return X_padded, np.array(y_list)

# ------------------------------------------------------------
# 5) RBF Model Builder
# ------------------------------------------------------------
def build_rbf_model(input_dim, rbf_units=20, gamma=1.0):
    """
    Build a simple RBF Network:
      - Flattened input -> RBF layer -> Dense(1)
    """
    model = Sequential()
    model.add(RBFLayer(units=rbf_units, gamma=gamma, input_shape=(input_dim,)))
    # Output
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

In [None]:
# ------------------------------------------------------------
# 6) Rolling/Expanding Window Validation
# ------------------------------------------------------------
df["SEASON"] = df["SEASON"].astype(int)
available_years = sorted(df['SEASON'].unique())
print("Available Years in Data:", available_years)

training_window = 3
mae_scores = []
rmse_scores = []
years_tested = []

for validate_year in available_years:
    start_train_year = validate_year - training_window
    if start_train_year < available_years[0]:
        print(f"Skipping {validate_year}: Not enough training years.")
        continue
    if not all(y_ in available_years for y_ in range(start_train_year, validate_year)):
        print(f"Skipping {validate_year}: Missing intermediate years in training window.")
        continue

    train_mask = (df['SEASON'] >= start_train_year) & (df['SEASON'] < validate_year)
    val_mask   = (df['SEASON'] == validate_year)

    train_data = df[train_mask]
    val_data   = df[val_mask]

    print(f"Validate Year: {validate_year}, Train Years: {start_train_year} to {validate_year-1}")
    print(f"Training Data Length: {len(train_data)}, Validation Data Length: {len(val_data)}")

    if len(train_data) == 0 or len(val_data) == 0:
        print(f"Skipping {validate_year}: Insufficient training or validation data.")
        continue

    scaler = MinMaxScaler()
    scaled_features_train = scaler.fit_transform(train_data[features].drop(columns=["PLAYER_ID"]))
    scaled_features_val   = scaler.transform(val_data[features].drop(columns=["PLAYER_ID"]))

    train_scaled = pd.DataFrame(scaled_features_train, index=train_data.index, columns=features[1:])
    train_scaled["PLAYER_ID"] = train_data["PLAYER_ID"].values

    val_scaled = pd.DataFrame(scaled_features_val, index=val_data.index, columns=features[1:])
    val_scaled["PLAYER_ID"] = val_data["PLAYER_ID"].values

    max_len_train = len(train_scaled)
    max_len_val   = len(val_scaled)

    X_train_3D, y_train = create_player_sequences_fixed_length(
        train_scaled, train_data[target], "PLAYER_ID", max_len_train
    )
    X_val_3D, y_val = create_player_sequences_fixed_length(
        val_scaled, val_data[target], "PLAYER_ID", max_len_val
    )

    print(f"Train Sequences Shape: {X_train_3D.shape}, Validation Sequences Shape: {X_val_3D.shape}")

    X_train = X_train_3D.reshape((X_train_3D.shape[0], -1))
    X_val   = X_val_3D.reshape((X_val_3D.shape[0], -1))

    input_dim = X_train.shape[1]

    model = build_rbf_model(input_dim=input_dim, rbf_units=30, gamma=0.1)

    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=50, batch_size=32,
        verbose=1,
        callbacks=[early_stop]
    )

    y_pred = model.predict(X_val).flatten()

    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)

    mae_scores.append(mae)
    rmse_scores.append(rmse)
    years_tested.append(validate_year)

    print(f"Validation Year: {validate_year}")
    print(f"Train Years: {start_train_year} to {validate_year-1}")
    print(f"MAE:  {mae:.2f}")
    print(f"RMSE: {rmse:.2f}\n")


Available Years in Data: [2019, 2020, 2021, 2022]
Skipping 2019: Not enough training years.
Skipping 2020: Not enough training years.
Skipping 2021: Not enough training years.
Validate Year: 2022, Train Years: 2019 to 2021
Training Data Length: 195593, Validation Data Length: 51796
Creating padded array: 194818 sequences, max length: 100, features: 42
Creating padded array: 51257 sequences, max length: 100, features: 42
Train Sequences Shape: (194818, 100, 42), Validation Sequences Shape: (51257, 100, 42)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
1152/6089 [====>.........................] - ETA: 6s - loss: 20.1799 - mae: 3.1594 

In [None]:
# import pandas as pd
# import numpy as np
# from sqlalchemy import create_engine
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.metrics import mean_absolute_error, mean_squared_error

# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.callbacks import EarlyStopping

# # ------------------------------------------------------------
# # 1) Custom RBF Layer (as shown above)
# # ------------------------------------------------------------
# class RBFLayer(tf.keras.layers.Layer):
#     def __init__(self, units, gamma=1.0, **kwargs):
#         super().__init__(**kwargs)
#         self.units = units
#         self.gamma = gamma

#     def build(self, input_shape):
#         features = input_shape[-1]
#         self.centers = self.add_weight(name='centers',
#                                        shape=(self.units, features),
#                                        initializer='glorot_uniform',
#                                        trainable=True)
#         self.betas = self.add_weight(name='betas',
#                                      shape=(self.units,),
#                                      initializer='ones',
#                                      trainable=True)
#         super().build(input_shape)

#     def call(self, inputs):
#         expanded_inputs = tf.expand_dims(inputs, axis=1)
#         expanded_centers = tf.expand_dims(self.centers, axis=0)
#         distances = tf.reduce_sum(tf.square(expanded_inputs - expanded_centers), axis=-1)
#         rbfs = tf.exp(-self.gamma * tf.expand_dims(self.betas, 0) * distances)
#         return rbfs

#     def compute_output_shape(self, input_shape):
#         return (input_shape[0], self.units)


# # ------------------------------------------------------------
# # 2) Configuration
# # ------------------------------------------------------------
# DB_NAME = "../../copy_nba_box_scores.db"
# DB_URI = f"sqlite:///{DB_NAME}"
# engine = create_engine(DB_URI, echo=False)

# # ------------------------------------------------------------
# # 3) Load Data & Sort
# # ------------------------------------------------------------
# df = pd.read_sql("SELECT * FROM player_game_features", engine)

# # Ensure data is sorted by player and date
# df = df.sort_values(by=["player_id", "game_date"])

# # Extract the season or year from 'game_date'
# df['game_year'] = pd.to_datetime(df['game_date']).dt.year

# # Features and target
# features = ["player_id", "pts", "min", "fgm", "fga", "pts_per_min", "fg_pct"]
# target = "pts"

# df = df.dropna(subset=features + [target])

# X = df[features]
# y = df[target]

# # ------------------------------------------------------------
# # 4) Helper Function: Create Sequences (Same as Before)
# # ------------------------------------------------------------
# def create_player_sequences_fixed_length(data, target, player_column, max_length):
#     """
#     Create sequences of all past games for each player, then pad them to 'max_length'.
#     """
#     X_list, y_list = [], []
    
#     for p_id, group in data.groupby(player_column):
#         player_features = group.drop(columns=[player_column]).values
#         player_target = target[group.index].values

#         # Build sequences from length=1 up to the current index
#         for i in range(1, len(player_features)):
#             seq = player_features[:i]  # up to i-1
#             X_list.append(seq)
#             y_list.append(player_target[i])  # target at i

#     num_features = X_list[0].shape[1] if X_list else 0
#     X_padded = np.zeros((len(X_list), max_length, num_features), dtype=np.float32)

#     for i, seq in enumerate(X_list):
#         seq_len = len(seq)
#         if seq_len <= max_length:
#             X_padded[i, max_length - seq_len:, :] = seq
#         else:
#             X_padded[i, :, :] = seq[-max_length:]

#     return X_padded, np.array(y_list)

# # ------------------------------------------------------------
# # 5) RBF Model Builder
# # ------------------------------------------------------------
# def build_rbf_model(input_dim, rbf_units=20, gamma=1.0):
#     """
#     Build a simple RBF Network:
#       - Flattened input -> RBF layer -> Dense(1)
#     """
#     model = Sequential()
#     model.add(RBFLayer(units=rbf_units, gamma=gamma, input_shape=(input_dim,)))
#     # Output
#     model.add(Dense(1))
#     model.compile(optimizer='adam', loss='mse', metrics=['mae'])
#     return model

# # ------------------------------------------------------------
# # 6) Rolling/Expanding Window Validation
# # ------------------------------------------------------------
# available_years = sorted(df['game_year'].unique())
# print("Available Years in Data:", available_years)

# training_window = 4
# mae_scores = []
# rmse_scores = []
# years_tested = []

# for validate_year in available_years:
#     start_train_year = validate_year - training_window
#     if start_train_year < available_years[0]:
#         continue
#     if not all(y_ in available_years for y_ in range(start_train_year, validate_year)):
#         continue

#     train_mask = (df['game_year'] >= start_train_year) & (df['game_year'] < validate_year)
#     val_mask   = (df['game_year'] == validate_year)

#     train_data = df[train_mask]
#     val_data   = df[val_mask]

#     if len(train_data) == 0 or len(val_data) == 0:
#         continue

#     # Scale only the feature columns except "player_id"
#     scaler = MinMaxScaler()
#     scaled_features_train = scaler.fit_transform(train_data[features].drop(columns=["player_id"]))
#     scaled_features_val   = scaler.transform(val_data[features].drop(columns=["player_id"]))

#     train_scaled = pd.DataFrame(scaled_features_train, index=train_data.index, columns=features[1:])
#     train_scaled["player_id"] = train_data["player_id"].values

#     val_scaled = pd.DataFrame(scaled_features_val, index=val_data.index, columns=features[1:])
#     val_scaled["player_id"] = val_data["player_id"].values

#     def find_player_longest_sequence(data_df, id_col="player_id"):
#         max_len = 0
#         for _, group in data_df.groupby(id_col):
#             length = len(group)
#             max_len = max(max_len, length - 1)
#         return max_len

#     max_len_train = find_player_longest_sequence(train_scaled, "player_id")
#     max_len_val   = find_player_longest_sequence(val_scaled, "player_id")
#     max_len_both  = max(max_len_train, max_len_val)
#     if max_len_both < 1:
#         continue

#     # Create sequences for RBF
#     X_train_3D, y_train = create_player_sequences_fixed_length(
#         train_scaled, train_data[target], "player_id", max_len_both
#     )
#     X_val_3D, y_val = create_player_sequences_fixed_length(
#         val_scaled, val_data[target], "player_id", max_len_both
#     )

#     if len(X_train_3D) == 0 or len(X_val_3D) == 0:
#         continue

#     # RBFN requires 2D input: flatten [batch, timesteps, features] -> [batch, timesteps*features]
#     X_train = X_train_3D.reshape((X_train_3D.shape[0], -1))
#     X_val   = X_val_3D.reshape((X_val_3D.shape[0], -1))

#     input_dim = X_train.shape[1]  # timesteps * features

#     # Build RBF model
#     model = build_rbf_model(input_dim=input_dim, rbf_units=30, gamma=0.1)

#     # Basic early stopping
#     early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

#     # Train
#     history = model.fit(
#         X_train, y_train,
#         validation_data=(X_val, y_val),
#         epochs=50, batch_size=32,
#         verbose=1,
#         callbacks=[early_stop]
#     )

#     # Predict
#     y_pred = model.predict(X_val).flatten()

#     mae = mean_absolute_error(y_val, y_pred)
#     mse = mean_squared_error(y_val, y_pred)
#     rmse = np.sqrt(mse)

#     mae_scores.append(mae)
#     rmse_scores.append(rmse)
#     years_tested.append(validate_year)

#     print(f"Validation Year: {validate_year}")
#     print(f"Train Years: {start_train_year} to {validate_year-1}")
#     print(f"MAE:  {mae:.2f}")
#     print(f"RMSE: {rmse:.2f}\n")

# print("Years Tested:", years_tested)
# print("MAE Scores:", mae_scores)
# print("RMSE Scores:", rmse_scores)
