In [10]:
import sqlite3

DB_NAME = "../nba_data.db"
connection = sqlite3.connect(DB_NAME)

# Query to list all tables in the database
cursor = connection.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
print("Tables in the database:", tables)

connection.close()


Tables in the database: [('player_game_data',), ('player_game_features',)]


In [11]:
import pandas as pd
from sqlalchemy import create_engine
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
import numpy as np

# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
DB_NAME = "../nba_data.db"
DB_URI = f"sqlite:///{DB_NAME}"
engine = create_engine(DB_URI, echo=False)

# ------------------------------------------------------------
# Load Data
# ------------------------------------------------------------
df = pd.read_sql("SELECT * FROM player_game_features", engine)

# Sort by player_id and game_date to maintain chronological order per player
df = df.sort_values(by=["player_id", "game_date"])

# We'll use the same features as before
features = [
    "rolling_pts_5",
    "rolling_min_5",
    "rolling_fg_pct_5",
    "rolling_ppm_5",
    "rolling_fgm_5",
    "rolling_fga_5",
    "reb",
    "ast"
]

# Drop rows with NaNs in features or target
df = df.dropna(subset=features + ["pts"])

X = df[features]
y = df["pts"]

# ------------------------------------------------------------
# Time-Based Split
# ------------------------------------------------------------
# Use the first 80% of the data as "past" and the last 20% as "future".
split_index = int(len(df) * 0.8)
X_train, X_val = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_val = y.iloc[:split_index], y.iloc[split_index:]

# ------------------------------------------------------------
# Hyperparameter Tuning for Ensemble Models
# ------------------------------------------------------------
def tune_and_evaluate(model, model_name, param_dist=None):
    print(f"\n===== {model_name} =====")
    if param_dist:
        # RandomizedSearchCV for hyperparameter tuning
        random_search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_dist,
            n_iter=20,
            cv=3,
            n_jobs=-1,
            random_state=42,
            verbose=1
        )
        random_search.fit(X_train, y_train)
        model = random_search.best_estimator_
        print(f"Best Parameters: {random_search.best_params_}")
    else:
        model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_val)
    
    # Metrics
    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)

    print("Time-based validation results:")
    print(f"MAE: {mae:.2f}")
    print(f"MSE: {mse:.2f}")
    print(f"RMSE: {rmse:.2f}\n")


# ------------------------------------------------------------
# Model Definitions and Hyperparameter Distributions
# ------------------------------------------------------------
# RandomForest Hyperparameter Distribution
rf_param_dist = {
    'n_estimators': randint(50, 300),
    'max_depth': randint(5, 30),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 4),
    'max_features': ['auto', 'sqrt', 'log2']
}

# GradientBoosting Hyperparameter Distribution
gb_param_dist = {
    'n_estimators': randint(50, 300),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 4),
    'subsample': uniform(0.7, 0.3)
}

# ------------------------------------------------------------
# Evaluate Models
# ------------------------------------------------------------
# Linear Regression (no hyperparameters to tune)
tune_and_evaluate(LinearRegression(), "Linear Regression")

# Random Forest with Hyperparameter Tuning
tune_and_evaluate(RandomForestRegressor(random_state=42), "Random Forest", rf_param_dist)

# Gradient Boosting with Hyperparameter Tuning
tune_and_evaluate(GradientBoostingRegressor(random_state=42), "Gradient Boosting", gb_param_dist)



===== Linear Regression =====
Time-based validation results:
MAE: 3.24
MSE: 19.35
RMSE: 4.40


===== Random Forest =====
Fitting 3 folds for each of 20 candidates, totalling 60 fits


18 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Arjun\anaconda3\envs\NBA\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Arjun\anaconda3\envs\NBA\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "C:\Users\Arjun\anaconda3\envs\NBA\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
        self

Best Parameters: {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 7, 'n_estimators': 103}
Time-based validation results:
MAE: 3.18
MSE: 18.85
RMSE: 4.34


===== Gradient Boosting =====
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'learning_rate': np.float64(0.023999698964084628), 'max_depth': 6, 'min_samples_leaf': 3, 'min_samples_split': 7, 'n_estimators': 239, 'subsample': np.float64(0.935552788417904)}
Time-based validation results:
MAE: 3.16
MSE: 18.53
RMSE: 4.30



In [12]:
# Retrain Random Forest
rf_best = RandomForestRegressor(
    max_depth=12,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=7,
    n_estimators=103,
    random_state=42
)
rf_best.fit(X_train, y_train)

# Retrain Gradient Boosting
gb_best = GradientBoostingRegressor(
    learning_rate=0.024,
    max_depth=6,
    min_samples_leaf=3,
    min_samples_split=7,
    n_estimators=239,
    subsample=0.936,
    random_state=42
)
gb_best.fit(X_train, y_train)


In [13]:
# Random Forest predictions
rf_preds = rf_best.predict(X_val)
rf_mae = mean_absolute_error(y_val, rf_preds)
rf_rmse = np.sqrt(mean_squared_error(y_val, rf_preds))

print("Random Forest - Final Validation Results:")
print(f"MAE: {rf_mae:.2f}, RMSE: {rf_rmse:.2f}")

# Gradient Boosting predictions
gb_preds = gb_best.predict(X_val)
gb_mae = mean_absolute_error(y_val, gb_preds)
gb_rmse = np.sqrt(mean_squared_error(y_val, gb_preds))

print("Gradient Boosting - Final Validation Results:")
print(f"MAE: {gb_mae:.2f}, RMSE: {gb_rmse:.2f}")


Random Forest - Final Validation Results:
MAE: 3.18, RMSE: 4.34
Gradient Boosting - Final Validation Results:
MAE: 3.16, RMSE: 4.30
