In [1]:
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
DB_NAME = "nba_data.db"
DB_URI = f"sqlite:///{DB_NAME}"
engine = create_engine(DB_URI, echo=False)

In [3]:
# ------------------------------------------------------------
# 1. Load Data
# ------------------------------------------------------------
query = "SELECT * FROM player_game_features;"
df = pd.read_sql(query, engine)

# Ensure the data is sorted by player and game date
# Replace 'player_id' and 'game_date' with your actual column names
df = df.sort_values(by=["player_id", "game_date"])

# Features (X) and target (y)
features = [
    "rolling_pts_5",
    "rolling_min_5",
    "rolling_fg_pct_5",
    "rolling_ppm_5",
    "rolling_fgm_5",
    "rolling_fga_5",
    "reb",
    "ast"
]

# Drop rows where these features might be NaN (first few games of each player might not have full rolling windows)
df = df.dropna(subset=features + ["pts"])
X = df[features]
y = df["pts"]

In [None]:
# ------------------------------------------------------------
# 2. Time-Series Validation (Cross-Validation Approach)
# ------------------------------------------------------------

# Initialize the time-series split
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

# Models to test
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "SVR": SVR()
}

# Metrics to track performance across models and folds
model_metrics = {model_name: [] for model_name in models.keys()}

for model_name, model in models.items():
    print(f"\nTesting Model: {model_name}")
    
    for fold, (train_index, test_index) in enumerate(tscv.split(X)):
        print(f"  Fold {fold + 1}/{n_splits}")
        
        # Train and Test split for this fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the model
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Evaluate the model
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = mse ** 0.5

        model_metrics[model_name].append({"Fold": fold + 1, "MAE": mae, "MSE": mse, "RMSE": rmse})
        print(f"    MAE: {mae:.2f}, MSE: {mse:.2f}, RMSE: {rmse:.2f}")


Testing Model: LinearRegression
  Fold 1/5
    MAE: 4.06, MSE: 28.54, RMSE: 5.34
  Fold 2/5
    MAE: 3.63, MSE: 22.92, RMSE: 4.79
  Fold 3/5
    MAE: 3.67, MSE: 23.49, RMSE: 4.85
  Fold 4/5
    MAE: 3.47, MSE: 21.34, RMSE: 4.62
  Fold 5/5
    MAE: 3.37, MSE: 20.73, RMSE: 4.55

Testing Model: RandomForest
  Fold 1/5
    MAE: 4.17, MSE: 30.09, RMSE: 5.49
  Fold 2/5
    MAE: 3.72, MSE: 24.04, RMSE: 4.90
  Fold 3/5
    MAE: 3.73, MSE: 24.53, RMSE: 4.95
  Fold 4/5
    MAE: 3.50, MSE: 21.92, RMSE: 4.68
  Fold 5/5
    MAE: 3.38, MSE: 21.06, RMSE: 4.59

Testing Model: GradientBoosting
  Fold 1/5
    MAE: 4.04, MSE: 28.48, RMSE: 5.34
  Fold 2/5
    MAE: 3.59, MSE: 22.59, RMSE: 4.75
  Fold 3/5
    MAE: 3.61, MSE: 22.95, RMSE: 4.79
  Fold 4/5
    MAE: 3.40, MSE: 20.69, RMSE: 4.55
  Fold 5/5
    MAE: 3.29, MSE: 20.02, RMSE: 4.47

Testing Model: SVR
  Fold 1/5


In [None]:
# ------------------------------------------------------------
# 3. Evaluate and Compare Models
# ------------------------------------------------------------
print("\nModel Comparison:")
for model_name, metrics in model_metrics.items():
    avg_mae = sum(m["MAE"] for m in metrics) / len(metrics)
    avg_mse = sum(m["MSE"] for m in metrics) / len(metrics)
    avg_rmse = sum(m["RMSE"] for m in metrics) / len(metrics)
    print(f"{model_name}: Avg MAE: {avg_mae:.2f}, Avg MSE: {avg_mse:.2f}, Avg RMSE: {avg_rmse:.2f}")