In [1]:
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
DB_NAME = "nba_data.db"
DB_URI = f"sqlite:///{DB_NAME}"
engine = create_engine(DB_URI, echo=False)

In [3]:
# ------------------------------------------------------------
# 1. Load Data
# ------------------------------------------------------------
row_count = pd.read_sql("SELECT COUNT(*) AS count FROM player_game_features;", engine)
print(f"Row count: {row_count['count'][0]}")
query = "SELECT * FROM player_game_features;"
df = pd.read_sql(query, engine)
print(df.head())

# For simplicity, let's assume we want to predict 'pts' using some of the rolling averages and efficiency metrics we created.
# Features (X) could be:
# 'rolling_pts_5', 'rolling_min_5', 'rolling_fg_pct_5', 'rolling_ppm_5', 'rolling_fgm_5', 'rolling_fga_5', 'reb', 'ast'
# Target (y) = 'pts'
features = [
    "rolling_pts_5",
    "rolling_min_5",
    "rolling_fg_pct_5",
    "rolling_ppm_5",
    "rolling_fgm_5",
    "rolling_fga_5",
    "reb",
    "ast"
]

# Drop rows where these features might be NaN (first few games of each player might not have full rolling windows)
df = df.dropna(subset=features + ["pts"])

X = df[features]
y = df["pts"]

Row count: 201805
     player_name  player_id     game_id                   game_date  \
0  Kevin Garnett        708  0021500017  2015-10-28 00:00:00.000000   
1  Kevin Garnett        708  0021500029  2015-10-30 00:00:00.000000   
2  Kevin Garnett        708  0021500050  2015-11-02 00:00:00.000000   
3  Kevin Garnett        708  0021500071  2015-11-05 00:00:00.000000   
4  Kevin Garnett        708  0021500085  2015-11-07 00:00:00.000000   

      team_id      matchup  pts        min  fgm  fga  ...  trailing_pts_5  \
0  1610612750    MIN @ LAL    4  12.886667    2    4  ...             NaN   
1  1610612750    MIN @ DEN    4  22.316667    1    3  ...        4.000000   
2  1610612750  MIN vs. POR    0  16.583333    0    3  ...        4.000000   
3  1610612750  MIN vs. MIA    0  11.266667    0    1  ...        2.666667   
4  1610612750    MIN @ CHI    2  12.263333    1    2  ...        2.000000   

   trailing_min_5 trailing_fgm_5 trailing_fga_5  trailing_reb_5  \
0             NaN        

KeyError: ['rolling_pts_5', 'rolling_min_5', 'rolling_fg_pct_5', 'rolling_ppm_5', 'rolling_fgm_5', 'rolling_fga_5']

In [None]:
# ------------------------------------------------------------
# 2. Split Data into Train and Test
# ------------------------------------------------------------
# We'll do a simple random split. More sophisticated approaches might respect time (train on past, test on future),
# but for a first pass, this is fine.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# ------------------------------------------------------------
# 3. Train a Simple Model
# ------------------------------------------------------------
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# ------------------------------------------------------------
# 4. Evaluate the Model
# ------------------------------------------------------------
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print("Model Evaluation:")
print(f"MAE:  {mae:.2f}")
print(f"MSE:  {mse:.2f}")
print(f"RMSE: {rmse:.2f}")

In [None]:
# ------------------------------------------------------------
# 5. Interpretation and Next Steps
# ------------------------------------------------------------
# At this point, you've got a baseline model. The results (MAE, MSE, RMSE) tell you how far off the predictions are.
# You can try:
# - Adding more features (opponent strength, rest days)
# - Trying a more advanced model (RandomForest, Gradient Boosting, Neural Network)
# - Using time-series validation instead of a simple random split for more realistic evaluation.