In [None]:
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.linear_model._logistic import LogisticRegression
from datetime import datetime as dt
from sklearn.model_selection import TimeSeriesSplit 
from sklearn.metrics import r2_score,make_scorer,mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_regression
from statsmodels.tsa.stattools import adfuller
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
from google.oauth2 import service_account

import joblib
import pickle
import pandas as pd
import pandas_gbq
import numpy as np
import xgboost
import lightgbm
import os
import unicodedata

In [None]:
def clean_player_name(name):
    """Normalize a player name so the same player matches across data sources.

    The name is lowercased, trimmed, stripped of periods and of accents /
    non-ASCII characters, and finally mapped through a table of known
    alternate spellings.

    Args:
        name: Raw player name. Non-string values (e.g. ``None`` or ``NaN``
            coming from a missing cell) are returned unchanged.

    Returns:
        The cleaned, canonical name, or ``name`` itself when it is not a string.
    """
    # Missing values arrive as None/NaN when this is applied over a DataFrame
    # column; pass them through instead of raising AttributeError on .lower().
    if not isinstance(name, str):
        return name

    name = name.lower().strip()  # Lowercase and remove surrounding whitespace
    name = name.replace(".", "")  # Remove periods ("Jr." -> "jr")
    # NFKD splits accented letters into base letter + combining mark; encoding
    # to ASCII with 'ignore' then drops the marks and any other non-ASCII char.
    name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('utf-8')

    # Note: apostrophes and dashes are NOT removed; only periods are.
    # Known name variations (add more as needed)
    name_corrections = {
        "alexandre sarr": "alex sarr",
        "jimmy butler": "jimmy butler iii",
        "nicolas claxton": "nic claxton",
        "kenyon martin jr": "kj martin",
        "carlton carrington": "bub carrington",
        "ron holland ii": "ronald holland ii",
        'cameron thomas':'cam thomas'
    }

    # Fall back to the cleaned name when no correction is registered
    return name_corrections.get(name, name)

### Gathering Data and Feature Engineering

In [None]:


# Both tables are read in full, oldest game first. Per the original note, the
# rolling columns use shifted windows to prevent data leakage (that shifting is
# not visible in this notebook).
player_query = (
    " \n"
    "SELECT *\n"
    "from `capstone_data.player_modeling_data_partitioned`\n"
    "order by game_date asc\n"
)

team_query = (
    "\n"
    "SELECT *\n"
    "from `capstone_data.team_modeling_data_partitioned`\n"
    "order by game_date asc\n"
)


In [None]:
# Location of the service-account key. Set the SCRAPING_KEY_PATH environment
# variable to point somewhere else; the default is the original location.
scraping_key_path = os.environ.get("SCRAPING_KEY_PATH", "/home/aportra99/scraping_key.json")

try:
    credentials = service_account.Credentials.from_service_account_file(scraping_key_path)
    local = False
    print("Credentials file loaded.")
except FileNotFoundError:
    # No key file at that path: pass credentials=None to the BigQuery reads below
    # so they run with whatever default credentials the environment provides.
    local = True
    credentials = None
    print("Running with default credentials.")

In [None]:
def _read_bigquery_table(query):
    """Run ``query`` in the project and return the result wrapped in a DataFrame."""
    result = pandas_gbq.read_gbq(
        query,
        project_id='miscellaneous-projects-444203',
        progress_bar_type='tqdm',
        credentials=credentials,
    )
    return pd.DataFrame(result)


nba_player_data = _read_bigquery_table(player_query)
team_data = _read_bigquery_table(team_query)

# Self-join on game_id pairs every team row with every team row of the same game
# (the right-hand copy gets the "_opponent" suffix); keeping only pairs with
# different team ids leaves each team next to its opponent's columns.
team_pairs = team_data.merge(team_data, on='game_id', suffixes=('', "_opponent"))
team_data = team_pairs[team_pairs["team_id"] != team_pairs["team_id_opponent"]]

# Attach the team/opponent columns to each player row. Columns present on both
# sides come back from the team side with a 'remove' suffix and are discarded.
full_data = nba_player_data.merge(team_data, on=['game_id', 'team'], how='inner', suffixes=('', 'remove'))
duplicated_columns = [column for column in full_data.columns if 'remove' in column]
full_data = full_data.drop(duplicated_columns, axis=1)

# NOTE(review): this is a substring match, so it drops every column whose name
# contains "_1" anywhere (a name containing "_10" would match too) - confirm intended.
numbered_columns = [column for column in full_data.columns if '_1' in column]
full_data = full_data.drop(numbered_columns, axis=1)


In [None]:
# Order the rows by game date, then drop every COLUMN (axis=1) that contains
# at least one missing value.
data_ordered = full_data.sort_values('game_date').dropna(axis=1)


#### Feature Engineering Ideas 

* (ratio of 3pa and fga and 3pm and 3pa) TS% for players efg% 
* for players assist_to_turnover ratio assist ratio, 
* rebound_chance, defensive reb %, 
* ast_ratio_season * pace, 
* home * pts season - data pts 3pm avg,
* cold_streak pts_3gm_avg < pts_season boolean, 
* away difficulty away * opponent_defrtg_3gm_avg,
* home_performance = data_ordered[data_ordered["home"] == 1].groupby("team")["pts_season"].mean()
* away_performance = data_ordered[data_ordered["away"] == 1].groupby("team")["pts_season"].mean() these would be to see how the team performance changes 


In [None]:
def _skip_first_three(player_season_rows):
    """Return the group's rows without its first three (in current row order)."""
    return player_season_rows.iloc[3:]


# Discard the first three rows of every (player, season) group.
# Presumably the 3-game rolling features are not fully populated for those
# games - TODO confirm.
data_ordered = (
    data_ordered
    .groupby(['player', 'season'])
    .apply(_skip_first_three)
    .reset_index(drop=True)
)

In [None]:
# The groupby above reorders rows by group key; restore game-date order.
data_ordered = data_ordered.sort_values(by='game_date')

In [None]:
# Parse game_date into pandas datetimes so date arithmetic (.dt) works below.
data_ordered = data_ordered.assign(game_date=pd.to_datetime(data_ordered['game_date']))

In [None]:
# Age of every game in whole days, measured from the most recent game in the data.
game_age = data_ordered['game_date'].max() - data_ordered['game_date']
data_ordered['days_ago'] = game_age.dt.days

# Weight is 1 for the most recent date and shrinks logarithmically with age.
data_ordered['time_decay_weight'] = 1 / (1 + np.log(1 + data_ordered['days_ago']))

In [None]:
# Effectively remove the column limit when displaying wide frames.
pd.options.display.max_columns = 100000

In [None]:
# Remove the stray 'Unnamed: 0' column when it exists; otherwise just report it.
if 'Unnamed: 0' in data_ordered.columns:
    data_ordered = data_ordered.drop(columns='Unnamed: 0')
else:
    print('Irregular column not made')

In [None]:
# Replace missing values with the column mean. Means are computed for numeric
# columns only, so fillna leaves every other column untouched.
numeric_means = data_ordered.select_dtypes(include=['number']).mean()
data_ordered = data_ordered.fillna(numeric_means)


In [None]:
# Columns that are filtered out of the candidate predictors: the prediction
# targets and same-game stats, identifiers, the recency columns created above,
# and the *_rank columns.
non_feature_columns = {
    'pts', 'reb', 'ast', 'blk', 'stl', '3pm',
    'game_id', 'game_date', 'days_ago', 'time_decay_weight', 'team_id',
    "gp_rank", "w_rank", "l_rank", "w_pct_rank", "min_rank", "fgm_rank",
    "fga_rank", "fg_pct_rank", "fg3m_rank", "fg3a_rank", "fg3_pct_rank",
    "ftm_rank", "fta_rank", "ft_pct_rank", "oreb_rank", "dreb_rank",
    "reb_rank", "ast_rank", "tov_rank", "stl_rank", "blk_rank",
    "blka_rank", "pf_rank", "pfd_rank", "pts_rank", "plus_minus_rank",
}

# A candidate must carry one of these substrings in its name.
feature_keywords = ("3gm_avg", "season", "momentum")

numeric_columns = [
    column
    for column in data_ordered.select_dtypes(include=['number']).columns.tolist()
    if column not in non_feature_columns
    and any(keyword in column for keyword in feature_keywords)
]

# Selected feature names per target; filled in by the feature-selection cell.
features = {target: [] for target in ('pts', 'reb', 'ast', '3pm')}

In [None]:
# The first 80% of the rows (in their current, date-sorted order) are used for
# training; the remaining rows are held out for testing.
split_index = int(len(data_ordered) * .80)

train_data, test_data = data_ordered.iloc[:split_index], data_ordered.iloc[split_index:]

In [None]:


# Keep, for every target, the candidate columns whose mutual information with the
# target (estimated on the training rows only) exceeds 0.10.
for category in features.keys():
    x = train_data[numeric_columns]
    y = train_data[category]

    # Without a fixed random_state the estimate varies from run to run, so the
    # selected feature set would not be reproducible.
    mi_scores = mutual_info_regression(x, y, random_state=42)
    mi_scores = pd.Series(mi_scores, index=numeric_columns)
    selected_features = mi_scores[mi_scores > 0.10].index.tolist()

    features[category] = selected_features


In [None]:
# Time-series cross-validation splitter configured with 5 splits.
tscv = TimeSeriesSplit(n_splits=5)

In [None]:
# One independent, initially empty dict per target: fitted estimators go into
# saved_models and their test metrics into saved_results, keyed by model name.
saved_models = {target: dict() for target in ('pts', 'reb', 'ast', '3pm')}
saved_results = {target: dict() for target in ('pts', 'reb', 'ast', '3pm')}

#### SHAP
Applying shap to help reduce collinearity

### Linear Model

In [None]:

# Fit one ordinary least squares model per target on the training rows and score
# it on the held-out rows.
for category in features.keys():

    # Guard against the target itself appearing among its own predictors
    features_list = [f for f in features[category] if f != category]
    print(len(features_list))
    x_train,y_train = train_data[features_list],train_data[category]
    x_test, y_test = test_data[features_list],test_data[category]
    linear_model = LinearRegression()

    linear_model.fit(x_train,y_train)

    y_pred = linear_model.predict(x_test)
    r2 = r2_score(y_true=y_test,y_pred=y_pred)
    mse = mean_squared_error(y_true=y_test,y_pred=y_pred)
    print(category)
    print(r2)

    # Store plain floats. The previous `{'r2': {value}}` built one-element sets,
    # which showed up in the written report as "{0.53}" instead of "0.53".
    saved_results[category]['linear_model'] = {'r2': r2, 'mse': mse}
    saved_models[category]['linear_model'] = linear_model

In [None]:
# Same experiment as the linear model, with an L2-regularised regression (alpha=1).
# The R² is only printed; nothing is stored from this cell.
for category in features.keys():
    features_list = [name for name in features[category] if name != category]
    x_train = train_data[features_list]
    y_train = train_data[category]
    x_test = test_data[features_list]
    y_test = test_data[category]

    ridge_model = Ridge(alpha=1)
    ridge_model.fit(x_train, y_train)

    # Predictions next to the observed values, indexed like y_test
    output = pd.DataFrame({'prediction': ridge_model.predict(x_test), 'actual': y_test})
    print(category)
    print(r2_score(y_true=output['actual'], y_pred=output['prediction']))

In [None]:
from sklearn.model_selection import cross_val_score

# Compare a cross-validated R² on the training rows with the R² on the held-out rows.
for category in features.keys():
    features_list = features[category]

    x_train, y_train = train_data[features_list], train_data[category]
    x_test, y_test = test_data[features_list], test_data[category]

    linear_model = Ridge(alpha=1.0)  # Use Ridge instead of LinearRegression
    linear_model.fit(x_train, y_train)

    # The rows are in game-date order, so validate with TimeSeriesSplit: every
    # fold is scored by a model trained only on earlier rows. A plain `cv=5`
    # (unshuffled K-fold) would score most folds with a model fit on later games.
    cv_r2 = cross_val_score(
        linear_model, x_train, y_train, cv=TimeSeriesSplit(n_splits=5), scoring='r2'
    ).mean()

    y_pred = linear_model.predict(x_test)
    test_r2 = r2_score(y_test, y_pred)

    print(f"{category}: Cross-Val R² = {cv_r2:.4f}, Test R² = {test_r2:.4f}")


### LightGBM

In [None]:
from lightgbm import LGBMRegressor

# Base regressor whose hyper-parameters are tuned below
lgb_model = LGBMRegressor(n_estimators=1000, random_state=42, verbosity=-1)

# Candidate values sampled by the randomized search
param_grid = dict(
    num_leaves=[15, 31, 50, 75],
    learning_rate=[0.005, 0.01, 0.05],
    max_depth=[-1, 5, 10, 15],
    min_child_samples=[10, 20, 30],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Time-series splitter used as the search's cross-validation scheme
tscv = TimeSeriesSplit(n_splits=5)

# 40 sampled parameter combinations, each scored by R² across the tscv folds
random_search = RandomizedSearchCV(
    lgb_model,
    param_grid,
    n_iter=40,
    scoring='r2',
    cv=tscv,
    n_jobs=-1,
    verbose=0,
    random_state=42,
)

# Fit to your training data
# Best model + params


In [None]:
# Rebuild the 80/20 split (first 80% of the date-sorted rows for training).
split_index = int(len(data_ordered) * .80)

train_data = data_ordered.iloc[:split_index]
test_data = data_ordered[split_index:]

# Tune and fit one LightGBM regressor per target, then score it on the held-out rows.
for category in features.keys():
    x_train,y_train = train_data[features[category]],train_data[category]
    x_test,y_test = test_data[features[category]],test_data[category]

    random_search.fit(x_train,y_train)

    best_model = random_search.best_estimator_
    print(category)
    print("Best Parameters:", random_search.best_params_)

    y_pred = best_model.predict(x_test)

    mse = mean_squared_error(y_test,y_pred)
    r2 = r2_score(y_test,y_pred)

    saved_models[category]['lightgbm'] = best_model
    print(f'MSE: {mse}')
    print(f'R2: {r2}')

    # Store plain floats. The previous `{'r2': {value}}` built one-element sets,
    # which showed up in the written report as "{0.53}" instead of "0.53".
    saved_results[category]['lightgbm'] = {'r2': r2, 'mse': mse}


In [None]:
# Persist the fitted regressors (target -> model name -> estimator) to models.pkl.
joblib.dump(saved_models,'models.pkl')


In [None]:
# Build the plain-text report first, then write it in one go:
#   Category / Model / metric lines, with a blank line after every category.
report_lines = []
for category, models in saved_results.items():
    report_lines.append(f"Category: {category}\n")
    for model, metrics in models.items():
        report_lines.append(f"  Model: {model}\n")
        report_lines.extend(f"    {metric}: {value}\n" for metric, value in metrics.items())
    report_lines.append("\n")  # Newline between categories

with open('saved_performance.txt', 'w') as file:
    file.writelines(report_lines)


### Ensemble Modeling into Classification Model

In [None]:
# Ensemble modeling: reload the regressors persisted above.
# NOTE: joblib.load unpickles the file, so only load files this notebook produced.
saved_models = joblib.load('models.pkl')

# Split the nested dict into one {target: estimator} mapping per model family,
# skipping targets for which that family was not saved.
linear_models = {}
lightgbm_models = {}
for cat, fitted in saved_models.items():
    if 'linear_model' in fitted:
        linear_models[cat] = fitted['linear_model']
    if 'lightgbm' in fitted:
        lightgbm_models[cat] = fitted['lightgbm']

In [None]:
# Stack the two regressors: a Ridge meta-model learns how to weight the linear
# and LightGBM predictions for every target.
meta_models = {}
meta_results = {}

# Share of the stacking rows used to fit the model that produces the reported
# metrics; the remaining (later) rows are used to score it.
META_FIT_FRACTION = 0.8

for category in ['pts', 'reb', 'ast', '3pm']:
    # Load models
    lm = saved_models[category]['linear_model']
    lgb = saved_models[category]['lightgbm']

    # Each base model is fed exactly the columns it was fitted with
    lm_features_list = [f.strip() for f in lm.feature_names_in_]
    lgb_features_list = [f.strip() for f in lgb.feature_names_in_]

    lm_x_test = test_data[lm_features_list]
    lgb_x_test = test_data[lgb_features_list]
    y_test = test_data[category]

    # Get predictions
    preds_lm = lm.predict(lm_x_test)
    preds_lgb = lgb.predict(lgb_x_test)

    # Stack predictions into meta-model features: column 0 linear, column 1 LightGBM
    meta_X = np.column_stack([preds_lm, preds_lgb])
    meta_y = y_test.values

    # Persisted meta-model: fitted on every stacking row, as before
    meta_model = Ridge()
    meta_model.fit(meta_X, meta_y)

    # Reported metrics: scoring the meta-model on the rows it was fitted on is
    # optimistic, so fit a scoring copy on the earlier rows and evaluate it on
    # the later ones (test_data keeps the game-date order of data_ordered).
    cut = int(len(meta_y) * META_FIT_FRACTION)
    if 0 < cut < len(meta_y):
        scoring_model = Ridge().fit(meta_X[:cut], meta_y[:cut])
        eval_X, eval_y = meta_X[cut:], meta_y[cut:]
    else:
        # Too few rows to hold any out: fall back to in-sample metrics
        scoring_model = meta_model
        eval_X, eval_y = meta_X, meta_y

    meta_preds = scoring_model.predict(eval_X)
    r2 = r2_score(eval_y, meta_preds)
    mse = mean_squared_error(eval_y, meta_preds)

    print(f"{category} Meta-model R²: {r2:.4f}, MSE: {mse:.4f}")

    meta_models[category] = meta_model
    meta_results[category] = {'r2': r2, 'mse': mse}


In [None]:
# Persist the per-target Ridge meta-models to meta_model.pkl.
joblib.dump(meta_models,'meta_model.pkl')

In [None]:
# Flatten the meta-model weights into one dict: '<target>_lm' / '<target>_lgb'.
# NOTE(review): only coef_ is kept. If the Ridge meta-models were fitted with an
# intercept (the scikit-learn default), anything that combines predictions using
# these weights alone leaves intercept_ out - confirm that is intended.
coef = {}
for category in meta_models:
    # coef_ order follows the stacking column order: linear first, LightGBM second
    coef_linear, coef_lgbm = meta_models[category].coef_
    print(
        f"{category.upper()} Meta-Model Weights:\n"
        f"  Linear Model Weight:  {coef_linear:.4f}\n"
        f"  LightGBM Weight:      {coef_lgbm:.4f}"
    )

    coef.update({f'{category}_lm': coef_linear, f'{category}_lgb': coef_lgbm})

Ensemble Model into Classification Model

In [None]:
def compute_profit(pred, actual, odds):
    """Profit of a single bet priced with American odds.

    Negative odds: a win pays 100 and a loss costs ``abs(odds)``.
    Non-negative odds: a win pays ``odds`` and a loss costs 100.

    Args:
        pred: Predicted outcome (the side that was bet).
        actual: Observed outcome.
        odds: American odds of the side that was bet.

    Returns:
        The signed profit of the bet.
    """
    negative_odds = odds < 0
    if pred != actual:
        return -abs(odds) if negative_odds else -100
    return 100 if negative_odds else odds

In [None]:
cats = ['points','assists','rebounds','threes_made']
categories = ['pts','ast','reb','3pm']


def _load_outcomes(category):
    """Read the whole `<category>_outcome` table, oldest game first."""
    predictions_query = f"""
    select *
    from `capstone_data.{category}_outcome`
    order by game_date asc
    """
    return pandas_gbq.read_gbq(
        predictions_query,
        project_id='miscellaneous-projects-444203',
        credentials=credentials,
    )


# One outcome frame per stat category, keyed by the short category name
odds_data = {category: _load_outcomes(category) for category in categories}

In [None]:
# Most recent game date present in the modeling data (shown as the cell output).
data_ordered['game_date'].max()

In [None]:
# Player names must be normalised the same way on both sides of the join.
data_ordered['player'] = data_ordered['player'].map(clean_player_name)

# From here on full_data is a dict: category -> outcome rows joined with the
# modeling features of the same player and game date.
full_data = {}
for category in categories:
    odds = odds_data[category]

    # These two assignments modify the frame stored in odds_data in place
    odds['game_date'] = pd.to_datetime(odds['game_date'])
    odds['player'] = odds['player'].map(clean_player_name)

    # Inner join keeps only rows found on both sides; when several rows share a
    # (player, game_date) pair, only the first one is kept.
    full_data[category] = (
        odds
        .merge(data_ordered, on=['player', 'game_date'], how='inner')
        .drop_duplicates(subset=['player', 'game_date'], keep='first')
    )


In [None]:
# Combine the two stored predictions with the meta-model weights and compare the
# result with the column named by `cat`.
for cat,category in zip(cats,categories):
    frame = full_data[category]

    # Convert BOTH prediction columns to numeric before weighting them. The
    # LightGBM column used to be multiplied first and converted afterwards,
    # which fails when the column holds non-float values (e.g. strings).
    linear_prediction = pd.to_numeric(frame[f'{category}_linear_model'])
    lightgbm_prediction = pd.to_numeric(frame[f'{category}_lightgbm'])
    ensemble = (
        linear_prediction * coef[f'{category}_lm']
        + lightgbm_prediction * coef[f'{category}_lgb']
    )

    # Delta = `cat` column minus the ensemble prediction.
    # NOTE(review): the `cat` column ('points', 'assists', ...) is presumably the
    # sportsbook line - confirm against the outcome tables.
    full_data[category] = frame.assign(**{
        f'{category}_ensemble': ensemble,
        f'{category}_delta': frame[f'{cat}'] - ensemble,
    }).sort_values(by='game_date')

In [None]:
# Feature selection for the Over/Under classifiers: rank the numeric columns by
# mutual information with the result and keep the 15 best per category.
classification_features = {}
mi_feature_sets = {}
categories = ['pts', 'ast', 'reb', '3pm']

# Same-game box-score columns (including the _x/_y variants created by the merge)
box_score_cols = [
    'pts', 'pts_y', 'pts_x', 'ast_x', 'ast_y', 'reb_x', 'reb_y', 'reb', 'ast',
    'stl', 'blk', 'to', 'pf', 'min', 'fgm', 'fga', 'fg_pct',
    '3pm_y', 'fg3a', 'fg3_pct', 'ftm', 'fta', 'ft_pct',
    'oreb', 'dreb', 'plus_minus', '3pm', '3pm_x'
]
# Never offered as features: box-score columns, the label and descriptive columns
exclude_cols = box_score_cols + ['result', 'player', 'game_date', 'team', 'team_city', 'matchup', 'matchup_opponent']

# The MI estimate depends on the seed, so it is averaged over NUM_RUNS seeds
NUM_RUNS = 10
SEEDS = list(range(42, 42 + NUM_RUNS))

for category in categories:
    print(f"\n=== Processing category: {category.upper()} ===")
    data = full_data[category].copy()

    # Encode the label as 1/0 when it is stored as text (decided from the first row)
    first_result = data['result'].iloc[0]
    if isinstance(first_result, str):
        data['result'] = data['result'].map({'Over': 1, 'Under': 0})

    data = data.dropna(subset=['result'])

    # Candidate features: every numeric column that is not explicitly excluded
    numeric_features = [
        col
        for col in data.columns
        if col not in exclude_cols and pd.api.types.is_numeric_dtype(data[col])
    ]

    X = data[numeric_features].fillna(0)
    y = data['result']

    # Unshuffled split: the first 80% of the rows form the training part, and
    # only that part is used for scaling and scoring.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # One MI vector per seed, averaged feature-wise
    mi_matrix = [
        mutual_info_classif(X_train_scaled, y_train, random_state=seed)
        for seed in SEEDS
    ]
    mi_avg_scores = np.mean(mi_matrix, axis=0)

    mi_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Avg_MI_Score': mi_avg_scores
    }).sort_values(by='Avg_MI_Score', ascending=False)

    # The 15 highest-scoring features; both dicts receive the same list
    top_features = mi_df['Feature'].head(15).tolist()
    mi_feature_sets[category] = top_features
    classification_features[category] = top_features

    print(f"Top averaged MI features for {category.upper()}:")
    print(top_features)


In [None]:
from sklearn.base import clone

# Storage
results = {}
best_models = {}
label_map = {'Over': 1, 'Under': 0}

# Templates only: every category fits its own clone of these estimators.
models = {
    'RandomForest': RandomForestClassifier(n_estimators=150, max_depth=8, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'LightGBM': LGBMClassifier(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42),
    'Stacked': StackingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
            ('lgbm', LGBMClassifier(n_estimators=100, random_state=42)),
        ],
        final_estimator=LogisticRegression()
    )
}

# (over, under) probability cut-offs: 0.55/0.45, 0.60/0.40, ... 0.80/0.20
THRESHOLDS = [(0.5 + i * 0.05, 0.5 - i * 0.05) for i in range(1, 7)]
# A threshold pair is ignored when it triggers fewer bets than this
MIN_BETS = 25
SUMMARY_COLUMNS = ['Category', 'Model', 'Over_Threshold', 'Under_Threshold', 'ROI', 'Profit', 'Accuracy', 'Bets_Placed']
OUTPUT_PATH = 'classification_models.pkl'


def weighted_score(roi, profit, bets):
    """Blend ROI (weight 0.1), profit in thousands (0.5) and log bet volume (0.4)."""
    roi_score = roi
    profit_score = profit / 1000
    volume_score = np.log1p(bets)
    return roi_score * .1 + profit_score * 0.50 + volume_score * 0.40


def stake_for(odds):
    """Amount put at risk by one bet, matching the loss compute_profit charges."""
    return abs(odds) if odds < 0 else 100


def select_bets(y_prob, y_true, over_odds, under_odds, over_thresh, under_thresh):
    """Pick the bets a threshold pair triggers.

    Returns three parallel lists: the side bet (1 = Over, 0 = Under), the
    observed result and the odds of the side that was bet.
    """
    picks, actuals, odds_used = [], [], []
    for prob, actual, o, u in zip(y_prob, y_true, over_odds, under_odds):
        if prob >= over_thresh:
            picks.append(1)
            actuals.append(actual)
            odds_used.append(o)
        elif prob <= under_thresh:
            picks.append(0)
            actuals.append(actual)
            odds_used.append(u)
    return picks, actuals, odds_used


# compute_profit (American odds payout) is defined in an earlier cell of this
# notebook and is reused here instead of being defined a second time.

# Main loop
for category in ['pts', 'ast', 'reb', '3pm']:
    print(f"\n=== CATEGORY: {category.upper()} ===")
    best_entry = {'score': -float('inf')}  # replaced by the first qualifying result

    df = full_data[category].copy()
    df = df.dropna(subset=['result'])
    df['result'] = df['result'].map(label_map) if df['result'].dtype == object else df['result']

    x = df[classification_features[category]].fillna(0)
    y = df['result']
    over_odds = df['Over'].values
    under_odds = df['Under'].values

    # NOTE(review): this split is shuffled although the rows are time-ordered, so
    # the test rows are interleaved with the training period. It is left as is
    # because the overlap-check cell reproduces exactly these settings; consider
    # shuffle=False in both places.
    x_train, x_test, y_train, y_test, over_train, over_test, under_train, under_test = train_test_split(
        x, y, over_odds, under_odds, test_size=0.2, random_state=42
    )

    # cv='prefit' calibration needs rows the base model was NOT fitted on, so
    # part of the training rows is reserved for calibration only.
    x_fit, x_calib, y_fit, y_calib = train_test_split(
        x_train, y_train, test_size=0.25, random_state=42
    )

    for model_name, model in models.items():
        print(f"\n--- Model: {model_name} ---")
        # Fit a fresh clone: with cv='prefit' the calibrated wrapper keeps a
        # reference to the estimator it is given, so refitting the shared
        # template for the next category would silently change the models
        # already stored in best_models.
        base_model = clone(model)
        base_model.fit(x_fit, y_fit)
        calibrated_model = CalibratedClassifierCV(base_model, cv='prefit')
        calibrated_model.fit(x_calib, y_calib)
        y_prob = calibrated_model.predict_proba(x_test)[:, 1]

        print("Threshold_Over | Threshold_Under | Bets_Placed | Accuracy | Profit ($) | ROI (%)")
        print("-" * 75)

        for over_thresh, under_thresh in THRESHOLDS:
            bets, actuals, odds_used = select_bets(
                y_prob, y_test, over_test, under_test, over_thresh, under_thresh
            )

            if len(bets) < MIN_BETS:
                continue

            profits = [compute_profit(p, a, odd) for p, a, odd in zip(bets, actuals, odds_used)]
            total_profit = sum(profits)
            # Stake depends on the odds only, never on whether the bet won
            total_risk = sum(stake_for(odd) for odd in odds_used)
            roi = (total_profit / total_risk) * 100 if total_risk > 0 else 0
            accuracy = sum(p == a for p, a in zip(bets, actuals)) / len(bets)
            score = weighted_score(roi, total_profit, len(bets))

            print(f"    {over_thresh:.2f}     |     {under_thresh:.2f}     |    {len(bets):4}     |  {accuracy:.4f} |   {total_profit:7.2f}  |   {roi:6.2f}")

            # Use weighted score to track best
            if score > best_entry['score']:
                best_entry = {
                    'Model': model_name,
                    'Category': category.upper(),
                    'Over_Threshold': over_thresh,
                    'Under_Threshold': under_thresh,
                    'ROI': roi,
                    'Profit': total_profit,
                    'Accuracy': accuracy,
                    'Bets_Placed': len(bets),
                    'Fitted_Model': calibrated_model,
                    'score': score  # track score
                }

    best_models[category] = best_entry

# Final summary. reindex (instead of column selection) tolerates categories for
# which no threshold pair reached MIN_BETS; those rows are left out.
summary_df = pd.DataFrame.from_dict(best_models, orient='index').reset_index(drop=True)
summary_df = summary_df.reindex(columns=SUMMARY_COLUMNS).dropna(subset=['Category'])
summary_df = summary_df.sort_values(by='Category', ascending=True)

print("\n=== BEST MODELS BY CATEGORY ===")
print(summary_df.to_string(index=False))

# Save to .pkl
with open(OUTPUT_PATH, 'wb') as f:
    pickle.dump(best_models, f)

print(f"\nSaved best models and thresholds to '{OUTPUT_PATH}'")


In [None]:
# Categories
categories = ['pts', 'ast', 'reb', '3pm']

# --- STEP 1: Latest game date the regression models were trained on.
# Read-only: train_data is a slice of data_ordered, so nothing is written back
# into it (the previous column assignment triggered SettingWithCopyWarning).
regression_cutoff_date = pd.to_datetime(train_data['game_date']).max()

print(f"Latest date used in regression training (data_ordered): {regression_cutoff_date.date()}")

# --- STEP 2: Get Earliest Test Date per Category
test_start_dates = {}

for category in categories:
    # Same preparation as the classification training cell, WITHOUT re-sorting:
    # the shuffled split is positional, and sorting again may reorder rows that
    # share a date, so a different set of rows could land in the test part.
    df = full_data[category].dropna(subset=['result'])

    x = df[classification_features[category]].fillna(0)
    y = df['result']

    # Must mirror the split settings of the classification training cell
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    test_dates = df.loc[x_test.index, 'game_date']
    test_start_dates[category] = test_dates.min()

# --- STEP 3: Check for Overlap
print("\nChecking for potential overlap between regression training and classification test sets:\n")
for cat in categories:
    test_date = test_start_dates[cat]
    if regression_cutoff_date >= test_date:
        print(f"{cat.upper()}: OVERLAP detected — Regression trained up to {regression_cutoff_date.date()}, classification test starts {test_date.date()}")
    else:
        print(f"{cat.upper()}: Clean split — Regression ends {regression_cutoff_date.date()}, classification test starts {test_date.date()}")
