In [59]:
from __future__ import annotations

import os
import math
import numpy as np
import pandas as pd
import re

from typing import Tuple, List, Dict, Optional

# Visualization (optional; only used in EDA snippets)
import matplotlib.pyplot as plt

# Preprocessing / Modeling
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import TransformedTargetRegressor

# statsmodels (for Beta regression if available)
try:
    import statsmodels.api as sm
    from statsmodels.genmod.families import Beta as SM_BetaFamily  # may not exist in older versions
    from statsmodels.genmod.families.links import logit as sm_logit
    STATS_BETA_AVAILABLE = True
except Exception:
    STATS_BETA_AVAILABLE = False

from scipy.special import expit, logit

In [40]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

def ensure_dir(path: str) -> None:
    os.makedirs(path, exist_ok=True)

ensure_dir("outputs")

def safe_logit(y: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    """Apply logit to y in (0,1) with clipping for numerical stability."""
    y = np.asarray(y, dtype=float)
    y = np.clip(y, eps, 1 - eps)
    return logit(y)

def safe_expit(z: np.ndarray) -> np.ndarray:
    """Inverse of logit (sigmoid)."""
    z = np.asarray(z, dtype=float)
    return expit(z)

def normalize_to_0_1(y: pd.Series, eps: float = 1e-6) -> pd.Series:
    """Normalize a positive, bounded target (e.g., PER) to (0,1).
    Uses min-max with tiny epsilon padding to avoid exact 0/1 values for Beta modeling."""
    y = y.astype(float)
    ymin, ymax = y.min(), y.max()
    denom = (ymax - ymin) + 2 * eps
    return (y - ymin + eps) / denom

def per40(x: pd.Series, minutes: pd.Series) -> pd.Series:
    """Per-40 minutes rate: statistic per 40 minutes of play."""
    return (x / minutes.replace(0, np.nan)) * 40.0

def per100_possessions(x: pd.Series, team_possessions: pd.Series) -> pd.Series:
    """Per-100 possessions rate."""
    return (x / team_possessions.replace(0, np.nan)) * 100.0

def true_shooting_percentage(points: pd.Series, fga: pd.Series, fta: pd.Series) -> pd.Series:
    """TS% = PTS / (2 * (FGA + 0.44*FTA))"""
    denom = 2.0 * (fga + 0.44 * fta)
    return points / denom.replace(0, np.nan)

def effective_fg_percentage(fgm: pd.Series, three_pm: pd.Series, fga: pd.Series) -> pd.Series:
    """eFG% = (FGM + 0.5*3PM) / FGA"""
    return (fgm + 0.5 * three_pm) / fga.replace(0, np.nan)

In [41]:
import kagglehub

# Download latest version
college_path = kagglehub.dataset_download("adityak2003/college-basketball-players-20092021")
combine_path = kagglehub.dataset_download("marcusfern/nba-draft-combine")
rookie_path = kagglehub.dataset_download("thedevastator/nba-rookies-performance-statistics-and-minutes-p")

In [42]:
print("Path to dataset files:", college_path)
print("Path to dataset files:", combine_path)
print("Path to dataset files:", rookie_path)

DATA_PATHS = {
    # College-level stats (one row per player-season or player-career; include Minutes, team possessions if available)
    "college_csv": os.path.join(college_path, "CollegeBasketballPlayers2009-2021.csv"),
    # NBA combine measurements (one row per player-year; columns like Height, Wingspan, Weight, Vertical, etc.)
    "combine_csv": os.path.join(combine_path, "Draft Combine - Kaggle.csv"),
    # Rookie PER (one row per player; include rookie PER and rookie season/draft year), e.g., from Basketball-Reference
    "rookie_csv": os.path.join(rookie_path, "NBA Rookies by Year.csv")
}

Path to dataset files: C:\Users\Ethan\.cache\kagglehub\datasets\adityak2003\college-basketball-players-20092021\versions\5
Path to dataset files: C:\Users\Ethan\.cache\kagglehub\datasets\marcusfern\nba-draft-combine\versions\15
Path to dataset files: C:\Users\Ethan\.cache\kagglehub\datasets\thedevastator\nba-rookies-performance-statistics-and-minutes-p\versions\2


In [70]:
def clean_player_name(s: str) -> str:
    """
    Normalize player names across datasets:
    - Removes punctuation and suffixes
    - Handles 'Last, First' → 'First Last'
    - Collapses spaces and uppercases
    """
    if s is None or not isinstance(s, str):
        return ""
    s = s.strip()

    # If formatted like "Smith, John", flip to "John Smith"
    if "," in s:
        parts = [p.strip() for p in s.split(",")]
        if len(parts) == 2:
            s = f"{parts[1]} {parts[0]}"

    # Remove punctuation/apostrophes
    s = re.sub(r"[.\u2019'`]", "", s)
    s = re.sub(r"\b(JR|SR|II|III|IV)\b", "", s, flags=re.I)
    s = re.sub(r"\s+", " ", s)

    return s.upper().strip()

def percent_to_unit(df: pd.DataFrame, cols):
    df = df.copy()
    for c in cols:
        if c in df.columns and pd.api.types.is_numeric_dtype(df[c]):
            s = df[c]
            # if most values > 1, assume 0..100 and convert
            if s.dropna().gt(1.5).mean() > 0.5:
                df[c] = s / 100.0
    return df

In [53]:
def load_college(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    rename_map = {
        "player_name": "player",
        "team": "school",          
        "year": "season",    
        "": "position",      
        "GP": "gp",
        "mp": "mpg",           

        "FTM": "ftm",
        "FTA": "fta",
        "FT_per": "ft_pct",
        "twoPM": "twopm",
        "twoPA": "twopa",
        "twoP_per": "twop_pct",
        "TPM": "fg3m",             
        "TPA": "fg3a",             
        "TP_per": "fg3_pct",

        "pts": "pts_pg",
        "ast": "ast_pg",
        "treb": "trb_pg",
        "stl": "stl_pg",
        "blk": "blk_pg",

        "eFG": "efg_pct_given",
        "TS_per": "ts_pct_given",

        "ORB_per": "orb_pct",
        "DRB_per": "drb_pct",
        "AST_per": "ast_pct",
        "TO_per": "tov_pct",
        "ftr": "ftr",

        "Ortg": "ortg",
        "Drtg": "drtg",
    
    }
    df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})
    df = df[df["pick"].notna()]

    if {"gp", "mpg"}.issubset(df.columns):
        df["minutes"] = df["gp"] * df["mpg"]
    else:
        df["minutes"] = np.nan

    
    df["fgm"] = df["twopm"].fillna(0) + df["fg3m"].fillna(0)
    
    df["fga"] = df["twopa"].fillna(0) + df["fg3a"].fillna(0)

    df = df.sort_values(["player", "season"]).groupby("player", as_index=False).tail(1).rename(columns={"season": "draft_year"})
    
    df = df.drop_duplicates(subset=[c for c in ["player", "season", "school"] if c in df.columns])
    return df

def load_combine(path: str) -> pd.DataFrame:
    """Load NBA Combine measurements.

    Expected columns (examples):
        - player
        - draft_year (int)
        - height_w_shoes_in, wingspan_in, standing_reach_in, weight_lbs
        - body_fat_pct, bench_reps, vert_no_step_in, vert_max_in, lane_agility_sec, shuttle_run_sec, three_quarter_sprint_sec
    """
    df = pd.read_csv(path)
    rename_map = {
        "PLAYER": "player",
        "YEAR": "draft_year",
        "POS": "position",

        # Measurements
        "HGT": "height_w_shoes_in",    
        "WGT": "weight_lbs",
        "BMI": "bmi",
        "BF": "body_fat_pct",
        "WNGSPN": "wingspan_in",
        "STNDRCH": "standing_reach_in",
        "HANDL": "hand_length_in",
        "HANDW": "hand_width_in",

        # Jump results
        "STNDVERT": "vert_no_step_in",
        "LPVERT": "vert_max_in",
        "PBHGT": "standing_reach_plus_vert_in",       
        "PDHGT": "standing_reach_plus_max_vert_in",   

        # Agility & speed
        "LANE": "lane_agility_sec",
        "SHUTTLE": "shuttle_run_sec",
        "SPRINT": "three_quarter_sprint_sec",

        # Strength
        "BENCH": "bench_reps",

        # Extras
        "BAR": "bar",
        "PAN": "pan",
    }
    df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})
    numeric_like = [
        "draft_year", "height_w_shoes_in", "weight_lbs", "bmi", "body_fat_pct",
        "wingspan_in", "standing_reach_in", "hand_length_in", "hand_width_in",
        "vert_no_step_in", "vert_max_in",
        "standing_reach_plus_vert_in", "standing_reach_plus_max_vert_in",
        "lane_agility_sec", "shuttle_run_sec", "three_quarter_sprint_sec",
        "bench_reps", "bar", "pan"
    ]
    for c in numeric_like:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    df = df.drop_duplicates(subset=[c for c in ["player", "draft_year"] if c in df.columns])
    return df

def load_rookie(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    # Drop unnamed/index
    unnamed = [c for c in df.columns if not str(c).strip() or str(c).lower().startswith("unnamed") or str(c).lower()=="index"]
    if unnamed: 
        df = df.drop(columns=unnamed)

    rename_map = {"Name":"player",
                  "Year Drafted":"rookie_season",
                  "GP":"gp",
                  "MIN":"minutes",
                  "PTS":"pts",
                  "FGM":"fgm",
                  "FGA":"fga",
                  "FG%":"fg_pct",
                  "3P Made":"fg3m",
                  "3PA":"fg3a",
                  "3P%":"fg3_pct",
                  "FTM":"ftm",
                  "FTA":"fta",
                  "FT%":"ft_pct",
                  "OREB":"orb",
                  "DREB":"drb",
                  "REB":"trb",
                  "AST":"ast",
                  "STL":"stl",
                  "BLK":"blk",
                  "TOV":"tov",
                  "EFF":"rookie_per"
            }
    df = df.rename(columns={k:v for k,v in rename_map.items() if k in df.columns})
    # Types
    for c in ["rookie_season","gp","minutes","pts","fgm","fga","fg_pct","fg3m","fg3a","fg3_pct","ftm","fta","ft_pct","orb","drb","trb","ast","stl","blk","tov","rookie_per"]:
        if c in df.columns: 
            df[c] = pd.to_numeric(df[c], errors="coerce")
    df["draft_year"] = df["rookie_season"]
    if {"player","rookie_season"}.issubset(df.columns): 
        df = df.drop_duplicates(subset=["player","rookie_season"])
    return df

In [44]:
def add_rate_features(college_df: pd.DataFrame) -> pd.DataFrame:
    df = college_df.copy()

    # per-40s from totals
    if {"pts", "minutes"}.issubset(df.columns):
        df["pts_per40"] = per40(df["pts"], df["minutes"])
    if {"ast", "minutes"}.issubset(df.columns):
        df["ast_per40"] = per40(df["ast"], df["minutes"])
    if {"trb", "minutes"}.issubset(df.columns):
        df["trb_per40"] = per40(df["trb"], df["minutes"])
    if {"stl", "minutes"}.issubset(df.columns):
        df["stl_per40"] = per40(df["stl"], df["minutes"])
    if {"blk", "minutes"}.issubset(df.columns):
        df["blk_per40"] = per40(df["blk"], df["minutes"])
    if {"tov", "minutes"}.issubset(df.columns):
        df["tov_per40"] = per40(df.get("tov", np.nan), df["minutes"])

    # efficiency recompute (fallback to given if components are missing)
    if {"fgm", "fg3m", "fga"}.issubset(df.columns) and df["fga"].notna().any():
        df["efg_pct"] = effective_fg_percentage(df["fgm"], df["fg3m"], df["fga"])
    elif "efg_pct_given" in df.columns:
        df["efg_pct"] = df["efg_pct_given"]

    if {"pts", "fga", "fta"}.issubset(df.columns) and (df["fga"].notna().any() or df["fta"].notna().any()):
        df["ts_pct"] = true_shooting_percentage(df["pts"], df["fga"], df["fta"])
    elif "ts_pct_given" in df.columns:
        df["ts_pct"] = df["ts_pct_given"]

    return df

def select_model_features(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str], List[str]]:
    numeric_candidates = [
        # college per-40/efficiency
        "pts_per40", "ast_per40", "trb_per40", "stl_per40", "blk_per40", "tov_per40",
        "efg_pct", "ts_pct",
        # combine
        "height_w_shoes_in", "wingspan_in", "standing_reach_in", "weight_lbs",
        "body_fat_pct", "bench_reps", "vert_no_step_in", "vert_max_in",
        "lane_agility_sec", "shuttle_run_sec", "three_quarter_sprint_sec",
    ]
    categorical_candidates = ["position"] 
    numeric_cols = [c for c in numeric_candidates if c in df.columns]
    categorical_cols = [c for c in categorical_candidates if c in df.columns]
    return df, numeric_cols, categorical_cols

In [71]:
def merge_college_combine_rookie(college_df: pd.DataFrame, combine_df: pd.DataFrame, rookie_df: pd.DataFrame) -> pd.DataFrame:
    dfc = college_df.copy()
    
    merged = dfc.merge(combine_df, on=["player","draft_year"], how="left")
    merged = merged.merge(rookie_df[["player","draft_year","rookie_per"]], on=["player","draft_year"], how="inner")
    return merged

In [46]:
def build_preprocessor(numeric_cols: List[str], categorical_cols: List[str]) -> ColumnTransformer:
    numeric_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    categorical_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_pipeline, numeric_cols),
            ("cat", categorical_pipeline, categorical_cols)
        ],
        remainder="drop"
    )
    return preprocessor

def train_val_split(df: pd.DataFrame, feature_cols: List[str], target_col: str = "rookie_per", test_size: float = 0.2):
    X = df[feature_cols]
    y = df[target_col]
    return train_test_split(X, y, test_size=test_size, random_state=RANDOM_SEED)

In [47]:
def evaluate_linear_regression(df_model: pd.DataFrame, numeric_cols: List[str], categorical_cols: List[str], cv_splits: int = 5) -> dict:
    pre = build_preprocessor(numeric_cols, categorical_cols)
    model = Pipeline([("preprocess", pre), ("model", LinearRegression())])
    X = df_model[numeric_cols + categorical_cols]; y = df_model["rookie_per"]
    kf = KFold(n_splits=cv_splits, shuffle=True, random_state=RANDOM_SEED)
    neg_mae = cross_val_score(model, X, y, cv=kf, scoring="neg_mean_absolute_error")
    neg_mse = cross_val_score(model, X, y, cv=kf, scoring="neg_mean_squared_error")
    r2 = cross_val_score(model, X, y, cv=kf, scoring="r2")
    return {"MAE": float(-neg_mae.mean()), "RMSE": float(np.sqrt(-neg_mse.mean())), "R2": float(r2.mean())}

In [48]:
def evaluate_beta_regression(
    df_model: pd.DataFrame,
    numeric_cols: List[str],
    categorical_cols: List[str],
    cv_splits: int = 5
) -> Dict[str, float]:
    """Evaluate Beta Regression two ways:
    1) Preferred: statsmodels GLM with Beta family (if available).
    2) Fallback: logit-transform the target in (0,1) using TransformedTargetRegressor + LinearRegression.
    """
    # Normalize target to (0,1)
    y01 = normalize_to_0_1(df_model["rookie_per"])

    if STATS_BETA_AVAILABLE:
        # Prepare design matrix using the same preprocessing as Linear Regression
        preprocessor = build_preprocessor(numeric_cols, categorical_cols)
        X = df_model[numeric_cols + categorical_cols]
        X_design = preprocessor.fit_transform(X)
        # Add constant for intercept
        X_design = sm.add_constant(X_design, has_constant="add")

        # Fit GLM Beta
        model = sm.GLM(y01, X_design, family=SM_BetaFamily(sm_logit()))
        res = model.fit()
        # In-sample predictions for quick baseline metrics (could implement CV with refits if desired)
        preds = res.predict(X_design)
        # Map back from (0,1) to original PER scale for error metrics
        per_min, per_max = df_model["rookie_per"].min(), df_model["rookie_per"].max()
        eps = 1e-6
        preds_per = (preds * ((per_max - per_min) + 2*eps)) + (per_min - eps)

        mae = mean_absolute_error(df_model["rookie_per"], preds_per)
        rmse = math.sqrt(mean_squared_error(df_model["rookie_per"], preds_per))
        r2 = r2_score(df_model["rookie_per"], preds_per)
        return {"MAE": float(mae), "RMSE": float(rmse), "R2": float(r2), "used": "statsmodels.GLM(Beta)"}
    else:
        # Fallback: approximate beta regression with logit-transformed target
        preprocessor = build_preprocessor(numeric_cols, categorical_cols)
        base = LinearRegression()

        # target transform y->logit(y01), inverse logit back to (0,1)
        y01_series = y01.copy()

        def tfunc(y):
            return safe_logit(y)

        def inv_tfunc(z):
            return safe_expit(z)

        model = Pipeline(steps=[
            ("preprocess", preprocessor),
            ("tt", TransformedTargetRegressor(regressor=base, func=tfunc, inverse_func=inv_tfunc))
        ])

        X = df_model[numeric_cols + categorical_cols]
        kf = KFold(n_splits=cv_splits, shuffle=True, random_state=RANDOM_SEED)

        # CV on normalized (0,1) target, then convert back to PER scale for reporting
        preds_all = []
        y_all = []
        for train_idx, test_idx in kf.split(X):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y01_series.iloc[train_idx], y01_series.iloc[test_idx]
            model.fit(X_train, y_train)
            p = model.predict(X_test)  # in (0,1)
            preds_all.append(p)
            y_all.append(y_test.values)

        preds_all = np.concatenate(preds_all)
        y_all = np.concatenate(y_all)

        # Map (0,1) back to PER scale based on global min/max
        per_min, per_max = df_model["rookie_per"].min(), df_model["rookie_per"].max()
        eps = 1e-6
        preds_per = (preds_all * ((per_max - per_min) + 2*eps)) + (per_min - eps)
        y_true_per = (y_all * ((per_max - per_min) + 2*eps)) + (per_min - eps)

        mae = mean_absolute_error(y_true_per, preds_per)
        rmse = math.sqrt(mean_squared_error(y_true_per, preds_per))
        r2 = r2_score(y_true_per, preds_per)
        return {"MAE": float(mae), "RMSE": float(rmse), "R2": float(r2), "used": "logit-TransformedTargetRegressor"}

In [None]:
# 1) Load data
college = load_college(DATA_PATHS["college_csv"])
combine = load_combine(DATA_PATHS["combine_csv"])
rookie = load_rookie(DATA_PATHS["rookie_csv"])
for df in (college, combine, rookie):
    df["player"] = df["player"].astype(str).map(clean_player_name)
    
college_fe = add_rate_features(college)

# 3) Merge
merged = merge_college_combine_rookie(college_fe, combine, rookie)
pct_cols = [
    "efg_pct_given","ts_pct_given","ft_pct_given","twop_pct_given","fg3_pct_given",
    "efg_pct","ts_pct","fg_pct","fg3_pct","ft_pct"
]
merged = percent_to_unit(merged, pct_cols)
print ("Merged dataset shape:", merged.shape)
print(
    merged.sort_values("draft_year")[["player", "draft_year"]]
    .head()
    .to_string(index=False)
)
print(
    merged.sort_values("draft_year")[["player", "draft_year"]]
    .tail()
    .to_string(index=False)
)

# 4) Choose feature sets
df_all, num_cols, cat_cols = select_model_features(merged)

if not num_cols and not cat_cols:
    raise ValueError("No feature columns were found. Check your input CSV schemas and rename maps.")

feature_cols = num_cols + cat_cols


Merged dataset shape: (217, 91)
          player  draft_year
        AJ PRICE        2009
 HASHEEM THABEET        2009
  DAJUAN SUMMERS        2009
DANTE CUNNINGHAM        2009
 DARREN COLLISON        2009
          player  draft_year
       KRIS DUNN        2016
    CARIS LEVERT        2016
DENZEL VALENTINE        2016
   PASCAL SIAKAM        2016
ISAIAH WHITEHEAD        2016


  df = pd.read_csv(path)
  if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all():


In [73]:
# 5) Evaluate Linear Regression (baseline)
lin_metrics = evaluate_linear_regression(df_all, num_cols, cat_cols)
print("Linear Regression (CV) ->", lin_metrics)

# 6) Evaluate Beta Regression (baseline)
beta_metrics = evaluate_beta_regression(df_all, num_cols, cat_cols)
print("Beta Regression (CV or in-sample per availability) ->", beta_metrics)

# 7) Persist merged training table and a quick metrics report
merged.to_csv("outputs/merged_training_table.csv", index=False)
pd.DataFrame([lin_metrics, beta_metrics], index=["linear", "beta"]).to_csv("outputs/baseline_metrics.csv")

Linear Regression (CV) -> {'MAE': 3.264776867542962, 'RMSE': 4.245037803768917, 'R2': -0.24291014442584843}
Beta Regression (CV or in-sample per availability) -> {'MAE': 3.8675313967415086, 'RMSE': 4.978110763504027, 'R2': -0.5733143297963397, 'used': 'logit-TransformedTargetRegressor'}


