# CNU Basketball — Phase 2 Analysis
This notebook expands the project with:
- EDA (win rates, goal lifts)
- Statistical tests (chi-square / Fisher, OR/RR)
- Interpretable logistic regression (statsmodels)
- Predictive baseline (sklearn) with season-aware validation
- Threshold tuning & diagnostics
- Artifacts saved for dashboard/report

In [1]:
import os, math
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.contingency_tables import Table2x2
from statsmodels.stats.proportion import proportion_confint

from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve,
    confusion_matrix
)

import plotly.express as px
import plotly.graph_objects as go

# Paths
PROJ = Path.cwd().parents[0] if Path.cwd().name == "notebooks" else Path.cwd()
DATA = PROJ / "data"
REPORTS = PROJ / "reports"
FIGS = REPORTS / "figures"
FIGS.mkdir(parents=True, exist_ok=True)

print("Project root:", PROJ)

Project root: C:\Users\ianwi\OneDrive\Desktop\Data Projects\CNU-Game-Goals-Historical-Analysis


In [2]:
df = pd.read_csv(DATA / "cnu_games_with_goals.csv")

# Types
df["date"] = pd.to_datetime(df["date"], errors="coerce")
for col in ["win","goal_reb","goal_to","goal_orb","goal_def30","goals_hit"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

print(df.shape)
df.head(3)

(415, 30)


Unnamed: 0,season,date,home,opponent,location_text,result_text,cnu_pts,opp_pts,cnu_fgm,cnu_fga,...,cnu_first_half,opp_first_half,ot,box_url,win,goal_reb,goal_to,goal_orb,goal_def30,goals_hit
0,2010-2011,2011-02-25,0,Ferrum,"Ferrum, Va.","Ferrum 87, Christopher Newport 81",81,87,29,63,...,36,42,0,https://static.cnusports.com/custompages/mbbal...,0,1,0,1,0,2
1,2010-2011,2011-02-22,1,Shenandoah,"Freeman Center; Newport News, Va.","Christopher Newport 84, Shenandoah 66",84,66,30,65,...,31,33,0,https://static.cnusports.com/custompages/mbbal...,1,1,0,1,0,2
2,2010-2011,2011-02-19,1,"Greensboro College 11-14,","Freeman Center; Newport News, Va.","Christopher Newport 55, Greensboro College 52",55,52,16,51,...,17,20,0,https://static.cnusports.com/custompages/mbbal...,1,1,1,0,1,3


In [3]:
# Coach rule: predict win if >=3 goals hit
df["rule3"] = (df["goals_hit"] >= 3).astype(int)

# Offensive rebounding rate (just in case)
misses = (df["cnu_fga"] - df["cnu_fgm"]).replace(0, np.nan)
df["cnu_or_rate"] = df["cnu_orb"] / misses
df.loc[misses.isna(), "cnu_or_rate"] = 0.0  # define as 0 if no misses

In [4]:
wr = df.groupby("goals_hit")["win"].agg(["mean","size"]).reindex([0,1,2,3,4])
wr.rename(columns={"mean":"win_rate","size":"N"}, inplace=True)
wr

Unnamed: 0_level_0,win_rate,N
goals_hit,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.4,25
1,0.602941,68
2,0.820513,156
3,0.956522,115
4,0.941176,51


In [27]:
fig = px.bar(
    wr.reset_index(), x="goals_hit", y="win_rate", text="N",
    labels={"goals_hit":"Goals hit", "win_rate":"Win %"},
    color="goals_hit", color_discrete_sequence=["#A7A8AA","#8A8D8F","#d1d5db","#0033A0","#001e5e"]
)
fig.update_traces(textposition="outside")
fig.update_layout(template="simple_white", yaxis_tickformat=".0%", title="Win% by # of goals hit")
fig.write_html(FIGS/"win_rate_by_goals.html", include_plotlyjs="cdn")
fig.show()

In [9]:
def win_rate(df_, mask):
    sub = df_.loc[mask]
    return float(sub["win"].mean()) if len(sub) else np.nan

goals = {
    "Outrebound (TRB)": df["goal_reb"]==1,
    "Fewer TOs": df["goal_to"]==1,
    "40% OR%": df["goal_orb"]==1,
    "Opp < 30 at half": df["goal_def30"]==1,
    "Rule (>=3/4)": df["rule3"]==1,
}
rows = []
for name, m in goals.items():
    yes = win_rate(df, m)
    no  = win_rate(df, ~m)
    rows.append({"metric":name, "win%_yes":yes, "win%_no":no, "lift_pp": (yes-no)*100})
pd.DataFrame(rows).sort_values("lift_pp", ascending=False)

Unnamed: 0,metric,win%_yes,win%_no,lift_pp
0,Outrebound (TRB),0.877023,0.622642,25.438114
4,Rule (>=3/4),0.951807,0.718876,23.293173
1,Fewer TOs,0.88835,0.736842,15.150741
3,Opp < 30 at half,0.883721,0.735,14.872093
2,40% OR%,0.879397,0.75,12.939698


In [11]:
def table2x2_from_binary(y, x):
    y, x = pd.Series(y).astype(int), pd.Series(x).astype(int)
    a = int(((y==1) & (x==1)).sum())  # win & hit
    b = int(((y==0) & (x==1)).sum())  # loss & hit
    c = int(((y==1) & (x==0)).sum())  # win & not
    d = int(((y==0) & (x==0)).sum())  # loss & not
    return np.array([[a, b],
                     [c, d]], dtype=int)

def test_goal(y, x, label):
    tbl = table2x2_from_binary(y, x)
    t2 = Table2x2(tbl)
    chi2, chi2_p, _, _ = stats.chi2_contingency(tbl, correction=False)
    fisher_odds, fisher_p = stats.fisher_exact(tbl, alternative="two-sided")
    or_est = t2.oddsratio
    or_ci  = t2.oddsratio_confint()
    rr_est = t2.riskratio
    rr_ci  = t2.riskratio_confint()
    win_when_yes = tbl[0,0] / (tbl[0,0] + tbl[0,1]) if (tbl[0,0]+tbl[0,1])>0 else np.nan
    win_when_no  = tbl[1,0] / (tbl[1,0] + tbl[1,1]) if (tbl[1,0]+tbl[1,1])>0 else np.nan
    return {
        "metric": label,
        "table": tbl,
        "chi2_p": chi2_p,
        "fisher_p": fisher_p,
        "odds_ratio": or_est,
        "odds_ratio_ci": or_ci,
        "risk_ratio": rr_est,
        "risk_ratio_ci": rr_ci,
        "win%_yes": win_when_yes,
        "win%_no": win_when_no,
        "diff_pp": 100*(win_when_yes - win_when_no)
    }

In [12]:
tests = []
tests.append(test_goal(df["win"], df["goal_reb"], "Outrebound (TRB)"))
tests.append(test_goal(df["win"], df["goal_to"], "Fewer TOs"))
tests.append(test_goal(df["win"], df["goal_orb"], "40% OR%"))
tests.append(test_goal(df["win"], df["goal_def30"], "Opp < 30 at half"))
tests.append(test_goal(df["win"], df["rule3"], "Rule (>=3/4)"))

pd.DataFrame({
    "metric": [t["metric"] for t in tests],
    "win% yes": [t["win%_yes"] for t in tests],
    "win% no": [t["win%_no"] for t in tests],
    "diff (pp)": [t["diff_pp"] for t in tests],
    "Fisher p": [t["fisher_p"] for t in tests],
    "Odds Ratio": [t["odds_ratio"] for t in tests],
    "OR 95% CI": [t["odds_ratio_ci"] for t in tests],
    "Risk Ratio": [t["risk_ratio"] for t in tests],
    "RR 95% CI": [t["risk_ratio_ci"] for t in tests],
}).sort_values("diff (pp)", ascending=False)

Unnamed: 0,metric,win% yes,win% no,diff (pp),Fisher p,Odds Ratio,OR 95% CI,Risk Ratio,RR 95% CI
0,Outrebound (TRB),0.877023,0.622642,25.438114,4.292334e-08,4.322169,"(2.571828647125411, 7.263759735919341)",1.408552,"(1.2075472535011587, 1.6430143171099447)"
4,Rule (>=3/4),0.951807,0.718876,23.293173,2.644874e-10,7.723464,"(3.604338454464015, 16.550024943650378)",1.324022,"(1.2162751850491405, 1.4413145933030698)"
1,Fewer TOs,0.88835,0.736842,15.150741,8.588483e-05,2.841615,"(1.669597138968421, 4.836361473235767)",1.205617,"(1.0970327672369005, 1.3249493290739687)"
3,Opp < 30 at half,0.883721,0.735,14.872093,0.0001404606,2.740136,"(1.6258040796617843, 4.618235180160249)",1.202341,"(1.0919429806212473, 1.3239014049653235)"
2,40% OR%,0.879397,0.75,12.939698,0.0009919109,2.430556,"(1.4361160848874883, 4.113595252367647)",1.172529,"(1.0688239338638656, 1.2862969726175344)"


In [13]:
model_cols = ["win","goal_reb","goal_to","goal_orb","goal_def30"]
logit_df = df[model_cols].dropna().copy()

model = smf.logit("win ~ goal_reb + goal_to + goal_orb + goal_def30", data=logit_df).fit(disp=False)
print(model.summary())

params = model.params
conf   = model.conf_int()
or_df = pd.DataFrame({
    "feature": params.index,
    "odds_ratio": np.exp(params.values),
    "ci_low": np.exp(conf[0].values),
    "ci_high": np.exp(conf[1].values),
    "p_value": model.pvalues.values
})
or_df

                           Logit Regression Results                           
Dep. Variable:                    win   No. Observations:                  415
Model:                          Logit   Df Residuals:                      410
Method:                           MLE   Df Model:                            4
Date:                Tue, 19 Aug 2025   Pseudo R-squ.:                  0.1691
Time:                        21:31:56   Log-Likelihood:                -166.64
converged:                       True   LL-Null:                       -200.54
Covariance Type:            nonrobust   LLR p-value:                 6.554e-14
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.7250      0.301     -2.410      0.016      -1.315      -0.135
goal_reb       1.5501      0.327      4.743      0.000       0.910       2.191
goal_to        1.4363      0.310      4.637      0.0

Unnamed: 0,feature,odds_ratio,ci_low,ci_high,p_value
0,Intercept,0.484315,0.268577,0.873348,0.015947
1,goal_reb,4.711952,2.483337,8.940587,2e-06
2,goal_to,4.205141,2.291586,7.71658,4e-06
3,goal_orb,1.583576,0.851902,2.943662,0.146156
4,goal_def30,2.387706,1.362479,4.184387,0.002362


In [14]:
def season_start_int(s):
    try:
        return int(str(s).split("-")[0])
    except:
        return np.nan

df["season_start"] = df["season"].apply(season_start_int)

use_cols = ["win","goal_reb","goal_to","goal_orb","goal_def30","season_start"]
work = df[use_cols].dropna().copy()

# Hold out the most recent 2 seasons for testing (change if you like)
cut = np.sort(work["season_start"].unique())[-2:]
train = work[~work["season_start"].isin(cut)].copy()
test  = work[ work["season_start"].isin(cut)].copy()

X_train = train[["goal_reb","goal_to","goal_orb","goal_def30"]].values
y_train = train["win"].values.astype(int)
X_test  = test[ ["goal_reb","goal_to","goal_orb","goal_def30"]].values
y_test  = test["win"].values.astype(int)

print("Train seasons:", np.unique(train["season_start"]))
print("Test seasons :", np.unique(test["season_start"]))

Train seasons: [2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2021 2022]
Test seasons : [2023 2024]


In [15]:
clf = LogisticRegression(solver="liblinear", random_state=42)
clf.fit(X_train, y_train)

proba = clf.predict_proba(X_test)[:,1]
pred  = (proba >= 0.5).astype(int)

print("LogReg — Test @0.5")
print("ACC:", accuracy_score(y_test, pred))
print("PREC:", precision_score(y_test, pred, zero_division=0))
print("REC:", recall_score(y_test, pred, zero_division=0))
print("ROC-AUC:", roc_auc_score(y_test, proba))
print("\nConfusion\n", confusion_matrix(y_test, pred))

LogReg — Test @0.5
ACC: 0.7833333333333333
PREC: 0.7931034482758621
REC: 0.9787234042553191
ROC-AUC: 0.7144026186579377

Confusion
 [[ 1 12]
 [ 1 46]]


In [16]:
test_idx = test.index
rule_pred = (df.loc[test_idx, "rule3"] == 1).astype(int).values
print("Coach Rule — Test")
print("ACC:", accuracy_score(y_test, rule_pred))
print("PREC:", precision_score(y_test, rule_pred, zero_division=0))
print("REC:", recall_score(y_test, rule_pred, zero_division=0))
print("Confusion\n", confusion_matrix(y_test, rule_pred))

Coach Rule — Test
ACC: 0.48333333333333334
PREC: 0.9
REC: 0.3829787234042553
Confusion
 [[11  2]
 [29 18]]


In [17]:
gkf = GroupKFold(n_splits=min(5, work["season_start"].nunique()))
X = work[["goal_reb","goal_to","goal_orb","goal_def30"]].values
y = work["win"].values.astype(int)
groups = work["season_start"].values

clf_cv = LogisticRegression(solver="liblinear", random_state=42)
auc_scores = cross_val_score(clf_cv, X, y, groups=groups, cv=gkf, scoring="roc_auc")
print("GroupKFold ROC-AUC (mean ± sd):", auc_scores.mean(), "±", auc_scores.std())

GroupKFold ROC-AUC (mean ± sd): 0.7721825481260443 ± 0.07500054812218927


In [18]:
prec, rec, thr = precision_recall_curve(y_test, proba)
f1s = 2*prec*rec/(prec+rec+1e-9)
best_i = np.nanargmax(f1s)
best_thr = thr[best_i-1] if best_i>0 else 0.5

print("Best threshold by F1:", best_thr)
pred_best = (proba >= best_thr).astype(int)
print("ACC:", accuracy_score(y_test, pred_best))
print("PREC:", precision_score(y_test, pred_best, zero_division=0))
print("REC:", recall_score(y_test, pred_best, zero_division=0))
print("Confusion\n", confusion_matrix(y_test, pred_best))

Best threshold by F1: 0.5
ACC: 0.7833333333333333
PREC: 0.7931034482758621
REC: 0.9787234042553191
Confusion
 [[ 1 12]
 [ 1 46]]


In [19]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_vif = sm.add_constant(work[["goal_reb","goal_to","goal_orb","goal_def30"]].values)
vif = [variance_inflation_factor(X_vif, i) for i in range(X_vif.shape[1])]
pd.DataFrame({"feature": ["const","goal_reb","goal_to","goal_orb","goal_def30"], "VIF": vif})

Unnamed: 0,feature,VIF
0,const,6.110749
1,goal_reb,1.226731
2,goal_to,1.025383
3,goal_orb,1.187528
4,goal_def30,1.022122


In [20]:
results = {
    "logreg_test_acc": accuracy_score(y_test, pred),
    "logreg_test_prec": precision_score(y_test, pred, zero_division=0),
    "logreg_test_rec": recall_score(y_test, pred, zero_division=0),
    "logreg_test_auc": roc_auc_score(y_test, proba),
    "coach_acc": accuracy_score(y_test, rule_pred),
    "coach_prec": precision_score(y_test, rule_pred, zero_division=0),
    "coach_rec": recall_score(y_test, rule_pred, zero_division=0),
    "cv_auc_mean": auc_scores.mean(),
    "cv_auc_sd": auc_scores.std(),
    "best_thr_by_f1": float(best_thr)
}
REPORTS.mkdir(exist_ok=True)
pd.DataFrame([results]).to_csv(REPORTS/"phase2_results_summary.csv", index=False)
pd.DataFrame([results])

Unnamed: 0,logreg_test_acc,logreg_test_prec,logreg_test_rec,logreg_test_auc,coach_acc,coach_prec,coach_rec,cv_auc_mean,cv_auc_sd,best_thr_by_f1
0,0.783333,0.793103,0.978723,0.714403,0.483333,0.9,0.382979,0.772183,0.075001,0.5


In [21]:
full = work.copy()
clf_all = LogisticRegression(solver="liblinear", random_state=42).fit(
    full[["goal_reb","goal_to","goal_orb","goal_def30"]].values,
    full["win"].values.astype(int)
)
df["model_prob_win"] = clf_all.predict_proba(
    df[["goal_reb","goal_to","goal_orb","goal_def30"]].fillna(0).values
)[:,1]

df.to_csv(DATA/"cnu_games_with_goals_and_model.csv", index=False)
df[["date","season","opponent","win","goals_hit","model_prob_win"]].head(10)

Unnamed: 0,date,season,opponent,win,goals_hit,model_prob_win
0,2011-02-25,2010-2011,Ferrum,0,2,0.787381
1,2011-02-22,2010-2011,Shenandoah,1,2,0.787381
2,2011-02-19,2010-2011,"Greensboro College 11-14,",1,3,0.948271
3,2011-02-18,2010-2011,"Methodist 9-15,",1,1,0.571399
4,2011-02-13,2010-2011,"19 Ferrum 21-3, 10-1 USAS",0,3,0.891487
5,2011-02-12,2010-2011,Averett 2-21 (0-10),1,2,0.767891
6,2011-02-06,2010-2011,Roanoke,1,2,0.892046
7,2011-02-05,2010-2011,"N.C. Wesleyan 15-6,",0,2,0.839496
8,2011-02-01,2010-2011,11 Randolph-Macon,0,1,0.571399
9,2011-01-27,2010-2011,"Shenandoah Hornets 3-15,",1,3,0.891487


## Key Takeaways
- Coach rule (≥3 of 4) shows high precision but lower recall.
- Fisher tests + OR/RR quantify each goal’s association with winning.
- Logistic regression using only the 4 goals provides baseline predictive lift vs. the rule on recent seasons.
- Season-aware CV (GroupKFold) yields stable ROC-AUC across years.
- Next: enrich features (opponent strength, home/away), try tree-based models, add calibration, and extend the dashboard with per-game probabilities and what-if toggles.
