## dataset description and four-factor normalization demo


In [8]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd

# resolve the project root (the parent of the current working directory, which is
# expected to be the notebooks folder) and put it at the front of the python path
# so that the project-local imports below can be found
PROJECT_ROOT = Path("..").resolve()
_project_root_entry = str(PROJECT_ROOT)
if _project_root_entry not in sys.path:
    sys.path.insert(0, _project_root_entry)

from hmm.preprocess import (
    DEAN_OLIVER_WEIGHTS,
    OPTIMIZED_WEIGHTS,
    read_games_from_csv,
    compute_season_factor_stats,
)

# set up paths; PROJECT_ROOT was already resolved earlier in this cell (before the
# project imports), so it is reused here instead of being resolved a second time
DATA_PATH = PROJECT_ROOT / "data" / "allseasons.csv"


- total number of regular-season games and seasons
- total number of unique teams
- total number of training sequences (one sequence per team per season, excluding the held-out seasons)

In [19]:
# load games grouped by season
season_to_games = read_games_from_csv(str(DATA_PATH))

# flatten to one summary row per game so the counts below are plain dataframe operations;
# the four entries of `factors` are unpacked into their own named columns
records = [
    {
        "season": game.season,
        "team": game.team,
        "date": game.date,
        "result": game.state,
        "eFG%": game.factors[0],
        "TOV%": game.factors[1],
        "ORB%": game.factors[2],
        "FT/FGA": game.factors[3],
    }
    for season_games in season_to_games.values()
    for game in season_games
]

# seasons that are held out from the training sequences
holdout_sequences = [2018, 2024]

games_df = pd.DataFrame.from_records(records)

# a training sequence is the run of games played by one team in one season;
# games from the holdout seasons are dropped before grouping
is_holdout_game = games_df["season"].isin(holdout_sequences)
train_sequences = (
    games_df.loc[~is_holdout_game]
    .groupby(["season", "team"])
    .size()
)

# number of training sequences (one entry per remaining (season, team) pair)
n_train_sequences = len(train_sequences)

# headline counts for the dataset description
n_games = len(games_df)
seasons = sorted(games_df["season"].unique())
n_seasons = len(seasons)
teams = sorted(games_df["team"].unique())
n_teams = len(teams)

# `seasons` is sorted, so its first and last entries give the covered range
print(f"total regular-season games: {n_games}")
print(f"number of seasons: {n_seasons} ({seasons[0]}–{seasons[-1]})")
print(f"number of unique teams: {n_teams}")
# report the count computed above rather than recomputing len(train_sequences)
print(f"number of training sequences: {n_train_sequences}")


total regular-season games: 59854
number of seasons: 25 (2000–2024)
number of unique teams: 37
number of training sequences: 686


### csv schema

we use a combined regular-season file `allseasons.csv` constructed from scraped box-score data. the columns relevant for the hmm are:

- `date`: game date
- `team`: team abbreviation
- `W/L`: `1` for win, `0` for loss (from the team perspective)
- `eFG%`: effective field goal percentage
- `TOV%`: turnover percentage
- `ORB%`: offensive rebounding percentage
- `FT/FGA`: free throws per field goal attempt
- `IsRegular`: `1` for regular-season games, `0` otherwise
- `Season`: season start year as an integer (for example `2000` for the 2000–01 season)

in the code, each game is represented as a `GameRecord` with a `factors` vector `[eFG%, TOV%, ORB%, FT/FGA]`.


### per-season z-score normalization of the four factors

for each season, we compute the mean and standard deviation of the four factors across all regular-season games in that season.
for a given game with raw factor vector $\mathbf{x} = [x_{\text{eFG}}, x_{\text{TOV}}, x_{\text{ORB}}, x_{\text{FT/FGA}}]$, and per-season mean $\boldsymbol{\mu}$ and standard deviation $\boldsymbol{\sigma}$, we form the per-season z-scores:

$$
\mathbf{z} = \frac{\mathbf{x} - \boldsymbol{\mu}}{\boldsymbol{\sigma}}.
$$

following dean oliver, we treat higher turnover percentage as worse, so after standardization we flip the sign of the turnover component:

$$
z_{\text{TOV}} \leftarrow -z_{\text{TOV}}.
$$

these per-game z-scores are then combined into a single scalar performance score using a weighted sum of the four factors.


In [10]:
# per-season factor statistics: each season maps to a (mean, std) pair
season_stats = compute_season_factor_stats(season_to_games)

# use the earliest season as the worked example (`seasons` is sorted)
example_season = seasons[0]
mean_vec, std_vec = season_stats[example_season]

# factor labels, in the same order as the entries of the `factors` vector
factors = ["eFG%", "TOV%", "ORB%", "FT/FGA"]

# one row per factor with that season's mean and standard deviation
season_stats_columns = {
    "factor": factors,
    "mean": mean_vec,
    "std": std_vec,
}
season_stats_df = pd.DataFrame(season_stats_columns)

print(f"example season: {example_season}")
season_stats_df


example season: 2000


Unnamed: 0,factor,mean,std
0,eFG%,0.474448,0.063522
1,TOV%,14.122918,3.697485
2,ORB%,27.999874,7.554377
3,FT/FGA,0.234708,0.088165


In [11]:
# demonstrate z-score normalization on the first game listed for the example season
example_game = next(iter(season_to_games[example_season]))

raw_factors = example_game.factors
z_factors = (raw_factors - mean_vec) / std_vec

# negate the turnover entry so that higher is better in the weighted score
tov_index = factors.index("TOV%")
z_factors_flipped = z_factors.copy()
z_factors_flipped[tov_index] = -z_factors[tov_index]

# one row per factor: raw value, z-score, and z-score after the turnover sign flip
norm_example_columns = {
    "factor": factors,
    "raw": raw_factors,
    "z_score": z_factors,
    "z_score_flipped_TOV": z_factors_flipped,
}
norm_example_df = pd.DataFrame(norm_example_columns)

print(f"example game: team={example_game.team}, date={example_game.date.date()}, result={example_game.state}")
norm_example_df


example game: team=ATL, date=2000-10-31, result=0


Unnamed: 0,factor,raw,z_score,z_score_flipped_TOV
0,eFG%,0.407,-1.061813,-1.061813
1,TOV%,12.6,-0.41188,0.41188
2,ORB%,22.9,-0.675089,-0.675089
3,FT/FGA,0.198,-0.416351,-0.416351


### four-factor weights: dean oliver vs. learned weights

we compare the original dean oliver four-factor weights to the data-driven weights learned from our dataset via logistic regression.

- dean oliver (baseline): `[0.40, 0.25, 0.20, 0.15]` for `[eFG%, TOV%, ORB%, FT/FGA]`
- optimized weights (`OPTIMIZED_WEIGHTS` in `hmm/preprocess.py`): learned on all training seasons using game outcomes as labels.

for reporting in the paper, we use the normalized learned weights (rescaled so that their absolute values sum to 1) alongside the dean oliver baseline.


In [14]:
# side-by-side table of the two weightings, one row per factor

# dean oliver baseline
oliver_weights = DEAN_OLIVER_WEIGHTS

# optimized logistic-regression weights stored in preprocess
optimized_weights = OPTIMIZED_WEIGHTS

weights_columns = {
    "factor": factors,
    "dean_oliver": oliver_weights,
    "optimized_weights": optimized_weights,
}
weights_df = pd.DataFrame(weights_columns)

weights_df


Unnamed: 0,factor,dean_oliver,optimized_weights
0,eFG%,0.4,0.4962
1,TOV%,0.25,0.1721
2,ORB%,0.2,0.1583
3,FT/FGA,0.15,0.1734
