In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nba_api.stats.endpoints import leaguegamefinder
import line_profiler

In [2]:
# Conference membership lookup: "east"/"west" -> list of team names, taken
# from the "conference" and "team" columns of the CSV.
conference_df = pd.read_csv("data/conferences.csv")
conf_lists = {
    key: conference_df.loc[conference_df["conference"] == label, "team"].to_list()
    for key, label in (("east", "East"), ("west", "West"))
}

In [3]:
# Load the trimmed game log and derive each game's winner: the home team
# where "home_win" is truthy, otherwise the away team.
df = pd.read_csv("data/trim_df.csv").assign(
    winner=lambda games: np.where(games["home_win"], games["home"], games["away"])
)

In [4]:
def rank_to_win_prob(rank):
    """Return the title probability assigned to a conference rank.

    Ranks up to 6 get 1/16, ranks 7-8 get 3/64, ranks 9-10 get 1/64 and
    anything lower gets 0. Over ranks 1-10 these add up to 1/2, i.e. 1 across
    two conferences.
    """
    # (worst rank in the tier, probability), checked from the best tier down.
    tiers = ((6, 1 / 16), (8, 3 / 64), (10, 1 / 64))
    for worst_rank, prob in tiers:
        if rank <= worst_rank:
            return prob
    return 0

In [5]:
def _wins_by_team(winners, team_names):
    """Count wins per team in ``winners``, aligned to ``team_names`` (0 when absent)."""
    return winners.value_counts().reindex(team_names).fillna(0)


def _conference_rank(standings, conf_teams, team):
    """Return the 1-based rank of ``team`` among ``conf_teams`` (highest standing first)."""
    conf_standings = standings.loc[conf_teams]
    # Rank = number of conference teams strictly ahead, plus one. The random
    # tie-break noise added by the caller makes exact ties practically impossible.
    return int((conf_standings > standings.loc[team]).sum()) + 1


def get_home_away_probs_given_lock(df, conf_lists, lock_index, iters=1000):
    """Estimate playoff and title odds for the two teams playing game ``lock_index``.

    Every game at or before ``lock_index`` keeps the result stored in
    ``df["winner"]``; every later game is decided by a fair coin flip between
    its home and away team. Each simulated season is turned into standings
    (win totals plus a tiny random tie-breaker) and the conference ranks of
    the home and away team of game ``lock_index`` are scored.

    ``df`` is only read: no column is added to it and ``winner`` is left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs "home", "away" and "winner" columns. The index must be
        sequential starting at 0.
    conf_lists : dict
        ``{"east": [...], "west": [...]}`` lists of team names. A team that is
        not in the east list is looked up in the west list.
    lock_index : int
        Index label of the game whose two teams are evaluated.
    iters : int, default 1000
        Number of simulated seasons.

    Returns
    -------
    tuple
        ``(home_playoff_prob, home_champ_prob, away_playoff_prob,
        away_champ_prob)``. A "playoff" hit is a conference rank of 10 or
        better; the champion value is the mean of ``rank_to_win_prob(rank)``.

    Raises
    ------
    ValueError
        If the home or away team is in neither conference list.
    """
    # Read everything needed from df once; nothing below writes back to it.
    locked = np.asarray(df.index <= lock_index)
    future = ~locked
    future_home = df["home"].to_numpy()[future]
    future_away = df["away"].to_numpy()[future]
    n_games = len(df)

    home, away = df.loc[lock_index, "home"], df.loc[lock_index, "away"]
    home_conf = "east" if home in conf_lists["east"] else "west"
    away_conf = "east" if away in conf_lists["east"] else "west"
    for team, conf in ((home, home_conf), (away, away_conf)):
        if team not in conf_lists[conf]:
            raise ValueError(f"{team!r} is not in either conference list")
    home_conf_teams = conf_lists[home_conf]
    away_conf_teams = conf_lists[away_conf]

    team_names = df["home"].unique()
    # Wins from locked games are identical in every simulation, so count them
    # once here instead of inside the loop (value_counts dominates the runtime).
    start_standings = _wins_by_team(df.loc[locked, "winner"], team_names)

    home_playoff_hits, away_playoff_hits = 0, 0
    home_champ_total, away_champ_total = 0, 0

    # Run the sims
    for _ in range(iters):
        # One coin per game on the schedule (the locked ones are drawn but
        # ignored, which keeps the random stream the same as flipping for the
        # whole frame); home wins when the draw exceeds 0.5.
        home_wins = np.random.uniform(0, 1, n_games) > 0.5
        future_winners = np.where(home_wins[future], future_home, future_away)
        future_results = _wins_by_team(pd.Series(future_winners), team_names)

        # Noise far smaller than one win breaks ties randomly.
        noise = np.random.uniform(0, 0.001, len(team_names))
        standings = start_standings + future_results + noise

        home_rank = _conference_rank(standings, home_conf_teams, home)
        home_playoff_hits += home_rank <= 10
        home_champ_total += rank_to_win_prob(home_rank)

        away_rank = _conference_rank(standings, away_conf_teams, away)
        away_playoff_hits += away_rank <= 10
        away_champ_total += rank_to_win_prob(away_rank)

    return (
        home_playoff_hits / iters,
        home_champ_total / iters,
        away_playoff_hits / iters,
        away_champ_total / iters,
    )

In [6]:
def get_comparative_outcomes_for_index(df, conf_lists, game_index, iters=1000):
    """Simulate game ``game_index`` both ways and report both teams' odds.

    Works on a copy of ``df``. The game is first recorded as a home win and
    simulated with ``get_home_away_probs_given_lock``, then recorded as an
    away win and simulated again. The game index, the two team names, the
    eight probabilities and the four win-minus-loss differences are printed.

    Returns the 8-tuple ``(hpg, hpb, hcg, hcb, apg, apb, acg, acb)``, where
    the first letter is h/a for the home/away team, the second is p/c for the
    playoff/champion probability, and the third is g/b for the scenario in
    which that team won/lost this game.
    """
    print(f"\tGame index {game_index}")
    scenario_df = df.copy()
    home_team = scenario_df.loc[game_index, "home"]
    away_team = scenario_df.loc[game_index, "away"]

    # Scenario 1: the home team wins, so home values are "g" and away values "b".
    scenario_df.loc[game_index, "winner"] = home_team
    hpg, hcg, apb, acb = get_home_away_probs_given_lock(
        scenario_df, conf_lists, game_index, iters=iters
    )

    # Scenario 2: the away team wins, so the g/b roles swap.
    scenario_df.loc[game_index, "winner"] = away_team
    hpb, hcb, apg, acg = get_home_away_probs_given_lock(
        scenario_df, conf_lists, game_index, iters=iters
    )

    outcomes = (hpg, hpb, hcg, hcb, apg, apb, acg, acb)
    print("\t\t", home_team, away_team)
    print("\t\t", *outcomes)
    print("\t\t", hpg - hpb, hcg - hcb, apg - apb, acg - acb)
    return outcomes

In [7]:
def get_cli_columns_whole_df(df, conf_lists, iters=1000):
    """Run ``get_comparative_outcomes_for_index`` for every game in ``df``.

    Returns a DataFrame with one row per game and the columns hpg, hpb, hcg,
    hcb, apg, apb, acg, acb, indexed 0..n-1 in the row order of ``df``.
    """
    # Renumber the rows 0..n-1 so each index label doubles as the game's position.
    games = df.reset_index(drop=True)

    outcome_rows = []
    # (For line profiling, iterate over range(0, len(games), 250) instead.)
    for game_index in games.index:
        outcome_rows.append(
            get_comparative_outcomes_for_index(games, conf_lists, game_index, iters=iters)
        )

    return pd.DataFrame(
        outcome_rows,
        columns=["hpg", "hpb", "hcg", "hcb", "apg", "apb", "acg", "acb"],
        index=games.index,
    )

In [8]:
def get_season_df(start_year):
    """Download one season's regular-season games, one row per game.

    Queries ``LeagueGameFinder`` for the season starting in ``start_year``
    (e.g. 2024 -> "2024-25"), keeps the rows whose GAME_ID starts with "002",
    and reduces them to the rows whose MATCHUP contains "vs." (treated here as
    the home team's row).

    Returns a DataFrame sorted by date with the columns home, away, game_id,
    home_win, season_num, date, reg_season and winner.
    """
    season_label = f"{start_year}-{str(start_year + 1)[2:]}"
    gamefinder = leaguegamefinder.LeagueGameFinder(season_nullable=season_label)
    games_df_raw = gamefinder.get_data_frames()[0]

    # GAME_ID prefix: "00" means NBA, "2" means regular season.
    is_reg_season = games_df_raw["GAME_ID"].str[:3] == "002"
    reg_season_doubled = games_df_raw[is_reg_season].sort_values(by="GAME_DATE")

    # Keep one row per game: the "XXX vs. YYY" row. The pattern is matched
    # literally (regex=False) so the "." is not read as a regex wildcard.
    is_home_row = reg_season_doubled["MATCHUP"].str.contains("vs.", regex=False)
    trim_df = reg_season_doubled[is_home_row]

    # The first three characters of MATCHUP are the row's own team abbreviation,
    # which gives an abbreviation -> full name lookup for the opponent.
    short_to_long_dict = dict(zip(trim_df["MATCHUP"].str[:3], trim_df["TEAM_NAME"]))
    new_df = pd.DataFrame({
        "home": trim_df["TEAM_NAME"],
        # The last three characters of MATCHUP are the opponent's abbreviation.
        "away": trim_df["MATCHUP"].str[-3:].map(short_to_long_dict),
        "game_id": trim_df["GAME_ID"].str[2:],  # trim to match format of current data
        "home_win": trim_df["WL"] == "W",
        # NOTE(review): the 1952 offset presumably matches the season numbering
        # of the existing data files - confirm.
        "season_num": start_year - 1952,
        "date": trim_df["GAME_DATE"],
        "reg_season": True,
    })
    new_df["winner"] = np.where(new_df["home_win"], new_df["home"], new_df["away"])
    return new_df.sort_values(by="date")

In [9]:
# Fetch each listed season through get_season_df (NBA API) and save it as a
# CSV under data/, named after the season (e.g. 2024-25_season.csv).
season_starts = list(range(2020, 2025))
for start_year in season_starts:
    season_csv = f"data/{start_year}-{str(start_year + 1)[2:]}_season.csv"
    season_df = get_season_df(start_year)
    season_df.to_csv(season_csv, index=False)

KeyboardInterrupt: 

In [11]:
# Simulated seasons per scenario; every game is simulated under two scenarios.
iters = 10000
for start_year in [2024]:
    print(f"STARTING SEASON {start_year}")
    season_csv = f"data/{start_year}-{str(start_year + 1)[2:]}_season.csv"
    cli_csv = f"data/{start_year}-{str(start_year + 1)[2:]}_season_cli.csv"

    season_df = pd.read_csv(season_csv)
    cli_df = get_cli_columns_whole_df(season_df, conf_lists, iters=iters)

    # Put the outcome columns next to the original game columns and save.
    df_plus_cli = pd.concat([season_df, cli_df], axis=1)
    df_plus_cli.to_csv(cli_csv, index=False)

STARTING SEASON 2024
	Game index 0
		 Los Angeles Lakers Minnesota Timberwolves
		 0.715 0.6368 0.0367078125 0.0311390625 0.7173 0.6382 0.0366421875 0.03125625
		 0.07819999999999994 0.005568750000000001 0.07910000000000006 0.0053859375
	Game index 1


KeyboardInterrupt: 