In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [44]:
# Load the raw results/odds table. Later cells read its "Date" and "Kick-off (local)"
# columns, plus the columns listed as keys of rename_dict.
df = pd.read_csv("data/nrl.csv")

In [45]:
# Maps the raw CSV column headers to the short snake_case names used by the processing functions.
# get_full_year_df also uses the values of this dict to select which columns to keep.
rename_dict = {
    "Date": "date",
    "Home Team": "home",
    "Away Team": "away",
    "Home Score": "home_score",
    "Away Score": "away_score",
    "Play Off Game?": "elim",
    "Home Odds": "home_odds",
    "Away Odds": "away_odds",
    "Draw Odds": "draw_odds",
}

In [4]:
def prelim_processing_inplace(df):
    """Add a ``winner`` column, turn ``elim`` into booleans and reset the index, all in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``home``, ``away``, ``home_score``, ``away_score`` and ``elim``
        columns. The frame is modified in place.

    Returns
    -------
    None
    """
    home_won = df["home_score"] > df["away_score"]
    away_won = df["home_score"] < df["away_score"]
    # Rows where neither comparison holds (equal or missing scores) keep NaN as the winner.
    df["winner"] = np.where(
        home_won,
        df["home"],
        np.where(away_won, df["away"], np.nan),
    )
    # "Y" -> True, missing -> False. Any other value is not in the mapping and becomes NaN.
    df["elim"] = df["elim"].map({np.nan: False, "Y": True})
    # reset_index returns a new frame by default; inplace=True is needed for the
    # caller's frame to actually get a fresh 0..n-1 index.
    df.reset_index(drop=True, inplace=True)

In [34]:
def add_market_cols_inplace(df, spread_width=0.04):
    """Add implied-probability, bid/ask and PnL columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``home``, ``away``, ``winner``, ``home_odds`` and ``away_odds``
        columns. The frame is modified in place.
    spread_width : float, default 0.04
        Total width of the synthetic bid/ask spread placed symmetrically around ``mid``.

    Returns
    -------
    None

    Notes
    -----
    Columns written, in order: ``recip_home_bid``, ``recip_home_ask``, ``mid``,
    ``home_bid``, ``home_ask``, ``home_win``, ``away_win``, ``pro_home_pnl``,
    ``anti_home_pnl``.
    """
    half_spread = spread_width / 2

    df["recip_home_bid"] = 1 / df["home_odds"]
    df["recip_home_ask"] = 1 - 1 / df["away_odds"]
    df["mid"] = (df["recip_home_bid"] + df["recip_home_ask"]) / 2
    df["home_bid"] = df["mid"] - half_spread
    df["home_ask"] = df["mid"] + half_spread

    # A row whose winner is NaN compares unequal to both teams, so both flags are False there.
    df["home_win"] = df["winner"] == df["home"]
    df["away_win"] = df["winner"] == df["away"]

    # Backing the home side: pay home_ask, collect 1 if home wins.
    df["pro_home_pnl"] = df["home_win"].astype(float) - df["home_ask"]
    # Opposing the home side: collect home_bid, pay 1 unless the away side wins.
    # Using "not away_win" rather than "home_win" means a row with no winner
    # counts as a loss on this side as well.
    df["anti_home_pnl"] = df["home_bid"] - (~df["away_win"]).astype(float)

In [6]:
def get_home_away_probs_given_lock(df, lock_index, iters=1000):
    """Simulate the rest of the season and estimate title chances for one game's two teams.

    Results in rows up to and including ``lock_index`` are taken as given. Every later
    row gets a coin-flip winner on each iteration. The last 9 rows of ``df`` are left
    out of the ladder.

    Parameters
    ----------
    df : pandas.DataFrame
        One season with ``home``, ``away`` and ``winner`` columns and an index of
        0, 1, ..., n-1. A NaN ``winner`` in a locked row is treated as a draw.
        The frame is not modified.
    lock_index : int
        Index label of the game of interest; rows with index <= this value are locked.
    iters : int, default 1000
        Number of simulated seasons.

    Returns
    -------
    tuple of float
        ``(home_prob, away_prob)`` for the home and away teams of the game at
        ``lock_index``, averaged over the iterations.
    """
    finals_games = 9  # trailing rows that never contribute to the ladder

    locked = np.asarray(df.index <= lock_index)
    home_teams = df["home"].to_numpy(dtype=object)
    away_teams = df["away"].to_numpy(dtype=object)
    known_winners = df["winner"].to_numpy(dtype=object)
    home = df.loc[lock_index, "home"]
    away = df.loc[lock_index, "away"]

    # Every team that appears in a ladder game gets a ladder row, including teams that
    # end a simulation with zero wins (value_counts alone would leave them out).
    ladder_games = df.iloc[:-finals_games]
    teams = pd.Index(pd.concat([ladder_games["home"], ladder_games["away"]]).unique())

    # One ladder point per team for each locked game that has no winner.
    locked_draws = df[locked & df["winner"].isna().to_numpy()]
    draw_points = (
        pd.concat([locked_draws["home"], locked_draws["away"]])
        .value_counts()
        .reindex(teams, fill_value=0)
    )

    home_prob, away_prob = 0, 0
    for _ in range(iters):
        # Coin-flip a winner for every row, then keep the real result where locked.
        home_wins_flip = np.random.uniform(0, 1, len(df)) > 0.5
        sim_winners = np.where(
            locked,
            known_winners,
            np.where(home_wins_flip, home_teams, away_teams),
        )
        # Two points per win; value_counts skips the NaN winners of locked draws.
        wins = pd.Series(sim_winners[:-finals_games]).value_counts().reindex(teams, fill_value=0)
        ladder = wins * 2 + draw_points
        ladder = ladder + np.random.uniform(0, 0.01, len(ladder))  # noise to simulate tiebreaking
        ladder = ladder.sort_values(ascending=False)

        # Finals are not simulated: the top four are each credited 3/16 and places
        # five to eight 1/16 (these eight weights sum to 1).
        top_four = ladder.index[:4]
        next_four = ladder.index[4:8]
        home_prob += 3/16 if home in top_four else (1/16 if home in next_four else 0)
        away_prob += 3/16 if away in top_four else (1/16 if away in next_four else 0)

    return home_prob / iters, away_prob / iters

In [7]:
def get_cli_for_index(df, game_index, iters=1000):
    """Return ``(home_cli, away_cli)`` for the game at ``game_index``.

    Each value is the team's simulated probability when it wins this game minus its
    probability when it loses, as reported by ``get_home_away_probs_given_lock``.
    Works on a copy of ``df``; the home-win scenario is simulated before the away-win one.
    """
    print(f"\tRunning on index {game_index}")
    scenario_df = df.copy()

    probs_if_winner = {}
    for side in ("home", "away"):
        # Force this side to be the winner of the game, then simulate everything after it.
        scenario_df.loc[game_index, "winner"] = scenario_df.loc[game_index, side]
        probs_if_winner[side] = get_home_away_probs_given_lock(
            scenario_df, game_index, iters=iters
        )

    home_when_home_wins, away_when_home_wins = probs_if_winner["home"]
    home_when_away_wins, away_when_away_wins = probs_if_winner["away"]

    home_cli = home_when_home_wins - home_when_away_wins
    away_cli = away_when_away_wins - away_when_home_wins
    return home_cli, away_cli

In [14]:
def get_full_year_df(full_df, year, iters=1000):
    """Build one season's processed frame with ``home_cli`` / ``away_cli`` columns attached.

    Rows whose ``Date`` string starts with ``year`` are selected, sorted by date and
    kick-off, renamed via ``rename_dict`` and cut down to the renamed columns. The
    winner and market columns are then added, and a CLI pair is computed for every
    game except the last 9 rows, which end up with NaN in the CLI columns.
    """
    print(f"Processing {year}")

    in_season = full_df["Date"].str[:4] == str(year)
    ordered = full_df[in_season].sort_values(by=["Date", "Kick-off (local)"])
    kept_columns = list(rename_dict.values())
    year_df = ordered.rename(columns=rename_dict).copy().reset_index()[kept_columns]

    prelim_processing_inplace(year_df)
    add_market_cols_inplace(year_df)

    cli_rows = []
    for game_index in range(len(year_df) - 9):
        cli_rows.append(get_cli_for_index(year_df, game_index, iters=iters))
    cli_df = pd.DataFrame(cli_rows, columns=["home_cli", "away_cli"])

    return pd.concat([year_df, cli_df], axis=1)

In [46]:
# Per-season result frames keyed by year; filled by the loop in the next cell.
# Re-running this cell rebinds the name to an empty dict, so already-processed
# seasons would be computed again.
year_df_dict = {}

In [47]:
# Seasons are processed one at a time and stored in year_df_dict, so a failure in one
# year does not throw away the years that already finished.
for year in range(2009, 2025):  # 2025 is left out because it is the current season
    if year not in year_df_dict:
        # 10k iterations per game to bring the simulation variance down
        wide_df = year_df_dict[year] = get_full_year_df(df, year, iters=10000)
    else:
        print(f"{year} already processed")

Processing 2009
	Running on index 0
	Running on index 1
	Running on index 2
	Running on index 3
	Running on index 4
	Running on index 5
	Running on index 6
	Running on index 7
	Running on index 8
	Running on index 9
	Running on index 10
	Running on index 11
	Running on index 12
	Running on index 13
	Running on index 14
	Running on index 15
	Running on index 16
	Running on index 17
	Running on index 18
	Running on index 19
	Running on index 20
	Running on index 21
	Running on index 22
	Running on index 23
	Running on index 24
	Running on index 25
	Running on index 26
	Running on index 27
	Running on index 28
	Running on index 29
	Running on index 30
	Running on index 31
	Running on index 32
	Running on index 33
	Running on index 34
	Running on index 35
	Running on index 36
	Running on index 37
	Running on index 38
	Running on index 39
	Running on index 40
	Running on index 41
	Running on index 42
	Running on index 43
	Running on index 44
	Running on index 45
	Running on index 46
	Runnin

Note that this is the raw CLI: it has not been rescaled so that a round 1 game has a CLI of 1.

In [48]:
# Stack the per-season frames into one table. Each season frame is labelled 0..n-1,
# so ignore_index=True gives the combined frame unique row labels instead of repeating them.
wide_df = pd.concat(year_df_dict.values(), ignore_index=True)

In [50]:
# Recompute the market / PnL columns on the combined frame. get_full_year_df already
# calls add_market_cols_inplace on each season, so this overwrites those columns using
# the function as it is currently defined.
add_market_cols_inplace(wide_df)

In [51]:
# Save the combined table; index=False leaves the row labels out of the file.
wide_df.to_csv("data/nrl_with_cli_10k.csv", index=False)