# example code for training and inference on threshold models

## define imports and constants

In [None]:
import sys
from pathlib import Path
import numpy as np
from tqdm import tqdm

# Resolve the project root (the parent of the current working directory) and
# put it on sys.path so the project-local `threshold` module can be imported
# when the kernel is started from the notebooks folder.
# NOTE: Path("..") is relative to the kernel's working directory, so this
# assumes the notebook is run from a direct child of the project root.
PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Must stay below the sys.path tweak above. Later cells call get_ff_scores,
# infer_threshold_season and sequence_accuracy without defining them, so they
# are presumably provided by this star import.
# TODO(review): import those names explicitly to make the dependency visible.
from threshold import *

# constants
DATA_PATH = str(PROJECT_ROOT / "data" / "allseasons.csv")  # CSV handed to get_ff_scores by load_data

## load data

In [2]:
# Win-total cutoffs separating the tiers for each supported cluster count.
# A season belongs to tier_k when its win total is strictly greater than
# exactly k of the (ascending) cutoffs.
_TIER_WIN_CUTOFFS = {
    2: (37.5,),
    3: (30.5, 45),
    4: (30, 42, 52.5),
    5: (30, 42, 51, 59),
}

# Team-seasons whose key contains one of these strings are kept out of the
# tiered pools (they are the seasons reserved for testing).
_HELD_OUT_SEASONS = ("2018", "2024")


def load_data(clusters):
    """Load every team-season and pool the non-held-out ones into win tiers.

    Args:
        clusters: Number of tiers to split seasons into. Must be one of the
            keys of ``_TIER_WIN_CUTOFFS`` (2, 3, 4 or 5).

    Returns:
        A ``(team_ff_scores, tiered_teams)`` tuple.

        * ``team_ff_scores`` is whatever ``get_ff_scores(DATA_PATH)`` returns,
          unmodified (held-out seasons included). It is iterated by key and
          each value is a sequence of per-game records whose element ``[1]``
          is summed to obtain the season's win total.
        * ``tiered_teams`` maps ``"tier_0"`` .. ``"tier_{clusters-1}"`` to a
          flat list of the game records of every season assigned to that tier.

    Raises:
        ValueError: If ``clusters`` has no cutoffs defined. Previously such a
            value silently produced tiers that were all empty.
    """
    if clusters not in _TIER_WIN_CUTOFFS:
        supported = sorted(_TIER_WIN_CUTOFFS)
        raise ValueError(
            f"unsupported number of clusters: {clusters!r} (supported: {supported})"
        )
    cutoffs = _TIER_WIN_CUTOFFS[clusters]

    # Keyed by team + season (e.g. "ATL2024"), so a specific team-season can
    # be looked up directly in the returned mapping.
    team_ff_scores = get_ff_scores(DATA_PATH)
    tiered_teams = {f"tier_{i}": [] for i in range(clusters)}

    for team_season in team_ff_scores:
        # Held-out seasons must not leak into the pools the model is fit on.
        if any(season in team_season for season in _HELD_OUT_SEASONS):
            continue

        games = team_ff_scores[team_season]
        wins = sum(game[1] for game in games)
        # Number of cutoffs exceeded == tier index (cutoffs are ascending).
        tier_index = sum(wins > cutoff for cutoff in cutoffs)
        tiered_teams[f"tier_{tier_index}"].extend(games)

    return team_ff_scores, tiered_teams

## infer and measure performance using models

In [3]:
def run_for_team_season(team, season, clusters, team_ff_scores, tiered_teams, f):
    """Run threshold inference for one team-season and log the result to ``f``.

    Args:
        team: Team abbreviation; concatenated with ``season`` to form the key
            into ``team_ff_scores``.
        season: Season string, e.g. ``"2024"``.
        clusters: Number of tiers, forwarded to ``infer_threshold_season``.
        team_ff_scores: Mapping from ``team + season`` to that season's game
            records; element ``[0]`` of a record is used as the ff score and
            element ``[1]`` as the actual win value.
        tiered_teams: Tiered game records, forwarded to
            ``infer_threshold_season``.
        f: Writable text handle that receives the actual results, the
            predictions and the accuracy.

    Returns:
        The value returned by ``sequence_accuracy`` for the actual vs.
        predicted win sequences.
    """
    emit = f.write
    games = team_ff_scores[team + season]
    win_flags = [game[1] for game in games]
    ff_scores = [game[0] for game in games]

    emit(f"{team} {season} actual season {win_flags}\n")
    emit("\n")

    # The season's total wins are passed along with the per-game ff scores.
    predicted_wins, tier = infer_threshold_season(
        sum(win_flags), clusters, tiered_teams, ff_scores
    )
    emit(f"{tier} threshold predictions {predicted_wins}\n")

    accuracy = sequence_accuracy(win_flags, predicted_wins)
    emit(f"{tier} threshold accuracy: {accuracy}\n")
    emit("\n")

    return accuracy

In [4]:
# Season to run inference on. load_data leaves every team-season whose key
# contains "2018" or "2024" out of the tiered pools, so use one of those two.
SEASON = "2024"

# Team abbreviations. Each one is concatenated with SEASON to build the lookup
# key into team_ff_scores (e.g. "ATL" + "2024").
TEAMS = [
    "ATL", "BOS", "BRK", "CHI", "CHO", "CLE",
    "DAL", "DEN", "DET", "GSW", "HOU", "IND",
    "LAC", "LAL", "MEM", "MIA", "MIL", "MIN",
    "NOP", "NYK", "OKC", "ORL", "PHI", "PHO",
    "POR", "SAC", "SAS", "TOR", "UTA", "WAS",
]

In [5]:
# Win-total window of every tier, keyed by the number of clusters. Entry i of
# a list is [bottom, top] for tier_i. The inference loop keeps a team when
# bottom < wins < top (both ends exclusive), which is why adjacent windows
# overlap by one win.
# NOTE(review): these windows do not line up exactly with the cutoffs used in
# load_data. Example for 2 clusters: load_data pools a 37-win season into
# tier_0 (37 > 37.5 is false) while the window below files a 37-win team under
# tier_1 (36 < 37 < 82). Confirm which boundary is intended.
cluster_win_thresholds = {
    2: [
        [0, 37],
        [36, 82],
    ],
    3: [
        [0, 31],
        [30, 45],
        [44, 82],
    ],
    4: [
        [0, 30],
        [29, 42],
        [41, 52],
        [51, 82],
    ],
    5: [
        [0, 30],
        [29, 42],
        [41, 51],
        [50, 59],
        [58, 82],
    ],
}

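Run inference with the threshold model for every cluster count and every tier. The results for each tier are written to `results/cluster_<k>_threshold/threshold_results_tier_<t>_<season>.txt`.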

In [6]:
# For every cluster count, rebuild the tiered pools and then, tier by tier,
# run inference for each team whose SEASON win total falls inside that tier's
# window. One results file is written per (cluster count, tier).
# tqdm takes the bar length from len() of the iterable, so no hard-coded total.
for cluster in tqdm(cluster_win_thresholds, desc="Inferring on different cluster numbers"):
    CLUSTERS = cluster
    print(cluster)
    tiers = len(cluster_win_thresholds[cluster])
    team_ff_scores, tiered_teams = load_data(CLUSTERS)
    for t in range(tiers):
        TIER = f"tier_{t}"
        print(TIER)
        # Both bounds are exclusive in the membership test below.
        BOTTOM_THRESHOLD, TOP_THRESHOLD = cluster_win_thresholds[cluster][t]

        file_path = f"results/cluster_{CLUSTERS}_threshold/threshold_results_{TIER}_{SEASON}.txt"
        # open(..., "w") does not create missing directories; make sure the
        # per-cluster results folder exists so a fresh checkout can run this.
        Path(file_path).parent.mkdir(parents=True, exist_ok=True)
        avg_accuracy = 0
        num_teams_in_tier = 0

        with open(file_path, "w") as f:
            for team in tqdm(TEAMS, desc=f"Inferring for total clusters: {CLUSTERS}, tier: {TIER}, {SEASON} season"):
                actual_season = team_ff_scores[team + SEASON]
                wins = sum([a[1] for a in actual_season])
                if BOTTOM_THRESHOLD < wins < TOP_THRESHOLD:
                    acc = run_for_team_season(team, SEASON, CLUSTERS, team_ff_scores, tiered_teams, f)
                    avg_accuracy += acc
                    num_teams_in_tier += 1

            # Divide by 1 when the tier is empty so the average is reported as
            # 0.0 instead of raising ZeroDivisionError.
            avg_accuracy /= (num_teams_in_tier if num_teams_in_tier > 0 else 1)
            f.write("-" * 40 + "\n")
            f.write(f"average accuracy: {avg_accuracy}")

Inferring on different cluster numbers:   0%|          | 0/4 [00:00<?, ?it/s]

2
tier_0


Inferring for total clusters: 2, tier: tier_0, 2024 season: 100%|██████████| 30/30 [00:08<00:00,  3.53it/s]


tier_1


Inferring for total clusters: 2, tier: tier_1, 2024 season: 100%|██████████| 30/30 [00:36<00:00,  1.21s/it]
Inferring on different cluster numbers:  25%|██▌       | 1/4 [00:45<02:15, 45.04s/it]

3
tier_0


Inferring for total clusters: 3, tier: tier_0, 2024 season: 100%|██████████| 30/30 [00:02<00:00, 11.85it/s]


tier_1


Inferring for total clusters: 3, tier: tier_1, 2024 season: 100%|██████████| 30/30 [00:08<00:00,  3.44it/s]


tier_2


Inferring for total clusters: 3, tier: tier_2, 2024 season: 100%|██████████| 30/30 [00:09<00:00,  3.09it/s]
Inferring on different cluster numbers:  50%|█████     | 2/4 [01:06<01:01, 30.97s/it]

4
tier_0


Inferring for total clusters: 4, tier: tier_0, 2024 season: 100%|██████████| 30/30 [00:02<00:00, 14.16it/s]


tier_1


Inferring for total clusters: 4, tier: tier_1, 2024 season: 100%|██████████| 30/30 [00:05<00:00,  5.70it/s]


tier_2


Inferring for total clusters: 4, tier: tier_2, 2024 season: 100%|██████████| 30/30 [00:05<00:00,  5.67it/s]


tier_3


Inferring for total clusters: 4, tier: tier_3, 2024 season: 100%|██████████| 30/30 [00:00<00:00, 55.43it/s]
Inferring on different cluster numbers:  75%|███████▌  | 3/4 [01:19<00:22, 22.93s/it]

5
tier_0


Inferring for total clusters: 5, tier: tier_0, 2024 season: 100%|██████████| 30/30 [00:02<00:00, 13.65it/s]


tier_1


Inferring for total clusters: 5, tier: tier_1, 2024 season: 100%|██████████| 30/30 [00:05<00:00,  5.61it/s]


tier_2


Inferring for total clusters: 5, tier: tier_2, 2024 season: 100%|██████████| 30/30 [00:04<00:00,  7.34it/s]


tier_3


Inferring for total clusters: 5, tier: tier_3, 2024 season: 100%|██████████| 30/30 [00:00<00:00, 33.69it/s]


tier_4


Inferring for total clusters: 5, tier: tier_4, 2024 season: 100%|██████████| 30/30 [00:00<00:00, 508.15it/s]
Inferring on different cluster numbers: 100%|██████████| 4/4 [01:32<00:00, 23.07s/it]
