# example code for training and inference on ngram models

## define imports and constants

In [72]:
import sys
from pathlib import Path

# get project root for file paths and add project root to python path so imports work from notebooks folder
PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from ngram import *

# run configuration

# CSV with the season records, resolved relative to the project root
DATA_PATH = str(PROJECT_ROOT.joinpath("data", "allseasons.csv"))

# only used in this notebook to pick the results sub-folder: results/cluster_<CLUSTERS>/
CLUSTERS = 5

# key into tiered_teams: the tier whose records the models are trained on for this run
TIER = "tier_4"

# exclusive bounds on a team's season win total; only teams strictly between the
# two are evaluated, and both values are also embedded in the results file name
BOTTOM_THRESHOLD = 58
TOP_THRESHOLD = 82

## load and peek data

training data (the list of season records passed to the `train_*` functions) is expected to be in the format:
<pre>
[
    [0, 1, ...], # each list is expected to contain 82 entries of 0 or 1 (1 = win, 0 = loss)
    [0, 0, 1, ...],
    # can contain as many lists as needed
]
</pre>

In [73]:
# real_raw_data is indexed by team + season strings (e.g. "ATL" + "2018"),
# so a specific team and season can be looked up directly
real_raw_data = read_data(DATA_PATH)

# seasons kept out of training; their records are collected here instead
test_data = {"2018": [], "2024": []}

# training records bucketed by season win total, tier_0 (fewest wins) .. tier_4 (most)
tiered_teams = {f"tier_{level}": [] for level in range(5)}

# (exclusive lower bound on wins, tier name), checked from the best tier down;
# anything that clears no bound lands in tier_0
tier_floors = ((59, "tier_4"), (51, "tier_3"), (42, "tier_2"), (30, "tier_1"))

for team_season in real_raw_data:
    data = real_raw_data[team_season]

    # first held-out season label that appears in the key, if any
    held_out = next((season for season in test_data if season in team_season), None)
    if held_out is not None:
        test_data[held_out].append(data)
        continue

    wins = sum(data)
    tier_name = next((name for floor, name in tier_floors if wins > floor), "tier_0")
    tiered_teams[tier_name].append(data)

# peek at the first two records of every tier plus the tier sizes
print("real tiered sample: {")
for tier, records in tiered_teams.items():
    print(tier)
    print(records[:2])
    print("len:", len(records))
print("}")

# note: despite its name this counts team-season records across all tiers, not games
total_games = sum(len(records) for records in tiered_teams.values())
print(f"includes records from {total_games} teams")

real tiered sample: {
tier_0
[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1]]
len: 168
tier_1
[[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1], [1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0

## train models using real data

In [74]:
# records of the tier selected by TIER; all three models are fit on this same list
real_data = tiered_teams[TIER]

# each print ends with an extra newline so the models stay visually separated
unigram_model = train_unigram(real_data)
print("unigram model cpts", unigram_model, end="\n\n")

bigram_model = train_bigram(real_data)
print("bigram model cpts", bigram_model, end="\n\n")

trigram_model = train_trigram(real_data)
print("trigram model cpts", trigram_model)

Processing team records: 100%|██████████| 29/29 [00:00<00:00, 120311.39it/s]


completed training unigram model
unigram model cpts {'initial': {1: 0.7728228859907447, 0: 0.22717711400925536}}



Processing team records: 100%|██████████| 29/29 [00:00<00:00, 87886.43it/s]


completed training bigram model
bigram model cpts {'initial': {1: 0.6551724137931034, 0: 0.3448275862068966}, 'transition': {1: {1: 0.777838727372463, 0: 0.22216127262753702}, 0: {1: 0.7619047619047619, 0: 0.23809523809523808}}}



Processing team records: 100%|██████████| 29/29 [00:00<00:00, 66832.32it/s]

completed training trigram model
trigram model cpts {'initial': {'<start>': {1: 0.6551724137931034, 0: 0.3448275862068966}, 1: {1: 0.7368421052631579, 0: 0.2631578947368421}, 0: {1: 0.8, 0: 0.2}}, 'transition': {1: {1: {1: 0.7773826458036984, 0: 0.22261735419630158}, 0: {1: 0.7732997481108312, 0: 0.22670025188916876}}, 0: {1: {1: 0.7814070351758794, 0: 0.2185929648241206}, 0: {1: 0.7203389830508474, 0: 0.2796610169491525}}}}





## infer and measure performance using models

In [76]:
# Season to evaluate against, taken from the real data ("2018" and "2024" are the
# seasons the loading cell keeps out of the training tiers).
# generate_season_data() can be used instead to get a randomly generated season.
SEASON = "2018"

# team abbreviations; each is concatenated with SEASON to form a real_raw_data key
TEAMS = (
    "ATL BOS BRK CHI CHO CLE DAL DEN DET GSW "
    "HOU IND LAC LAL MEM MIA MIL MIN NOP NYK "
    "OKC ORL PHI PHO POR SAC SAS TOR UTA WAS"
).split()

def run_for_team_season(team, season, f):
    """Score the unigram, bigram and trigram models against one team's season.

    The actual record is looked up as ``real_raw_data[team + season]``. For each
    model a predicted season is produced by the matching ``infer_*_season``
    function and compared with the actual record via ``sequence_accuracy``.
    The actual record, every prediction and every accuracy are written to ``f``.

    Relies on the notebook-level ``real_raw_data``, ``unigram_model``,
    ``bigram_model`` and ``trigram_model``.

    Args:
        team: team abbreviation, e.g. an entry of ``TEAMS``.
        season: season label string appended to ``team`` to form the lookup key.
        f: open, writable text file object that receives the report lines.

    Returns:
        dict mapping "unigram", "bigram" and "trigram" to the value returned by
        ``sequence_accuracy`` for that model.
    """
    actual_season = real_raw_data[team + season]
    f.write(f"{team} {season} actual season {actual_season}\n")
    f.write("\n")

    # (label, inference function, trained model) in the order they are reported
    runners = (
        ("unigram", infer_unigram_season, unigram_model),
        ("bigram", infer_bigram_season, bigram_model),
        ("trigram", infer_trigram_season, trigram_model),
    )

    model_stats = {}
    for model_type, infer_season, model in runners:
        # inference only receives the model; the actual season is used for scoring alone
        predictions = infer_season(model)
        f.write(f"{model_type} predictions {predictions}\n")

        accuracy = sequence_accuracy(actual_season, predictions)
        f.write(f"{model_type} model accuracy: {accuracy}\n")
        f.write("\n")

        model_stats[model_type] = accuracy

    return model_stats

# per-team accuracies: {team: {"unigram": acc, "bigram": acc, "trigram": acc}}
team_stats = {}
file_path = f"results/cluster_{CLUSTERS}/ngram_results_{TIER}_{BOTTOM_THRESHOLD}_{TOP_THRESHOLD}_{SEASON}.txt"

# open(..., "w") does not create missing parent folders, so make sure the
# results/cluster_<N>/ directory exists before writing (no-op if it already does)
Path(file_path).parent.mkdir(parents=True, exist_ok=True)

with open(file_path, "w") as f:
    for team in TEAMS:
        wins = sum(real_raw_data[team + SEASON])
        # only evaluate teams whose win total lies strictly inside the threshold window
        if BOTTOM_THRESHOLD < wins < TOP_THRESHOLD:
            team_stats[team] = run_for_team_season(team, SEASON, f)

    averages = {
        "unigram": 0,
        "bigram": 0,
        "trigram": 0
    }

    f.write("-" * 40 + "\n")
    if team_stats:
        # accumulate in TEAMS order, then divide by the number of evaluated teams
        for team in team_stats:
            for model_type in averages:
                averages[model_type] += team_stats[team][model_type]

        for key in averages:
            averages[key] /= len(team_stats)

        f.write(f"averages: {averages}")
    else:
        # no team fell inside the win window: dividing by len(team_stats) would raise
        # ZeroDivisionError, so record the situation instead of reporting fake averages
        averages = {}
        f.write("averages: no teams matched the win thresholds")
        print(
            f"warning: no {SEASON} team has a win total strictly between "
            f"{BOTTOM_THRESHOLD} and {TOP_THRESHOLD}; no averages computed"
        )

print(f"wrote results to {file_path}")

wrote results to results/cluster_5/ngram_results_tier_4_58_82_2018.txt
