# example code for training and inference on ngram models

## define imports and constants

In [1]:
import sys
from pathlib import Path

# The notebook runs from the notebooks folder, so the project root is its parent directory.
# Put the root at the front of sys.path (only once) so project modules can be imported from here.
PROJECT_ROOT = Path("..").resolve()
_project_root_entry = str(PROJECT_ROOT)
if _project_root_entry not in sys.path:
    sys.path.insert(0, _project_root_entry)

from ngram import *

# constants
# location of the results CSV that read_data loads, kept as a plain string
DATA_PATH = str(PROJECT_ROOT.joinpath("data", "scrape_results_2019_2022.csv"))

## load and peek data

data is expected to be in format:
<pre>
[
    [0, 1, ...], # one season per list, one entry per game: 1 = win, 0 = loss (82 entries for a full season; seasons in the real data can be shorter, e.g. 63-64 games)
    [0, 0, 1, ...],
    # can contain as many lists as needed
]
</pre>

In [2]:
# real_raw_data keeps the records exactly as read_data returns them, so a specific
# team and season can still be looked up by key
real_raw_data = read_data(DATA_PATH)

# training only needs the season records themselves, not the keys they are stored under
real_data = [*real_raw_data.values()]
# 50 randomly generated season-long records
mock_data = [generate_season_data() for _ in range(50)]

print("mock data sample: [")
for season in mock_data[:2]:
    print(season)
print("...]", end="\n\n")

print("real data sample: [")
for season in real_data[:2]:
    # show each real record followed by its number of games
    print(season, f"len: {len(season)}", sep="\n")
print("...]")

print("all data", len(real_data))

mock data sample: [
[1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1]
[1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0]
...]

real data sample: [
[1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1]
len: 64
[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0]
len: 63
...]
all data 90


## train models using real data

In [3]:
# fit each n-gram model on the real season records and print the tables it learned;
# the first two printouts are followed by a blank line to separate the models
unigram_model = train_unigram(real_data)
print(f"unigram model cpts {unigram_model}", end="\n\n")

bigram_model = train_bigram(real_data)
print(f"bigram model cpts {bigram_model}", end="\n\n")

trigram_model = train_trigram(real_data)
print(f"trigram model cpts {trigram_model}")

Processing team records: 100%|██████████| 90/90 [00:00<00:00, 163343.73it/s]


completed training unigram model
unigram model cpts {'initial': {1: 0.5, 0: 0.5}}



Processing team records: 100%|██████████| 90/90 [00:00<00:00, 97315.64it/s]


completed training bigram model
bigram model cpts {'initial': {1: 0.5, 0: 0.5}, 'transition': {1: {1: 0.5390793945010812, 0: 0.4609206054989188}, 0: {1: 0.4609206054989188, 0: 0.5390793945010812}}}



Processing team records: 100%|██████████| 90/90 [00:00<00:00, 75770.24it/s]

completed training trigram model
trigram model cpts {'initial': {'<start>': {1: 0.5, 0: 0.5}, 1: {1: 0.6, 0: 0.4}, 0: {1: 0.4, 0: 0.6}}, 'transition': {1: {1: {1: 0.574245939675174, 0: 0.425754060324826}, 0: {1: 0.5023825731790333, 0: 0.49761742682096666}}, 0: {1: {1: 0.49591280653950953, 0: 0.5040871934604905}, 0: {1: 0.4271619268717353, 0: 0.5728380731282646}}}}





## infer and measure performance using models

In [4]:
# Evaluation targets: the actual record of every team below is taken from the real data
# for this season. (A mock season could be produced with generate_season_data() instead.)
SEASON = "2021"
# team codes; each one is concatenated with SEASON to form the lookup key into real_raw_data
TEAMS = (
    "ATL BOS BRK CHI CHO CLE DAL DEN DET GSW "
    "HOU IND LAC LAL MEM MIA MIL MIN NOP NYK "
    "OKC ORL PHI PHO POR SAC SAS TOR UTA WAS"
).split()

def run_for_team_season(team, season, f):
    """Evaluate the three trained n-gram models against one team's actual season.

    The actual record is read from ``real_raw_data`` under the key ``team + season``.
    Each of the notebook-level models (``unigram_model``, ``bigram_model``,
    ``trigram_model``) then produces a predicted season, and the actual record, every
    prediction and every accuracy are written to ``f``.

    Args:
        team: team code forming the first half of the lookup key (e.g. "ATL").
        season: season label forming the second half of the lookup key (e.g. "2021").
        f: writable text stream that receives the report.

    Returns:
        dict mapping "unigram", "bigram" and "trigram" (in that order) to the value
        returned by ``sequence_accuracy`` for that model's prediction.

    Raises:
        KeyError: when ``team + season`` is not a key of ``real_raw_data``
            (assuming it is a plain dict).
    """
    actual_season = real_raw_data[team + season]
    f.write(f"{team} {season} actual season {actual_season}\n\n")

    # inference function to call for each supported model type
    season_predictors = {
        "unigram": infer_unigram_season,
        "bigram": infer_bigram_season,
        "trigram": infer_trigram_season,
    }

    def infer_and_eval_model(model_type, model):
        """Predict a season with ``model``, log it to ``f`` and return its accuracy."""
        predict_season = season_predictors.get(model_type)
        if predict_season is None:
            raise ValueError("Invalid model type")

        predictions = predict_season(model)
        f.write(f"{model_type} predictions {predictions}\n")

        # score the prediction against the actual record looked up above
        accuracy = sequence_accuracy(actual_season, predictions)
        f.write(f"{model_type} model accuracy: {accuracy}\n\n")
        return accuracy

    trained_models = {
        "unigram": unigram_model,
        "bigram": bigram_model,
        "trigram": trigram_model,
    }
    # models are evaluated (and reported) in unigram, bigram, trigram order
    return {
        model_type: infer_and_eval_model(model_type, model)
        for model_type, model in trained_models.items()
    }

RESULTS_PATH = "team_season_ngram_results.txt"

# evaluate every team for SEASON; the per-team details go to the text report,
# the per-model accuracies are collected in team_stats
with open(RESULTS_PATH, "w") as f:
    team_stats = {team: run_for_team_season(team, SEASON, f) for team in TEAMS}

# mean accuracy of each model over all teams
# (accumulated team by team with += so the floating point result matches a plain running total)
averages = {}
for model_type in ("unigram", "bigram", "trigram"):
    accuracy_total = 0
    for team in TEAMS:
        accuracy_total += team_stats[team][model_type]
    averages[model_type] = accuracy_total / len(TEAMS)

print(f"averages: {averages}")

print(f"wrote results to {RESULTS_PATH}")

averages: {'unigram': 0.5018167218709223, 'bigram': 0.4977416440831075, 'trigram': 0.5160744755595703}
wrote results to team_season_ngram_results.txt
