# example code for training and inference on ngram models

## define imports and constants

In [None]:
import sys
from pathlib import Path

# Resolve the project root (the parent of the current working directory) and put it
# on sys.path so that the `ngram` module can be imported from the notebooks folder.
# NOTE(review): Path("..") is relative to the working directory, so this assumes the
# notebook is executed from the notebooks folder - confirm if it is run from elsewhere.
PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:  # skip the insert when the entry is already present
    sys.path.insert(0, str(PROJECT_ROOT))  # index 0 so the project root is searched first

# presumably supplies read_data, generate_season_data, train_*, infer_*_season and
# sequence_accuracy, which the cells below use without defining - verify against ngram
from ngram import *

# constants
DATA_PATH = str(PROJECT_ROOT / "data" / "allseasons.csv")  # path to the season data CSV, passed to read_data below

## load and peek data

Data is expected to be a list of season records in the following format:
<pre>
[
    [0, 1, ...], # each list is expected to contain 82 entries, each 0 or 1 (1 = win, 0 = loss)
    [0, 0, 1, ...],
    # can contain as many lists as needed
]
</pre>

In [2]:
# Load the real data from DATA_PATH and build a mock data set, then print the
# first two records of each so the format can be checked by eye.
# NOTE(review): the recorded output of this cell shows real_data rows with only
# 2 entries ([0, 0] and [1, 1]) rather than full-season records - confirm that
# read_data parses the CSV as intended.
real_data = read_data(DATA_PATH)
mock_data = [generate_season_data() for _ in range(50)]  # 50 generated season-long records

# peek at the first two mock seasons
print("mock data sample: [")
for season in mock_data[:2]:
    print(season)
print("...]")
print()  # blank line between the two samples

# peek at the first two real seasons
print("real data sample: [")
for season in real_data[:2]:
    print(season)
print("...]")


mock data sample: [
[0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1]
[1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1]
...]

real data sample: [
[0, 0]
[1, 1]
...]


## train models using mock data

In [3]:
# Train one model of each order on the same mock data and print what each
# trainer returns. In the recorded output each model prints as a nested dict of
# probabilities keyed by 'initial' (plus 'transition' for bigram and trigram).
unigram_model = train_unigram(mock_data)
print("unigram model cpts", unigram_model)
print()

bigram_model = train_bigram(mock_data)
print("bigram model cpts", bigram_model)
print()

trigram_model = train_trigram(mock_data)
print("trigram model cpts", trigram_model)

Processing team records: 100%|██████████| 50/50 [00:00<00:00, 69626.56it/s]


completed training unigram model
unigram model cpts {'initial': {1: 0.5151219512195122, 0: 0.4848780487804878}}



Processing team records: 100%|██████████| 50/50 [00:00<00:00, 58826.14it/s]


completed training bigram model
bigram model cpts {'initial': {1: 0.56, 0: 0.44}, 'transition': {1: {1: 0.5114942528735632, 0: 0.4885057471264368}, 0: {1: 0.5178389398572885, 0: 0.48216106014271154}}}



Processing team records: 100%|██████████| 50/50 [00:00<00:00, 40587.42it/s]

completed training trigram model
trigram model cpts {'initial': {'<start>': {1: 0.56, 0: 0.44}, 1: {1: 0.42857142857142855, 0: 0.5714285714285714}, 0: {1: 0.6818181818181818, 0: 0.3181818181818182}}, 'transition': {1: {1: {1: 0.5156993339676499, 0: 0.48430066603235017}, 0: {1: 0.5332671300893744, 0: 0.4667328699106256}}, 0: {1: {1: 0.5094152626362736, 0: 0.4905847373637265}, 0: {1: 0.49732047159699894, 0: 0.5026795284030011}}}}





## infer and measure performance using models

In [4]:
# Reference season that every model's predictions are scored against;
# infer_and_eval_model below reads this notebook-level variable directly.
actual_season = generate_season_data()  # randomly generated season record
print("actual season", actual_season)
print()

def infer_and_eval_model(model_type, model):
    """Run season inference with one trained model and print its score.

    Prints the predictions produced by the inference helper that matches
    ``model_type``, then the value ``sequence_accuracy`` gives when those
    predictions are compared with the notebook-level ``actual_season``,
    followed by a blank line. Nothing is returned.

    Args:
        model_type: One of "unigram", "bigram" or "trigram".
        model: The trained model handed to the matching inference helper.

    Raises:
        ValueError: If ``model_type`` is not one of the three supported names.
    """
    # Each runner is wrapped in a lambda so that only the selected inference
    # helper is looked up and called.
    runners = (
        ("unigram", lambda: infer_unigram_season(model)),
        ("bigram", lambda: infer_bigram_season(model)),
        ("trigram", lambda: infer_trigram_season(model)),
    )
    run_inference = next(
        (runner for name, runner in runners if model_type == name),
        None,
    )
    if run_inference is None:
        raise ValueError("Invalid model type")

    predicted_season = run_inference()
    print(f"{model_type} predictions", predicted_season)

    score = sequence_accuracy(actual_season, predicted_season)
    print(f"{model_type} model accuracy:", score)
    print()

# Pair each model-type label with its trained model, then run inference and
# print the evaluation for each in unigram -> bigram -> trigram order.
models = [("unigram", unigram_model), ("bigram", bigram_model), ("trigram", trigram_model)]
for model_type, model in models:
    infer_and_eval_model(model_type, model)

actual season [1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0]

unigram predictions [1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1]
unigram model accuracy: 0.4634146341463415

bigram predictions [1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1]
bigram model accuracy: 0.5121951219512195

trigram predictions [1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0