In [5]:
import n_gram
import generator
import numpy as np
import matplotlib.pyplot as plt
import json

In [25]:
def intrinsic_eval_set(n_gram_list, test_corpus):
    """Compute the perplexity of every model in ``n_gram_list`` on one corpus.

    Args:
        n_gram_list: iterable of model objects; each must expose an ``ndim``
            attribute (printed as the model's order) and a
            ``perplexity(corpus)`` method.
        test_corpus: corpus handed unchanged to every ``perplexity`` call; its
            first 20 items are echoed once per model as a progress check.

    Returns:
        np.ndarray with one perplexity per model, in input order.
    """
    print("starting intrinsic eval for set")
    scores = []
    for model in n_gram_list:
        print("N-gram of order: ", model.ndim)
        print(test_corpus[:20])
        scores.append(model.perplexity(test_corpus))
    return np.array(scores)

In [32]:
def intrinsic_eval_all(paths_train, paths_test, vocab_list):
    """Build and score the n-gram models for every dataset version.

    For each (train path, test path, vocab path) triple the two corpora are
    read and split on whitespace, the vocabulary is loaded from JSON, the
    models are built with ``generate_n_grams(train_tokens, 4, vocab)`` and
    then scored on the test tokens with ``intrinsic_eval_set``.

    Args:
        paths_train: paths of the training corpora, one per dataset version.
        paths_test: paths of the test corpora, aligned with ``paths_train``.
        vocab_list: paths of the JSON vocabulary files, aligned likewise.
            The three sequences are zipped, so iteration stops at the
            shortest one.

    Returns:
        Tuple ``(all_perplexities, all_n_grams)``: a NumPy array built from
        the per-version results of ``intrinsic_eval_set`` and the list of
        per-version model lists returned by ``generate_n_grams``.
    """
    print("starting intrinsic evaluation")
    all_perplexities = []
    all_n_grams = []
    for path_train, path_test, path_vocab in zip(paths_train, paths_test, vocab_list):
        # Read the corpora with the same explicit encoding as the vocabulary;
        # relying on the locale default could decode the tokens differently
        # from the vocab entries on platforms whose default is not UTF-8.
        with open(path_train, "r", encoding="utf-8") as f:
            train_tokens = f.read().split()

        with open(path_test, "r", encoding="utf-8") as f:
            test_tokens = f.read().split()

        # Kept in its own name so the path is not overwritten by the loaded object.
        with open(path_vocab, "r", encoding="utf-8") as f:
            vocab = json.load(f)

        # NOTE(review): generate_n_grams is not defined in the visible cells;
        # presumably it is provided elsewhere in the notebook/project — confirm.
        our_n_grams = generate_n_grams(train_tokens, 4, vocab)
        # unigram always idx 0, bigram always idx 1, etc.
        perplexities = intrinsic_eval_set(our_n_grams, test_tokens)
        all_perplexities.append(perplexities)
        all_n_grams.append(our_n_grams)
    all_perplexities = np.array(all_perplexities)
    return all_perplexities, all_n_grams


In [27]:
def plot_perplexities(all_perplexities, fig_path):
    """Plot perplexity against n-gram order, one line per dataset version.

    Args:
        all_perplexities: array whose rows are indexed by ``shape[0]``; row
            ``i`` is plotted against the orders 1-4 and labelled with the
            i-th entry of "Best", "2nd", "3rd".
        fig_path: destination handed to ``plt.savefig``.
    """
    n_grams = [1, 2, 3, 4]
    versions = ["Best", "2nd", "3rd"]
    plt.figure(figsize=(8,5))
    for i in range(all_perplexities.shape[0]):
        plt.plot(n_grams, all_perplexities[i], marker='o', label=versions[i])

    # Decorate and save once, after every line has been drawn. Doing this
    # inside the loop re-wrote the file per row and saved nothing for an
    # input with zero rows.
    plt.xticks(n_grams)
    plt.xlabel("N-gram Order")
    plt.ylabel("Perplexity")
    plt.title("N-gram Perplexity Across Dataset Versions")
    plt.legend()
    plt.grid(True)
    plt.savefig(fig_path)

In [28]:

def plot_prplx_diff(all_perplexities, fig_path):
    """Plot each later version's perplexity as a difference to the first row.

    Row 0 of ``all_perplexities`` is the baseline ("Best"). Every following
    row is drawn as its element-wise difference to that baseline over the
    n-gram orders 1-4, labelled "2nd" and "3rd" in order, with a dashed zero
    line for reference. The figure is written to ``fig_path``.
    """
    orders = [1, 2, 3, 4]
    labels = ["2nd", "3rd"]

    # subtract the baseline row from all remaining rows
    baseline = all_perplexities[0]
    deltas = all_perplexities[1:] - baseline

    plt.figure(figsize=(8,5))
    for row, delta in enumerate(deltas):
        plt.plot(orders, delta, marker='o', label=labels[row])

    plt.xticks(orders)
    plt.xlabel("N-gram Order")
    plt.ylabel("Δ Perplexity (vs. Best)")
    plt.title("Perplexity Difference Relative to Best Merge")
    plt.axhline(0, color="black", linewidth=1, linestyle="--")
    plt.legend()
    plt.grid(True)
    plt.savefig(fig_path)

In [29]:
def extrinsic_eval_all(all_n_grams, all_vocabs, context="cleopatra is my", out_path=None):
    """Generate one sample per n-gram order for every dataset version.

    Args:
        all_n_grams: list of lists of N_gram objects, one inner list per
            dataset version; each object must expose ``n_gram_probs``.
        all_vocabs: sequence indexed in step with ``all_n_grams``; entry ``i``
            is forwarded to ``generator.generate`` for version ``i``.
        context: starting context handed to ``generator.generate``.
        out_path: if truthy, the markdown table is written there; otherwise
            the table is printed.

    Returns:
        ``all_outputs`` where ``all_outputs[i][j]`` is whatever
        ``generator.generate`` returned for version ``i`` and order ``j + 1``.
        The samples are returned even when the optional table cannot be
        produced; in that case a ``RuntimeWarning`` explains why.
    """
    import warnings

    all_outputs = []
    for idx, n_gram_list in enumerate(all_n_grams):
        vocab = all_vocabs[idx]
        # convert objects → dicts expected by generate()
        ngram_dicts = [ng.n_gram_probs for ng in n_gram_list]
        outputs = [
            generator.generate(context, ngram_dicts, order, vocab)
            for order in range(1, len(ngram_dicts) + 1)
        ]
        all_outputs.append(outputs)

    # Optional: to a markdown table. This stays best-effort (the samples are
    # the real result), but a failure is reported instead of being swallowed,
    # so a missing pandas/tabulate install or a column-count mismatch is visible.
    try:
        import pandas as pd
        df = pd.DataFrame(
            all_outputs,
            columns=["Unigram", "Bigram", "Trigram", "4-gram"],
            index=[f"Version {i+1}" for i in range(len(all_outputs))]
        )
        if out_path:
            df.to_markdown(out_path)
        else:
            print(df.to_markdown())
    except Exception as exc:
        warnings.warn(
            f"extrinsic_eval_all: could not render the markdown table ({exc!r}); "
            "returning the generated samples without it.",
            RuntimeWarning,
            stacklevel=2,
        )

    return all_outputs


In [30]:
def extrinsic_eval_all_prob_fine(all_n_grams, all_vocabs, context="cleopatra is my", out_path=None):
    """
    Generate a text sample for every dataset version and every n-gram order.

    Args:
        all_n_grams: list of lists, each inner list = [unigram_model, bigram_model, ...];
            every model must expose ``n_gram_probs``
        all_vocabs: sequence aligned with ``all_n_grams``; entry i is passed to
            ``generator.generate`` for version i
        context: str, starting context for generation
        out_path: optional path; when truthy the samples are saved there as a
            Markdown table

    Returns:
        list of lists:
            result[i][j] = value returned by ``generator.generate`` for
            version i with order j + 1
    """
    print("Starting extrinsic eval")
    samples = []

    for version_no, models in enumerate(all_n_grams, start=1):
        print(f"Next dataset version: {version_no}")
        vocab = all_vocabs[version_no - 1]
        # generate() receives the probability tables, not the model objects
        prob_tables = [model.n_gram_probs for model in models]
        row = []
        for order, _ in enumerate(prob_tables, start=1):
            print(f"  Generating with {order}-gram")
            row.append(generator.generate(context, prob_tables, order, vocab))
        samples.append(row)

    # Nothing to persist unless a destination was given
    if not out_path:
        return samples

    import pandas as pd
    row_labels = [f"Merge {k}" for k in range(1, len(samples) + 1)]
    table = pd.DataFrame(
        samples,
        columns=["Unigram", "Bigram", "Trigram", "4-gram"],
        index=row_labels,
    )
    table.to_markdown(out_path)
    return samples


In [None]:
import json
from pathlib import Path

# Input files for the three dataset versions (best, 2nd-best, 3rd-best merge).
# The three tuples are index-aligned: entry i of each belongs to the same version.
all_train = ("../Shakespeare_best_merge_train.txt", "../Shakespeare_2nd_best_merge_train.txt", "../Shakespeare_3rd_best_merge_train.txt")
all_test = ("../Shakespeare_best_merge_test.txt", "../Shakespeare_2nd_best_merge_test.txt", "../Shakespeare_3rd_best_merge_test.txt")
all_vocab = ("../vocab_best.json", "../vocab_2nd.json", "../vocab_3rd.json")

# The plotting / generation cell further down reads `all_perplexities` and
# `all_n_grams`; intrinsic_eval_all returns exactly that pair, so compute it
# here to keep the notebook runnable top-to-bottom on a fresh kernel.
all_perplexities, all_n_grams = intrinsic_eval_all(all_train, all_test, all_vocab)

   



NameError: name 'intrinsic_eval_all' is not defined

In [5]:
import json
from pathlib import Path
import n_gram

# Load the vocabulary and both corpora of the "best merge" dataset version.
# All three files are opened as UTF-8: the corpora were previously read with
# the locale default, which can decode tokens differently from the vocab.
with open("../vocab_best.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)

with open("../Shakespeare_best_merge_train.txt", "r", encoding="utf-8") as f:
    n_gram_corps_train = f.read().split()

with open("../Shakespeare_best_merge_test.txt", "r", encoding="utf-8") as f:
    n_gram_corps_test = f.read().split()

# Build one model from the training tokens with order argument 4.
best_fourgram = n_gram.N_gram(n_gram_corps_train, 4, vocab)

: 

In [None]:
# Score the model built with order argument 4 on the tokens read from the
# best-merge test file; as the cell's last expression, the returned value is
# what the notebook displays.
best_fourgram.perplexity(n_gram_corps_test)

In [None]:
# Output locations. Create the parent folders up front so savefig / to_markdown
# do not fail on a checkout where they do not exist yet.
fig_path_1 = Path('img') / 'n_gram_perplexities.png'
fig_path_2 = Path('img') / 'n_gram_prplx_diff.png'
out_path_df = Path('n_gram') / 'n_gram_sample.md'
for target in (fig_path_1, fig_path_2, out_path_df):
    target.parent.mkdir(parents=True, exist_ok=True)

plot_perplexities(all_perplexities, fig_path_1)
plot_prplx_diff(all_perplexities, fig_path_2)

# `all_vocab` holds JSON file paths, while the generation helpers are
# documented as taking the vocabularies themselves (lists of subword tokens),
# so load each file before handing it on.
loaded_vocabs = []
for vocab_path in all_vocab:
    with open(vocab_path, "r", encoding="utf-8") as f:
        loaded_vocabs.append(json.load(f))

all_text_output = extrinsic_eval_all(all_n_grams, loaded_vocabs, context="cleopatra is my", out_path=out_path_df)
print(all_text_output)