### UnsMoothed Unigram & Bigram Language Models (MLE)

1) Loads a tokenized corpus where each line is one review (space-delimited tokens).
2) Builds unsmoothed unigram and bigram counts.
3) Converts counts to MLE probabilities.
4) Shows top-K previews as interactive tables.
5) Saves counts & probabilities to TSV/JSON for downstream use.

📌 Notes
- This is PURE MLE (no smoothing, no UNK). We'll add smoothing & perplexity later.
- For bigrams, using sentence boundary tokens (<s>, </s>) is helpful; toggle via ADD_BOUNDARIES below.

In [1]:
!pip install --upgrade pip
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
from __future__ import annotations

from collections import Counter
from typing import List, Tuple, Dict, Iterable, Set
import os
import json
import math

import pandas as pd

In [3]:
BOS = "<s>"
EOS = "</s>"
UNK = "<unk>"

def trans_lower(text) :
    for i in text:
        i = i.lower()
    return text

def read_corpus(path: str, add_boundaries: bool = False) -> List[List[str]]:
    """
    Reads a tokenized corpus; each line is a review with space-delimited tokens.
    If add_boundaries=True, wraps each line with BOS/EOS.
    Returns: List[List[str]] where each inner list is the token sequence for one line.
    """
    sents: List[List[str]] = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            toks = line.split()
            toks = trans_lower(toks)
            if add_boundaries:
                toks = [BOS] + toks + [EOS]
            sents.append(toks)
    return sents

def iter_corpus(path: str, add_boundaries: bool = False) -> Iterable[List[str]]:
    """
    Memory-light generator version. Yields token lists per line.
    Useful if your corpus is huge.
    """
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            toks = line.split()
            toks = trans_lower(toks)
            if add_boundaries:
                toks = [BOS] + toks + [EOS]
            yield toks

def count_unigrams_bigrams(sents: Iterable[List[str]]) -> Tuple[Counter, Counter]:
    """
    Accumulates unigram and bigram counts from an iterable of token lists.
    """
    uni = Counter()
    bi = Counter()
    for toks in sents:
        uni.update(toks)
        for i in range(1, len(toks)):
            bi[(toks[i-1], toks[i])] += 1
    return uni, bi

def mle_unigram_probs(uni: Counter) -> Dict[str, float]:
    """
    UnsMoothed Unigram MLE: P(w) = count(w) / total_tokens
    """
    total = sum(uni.values())
    return {w: c / total for w, c in uni.items()}

def mle_bigram_probs(uni: Counter, bi: Counter) -> Dict[Tuple[str, str], float]:
    """
    UnsMoothed Bigram MLE: P(w2 | w1) = count(w1, w2) / count(w1)
    Only defined for observed bigrams.
    """
    probs: Dict[Tuple[str, str], float] = {}
    for (w1, w2), c12 in bi.items():
        denom = uni[w1]
        if denom > 0:
            probs[(w1, w2)] = c12 / denom
    return probs

def save_tsv_unigram(path: str, uni_counts: Counter, uni_probs: Dict[str, float]) -> None:
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write("token\tcount\tprob\n")
        for w, c in uni_counts.most_common():
            f.write(f"{w}\t{c}\t{uni_probs.get(w, 0.0):.12g}\n")

def save_tsv_bigram(path: str, bi_counts: Counter, bi_probs: Dict[Tuple[str,str], float]) -> None:
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write("w1\tw2\tcount\tprob\n")
        for (w1, w2), c in sorted(bi_counts.items(), key=lambda kv: (-kv[1], kv[0][0], kv[0][1])):
            f.write(f"{w1}\t{w2}\t{c}\t{bi_probs.get((w1,w2), 0.0):.12g}\n")

def save_json(path: str, obj) -> None:
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False)

In [4]:
# --- Configure paths & options ---
TRAIN_PATH = "train.txt"
OUT_DIR = "/outputs"
ADD_BOUNDARIES = True               # Wrap each line with <s> ... </s> for better bigrams
TOPK = 30                           # How many top items to preview

# --- Run pipeline ---
sents = read_corpus(TRAIN_PATH, add_boundaries=ADD_BOUNDARIES)
uni_counts, bi_counts = count_unigrams_bigrams(sents)
uni_probs = mle_unigram_probs(uni_counts)
bi_probs  = mle_bigram_probs(uni_counts, bi_counts)

In [5]:
# --- Basic stats ---
num_types = len(uni_counts)
num_tokens = sum(uni_counts.values())
num_bigram_types = len(bi_counts)

stats = pd.DataFrame({
    "metric": ["#unigram_types", "#tokens", "#bigram_types", "add_boundaries"],
    "value":  [num_types, num_tokens, num_bigram_types, ADD_BOUNDARIES]
})

In [6]:
# --- Preview top-K ---
df_uni = pd.DataFrame(uni_counts.most_common(TOPK), columns=["token", "count"])
df_uni["prob"] = df_uni["token"].apply(lambda w: uni_probs[w])

df_bi = pd.DataFrame(bi_counts.most_common(TOPK), columns=["bigram", "count"])
df_bi[["w1","w2"]] = pd.DataFrame(df_bi["bigram"].tolist(), index=df_bi.index)
df_bi["prob"] = df_bi.apply(lambda r: bi_probs.get((r["w1"], r["w2"]), 0.0), axis=1)
df_bi = df_bi[["w1","w2","count","prob"]]

In [7]:
# --- Show results ---
print("=== N-gram Stats ===")
print(stats)

print("\n=== Top Unigrams ===")
display(df_uni)

print("\n=== Top Bigrams ===")
display(df_bi)

=== N-gram Stats ===
           metric  value
0  #unigram_types   7258
1         #tokens  90708
2   #bigram_types  40023
3  add_boundaries   True

=== Top Unigrams ===


Unnamed: 0,token,count,prob
0,.,4692,0.051726
1,the,4250,0.046854
2,",",2949,0.032511
3,and,2552,0.028134
4,a,2215,0.024419
5,to,2077,0.022898
6,was,1820,0.020064
7,I,1659,0.018289
8,in,1212,0.013362
9,of,1040,0.011465



=== Top Bigrams ===


Unnamed: 0,w1,w2,count,prob
0,.,The,858,0.182864
1,.,I,522,0.111253
2,in,the,388,0.320132
3,.,</s>,371,0.079071
4,of,the,342,0.328846
5,.,We,339,0.072251
6,",",and,322,0.10919
7,at,the,321,0.447699
8,the,hotel,287,0.067529
9,and,the,268,0.105016


In [8]:
# --- Save outputs ---
os.makedirs(OUT_DIR, exist_ok=True)
save_tsv_unigram(os.path.join(OUT_DIR, "unigram_mle.tsv"), uni_counts, uni_probs)
save_tsv_bigram (os.path.join(OUT_DIR, "bigram_mle.tsv"),  bi_counts,  bi_probs)
save_json(os.path.join(OUT_DIR, "unigram_counts.json"), {w: int(c) for w, c in uni_counts.items()})
save_json(os.path.join(OUT_DIR, "bigram_counts.json"),  {" || ".join([w1, w2]): int(c) for (w1, w2), c in bi_counts.items()})
save_json(os.path.join(OUT_DIR, "unigram_probs.json"),  uni_probs)
save_json(os.path.join(OUT_DIR, "bigram_probs.json"),   {" || ".join([w1, w2]): p for (w1, w2), p in bi_probs.items()})

print("Saved files:")
print(" - Unigram counts (JSON):", os.path.join(OUT_DIR, "unigram_counts.json"))
print(" - Bigram counts   (JSON):", os.path.join(OUT_DIR, "bigram_counts.json"))
print(" - Unigram probs   (JSON):", os.path.join(OUT_DIR, "unigram_probs.json"))
print(" - Bigram probs    (JSON):", os.path.join(OUT_DIR, "bigram_probs.json"))
print(" - Unigram MLE (TSV):     ", os.path.join(OUT_DIR, "unigram_mle.tsv"))
print(" - Bigram MLE  (TSV):     ", os.path.join(OUT_DIR, "bigram_mle.tsv"))

Saved files:
 - Unigram counts (JSON): /outputs\unigram_counts.json
 - Bigram counts   (JSON): /outputs\bigram_counts.json
 - Unigram probs   (JSON): /outputs\unigram_probs.json
 - Bigram probs    (JSON): /outputs\bigram_probs.json
 - Unigram MLE (TSV):      /outputs\unigram_mle.tsv
 - Bigram MLE  (TSV):      /outputs\bigram_mle.tsv


In [9]:
# NLP A1 — Section 4: Unknown-word handling + Smoothing (Jupyter-ready)
# -------------------------------------------------------------------
# This cell extends the previous unsmoothed n-gram code with:
# 1) Unknown-word handling via <unk> using a frequency threshold.
# 2) Two smoothing methods for probabilities:
#    (A) Add-k / Laplace smoothing (k configurable; k=1 is Laplace).
#    (B) Jelinek–Mercer (JM) interpolation for bigrams: 
#        P_JM(w2|w1) = λ * P_MLE_bigram(w2|w1) + (1-λ) * P_unigram(w2)
#
# Design:
# - Build vocabulary from training counts with a min_freq threshold.
# - Map all tokens not in vocab to <unk>, then re-count unigrams/bigrams.
# - Provide probability functions for each smoothing strategy.
#
# This prepares everything you need for Section 5 (perplexity on validation).

In [10]:
# -----------------------
# Utilities (from before)
# -----------------------
# def read_corpus(path: str, add_boundaries: bool = True) -> List[List[str]]:
#     sents: List[List[str]] = []
#     with open(path, "r", encoding="utf-8") as f:
#         for line in f:
#             line = line.strip()
#             if not line:
#                 continue
#             toks = line.split()
#             if add_boundaries:
#                 toks = [BOS] + toks + [EOS]
#             sents.append(toks)
#     return sents

def read_corpus(path: str, add_boundaries: bool = False) -> List[List[str]]:
    """
    Reads a tokenized corpus; each line is a review with space-delimited tokens.
    If add_boundaries=True, wraps each line with BOS/EOS.
    Returns: List[List[str]] where each inner list is the token sequence for one line.
    """
    sents: List[List[str]] = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            toks = line.split()
            toks = trans_lower(toks)
            if add_boundaries:
                toks = [BOS] + toks + [EOS]
            sents.append(toks)
    return sents
    
def count_unigrams_bigrams(sents: Iterable[List[str]]) -> Tuple[Counter, Counter]:
    uni = Counter()
    bi = Counter()
    for toks in sents:
        uni.update(toks)
        for i in range(1, len(toks)):
            bi[(toks[i-1], toks[i])] += 1
    return uni, bi

# -----------------------
# Unknown-word handling
# -----------------------
def build_vocab_from_counts(uni_counts: Counter, min_freq: int = 1,
                            include_special: bool = True) -> Set[str]:
    """
    Returns a vocabulary set using a frequency threshold.
    All tokens with count < min_freq will be mapped to <unk> later.
    """
    vocab = {w for w, c in uni_counts.items() if c >= min_freq}
    if include_special:
        vocab |= {BOS, EOS, UNK}
    return vocab

def map_tokens_to_vocab(sents: Iterable[List[str]], vocab: Set[str]) -> List[List[str]]:
    """
    Replace OOV tokens with <unk> given a fixed vocabulary.
    """
    mapped: List[List[str]] = []
    for toks in sents:
        mapped.append([t if t in vocab else UNK for t in toks])
    return mapped

# -----------------------
# Smoothing A: Add-k
# -----------------------
def add_k_unigram_prob(w: str, uni_counts: Counter, V: int, k: float) -> float:
    N = sum(uni_counts.values())
    return (uni_counts[w] + k) / (N + k * V)

def add_k_bigram_prob(w1: str, w2: str, uni_counts: Counter, bi_counts: Counter,
                      V: int, k: float) -> float:
    # denominator: count(w1) smoothed with k*V
    return (bi_counts[(w1, w2)] + k) / (uni_counts[w1] + k * V)

# -----------------------
# Smoothing B: Jelinek–Mercer
# -----------------------
def mle_unigram_prob(w: str, uni_counts: Counter) -> float:
    N = sum(uni_counts.values())
    return uni_counts[w] / N if N > 0 else 0.0

def mle_bigram_prob(w1: str, w2: str, uni_counts: Counter, bi_counts: Counter) -> float:
    denom = uni_counts[w1]
    return (bi_counts[(w1, w2)] / denom) if denom > 0 else 0.0

def jm_bigram_prob(w1: str, w2: str, uni_counts: Counter, bi_counts: Counter,
                   lamb: float, # 0 <= lamb <= 1
                   backoff_unigram: str = "addk",
                   addk_k: float = 1.0,
                   V: int = None) -> float:
    """
    Jelinek–Mercer interpolation:
      P_JM(w2|w1) = λ * P_MLE_bigram(w2|w1) + (1-λ) * P_unigram(w2)
    For the unigram component, you can use MLE or add-k; default uses add-k for robustness.
    """
    p_bg = mle_bigram_prob(w1, w2, uni_counts, bi_counts)
    if backoff_unigram == "mle":
        p_uni = mle_unigram_prob(w2, uni_counts)
    else:
        assert V is not None, "V (vocab size) required for add-k unigram."
        p_uni = add_k_unigram_prob(w2, uni_counts, V=V, k=addk_k)
    return lamb * p_bg + (1.0 - lamb) * p_uni

In [11]:
# -----------------------
# Train with UNK mapping
# -----------------------
TRAIN_PATH = "train.txt"
ADD_BOUNDARIES = True
MIN_FREQ = 2   # tokens with count < MIN_FREQ become <unk>

# 1) Raw counts
raw_sents = read_corpus(TRAIN_PATH, add_boundaries=ADD_BOUNDARIES)
raw_uni, raw_bi = count_unigrams_bigrams(raw_sents)

# 2) Build vocab & map to <unk>
vocab = build_vocab_from_counts(raw_uni, min_freq=MIN_FREQ, include_special=True)
mapped_sents = map_tokens_to_vocab(raw_sents, vocab)

# 3) Re-count after mapping (this is the model's actual counts)
uni_counts, bi_counts = count_unigrams_bigrams(mapped_sents)
V = len(vocab)  # vocabulary size INCLUDING <unk>, <s>, </s>

# Preview vocabulary stats
stats = pd.DataFrame({
    "metric": ["min_freq", "V (vocab size)", "#tokens (N)", "#bigram types"],
    "value":  [MIN_FREQ, V, sum(uni_counts.values()), len(bi_counts)]
})
display(stats)

Unnamed: 0,metric,value
0,min_freq,2
1,V (vocab size),3412
2,#tokens (N),90708
3,#bigram types,34160


In [12]:
# -----------------------
# Example: compute probs with each smoother
# -----------------------
TOPK = 20
df_uni = pd.DataFrame(uni_counts.most_common(TOPK), columns=["token", "count"])
# Add-k unigram (k=1 Laplace; try also k=0.1)
df_uni["P_addk_k1"]  = df_uni["token"].apply(lambda w: add_k_unigram_prob(w, uni_counts, V=V, k=1.0))
df_uni["P_addk_k01"] = df_uni["token"].apply(lambda w: add_k_unigram_prob(w, uni_counts, V=V, k=0.1))
display(df_uni)

# For bigrams, show top by count and compute smoothed probs
df_bi = pd.DataFrame(bi_counts.most_common(TOPK), columns=["bigram", "count"])
df_bi[["w1","w2"]] = pd.DataFrame(df_bi["bigram"].tolist(), index=df_bi.index)

# Add-k bigram (k=1 and k=0.1)
df_bi["P_addk_k1"]  = df_bi.apply(lambda r: add_k_bigram_prob(r["w1"], r["w2"], uni_counts, bi_counts, V=V, k=1.0), axis=1)
df_bi["P_addk_k01"] = df_bi.apply(lambda r: add_k_bigram_prob(r["w1"], r["w2"], uni_counts, bi_counts, V=V, k=0.1), axis=1)

# Jelinek–Mercer with λ=0.7 (tune later)
LAMB = 0.7
df_bi["P_JM_l0.7"] = df_bi.apply(lambda r: jm_bigram_prob(r["w1"], r["w2"], uni_counts, bi_counts,
                                                          lamb=LAMB, backoff_unigram="addk", addk_k=0.1, V=V), axis=1)
display(df_bi[["w1","w2","count","P_addk_k1","P_addk_k01","P_JM_l0.7"]])

Unnamed: 0,token,count,P_addk_k1,P_addk_k01
0,.,4692,0.049862,0.051534
1,the,4250,0.045166,0.046679
2,<unk>,3847,0.040884,0.042253
3,",",2949,0.031343,0.03239
4,and,2552,0.027125,0.02803
5,a,2215,0.023544,0.024329
6,to,2077,0.022078,0.022813
7,was,1820,0.019348,0.01999
8,I,1659,0.017637,0.018222
9,in,1212,0.012888,0.013313


Unnamed: 0,w1,w2,count,P_addk_k1,P_addk_k01,P_JM_l0.7
0,.,The,858,0.105997,0.170488,0.131416
1,.,I,522,0.064536,0.103731,0.083344
2,<unk>,.,418,0.057721,0.099828,0.091519
3,in,the,388,0.084126,0.249871,0.238096
4,.,</s>,371,0.045903,0.07373,0.057037
5,of,the,342,0.077044,0.247683,0.244196
6,.,We,339,0.041955,0.067373,0.052095
7,",",and,322,0.050778,0.097897,0.084842
8,at,the,321,0.077985,0.30344,0.327393
9,the,hotel,287,0.037588,0.062533,0.050312


In [13]:
# -----------------------
# Save artifacts for report/reuse
# -----------------------
OUT_DIR = "/outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# Save vocabulary
with open(os.path.join(OUT_DIR, "vocab.txt"), "w", encoding="utf-8") as f:
    for w in sorted(vocab):
        f.write(w + "\n")

# Save counts
with open(os.path.join(OUT_DIR, "unigram_counts.json"), "w", encoding="utf-8") as f:
    json.dump({w: int(c) for w, c in uni_counts.items()}, f, ensure_ascii=False)
with open(os.path.join(OUT_DIR, "bigram_counts.json"), "w", encoding="utf-8") as f:
    json.dump({" || ".join([w1, w2]): int(c) for (w1, w2), c in bi_counts.items()}, f, ensure_ascii=False)

print("Artifacts saved in:", OUT_DIR)

Artifacts saved in: /outputs


In [14]:
# NLP A1 — Section 5: Perplexity on Validation Set (Jupyter-ready)
# -----------------------------------------------------------------
# This cell computes perplexity on /mnt/data/val.txt for multiple models:
#   - Unigram Add-k (k in {1.0, 0.1})
#   - Bigram Add-k (k in {1.0, 0.1})
#   - Bigram Jelinek–Mercer (λ in {0.7, 0.5})
#
# It REBUILDS the training vocab and counts (with UNK mapping) to be self-contained.
# Validation tokens are mapped to <unk> using the TRAIN vocabulary.
#
# PP = exp( (1/N) * sum_i [ -log P(w_i | history) ] )
# We use natural log; PP uses exp of mean negative log-probability.
#
# You can tweak: MIN_FREQ, ADD_BOUNDARIES, K values, and LAMBDA values below.

In [15]:
from collections import Counter
from typing import List, Tuple, Dict, Iterable, Set
import os, json, math
import pandas as pd
from IPython.display import display

In [16]:
# -----------------------
# Constants / Settings
# -----------------------
BOS = "<s>"
EOS = "</s>"
UNK = "<unk>"

TRAIN_PATH = "train.txt"
VAL_PATH   = "val.txt"
ADD_BOUNDARIES = True
MIN_FREQ = 2

ADDK_GRID_UNI = [1.0, 0.1]
ADDK_GRID_BI  = [1.0, 0.1]
JM_LAMBDAS    = [0.7, 0.5]
JM_UNIGRAM_BACKOFF = "addk"   # "addk" or "mle"
JM_ADDK_K = 0.1               # used if backoff == "addk"

In [17]:
# -----------------------
# Helpers
# -----------------------
def read_corpus(path: str, add_boundaries: bool = True) -> List[List[str]]:
    sents: List[List[str]] = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            toks = line.split()
            if add_boundaries:
                toks = [BOS] + toks + [EOS]
            sents.append(toks)
    return sents

# def read_corpus(path: str, add_boundaries: bool = False) -> List[List[str]]:
#     """
#     Reads a tokenized corpus; each line is a review with space-delimited tokens.
#     If add_boundaries=True, wraps each line with BOS/EOS.
#     Returns: List[List[str]] where each inner list is the token sequence for one line.
#     """
#     sents: List[List[str]] = []
#     with open(path, "r", encoding="utf-8") as f:
#         for line in f:
#             line = line.strip()
#             if not line:
#                 continue
#             toks = line.split()
#             toks = trans_lower(toks)
#             if add_boundaries:
#                 toks = [BOS] + toks + [EOS]
#             sents.append(toks)
#     return sents

def count_unigrams_bigrams(sents: Iterable[List[str]]) -> Tuple[Counter, Counter]:
    uni = Counter()
    bi = Counter()
    for toks in sents:
        uni.update(toks)
        for i in range(1, len(toks)):
            bi[(toks[i-1], toks[i])] += 1
    return uni, bi

def build_vocab_from_counts(uni_counts: Counter, min_freq: int = 1,
                            include_special: bool = True) -> Set[str]:
    vocab = {w for w, c in uni_counts.items() if c >= min_freq}
    if include_special:
        vocab |= {BOS, EOS, UNK}
    return vocab

def map_tokens_to_vocab(sents: Iterable[List[str]], vocab: Set[str]) -> List[List[str]]:
    return [[t if t in vocab else UNK for t in toks] for toks in sents]

# Prob functions
def add_k_unigram_prob(w: str, uni_counts: Counter, V: int, k: float) -> float:
    N = sum(uni_counts.values())
    return (uni_counts[w] + k) / (N + k * V)

def add_k_bigram_prob(w1: str, w2: str, uni_counts: Counter, bi_counts: Counter,
                      V: int, k: float) -> float:
    return (bi_counts[(w1, w2)] + k) / (uni_counts[w1] + k * V)

def mle_unigram_prob(w: str, uni_counts: Counter) -> float:
    N = sum(uni_counts.values())
    return uni_counts[w] / N if N > 0 else 0.0

def mle_bigram_prob(w1: str, w2: str, uni_counts: Counter, bi_counts: Counter) -> float:
    denom = uni_counts[w1]
    return (bi_counts[(w1, w2)] / denom) if denom > 0 else 0.0

def jm_bigram_prob(w1: str, w2: str, uni_counts: Counter, bi_counts: Counter,
                   lamb: float, backoff_unigram: str = "addk",
                   addk_k: float = 0.1, V: int = None) -> float:
    p_bg = mle_bigram_prob(w1, w2, uni_counts, bi_counts)
    if backoff_unigram == "mle":
        p_uni = mle_unigram_prob(w2, uni_counts)
    else:
        assert V is not None
        p_uni = add_k_unigram_prob(w2, uni_counts, V=V, k=addk_k)
    return lamb * p_bg + (1.0 - lamb) * p_uni

# Perplexity
def corpus_perplexity_unigram(sents: Iterable[List[str]], prob_fn) -> float:
    logprob = 0.0
    N = 0
    for toks in sents:
        for w in toks:
            p = prob_fn(w)
            if p <= 0.0:
                return float("inf")
            logprob += math.log(p)
            N += 1
    return math.exp(-logprob / max(N, 1))

def corpus_perplexity_bigram(sents: Iterable[List[str]], prob_fn) -> float:
    logprob = 0.0
    N = 0
    for toks in sents:
        for i in range(1, len(toks)):
            w1, w2 = toks[i-1], toks[i]
            p = prob_fn(w1, w2)
            if p <= 0.0:
                return float("inf")
            logprob += math.log(p)
            N += 1
    return math.exp(-logprob / max(N, 1))

In [18]:
# -----------------------
# Train vocab & counts
# -----------------------
train_raw = read_corpus(TRAIN_PATH, add_boundaries=ADD_BOUNDARIES)
train_uni_raw, _ = count_unigrams_bigrams(train_raw)
vocab = build_vocab_from_counts(train_uni_raw, min_freq=MIN_FREQ, include_special=True)

# Map both train and validation to vocab
train = map_tokens_to_vocab(train_raw, vocab)
val_raw = read_corpus(VAL_PATH, add_boundaries=ADD_BOUNDARIES)
val = map_tokens_to_vocab(val_raw, vocab)

# Re-count on mapped train (model counts)
uni_counts, bi_counts = count_unigrams_bigrams(train)
V = len(vocab)

In [19]:
# -----------------------
# Evaluate multiple models
# -----------------------
records = []

# Unigram Add-k
for k in ADDK_GRID_UNI:
    def _uni_pk(w, k=k):
        return add_k_unigram_prob(w, uni_counts, V=V, k=k)
    pp = corpus_perplexity_unigram(val, _uni_pk)
    records.append({"model": "unigram-addk", "params": f"k={k}", "PP": pp})

# Bigram Add-k
for k in ADDK_GRID_BI:
    def _bi_pk(w1, w2, k=k):
        return add_k_bigram_prob(w1, w2, uni_counts, bi_counts, V=V, k=k)
    pp = corpus_perplexity_bigram(val, _bi_pk)
    records.append({"model": "bigram-addk", "params": f"k={k}", "PP": pp})

# Bigram JM
for lamb in JM_LAMBDAS:
    def _bi_pjm(w1, w2, lamb=lamb):
        return jm_bigram_prob(w1, w2, uni_counts, bi_counts, lamb=lamb,
                              backoff_unigram=JM_UNIGRAM_BACKOFF, addk_k=JM_ADDK_K, V=V)
    pp = corpus_perplexity_bigram(val, _bi_pjm)
    records.append({"model": "bigram-jm", "params": f"lambda={lamb}, backoff={JM_UNIGRAM_BACKOFF}, k={JM_ADDK_K}", "PP": pp})

results = pd.DataFrame(records).sort_values("PP")
display(results)

# Save
OUT_DIR = "/mnt/data/ngram_outputs_section5"
os.makedirs(OUT_DIR, exist_ok=True)
results.to_csv(os.path.join(OUT_DIR, "perplexity_results.csv"), index=False)
with open(os.path.join(OUT_DIR, "train_vocab.txt"), "w", encoding="utf-8") as f:
    for w in sorted(vocab):
        f.write(w + "\n")
with open(os.path.join(OUT_DIR, "config.json"), "w", encoding="utf-8") as f:
    json.dump({
        "ADD_BOUNDARIES": ADD_BOUNDARIES,
        "MIN_FREQ": MIN_FREQ,
        "ADDK_GRID_UNI": ADDK_GRID_UNI,
        "ADDK_GRID_BI": ADDK_GRID_BI,
        "JM_LAMBDAS": JM_LAMBDAS,
        "JM_UNIGRAM_BACKOFF": JM_UNIGRAM_BACKOFF,
        "JM_ADDK_K": JM_ADDK_K,
        "V": V
    }, f, ensure_ascii=False, indent=2)

print("Saved results to:", os.path.join(OUT_DIR, "perplexity_results.csv"))

Unnamed: 0,model,params,PP
4,bigram-jm,"lambda=0.7, backoff=addk, k=0.1",105.37636
5,bigram-jm,"lambda=0.5, backoff=addk, k=0.1",110.984653
3,bigram-addk,k=0.1,193.872483
1,unigram-addk,k=0.1,313.099999
0,unigram-addk,k=1.0,315.863378
2,bigram-addk,k=1.0,467.468485


Saved results to: /mnt/data/ngram_outputs_section5\perplexity_results.csv
