# **Week 1 Practical Exercise: Clean and Tokenize**

In this notebook, you will clean a mini-corpus of 1 000 Amazon reviews, apply three different tokenisers, collect simple coverage metrics, and decide which approach best suits this dataset.

**SECTION 0 — Imports & Dataset** -------------------------------------------------
We pull 1 000 Amazon reviews from the Hugging Face Hub.  Feel free to
replace them with a local CSV by editing the `load_dataset` call.

In [None]:
!pip -q install --upgrade datasets unidecode sentencepiece spacy numpy pandas

from datasets import load_dataset
from unidecode import unidecode
import re, html, unicodedata, random, json
from pathlib import Path
from collections import Counter

# Download the first N_REVIEWS examples of the training split.
# NOTE(review): a head slice is presumably, but not provably, balanced pos/neg —
# check the label counts if class balance matters for your analysis.
N_REVIEWS = 1000
raw_ds = load_dataset("amazon_polarity", split=f"train[:{N_REVIEWS}]")
print("Loaded", len(raw_ds), "Amazon reviews")

**SECTION 1 — Cleaning helpers**  (✅ already implemented)

In [None]:
# Matches http(s):// links and bare www. links up to the next whitespace.
URL_RE   = re.compile(r"https?://\S+|www\.\S+")
# Treats every code point above the Basic Multilingual Plane as "emoji".
EMOJI_RE = re.compile(r"[\U00010000-\U0010ffff]", flags=re.UNICODE)


def _transliterate_keeping_emoji(text: str) -> str:
    """ASCII-fold *text* with ``unidecode`` while copying ``EMOJI_RE`` matches through verbatim."""
    folded, cursor = [], 0
    for hit in EMOJI_RE.finditer(text):
        folded.append(unidecode(text[cursor:hit.start()]))
        folded.append(hit.group())
        cursor = hit.end()
    folded.append(unidecode(text[cursor:]))
    return "".join(folded)


def clean_text(text: str, *, keep_emoji: bool = True) -> str:
    """Normalise a raw review string before tokenisation.

    Steps, in order: unescape HTML entities → NFC-normalise → remove URLs →
    drop or preserve emoji → ASCII-fold with ``unidecode`` (strips diacritics)
    → lowercase and strip surrounding whitespace.

    Args:
        text: Raw review text.
        keep_emoji: When True (default), characters matched by ``EMOJI_RE``
            are passed through unchanged; when False they are removed.

    Returns:
        The cleaned, lower-cased string.
    """
    text = html.unescape(text)
    text = unicodedata.normalize("NFC", text)
    text = URL_RE.sub("", text)
    if keep_emoji:
        # unidecode replaces characters it has no ASCII transliteration for
        # (emoji, in practice) with "", so running it over the whole string
        # would make keep_emoji=True a no-op. Fold only the text between emoji.
        text = _transliterate_keeping_emoji(text)
    else:
        text = unidecode(EMOJI_RE.sub("", text))
    return text.lower().strip()

# Preview: first 120 characters of one review, before and after cleaning.
sample_review = raw_ds[0]["content"]
print(sample_review[:120])
print(clean_text(sample_review)[:120])

**SECTION 2 — Tokenizers**

Implement three tokenizers:
   1.  Whitespace + spaCy rules (English)
   2.  WordPiece (BERT base uncased)
   3.  SentencePiece Unigram (train vocab = 8 000)
 Each should expose a `.encode(text)` method returning one entry per token: integer ids for WordPiece and SentencePiece, token strings for the spaCy wrapper.

In [None]:
import spacy, torch
from transformers import BertTokenizerFast
import sentencepiece as spm

# 2.1  spaCy rule-based tokenizer ----------------------------------------------
print("Loading spaCy…")
_nlp = spacy.blank("en")  # blank "en" pipeline; only its tokenizer is used below


class SpacyTokenizer:
    """Adapter giving the spaCy tokenizer the same ``.encode`` entry point as the others."""

    def encode(self, text):
        """Return the surface string of every token spaCy finds in *text*.

        Unlike the other tokenizers this yields ``str`` items, not integer ids.
        """
        pieces = []
        for token in _nlp.tokenizer(text):
            pieces.append(token.text)
        return pieces


spacy_tok = SpacyTokenizer()

# 2.2  WordPiece from BERT -----------------------------------------------------
bert_tok = BertTokenizerFast.from_pretrained("bert-base-uncased")

# 2.3  SentencePiece Unigram  (train on the *cleaned* corpus) ------------------
print("Training SentencePiece (8k)…")
SP_VOCAB_SIZE = 8000  # target size stated in the exercise

# One cleaned review per line is the input format handed to the trainer.
TXT = Path("reviews_clean.txt")
TXT.write_text("\n".join(clean_text(x["content"]) for x in raw_ds), encoding="utf-8")

spm.SentencePieceTrainer.Train(
    input=str(TXT),
    model_prefix="sp",
    vocab_size=SP_VOCAB_SIZE,
    # Treat vocab_size as a soft cap: a small corpus may not support 8 000
    # pieces, and a corpus-specific hard limit breaks whenever the data changes.
    hard_vocab_limit=False,
    model_type="unigram",
    character_coverage=1.0,
)
sp_tok = spm.SentencePieceProcessor(model_file="sp.model")

# Helper wrapper for uniform API
class SPUnigram:
    """Thin adapter around the module-level ``sp_tok`` SentencePiece processor.

    Exposes both ``encode`` and ``get_piece_size`` so the wrapper can be used
    anywhere the raw processor is (e.g. passed to ``metrics``).
    """

    def encode(self, text):
        """Return ``sp_tok.encode(text)`` (a list of piece ids by default)."""
        return sp_tok.encode(text)

    def get_piece_size(self):
        """Return the vocabulary size reported by ``sp_tok.get_piece_size()``."""
        return sp_tok.get_piece_size()

sp_unigram_tok = SPUnigram()

**SECTION 3 — Coverage Metrics** (## TODO)
For each tokeniser compute:
   * vocab_size             -> len(dictionary)
   * OOV rate               -> % tokens not in vocab (spaCy + WordPiece only)
   * avg sequence length    -> mean tokens per review
 Fill the results dict and print nicely.

In [None]:
from statistics import mean

def metrics(tok, name):
    """Compute coverage statistics for one tokeniser over the cleaned corpus.

    Every review in the module-level ``raw_ds`` is cleaned with ``clean_text``
    and encoded with ``tok``.

    Args:
        tok: Object exposing ``.encode(text)``. For ``"wordpiece"`` it must
            also accept ``add_special_tokens`` and provide ``vocab_size``
            (and ideally ``unk_token_id``); for ``"sentencepiece"`` it must
            provide ``get_piece_size()``.
        name: One of ``"spacy"``, ``"wordpiece"``, ``"sentencepiece"``.
            Any other value leaves ``vocab_size`` as ``None``.

    Returns:
        dict with keys ``tokeniser``, ``vocab_size``, ``oov_rate`` (a float for
        WordPiece, the string ``"N/A"`` otherwise) and ``avg_len`` (mean tokens
        per review, ``0.0`` for an empty corpus).
    """
    is_wordpiece = name == "wordpiece"
    is_spacy = name == "spacy"

    unk_id = None
    if is_wordpiece:
        # Ask the tokenizer for its [UNK] id; fall back to BERT's 100 if absent.
        unk_id = getattr(tok, "unk_token_id", None)
        if unk_id is None:
            unk_id = 100

    lengths, oov = [], 0
    spacy_types = set()  # distinct token strings seen; only filled for spaCy
    for ex in raw_ds:
        text = clean_text(ex["content"])
        if is_wordpiece:
            # Skip [CLS]/[SEP]: they are not review content and would inflate
            # avg_len (and the OOV denominator) relative to the other tokenisers.
            ids = tok.encode(text, add_special_tokens=False)
            oov += ids.count(unk_id)
        else:
            ids = tok.encode(text)
        lengths.append(len(ids))
        if is_spacy:
            spacy_types.update(ids)

    if is_spacy:
        # spaCy has no fixed vocabulary here; use the number of distinct tokens observed.
        vocab_size = len(spacy_types)
    elif is_wordpiece:
        vocab_size = tok.vocab_size
    elif name == "sentencepiece":
        vocab_size = tok.get_piece_size()
    else:
        vocab_size = None

    return {
        "tokeniser": name,
        "vocab_size": vocab_size,
        "oov_rate": oov / max(1, sum(lengths)) if is_wordpiece else "N/A",
        "avg_len": mean(lengths) if lengths else 0.0,
    }

# ## TODO: compute metrics for all three tokenisers ---------------------------
# (tokeniser, label) pairs in reporting order.
# TODO: adjust the spaCy vocab_size logic if desired.
tokenisers_to_score = (
    (spacy_tok, "spacy"),
    (bert_tok, "wordpiece"),
    (sp_tok, "sentencepiece"),  # raw processor: metrics() calls get_piece_size() on it
)
results = [metrics(tokeniser, label) for tokeniser, label in tokenisers_to_score]

print(json.dumps(results, indent=2))

**SECTION 4 — Qualitative check**

In [None]:
# A deliberately messy sentence: curly quotes, diacritics, digits, emoji.
# It is passed to each tokeniser as-is (not through clean_text).
TEST_SENT = "I’m LOVING ‘Café naïve’ in 2025!! 😊🔥"
print("\nTest sentence:", TEST_SENT)

# Display label → tokeniser, in the order they should be printed.
tokenisers_by_label = {
    "spaCy": spacy_tok,
    "WordPiece": bert_tok,
    "SentencePiece": sp_unigram_tok,
}
for name, tok in tokenisers_by_label.items():
    encoded = tok.encode(TEST_SENT)
    print(f"\n{name} tokens →", encoded)