### Imports

In [None]:
import json
from pathlib import Path
from collections import Counter
from tqdm import tqdm
import pandas as pd  

### Directories / Setup

In [5]:
# Size of the character n-grams produced by this notebook.
n = 3

# Input location: one cleaned text file per language.
cleaned_path = Path("../data/cleaned")

# Output locations: per-language n-gram counts and the combined vocabulary.
ngrams_path = Path("../data/ngrams")
vocab_path = Path("../data/vocab")

# Create the output directories (and any missing parents); re-running is a no-op.
for _out_dir in (ngrams_path, vocab_path):
    _out_dir.mkdir(parents=True, exist_ok=True)

### Generate N-Grams and Master Vocab

In [9]:
def generate_char_ngrams(text: str, n: int = 3) -> list:
    """
    Generate overlapping character n-grams from text.

    Spaces are replaced with underscores before slicing so that word
    boundaries stay visible inside the n-grams.

    Example: "hello" (n=3) -> ['hel', 'ell', 'llo']
    Example: "a b"   (n=2) -> ['a_', '_b']

    Args:
        text: Input string.
        n: Length of each n-gram; must be at least 1.

    Returns:
        List of n-grams in order of appearance (duplicates kept). Empty when
        the text is shorter than n.

    Raises:
        ValueError: If n is smaller than 1.
    """
    # n == 0 would yield len(text) + 1 empty strings and a negative n would
    # yield slices of the wrong length, so reject both up front.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    # replace spaces with underscore to preserve word boundaries
    text = text.replace(" ", "_")
    return [text[i:i + n] for i in range(len(text) - n + 1)]

In [11]:
all_vocab = set()  # master vocabulary; a set so every n-gram is stored once
# Sort the inputs: glob() yields files in arbitrary order, which would make the
# column order of the frequency CSV differ from one machine/run to the next.
language_files = sorted(cleaned_path.glob("*.txt"))
language_ngram_freqs = {}  # language name -> Counter of n-gram frequencies

# iterate through the cleaned text files (one per language)
for file in tqdm(language_files, desc="Generating character ngrams"):
    # language name = file stem with the "_cleaned" marker removed
    lang = file.stem.replace("_cleaned", "")

    # read the cleaned text into the variable `text`
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()

    # generate n-grams and count them using the helper function
    ngrams = generate_char_ngrams(text, n)
    freq = Counter(ngrams)  # ngram -> frequency

    # save the n-gram frequencies as JSON, one file per language, in ngrams_path
    out_path = ngrams_path / f"{lang}_char{n}_ngrams.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(freq, f, ensure_ascii=False, indent=2)

    # add this language's n-grams to the master vocabulary (deduplicated)
    all_vocab.update(freq.keys())

    # keep the counts for the frequency table ("english": {ngram: freq, ...})
    language_ngram_freqs[lang] = freq

# Fix one ordering of the vocabulary and use it for both outputs. Iterating the
# set directly gives an order that changes between kernel runs (string hash
# randomization), so the CSV rows would not be reproducible and would not line
# up with the sorted vocabulary text file.
sorted_vocab = sorted(all_vocab)

# Frequency table: master vocab as rows, languages as columns. A Counter
# returns 0 for n-grams it never saw, so absent entries become 0 directly.
# Building the frame in one call avoids growing it column by column.
df = pd.DataFrame(
    {
        lang: [freq[token] for token in sorted_vocab]
        for lang, freq in language_ngram_freqs.items()
    },
    index=sorted_vocab,
)

df.to_csv(vocab_path / f"master_vocab_char{n}_ngrams.csv", encoding="utf-8")

# NOTE(review): the vocabulary file is newline-delimited; an n-gram that itself
# contains a newline would span two lines here. Confirm the cleaned text has no
# line breaks, or normalize them before generating n-grams.
vocab_file = vocab_path / f"master_char{n}_vocab.txt"
with open(vocab_file, "w", encoding="utf-8") as f:
    for token in sorted_vocab:
        f.write(token + "\n")

print(f"\nFinished generating n-grams for {len(language_files)} languages.")
print(f"Master vocabulary size: {len(all_vocab):,} unique {n}-grams.")
print("Saved master vocabulary and frequencies to CSV.")

Generating character ngrams: 100%|██████████| 16/16 [00:01<00:00,  8.92it/s]




Finished generating n-grams for 16 languages.
Master vocabulary size: 6,378 unique 3-grams.
Saved master vocabulary and frequencies to CSV.
