1. Clean data


In [60]:
import re

def clean_wikipedia_text(text):
    # Remove citations like [1], [2][3], [citation needed]
    text = re.sub(r'\[[^\]]+\]', '', text)
    
    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)
    
    # Remove parenthetical information
    text = re.sub(r'\([^)]*\)', '', text)
    
    # Remove special formatting (e.g., ''italic'' or '''bold''')
    text = re.sub(r"''+|\<.*?\>", '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove lines starting with special characters (-, *, digits)
    text = '\n'.join(line for line in text.split('\n') if not line.strip().startswith(('-', '*', '1.')))
    
    return text

# Load text and clean
with open('data/wiki_dataset.txt', 'r', encoding='utf-8') as file:
    raw_text = file.read()

cleaned_text = clean_wikipedia_text(raw_text)

# Save cleaned text
with open('data/cleaned_wiki_text.txt', 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

2. Spit data into 3 subset for training, testing and validation 

In [61]:
import random
import re
from nltk.tokenize import word_tokenize, sent_tokenize

# Load and preprocess corpus
with open('data/cleaned_wiki_text.txt', 'r') as file:
    # Convert to lowercase
    corpus = file.read().lower()

# Clean up text: remove extra newlines and leading/trailing spaces
corpus = re.sub(r'\n+', ' ', corpus).strip()  # Replace multiple newlines with a space
corpus = re.sub(r'\s+', ' ', corpus) 
 # Replace multiple spaces with a single space

# Split into sentences
# sentences = corpus.split('.')
sentences = sent_tokenize(corpus)

# Remove empty sentences
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

# Shuffle sentences
random.shuffle(sentences)

# Split into training, validation, and testing sets
train_split = int(0.7 * len(sentences))
val_split = int(0.8 * len(sentences))

train_set = sentences[:train_split]
val_set = sentences[train_split:val_split]
test_set = sentences[val_split:]

# Verify results
print(f"Total sentences: {len(sentences)}")
print(f"Training set: {len(train_set)} sentences")
print(f"Validation set: {len(val_set)} sentences")
print(f"Testing set: {len(test_set)} sentences")

# Optional: Save the sets to files for further use
with open('data/wiki_article/train_set.txt', 'w') as file:
    file.write('. '.join(train_set) + '.')

with open('data/wiki_article/val_set.txt', 'w') as file:
    file.write('. '.join(val_set) + '.')

with open('data/wiki_article/test_set.txt', 'w') as file:
    file.write('. '.join(test_set) + '.')


Total sentences: 507
Training set: 354 sentences
Validation set: 51 sentences
Testing set: 102 sentences


In [62]:
import re
from collections import Counter

# Tokenize
tokens = word_tokenize(' '.join(train_set))
vocab_size = 20000 # Limit vocabulary size
token_counts = Counter(tokens)
vocab = {word for word, _ in token_counts.most_common(vocab_size)}

# Clean up unwanted characters
def clean_tokens(tokens):
    cleaned = []
    for sentence in tokens:
        cleaned_sentence = [re.sub(r'[^\w.]', '', word) for word in sentence]  # Keep only words and periods
        cleaned_sentence = [word for word in cleaned_sentence if word]  # Remove empty tokens
        cleaned.append(cleaned_sentence)
    return cleaned

# Replace words not in vocab with <UNK> and clean
def replace_with_unk(data):
    return [
        [word if word in vocab else '<UNK>' for word in word_tokenize(sentence)]
        for sentence in data
    ]

train_tokens = replace_with_unk(train_set)
val_tokens = replace_with_unk(val_set)
test_tokens = replace_with_unk(test_set)

# Clean the tokenized data
train_tokens = clean_tokens(train_tokens)
val_tokens = clean_tokens(val_tokens)
test_tokens = clean_tokens(test_tokens)

# Check results
print(train_tokens[:10])  # Show first 10 sentences from train_tokens
print(val_tokens[:10])    # Show first 10 sentences from val_tokens
print(test_tokens[:10])   # Show first 10 sentences from test_tokens


[['angkor', 'wat', 'is', 'the', 'bestpreserved', 'example', 'of', 'khmer', 'architecture', 'from', 'the', 'angkorian', 'era', 'along', 'with', 'hundreds', 'of', 'other', 'temples', 'that', 'have', 'been', 'discovered', 'in', 'and', 'around', 'the', 'region', '.'], ['the', 'attack', 'against', 'the', 'president', 'of', 'the', 'overseas', 'press', 'club', 'of', 'cambodia', 'rick', 'valenzuela', 'was', 'captured', 'on', 'video', '.'], ['a', 'map', 'of', 'indochina', 'in', '1760', 'the', 'hill', 'tribe', 'people', 'were', 'hunted', 'incessantly', 'and', 'carried', 'off', 'as', 'slaves', 'by', 'the', 'siamese', 'the', 'annamites', 'and', 'the', 'cambodians', '.'], ['cambodia', 'is', 'the', '70th', 'most', 'peaceful', 'country', 'in', 'the', 'world', 'according', 'to', 'the', '2024', 'global', 'peace', 'index', '.'], ['textiles', 'the', 'garment', 'industry', 'represents', 'the', 'largest', 'portion', 'of', 'cambodia', 's', 'manufacturing', 'sector', 'accounting', 'for', '80', 'of', 'the', '

In [63]:
import pandas as pd 
from collections import Counter 
from nltk.tokenize import word_tokenize
from itertools import chain
# for i in train_tokens:
#     train_corpus = train_tokens[i]+1
# print(train_corpus)
flattened_tokens = list(chain.from_iterable(train_tokens))
word_counts = Counter(flattened_tokens)
tokens = word_tokenize(corpus.lower()) 
df = pd.DataFrame(word_counts.items(),columns=['word','Frequency'])
df = df.sort_values(by='Frequency', ascending=False).reset_index(drop=True)

# Display the DataFrame
print(df)
for word, count in word_counts.items():
    print(f'{word}: {count}')

          word  Frequency
0          the        644
1            .        354
2           of        326
3          and        302
4           in        258
...        ...        ...
2768  allowing          1
2769    normal          1
2770      sink          1
2771       end          1
2772  malaysia          1

[2773 rows x 2 columns]
angkor: 6
wat: 3
is: 98
the: 644
bestpreserved: 1
example: 1
of: 326
khmer: 50
architecture: 1
from: 61
angkorian: 3
era: 4
along: 5
with: 51
hundreds: 2
other: 19
temples: 3
that: 38
have: 20
been: 21
discovered: 3
in: 258
and: 302
around: 10
region: 7
.: 354
attack: 1
against: 3
president: 4
overseas: 1
press: 3
club: 1
cambodia: 165
rick: 1
valenzuela: 1
was: 62
captured: 1
on: 44
video: 1
a: 149
map: 4
indochina: 4
1760: 1
hill: 3
tribe: 1
people: 24
were: 24
hunted: 1
incessantly: 1
carried: 3
off: 3
as: 60
slaves: 1
by: 73
siamese: 2
annamites: 1
cambodians: 10
70th: 1
most: 22
peaceful: 1
country: 39
world: 15
according: 5
to: 175
2024: 2
global: 2

In [64]:
from collections import defaultdict

def build_ngram_model(data, n):
    model = defaultdict(lambda: defaultdict(int))
    for sentence in data:
        for i in range(len(sentence) - n + 1):
            ngram = tuple(sentence[i:i+n])
            prefix, token = ngram[:-1], ngram[-1]
            model[prefix][token] += 1
    return model
def safe_backoff_prob(model, ngram):
    try:
        prefix, token = ngram[:-1], ngram[-1]
        if prefix in model and sum(model[prefix].values()) > 0:
            if token in model[prefix]:
                return model[prefix][token] / sum(model[prefix].values())
        return 1e-10  # Fallback probability
    except Exception as e:
        print(f"Error in backoff_prob: {e}")
        return 1e-10

lm1 = build_ngram_model(train_tokens, 4)


In [65]:
# Build the LM2 interpolated method 
def safe_interpolated_prob(model, ngram, lambdas, k):
    try:
        n = len(ngram)
        prob = 0
        for i in range(1, n+1):
            prefix, token = ngram[:i-1], ngram[i-1]
            count = model[prefix][token] + k
            total_count = sum(model[prefix].values()) + k * vocab_size
            prob += lambdas[i-1] * (count / total_count)
        return max(prob, 1e-10)
    except Exception as e:
        print(f"Error in interpolated_prob: {e}")
        return 1e-10

lambdas = [0.1, 0.2, 0.3, 0.4]  # Example weights
k = 1  # Example smoothing parameter

lm2  = interpolated_prob(train_tokens, 4, lambdas, k)


TypeError: object of type 'int' has no len()

In [None]:

import math
import numpy as np

def perplexity(model, tokens, prob_func):
    log_prob_sum = 0
    total_tokens = 0
    
    for sentence in tokens:
        # Skip very short sentences
        if len(sentence) < 4:
            continue
        
        for i in range(3, len(sentence)):
            # Create n-gram
            ngram = tuple(sentence[i-3:i+1])
            
            try:
                # Calculate probability
                prob = prob_func(model, ngram)
                
                # Handle zero or negative probabilities
                if prob <= 0:
                    prob = 1e-10  # Small positive value
                
                # Sum log probabilities
                log_prob = math.log2(prob)
                log_prob_sum += log_prob
                total_tokens += 1
            
            except Exception as e:
                print(f"Error processing n-gram {ngram}: {e}")
                continue
    
    # Avoid division by zero
    if total_tokens == 0:
        return float('inf')
    
    # Calculate perplexity
    perplexity_score = 2 ** (-log_prob_sum / total_tokens)
    
    return perplexity_score

pp_lm1 = perplexity(lm1, test_tokens, backoff_prob)
pp_lm2 = perplexity(lm1, test_tokens, lambda m, n: interpolated_prob(m, n, lambdas, k))

print(f"pp_lm1: {pp_lm1}")
print(f"pp_lm2: {pp_lm2}")

pp_lm1: 6375633958.296782
pp_lm2: 19802.941892147817


In [None]:
import random
import math

def generate_text(model, start, length, prob_func):
    text = list(start)
    attempts = 0
    max_attempts = 100  # Prevent infinite loops

    while len(text) < length and attempts < max_attempts:
        attempts += 1
        
        prefix = tuple(text[max(0, len(text)-3):])
        
        if prefix in model and model[prefix]:
            candidates = list(model[prefix].items())
            total = sum(count for _, count in candidates)
            
            r = random.uniform(0, total)
            cumulative = 0
            for word, count in candidates:
                cumulative += count
                if r <= cumulative:
                    text.append(word)
                    break
        else:
            next_word = random.choice(list(chain.from_iterable(train_tokens)))
            text.append(next_word)

        if attempts > length * 2:
            break

    return ' '.join(text)

# Usage
start_tokens = list(random.choice(train_tokens)[:3])  # Use train tokens for start
backoff_text = generate_text(lm1, start_tokens, 20, backoff_prob)
interpolation_text = generate_text(lm, start_tokens, 20, backoff_prob)
print(f"Backoff Model: {backoff_text}")
print(f"Interpolation Model: {interpolation_text}")


besides the main interprovincial traffic artery connecting phnom penh with koh kong and hence there is now uninterrupted road access
