1. Clean data


In [39]:
import re

def clean_wikipedia_text(text):
    # Remove citations like [1], [2][3], [citation needed]
    text = re.sub(r'\[[^\]]+\]', '', text)
    
    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)
    
    # Remove parenthetical information
    text = re.sub(r'\([^)]*\)', '', text)
    
    # Remove special formatting (e.g., ''italic'' or '''bold''')
    text = re.sub(r"''+|\<.*?\>", '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove lines starting with special characters (-, *, digits)
    text = '\n'.join(line for line in text.split('\n') if not line.strip().startswith(('-', '*', '1.')))
    
    return text

# Load text and clean
with open('data/wiki_dataset.txt', 'r', encoding='utf-8') as file:
    raw_text = file.read()

cleaned_text = clean_wikipedia_text(raw_text)

# Save cleaned text
with open('data/cleaned_wiki_text.txt', 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

2. Spit data into 3 subset for training, testing and validation 

In [40]:
import random
import re
from nltk.tokenize import word_tokenize, sent_tokenize

# Load and preprocess corpus
with open('data/cleaned_wiki_text.txt', 'r', encoding='utf-8') as file:
    # Convert to lowercase
    corpus = file.read().lower()

# Clean up text: remove extra newlines and leading/trailing spaces
corpus = re.sub(r'\n+', ' ', corpus).strip()  # Replace multiple newlines with a space
corpus = re.sub(r'\s+', ' ', corpus) 
 # Replace multiple spaces with a single space

# Split into sentences
# sentences = corpus.split('.')
sentences = sent_tokenize(corpus)

# Remove empty sentences
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

# Shuffle sentences
random.shuffle(sentences)

# Split into training, validation, and testing sets
train_split = int(0.7 * len(sentences))
val_split = int(0.8 * len(sentences))

train_set = sentences[:train_split]
val_set = sentences[train_split:val_split]
test_set = sentences[val_split:]

# Verify results
print(f"Total sentences: {len(sentences)}")
print(f"Training set: {len(train_set)} sentences")
print(f"Validation set: {len(val_set)} sentences")
print(f"Testing set: {len(test_set)} sentences")

# Optional: Save the sets to files for further use
with open('data/wiki_article/train_set.txt', 'w', encoding='utf-8') as file:
    file.write('. '.join(train_set) + '.')

with open('data/wiki_article/val_set.txt', 'w', encoding='utf-8') as file:
    file.write('. '.join(val_set) + '.')

with open('data/wiki_article/test_set.txt', 'w', encoding='utf-8') as file:
    file.write('. '.join(test_set) + '.')


Total sentences: 507
Training set: 354 sentences
Validation set: 51 sentences
Testing set: 102 sentences


In [41]:
import re
from collections import Counter

# Tokenize
tokens = word_tokenize(' '.join(train_set))
vocab_size = 20000 # Limit vocabulary size
token_counts = Counter(tokens)
vocab = {word for word, _ in token_counts.most_common(vocab_size)}

# Clean up unwanted characters
def clean_tokens(tokens):
    cleaned = []
    for sentence in tokens:
        cleaned_sentence = [re.sub(r'[^\w.]', '', word) for word in sentence]  # Keep only words and periods
        cleaned_sentence = [word for word in cleaned_sentence if word]  # Remove empty tokens
        cleaned.append(cleaned_sentence)
    return cleaned

# Replace words not in vocab with <UNK> and clean
def replace_with_unk(data):
    return [
        [word if word in vocab else '<UNK>' for word in word_tokenize(sentence)]
        for sentence in data
    ]

train_tokens = replace_with_unk(train_set)
val_tokens = replace_with_unk(val_set)
test_tokens = replace_with_unk(test_set)

# Clean the tokenized data
train_tokens = clean_tokens(train_tokens)
val_tokens = clean_tokens(val_tokens)
test_tokens = clean_tokens(test_tokens)

# Check results
print(train_tokens[:10])  # Show first 10 sentences from train_tokens
print(val_tokens[:10])    # Show first 10 sentences from val_tokens
print(test_tokens[:10])   # Show first 10 sentences from test_tokens


[['demonstrators', 'were', 'injured', 'and', 'killed', 'in', 'phnom', 'penh', 'where', 'a', 'reported', '20000', 'protesters', 'gathered', 'with', 'some', 'clashing', 'with', 'riot', 'police', '.'], ['in', '2012', 'the', 'exports', 'grew', 'to', '4.61', 'billion', 'up', '8', 'over', '2011.', 'in', 'the', 'first', 'half', 'of', '2013', 'the', 'garment', 'industry', 'reported', 'exports', 'worth', '1.56', 'billion', '.'], ['hun', 'sen', 'remains', 'the', 'de', 'facto', 'ruler', 'of', 'cambodia', 'through', 'his', 'continued', 'leadership', 'of', 'the', 'cambodian', 'people', 's', 'party', '.'], ['that', 'same', 'year', 'it', 'was', 'estimated', 'that', 'there', 'were', 'about', '100000', 'sex', 'workers', 'in', 'cambodia', '.'], ['in', '2018', 'they', 'handled', 'a', 'record', 'of', '10', 'million', 'passengers', '.'], ['they', 'are', 'taken', 'care', 'of', 'by', 'wrapping', 'in', 'cloth', 'to', 'protect', 'from', 'moisture', 'and', 'the', 'climate', '.'], ['the', 'australian', 'hip', 'h

In [42]:
import pandas as pd 
from collections import Counter 
from nltk.tokenize import word_tokenize
from itertools import chain

flattened_tokens = list(chain.from_iterable(train_tokens))
word_counts = Counter(flattened_tokens)
tokens = word_tokenize(corpus.lower()) 
df = pd.DataFrame(word_counts.items(),columns=['word','Frequency'])
df = df.sort_values(by='Frequency', ascending=False).reset_index(drop=True)

# Display the DataFrame
print(df)
for word, count in word_counts.items():
    print(f'{word}: {count}')

            word  Frequency
0            the        636
1              .        354
2             of        342
3            and        302
4             in        258
...          ...        ...
2705       2011.          1
2706       worth          1
2707        1.56          1
2708      option          1
2709  travellers          1

[2710 rows x 2 columns]
demonstrators: 1
were: 23
injured: 2
and: 302
killed: 3
in: 258
phnom: 26
penh: 24
where: 6
a: 145
reported: 6
20000: 1
protesters: 2
gathered: 1
with: 49
some: 14
clashing: 1
riot: 1
police: 1
.: 354
2012: 2
the: 636
exports: 3
grew: 3
to: 167
4.61: 1
billion: 3
up: 6
8: 3
over: 10
2011.: 1
first: 8
half: 4
of: 342
2013: 5
garment: 6
industry: 6
worth: 1
1.56: 1
hun: 19
sen: 16
remains: 2
de: 5
facto: 5
ruler: 1
cambodia: 168
through: 5
his: 15
continued: 1
leadership: 1
cambodian: 56
people: 22
s: 75
party: 12
that: 33
same: 3
year: 4
it: 27
was: 58
estimated: 3
there: 12
about: 11
100000: 2
sex: 2
workers: 5
2018: 6
they: 7
hand

In [None]:
from collections import defaultdict

def build_ngram_model(data, n):
    model = defaultdict(lambda: defaultdict(int))
    for sentence in data:
        for i in range(len(sentence) - n + 1):
            ngram = tuple(sentence[i:i+n])
            prefix, token = ngram[:-1], ngram[-1]
            model[prefix][token] += 1
    return model

def backoff_prob(model, ngram):
    prefix, token = ngram[:-1], ngram[-1]
    if prefix in model and sum(model[prefix].values()) > 0:
        if token in model[prefix]:
            return model[prefix][token] / sum(model[prefix].values())
    elif len(prefix) > 1:
        return backoff_prob(model, prefix[1:] + (token,))
    return 0  # Default probability if no valid prefix or tokens

ngram_model = build_ngram_model(train_tokens, 4)


In [None]:
# Build the LM2 interpolated method
def interpolated_prob(model, ngram, lambdas, k):
    n = len(ngram)
    prob = 0
    for i in range(1, n+1):
        prefix, token = ngram[:i-1], ngram[i-1]
        count = model[prefix][token] + k
        total_count = sum(model[prefix].values()) + k * vocab_size
        prob += lambdas[i-1] * (count / total_count)
    return prob

lambdas = [0.1, 0.2, 0.3, 0.4]  # Example weights
k = 1  # Example smoothing parameter


In [62]:

import math
import numpy as np

def perplexity(model, tokens, prob_func):
    log_prob_sum = 0
    total_tokens = 0
    
    for sentence in tokens:
        # Skip very short sentences
        if len(sentence) < 4:
            continue
        
        for i in range(3, len(sentence)):
            # Create n-gram
            ngram = tuple(sentence[i-3:i+1])
            
            try:
                # Calculate probability
                prob = prob_func(model, ngram)
                
                # Handle zero or negative probabilities
                if prob <= 0:
                    prob = 1e-10  # Small positive value
                
                # Sum log probabilities
                log_prob = math.log2(prob)
                log_prob_sum += log_prob
                total_tokens += 1
            
            except Exception as e:
                print(f"Error processing n-gram {ngram}: {e}")
                continue
    
    # Avoid division by zero
    if total_tokens == 0:
        return float('inf')
    
    # Calculate perplexity
    perplexity_score = 2 ** (-log_prob_sum / total_tokens)
    
    return perplexity_score

pp_lm1 = perplexity(model=ngram_model, tokens=test_tokens, prob_func=backoff_prob)
pp_lm2 = perplexity(ngram_model, test_tokens, lambda m, n: interpolated_prob(m, n, lambdas, k))

print(f"pp_lm1: {pp_lm1}")
print(f"pp_lm2: {pp_lm2}")

pp_lm1: 5478938442.122225
pp_lm2: 19745.429675291467


In [None]:
import random

def generate_text(model, start, length, prob_func):
    text = list(start)
    for _ in range(length - len(start)):
        prefix = tuple(text[-3:])
        if prefix not in model or not model[prefix] or sum(model[prefix].values()) == 0:
            next_words = random.choice(train_tokens)[:3]
            for next_word in next_words:
                text.append(next_word)
        else:
            next_word = max(model[prefix], key=lambda x: prob_func(model, prefix + (x,)))
            text.append(next_word)
    return ' '.join(text)

# Usage
start_tokens = list(random.choice(train_tokens)[:3])
backoff_text = generate_text(ngram_model, start_tokens, 100, backoff_prob)
start_tokens = list(random.choice(train_tokens)[:3])
interpolation_text = generate_text(ngram_model, start_tokens, 100, lambda m, n: interpolated_prob(m, n, lambdas, k))
print(f"Backoff Model: {backoff_text}")
print(f"Interpolation Model: {interpolation_text}")


AttributeError: 'list' object has no attribute 'capitalize'