In [26]:
import wikipediaapi

# Set up the Wikipedia API with a valid user-agent
wiki = wikipediaapi.Wikipedia(
    language="en", 
    user_agent="YourAppName/1.0 (your-email@example.com)"
)

# Fetch the article
topic = "Cambodia"
article = wiki.page(topic)

if article.exists():
    with open(f"./data/wiki_dataset.txt", "w", encoding="utf-8") as file:
        file.write(article.text)
    
else:
    print(f"The article '{topic}' does not exist.")


1. Clean data


In [27]:
import re

def clean_wikipedia_text(text):
    # Remove citations like [1], [2][3], [citation needed]
    text = re.sub(r'\[[^\]]+\]', '', text)
    
    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)
    
    # Remove parenthetical information
    text = re.sub(r'\([^)]*\)', '', text)
    
    # Remove special formatting (e.g., ''italic'' or '''bold''')
    text = re.sub(r"''+|\<.*?\>", '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove lines starting with special characters (-, *, digits)
    text = '\n'.join(line for line in text.split('\n') if not line.strip().startswith(('-', '*', '1.')))
    
    return text

# Load text and clean
with open('data/wiki_dataset.txt', 'r', encoding='utf-8') as file:
    raw_text = file.read()

cleaned_text = clean_wikipedia_text(raw_text)

# Save cleaned text
with open('data/cleaned_wiki_text.txt', 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

2. Spit data into 3 subset for training, testing and validation 

In [28]:
import random
import re
from nltk.tokenize import word_tokenize, sent_tokenize

# Load and preprocess corpus
with open('data/cleaned_wiki_text.txt', 'r', encoding='utf-8') as file:
    # Convert to lowercase
    corpus = file.read().lower()

# Clean up text: remove extra newlines and leading/trailing spaces
corpus = re.sub(r'\n+', ' ', corpus).strip()  # Replace multiple newlines with a space
corpus = re.sub(r'\s+', ' ', corpus) 
 # Replace multiple spaces with a single space

# Split into sentences
sentences = sent_tokenize(corpus)

# Remove empty sentences
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

# Shuffle sentences
random.shuffle(sentences)

# Split into training, validation, and testing sets
train_split = int(0.7 * len(sentences))
val_split = int(0.8 * len(sentences))

train_set = sentences[:train_split]
val_set = sentences[train_split:val_split]
test_set = sentences[val_split:]

# Verify results
print(f"Total sentences: {len(sentences)}")
print(f"Training set: {len(train_set)} sentences")
print(f"Validation set: {len(val_set)} sentences")
print(f"Testing set: {len(test_set)} sentences")

# Optional: Save the sets to files for further use
with open('data/wiki_article/train_set.txt', 'w', encoding='utf-8') as file:
    file.write('. '.join(train_set) + '.')

with open('data/wiki_article/val_set.txt', 'w', encoding='utf-8') as file:
    file.write('. '.join(val_set) + '.')

with open('data/wiki_article/test_set.txt', 'w', encoding='utf-8') as file:
    file.write('. '.join(test_set) + '.')


Total sentences: 502
Training set: 351 sentences
Validation set: 50 sentences
Testing set: 101 sentences


In [29]:
import re
from collections import Counter

# Tokenize
tokens = word_tokenize(' '.join(train_set))
vocab_size = 20000
token_counts = Counter(tokens)
vocab = {word for word, _ in token_counts.most_common(vocab_size)}

# Clean up unwanted characters
def clean_tokens(tokens):
    cleaned = []
    for sentence in tokens:
        cleaned_sentence = [re.sub(r'[^\w.]', '', word) for word in sentence]  # Keep only words and periods
        cleaned_sentence = [word for word in cleaned_sentence if word]  # Remove empty tokens
        cleaned.append(cleaned_sentence)
    return cleaned

# Replace words not in vocab with <UNK> and clean
def replace_with_unk(data):
    return [
        [word if word in vocab else '<UNK>' for word in word_tokenize(sentence)]
        for sentence in data
    ]

train_tokens = replace_with_unk(train_set)
val_tokens = replace_with_unk(val_set)
test_tokens = replace_with_unk(test_set)

# Clean the tokenized data
train_tokens = clean_tokens(train_tokens)
val_tokens = clean_tokens(val_tokens)
test_tokens = clean_tokens(test_tokens)

# Check results
print(train_tokens[:10])  # Show first 10 sentences from train_tokens
print(val_tokens[:10])    # Show first 10 sentences from val_tokens
print(test_tokens[:10])   # Show first 10 sentences from test_tokens


[['when', 'it', 'is', 'dark', 'enough', 'the', 'tea', 'is', 'decanted', 'into', 'another', 'cup', 'and', 'plenty', 'of', 'sugar', 'added', 'but', 'no', 'milk', '.'], ['more', 'rural', 'forms', 'of', 'music', 'include', 'chapei', 'and', 'ayai', '.'], ['in', 'the', '1980s', 'keo', 'surath', 'and', 'others', 'carried', 'on', 'the', 'legacy', 'of', 'the', 'classic', 'singers', 'often', 'remaking', 'their', 'popular', 'songs', '.'], ['this', 'era', 'gave', 'rise', 'to', 'the', 'term', 'killing', 'fields', 'and', 'the', 'prison', 'tuol', 'sleng', 'became', 'known', 'for', 'its', 'history', 'of', 'mass', 'killing', '.'], ['in', '2017', 'cambodia', 's', 'supreme', 'court', 'dissolved', 'the', 'main', 'opposition', 'party', 'cambodia', 'national', 'rescue', 'party', 'paving', 'the', 'way', 'for', 'a', 'return', 'to', 'a', 'yet', 'more', 'authoritarian', 'political', 'system', '.'], ['cambodia', 'was', 'ranked', '103rd', 'in', 'the', 'global', 'innovation', 'index', 'in', '2024.', 'energy', 'cam

In [30]:
import pandas as pd 
from collections import Counter 
from nltk.tokenize import word_tokenize
from itertools import chain

flattened_tokens = list(chain.from_iterable(train_tokens))
word_counts = Counter(flattened_tokens)
tokens = word_tokenize(corpus.lower()) 
df = pd.DataFrame(word_counts.items(),columns=['word','Frequency'])
df = df.sort_values(by='Frequency', ascending=False).reset_index(drop=True)

# Display the DataFrame
print(df)
for word, count in word_counts.items():
    print(f'{word}: {count}')

           word  Frequency
0           the        622
1             .        351
2           and        315
3            of        308
4            in        229
...         ...        ...
2595     surath          1
2596  breakdown          1
2597     spread          1
2598  sculpture          1
2599  exchanged          1

[2600 rows x 2 columns]
when: 12
it: 21
is: 98
dark: 1
enough: 1
the: 622
tea: 7
decanted: 1
into: 13
another: 4
cup: 1
and: 315
plenty: 2
of: 308
sugar: 3
added: 1
but: 10
no: 3
milk: 2
.: 351
more: 12
rural: 4
forms: 2
music: 10
include: 8
chapei: 1
ayai: 1
in: 229
1980s: 2
keo: 2
surath: 1
others: 2
carried: 2
on: 36
legacy: 2
classic: 2
singers: 1
often: 9
remaking: 1
their: 15
popular: 10
songs: 1
this: 11
era: 2
gave: 2
rise: 3
to: 167
term: 5
killing: 2
fields: 1
prison: 3
tuol: 1
sleng: 1
became: 5
known: 5
for: 53
its: 25
history: 2
mass: 3
2017: 3
cambodia: 139
s: 66
supreme: 2
court: 3
dissolved: 4
main: 11
opposition: 6
party: 16
national: 22
rescue: 2
pa

In [31]:
from collections import defaultdict

def build_ngram_model(data, n):
    model = defaultdict(lambda: defaultdict(int))
    for sentence in data:
        for i in range(len(sentence) - n + 1):
            ngram = tuple(sentence[i:i+n])
            prefix, token = ngram[:-1], ngram[-1]
            model[prefix][token] += 1
    return model

def backoff_prob(model, ngram):
    prefix, token = ngram[:-1], ngram[-1]
    if prefix in model and sum(model[prefix].values()) > 0:
        if token in model[prefix]:
            return model[prefix][token] / sum(model[prefix].values())
    elif len(prefix) > 1:
        return backoff_prob(model, prefix[1:] + (token,))
    return 0  # Default probability if no valid prefix or tokens

ngram_model = build_ngram_model(train_tokens, 4)


In [32]:
# Build the LM2 interpolated method
def interpolated_prob(model, ngram, lambdas, k):
    n = len(ngram)
    prob = 0
    for i in range(1, n+1):
        prefix, token = ngram[:i-1], ngram[i-1]
        count = model[prefix][token] + k
        total_count = sum(model[prefix].values()) + k * vocab_size
        prob += lambdas[i-1] * (count / total_count)
    return prob

lambdas = [0.1, 0.2, 0.3, 0.4]  # Example weights
k = 1  # Example smoothing parameter


In [33]:

import math
import numpy as np

def perplexity(model, tokens, prob_func):
    log_prob_sum = 0
    total_tokens = 0
    
    for sentence in tokens:
        if len(sentence) < 4:
            continue
        
        for i in range(3, len(sentence)):
            # Create n-gram
            ngram = tuple(sentence[i-3:i+1])
            
            try:
                # Calculate probability
                prob = prob_func(model, ngram)
                
                # Handle zero or negative probabilities
                if prob <= 0:
                    prob = 1e-10  # Small positive value
                
                # Sum log probabilities
                log_prob = math.log2(prob)
                log_prob_sum += log_prob
                total_tokens += 1
            
            except Exception as e:
                print(f"Error processing n-gram {ngram}: {e}")
                continue
    
    # Avoid division by zero
    if total_tokens == 0:
        return float('inf')
    
    # Calculate perplexity
    perplexity_score = 2 ** (-log_prob_sum / total_tokens)
    
    return perplexity_score

pp_lm1 = perplexity(model=ngram_model, tokens=test_tokens, prob_func=backoff_prob)
pp_lm2 = perplexity(ngram_model, test_tokens, lambda m, n: interpolated_prob(m, n, lambdas, k))

print(f"pp_lm1: {pp_lm1}")
print(f"pp_lm2: {pp_lm2}")

pp_lm1: 5839811413.019416
pp_lm2: 19780.5554358151


In [None]:
import random

def generate_text(model, start, length, prob_func):
    text = list(start)
    for _ in range(length - len(start)):
        prefix = tuple(text[-3:])
        if prefix not in model or not model[prefix] or sum(model[prefix].values()) == 0:
            next_words = random.choice(train_tokens)[:3]
            for next_word in next_words:
                text.append(next_word)
        else:
            next_word = max(model[prefix], key=lambda x: prob_func(model, prefix + (x,)))
            text.append(next_word)
    return ' '.join(text)

start_tokens = list(random.choice(train_tokens)[:3])
backoff_text = generate_text(ngram_model, start_tokens, 50, backoff_prob)
start_tokens = list(random.choice(train_tokens)[:3])
interpolation_text = generate_text(ngram_model, start_tokens, 50, lambda m, n: interpolated_prob(m, n, lambdas, k))
print(f"Backoff Model: {backoff_text}")
print(f"Interpolation Model: {interpolation_text}")


Backoff Model: most are urban dwellers engaged primarily in commerce . pol pot was determined to keep his power and disenfranchise any enemies or potential threats and thus increased his violent and aggressive actions against his people . a powersharing agreement was agreed with ranariddh and hun sen of the cambodian people s party . this marked the beginning of the 21st century . cambodia has two major ports phnom penh and cambodia s 150 casinos are the main attractions for foreign tourists . flowing south through cambodia s eastern regions is the mekong river of the silt vietnam needs for its rice basket . in response to border raids by the khmer rouge
Interpolation Model: bribes are often demanded from companies operating in cambodia when obtaining licences and permits such as constructionrelated permits . the election was won by funcinpec led by sihanouk s son ranariddh in a hung parliament . in 2018 they handled a record of 10 million passengers . according to the international de