In [None]:
# Fetch the NLTK tokenizer resources used by the rest of the notebook.
# (`import nltk('all')` is not valid Python; the module must be imported
# plainly before `nltk.download` can be called.)
import nltk

nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import random
import math
from collections import defaultdict
from datasets import load_dataset
import nltk


def load_and_preprocess_data(num_sentences=50000):
    """Load WikiText-103 and return up to ``num_sentences`` tokenized sentences.

    Each non-empty, non-header paragraph of the training split is divided
    into sentences; every sentence is lowercased, word-tokenized and wrapped
    as ``["<s>", "<s>", "<s>", *tokens, "</s>"]``.  Three start tokens are
    added so the same data supports models up to a 4-gram.

    Args:
        num_sentences: Maximum number of sentences to collect.

    Returns:
        A list of token lists, at most ``num_sentences`` long.
    """
    print("Step 1: Loading dataset from Hugging Face... ")
    # Using a subset of the Wikipedia dataset
    wiki_dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split='train')

    print(f"Step 2: Processing {num_sentences} sentences...")
    sentences = []
    for paragraph in wiki_dataset:
        if len(sentences) >= num_sentences:
            break
        text = paragraph['text'].strip()
        # Ignore empty lines and document headers (lines starting with '=').
        if not text or text.startswith('='):
            continue
        # Use NLTK's public sentence splitter instead of unpickling
        # 'tokenizers/punkt/english.pickle' by hand, so the code does not
        # depend on the legacy pickled resource being installed.
        for sent in nltk.sent_tokenize(text):
            if len(sentences) >= num_sentences:
                break
            tokens = nltk.word_tokenize(sent.lower())
            sentences.append(["<s>", "<s>", "<s>"] + tokens + ["</s>"])

    print(f" Successfully processed {len(sentences)} sentences.")
    return sentences


def train_ngram_model(sentences, n, vocab_size):
    """Build an n-gram model with add-one (Laplace) smoothed probabilities.

    Args:
        sentences: Iterable of token lists.
        n: Order of the model; each context is the ``n - 1`` preceding tokens.
        vocab_size: Added to every context's total count in the denominator.

    Returns:
        A nested ``defaultdict`` mapping ``context tuple -> {word: prob}``
        where ``prob = (count + 1) / (context_total + vocab_size)``.  Only
        words actually observed after a context are stored.
    """
    model = defaultdict(lambda: defaultdict(lambda: 0))

    print(f"Step 3: Training {n}-gram model...")
    history_len = n - 1

    # Pass 1: raw counts of (history, next word) pairs.
    for tokens in sentences:
        window_count = len(tokens) - n + 1
        for start in range(window_count):
            split = start + history_len
            history = tuple(tokens[start:split])
            model[history][tokens[split]] += 1

    # Pass 2: overwrite each count with its smoothed probability.
    for distribution in model.values():
        denominator = float(sum(distribution.values())) + vocab_size
        for word, count in list(distribution.items()):
            distribution[word] = (count + 1) / denominator

    return model

def generate_sentence(model, n, start_prompt, max_len=12):
    """Sample a sentence from an n-gram model, continuing ``start_prompt``.

    Args:
        model: Mapping ``context tuple -> {word: weight}`` as produced by
            ``train_ngram_model``.
        n: Order of the model; the context is the last ``n - 1`` tokens.
        start_prompt: Sequence of tokens the sentence begins with.  It is
            copied, never modified.
        max_len: Generation stops once the sentence has this many tokens.

    Returns:
        The prompt plus generated tokens joined by single spaces.  Generation
        ends early when the current context is unknown to the model, has no
        continuations, or the end token ``"</s>"`` is drawn.
    """
    sentence = list(start_prompt)
    context_size = n - 1

    # Pad the beginning so a full context exists even for a short prompt.
    context_tokens = ["<s>"] * context_size + sentence

    while len(sentence) < max_len:
        # Slice from an explicit start index.  The negative form
        # ``tokens[-(n - 1):]`` becomes ``tokens[-0:]`` for a unigram model,
        # which is the whole list rather than the empty context.
        context = tuple(context_tokens[len(context_tokens) - context_size:])

        # ``get`` avoids inserting new keys when the model is a defaultdict.
        next_word_dist = model.get(context)
        if not next_word_dist:
            break  # unknown context or no continuations

        words = list(next_word_dist.keys())
        probabilities = list(next_word_dist.values())

        # Draw the next word in proportion to its stored weight.
        next_word = random.choices(words, weights=probabilities, k=1)[0]

        if next_word == "</s>":
            break

        sentence.append(next_word)
        context_tokens.append(next_word)

    return " ".join(sentence)


def calculate_perplexity(model, n, test_sentences, vocab_size, unknown_prob,
                         start_token="<s>"):
    """Compute the perplexity of an n-gram model on ``test_sentences``.

    Perplexity is ``2 ** H`` where ``H`` is the average negative base-2 log
    probability of the scored tokens.

    Args:
        model: Mapping ``context tuple -> {word: probability}``.
        n: Order of the model; the context is the ``n - 1`` preceding tokens.
        test_sentences: Iterable of token lists (already padded).
        vocab_size: Unused here; kept so existing callers keep working.
        unknown_prob: Probability used when the context or the target word
            was not seen in training.
        start_token: Padding token.  Positions whose *target* is this token
            are neither scored nor counted, so models of different order are
            evaluated on the same set of real tokens even when sentences
            carry more padding than ``n - 1``.

    Returns:
        The perplexity as a float; ``math.inf`` if any scored token has a
        probability that is not strictly positive.

    Raises:
        ValueError: If there is no token to score (e.g. empty test set).
    """
    total_log_prob = 0.0
    word_count = 0

    for sent in test_sentences:
        for i in range(len(sent) - n + 1):
            target = sent[i + n - 1]
            # Predicting padding is not part of the task; skipping it keeps
            # the token count equal to the number of log terms summed.
            if target == start_token:
                continue

            context = tuple(sent[i:i + n - 1])
            context_dist = model.get(context)
            if context_dist:
                prob = context_dist.get(target, unknown_prob)
            else:
                prob = unknown_prob

            # A zero-probability event makes the cross-entropy infinite;
            # dropping it silently would understate the perplexity.
            if prob <= 0:
                return math.inf

            total_log_prob += math.log2(prob)
            word_count += 1

    if word_count == 0:
        raise ValueError("cannot compute perplexity: no tokens to score")

    cross_entropy = -total_log_prob / word_count
    return math.pow(2, cross_entropy)

if __name__ == "__main__":
    # --- Data Preparation ---
    all_sentences = load_and_preprocess_data(num_sentences=50000)
    random.shuffle(all_sentences)

    # 90% of the shuffled sentences train the models, the rest evaluate them.
    train_size = int(len(all_sentences) * 0.9)
    train_sentences, test_sentences = (
        all_sentences[:train_size],
        all_sentences[train_size:],
    )

    print(f"\nTraining on {len(train_sentences)} sentences, testing on {len(test_sentences)}.")

    # The vocabulary comes from the training split only (no test leakage).
    all_words = []
    for sent in train_sentences:
        all_words.extend(sent)
    vocab = set(all_words)
    vocab_size = len(vocab)
    print(f"Vocabulary size: {vocab_size} unique words.")

    # --- Train All Models ---
    # The generator is consumed left to right, so training order is 2, 3, 4.
    bigram_model, trigram_model, fourgram_model = (
        train_ngram_model(train_sentences, n=order, vocab_size=vocab_size)
        for order in (2, 3, 4)
    )

    # --- Generate Sentences (Qualitative Evaluation) ---
    start_prompt = ["the", "man"]
    print(f"\n--- Generating Sentences (start: '{' '.join(start_prompt)}') ---")
    print("This fulfills the qualitative evaluation task of comparing fluency.")

    # Five samples per model, lowest order first.
    for header, model, order in (
        ("\n## Bigram Model (n=2):", bigram_model, 2),
        ("\n## Trigram Model (n=3):", trigram_model, 3),
        ("\n## 4-gram Model (n=4):", fourgram_model, 4),
    ):
        print(header)
        for i in range(5):
            print(f"{i+1}: {generate_sentence(model, order, start_prompt)}")

    # --- Calculate Perplexity (Quantitative Evaluation) ---
    print("\n--- Calculating Perplexity on Test Set ---")

    # Fallback probability handed to calculate_perplexity for unseen events.
    unknown_prob = 1 / vocab_size

    bi_perplexity = calculate_perplexity(
        bigram_model, 2, test_sentences, vocab_size, unknown_prob
    )
    print(f"Bigram Model Perplexity: {bi_perplexity:.2f}")

    tri_perplexity = calculate_perplexity(
        trigram_model, 3, test_sentences, vocab_size, unknown_prob
    )
    print(f"Trigram Model Perplexity: {tri_perplexity:.2f}")

    four_perplexity = calculate_perplexity(
        fourgram_model, 4, test_sentences, vocab_size, unknown_prob
    )
    print(f"4-gram Model Perplexity: {four_perplexity:.2f}")

Step 1: Loading dataset from Hugging Face... 
Step 2: Processing 50000 sentences...
 Successfully processed 50000 sentences.

Training on 45000 sentences, testing on 5000.
Vocabulary size: 48391 unique words.
Step 3: Training 2-gram model...
Step 3: Training 3-gram model...
Step 3: Training 4-gram model...

--- Generating Sentences (start: 'the man') ---
This fulfills the qualitative evaluation task of comparing fluency.

## Bigram Model (n=2):
1: the man comic role for 198 minutes and blues asked her type
2: the man 10 miles before redoing segments of a variety of controlled
3: the man , vuthipong demanded that thin and in the foppish tattle
4: the man named `` , according to solely producing video transmissions of
5: the man theory .

## Trigram Model (n=3):
1: the man of genius though he admitted the team .
2: the man who has attempted to recruit mainly in eastern south africa
3: the man 's ideas for the trinity mirror group , in order
4: the man is an american singer gwen stefani ,