In [2]:
# Import necessary libraries
from nltk import sent_tokenize, word_tokenize
from collections import defaultdict
from math import log
import sys, time

# Function to read lines from a file
def get_lines(path):
    """Return the full contents of the text file at *path* as one string.

    Despite the name, the text is not split into lines: the caller gets
    exactly what a single ``read()`` on the file produced. The file is
    opened in text mode with the platform's default encoding.
    """
    with open(path, mode='r') as handle:
        return handle.read()

# Function to get unigrams
def get_unigrams(sentences):
    """Count token occurrences in a whitespace-separated string.

    The whole input is treated as a single sequence: one ``<s>`` marker is
    placed in front and one ``</s>`` marker at the end before counting, so
    both markers appear in the result.

    Returns a dict mapping each token to its count, keyed in the order the
    tokens were first seen.
    """
    tokens = ['<s>', *sentences.split(), '</s>']
    counts = {}
    for token in tokens:
        # dict.get supplies 0 the first time a token is encountered.
        counts[token] = counts.get(token, 0) + 1
    return counts

# Function to get bigrams
def get_bigrams(sentences):
    """Count adjacent token pairs in a whitespace-separated string.

    The input is wrapped in a single ``<s>`` ... ``</s>`` pair (it is not
    split into sentences), then every consecutive pair of tokens is counted.

    Returns a dict mapping ``(first, second)`` tuples to their counts, keyed
    in the order the pairs were first seen.
    """
    tokens = ['<s>', *sentences.split(), '</s>']
    counts = {}
    # Pairing the sequence with itself shifted by one yields each bigram.
    for pair in zip(tokens, tokens[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts

# Function to calculate surprisal
def get_surprisal(p):
    """Return the surprisal of probability *p* in bits, i.e. ``-log(p, 2)``.

    *p* must be positive; ``math.log`` raises ``ValueError`` otherwise.
    """
    return -log(p, 2)

# Function to calculate bigram surprisal
def get_bigram_surprisal(unigram_dict, bigram_dict):
    """Map every bigram in *bigram_dict* to its add-one-smoothed surprisal.

    For a bigram ``(w1, w2)`` with count ``c`` the conditional probability is
    ``(c + 1) / (count(w1) + V)``, where ``V`` is the number of distinct
    keys in *unigram_dict*. If ``w1`` has no (or a zero) unigram count the
    probability falls back to ``1 / V``. The surprisal is ``-log2`` of that
    probability.

    Only bigrams present in *bigram_dict* receive an entry; unseen bigrams
    are not represented in the returned dict.
    """
    vocab_size = len(unigram_dict)
    surprisals = {}

    for pair, pair_count in bigram_dict.items():
        context, _ = pair
        context_count = unigram_dict.get(context, 0)

        if context_count != 0:
            prob = (pair_count + 1) / (context_count + vocab_size)
        else:
            # Context word never counted as a unigram: uniform fallback.
            prob = 1 / vocab_size

        surprisals[pair] = -log(prob, 2)

    return surprisals

# Function to calculate perplexity
def get_perplexity(bi_surp, test):
    """Compute the bigram perplexity of *test* and print a trace of the steps.

    Args:
        bi_surp: dict mapping ``(first, second)`` token tuples to their
            surprisal in bits.
        test: whitespace-separated text. The whole string is scored as ONE
            sequence wrapped in a single ``<s>`` ... ``</s>`` pair; it is
            not split into sentences.

    Returns:
        ``2 ** (total surprisal / number of bigrams in the test sequence)``.
        When *bi_surp* is empty every bigram is unseen and the result is
        ``inf``.

    Side effects:
        Prints the default surprisal, the token list, each bigram with the
        surprisal used for it, the average surprisal and the perplexity.

    NOTE(review): bigrams missing from *bi_surp* are charged
    ``-log2(1 / len(bi_surp))``, i.e. a uniform guess over the seen bigram
    types. That is a fallback heuristic, not the add-one estimate
    ``1 / (count(w1) + V)``; this function has no access to the counts
    needed for the latter.
    """
    total_bigrams = len(bi_surp)
    if total_bigrams:
        default_surp = -log(1 / total_bigrams, 2)
    else:
        # An empty model used to crash with ZeroDivisionError here. With no
        # seen bigrams the uniform fallback probability is 0, so the
        # surprisal (and the resulting perplexity) is infinite.
        default_surp = float('inf')

    print(f"Default surprisal for unseen bigrams: {default_surp}")

    words = ['<s>'] + test.split() + ['</s>']
    # One prediction per bigram; always >= 1 because of the two markers.
    word_count = len(words) - 1
    print(f"Processed sentence: {words}")

    total_surp = 0.0
    for bigram in zip(words, words[1:]):
        surprisal_value = bi_surp.get(bigram, default_surp)
        total_surp += surprisal_value
        print(f"Bigram: {bigram}, Surprisal: {surprisal_value}")

    avg_surprisal = total_surp / word_count
    print(f"Average surprisal (entropy): {avg_surprisal}")

    perplexity = 2 ** avg_surprisal
    print(f"Calculated perplexity: {perplexity}")

    return perplexity

# Main function
def main():
    """Run the bigram-model demo on two hard-coded example strings.

    Builds unigram and bigram counts from the training string, prints them,
    prints the smoothed surprisal of every training bigram, and finally
    prints the perplexity of the test string under that model.
    """
    training_text = "I would much rather eat pizza than ice cream ."
    held_out_text = "I love anchovies on my pizza . "

    unigram_counts = get_unigrams(training_text)
    bigram_counts = get_bigrams(training_text)

    print("Unigram Frequencies:")
    for token, count in unigram_counts.items():
        print(f"{token},{count}")

    print("\nBigram Frequencies:")
    for pair, count in bigram_counts.items():
        print(f"{pair},{count}")

    surprisals = get_bigram_surprisal(unigram_counts, bigram_counts)

    print("\nBigram Surprisal Details:")
    for pair, bits in surprisals.items():
        print(f"{pair}: {bits}")

    result = get_perplexity(surprisals, held_out_text)
    print(f"\nPerplexity on Test Set: {result}")

# Run the demo only when this file is executed as a script; importing the
# module defines the functions without producing any output.
if __name__ == "__main__":
    main()


Unigram Frequencies:
<s>,1
I,1
would,1
much,1
rather,1
eat,1
pizza,1
than,1
ice,1
cream,1
.,1
</s>,1

Bigram Frequencies:
('<s>', 'I'),1
('I', 'would'),1
('would', 'much'),1
('much', 'rather'),1
('rather', 'eat'),1
('eat', 'pizza'),1
('pizza', 'than'),1
('than', 'ice'),1
('ice', 'cream'),1
('cream', '.'),1
('.', '</s>'),1

Bigram Surprisal Details:
('<s>', 'I'): 2.700439718141092
('I', 'would'): 2.700439718141092
('would', 'much'): 2.700439718141092
('much', 'rather'): 2.700439718141092
('rather', 'eat'): 2.700439718141092
('eat', 'pizza'): 2.700439718141092
('pizza', 'than'): 2.700439718141092
('than', 'ice'): 2.700439718141092
('ice', 'cream'): 2.700439718141092
('cream', '.'): 2.700439718141092
('.', '</s>'): 2.700439718141092
Default surprisal for unseen bigrams: 3.4594316186372978
Processed sentence: ['<s>', 'I', 'love', 'anchovies', 'on', 'my', 'pizza', '.', '</s>']
Bigram: ('<s>', 'I'), Surprisal: 2.700439718141092
Bigram: ('I', 'love'), Surprisal: 3.4594316186372978
Bigram: ('l