In [34]:
import random
import nltk
from nltk.util import ngrams
from collections import Counter, defaultdict
import os
import re
import math

In [35]:
# Load dataset: concatenate every .txt file in the corpus folder into one
# lowercase string, with a single space appended after each file.
corpus_folder = 'corpus'
corpus_parts = []
# os.listdir() gives no ordering guarantee; sort the names so the corpus (and
# the n-grams that straddle file boundaries) is identical on every run.
for filename in sorted(os.listdir(corpus_folder)):
    if filename.endswith(".txt"):
        with open(os.path.join(corpus_folder, filename), 'r', encoding='utf-8') as f:
            text = f.read().lower()
        # Remove the "<number>." markers that precede the sentences in the diaries.
        # NOTE(review): the pattern is not anchored, so any digits followed by a
        # period are removed wherever they occur - confirm this suits the corpus.
        text = re.sub(r'\d+\.', '', text)
        corpus_parts.append(text + " ")
# Join once at the end instead of growing the string with += inside the loop.
corpus = "".join(corpus_parts)

In [36]:
# Tokenize the whole corpus with NLTK's word tokenizer
# (English is the tokenizer's default language; it is spelled out here).
words = nltk.word_tokenize(corpus, language="english")


In [37]:
# N-gram frequency tables built from the token list
unigram_model = Counter(words)
bigram_model = Counter(ngrams(words, 2))
trigram_model = Counter(ngrams(words, 3))
# Backward bigrams: every adjacent pair, keyed as (following word, preceding word)
backward_bigram_model = Counter(zip(words[1:], words))


In [38]:
def _new_prob_table():
    """Create an empty two-level table: context -> word -> probability (float)."""
    return defaultdict(lambda: defaultdict(float))


# Empty forward tables; this block only creates them, it does not fill them.
bigram_prob = _new_prob_table()
trigram_prob = _new_prob_table()

# Backward bigram table: P(preceding | following) =
#   count(following, preceding) / count(following)
backward_bigram_prob = _new_prob_table()
for (following, preceding), pair_count in backward_bigram_model.items():
    backward_bigram_prob[following][preceding] = pair_count / unigram_model[following]

In [39]:
# Forward bigram table: P(second | first) = count(first, second) / count(first)
for pair, pair_count in bigram_model.items():
    first, second = pair
    bigram_prob[first][second] = pair_count / unigram_model[first]

# Trigram table: P(last | two-word context) = count(context + last) / count(context)
for triple, triple_count in trigram_model.items():
    context, last = triple[:2], triple[2]
    trigram_prob[context][last] = triple_count / bigram_model[context]

In [40]:
# Generate diary entries for each ngram model
def generate_unigram_sentence(n=10):
    """Generate ``n`` words drawn independently from the unigram model.

    Each word is sampled with probability proportional to its count in
    ``unigram_model``. Sampling uniformly over the distinct words would ignore
    the counts and would not match how the bigram and trigram generators weight
    their choices.

    Args:
        n: Number of words to draw. ``0`` yields an empty string.

    Returns:
        The sampled words joined by single spaces.

    Raises:
        IndexError: If ``unigram_model`` is empty and ``n`` is positive.
    """
    vocabulary = list(unigram_model.keys())
    counts = list(unigram_model.values())
    return ' '.join(random.choices(vocabulary, weights=counts, k=n))

def generate_bigram_sentence(n=10):
    """Generate a sentence of ``n`` words by walking the forward bigram model.

    The first word is picked uniformly from the vocabulary. Each later word is
    sampled from ``bigram_prob[previous_word]``, weighted by probability. When
    the previous word has no recorded followers, a uniformly random vocabulary
    word is used instead.

    Args:
        n: Number of words to generate. Values <= 0 yield an empty string.

    Returns:
        The generated words joined by single spaces.

    Raises:
        IndexError: If ``unigram_model`` is empty and ``n`` is positive.
    """
    if n <= 0:
        return ''

    # Build the vocabulary list once; it serves both the seed and the fallback.
    vocabulary = list(unigram_model.keys())
    sentence = [random.choice(vocabulary)]

    while len(sentence) < n:
        # bigram_prob is a defaultdict, so an earlier `bigram_prob[key]` read on
        # an unseen key leaves an empty inner dict behind. A membership test
        # would accept that entry and random.choices() would then fail on an
        # empty population, so test the distribution itself for emptiness.
        followers = bigram_prob.get(sentence[-1])
        if followers:
            next_word = random.choices(list(followers.keys()),
                                       weights=list(followers.values()))[0]
        else:
            next_word = random.choice(vocabulary)
        sentence.append(next_word)

    return ' '.join(sentence)

def generate_trigram_sentence(n=10):
    """Generate a sentence of ``n`` words from the trigram model with back-off.

    The first word is picked uniformly from the vocabulary. Every later word is
    chosen by the first of these that has candidates:

    1. ``trigram_prob[(w[-2], w[-1])]`` (needs at least two words so far),
    2. ``bigram_prob[w[-1]]``,
    3. a uniformly random vocabulary word.

    The second word therefore comes from the bigram model conditioned on the
    first word, rather than being drawn independently of it.

    Args:
        n: Number of words to generate. Values <= 0 yield an empty string and
            ``n == 1`` yields exactly one word.

    Returns:
        The generated words joined by single spaces.

    Raises:
        IndexError: If ``unigram_model`` is empty and ``n`` is positive.
    """
    if n <= 0:
        return ''

    # Build the vocabulary list once; it serves both the seed and the fallback.
    vocabulary = list(unigram_model.keys())
    sentence = [random.choice(vocabulary)]

    while len(sentence) < n:
        # The tables are defaultdicts, so a key can be present with an empty
        # inner dict (left behind by an earlier `table[key]` read). Use .get()
        # and test for emptiness instead of relying on key membership.
        candidates = None
        if len(sentence) >= 2:
            candidates = trigram_prob.get((sentence[-2], sentence[-1]))
        if not candidates:
            candidates = bigram_prob.get(sentence[-1])

        if candidates:
            next_word = random.choices(list(candidates.keys()),
                                       weights=list(candidates.values()))[0]
        else:
            next_word = random.choice(vocabulary)
        sentence.append(next_word)

    return ' '.join(sentence)
def generate_backward_bigram_sentence(n=10):
    """Generate a sentence of ``n`` words right-to-left with the backward bigram model.

    A uniformly random vocabulary word is taken as the last word of the
    sentence. Each earlier word is then sampled from
    ``backward_bigram_prob[following_word]``, weighted by probability, falling
    back to a uniformly random vocabulary word when no predecessors are
    recorded. The collected words are reversed before joining so the result
    reads left-to-right.

    Args:
        n: Number of words to generate. Values <= 0 yield an empty string.

    Returns:
        The generated words, in reading order, joined by single spaces.

    Raises:
        IndexError: If ``unigram_model`` is empty and ``n`` is positive.
    """
    if n <= 0:
        return ''

    # Build the vocabulary list once; it serves both the seed and the fallback.
    vocabulary = list(unigram_model.keys())
    # Words are collected last-to-first and reversed at the end.
    sentence = [random.choice(vocabulary)]

    while len(sentence) < n:
        # backward_bigram_prob is a defaultdict, so a key may map to an empty
        # inner dict after an earlier `table[key]` read; treat that like a
        # missing key instead of sampling from an empty population.
        predecessors = backward_bigram_prob.get(sentence[-1])
        if predecessors:
            prev_word = random.choices(list(predecessors.keys()),
                                       weights=list(predecessors.values()))[0]
        else:
            prev_word = random.choice(vocabulary)
        sentence.append(prev_word)

    # Reverse the sentence to get the correct reading order
    sentence.reverse()
    return ' '.join(sentence)

In [41]:
def calculate_sentence_perplexity(model, sentence, n):
    """Compute the perplexity of a tokenized sentence under an n-gram model.

    Probabilities come from the notebook-level tables: ``unigram_model`` for
    ``n == 1``, ``bigram_prob`` for ``n == 2`` and ``trigram_prob`` for
    ``n == 3``. The first ``n - 1`` tokens have no full context and are not
    scored, so the log-probabilities are averaged over the tokens that were
    actually scored rather than over the whole sentence length.

    Args:
        model: Not used by the computation; kept so existing calls keep working.
        sentence: Sequence of tokens.
        n: Model order; must be 1, 2 or 3.

    Returns:
        ``2 ** (-mean log2 probability)`` over the scored tokens, or ``1.0``
        when the sentence has fewer than ``n`` tokens (nothing to score).

    Raises:
        ValueError: If ``n`` is not 1, 2 or 3.
    """
    if n not in (1, 2, 3):
        raise ValueError(f"n must be 1, 2 or 3, got {n!r}")

    epsilon = 1e-10  # Added once to every probability to avoid log(0)

    scored_tokens = len(sentence) - (n - 1)
    if scored_tokens <= 0:
        return 1.0

    # The corpus size does not change inside the loop; compute it once.
    total_count = sum(unigram_model.values()) if n == 1 else 0

    total_log_prob = 0.0
    for i in range(n - 1, len(sentence)):
        word = sentence[i]
        if n == 1:
            # Unigram: relative frequency of the word in the corpus
            prob = unigram_model[word] / total_count if total_count else 0.0
        elif n == 2:
            # Bigram: .get() so that scoring never inserts an unseen context
            # into the defaultdict table
            prob = bigram_prob.get(sentence[i - 1], {}).get(word, 0.0)
        else:
            # Trigram: same non-inserting lookup with a two-word context
            context = (sentence[i - 2], sentence[i - 1])
            prob = trigram_prob.get(context, {}).get(word, 0.0)

        # Accumulate the log probability
        total_log_prob += math.log2(prob + epsilon)

    # Average over the scored tokens, then convert to perplexity
    avg_log_prob = total_log_prob / scored_tokens
    return 2 ** (-avg_log_prob)

def calculate_backward_bigram_perplexity(sentence):
    """Compute the perplexity of a tokenized sentence under the backward bigram model.

    Every token except the last is scored as P(token | following token), read
    from ``backward_bigram_prob``. The last token has no following context and
    is not scored, so the log-probabilities are averaged over the
    ``len(sentence) - 1`` scored tokens.

    Args:
        sentence: Sequence of tokens (must support slicing).

    Returns:
        ``2 ** (-mean log2 probability)`` over the scored tokens, or ``1.0``
        when the sentence has fewer than two tokens (nothing to score).
    """
    epsilon = 1e-10  # Added once to every probability to avoid log(0)

    scored_tokens = len(sentence) - 1
    if scored_tokens <= 0:
        return 1.0

    total_log_prob = 0.0
    # Pair each word with the word that follows it; the follower is the context.
    for word, context in zip(sentence, sentence[1:]):
        # .get() so that scoring never inserts an unseen context into the
        # defaultdict table
        prob = backward_bigram_prob.get(context, {}).get(word, 0.0)
        total_log_prob += math.log2(prob + epsilon)

    # Average over the scored tokens, then convert to perplexity
    avg_log_prob = total_log_prob / scored_tokens
    return 2 ** (-avg_log_prob)

In [45]:
def save_to_file(filename, entries):
    """Write the strings in ``entries`` to ``filename``, one after another.

    The file is opened for writing as UTF-8, replacing any existing content.
    Nothing is inserted between entries, so each entry must carry its own
    line ending if one is wanted.
    """
    with open(filename, mode="w", encoding="utf-8") as out:
        for chunk in entries:
            out.write(chunk)

# Diary generation settings
NUM_ENTRIES = 10               # numbered sentences per diary
MIN_WORDS, MAX_WORDS = 7, 12   # inclusive bounds for the random sentence length


def _build_diary(title, generate, score):
    """Build the lines of one diary: a title line plus NUM_ENTRIES numbered sentences.

    Args:
        title: Heading text; written as "<title>:" on the first line.
        generate: Callable taking a word count and returning a sentence string.
        score: Callable taking the sentence's token list and returning its perplexity.

    Returns:
        A list of newline-terminated strings ready to be written to a file.
    """
    lines = [f"{title}:\n"]
    for number in range(1, NUM_ENTRIES + 1):
        tokens = generate(random.randint(MIN_WORDS, MAX_WORDS)).split()
        perplexity = score(tokens)
        lines.append(f"{number}. {' '.join(tokens)} (Perplexity: {perplexity:.2f})\n")
    return lines


# One row per diary: (output file, title, sentence generator, perplexity scorer).
# A single table replaces four copies of the same loop.
diary_specs = [
    ("unigram_diary.txt", "Unigram Diary", generate_unigram_sentence,
     lambda tokens: calculate_sentence_perplexity(unigram_model, tokens, n=1)),
    ("bigram_diary.txt", "Bigram Diary", generate_bigram_sentence,
     lambda tokens: calculate_sentence_perplexity(bigram_model, tokens, n=2)),
    ("trigram_diary.txt", "Trigram Diary", generate_trigram_sentence,
     lambda tokens: calculate_sentence_perplexity(trigram_model, tokens, n=3)),
    ("backward_bigram_diary.txt", "Backward Bigram Diary", generate_backward_bigram_sentence,
     calculate_backward_bigram_perplexity),
]

# Diaries are built in the order listed above.
diary_entries = {
    filename: _build_diary(title, generate, score)
    for filename, title, generate, score in diary_specs
}

# Save each diary to a separate text file
for filename, content in diary_entries.items():
    save_to_file(filename, content)