In [None]:
import os
import math
import random
import re
import nltk
nltk.download('punkt')
from collections import defaultdict, Counter
from nltk import ngrams, FreqDist, ConditionalFreqDist
from nltk.tokenize import word_tokenize

def load_corpus(directory):
    corpus = ""
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), "r", encoding="utf-8") as file:
                corpus += file.read() + " "
    return corpus


def tokenize_text(text):
    return word_tokenize(text)

def preprocess_tokens(tokens):
    return [token for token in tokens if not re.search(r'\d', token)]

def train_ngram_models(tokens):
    unigram_fd = FreqDist(tokens)
    bigram_cfd = ConditionalFreqDist(ngrams(tokens, 2))
    trigram_cfd = ConditionalFreqDist([((w1, w2), w3) for w1, w2, w3 in ngrams(tokens, 3)])
    return unigram_fd, bigram_cfd, trigram_cfd

starting_words = ["subha", "raat", "aaj", "kal", "main", "hum", "uske", "phir"]

def generate_text(model, num_words=10, previous_sentence=None, direction="forward"):
    if model == "unigram":
        return " ".join(random.choices(list(unigram_fd.keys()), k=num_words))
    
    elif model == "bigram":
        sentence = []
        
        if direction == "backward":
            current_word = random.choice(list(backward_cfd.keys()))
        else:
            current_word = previous_sentence[-1] if previous_sentence else random.choice(starting_words)
        
        sentence.append(current_word)
        
        for _ in range(num_words - 1):
            next_word = None
            if current_word in bigram_cfd:
                next_word = random.choices(list(bigram_cfd[current_word].keys()), weights=list(bigram_cfd[current_word].values()))[0]
            if not next_word:
                break
            sentence.append(next_word)
            current_word = next_word
        
        return " ".join(sentence)
    
    elif model == "trigram":
        sentence = []
        
        if previous_sentence and len(previous_sentence) >= 2:
            current_context = (previous_sentence[-2], previous_sentence[-1])
        else:
            current_context = random.choice(list(trigram_cfd.keys()))
        
        sentence.extend(current_context)
        
        for _ in range(num_words - 2):
            next_word = None
            if current_context in trigram_cfd:
                next_word = random.choices(list(trigram_cfd[current_context].keys()), weights=list(trigram_cfd[current_context].values()))[0]
            if not next_word:
                break
            sentence.append(next_word)
            current_context = (current_context[1], next_word)
        
        return " ".join(sentence)
    
    elif model == "bidirectional":
        forward_text = generate_text("bigram", num_words//2, direction="forward")
        backward_text = generate_text("bigram", num_words//2, direction="backward")
        return " ".join(backward_text.split()[::-1]) + " " + forward_text
    
    return "Invalid model"

# Backward bigram Model
def backward_bigram_model(tokens):
    backward_bigrams = list(ngrams(tokens[::-1], 2))
    return ConditionalFreqDist(backward_bigrams)

#Bidirectional bigram Model
def bidirectional_bigram_model(tokens):
    forward_cfd = ConditionalFreqDist(list(ngrams(tokens, 2)))
    backward_cfd = backward_bigram_model(tokens)
    return forward_cfd, backward_cfd

# Calculate Perplexity
def calculate_perplexity(model, test_tokens, n):
    log_probability = 0
    N = len(test_tokens)
    
    if n == 1:
        #Unigram model
        total_tokens = sum(model.values())
        for token in test_tokens:
            prob = model[token] / total_tokens if token in model else 0
            log_probability += math.log2(prob) if prob > 0 else 0
    
    elif n == 2:
        #Bigram model
        for w1, w2 in ngrams(test_tokens, 2):
            prob = model[w1][w2] / sum(model[w1].values()) if w1 in model and w2 in model[w1] else 0
            log_probability += math.log2(prob) if prob > 0 else 0
    
    elif n == 3:
        #Trigram model
        for w1, w2, w3 in ngrams(test_tokens, 3):
            prob = model[(w1, w2)][w3] / sum(model[(w1, w2)].values()) if (w1, w2) in model and w3 in model[(w1, w2)] else 0
            log_probability += math.log2(prob) if prob > 0 else 0
    
    perplexity = 2 ** (-log_probability / N)
    return perplexity

if __name__ == "__main__":
    corpus = load_corpus("Diaries")
    tokens = preprocess_tokens(tokenize_text(corpus))
    
    #80% training, 20% test
    split = int(0.8 * len(tokens))
    train_tokens = tokens[:split]
    test_tokens = tokens[split:]
    
    unigram_fd, bigram_cfd, trigram_cfd = train_ngram_models(train_tokens)
    backward_cfd = backward_bigram_model(train_tokens)
    forward_cfd, backward_cfd = bidirectional_bigram_model(train_tokens)
    
    unigram_perplexity = calculate_perplexity(unigram_fd, test_tokens, 1)
    bigram_perplexity = calculate_perplexity(bigram_cfd, test_tokens, 2)
    trigram_perplexity = calculate_perplexity(trigram_cfd, test_tokens, 3)
    
    #unigram
    with open("unigram_output.txt", "w", encoding="utf-8") as f:
        f.write("=== Unigram Model ===\n")
        for i in range(10):
            sentence = generate_text('unigram', num_words=random.randint(7, 12))
            f.write(f"Sentence {i+1}: {sentence}\n")
    
    #save bigram
    with open("bigram_output.txt", "w", encoding="utf-8") as f:
        f.write("=== Bigram Model ===\n")
        previous_sentence = None
        for i in range(10):
            sentence = generate_text("bigram", num_words=random.randint(7, 12), previous_sentence=previous_sentence)
            f.write(f"Sentence {i+1}: {sentence}\n")
            previous_sentence = sentence.split()
    
    #save trigram
    with open("trigram_output.txt", "w", encoding="utf-8") as f:
        f.write("=== Trigram Model ===\n")
        previous_sentence = None
        for i in range(10):
            sentence = generate_text("trigram", num_words=random.randint(7, 12), previous_sentence=previous_sentence)
            f.write(f"Sentence {i+1}: {sentence}\n")
            previous_sentence = sentence.split()
    
    #backward bigram
    with open("backward_bigram_output.txt", "w", encoding="utf-8") as f:
        f.write("=== Backward Bigram Model ===\n")
        for i in range(10):
            sentence = generate_text('bigram', num_words=random.randint(7, 12), direction='backward')
            f.write(f"Sentence {i+1}: {sentence}\n")
    
    #bidirectional bigram
    with open("bidirectional_bigram_output.txt", "w", encoding="utf-8") as f:
        f.write("=== Bidirectional Bigram Model ===\n")
        for i in range(10):
            sentence = generate_text('bidirectional', num_words=random.randint(7, 12))
            f.write(f"Sentence {i+1}: {sentence}\n")

   
    print("\nPerplexity Results")
    print(f"Unigram Perplexity: {unigram_perplexity}")
    print(f"Bigram Perplexity: {bigram_perplexity}")
    print(f"Trigram Perplexity: {trigram_perplexity}")


Perplexity Results
Unigram Perplexity: 44.10074355133619
Bigram Perplexity: 1.739944707396139
Trigram Perplexity: 1.0849742603971948


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
