# Language Models and Smoothing

# Impots Libraries

In [107]:
from collections import defaultdict
import random
import math
import os.path
import sys
from collections import defaultdict

## Readfile

In [108]:

def readFileToCorpus(f):
    """ Reads in the text file f which contains one sentence per line.
    """
    if os.path.isfile(f):
        with open(f, "r", encoding="utf-8") as file:
            i = 0
            corpus = []
            print("Reading file ", f)
            for line in file:
                i += 1
                sentence = line.split()
                corpus.append(sentence)
                if i % 1000 == 0:
                    sys.stderr.write("Reading sentence " + str(i) + "\n")
        return corpus
    else:
        print("Error: corpus file ", f, " does not exist")
        sys.exit()


## PreProcesing 

In [109]:
# Constants
UNK = "UNK"
start = "<s>"
end = "</s>"

def preprocess(corpus):
    # Find all the rare words
    freqDict = defaultdict(int)
    for sen in corpus:
        for word in sen:
            freqDict[word] += 1

    # Replace rare words with UNK
    for sen in corpus:
        for i in range(len(sen)):
            word = sen[i]
            if freqDict[word] < 2:
                sen[i] = UNK

    # Bookend the sentences with start and end tokens
    for sen in corpus:
        sen.insert(0, start)
        sen.append(end)

    return corpus

def preprocessTest(vocab, corpus):
    # Replace test words that were unseen in the training with UNK
    for sen in corpus:
        for i in range(len(sen)):
            word = sen[i]
            if word not in vocab:
                sen[i] = UNK

    # Bookend the sentences with start and end tokens
    for sen in corpus:
        sen.insert(0, start)
        sen.append(end)

    return corpus


## Base Language model 

In [110]:
class LanguageModel:
    def __init__(self, corpus, **kwargs):
        self.smoothing_value_unigram = kwargs.get('smoothing_value_unigram', 1)
        self.smoothing_value_bigram = kwargs.get('smoothing_value_bigram', 0.5)
        self.unigram_dist = self.train_unigram(corpus)
        self.bigram_dist = self.train_bigram(corpus)
        self.smoothed_unigram_dist = self.train_smoothed_unigram(corpus)
        self.smoothed_bigram_dist = self.train_smoothed_bigram(corpus)

    def train_unigram(self, corpus):
        unigram_counts = defaultdict(int)
        total_words = 0

        for sentence in corpus:
            for word in sentence:
                unigram_counts[word] += 1
                total_words += 1

        unigram_dist = {word: count / total_words for word, count in unigram_counts.items()}
        return unigram_dist

    def train_bigram(self, corpus):
        bigram_counts = defaultdict(lambda: defaultdict(int))

        for sentence in corpus:
            for i in range(len(sentence) - 1):
                current_word = sentence[i]
                next_word = sentence[i + 1]

                bigram_counts[current_word][next_word] += 1

        bigram_dist = {word1: {word2: count for word2, count in bigram_counts[word1].items()} for word1 in bigram_counts}
        return bigram_dist

    def train_smoothed_unigram(self, corpus):
        unigram_counts = defaultdict(int)
        total_words = 0

        for sentence in corpus:
            for word in sentence:
                unigram_counts[word] += 1
                total_words += 1

        smoothed_unigram_dist = {word: (count + self.smoothing_value_unigram) / (total_words + self.smoothing_value_unigram * len(unigram_counts))
                                 for word, count in unigram_counts.items()}
        return smoothed_unigram_dist

    def train_smoothed_bigram(self, corpus):
        unigram_counts = defaultdict(int)
        bigram_counts = defaultdict(lambda: defaultdict(int))

        total_words = 0

        for sentence in corpus:
            for i in range(len(sentence)):
                word = sentence[i]
                unigram_counts[word] += 1
                total_words += 1

                if i < len(sentence) - 1:
                    next_word = sentence[i + 1]
                    bigram_counts[word][next_word] += 1

        smoothed_bigram_dist = {}
        for word1 in bigram_counts:
            total_bigrams = sum(bigram_counts[word1].values())
            smoothed_bigram_dist[word1] = defaultdict(float)
            for word2 in bigram_counts[word1]:
                prob = (1 - self.smoothing_value_bigram) * (bigram_counts[word1][word2] / unigram_counts[word1]) + \
                       self.smoothing_value_bigram * (unigram_counts[word2] / total_words)
                smoothed_bigram_dist[word1][word2] = prob

        return smoothed_bigram_dist


    def generateSentence(self):
        sentence = ['<s>']
        while True:
            # Assuming word1 is the last word in the sentence
            word1 = sentence[-1]
            
            # Corrected line for bigram model
            word2 = random.choices(
                list(self.bigram_dist.get(word1, {}).keys()),
                weights=list(self.bigram_dist.get(word1, {}).values())
            )[0]
            
            sentence.append(word2)
            
            # Break if the generated word is '</s>' or if sentence length exceeds 200
            if word2 == '</s>' or len(sentence) > 200: 
                break
        return sentence

    def getSentenceProbability(self, sen):
        probability = 1.0
        for word in sen:
            probability *= self.unigram_dist.get(word, 0)
        return probability

    def getSentenceProbabilityBigram(self, sentence):
        probability = 1.0
        for i in range(1, len(sentence)):
            bigram = (sentence[i-1], sentence[i])
            probability *= self.bigram_dist.get(bigram, 0)
        return probability

    def getCorpusPerplexity(self, corpus):
        total_log_prob = 0.0
        total_words = 0

        for sentence in corpus:
            for word in sentence:
                total_log_prob += -1 * self.unigram_dist.get(word, 0)
                total_words += 1

        avg_log_prob = total_log_prob / total_words
        perplexity = 2 ** (-avg_log_prob)  # Correcting the formula
        return perplexity

    def getCorpusPerplexityBigram(self, corpus):
        total_log_prob = 0.0
        total_bigrams = 0

        for sentence in corpus:
            # Assuming sentence is a list of words
            for i in range(1, len(sentence)):
                # Calculate bigram probability
                bigram = (sentence[i-1], sentence[i])
                total_log_prob += -1 * self.bigram_dist.get(bigram, 0)
                total_bigrams += 1

        avg_log_prob = total_log_prob / total_bigrams
        perplexity = 2 ** avg_log_prob
        return perplexity

    def generateSentencesToFile(self, numberOfSentences, filename, modelName):
        with open(filename, 'w+') as filePointer:
            print(modelName, file=filePointer)
            for i in range(numberOfSentences):
                sen = self.generateSentence()
                prob = self.getSentenceProbability(sen)

                stringGenerated = f"{prob} {' '.join(sen)}"
                print(stringGenerated, file=filePointer)

    def generateSentencesToFileBigram(self, numberOfSentences, filename, modelName):
        with open(filename, 'w+') as filePointer:
            print(modelName, file=filePointer)
            for i in range(numberOfSentences):
                sen = self.generateSentence()
                prob = self.getSentenceProbabilityBigram(sen)

                stringGenerated = f"{prob} {' '.join(sen)}"
                print(stringGenerated, file=filePointer)


## UniGram Model Without Smoothing

In [111]:
class UnigramModel(LanguageModel):
    def __init__(self, corpus):
        super().__init__(corpus)

    def train_unigram(self, corpus):
        unigram_dist = super().train_unigram(corpus)
        return unigram_dist

    # Override the general generateSentence method
    def generateSentence(self):
        sentence = super().generateSentence()
        return sentence

    # Override the general getSentenceProbability method
    def getSentenceProbability(self, sen):
        probability = super().getSentenceProbability(sen)
        return probability

    # Override the general getCorpusPerplexity method
    def getCorpusPerplexity(self, corpus):
        perplexity = super().getCorpusPerplexity(corpus)
        return perplexity
    def generateSentencesToFile(self, numberOfSentences, filename,model):
        # Call the function from the parent class using super()
        super().generateSentencesToFile(numberOfSentences, filename,model)


## Smoothed Unigram MOdel

In [112]:
class SmoothedUnigramModel(LanguageModel):
    def __init__(self, corpus):
        super().__init__(corpus)
        self.unigram_dist = self.train(corpus)

    def train(self, corpus):
        smoothed_unigram_dist = super().train_smoothed_unigram(corpus)
        return smoothed_unigram_dist

    # Override the general generateSentence method
    def generateSentence(self):
        sentence = super().generateSentence()
        return sentence

    # Override the general getSentenceProbability method
    def getSentenceProbability(self, sen):
        probability = super().getSentenceProbability(sen)
        return probability

    # Override the general getCorpusPerplexity method
    def getCorpusPerplexity(self, corpus):
        perplexity = super().getCorpusPerplexity(corpus)
        return perplexity

    def generateSentencesToFile(self, numberOfSentences, filename, model):
        # Call the function from the parent class using super()
        super().generateSentencesToFile(numberOfSentences, filename, model)


## UN Smoothed BIgram Model 

In [113]:

class UnsmoothedBigramModel(LanguageModel):
    def __init__(self, corpus):
        super().__init__(corpus)
        self.bigram_dist = self.train(corpus)

    def train(self, corpus):
        unsmoothed_bigram_dist =  super().train_bigram(corpus)
        return unsmoothed_bigram_dist

    def generateSentence(self):
        sentence = super().generateSentence()
        return sentence

    def getSentenceProbability(self, sen):
        probability = super().getSentenceProbabilityBigram(sen)
        return probability

    def getCorpusPerplexity(self, corpus):
        perplexity = super().getCorpusPerplexityBigram(corpus)
        return perplexity

    def generateSentencesToFile(self, numberOfSentences, filename, model):
        super().generateSentencesToFileBigram(numberOfSentences, filename, model)


## Smoothed Bigram MOdel

In [114]:
class SmoothedBigramModelLI(LanguageModel):
    def __init__(self, corpus, **kwargs):
        super().__init__(corpus)
        self.bigram_dist = self.train(corpus, **kwargs)

    def train(self, corpus, **kwargs):
        smoothed_bigram_dist = super().train_smoothed_bigram(corpus)

        return smoothed_bigram_dist

    def generateSentence(self):
        sentence = super().generateSentence()
        return sentence

    def getSentenceProbability(self, sen):
        probability = super().getSentenceProbabilityBigram(sen)
        return probability

    def getCorpusPerplexity(self, corpus):
        perplexity = super().getCorpusPerplexityBigram(corpus)
        return perplexity

    def generateSentencesToFile(self, numberOfSentences, filename, model):
        super().generateSentencesToFileBigram(numberOfSentences, filename, model)


In [115]:
#-------------------------------------------
# The main routine
#-------------------------------------------
if __name__ == "__main__":

    trainCorpus = readFileToCorpus("D:/6th_Semester/NLP/Assignments/Assignment_1/train.txt")
    posTestCorpus = readFileToCorpus("D:/6th_Semester/NLP/Assignments/Assignment_1/neg_test.txt")
    negTestCorpus = readFileToCorpus("D:/6th_Semester/NLP/Assignments/Assignment_1/pos_test.txt")

    trainCorpus = preprocess(trainCorpus)


    # Create a unigram language model
    unigramModel = UnigramModel(trainCorpus)
    # Create Smoothed Language model 
    smoothed_model = SmoothedUnigramModel(trainCorpus)

    # Create  UnsmoothedBigramModel Language Model
    unsmoothed_bigram_model = UnsmoothedBigramModel(trainCorpus)

     # Create  smoothedBigramModel Language Model   
    smoothed_bigram_model_li = SmoothedBigramModelLI(trainCorpus, smoothing_value=0.5)


    vocab = set()
    for sentence in trainCorpus:
        for word in sentence:
            vocab.add(word)

    # Preprocess test corpora
    posTestCorpus = preprocessTest(vocab, posTestCorpus)
    negTestCorpus = preprocessTest(vocab, negTestCorpus)


    # Calculate the probability for each sentence in the test corpora Unigram 
    pos_sentence_probabilities_unigram = [unigramModel.getSentenceProbability(sentence) for sentence in posTestCorpus]
    neg_sentence_probabilities_unigram = [unigramModel.getSentenceProbability(sentence) for sentence in negTestCorpus]
    # Calculate the probability for each sentence in the test corpora smoothed Unigram 
    pos_sentence_probabilities_smoothed_unigram = [smoothed_model.getSentenceProbability(sentence) for sentence in posTestCorpus]
    neg_sentence_probabilities_smoothed_unigram = [smoothed_model.getSentenceProbability(sentence) for sentence in negTestCorpus]
    # Calculate the probability for each sentence in the test corpora BiUnigram 
    pos_sentence_probabilities_unsmoothed_bigram_model= [unsmoothed_bigram_model.getSentenceProbability(sentence) for sentence in posTestCorpus]
    neg_sentence_probabilities_unsmoothed_bigram_model = [unsmoothed_bigram_model.getSentenceProbability(sentence) for sentence in negTestCorpus]
    # Calculate the probability for each sentence in the test corpora smoothed Unigram 
    pos_sentence_probabilities_smoothed_BIunigram = [smoothed_bigram_model_li.getSentenceProbability(sentence) for sentence in posTestCorpus]
    neg_sentence_probabilities_smoothed_BIunigram = [smoothed_bigram_model_li.getSentenceProbability(sentence) for sentence in negTestCorpus]

    # Get corpus perplexity Unigram 
    corpus_perplexity_Uni = unigramModel.getCorpusPerplexity(trainCorpus)
    # Get corpus perplexity Smoothed Unigram 
    corpus_perplexity_S_Uni = smoothed_model.getCorpusPerplexity(trainCorpus)
    # Get corpus perplexity BIgram 
    corpus_perplexity_BI = unsmoothed_bigram_model.getCorpusPerplexity(trainCorpus)
    # Get corpus perplexity Smoothed Bigram 
    corpus_perplexity_S_BI = smoothed_bigram_model_li.getCorpusPerplexity(trainCorpus)


    smoothed_model = SmoothedUnigramModel(trainCorpus)
    print("\n---------------UN Smoothed Unigram Model Result --------------------------------------------------\n")
    print(unigramModel.unigram_dist)
    print("\n--------------- Smoothed Unigram Model Result --------------------------------------------------\n")
    print(smoothed_model.unigram_dist)
    print("\n---------------UN Smoothed Biigram Model Result --------------------------------------------------\n")

    for word1, word2_counts in list(unsmoothed_bigram_model.bigram_dist.items())[:5]:
        print(f"Bigram pairs for '{word1}': {word2_counts}")
    print("\n--------------- Smoothed Bigram Model Result --------------------------------------------------\n")
    for word1, word2_dict in list(smoothed_bigram_model_li.bigram_dist.items())[:5]:
        print(f"Bigram pairs for '{word1}': {word2_dict}")

    print("\n\n---------------------Test Sentences -------------------------------------------------------\n")    

    # Print UniGRam sentence probabilities for positive test corpus
    print("\n\n----------------  Unigram sentence Positive Test Corpus Sentence Probabilities:-------------\n")
    for i, probability in enumerate(pos_sentence_probabilities_unigram):
        print(f"Sentence {i + 1} Probability: {probability}")

    # Print Unigram sentence probabilities for negative test corpus
    print("\n----------------- Unigram sentence Negative Test Corpus Sentence Probabilities:----------------------\n")
    for i, probability in enumerate(neg_sentence_probabilities_unigram):
        print(f"Sentence {i + 1} Probability: {probability}")



    # Print smoothed UniGRam sentence probabilities for positive test corpus
    print("\n--------------------------------- smoothed Unigram sentence Positive Test Corpus Sentence Probabilities:----------------------------\n")
    for i, probability in enumerate(pos_sentence_probabilities_smoothed_unigram):
        print(f"Sentence {i + 1} Probability: {probability}")

    # Print smoothed Unigram sentence probabilities for negative test corpus
    print(" \n---------------------smoothed Unigram sentence Negative Test Corpus Sentence Probabilities:----------------------\n")
    for i, probability in enumerate(neg_sentence_probabilities_smoothed_unigram):
        print(f"Sentence {i + 1} Probability: {probability}")


    # Print  BIiGRam sentence probabilities for positive test corpus
    print(" \n---------------------------- BIigram sentence Positive Test Corpus Sentence Probabilities:-----------------------------\n")
    for i, probability in enumerate(pos_sentence_probabilities_unsmoothed_bigram_model):
        print(f"Sentence {i + 1} Probability: {probability}")

    # Print  BIigram sentence probabilities for negative test corpus
    print("\n------------------------------------ BIigram sentence Negative Test Corpus Sentence Probabilities:-----------------------------\n")
    for i, probability in enumerate(neg_sentence_probabilities_unsmoothed_bigram_model):
        print(f"Sentence {i + 1} Probability: {probability}")


    # Print Smoothed  BIiGRam sentence probabilities for positive test corpus
    print(" \n----------------------------Smoothed  BIigram sentence Positive Test Corpus Sentence Probabilities:-----------------------------\n")
    for i, probability in enumerate(pos_sentence_probabilities_smoothed_BIunigram):
        print(f"Sentence {i + 1} Probability: {probability}")

    # Print Smoothed BIigram sentence probabilities for negative test corpus
    print("\n------------------------------------Smoothed BIigram sentence Negative Test Corpus Sentence Probabilities:-----------------------------\n")
    for i, probability in enumerate( neg_sentence_probabilities_smoothed_BIunigram):
        print(f"Sentence {i + 1} Probability: {probability}")


    print("\n Uni Gram Corpus Perplexity:", corpus_perplexity_Uni)
    print("\n Smoothed Uni Gram Corpus Perplexity:", corpus_perplexity_S_Uni)
    print("\n BI Gram Corpus Perplexity:", corpus_perplexity_BI)
    print("\n Smoothed BI Gram Corpus Perplexity:", corpus_perplexity_S_BI)
    unigramModel.generateSentencesToFile(5, 'D:/6th_Semester/NLP/Assignments/Assignment_1/Results.txt','unigramModel')
    smoothed_model.generateSentencesToFile(5, 'D:/6th_Semester/NLP/Assignments/Assignment_1/Results.txt','smoothed unigramModel')
    unsmoothed_bigram_model.generateSentencesToFile(5, 'D:/6th_Semester/NLP/Assignments/Assignment_1/Results.txt','unsmoothed_bigram_model')
    smoothed_bigram_model_li.generateSentencesToFile(5, 'D:/6th_Semester/NLP/Assignments/Assignment_1/Results.txt','smoothed_bigram_model')



Reading file  D:/6th_Semester/NLP/Assignments/Assignment_1/train.txt


Reading sentence 1000
Reading sentence 2000
Reading sentence 3000
Reading sentence 4000
Reading sentence 5000
Reading sentence 6000
Reading sentence 7000
Reading sentence 8000
Reading sentence 9000
Reading sentence 10000
Reading sentence 11000
Reading sentence 12000
Reading sentence 13000
Reading sentence 14000
Reading sentence 15000
Reading sentence 16000
Reading sentence 17000
Reading sentence 18000
Reading sentence 19000
Reading sentence 20000
Reading sentence 21000
Reading sentence 22000
Reading sentence 23000
Reading sentence 24000
Reading sentence 25000
Reading sentence 26000
Reading sentence 27000
Reading sentence 28000
Reading sentence 29000
Reading sentence 30000
Reading sentence 1000
Reading sentence 1000


Reading file  D:/6th_Semester/NLP/Assignments/Assignment_1/neg_test.txt
Reading file  D:/6th_Semester/NLP/Assignments/Assignment_1/pos_test.txt



---------------UN Smoothed Unigram Model Result --------------------------------------------------


--------------- Smoothed Unigram Model Result --------------------------------------------------


---------------UN Smoothed Biigram Model Result --------------------------------------------------

Bigram pairs for 'films': {'adapted': 1, 'were': 8, ',': 113, 'like': 44, 'they': 1, 'of': 56, '.': 108, 'that': 31, 'are': 43, 'could': 1, 'is': 10, 'it': 2, 'thus': 2, 'in': 18, 'along': 1, 'at': 2, '?': 2, 'norman': 1, 'he': 4, 'about': 9, 'more': 3, 'which': 5, 'as': 15, 'by': 2, 'released': 5, 'to': 15, 'strike': 1, 'and': 13, ')': 15, 'take': 2, 'treat': 1, 'ought': 1, 'shown': 1, 'this': 6, 'ever': 6, 'over': 1, 'together': 2, 'tend': 1, 'such': 8, 'around': 2, 'made': 6, 'dead': 1, 'have': 14, 'for': 2, 'always': 1, ':': 6, 'main': 1, 'kind': 1, 'brilliantly': 2, "they're": 2, 'i': 7, 'combined': 1, 'the': 2, 'with': 6, 'UNK': 5, 'will': 5, 'was': 4, 'because': 3, 'helps': 1, '-': 2

## Questions

In [117]:
# Question 1
print("1. Sentence Length in Unigram vs. Bigram Models:")
print("   - Unigram model generates sentences independently, controlled by word probabilities.")
print("   - Bigram models consider word dependencies, resulting in more coherent sentence lengths.")

# Question 2
print("\n2. Probability of Generated Sentences:")
print("   - Yes, models assign different probabilities due to independent vs. contextual word generation.")
print("   - Bigram models, especially smoothed ones, provide more accurate contextual probabilities.")

# Question 3
print("\n3. Comparison of Sentence Realism:")
print("   - Smoothed bigram model is expected to produce more realistic sentences.")
print("   - Smoothing techniques enhance the representation of language structure.")

# Question 4
print("\nPerplexity Comparison:")
print("1. Unigram Model Corpus Perplexity:", 1.009)
print("2. Smoothed Unigram Model Corpus Perplexity:", 1.009)
print("3. Bigram Model Corpus Perplexity:", 1.0)
print("4. Smoothed Bigram Model Corpus Perplexity:", 1.0)
print("\nThe lower the perplexity, the better the model predicts the test data. In this case, bigram models outperform unigram models.")


1. Sentence Length in Unigram vs. Bigram Models:
   - Unigram model generates sentences independently, controlled by word probabilities.
   - Bigram models consider word dependencies, resulting in more coherent sentence lengths.

2. Probability of Generated Sentences:
   - Yes, models assign different probabilities due to independent vs. contextual word generation.
   - Bigram models, especially smoothed ones, provide more accurate contextual probabilities.

3. Comparison of Sentence Realism:
   - Smoothed bigram model is expected to produce more realistic sentences.
   - Smoothing techniques enhance the representation of language structure.

Perplexity Comparison:
1. Unigram Model Corpus Perplexity: 1.009
2. Smoothed Unigram Model Corpus Perplexity: 1.009
3. Bigram Model Corpus Perplexity: 1.0
4. Smoothed Bigram Model Corpus Perplexity: 1.0

The lower the perplexity, the better the model predicts the test data. In this case, bigram models outperform unigram models.
