In [7]:
# Task 2.1: Bigram Language Model


# Importing libraries
import numpy as np
from collections import defaultdict

# Class for Bigram Language Model
class BigramLM:
    """Bigram language model with MLE, Laplace and Kneser-Ney estimates.

    Attributes:
        bigramCounts: nested mapping ``previous word -> next word -> count``.
        vocabulary: every token seen in the "previous word" position, i.e. all
            words plus the start token (the end token never appears there).
        tokenInitiate, tokenTerminate: sentence boundary markers.
        bigram_probabilities: ``previous word -> next word -> probability``,
            filled in by one of the estimation methods below. Only bigrams
            observed in training are stored.
    """

    def __init__(self):
        # defaultdict assigns a default value (here: a zero count) to missing keys.
        self.bigramCounts = defaultdict(lambda: defaultdict(int))
        self.vocabulary = set()          # unique tokens seen as a bigram's first word
        self.tokenInitiate = '<start>'   # Token for start of sentence
        self.tokenTerminate = '<end>'    # Token for end of sentence
        # Defined up front so predict_next_word() returns None (instead of
        # raising AttributeError) when called before any estimation method.
        self.bigram_probabilities = defaultdict(dict)

    def tokenize_text(self, text):
        """Split ``text`` on whitespace and wrap it in the start/end tokens."""
        return [self.tokenInitiate] + text.split() + [self.tokenTerminate]

    def learn_model(self, corpus):
        """Accumulate bigram counts from ``corpus``, an iterable of sentence strings."""
        for sentence in corpus:
            tokens = self.tokenize_text(sentence)
            for current_word, next_word in zip(tokens, tokens[1:]):
                self.bigramCounts[current_word][next_word] += 1
                self.vocabulary.add(current_word)

    def calculate_probabilities(self):
        """Store the maximum-likelihood estimate count(v, w) / count(v)."""
        self.bigram_probabilities = defaultdict(dict)
        for current_word, next_word_counts in self.bigramCounts.items():
            total_count = sum(next_word_counts.values())
            for next_word, count in next_word_counts.items():
                self.bigram_probabilities[current_word][next_word] = count / total_count

    def predict_next_word(self, current_word):
        """Sample a successor of ``current_word`` from the stored distribution.

        Returns:
            The sampled word as a ``str``, or ``None`` when ``current_word``
            has no stored successors (e.g. it was not in the training data).
        """
        next_word_probs = self.bigram_probabilities.get(current_word)
        if not next_word_probs:
            return None  # Word not present in training data

        next_words = list(next_word_probs)
        weights = np.array([next_word_probs[word] for word in next_words], dtype=float)
        total = weights.sum()
        if total <= 0:
            return None
        # Smoothed estimates reserve mass for unseen successors, so the stored
        # values need not sum to 1; np.random.choice rejects such input, hence
        # the renormalisation over the stored candidates.
        return str(np.random.choice(next_words, p=weights / total))

    ############################################################
    # Laplace and Kneser-Ney smoothing
    def calculate_probabilities_LS(self, k=1):
        """Store add-k estimates (count(v, w) + k) / (count(v) + k * V).

        ``V`` is ``len(self.vocabulary)``. The vocabulary holds the start token
        plus every word, which is the same size as the set of possible
        successors (every word plus the end token).
        """
        self.bigram_probabilities = defaultdict(dict)
        vocabulary_size = len(self.vocabulary)
        for current_word, next_word_counts in self.bigramCounts.items():
            total_count = sum(next_word_counts.values())
            denominator = total_count + k * vocabulary_size
            for next_word, count in next_word_counts.items():
                self.bigram_probabilities[current_word][next_word] = (count + k) / denominator

    def LaplaceSmoothing(self, k=1):
        """Replace the stored probabilities with add-``k`` smoothed estimates.

        The smoothed values are kept; they are not recomputed as MLE afterwards.
        """
        self.calculate_probabilities_LS(k)

    def KneserNeySmoothing(self, d=0.75):
        """Replace the stored probabilities with interpolated Kneser-Ney estimates.

        For every observed bigram (v, w):

            P(w | v) = max(count(v, w) - d, 0) / count(v)
                       + (d * types_after(v) / count(v)) * P_cont(w)

        where ``types_after(v)`` is the number of distinct successors of v and
        ``P_cont(w)`` is the share of distinct bigram types that end in w.

        Args:
            d: absolute discount subtracted from each bigram count.
        """
        # Despite its name, unigramCounts holds continuation counts: the number
        # of distinct predecessors each word has been seen after.
        self.unigramCounts = defaultdict(int)
        for next_word_counts in self.bigramCounts.values():
            for next_word in next_word_counts:
                self.unigramCounts[next_word] += 1
        total_bigram_types = sum(self.unigramCounts.values())

        self.bigram_probabilities = defaultdict(dict)
        for current_word, next_word_counts in self.bigramCounts.items():
            total_count = sum(next_word_counts.values())
            # Probability mass freed by discounting, redistributed via P_cont.
            backoff_weight = d * len(next_word_counts) / total_count
            for next_word, count in next_word_counts.items():
                discounted = max(count - d, 0) / total_count
                continuation = self.unigramCounts[next_word] / total_bigram_types
                self.bigram_probabilities[current_word][next_word] = (
                    discounted + backoff_weight * continuation
                )

# Read the training corpus; every line of the file is treated as one sentence.
with open("C:\\Users\\Arjun Mehra\\Desktop\\Sem8\\NLP\\Assignment1\\NLPAssignments\\Assignment1\\Task2\\corpus.txt", 'r') as f:
    corpus = list(f)

# Train the bigram model and compute the unsmoothed (MLE) probabilities.
bigram_model = BigramLM()
bigram_model.learn_model(corpus)
bigram_model.calculate_probabilities()

# To compare estimates, replace the smoothing call below with
# bigram_model.LaplaceSmoothing(), or remove it to sample from the MLE model.

# Apply Kneser-Ney smoothing, then sample a successor for a probe word.
bigram_model.KneserNeySmoothing()
current_word = "language"
next_word = bigram_model.predict_next_word(current_word)
print(f"The predicted next word after '{current_word}' is '{next_word}'")

The predicted next word after 'language' is 'of'


### Modifying bigram probabilities by emotion scores

In [2]:
from transformers import pipeline

# Text-classification pipeline configured to report a score for every
# emotion label rather than only the top one.
classifier = pipeline(
    "text-classification",
    model='bhadresh-savani/distilbert-base-uncased-emotion',
    return_all_scores=True,
)
def emotion_scores(sample):
    """Return the classifier's first result for ``sample``.

    With the classifier set to return all scores, that result is the list of
    per-label ``{'label': ..., 'score': ...}`` entries for the input text.
    """
    return classifier(sample)[0]

# Read the corpus first so the file is closed before the (slow) model runs.
with open("C:\\Users\\Arjun Mehra\\Desktop\\Sem8\\NLP\\Assignment1\\NLPAssignments\\Assignment1\\Task2\\corpus.txt", 'r') as f:
    corpus = f.readlines()

# Score every sentence exactly once: the same result is stored and printed,
# so the classifier is not run twice per sentence.
emotion_scores_list = []
for sentence in corpus:
    sentence_scores = emotion_scores(sentence)
    emotion_scores_list.append(sentence_scores)
    print(sentence_scores)
    print("\n")



[{'label': 'sadness', 'score': 0.9990336894989014}, {'label': 'joy', 'score': 0.00022923806682229042}, {'label': 'love', 'score': 0.00016815603885333985}, {'label': 'anger', 'score': 0.00021294814359862357}, {'label': 'fear', 'score': 0.00019591116870287806}, {'label': 'surprise', 'score': 0.00016001718176994473}]


[{'label': 'sadness', 'score': 0.9990696310997009}, {'label': 'joy', 'score': 0.000190242804819718}, {'label': 'love', 'score': 0.00018006691243499517}, {'label': 'anger', 'score': 0.0002524371084291488}, {'label': 'fear', 'score': 0.00015692033048253506}, {'label': 'surprise', 'score': 0.0001506162079749629}]


[{'label': 'sadness', 'score': 0.9989926218986511}, {'label': 'joy', 'score': 0.0002801567316055298}, {'label': 'love', 'score': 0.0002140139404218644}, {'label': 'anger', 'score': 0.00026033984613604844}, {'label': 'fear', 'score': 0.00013414597196970135}, {'label': 'surprise', 'score': 0.00011888779408764094}]


[{'label': 'sadness', 'score': 0.9990227222442627}, 

In [None]:
# Dominant emotion label for each sentence. Every entry of emotion_scores_list
# is a list of {'label': ..., 'score': ...} dicts (a list has no .get), so pick
# the entry with the highest score and keep its label.
emotion_list = [
    max(sentence_scores, key=lambda entry: entry['score'])['label']
    for sentence_scores in emotion_scores_list
]

In [None]:
# Save the emotion list to a text file, one entry per line.
with open("C:\\Users\\Arjun Mehra\\Desktop\\Sem8\\NLP\\Assignment1\\NLPAssignments\\Assignment1\\Task2\\emotion_list.txt", 'w') as f:
    f.writelines("%s\n" % item for item in emotion_list)

In [None]:
# Update bigram probabilities with emotion scores using the formula P(w_i | w_{i-1}) = count(w_{i-1}, w_i) / count(w_{i-1}) + β, where β is the sentence's emotion score
def update_bigram_probabilities(emotion_list, bigram_model, sentences=None,
                                sentence_scores=None, all_bigrams=False):
    """Add each sentence's emotion score (beta) to its bigram probabilities.

    Args:
        emotion_list: one emotion label per sentence.
        bigram_model: object exposing ``tokenize_text(text)`` and a
            ``bigram_probabilities`` mapping ``previous -> next -> probability``.
        sentences: the sentences, aligned with ``emotion_list``. Defaults to
            the module-level ``corpus``.
        sentence_scores: per-sentence emotion scores, either a mapping
            ``label -> score`` or a list of ``{'label': ..., 'score': ...}``
            dicts. Defaults to the module-level ``emotion_scores_list``.
        all_bigrams: when False (default) only the first bigram of each
            sentence (start token -> first word) is updated; when True, beta is
            added once for every adjacent token pair in the sentence.

    Raises:
        KeyError: if a sentence's scores contain no entry for its label.

    The model is modified in place; the updated values are no longer
    normalised probabilities.
    """
    if sentences is None:
        sentences = corpus
    if sentence_scores is None:
        sentence_scores = emotion_scores_list

    for emotion, sentence, scores in zip(emotion_list, sentences, sentence_scores):
        # Accept both score shapes; a list of dicts cannot be indexed by label.
        if not isinstance(scores, dict):
            scores = {entry['label']: entry['score'] for entry in scores}
        beta = scores[emotion]

        tokens = bigram_model.tokenize_text(sentence)  # tokenize once per sentence
        bigrams = zip(tokens, tokens[1:]) if all_bigrams else [(tokens[0], tokens[1])]
        for current_word, next_word in bigrams:
            successors = bigram_model.bigram_probabilities[current_word]
            # A bigram without a stored probability starts from 0.
            successors[next_word] = successors.get(next_word, 0.0) + beta