In [78]:
# Importing libraries
import ast
import re
import warnings
from collections import defaultdict

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

In [71]:
# Lines of bigram_counts.txt; each line is later split on '-' into word1 / word2 / emotion
with open('bigram_counts.txt', 'r') as bigram_file:
    bigram_text = bigram_file.read()
emotion = bigram_text.splitlines()

# Lines of corpus.txt; each line is later fed to the model as one sentence
with open('corpus.txt', 'r') as corpus_file:
    corpus_text = corpus_file.read()
corpus = corpus_text.splitlines()

In [72]:
# Empty frame; the word1 / word2 / emotion columns are attached to it afterwards
bigram_data = pd.DataFrame()

In [73]:
# Every line of `emotion` is split on '-'; field 0 -> word1, field 1 -> word2, field 2 -> emotion
for field_index, column_name in enumerate(('word1', 'word2', 'emotion')):
    bigram_data[column_name] = [record.split('-')[field_index] for record in emotion]

# Main

In [76]:
# Class for Bigram Language Model
class BigramLM:
    """Bigram language model with optional smoothing and emotion-biased sampling.

    Attributes:
        bigramCounts: nested mapping ``previous word -> next word -> count``.
        vocabulary: set of every token seen as the first word of a bigram.
        tokenInitiate / tokenTerminate: sentence boundary markers added by
            ``tokenize_text``.
        bigram_probabilities: nested mapping ``previous word -> next word ->
            probability``; filled by one of the ``calculate_*`` / smoothing
            methods.
        unigramCounts: per-word count of distinct preceding words; filled by
            ``KneserNeySmoothing``.
    """

    def __init__(self):
        # defaultdict assigns a default value to non-existent keys, so counts
        # can be incremented without first checking for the key.
        self.bigramCounts = defaultdict(lambda: defaultdict(int))
        self.vocabulary = set()  # Collection of unique tokens
        self.tokenInitiate = '<start>'  # Token for start of sentence
        self.tokenTerminate = '<end>'   # Token for end of sentence
        # Created up front so prediction before any calculate_* call returns
        # None instead of raising AttributeError.
        self.bigram_probabilities = defaultdict(dict)
        self.unigramCounts = defaultdict(int)

    def tokenize_text(self, text):
        """Split ``text`` on whitespace and wrap it in the start/end tokens."""
        return [self.tokenInitiate] + text.split() + [self.tokenTerminate]

    def learn_model(self, corpus):
        """Accumulate bigram counts from an iterable of sentence strings."""
        for sentence in corpus:
            tokens = self.tokenize_text(sentence)
            for current_word, next_word in zip(tokens, tokens[1:]):
                self.bigramCounts[current_word][next_word] += 1
                self.vocabulary.add(current_word)

    def calculate_bigram_probabilities(self):
        """Store maximum-likelihood estimates count(prev, next) / count(prev)."""
        self.bigram_probabilities = defaultdict(dict)
        for current_word, next_word_counts in self.bigramCounts.items():
            total_count = sum(next_word_counts.values())
            for next_word, count in next_word_counts.items():
                self.bigram_probabilities[current_word][next_word] = count / total_count

    @staticmethod
    def _sample(distribution):
        """Draw one key of ``distribution`` with probability proportional to its value.

        Weights are renormalised first because smoothed probabilities stored
        for the observed bigrams only do not sum to 1, and
        ``np.random.choice`` rejects such input. Returns None when there is
        nothing to sample from.
        """
        if not distribution:
            return None
        words = list(distribution)
        weights = np.array([distribution[word] for word in words], dtype=float)
        total = weights.sum()
        if total <= 0:
            return None
        return np.random.choice(words, p=weights / total)

    def predict_next_word(self, current_word):
        """Sample a successor of ``current_word``; None if the word is unknown."""
        # .get() avoids creating an empty entry in the defaultdict.
        return self._sample(self.bigram_probabilities.get(current_word))

    ############################################################
    # Added Laplace and KneserNey Smoothing
    def calculate_probabilities_LS(self, k=1):
        """Store add-k probabilities (count + k) / (total + k * |vocabulary|).

        Args:
            k: pseudo-count added to every observed bigram (1 = Laplace).
        """
        self.bigram_probabilities = defaultdict(dict)
        vocabulary_size = len(self.vocabulary)
        for current_word, next_word_counts in self.bigramCounts.items():
            denominator = sum(next_word_counts.values()) + k * vocabulary_size
            for next_word, count in next_word_counts.items():
                self.bigram_probabilities[current_word][next_word] = (count + k) / denominator

    def LaplaceSmoothing(self, k=1):
        """Replace ``bigram_probabilities`` with add-k smoothed estimates.

        The previous version ignored ``k`` and then called a method that does
        not exist; the smoothed values are now simply computed and kept.
        """
        self.calculate_probabilities_LS(k)

    def KneserNeySmoothing(self, d=0.75):
        """Replace ``bigram_probabilities`` with interpolated Kneser-Ney estimates.

        For every observed bigram (prev, w):

            P(w | prev) = max(c(prev, w) - d, 0) / c(prev)
                          + (d * T(prev) / c(prev)) * P_cont(w)

        where T(prev) is the number of distinct words following ``prev`` and
        P_cont(w) is the number of distinct words preceding ``w`` divided by
        the number of distinct bigram types.

        Args:
            d: absolute discount subtracted from each bigram count.
        """
        # Continuation counts: in how many distinct contexts each word appears.
        self.unigramCounts = defaultdict(int)
        for next_word_counts in self.bigramCounts.values():
            for next_word in next_word_counts:
                self.unigramCounts[next_word] += 1
        bigram_type_total = sum(self.unigramCounts.values())

        self.bigram_probabilities = defaultdict(dict)
        if bigram_type_total == 0:
            return  # Nothing has been learned yet

        for current_word, next_word_counts in self.bigramCounts.items():
            context_count = sum(next_word_counts.values())
            if context_count == 0:
                continue
            # Probability mass freed by discounting, redistributed via P_cont.
            backoff_weight = d * len(next_word_counts) / context_count
            for next_word, count in next_word_counts.items():
                continuation = self.unigramCounts[next_word] / bigram_type_total
                self.bigram_probabilities[current_word][next_word] = (
                    max(count - d, 0) / context_count + backoff_weight * continuation
                )

    ############################################################
    # Intended formula: P(wi|wi-1) = (count(wi)/count(wi-1)) + beta, where beta
    # is the emotion score from textfile emotions_scores.txt
    def set_emotion_score(self):
        """Placeholder; currently does nothing."""
        pass

    @staticmethod
    def _parse_emotion_payload(payload):
        """Extract ``{label: score}`` from one serialized emotion cell.

        Every ``{...}`` fragment in the text is parsed with
        ``ast.literal_eval`` (no code execution, unlike ``eval``); fragments
        that are malformed or lack 'label'/'score' keys are skipped.
        """
        label_scores = {}
        for fragment in re.findall(r"\{[^{}]*\}", str(payload)):
            try:
                entry = ast.literal_eval(fragment)
                if isinstance(entry, dict):
                    label_scores[entry['label']] = float(entry['score'])
            except (ValueError, SyntaxError, KeyError, TypeError):
                continue  # Truncated or malformed entry: ignore it
        return label_scores

    def emotion_score_probability(self, current_word, emotion_scores, emotion):
        """Return the successor distribution of ``current_word`` biased by emotion.

        Each stored probability is averaged with the score of ``emotion`` for
        that bigram (0 when the bigram or label is absent from
        ``emotion_scores``) and the result is renormalised to sum to 1.
        ``bigram_probabilities`` is left untouched.

        Args:
            current_word: the conditioning word.
            emotion_scores: DataFrame with 'word1', 'word2' and 'emotion'
                columns; 'emotion' holds text containing
                ``{'label': ..., 'score': ...}`` entries.
            emotion: label to look up; compared lower-cased.

        Returns:
            dict ``next word -> probability``; empty if ``current_word`` has
            no stored successors.
        """
        base_probs = self.bigram_probabilities.get(current_word)
        if not base_probs:
            return {}

        target_label = emotion.lower()
        # Filter the table once for this context instead of once per bigram.
        rows = emotion_scores.loc[emotion_scores['word1'] == current_word, ['word2', 'emotion']]
        score_by_next_word = {}
        for next_word, payload in zip(rows['word2'], rows['emotion']):
            label_scores = self._parse_emotion_payload(payload)
            if target_label in label_scores:
                score_by_next_word[next_word] = label_scores[target_label]

        blended = {
            next_word: (probability + score_by_next_word.get(next_word, 0)) / 2
            for next_word, probability in base_probs.items()
        }
        total = sum(blended.values())
        if total <= 0:
            return {}
        return {next_word: value / total for next_word, value in blended.items()}

    def generate_emotion_oriented_sample(self, current_word, emotion_scores, emotion):
        """Sample a successor of ``current_word`` from the emotion-biased distribution.

        Returns None when the word is unknown or has no usable successors.
        """
        if current_word not in self.bigram_probabilities:
            return None  # Word not present in training data
        next_word_probs = self.emotion_score_probability(current_word, emotion_scores, emotion)
        return self._sample(next_word_probs)

In [81]:
# Build the model from every corpus line except the first, then estimate bigram probabilities
bigram_model = BigramLM()
bigram_model.learn_model(corpus[1:])
bigram_model.calculate_bigram_probabilities()

# Seed word and target emotion for the sample
current_word, emotion_type = "i", "fear"

sampled_word = bigram_model.generate_emotion_oriented_sample(
    current_word, bigram_data, emotion_type
)
print(sampled_word)