In [None]:
class BigramLM():
    def __init__(self, tokens):
        self.tokens = tokens
        self.bigram_counts = self.get_bigram_counts()
        self.unigram_counts = self.get_unigram_counts()
        self.bigram_probs = self.get_bigram_probs()
        self.unigram_probs = self.get_unigram_probs()
        self.bigram_laplace_probs = self.get_bigram_laplace_probs()
        self.unigram_laplace_probs = self.get_unigram_laplace_probs()

    def get_bigram_counts(self):
        bigram_counts = defaultdict(int)
        for i in range(len(self.tokens) - 1):
            bigram_counts[self.tokens[i], self.tokens[i+1]] += 1
        return bigram_counts

    def get_unigram_counts(self):
        unigram_counts = defaultdict(int)
        for token in self.tokens:
            unigram_counts[token] += 1
        return unigram_counts

    def get_bigram_probs(self):
        bigram_probs = defaultdict(int)
        for bigram, count in self.bigram_counts.items():
            bigram_probs[bigram] = count / self.unigram_counts[bigram[0]]
        return bigram_probs

    def get_unigram_probs(self):
        unigram_probs = defaultdict(int)
        for unigram, count in self.unigram_counts.items():
            unigram_probs[unigram] = count / len(self.tokens)
        return unigram_probs

    def get_bigram_laplace_probs(self):
        bigram_laplace_probs = defaultdict(int)
        for bigram, count in self.bigram_counts.items():
            bigram_laplace_probs[bigram] = (count + 1) / (self.unigram_counts[bigram[0]] + len(self.unigram_counts))
        return bigram_laplace_probs

    def get_unigram_laplace_probs(self):
        unigram_laplace_probs = defaultdict(int)
        for unigram, count in self.unigram_counts.items():
            unigram_laplace_probs[unigram] = (count + 1) / (len(self.tokens) + len(self.unigram_counts))
        return unigram_laplace_probs
    
    


In [6]:
# Task 2.1: Bigram Language Model


# Importing libraries
import numpy as np
from collections import defaultdict

# Class for Bigram Language Model
class BigramLM: 
    def __init__(self):
        self.bigramCounts = defaultdict(lambda: defaultdict(int))
        # Bigram counts created using defaultdict, 
        # a dictionary that assigns default values to non-existent keys
        self.vocabulary = set() # Vocabulary set created using set() function (a collection of unique elements)
        self.tokenInitiate = '<start>'  # Token for start of sentence
        self.tokenTerminate = '<end>'   # Token for end of sentence

    def tokenize_text(self, text): # Function for tokenizing the text into word list
        return [self.tokenInitiate] + text.split() + [self.tokenTerminate]

    def learn_model(self, corpus): # Function for learning the bigram model from the given corpus 
        for sentence in corpus:
            tokens = self.tokenize_text(sentence)
            for i in range(len(tokens) - 1):
                current_word, next_word = tokens[i], tokens[i + 1]
                self.bigramCounts[current_word][next_word] += 1
                self.vocabulary.add(current_word)

    def calculate_probabilities(self):
        self.bigram_probabilities = defaultdict(dict)
        for current_word, next_word_counts in self.bigramCounts.items():
            total_count = sum(next_word_counts.values())
            for next_word, count in next_word_counts.items():
                probability = count / total_count
                self.bigram_probabilities[current_word][next_word] = probability

    def predict_next_word(self, current_word):
        if current_word not in self.bigram_probabilities:
            return None  # Word not present in training data

        next_word_probs = self.bigram_probabilities[current_word]
        next_words, probabilities = zip(*next_word_probs.items())
        chosen_word = np.random.choice(next_words, p=probabilities)
        return chosen_word
    
    ############################################################
    # Added Laplace and KneserNey Smoothing 
    def calculate_probabilities_LS(self):
        self.bigram_probabilities = defaultdict(dict)
        for current_word, next_word_counts in self.bigramCounts.items():
            total_count = sum(next_word_counts.values())
            for next_word, count in next_word_counts.items():
                probability = (count + 1) / (total_count + len(self.vocabulary))
                self.bigram_probabilities[current_word][next_word] = probability
                
    def LaplaceSmoothing(self, k=1):
        self.calculate_probabilities_LS()
        for current_word, next_word_counts in self.bigramCounts.items():
            for next_word in next_word_counts:
                # Use Laplace-smoothed probabilities
                self.bigram_probabilities[current_word][next_word] = self.bigram_probabilities[current_word][next_word]

        # Recalculate probabilities after smoothing+
        self.calculate_probabilities()
        
    # implement the Good-Turing Smoothing function who returns the discounted count
    # assuming d = 0.75 for KneserNey Smoothing
    def KneserNeySmoothing(self):
        self.unigramCounts = defaultdict(int)
        for current_word, next_word_counts in self.bigramCounts.items():
            for next_word in next_word_counts:
                self.unigramCounts[next_word] += 1
        self.calculate_probabilities()
        for current_word, next_word_counts in self.bigramCounts.items():
            for next_word in next_word_counts:
                self.bigram_probabilities[current_word][next_word] = (max(self.bigram_probabilities[current_word][next_word] - 0.75, 0) + 0.75 * len(self.bigram_probabilities[current_word]) * self.unigramCounts[next_word] / sum(self.unigramCounts.values())) / sum(self.bigram_probabilities[current_word].values())
        self.calculate_probabilities()

    # implement P(wi|wi−1) = (count(wi)/count(wi−1)) + β where β is the emotion score from textfile emotions_scores.txt
    def emotion_score_probability(self):
        # You may want to add additional logic for handling missing emotion scores
        self.calculate_probabilities()
        print(self.emotion_score_probability)

    def generate_emotion_oriented_sample(self, current_word):
        if current_word not in self.bigram_probabilities:
            return None  # Word not present in training data
        next_word_probs = self.emotion_score_probability.get(current_word, {})
        if not next_word_probs:
            return None  # No emotion score for the current_word
        next_words, probabilities = zip(*next_word_probs.items())
        chosen_word = np.random.choice(next_words, p=probabilities)
        return chosen_word

        
with open("C:\\Users\\Arjun Mehra\\Desktop\\Sem8\\NLP\\Assignment1\\NLPAssignments\\Assignment1\\Task2\\corpus.txt", 'r') as f:
    corpus = f.readlines()

# Creating a bigram model
bigram_model = BigramLM()
bigram_model.learn_model(corpus)
bigram_model.calculate_probabilities()
bigram_model.emotion_score_probability()
# check emotion_score_probability for any word
current_word = "language"

# print biagram counts in proper format
for current_word, next_word_counts in bigram_model.bigramCounts.items():
    for next_word, count in next_word_counts.items():
        print(f"{current_word} {next_word} {count}")


<bound method BigramLM.emotion_score_probability of <__main__.BigramLM object at 0x000001E85CC30B00>>
<start> i 2108
<start> ill 7
<start> im 212
<start> ive 43
<start> during 1
<start> id 8
<start> the 3
<start> on 1
<start> is 1
<start> no 1
<start> a 3
<start> in 1
<start> when 8
<start> heated 1
<start> this 1
<start> occured 1
i stand 2
i feel 1017
i literally 1
i just 59
i really 34
i believed 1
i wanted 12
i ranted 1
i mention 2
i <end> 5
i was 201
i havent 12
i am 293
i have 143
i truly 5
i still 44
i thought 16
i exhausted 1
i held 1
i look 11
i want 48
i left 5
i didnt 42
i cant 30
i stop 2
i did 32
i know 69
i do 60
i allow 1
i will 33
i hesitate 1
i need 20
i didn 15
i m 59
i can 85
i miss 7
i reacted 1
i dont 53
i wound 1
i see 13
i often 14
i distinctly 1
i lack 1
i try 13
i always 23
i also 25
i are 2
i love 36
i probably 3
i only 12
i grabbed 1
i graduated 1
i felt 26
i even 8
i let 2
i so 1
i remember 24
i believe 6
i had 47
i came 3
i wondered 1
i think 50
i would 42


[('href', 'http', 1.0), ('mooshilu', '<end>', 1.0), ('tychelle', 'to', 1.0), ('hang', 'out', 1.0), ('nonexistent', 'social', 1.0)]


### Modifying bigram probabilities by 

In [11]:
from transformers import pipeline

classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True,)
def emotion_scores(sample): 
    emotion=classifier(sample)
    return emotion[0]

with open("C:\\Users\\Arjun Mehra\\Desktop\\Sem8\\NLP\\Assignment1\\NLPAssignments\\Assignment1\\Task2\\corpus.txt", 'r') as f:
    emotion_scores_list = []
    corpus = f.readlines()
    for sentence in corpus:
        emotion_scores_list.append(emotion_scores(sentence))
        print(emotion_scores(sentence))
        print("\n")
    # print(emotion_scores_list)

  from .autonotebook import tqdm as notebook_tqdm


[{'label': 'sadness', 'score': 0.9990336894989014}, {'label': 'joy', 'score': 0.00022923806682229042}, {'label': 'love', 'score': 0.00016815603885333985}, {'label': 'anger', 'score': 0.00021294814359862357}, {'label': 'fear', 'score': 0.00019591116870287806}, {'label': 'surprise', 'score': 0.00016001718176994473}]


[{'label': 'sadness', 'score': 0.9990696310997009}, {'label': 'joy', 'score': 0.000190242804819718}, {'label': 'love', 'score': 0.00018006691243499517}, {'label': 'anger', 'score': 0.0002524371084291488}, {'label': 'fear', 'score': 0.00015692033048253506}, {'label': 'surprise', 'score': 0.0001506162079749629}]


[{'label': 'sadness', 'score': 0.9989926218986511}, {'label': 'joy', 'score': 0.0002801567316055298}, {'label': 'love', 'score': 0.0002140139404218644}, {'label': 'anger', 'score': 0.00026033984613604844}, {'label': 'fear', 'score': 0.00013414597196970135}, {'label': 'surprise', 'score': 0.00011888779408764094}]


[{'label': 'sadness', 'score': 0.9990227222442627}, 

In [8]:
pip install nltk

Collecting nltkNote: you may need to restart the kernel to use updated packages.

  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     - -------------------------------------- 0.0/1.5 MB 653.6 kB/s eta 0:00:03
     ------- -------------------------------- 0.3/1.5 MB 2.8 MB/s eta 0:00:01
     ---------------- ----------------------- 0.6/1.5 MB 4.9 MB/s eta 0:00:01
     -------------------------- ------------- 1.0/1.5 MB 5.4 MB/s eta 0:00:01
     ------------------------------------- -- 1.4/1.5 MB 6.4 MB/s eta 0:00:01
     ---------------------------------------- 1.5/1.5 MB 6.0 MB/s eta 0:00:00
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Downloading click-8.1.7-py3-none-any.whl (97 kB)
   ---------------------

In [12]:
# save emotion scores to a text file
with open("C:\\Users\\Arjun Mehra\\Desktop\\Sem8\\NLP\\Assignment1\\NLPAssignments\\Assignment1\\Task2\\total_emotion_scores.txt", 'w') as f:
    for entry in emotion_scores_list:
        f.write("%s\n" % entry)

In [13]:
max_emotion_scores = []

for entry in emotion_scores_list:
    max_score_entry = max(entry, key=lambda x: x['score'])
    max_emotion_scores.append(max_score_entry)

print(max_emotion_scores)

[{'label': 'sadness', 'score': 0.9990336894989014}, {'label': 'sadness', 'score': 0.9990696310997009}, {'label': 'sadness', 'score': 0.9989926218986511}, {'label': 'sadness', 'score': 0.9990227222442627}, {'label': 'sadness', 'score': 0.9990484118461609}, {'label': 'sadness', 'score': 0.9989859461784363}, {'label': 'sadness', 'score': 0.9987605810165405}, {'label': 'sadness', 'score': 0.9990324974060059}, {'label': 'sadness', 'score': 0.998528003692627}, {'label': 'sadness', 'score': 0.9989314675331116}, {'label': 'sadness', 'score': 0.9987971782684326}, {'label': 'sadness', 'score': 0.9990656971931458}, {'label': 'sadness', 'score': 0.9990183115005493}, {'label': 'sadness', 'score': 0.9988137483596802}, {'label': 'sadness', 'score': 0.999083399772644}, {'label': 'sadness', 'score': 0.9989759922027588}, {'label': 'sadness', 'score': 0.9989853501319885}, {'label': 'sadness', 'score': 0.9989619255065918}, {'label': 'sadness', 'score': 0.9989055395126343}, {'label': 'sadness', 'score': 0.

In [14]:
# save emotion scores to a text file
with open("C:\\Users\\Arjun Mehra\\Desktop\\Sem8\\NLP\\Assignment1\\NLPAssignments\\Assignment1\\Task2\\total_emotion_scores.txt", 'w') as f:
    for entry in max_emotion_scores:
        f.write("%s\n" % entry)

In [None]:
max_emotion_scores

In [10]:
# save max_emotion_scores to a text file 
with open("C:\\Users\\Arjun Mehra\\Desktop\\Sem8\\NLP\\Assignment1\\NLPAssignments\\Assignment1\\Task2\\emotion_scores.txt", 'w') as f:
    for item in max_emotion_scores:
        f.write("%s\n" % item)        

In [None]:
# update bigram pobablities with emotion scores using the following formula P(wi|wi−1) = (count(wi)/count(wi−1)) + β
def update_bigram_probabilities(emotion_list, bigram_model):
    for i in range(len(emotion_list)):
        current_word = bigram_model.tokenize_text(corpus[i])[0]
        next_word = bigram_model.tokenize_text(corpus[i])[1]
        bigram_model.bigram_probabilities[current_word][next_word] = bigram_model.bigram_probabilities[current_word][next_word] + emotion_scores_list[i][emotion_list[i]]

### Test60

In [12]:
import numpy as np

class BigramLM:
    def __init__(self):
        self.bigram_counts = {}
        self.unigram_counts = {}
        self.vocabulary = set()

    def learn_from_dataset(self, dataset):
        for sentence in dataset:
            tokens = sentence.split()
            for i in range(1, len(tokens)):
                bigram = (tokens[i - 1], tokens[i])
                self.bigram_counts[bigram] = self.bigram_counts.get(bigram, 0) + 1
                self.unigram_counts[tokens[i - 1]] = self.unigram_counts.get(tokens[i - 1], 0) + 1
                self.vocabulary.add(tokens[i - 1])
                self.vocabulary.add(tokens[i])

    def calculate_probability(self, bigram):
        if bigram[0] in self.unigram_counts:
            return self.bigram_counts.get(bigram, 0) / self.unigram_counts[bigram[0]]
        else:
            return 0
        
    def laplace_smoothing(self, bigram):
        return (self.bigram_counts.get(bigram, 0) + 1) / (self.unigram_counts.get(bigram[0], 0) + len(self.vocabulary))

    def kneser_ney_smoothing(self, bigram, discount=0.5):
        prefix_count = sum(1 for bg in self.bigram_counts if bg[0] == bigram[0])
        return max((self.bigram_counts.get(bigram, 0) - discount) / self.unigram_counts[bigram[0]], 0) + \
               (discount * prefix_count / self.unigram_counts[bigram[0]]) * self.unigram_counts[bigram[1]] / len(self.vocabulary)

# Example usage:
laplace_prob = bigram_model.laplace_smoothing(('feeling', 'very'))
kneser_ney_prob = bigram_model.kneser_ney_smoothing(('feeling', 'very'))

print("Laplace Probability:", laplace_prob)
print("Kneser-Ney Probability:", kneser_ney_prob)

# Example usage:
corpus = ["i am feeling very happy today", "i love coding", "coding is fun"]
bigram_model = BigramLM()
bigram_model.learn_from_dataset(corpus)
probability = bigram_model.calculate_probability(('feeling', 'very'))

print(probability)

1.0
