# Loading of data


In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

# Locate the corpus and label files under ./data relative to the working directory.
input_dir = os.path.join(os.getcwd(), 'data')
input_text_file = os.path.join(input_dir, 'corpus.txt')
label_file = os.path.join(input_dir, 'labels.txt')

# Read one entry per line, removing only the line terminator. Slicing with
# [:-1] would also cut the last real character off a final line that does not
# end with a newline.
with open(input_text_file, 'r') as f:
    corpus = [line.rstrip('\n') for line in f]

with open(label_file, 'r') as f:
    labels = [line.rstrip('\n') for line in f]

In [2]:
# Sanity check: the corpus and the label list should have the same number of entries.
len(corpus), len(labels)

(2400, 2400)

# Emotion score

In [3]:
from transformers import pipeline
import time

# Text-classification pipeline backed by a DistilBERT emotion model.
# return_all_scores=True requests a score for every emotion label instead of only the best one.
# NOTE(review): newer transformers releases reportedly deprecate return_all_scores in favour of
# top_k=None, which may change the nesting of the result - confirm before upgrading.
classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True,)

def emotion_scores(sample):
    """Run the emotion classifier on `sample` and return the first result.

    The classifier's output is indexed with [0], i.e. the entry that belongs
    to the (single) input that was passed in.
    """
    return classifier(sample)[0]

# Smoke-test the classifier on one sentence. Only the model call is timed, so
# the reported duration is not inflated by the printing below; perf_counter()
# is a monotonic, high-resolution clock intended for measuring intervals.
sample = "I am so happy to see you!"
start_time = time.perf_counter()
all_classes = emotion_scores(sample)
end_time = time.perf_counter()
for info in all_classes:
    print(info)
print(type(all_classes))
print("Time taken for emotion_scores: ", end_time-start_time)


{'label': 'sadness', 'score': 0.00029859962523914874}
{'label': 'joy', 'score': 0.9987986087799072}
{'label': 'love', 'score': 0.0004451328422874212}
{'label': 'anger', 'score': 0.0001878843759186566}
{'label': 'fear', 'score': 0.00012197871546959504}
{'label': 'surprise', 'score': 0.00014771465794183314}
<class 'list'>
Time taken for emotion_scores:  0.8111534118652344


# Class for Bigram Model

In [7]:
from typing import List, Dict, Tuple
import random


class BigramLM:
    '''
    Bigram language model with optional Laplace or Kneser-Ney smoothing.

    Sentences are whitespace-tokenised and wrapped in the boundary markers
    '</start>' and '</end>'. After train():
      - unigram_counts maps token -> number of occurrences (markers included)
      - bigram_counts maps context -> {token -> number of times token follows context}
      - vocabulary is the set of tokens without the two boundary markers
      - bigram_prob caches context -> {token -> probability} for the smoothing
        method most recently used for generation
    '''

    def __init__(self, corpus, labels):
        '''
        Store the training data; no counting happens until train() is called.

        corpus: sequence of sentences (strings).
        labels: sequence of labels parallel to corpus (kept, but not used for counting).
        '''
        self.corpus = corpus
        self.labels = labels
        self.bigram_counts = {}
        self.unigram_counts = {}
        self.vocabulary = set()
        self.total_bigram_pairs = None
        self.bigram_prob = {}
        # Smoothing method the entries of bigram_prob were computed with.
        self._bigram_prob_smoothing = None
        # Lazily built map: token -> number of distinct contexts it follows.
        self._continuation_counts = None


    def train(self):
        '''
        Train the Bigram language model on the corpus and labels.

        Counts are rebuilt from scratch, so calling train() again does not
        double them.
        '''
        self.bigram_counts = {}
        self.unigram_counts = {}
        self.vocabulary = set()
        self._invalidate_caches()
        self.count_unigrams()
        self.count_bigrams()


    def _invalidate_caches(self):
        '''
        Drop every value derived from the counts so it is recomputed on demand
        '''
        self.total_bigram_pairs = None
        self.bigram_prob = {}
        self._bigram_prob_smoothing = None
        self._continuation_counts = None


    def _tokenize(self, sentence: str):
        '''
        Split a sentence on whitespace and add the start/end markers
        '''
        return ['</start>'] + sentence.split() + ['</end>']


    def count_unigrams(self):
        '''
        Count the unigrams in the corpus and store counts in the unigram_counts dictionary
        '''
        for sentence in self.corpus:
            for token in self._tokenize(sentence):
                self.unigram_counts[token] = self.unigram_counts.get(token, 0) + 1
                self.vocabulary.add(token)
        # discard() rather than remove(): with an empty corpus the markers were never added.
        self.vocabulary.discard('</start>')
        self.vocabulary.discard('</end>')
        return


    def count_bigrams(self):
        '''
        Count the bigrams in the corpus and store counts in the bigram_counts dictionary
        '''
        for sentence in self.corpus:
            for context, token in self.get_bigrams(self._tokenize(sentence)):
                followers = self.bigram_counts.setdefault(context, {})
                followers[token] = followers.get(token, 0) + 1
        # The counts changed, so anything derived from them is stale.
        self._invalidate_caches()
        return


    def get_bigrams(self, tokens: List[str]):
        '''
        Given a list of tokens, return a list of possible bigrams
        '''
        return [(tokens[i], tokens[i + 1]) for i in range(len(tokens) - 1)]


    def get_bigram_prob(self, context: str, token: str, smoothing: str = 'none'):
        '''
        Get the probability of the token given the context

        smoothing is case-insensitive and must be one of 'none', 'laplace'
        or 'kneser-ney'; anything else raises ValueError.
        '''
        smoothing = smoothing.lower()
        if smoothing == 'none':
            return self.__get_bigram_prob_normal(context, token)
        elif smoothing == 'laplace':
            return self.__get_bigram_prob_laplace(context, token)
        elif smoothing == 'kneser-ney':
            return self.__get_bigram_prob_kneser_ney(context, token)
        else:
            raise ValueError('Smoothing method not supported')


    def __get_bigram_prob_normal(self, context, token):
        '''
        Maximum-likelihood estimate count(context, token) / count(context).
        Returns 0.0 for a context that was never seen.
        '''
        context_cnt = self.unigram_counts.get(context, 0)
        if context_cnt == 0:
            return 0.0
        context_token_cnt = self.bigram_counts.get(context, {}).get(token, 0)
        return context_token_cnt / context_cnt


    def __get_bigram_prob_laplace(self, context, token):
        '''
        Add-one estimate (count(context, token) + 1) / (count(context) + |vocabulary|).
        An unseen context yields 1 / |vocabulary|; an untrained model yields 0.0.
        '''
        denominator = self.unigram_counts.get(context, 0) + len(self.vocabulary)
        if denominator == 0:
            return 0.0
        context_token_cnt = self.bigram_counts.get(context, {}).get(token, 0)
        return (context_token_cnt + 1) / denominator


    def __get_bigram_prob_kneser_ney(self, context, token, avg_discount=0.7):
        '''
        Interpolated Kneser-Ney estimate with a fixed absolute discount.
        A context with no recorded followers backs off entirely to the
        continuation probability of the token.
        '''
        d = avg_discount

        # Continuation probability, which depends only on the token: the share
        # of distinct bigram types that end in this token.
        total_bigram_pairs = self.__count_total_bigram_pairs()
        if total_bigram_pairs == 0:
            return 0.0
        bigram_with_token_cnt = self.__get_continuation_counts().get(token, 0)
        P_continuation = bigram_with_token_cnt / total_bigram_pairs

        followers = self.bigram_counts.get(context)
        context_cnt = self.unigram_counts.get(context, 0)
        if not followers or context_cnt == 0:
            return P_continuation

        context_token_cnt = followers.get(token, 0)
        # alpha redistributes the discounted mass and depends only on the context.
        alpha = d * len(followers) / context_cnt

        return (max(context_token_cnt - d, 0) / context_cnt) + (alpha * P_continuation)


    def __get_continuation_counts(self):
        '''
        Return (and cache) a map token -> number of distinct contexts it follows
        '''
        if self._continuation_counts is None:
            counts = {}
            for followers in self.bigram_counts.values():
                for token in followers:
                    counts[token] = counts.get(token, 0) + 1
            self._continuation_counts = counts
        return self._continuation_counts


    def __count_total_bigram_pairs(self):
        '''
        Count the total number of unique bigram pairs in the corpus
        '''
        if self.total_bigram_pairs is None:
            self.total_bigram_pairs = sum(len(followers) for followers in self.bigram_counts.values())
        return self.total_bigram_pairs


    def __generate_bigram_prob_for_context(self, context, smoothing:str = 'none'):
        '''
        Generate bigram probabilities for all tokens for a given context

        The cache holds probabilities for one smoothing method at a time and
        is emptied when a different method is requested, so results from one
        method are never reused for another.
        '''
        smoothing = smoothing.lower()
        if self._bigram_prob_smoothing != smoothing:
            self.bigram_prob = {}
            self._bigram_prob_smoothing = smoothing
        if context not in self.bigram_prob:
            # Build the whole entry before storing it, so a failure (for
            # example an unsupported smoothing name) leaves nothing cached.
            self.bigram_prob[context] = {
                token: self.get_bigram_prob(context, token, smoothing)
                for token in self.bigram_counts.get(context, {})
            }
        return self.bigram_prob[context]


    def __generate_token(self, context:str, smoothing:str = 'none'):
        '''
        Generate a token given the context

        Returns '</end>' when the context has no recorded followers.
        '''
        all_possible_tokens = self.__generate_bigram_prob_for_context(context, smoothing)
        if not all_possible_tokens:
            return '</end>'
        generated_token = random.choices(list(all_possible_tokens.keys()), weights=list(all_possible_tokens.values()), k=1)[0]
        return generated_token


    def generate_sentence(self, max_length: int = 10, smoothing: str = 'none'):
        '''
        Generate a sentence of the given max_length

        Sampling starts from '</start>' and stops after max_length tokens or
        as soon as '</end>' is drawn; the markers are not part of the result.
        '''
        sentence = []
        context = '</start>'
        for _ in range(max_length):
            token = self.__generate_token(context, smoothing)
            if token == '</end>':
                break
            sentence.append(token)
            context = token
        return ' '.join(sentence)

In [8]:
# Build the bigram model over the loaded corpus and count its unigrams and bigrams.
bigram_lm = BigramLM(corpus, labels)
bigram_lm.train()

In [25]:
# Laplace-smoothed probability of the token 'fine' following the context 'i'.
bigram_lm.get_bigram_prob('i', 'fine', smoothing='laplace')

0.00010848340203948796

In [7]:
# Number of occurrences of the token 'i' counted during training.
bigram_lm.unigram_counts['i']

3789

In [56]:
# Sample a sentence of at most 10 tokens using Kneser-Ney smoothed probabilities.
# NOTE(review): sampling uses the `random` module and no seed is set in the visible cells,
# so the output differs between runs.
bigram_lm.generate_sentence(max_length=10, smoothing='kneser-ney')

'i am trying to hold me to me and physical'

# Generating top 10 Bigrams

In [53]:
smoothing_methods = ['none', 'laplace', 'kneser-ney']
top_count = 10  # how many of the highest-probability bigrams to list per method

for smoothing in smoothing_methods:
    # Probability of every bigram observed in training under this smoothing method.
    all_bigram = {
        (context, token): bigram_lm.get_bigram_prob(context, token, smoothing)
        for context, followers in bigram_lm.bigram_counts.items()
        for token in followers
    }

    # Highest probability first; ties keep their insertion order.
    top_bigrams = sorted(all_bigram, key=all_bigram.get, reverse=True)[:top_count]

    print(f"Top {top_count} bigrams with smoothing method: {smoothing}")
    print(f"Total bigrams: {len(all_bigram)}")
    for bigram in top_bigrams:
        print(f"{bigram}: {all_bigram[bigram]}")
    print()

Top 10 bigrams with smoothing method: none
Total bigrams: 25681
('href', 'http'): 1.0
('mooshilu', '</end>'): 1.0
('tychelle', 'to'): 1.0
('hang', 'out'): 1.0
('nonexistent', 'social'): 1.0
('alex', 'and'): 1.0
('marriage', 'and'): 1.0
('personifying', 'an'): 1.0
('progeny', 'who'): 1.0
('genuflecting', 'at'): 1.0

Top 10 bigrams with smoothing method: laplace
Total bigrams: 25681
('</start>', 'i'): 0.2693830629710052
('i', 'feel'): 0.11043610327619874
('feel', 'like'): 0.0350976507217662
('i', 'am'): 0.03189412019960946
('</start>', 'im'): 0.02720653978796781
('that', 'i'): 0.02650602409638554
('and', 'i'): 0.023103748910200523
('im', 'feeling'): 0.022454576619814877
('i', 'was'): 0.021913647211976566
('to', 'be'): 0.01861427094105481

Top 10 bigrams with smoothing method: kneser-ney
Total bigrams: 25681
('href', 'http'): 0.9720021806004439
('don', 't'): 0.9712049203427449
('didn', 't'): 0.9611413972283877
('sort', 'of'): 0.9594087640897253
('supposed', 'to'): 0.9238243578261491
('doe