# Loading of data


In [22]:
import os
input_dir = os.path.join(os.getcwd(), 'data')
input_text_file = os.path.join(input_dir, 'corpus.txt')
label_file = os.path.join(input_dir, 'labels.txt')

# Read the corpus, one sentence per line. Only the line terminator is
# stripped: slicing with [:-1] would also cut off the last real character of
# a final line that does not end with a newline.
with open(input_text_file, 'r') as f:
    corpus = [line.rstrip('\n') for line in f]

# Read the labels the same way, one label per line.
with open(label_file, 'r') as f:
    labels = [line.rstrip('\n') for line in f]

# Emotion score

In [190]:
from transformers import pipeline

# Emotion classifier built from a pretrained DistilBERT checkpoint.
# return_all_scores=True asks for a score for every emotion label instead of
# only the best one.
classifier = pipeline(
    "text-classification",
    model='bhadresh-savani/distilbert-base-uncased-emotion',
    return_all_scores=True,
)

def emotion_scores(sample):
    """Run the module-level ``classifier`` on ``sample`` and return its first result.

    For a single sentence the sample run in this notebook yields a list of
    ``{'label': ..., 'score': ...}`` dicts, one per emotion.
    """
    # The classifier output is indexed per input; only the first entry is
    # returned here.
    return classifier(sample)[0]

# Quick check of emotion_scores on one sentence: print each entry of the
# returned result on its own line.
sample = "I am so happy to see you!"
all_classes = emotion_scores(sample)
for info in all_classes:
    print(info)

{'label': 'sadness', 'score': 0.00029859962523914874}
{'label': 'joy', 'score': 0.9987986087799072}
{'label': 'love', 'score': 0.0004451328422874212}
{'label': 'anger', 'score': 0.0001878843759186566}
{'label': 'fear', 'score': 0.00012197871546959504}
{'label': 'surprise', 'score': 0.00014771465794183314}




# Class for Bigram Model

In [182]:
from typing import List, Dict, Tuple


class BigramLM:
    '''
    Bigram language model over a whitespace-tokenised corpus.

    Every sentence is split on whitespace and prefixed with START_TOKEN
    before counting. Probabilities can be queried without smoothing, with
    Laplace (add-one) smoothing, or with interpolated Kneser-Ney smoothing.
    '''

    # Marker prepended to every sentence so that the first word has a context.
    START_TOKEN = '</start>'

    def __init__(self, corpus, labels):
        '''
        Store the training data. Nothing is counted until train() is called.

        corpus: sequence of sentence strings (iterated once per counting pass)
        labels: labels accompanying the corpus; stored on the instance but
                not read by any method of this class
        '''
        self.corpus = corpus
        self.labels = labels
        # context -> {following token -> number of occurrences}
        self.bigram_counts: Dict[str, Dict[str, int]] = {}
        # token -> number of occurrences (START_TOKEN included)
        self.unigram_counts: Dict[str, int] = {}
        # every distinct token seen, START_TOKEN included
        self.vocabulary = set()

    def train(self):
        '''
        Train the Bigram language model on the corpus.

        Counts are reset first so that calling train() more than once does
        not accumulate the same corpus repeatedly.
        '''
        self.unigram_counts.clear()
        self.bigram_counts.clear()
        self.vocabulary.clear()
        self.count_unigrams()
        self.count_bigrams()

    def _tokenize(self, sentence: str) -> List[str]:
        '''
        Split a sentence on whitespace and prepend START_TOKEN.
        '''
        return [self.START_TOKEN] + sentence.split()

    def count_unigrams(self):
        '''
        Count the unigrams in the corpus and store counts in the unigram_counts dictionary.
        Every token seen is also added to the vocabulary. Counts are added
        on top of whatever unigram_counts already holds.
        '''
        for sentence in self.corpus:
            for token in self._tokenize(sentence):
                self.unigram_counts[token] = self.unigram_counts.get(token, 0) + 1
                self.vocabulary.add(token)

    def count_bigrams(self):
        '''
        Count the bigrams in the corpus and store counts in the bigram_counts dictionary.
        Counts are added on top of whatever bigram_counts already holds.
        '''
        for sentence in self.corpus:
            for context, token in self.get_bigrams(self._tokenize(sentence)):
                followers = self.bigram_counts.setdefault(context, {})
                followers[token] = followers.get(token, 0) + 1

    def get_bigrams(self, tokens: List[str]) -> List[Tuple[str, str]]:
        '''
        Given a list of tokens, return the list of adjacent (context, token) pairs.
        Fewer than two tokens gives an empty list.
        '''
        return list(zip(tokens, tokens[1:]))

    def get_bigram_prob(self, context: str, token: str, smoothing: str = 'none'):
        '''
        Get the probability of the token given the context.

        context:   the preceding token
        token:     the token whose probability is requested
        smoothing: 'none', 'laplace' or 'kneser-ney' (case-insensitive)

        Returns a float. Contexts that were never seen no longer raise
        KeyError; see the individual methods for the value returned.
        Raises ValueError for an unknown smoothing name.
        '''
        smoothing = smoothing.lower()
        if smoothing == 'none':
            return self.__get_bigram_prob_normal(context, token)
        elif smoothing == 'laplace':
            return self.__get_bigram_prob_laplace(context, token)
        elif smoothing == 'kneser-ney':
            return self.__get_bigram_prob_kneser_ney(context, token)
        else:
            raise ValueError('Smoothing method not supported')

    def __get_bigram_prob_normal(self, context, token):
        '''
        Unsmoothed estimate: count(context, token) / count(context).
        The denominator is the unigram count of the context, which also
        includes its sentence-final occurrences. Returns 0.0 when the
        context was never seen.
        '''
        context_total = self.unigram_counts.get(context, 0)
        if not context_total:
            return 0.0
        context_token_cnt = self.bigram_counts.get(context, {}).get(token, 0)
        return context_token_cnt / context_total

    def __get_bigram_prob_laplace(self, context, token):
        '''
        Add-one estimate: (count(context, token) + 1) / (count(context) + |V|).
        An unseen context contributes zero counts, giving 1 / |V|.
        Returns 0.0 only when the model holds no counts at all.
        '''
        context_token_cnt = self.bigram_counts.get(context, {}).get(token, 0)
        denominator = self.unigram_counts.get(context, 0) + len(self.vocabulary)
        if not denominator:
            return 0.0
        return (context_token_cnt + 1) / denominator

    def __get_bigram_prob_kneser_ney(self, context, token, avg_discount=0.7):
        '''
        Interpolated Kneser-Ney estimate with a fixed discount:

            max(count(context, token) - d, 0) / count(context)
                + alpha(context) * P_continuation(token)

        avg_discount: the discount d subtracted from every seen bigram count

        When the context has no recorded followers there is no bigram
        evidence, so the continuation probability alone is returned.
        Returns 0.0 when no bigrams have been counted at all.
        '''
        d = avg_discount

        # Continuation probability, which depends only on the token: the
        # share of distinct bigram types that end in this token.
        total_bigram_pairs = sum(len(nexts) for nexts in self.bigram_counts.values())
        if not total_bigram_pairs:
            return 0.0
        bigram_with_token_cnt = sum(
            1 for nexts in self.bigram_counts.values() if token in nexts
        )
        P_continuation = bigram_with_token_cnt / total_bigram_pairs

        followers = self.bigram_counts.get(context, {})
        context_total = self.unigram_counts.get(context, 0)
        if not followers or not context_total:
            return P_continuation

        # alpha depends only on the context: the probability mass freed by
        # discounting each distinct follower of the context.
        alpha = d * len(followers) / context_total
        context_token_cnt = followers.get(token, 0)
        return (max(context_token_cnt - d, 0) / context_total) + (alpha * P_continuation)
        

In [183]:
# Build the bigram model from the loaded corpus and labels, then count
# unigrams and bigrams.
bigram_lm = BigramLM(corpus, labels)
bigram_lm.train()

In [204]:
# Number of times the token 'i' was counted in the corpus.
bigram_lm.unigram_counts['i']

3789

In [185]:
# Kneser-Ney smoothed probability of 'am' given the context 'i'.
bigram_lm.get_bigram_prob('i', 'am', smoothing='kneser-ney')

0.0771809335374076

In [186]:
# Unigram count of the token 'i'.
bigram_lm.unigram_counts['i']

3789

In [202]:
import random
# Weighted sampling demo: draw a single word, using each word's entry in
# 'prob' as its relative weight.
context = {
    'word': ['i', 'am', 'happy'],
    'prob': [0.1, 0.2, 0.7],
}
random.choices(context['word'], weights=context['prob'])[0]

'happy'