# Loading of data


In [1]:
import os
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings from libraries
# used further down are hidden as well — presumably done to keep cell output short.
warnings.filterwarnings('ignore')

input_dir = os.path.join(os.getcwd(), 'data')
input_text_file = os.path.join(input_dir, 'corpus.txt')
label_file = os.path.join(input_dir, 'labels.txt')


def read_lines(path):
    '''
    Read a text file and return its lines as a list of strings.

    Only the trailing newline of each line is removed. Slicing with [:-1]
    would also cut the last real character of a final line that does not end
    with a newline, so rstrip is used instead.
    '''
    with open(path, 'r') as f:
        return [line.rstrip('\n') for line in f]


# One sentence per line in corpus.txt; labels.txt is read the same way.
corpus = read_lines(input_text_file)
labels = read_lines(label_file)

In [2]:
# Sanity check: show how many corpus lines and how many label lines were read.
len(corpus), len(labels)

(2400, 2400)

# Emotion score

In [3]:
from transformers import pipeline
import time

# Text-classification pipeline built from the 'bhadresh-savani/distilbert-base-uncased-emotion'
# checkpoint. return_all_scores=True asks for a score per label instead of only the best one;
# emotion_scores() below relies on the resulting nesting when it takes element [0].
classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True,)

def emotion_scores(sample):
    '''
    Score one piece of text with the module-level `classifier`.

    The classifier result is a sequence; its first element is returned.
    In this notebook that element is a list of {'label': ..., 'score': ...}
    dictionaries, one per emotion class.
    '''
    return classifier(sample)[0]

sample = "I am so happy to see you!"

# Time only the classifier call. Previously the end timestamp was taken after the
# print loop, so console output was counted as part of the model latency.
# perf_counter is the clock intended for measuring short intervals.
start_time = time.perf_counter()
all_classes = emotion_scores(sample)
end_time = time.perf_counter()

for info in all_classes:
    print(info)
print(type(all_classes))
print("Time taken for emotion_scores: ", end_time-start_time)


{'label': 'sadness', 'score': 0.00029859962523914874}
{'label': 'joy', 'score': 0.9987986087799072}
{'label': 'love', 'score': 0.0004451328422874212}
{'label': 'anger', 'score': 0.0001878843759186566}
{'label': 'fear', 'score': 0.00012197871546959504}
{'label': 'surprise', 'score': 0.00014771465794183314}
<class 'list'>
Time taken for emotion_scores:  0.8889884948730469


# Bigram Model for Emotion Sentence Generation

In [4]:
import random
import numpy as np
from typing import List


class BigramLM_Emotion:
    '''
    Bigram language model whose next-token distribution is re-weighted by emotion scores.

    Training counts unigrams and bigrams over the corpus (sentences are tokenised with
    str.split() and wrapped in '</start>' / '</end>' markers) and stores, for every
    bigram occurrence, the emotion score vector that the module-level `emotion_scores`
    function returns for the sentence the bigram came from.

    At generation time, each candidate token's bigram probability is scaled by a
    "beta" score computed from the accumulated scores of the requested emotion.

    Attributes:
        corpus: sequence of sentence strings.
        labels: sequence of labels passed in with the corpus (stored; not read by training).
        bigram_counts: {context: {token: count}}.
        unigram_counts: {token: count}, including the '</start>' / '</end>' markers.
        vocabulary: set of corpus tokens without the two markers.
        total_bigram_pairs: cached number of distinct (context, token) pairs, or None.
        bigram_emotion_vector: {context: {token: array of shape (occurrences, num_labels)}}
            after train(); lists of lists while counting.
        bigram_prob: per-context cache of generated distributions, keyed by
            (emotion, smoothing).
        class_to_idx: emotion label -> column index in the emotion vectors.
        num_labels: number of emotion classes.
    '''

    # Lazily built cache: token -> number of distinct contexts the token follows.
    # Declared on the class so that instances unpickled from files written before this
    # attribute existed still find a default value.
    _continuation_counts = None

    def __init__(self, corpus, labels):
        self.corpus = corpus
        self.labels = labels
        self.class_to_idx = {'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}
        self.num_labels = len(self.class_to_idx)
        self._reset_state()


    def _reset_state(self):
        '''
        (Re)initialise every count table and every cache derived from the counts.
        '''
        self.bigram_counts = {}
        self.unigram_counts = {}
        self.vocabulary = set()
        self.total_bigram_pairs = None
        self.bigram_emotion_vector = {}
        self.bigram_prob = {}
        self._continuation_counts = None


    def train(self):
        '''
        Train the Bigram language model on the corpus and labels.

        All counts and caches are cleared first, so calling train() again rebuilds the
        model from scratch instead of doubling the counts (and failing when appending to
        the already-converted numpy arrays).
        '''
        self._reset_state()
        self.count_unigrams()
        self.count_bigrams()
        self.__make_emotion_vector_to_numpy()


    def count_unigrams(self):
        '''
        Count the unigrams in the corpus and store counts in the unigram_counts dictionary.

        The sentence markers are counted in unigram_counts (they are needed as bigram
        contexts) but are not part of the vocabulary.
        '''
        for sentence in self.corpus:
            tokens = ['</start>'] + sentence.split() + ['</end>']
            for token in tokens:
                self.unigram_counts[token] = self.unigram_counts.get(token, 0) + 1
                self.vocabulary.add(token)
        # discard (not remove): with an empty corpus the markers were never added
        self.vocabulary.discard('</start>')
        self.vocabulary.discard('</end>')
        return


    def count_bigrams(self):
        '''
        Count the bigrams in the corpus and store counts in the bigram_counts dictionary.

        For every bigram occurrence the emotion vector of its sentence is appended to
        bigram_emotion_vector[context][token]. Progress (the sentence index) is printed
        every 100 sentences.
        '''
        for idx, sentence in enumerate(self.corpus):
            if idx % 100 == 0:
                print(idx, end=' ')

            # The emotion vector depends only on the sentence, so build it once per
            # sentence rather than once per bigram.
            emotion_vector = [0] * self.num_labels
            for info in emotion_scores(sentence):
                emotion_vector[self.class_to_idx[info['label']]] = info['score']

            tokens = ['</start>'] + sentence.split() + ['</end>']
            for context, token in self.get_bigrams(tokens):
                followers = self.bigram_counts.setdefault(context, {})
                followers[token] = followers.get(token, 0) + 1
                occurrences = self.bigram_emotion_vector.setdefault(context, {}).setdefault(token, [])
                # store a copy so the per-occurrence rows never alias each other
                occurrences.append(list(emotion_vector))
        return


    def get_bigrams(self, tokens:List[str]):
        '''
        Given a list of tokens, return the list of consecutive (token, next_token) pairs.

        Fewer than two tokens yields an empty list.
        '''
        return list(zip(tokens, tokens[1:]))


    def __make_emotion_vector_to_numpy(self):
        '''
        Convert the per-bigram lists of emotion vectors to numpy arrays.
        '''
        for per_token in self.bigram_emotion_vector.values():
            for token in per_token:
                per_token[token] = np.array(per_token[token])
        return


    def get_bigram_prob(self, context:str, token:str, beta_score:float, smoothing:str='kneser-ney'):
        '''
        Get the emotion-weighted (unnormalised) probability of the token given the context.

        Args:
            context: preceding token; must be a key of bigram_counts.
            token: candidate next token.
            beta_score: emotion weight applied to the bigram estimate.
            smoothing: 'none', 'laplace' or 'kneser-ney' (case-insensitive).

        Raises:
            ValueError: if the smoothing method is not supported.
            KeyError: if the context was never seen during training.
        '''
        smoothing = smoothing.lower()
        if smoothing == 'none':
            return self.__get_bigram_prob_normal(context, token, beta_score)
        elif smoothing == 'laplace':
            return self.__get_bigram_prob_laplace(context, token, beta_score)
        elif smoothing == 'kneser-ney':
            return self.__get_bigram_prob_kneser_ney(context, token, beta_score)
        else:
            raise ValueError('Smoothing method not supported')


    def __get_bigram_prob_normal(self, context, token, beta_score):
        '''
        beta + beta * count(context, token) / count(context), without smoothing.
        '''
        context_token_cnt = self.bigram_counts[context].get(token, 0)
        return beta_score + beta_score*context_token_cnt/self.unigram_counts[context]


    def __get_bigram_prob_laplace(self, context, token, beta_score):
        '''
        Same as the unsmoothed estimate but with add-one (Laplace) smoothing over the vocabulary.
        '''
        context_token_cnt = self.bigram_counts[context].get(token, 0)
        return beta_score + beta_score*(context_token_cnt+1)/(self.unigram_counts[context]+len(self.vocabulary))


    def __get_bigram_prob_kneser_ney(self, context, token, beta_score, avg_discount=0.7):
        '''
        Emotion-weighted estimate with absolute discounting plus a continuation term.

        avg_discount is subtracted from the bigram count; alpha redistributes the
        discounted mass proportionally to how many distinct contexts the token follows.
        '''
        d = avg_discount
        followers = self.bigram_counts[context]
        context_cnt = self.unigram_counts[context]
        context_token_cnt = followers.get(token, 0)

        # alpha depends only on the context
        alpha = d * len(followers) / context_cnt
        # continuation probability depends only on the token
        P_continuation = self.__continuation_count(token) / self.__count_total_bigram_pairs()

        return (beta_score + beta_score*max(context_token_cnt-d,0)/context_cnt) + (alpha * P_continuation)


    def __continuation_count(self, token):
        '''
        Number of distinct contexts that `token` follows.

        The table is built once in a single pass over bigram_counts and then reused;
        scanning every context for every queried token gave the same numbers but
        repeated the full scan on each call.
        '''
        if self._continuation_counts is None:
            counts = {}
            for followers in self.bigram_counts.values():
                for follower in followers:
                    counts[follower] = counts.get(follower, 0) + 1
            self._continuation_counts = counts
        return self._continuation_counts.get(token, 0)


    def __count_total_bigram_pairs(self):
        '''
        Count the total number of unique bigram pairs in the corpus (cached after first use).
        '''
        if self.total_bigram_pairs is None:
            self.total_bigram_pairs = sum(len(followers) for followers in self.bigram_counts.values())
        return self.total_bigram_pairs


    def __generate_bigram_prob_for_context(self, context, emotion:str, smoothing:str='kneser-ney'):
        '''
        Generate normalised bigram probabilities for all tokens seen after a given context.

        Results are cached per (emotion, smoothing) pair. Caching by emotion alone made a
        second smoothing method silently return the first method's distribution. The
        cache entry is written only after the whole distribution has been computed, so an
        unknown emotion (KeyError) or unsupported smoothing (ValueError) leaves no empty
        entry behind.

        Returns:
            dict mapping each follower token of `context` to its normalised weight.
        '''
        cache_key = (emotion, smoothing.lower())
        context_cache = self.bigram_prob.setdefault(context, {})
        if cache_key not in context_cache:
            # one-hot selector for the requested emotion; raises KeyError for unknown labels
            emotion_vector = np.array([0] * self.num_labels)
            emotion_vector[self.class_to_idx[emotion]] = 1

            # calculate beta scores and normalize them
            # NOTE(review): score * log(score) is negative for 0 < score < 1 and NaN for
            # score == 0, so the weights below are not guaranteed to be non-negative,
            # while random.choices expects non-negative weights. The formula is kept
            # exactly as in the original experiments.
            beta_scores = {}
            total_score = 0
            for token in self.bigram_counts[context]:
                # summed score of the requested emotion over all occurrences of the bigram
                score = np.sum(emotion_vector * self.bigram_emotion_vector[context][token])
                # Best score combinations
                # score * np.log(score), score * (1 + np.log(score)), score * (np.e + np.log(score))
                beta_scores[token] = score * np.log(score)
                total_score += score
            for token in beta_scores:
                beta_scores[token] = beta_scores[token] / total_score

            # calculate bigram probabilities
            probs = {}
            for token in self.bigram_counts[context]:
                probs[token] = self.get_bigram_prob(context, token, beta_scores[token], smoothing)

            # normalize bigram probabilities
            total_prob = sum(probs.values())
            for token in probs:
                probs[token] = probs[token] / total_prob

            context_cache[cache_key] = probs

        return context_cache[cache_key]


    def __generate_token(self, context:str, emotion:str, smoothing:str='kneser-ney'):
        '''
        Sample one follower token of the context according to the emotion-weighted distribution.
        '''
        all_possible_tokens = self.__generate_bigram_prob_for_context(context, emotion, smoothing)
        generated_token = random.choices(list(all_possible_tokens.keys()), weights=list(all_possible_tokens.values()))[0]
        return generated_token


    def generate_sentence(self, emotion:str, max_length:int=10, smoothing:str='kneser-ney'):
        '''
        Generate a sentence of at most max_length tokens for the given emotion.

        Sampling starts from '</start>' and stops early when '</end>' is drawn.
        Returns the tokens joined by single spaces (possibly an empty string).
        '''
        sentence = []
        context = '</start>'
        for _ in range(max_length):
            token = self.__generate_token(context, emotion, smoothing)
            if token == '</end>':
                break
            sentence.append(token)
            context = token
        return ' '.join(sentence)

## Initialize the model

In [76]:
# Build the model from the loaded corpus and labels, then train it.
# Training calls emotion_scores() once per sentence and prints the sentence
# index every 100 sentences as a progress indicator.
lm = BigramLM_Emotion(corpus, labels)
lm.train()

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 

In [77]:
# Show the emotion label -> vector index mapping used by the model.
lm.class_to_idx

{'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}

## Save the model

In [81]:
import pickle
model_file = os.path.join(os.getcwd(), 'bigram_lm_emotion.pkl')
# Use a context manager so the file is flushed and closed as soon as the dump
# finishes; a bare open() inside the call leaves the handle to the garbage collector.
with open(model_file, 'wb') as f:
    pickle.dump(lm, f)

## Load the model

In [5]:
import pickle
import os

model_file = os.path.join(os.getcwd(), 'bigram_lm_emotion.pkl')
# SECURITY: pickle.load can execute arbitrary code contained in the file, so only
# load model files that were produced by this notebook / a trusted source.
# The BigramLM_Emotion class must already be defined in the session, because pickle
# stores instances by reference to their class.
with open(model_file, 'rb') as f:
    lm_saved = pickle.load(f)

## Generate different emotion sentences using the LM 

In [80]:
all_emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
output_dir = os.path.join(os.getcwd(), 'output')
# exist_ok=True replaces the separate exists() check and is safe if the directory already exists
os.makedirs(output_dir, exist_ok=True)

samples = 50                 # accepted sentences to collect per emotion
max_trails = samples * 100   # hard cap on attempts so the rejection loop cannot run forever

for emotion in all_emotions:
    print(f'Generating sentences for {emotion}')
    output_file = os.path.join(output_dir, f'gen_{emotion}.txt')

    with open(output_file, 'w') as f:
        sentences = []
        outputs = []
        generated = 0
        trails = 0
        # Rejection sampling: keep a generated sentence only when the classifier's
        # top label matches the requested emotion.
        while generated < samples and trails < max_trails:
            sentence = lm.generate_sentence(emotion, max_length=30, smoothing='kneser-ney')
            emotions = emotion_scores(sentence)
            trails += 1

            # label with the highest classifier score (first one wins on ties)
            best = max(emotions, key=lambda info: info['score'])
            max_score = best['score']
            max_label = best['label']
            if max_label != emotion:
                continue

            f.write(sentence + '\n')
            f.write(max_label + ' ' + str(max_score) + '\n\n')
            sentences.append(sentence)
            outputs.append(emotions)
            generated += 1

    if generated < samples:
        print(f'Stopped early for {emotion}: reached the limit of {max_trails} trails')
    print(f'Generated {generated} sentences for {emotion} in {trails} trails')
    print(f'Accuracy for {emotion}: {generated/trails*100}\n')

Generating sentences for sadness
Generated 50 sentences for sadness in 73 trails
Accuracy for sadness: 68.4931506849315

Generating sentences for joy
Generated 50 sentences for joy in 58 trails
Accuracy for joy: 86.20689655172413

Generating sentences for love
Generated 50 sentences for love in 90 trails
Accuracy for love: 55.55555555555556

Generating sentences for anger
Generated 50 sentences for anger in 69 trails
Accuracy for anger: 72.46376811594203

Generating sentences for fear
Generated 50 sentences for fear in 68 trails
Accuracy for fear: 73.52941176470588

Generating sentences for surprise
Generated 50 sentences for surprise in 71 trails
Accuracy for surprise: 70.4225352112676

