__Importing essential libraries__

In [3]:
import numpy as np
import random
from collections import defaultdict

__Creating the BigramLM class__

In [15]:
class BigramLM:
    '''
    BigramLM: Provides the functional definition of a bigram language model.

    Bigram counts are learned from a corpus with fit() and are then used by
    generate_sentence() to sample new sentences one word at a time.
    '''
    def __init__(self):
        '''
        BRIEF:      Constructor for the creation of bigram LM.
        PARAMETERS: None.
        '''
        # bigramCounts[w1][w2] = number of times w2 directly followed w1.
        self.bigramCounts = defaultdict(lambda: defaultdict(int))
        # vocabulary[w] = number of times w was seen as the first word of a bigram.
        self.vocabulary = defaultdict(int)
        self.tokenInitiate = '<s>'  # Token for start of sentence
        self.tokenTerminate = '</s>'   # Token for end of sentence

    def __tokenize_text(self, text):
        '''
        BRIEF:      (Internal method) Tokenizing the sentence into a word list.
        PARAMETERS: The sentence to be tokenized (text).
        RETURN:     list of whitespace-separated tokens, wrapped in the
                    start-of-sentence and end-of-sentence tokens.
        '''
        return [self.tokenInitiate] + text.split() + [self.tokenTerminate]

    def __follower_counts(self, current_word):
        '''
        BRIEF:      (Internal method) Looking up the follower counts of a word.
        PARAMETERS: The context word (current_word).
        RETURN:     Mapping follower -> count (empty for an unseen word).
        '''
        # .get() is used on purpose: indexing the defaultdict would insert an
        # empty entry for every word that is merely looked up.
        return self.bigramCounts.get(current_word, {})

    def __simple_probabilities(self, current_word, next_word_candidates):
        '''
        BRIEF:      (Internal method) Calculating simple bigram probabilities.
        PARAMETERS: Last word (current_word) | Candidates for the next word (next_word_candidates).
        RETURN:     list of count(current_word, candidate) / count(current_word, *),
                    aligned with next_word_candidates. All zeros when
                    current_word has no observed follower.
        '''
        followers = self.__follower_counts(current_word)
        total = sum(followers.values())  # computed once, not once per candidate
        if total == 0:
            return [0.0] * len(next_word_candidates)
        return [followers.get(next_word, 0) / total
                    for next_word in next_word_candidates]

    def __laplace_probabilities(self, current_word, next_word_candidates):
        '''
        BRIEF:      (Internal method) Computing the Laplace probabilities given the last word.
        PARAMETERS: Last word (current_word) | Candidates for the next word (next_word_candidates).
        RETURN:     list of (count + 1) / (total + number of candidates),
                    aligned with next_word_candidates.
        '''
        followers = self.__follower_counts(current_word)
        denominator = sum(followers.values()) + len(next_word_candidates)
        return [(followers.get(next_word, 0) + 1) / denominator
                    for next_word in next_word_candidates]

    def __knesser_ney_smoothing(self):
        '''
        BRIEF:      (Internal method) Performing Knesser-Ney smoothing.
        PARAMETERS: None.
        '''
        # Not implemented yet; generate_sentence() never calls this method.
        pass

    def fit(self, corpus):
        '''
        BRIEF:      Learning the bigram model from the given corpus.
        PARAMETERS: The corpus of samples for learning (corpus); an iterable
                    of sentence strings.
        '''
        for sentence in corpus:
            tokens = self.__tokenize_text(sentence)
            # Walk over every pair of adjacent tokens.
            for current_word, next_word in zip(tokens, tokens[1:]):
                self.bigramCounts[current_word][next_word] += 1
                self.vocabulary[current_word] += 1

    def generate_sentence(self, numWords, smoothing=None):
        '''
        BRIEF:      Generating a sentence of a given word-count and using a given smoothing.
        PARAMETERS: Maximum word-count (numWords) | The smoothing technique to use (smoothing);
                    'laplace' selects Laplace smoothing, anything else uses raw bigram probabilities.
        RETURN:     The generated sentence as a string. It may be shorter than
                    numWords when the end-of-sentence token is drawn, and it is
                    empty when the model has not been fitted.
        '''
        current_word = self.tokenInitiate
        sentence = []

        # The vocabulary only holds words seen as a bigram's first element, so
        # the end token has to be added explicitly or a sentence could never
        # terminate. The start token is left out because it must never be
        # generated in the middle of a sentence.
        next_word_candidates = [word for word in self.vocabulary
                                    if word != self.tokenInitiate]
        if self.tokenTerminate not in self.vocabulary:
            next_word_candidates.append(self.tokenTerminate)

        for _ in range(numWords):
            # TODO: Kneser-Ney Smoothing
            if smoothing == 'laplace':
                probabilities = self.__laplace_probabilities(current_word, next_word_candidates)
            else:
                probabilities = self.__simple_probabilities(current_word, next_word_candidates)

            # random.choices() rejects an all-zero weight vector; with nothing
            # to sample from, the sentence simply ends here.
            if not any(probabilities):
                break

            next_word = random.choices(next_word_candidates, probabilities)[0]
            if next_word == self.tokenTerminate:
                break

            sentence.append(next_word)
            current_word = next_word

        return " ".join(sentence)

    def generate_emotion_samples(self, num_samples_per_emotion=50, smoothing=None, emotions=None, numWords=10):
        '''
        BRIEF:      Generate emotion-oriented sentences and store them in .txt files.
        PARAMETERS: Number of samples per emotion (num_samples_per_emotion) | Smoothing technique to use (smoothing) |
                    List of emotions (emotions) | Maximum word-count of each sentence (numWords).
        NOTE:       The emotion is only used to name the output file
                    ('gen_<emotion>.txt'); every file is sampled from the same model.
        '''
        if emotions is None:
            emotions = ['fear', 'anger', 'sadness', 'joy', 'love', 'surprise']

        for emotion in emotions:
            samples = [self.generate_sentence(numWords=numWords, smoothing=smoothing)
                           for _ in range(num_samples_per_emotion)]

            filename = f'gen_{emotion}.txt'
            try:
                with open(filename, 'w') as file:
                    file.write('\n'.join(samples))
            except (OSError, ValueError) as e:
                # File-system failures surface as OSError; ValueError is kept
                # for malformed file names. Report and go on to the next file.
                print(f"Error writing to file {filename}: {e}")


__Working with BigramLM__

In [18]:
# Load the training corpus; fit() treats every line as one sentence.
with open('../data/corpus.txt', 'r') as f:
    corpus = f.readlines()

# Creating a bigram model
bigram_model = BigramLM()
bigram_model.fit(corpus)

sentence = ""

# Sampling may raise ValueError (random.choices() does so when every weight is
# zero). Retry a bounded number of times so that a repeated failure cannot
# escape from inside the exception handler; if every attempt fails, the empty
# string is printed.
for _ in range(5):
    try:
        sentence = bigram_model.generate_sentence(10, smoothing=None)
    except ValueError:
        continue
    break

print(sentence)

i know im seventy ill be any means for ways


** Evaluation

In [None]:
# # Generating emotion-oriented samples
# if __name__ == "__main__":
#     with open('../data/corpus.txt', 'r') as f:
#         corpus = f.readlines()
#     bigram_model = BigramLM()
#     bigram_model.fit(corpus)
#     emotions = ['happy', 'sadness', 'angry', 'fear', 'love', 'surprise']
#     bigram_model.generate_emotion_samples(num_samples_per_emotion=50, smoothing=None, emotions=emotions)