Q1 and Q2 are in LLM_HW1.ipynb. Q3 and Q4 are in Report.pdf. Q5 is in User_Interface.ipynb.

5. Bonus: Create a simple user interface where users can enter a prefix and get a completion of up to a specified number of words.

In [None]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.util import ngrams
from collections import Counter
import random

In [None]:
# Tokenizer: fetch the 'punkt' tokenizer data so that nltk.word_tokenize
# (called in NgramModel.generate_tokens) can run.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Corpus: fetch the Gutenberg corpus that is read via nltk.corpus.gutenberg
# when the Gutenberg-based models are built.
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [None]:
# Corpus: fetch the Reuters corpus that is read via nltk.corpus.reuters
# when the Reuters-based models are built.
nltk.download('reuters')

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [None]:
from nltk.corpus import gutenberg

In [None]:
from nltk.corpus import reuters

In [None]:
# Idea: avoid recomputing the preprocessing and the n-gram frequencies
# every time I change the input sentence,
# or decide to switch the dataset, n, or the smoothing technique.

In [None]:
class NgramModel:
  """Word-level n-gram language model with optional smoothing.

  The corpus is normalised, tokenised and counted once at construction time,
  so many prefixes can be completed without repeating that work.

  Attributes:
    n: Order of the model (always 3 for linear interpolation).
    smoothing_technique: 'laplace' or 'linear_interpolation'; any other value
      (e.g. 'None') selects the unsmoothed maximum-likelihood estimate.
    preprocessed_dataset: Normalised corpus text.
    tokens: Corpus tokens in corpus order.
    ngram_frequency_dict: n-gram counts keyed by token tuples.
    n_minus_1_gram_frequency_dict: (n-1)-gram (context) counts keyed by tuples.
    unigram_frequency_dict: Single-token counts keyed by 1-tuples.
    unique_tokens: Sorted vocabulary list.
  """

  # Weights of the trigram, bigram and unigram estimates (they sum to 1).
  INTERPOLATION_WEIGHTS = (0.6, 0.28, 0.12)

  def __init__(self, dataset, n=3, smoothing_technique='None'):
    """Build all count tables for `dataset`.

    Args:
      dataset: Raw corpus text.
      n: Model order (>= 1). Ignored and replaced by 3 when
        `smoothing_technique` is 'linear_interpolation'.
      smoothing_technique: See the class docstring.

    Raises:
      ValueError: If `n` is smaller than 1.
    """
    # Interpolation mixes trigram, bigram and unigram estimates, so the order
    # must be fixed BEFORE the tables are counted; otherwise the tables would
    # be built for the caller's n while the lookups assume n=3.
    if smoothing_technique == 'linear_interpolation':
      n = 3
    if n < 1:
      raise ValueError(f"n must be at least 1, got {n}")

    self.n = n
    self.smoothing_technique = smoothing_technique
    self.preprocessed_dataset = NgramModel.preprocessing_pipeline(dataset)
    self.tokens = self.generate_tokens()
    self.ngram_frequency_dict = self.generate_ngram_frequency_dict(n)
    self.n_minus_1_gram_frequency_dict = self.generate_ngram_frequency_dict(n-1)
    self.unigram_frequency_dict = self.generate_ngram_frequency_dict(1)
    self.unique_tokens = self.generate_unique_tokens()

  @staticmethod
  def roman_to_int(s):
    """Convert a Roman numeral string (e.g. 'XIV') to its integer value.

    Raises:
      KeyError: If `s` contains a character that is not a Roman digit.
    """
    roman_numerals = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}

    result = 0
    prev_value = 0

    # Scan right to left: a digit smaller than its right neighbour is
    # subtractive (the I in IV), otherwise it is additive.
    for char in reversed(s):
      value = roman_numerals[char]
      if value < prev_value:
        result -= value
      else:
        result += value
      prev_value = value

    return result

  @staticmethod
  def extract_and_convert_roman(input_string):
    """Replace the Roman numeral after 'VOLUME'/'CHAPTER' with an integer.

    Each occurrence is rewritten individually, so converting 'CHAPTER I' can
    no longer corrupt 'CHAPTER II' or 'CHAPTER IV'. The trailing word boundary
    keeps ordinary words that merely start with a Roman digit untouched.
    """
    def to_arabic(match):
      keyword, gap, roman_numeral = match.groups()
      return f"{keyword}{gap}{NgramModel.roman_to_int(roman_numeral)}"

    return re.sub(r'(VOLUME|CHAPTER)(\s)([IVXLCDM]+)\b', to_arabic, input_string)

  @staticmethod
  def remove_non_alphanumeric(input_string):
    """Drop everything except ASCII letters, digits and spaces.

    Newlines are kept by the first pass and then collapsed to single spaces.
    """
    result_string = re.sub(r'[^a-zA-Z0-9 \n]', '', input_string)
    result_string = re.sub(r'\n+', ' ', result_string)
    return result_string

  @staticmethod
  def lowercase_string(input_string):
    """Return `input_string` converted to lowercase."""
    return input_string.lower()

  @staticmethod
  def preprocessing_pipeline(input_text):
    """Normalise text: Roman numerals -> integers, strip symbols, lowercase."""
    converted_roman_text = NgramModel.extract_and_convert_roman(input_text)
    cleaned_text = NgramModel.remove_non_alphanumeric(converted_roman_text)
    lower_cased_text = NgramModel.lowercase_string(cleaned_text)
    return lower_cased_text

  @staticmethod
  def generate_gram(seq,n):
    """Return the last n-1 words of `seq`, joined by single spaces.

    This is the context of an order-n model. For n <= 1 the context is empty,
    so '' is returned. Repeated, leading or trailing whitespace in `seq`
    never produces empty words.
    """
    if n <= 1:
      return ''
    words = seq.split()
    return ' '.join(words[-(n-1):])

  def generate_tokens(self):
    """Tokenise the preprocessed corpus with nltk.word_tokenize."""
    return nltk.word_tokenize(self.preprocessed_dataset)

  def generate_unique_tokens(self):
    """Return the vocabulary as a sorted list.

    Sorting makes tie-breaking in nextWord reproducible across kernel
    restarts (plain set iteration order is not).
    """
    return sorted(set(self.tokens))

  def generate_ngram_frequency_dict(self,n):
    """Count the n-grams of the corpus.

    Returns:
      A dict mapping n-gram tuples to their counts. For n <= 0 the only key is
      the empty tuple, counted once per token, so a unigram model can use it
      as the count of its (empty) context.
    """
    if n <= 0:
      return {(): len(self.tokens)}
    return dict(Counter(ngrams(self.tokens,n=n)))

  def probability(self,word,gram):
    """Return P(word | gram) under the configured smoothing technique."""
    if self.smoothing_technique == 'laplace':
      return self.laplace_smoothing_probability(word,gram)

    elif self.smoothing_technique == 'linear_interpolation':
      return self.linear_interpolation_probability(word,gram)

    else:
      return self.no_smoothing_probability(word,gram)

  def laplace_smoothing_probability(self,word,gram):
    """Add-one estimate: (count(gram + word) + 1) / (count(gram) + |V|).

    Returns 0 when the vocabulary is empty.
    """
    vocabulary_size = len(self.unique_tokens)
    if vocabulary_size == 0:
      return 0

    context = tuple(gram.split())
    numerator = self.ngram_frequency_dict.get(context + (word,), 0) + 1
    denominator = self.n_minus_1_gram_frequency_dict.get(context, 0) + vocabulary_size
    return numerator / denominator

  # gram param is a bigram
  def linear_interpolation_probability(self,word,gram):
    """Weighted mix of trigram, bigram and unigram estimates of `word`.

    Example: word='mat', gram='on a' combines P(mat | on a), P(mat | a) and
    P(mat). A term whose context is missing or unseen contributes 0. The
    unigram term is normalised by the total number of tokens, so every term
    is a probability in [0, 1].
    """
    context = tuple(gram.split())
    trigram_weight, bigram_weight, unigram_weight = self.INTERPOLATION_WEIGHTS
    prob1,prob2,prob3 = 0,0,0

    if len(context) >= 2:
      history = context[-2:] # (on, a)
      history_count = self.n_minus_1_gram_frequency_dict.get(history, 0)
      if history_count:
        prob1 = self.ngram_frequency_dict.get(history + (word,), 0) / history_count

    if context:
      previous = context[-1:] # (a,)
      previous_count = self.unigram_frequency_dict.get(previous, 0)
      if previous_count:
        prob2 = self.n_minus_1_gram_frequency_dict.get(previous + (word,), 0) / previous_count

    total_tokens = len(self.tokens)
    if total_tokens:
      prob3 = self.unigram_frequency_dict.get((word,), 0) / total_tokens

    return trigram_weight*prob1 + bigram_weight*prob2 + unigram_weight*prob3

  def no_smoothing_probability(self,word,gram):
    """Maximum-likelihood estimate count(gram + word) / count(gram).

    Returns 0 when the context `gram` was never seen in the corpus.
    """
    context = tuple(gram.split())
    context_count = self.n_minus_1_gram_frequency_dict.get(context, 0)
    if context_count == 0:
      return 0
    return self.ngram_frequency_dict.get(context + (word,), 0) / context_count

  def nextWord(self,seq):
    """Return the most probable next word after `seq`.

    `seq` is normalised like the corpus before lookup, so capitalisation and
    punctuation in user input do not prevent matches. Returns '' when no token
    has a positive probability or the vocabulary is empty. Under Laplace
    smoothing, a context that cannot distinguish the candidates (all
    probabilities equal) yields a uniformly random vocabulary token.
    """
    if not self.unique_tokens:
      return ''

    # The context is the same for every candidate, so compute it only once.
    gram = NgramModel.generate_gram(NgramModel.preprocessing_pipeline(seq),self.n)

    max_prob,min_prob,nxt_word = 0,None,''
    for token in self.unique_tokens:
      token_prob = self.probability(token,gram)
      if token_prob>max_prob:
        max_prob = token_prob
        nxt_word = token
      if min_prob is None or token_prob<min_prob:
        min_prob = token_prob

    # With add-one smoothing an uninformative context gives every token the
    # same probability; picking the first one would always return the same
    # word, so sample from the vocabulary instead.
    if self.smoothing_technique=='laplace' and max_prob == min_prob:
      nxt_word = random.choice(self.unique_tokens)
    return nxt_word


  def generateSentence(self,sentence_length,prefix):
    """Extend `prefix` by up to `sentence_length` predicted words.

    Generation stops early when the model has no continuation, instead of
    appending empty words that leave trailing blanks and break every later
    context lookup.
    """
    generated_sentence = prefix

    for _ in range(sentence_length):
      next_word = self.nextWord(generated_sentence)
      if not next_word:
        break
      generated_sentence += ' ' + next_word

    return generated_sentence


My Models

In [None]:
# Read each corpus from disk once and reuse the text for every model built on
# it, instead of calling .raw() again for each configuration.
gutenberg_text = gutenberg.raw()
reuters_text = reuters.raw()

# Dataset: Gutenberg, n=2, smoothing=None
model_guten_2_no_smoothing = NgramModel(dataset=gutenberg_text, n=2, smoothing_technique='None')

# Dataset: Gutenberg, n=3, smoothing=None
model_guten_3_no_smoothing = NgramModel(dataset=gutenberg_text, n=3, smoothing_technique='None')

# Dataset: Gutenberg, n=4, smoothing=None
model_guten_4_no_smoothing = NgramModel(dataset=gutenberg_text, n=4, smoothing_technique='None')

# Dataset: Gutenberg, n=5, smoothing=None
model_guten_5_no_smoothing = NgramModel(dataset=gutenberg_text, n=5, smoothing_technique='None')

# Dataset: Gutenberg, n=4, smoothing=laplace
model_guten_4_laplace = NgramModel(dataset=gutenberg_text, n=4, smoothing_technique='laplace')

# Dataset: Gutenberg, n=3 by default, smoothing=linear_interpolation
model_guten_3_linear_interpolation = NgramModel(dataset=gutenberg_text, n=3, smoothing_technique='linear_interpolation')

# Dataset: Reuters, n=4, smoothing=None
model_reuters_4_no_smoothing = NgramModel(dataset=reuters_text, n=4, smoothing_technique='None')

# Dataset: Reuters, n=2, smoothing=laplace
model_reuters_2_laplace = NgramModel(dataset=reuters_text, n=2, smoothing_technique='laplace')

# Dataset: Reuters, n=3, smoothing=linear_interpolation
model_reuters_3_linear_interpolation = NgramModel(dataset=reuters_text, n=3, smoothing_technique='linear_interpolation')

Interface

In [None]:
# Ask once for the prefix and the number of words, then let every prebuilt
# model complete the same prefix so the outputs can be compared side by side.
prompt = input('Enter prefix: ')
length = int(input('Enter number of words to generate: '))

# (label, model) pairs in the order the results are printed.
labelled_models = (
  ('model_guten_2_no_smoothing', model_guten_2_no_smoothing),
  ('model_guten_3_no_smoothing', model_guten_3_no_smoothing),
  ('model_guten_4_no_smoothing', model_guten_4_no_smoothing),
  ('model_guten_5_no_smoothing', model_guten_5_no_smoothing),
  ('model_guten_4_laplace', model_guten_4_laplace),
  ('model_guten_3_linear_interpolation', model_guten_3_linear_interpolation),
  ('model_reuters_4_no_smoothing', model_reuters_4_no_smoothing),
  ('model_reuters_2_laplace', model_reuters_2_laplace),
  ('model_reuters_3_linear_interpolation', model_reuters_3_linear_interpolation),
)

for label, model in labelled_models:
  print(f'{label}: {model.generateSentence(length,prompt)}')

Enter prefix: i am going to
Enter number of words to generate: 5
model_guten_2_no_smoothing: i am going to the lord and the lord
model_guten_3_no_smoothing: i am going to be a great deal of
model_guten_4_no_smoothing: i am going to tell you that you will
model_guten_5_no_smoothing: i am going to tell you to any son
model_guten_4_laplace: i am going to tell you that you will
model_guten_3_linear_interpolation: i am going to the the lobsters and the
model_reuters_4_no_smoothing: i am going to     
model_reuters_2_laplace: i am going to the company said the company
model_reuters_3_linear_interpolation: i am going to the the the the the
