# 1. Language Modeling

In [43]:
import json
from collections import Counter
import numpy as np
import pandas as pd
import regex as re
import nltk
from nltk.data import find
import gensim
import sklearn
from sympy.parsing.sympy_parser import parse_expr

In [44]:
# Seed NumPy's global random generator so that calls to np.random (used by
# NgramLM.sample_next_word below) are reproducible from one run to the next.
np.random.seed(0)
# Fetch the NLTK 'word2vec_sample' data package (a no-op when it is already
# up to date, as the cell output below shows).
nltk.download('word2vec_sample')

[nltk_data] Downloading package word2vec_sample to
[nltk_data]     C:\Users\chatu\AppData\Roaming\nltk_data...
[nltk_data]   Package word2vec_sample is already up-to-date!


True

In [64]:
class NgramLM:
	"""
	Trigram language model backed by raw trigram counts.

	For every bigram prefix ``(word1, word2)`` the model stores the list of
	observed next words and, in a parallel list, how often each one occurred.
	Next-word probabilities are the counts normalised per prefix.
	"""

	# Default location of the trigram count file read by ``load_trigrams``.
	TRIGRAM_PATH = "data/tweets/covid-tweets-2020-08-10-2020-08-21.trigrams.txt"

	def __init__(self):
		"""
		N-gram Language Model
		"""
		# Dictionary to store next-word possibilities for bigrams. Maintains a list for each bigram.
		self.bigram_prefix_to_trigram = {}

		# Dictionary to store counts of corresponding next-word possibilities for bigrams. Maintains a list for each bigram.
		self.bigram_prefix_to_trigram_weights = {}

	def load_trigrams(self, path=None):
		"""
		Loads the trigrams from the data file and fills the dictionaries defined above.

		Each non-blank line must hold four whitespace-separated fields:
		``word1 word2 word3 count``. Entries are appended to whatever is already
		stored, so calling this twice on the same file duplicates its entries.

		Parameters
		----------
		path: str, optional
			File to read. Defaults to ``NgramLM.TRIGRAM_PATH``.

		Returns
		-------

		Raises
		------
		ValueError
			If a non-blank line does not have exactly four fields or its count
			is not an integer.
		"""
		if path is None:
			path = self.TRIGRAM_PATH

		with open(path, encoding="utf-8") as f:
			# Stream the file line by line instead of materialising it in memory.
			for line in f:
				fields = line.split()
				if not fields:
					# Tolerate blank lines (e.g. a trailing newline at end of file).
					continue
				word1, word2, word3, count = fields
				key = (word1, word2)
				self.bigram_prefix_to_trigram.setdefault(key, []).append(word3)
				self.bigram_prefix_to_trigram_weights.setdefault(key, []).append(int(count))

	def _next_word_distribution(self, word1, word2):
		"""
		Return the next words for a bigram prefix and their normalised probabilities.

		Returns two empty lists when the prefix is unknown, has no next words,
		or its counts do not sum to a positive number.
		"""
		key = (word1, word2)
		words = self.bigram_prefix_to_trigram.get(key)
		if not words:
			return [], []

		weights = self.bigram_prefix_to_trigram_weights[key]
		total_count = sum(weights)
		if total_count <= 0:
			# Nothing meaningful to normalise by; avoid a ZeroDivisionError.
			return [], []

		return words, [count / total_count for count in weights]

	def top_next_word(self, word1, word2, n=10):
		"""
		Retrieve top n next words and their probabilities given a bigram prefix.

		Parameters
		----------
		word1: str
			The first word in the bigram.
		word2: str
			The second word in the bigram.
		n: int
			Number of words to return.

		Returns
		-------
		next_words: list
			The retrieved top n next words (empty if the bigram is unknown).
		probs: list
			The probabilities corresponding to the retrieved words.
		"""
		words, probabilities = self._next_word_distribution(word1, word2)

		# sorted() is stable even with reverse=True, so equally likely words
		# keep the order in which they were loaded.
		ranked = sorted(zip(words, probabilities), key=lambda pair: pair[1], reverse=True)
		top_results = ranked[:n]

		next_words = [word for word, _ in top_results]
		probs = [prob for _, prob in top_results]

		# Always return a pair, also for unknown prefixes, so callers can unpack it.
		return next_words, probs

	def sample_next_word(self, word1, word2, n=10):
		"""
		Sample n next words and their probabilities given a bigram prefix using the probability distribution defined by frequency counts.

		Words are drawn without replacement through NumPy's global random
		generator, so results depend on ``np.random.seed``.

		Parameters
		----------
		word1: str
			The first word in the bigram.
		word2: str
			The second word in the bigram.
		n: int
			Number of words to return.

		Returns
		-------
		next_words: list
			The sampled next words; fewer than n if fewer are available, empty
			if the bigram is unknown.
		probs: list
			The probabilities corresponding to the retrieved words.
		"""
		words, probabilities = self._next_word_distribution(word1, word2)
		if not words:
			return [], []

		# Sampling without replacement cannot return more items than exist.
		sample_size = min(n, len(words))
		indices = np.random.choice(len(words), size=sample_size, replace=False, p=probabilities)

		next_words = [words[i] for i in indices]
		probs = [probabilities[i] for i in indices]

		return next_words, probs

	def generate_sentences(self, prefix, beam=10, sampler=None, max_len=20):
		"""
		Generate sentences using beam search.

		Parameters
		----------
		prefix: str
			String containing two (or more) words separated by spaces.
		beam: int
			The beam size.
		sampler: Callable
			The function used to sample next word. It is called as
			``sampler(word1, word2)`` and must return ``(next_words, probs)``.
			Defaults to ``self.top_next_word``.
		max_len: int
			Maximum length of sentence (as measured by number of words) to generate (excluding "<EOS>").
			Words of the prefix are not counted towards this limit.

		Returns
		-------
		sentences: list
			The top generated sentences
		probs: list
			The probabilities corresponding to the generated sentences
		"""
		# If sampler is not provided, use top_next_word as default
		if sampler is None:
			sampler = self.top_next_word

		prefix_words = prefix.strip().split()

		# A trigram model needs two words of context to predict anything.
		if len(prefix_words) < 2:
			return [], []

		beam_sentences = [prefix_words]
		beam_probs = [1.0]  # The prefix is given, so it has probability 1

		# Continue until all sentences in beam have ended
		while not all("<EOS>" in sentence for sentence in beam_sentences):
			# (sentence, probability) pairs competing for the next beam.
			candidates = []

			for sentence, sentence_prob in zip(beam_sentences, beam_probs):
				# Finished sentences stay in the running unchanged.
				if "<EOS>" in sentence:
					candidates.append((sentence, sentence_prob))
					continue

				# The prefix itself doesn't count toward the maximum length
				words_after_prefix = len(sentence) - len(prefix_words)

				# Already at the limit: close the sentence without extending it.
				if words_after_prefix >= max_len:
					candidates.append((sentence + ["<EOS>"], sentence_prob))
					continue

				next_words, next_probs = sampler(sentence[-2], sentence[-1])

				# No known continuation: close the sentence here.
				if not next_words:
					candidates.append((sentence + ["<EOS>"], sentence_prob))
					continue

				# One more word will exhaust the budget, so such extensions are
				# closed right away instead of waiting for another round.
				reaches_limit = words_after_prefix + 1 >= max_len
				for next_word, next_prob in zip(next_words, next_probs):
					extended = sentence + [next_word]
					if reaches_limit and next_word != "<EOS>":
						extended.append("<EOS>")
					candidates.append((extended, sentence_prob * next_prob))

			# If no candidates, break
			if not candidates:
				break

			# Keep the `beam` most probable candidates; the stable sort keeps
			# ties in the order they were generated.
			candidates.sort(key=lambda candidate: candidate[1], reverse=True)
			survivors = candidates[:beam]
			beam_sentences = [sentence for sentence, _ in survivors]
			beam_probs = [prob for _, prob in survivors]

		# Convert word lists to sentences
		sentences = [' '.join(sentence) for sentence in beam_sentences]
		probs = beam_probs

		return sentences, probs
	
# Define your language model object
# (notebook-global: the evaluation cells below all call methods on it)
language_model = NgramLM()
# Load trigram data
# (reads the trigram count file and fills the model's two prefix dictionaries)
language_model.load_trigrams()

## Evaluate Language Model

### **Evaluating top next word prediction**

In [60]:
# Print the 10 most probable words following the bigram "middle of",
# one "word probability" pair per line, most probable first.
next_words, probs = language_model.top_next_word("middle", "of", 10)
for word, prob in zip(next_words, probs):
	print(word, prob)
# Your first 5 lines of output should be exactly:
# a 0.807981220657277
# the 0.06948356807511737
# pandemic 0.023943661971830985
# this 0.016901408450704224
# an 0.0107981220657277

a 0.807981220657277
the 0.06948356807511737
pandemic 0.023943661971830985
this 0.016901408450704224
an 0.0107981220657277
covid 0.009389671361502348
nowhere 0.008450704225352112
it 0.004694835680751174
summer 0.002347417840375587
lockdown 0.002347417840375587


### **Evaluating sample next word prediction**

In [61]:
# Draw 10 words following the bigram "middle of" at random, weighted by their
# trigram counts. The printed value is each word's probability under the
# model; the order is the order in which the words were drawn.
next_words, probs = language_model.sample_next_word("middle", "of", 10)
for word, prob in zip(next_words, probs):
	print(word, prob)
# My first 5 lines of output look like this: (YOUR OUTPUT CAN BE DIFFERENT!)
# a 0.807981220657277
# pandemic 0.023943661971830985
# august 0.0018779342723004694
# stage 0.0018779342723004694
# an 0.0107981220657277

a 0.807981220657277
nowhere 0.008450704225352112
pandemic 0.023943661971830985
the 0.06948356807511737
august 0.0018779342723004694
an 0.0107981220657277
this 0.016901408450704224
flu 0.00046948356807511736
covid 0.009389671361502348
summer 0.002347417840375587


### **Evaluating beam search**

In [65]:
# Beam search (beam size 10) from the prefix "<BOS1> <BOS2> trump", expanding
# each beam entry with its top next words. Each printed line is a generated
# sentence followed by its probability.
sentences, probs = language_model.generate_sentences(prefix="<BOS1> <BOS2> trump", beam=10, sampler=language_model.top_next_word)
for sent, prob in zip(sentences, probs):
	print(sent, prob)
print("#########################\n")
# Your first 3 lines of output should be exactly:
# <BOS1> <BOS2> trump eyes new unproven coronavirus treatment URL <EOS> 0.00021893147502903603
# <BOS1> <BOS2> trump eyes new unproven coronavirus cure URL <EOS> 0.0001719607222046247
# <BOS1> <BOS2> trump eyes new unproven virus cure promoted by mypillow ceo over unproven therapeutic URL <EOS> 9.773272077557522e-05

# Same search with the top-next-word sampler, starting from "<BOS1> <BOS2> biden".
sentences, probs = language_model.generate_sentences(prefix="<BOS1> <BOS2> biden", beam=10, sampler=language_model.top_next_word)
for sent, prob in zip(sentences, probs):
	print(sent, prob)
print("#########################\n")
# Your first 3 lines of output should be exactly:
# <BOS1> <BOS2> biden calls for a 30 bonus URL #cashgem #cashappfriday #stayathome <EOS> 0.0002495268686322749
# <BOS1> <BOS2> biden says all u.s. governors should mandate masks <EOS> 1.6894510541025754e-05
# <BOS1> <BOS2> biden says all u.s. governors question cost of a pandemic <EOS> 8.777606198953028e-07

# Beam search from "<BOS1> <BOS2> trump" again, but expanding each beam entry
# with randomly sampled next words, so the output can change between runs.
sentences, probs = language_model.generate_sentences(prefix="<BOS1> <BOS2> trump", beam=10, sampler=language_model.sample_next_word)
for sent, prob in zip(sentences, probs):
	print(sent, prob)
print("#########################\n")
# My first 3 lines of output look like this: (YOUR OUTPUT CAN BE DIFFERENT!)
# <BOS1> <BOS2> trump eyes new unproven coronavirus treatment URL <EOS> 0.00021893147502903603
# <BOS1> <BOS2> trump eyes new unproven coronavirus cure URL <EOS> 0.0001719607222046247
# <BOS1> <BOS2> trump eyes new unproven virus cure promoted by mypillow ceo over unproven therapeutic URL <EOS> 9.773272077557522e-05

# Sampled beam search from "<BOS1> <BOS2> biden" (no separator line is printed
# after this last run).
sentences, probs = language_model.generate_sentences(prefix="<BOS1> <BOS2> biden", beam=10, sampler=language_model.sample_next_word)
for sent, prob in zip(sentences, probs):
	print(sent, prob)
# My first 3 lines of output look like this: (YOUR OUTPUT CAN BE DIFFERENT!)
# <BOS1> <BOS2> biden is elected <EOS> 0.001236227651321991
# <BOS1> <BOS2> biden dropping ten points given trump a confidence trickster URL <EOS> 5.1049579351466146e-05
# <BOS1> <BOS2> biden dropping ten points given trump four years <EOS> 4.367575122292103e-05

<BOS1> <BOS2> trump eyes new unproven coronavirus treatment URL <EOS> 0.00021893147502903603
<BOS1> <BOS2> trump eyes new unproven coronavirus cure URL <EOS> 0.0001719607222046247
<BOS1> <BOS2> trump eyes new unproven virus cure promoted by mypillow ceo over unproven therapeutic URL <EOS> 9.773272077557522e-05
<BOS1> <BOS2> trump eyes new unproven coronavirus therapeutic mypillow creator over unproven therapeutic URL <EOS> 8.212549111137046e-05
<BOS1> <BOS2> trump eyes new unproven virus cure promoted by ben carson and mypillow founder URL <EOS> 1.2095697936835552e-05
<BOS1> <BOS2> trump eyes new unproven virus cure promoted by mypillow ceo over unproven therapeutic URL via @USER <EOS> 7.432226908194607e-06
<BOS1> <BOS2> trump eyes new unproven virus cure promoted by mypillow ceo over unproven and dangerous <EOS> 5.61685494684627e-06
<BOS1> <BOS2> trump eyes new unproven virus cure promoted by mypillow ceo over unproven and dangerous covid-19 treatment URL <EOS> 5.235550241426875e-06
<