In [1]:
import math

In [2]:
def read_data(path):
    f = open(path, "r")
    data = f.readlines()
    return data

In [3]:
# Load data
with open('1b_benchmark.train.tokens', 'r', encoding='utf-8') as f:
    train_data = f.readlines()
with open('1b_benchmark.dev.tokens', 'r', encoding='utf-8') as f:
    dev_data = f.readlines()
with open('1b_benchmark.test.tokens', 'r', encoding='utf-8') as f:
    test_data = f.readlines()
print(len(train_data))
print(len(dev_data))
print(len(test_data))

61530
12216
12105


# Task 1- Programming N gram language model.

In [4]:
def unigram_token_freqeuncy(data, cut_off):
  processed_data = []
  vocab = dict({"UNK" : 0})

  # tokenize and add stop to sentences
  for sent in data:
    sent = sent.split()
    sent = sent + ["STOP"]
    processed_data.append(sent)

  # make a dictionary of token frequencies
  for sents in processed_data:
    for word in sents:
      if (word in vocab.keys()):
        vocab[word] += 1
      else:
        vocab[word] = 1

  # if there are words in the token freq dict that occur less than 3 times, add to UNK count instead
  for val in vocab.values():
    if val == cut_off:
      vocab["UNK"] += val

  # remove the out of vocab words in the token freqeuncy dict
  keys = list(vocab.keys())
  for key in keys:
    val = vocab[key]
    if val < cut_off:
      del vocab[key]
  total_words = sum(vocab.values())
  return vocab, total_words

In [5]:
def prob(freq_dict, total_words):
  token_prob = dict({})
  for key in freq_dict.keys():
    token_prob[key] = freq_dict[key]/total_words
  return token_prob


In [6]:
def process_data(data, vocab_dict):
  processed_data = []
# tokenize and add start and stop to sentences
  for sent in data:
    sent = sent.split()
    sent = ["START"] + sent + ["STOP"]
    processed_data.append(sent)

  # replace out of vocab words with UNK using the unigram frequency dict
  final_data = []
  for sent in processed_data:
    for index, token in enumerate(sent):
      if token not in vocab_dict.keys():
        sent[index] = "UNK"
    final_data.append(sent)
  return final_data

In [7]:
def predict(data, token_prob):
  total_words = 0
  mle = 0
  for sent in data:
    for word in sent[1:]:
      total_words += 1
      if (word in token_prob.keys()):
        mle += math.log2(token_prob[word])
      else:
        mle += math.log2(token_prob["UNK"])
  return total_words, mle

Bigram Token Frequencies

In [8]:
def bigram_token_freqeuncy(processed_data):
  bigram = dict({"UNK":0})

  # make a dictionary of bigram token frequencies
  for sent in processed_data:
    for index, word in enumerate(sent[:-1]):
      if ((word, sent[index + 1]) in bigram.keys()):
        bigram[(word, sent[index + 1])] += 1
      else:
        bigram[(word, sent[index + 1])] = 1

  return bigram

In [9]:
def prob_bigram(bigram_token_freq,uni_token_freq):
  bigram_prob = dict({})
  for key in bigram_token_freq.keys():
    if(key[0] == "START"):
      bigram_prob[key] = bigram_token_freq[key] / uni_token_freq["STOP"]
    elif(key[0] in uni_token_freq.keys()):
      bigram_prob[key] = bigram_token_freq[key] / uni_token_freq[key[0]]
    else:
      bigram_prob[key] = bigram_token_freq[key] / uni_token_freq["UNK"]
  return bigram_prob


In [10]:
def process_data_bigram(data, vocab_dict):
  processed_data = []

  # tokenize and add start and stop to sentences
  for sent in data:
    sent = sent.split()
    sent = ["START"] + sent + ["STOP"]
    processed_data.append(sent)

  # replace out of vocab words with UNK using the unigram frequency dict
  unk_data = []
  for sent in processed_data:
    for index, token in enumerate(sent):
      if (token not in vocab_dict):
        sent[index] = "UNK"
    unk_data.append(sent)

  final_data = []
  for sent in unk_data:
    bigrams = []
    for index, token in enumerate(sent[:-1]):
      bigrams.append((token, sent[index+1]))
    final_data.append(bigrams)

  return final_data

Trigram Token Freqeuncies

In [11]:
def process_tri_training(data, vocab_dict):
  processed_data = []

  # tokenize and add start and stop to sentences
  for sent in data:
    sent = sent.split()
    sent = ["START", "START"] + sent + ["STOP"]
    processed_data.append(sent)

  # replace out of vocab words with UNK using the unigram frequency dict
  final_data = []
  for sent in processed_data:
    for index, token in enumerate(sent):
      if token not in vocab_dict.keys():
        sent[index] = "UNK"
    final_data.append(sent)
  return final_data

In [12]:
def trigram_token_freqeuncy(processed_data):
  trigram = dict({"UNK":0})

  # make a dictionary of trigram token frequencies
  for sent in processed_data:
    for index, word in enumerate(sent[:-2]):
      if ((word, sent[index + 1], sent[index + 2]) in trigram.keys()):
        trigram[(word, sent[index + 1], sent[index + 2])] += 1
      else:
        trigram[(word, sent[index + 1], sent[index + 2])] = 1
  return trigram

In [13]:
def prob_trigram(trigram_token_freq,bigram_token_freq, unigram_token_freq):
  trigram_prob = dict({})
  for key in trigram_token_freq.keys():
    if((key[0], key[1]) == ("START", "START")):
      trigram_prob[key] = trigram_token_freq[key] / unigram_token_freq["STOP"]
    elif((key[0], key[1]) in bigram_token_freq.keys()):
      trigram_prob[key] = trigram_token_freq[key] / bigram_token_freq[(key[0], key[1])]
    else:
      trigram_prob[key] = 0

  return trigram_prob

In [14]:
def process_data_trigram(data, vocab_dict):
  processed_data = []

  # tokenize and add start and stop to sentences
  for sent in data:
    sent = sent.split()
    sent = ["START", "START"] + sent + ["STOP"]
    processed_data.append(sent)

  # replace out of vocab words with UNK using the unigram frequency dict
  unk_data = []
  for sent in processed_data:
    for index, token in enumerate(sent):
      if (token not in vocab_dict):
        sent[index] = "UNK"
    unk_data.append(sent)

  final_data = []
  for sent in unk_data:
    trigrams = []
    for index, token in enumerate(sent[:-2]):
      trigrams.append((token, sent[index + 1], sent[index + 2]))
    final_data.append(trigrams)

  return final_data

In [15]:
def bigram_predict(data, token_prob):
  total_words = 0
  mle = 0
  for sent in data:
    for bigram in sent:
      total_words += 1
      if (bigram in token_prob.keys()):
        mle += math.log2(token_prob[bigram])
      else:
        mle = -math.inf
  return total_words, mle

In [16]:
def trigram_predict(data, token_prob):
  total_words = 0
  mle = 0
  for sent in data:
    for trigram in sent:
      total_words += 1
      if (trigram in token_prob.keys()):
        mle += math.log2(token_prob[trigram])
      else:
        mle = -math.inf
  return total_words, mle

Perplexity

In [17]:
def perplexity(total_words, mle):
  perp = 2 ** ((-1 / total_words) * mle)
  return perp

In [18]:
# train model ungirams
unigram_vocab_dict, total_words = unigram_token_freqeuncy(train_data, 3)
print(total_words, len(train_data))
unigram_token_prob = prob(unigram_vocab_dict, total_words)

1574164 61530


In [19]:
# train model bigrams
processed_train = process_data(train_data, unigram_vocab_dict)
bigram_vocab_dict = bigram_token_freqeuncy(processed_train)
bigram_token_prob = prob_bigram(bigram_vocab_dict, unigram_vocab_dict)

In [20]:
# train model trigrams
processed_train_tri = process_tri_training(train_data, unigram_vocab_dict)
trigram_vocab_dict = trigram_token_freqeuncy(processed_train_tri)
trigram_token_prob = prob_trigram(trigram_vocab_dict, bigram_vocab_dict, unigram_vocab_dict)

In [21]:
# predict train unigram
total_unigrams_train, unigram_mle_train = predict(processed_train, unigram_token_prob)
unigram_perplexity_train = perplexity(total_unigrams_train, unigram_mle_train)
print("Unigram perplexity for train", unigram_perplexity_train)

# predict train bigrams
processed_train_bigram = process_data_bigram(train_data, unigram_vocab_dict)
total_bigrams_train, bigram_mle_train = bigram_predict(processed_train_bigram, bigram_token_prob)
bigram_perplexity_train = perplexity(total_bigrams_train, bigram_mle_train)
print("Bigram perplexity for train", bigram_perplexity_train)

# predict train trigrams
processed_train_trigram = process_data_trigram(train_data, unigram_vocab_dict)
total_trigrams_train, trigram_mle_train = trigram_predict(processed_train_trigram, trigram_token_prob)
trigram_perplexity_train = perplexity(total_trigrams_train, trigram_mle_train)
print("Trigram perplexity for train", trigram_perplexity_train)


Unigram perplexity for train 1001.7500328769861
Bigram perplexity for train 72.87810137763663
Trigram perplexity for train 7.873059029042192


On Dev Data

In [22]:
# predict dev set unigram
processed_dev = process_data(dev_data, unigram_vocab_dict)
total_unigrams_dev, unigram_mle_dev = predict(processed_dev, unigram_token_prob)
unigram_perplexity_dev = perplexity(total_unigrams_dev, unigram_mle_dev)
print("Unigram perplexity for dev", unigram_perplexity_dev)

Unigram perplexity for dev 928.7011934138095


In [23]:
# predict dev set bigrams
processed_dev_bigram = process_data_bigram(dev_data, unigram_vocab_dict)
total_bigrams_dev, bigram_mle_dev = bigram_predict(processed_dev_bigram, bigram_token_prob)
bigram_perplexity_dev = perplexity(total_bigrams_dev, bigram_mle_dev)
print("Bigram perplexity for dev", bigram_perplexity_dev)

Bigram perplexity for dev inf


In [24]:
# predict dev set trigrams
processed_dev_trigram = process_data_trigram(dev_data, unigram_vocab_dict)
total_trigrams_dev, trigram_mle_dev = trigram_predict(processed_dev_trigram, trigram_token_prob)
trigram_perplexity_dev = perplexity(total_trigrams_dev, trigram_mle_dev)
print("Trigram perplexity for dev", trigram_perplexity_dev)


Trigram perplexity for dev inf


On Test Data

In [25]:
# predict test unigram
processed_test = process_data(test_data, unigram_vocab_dict)
total_unigrams_test, unigram_mle_test = predict(processed_test, unigram_token_prob)
unigram_perplexity_test = perplexity(total_unigrams_test, unigram_mle_test)
print("Unigram perplexity for test", unigram_perplexity_test)

Unigram perplexity for test 932.0903293095981


In [26]:
# predict test bigrams
processed_test_bigram = process_data_bigram(test_data, unigram_vocab_dict)
total_bigrams_test, bigram_mle_test = bigram_predict(processed_test_bigram, bigram_token_prob)
bigram_perplexity_test = perplexity(total_bigrams_test, bigram_mle_test)
print("Bigram perplexity for test", bigram_perplexity_test)

Bigram perplexity for test inf


In [27]:
# predict test trigrams
processed_test_trigram = process_data_trigram(test_data, unigram_vocab_dict)
total_trigrams_test, trigram_mle_test = trigram_predict(processed_test_trigram, trigram_token_prob)
trigram_perplexity_test = perplexity(total_trigrams_test, trigram_mle_test)
print("Trigram perplexity for dev", trigram_perplexity_test)

Trigram perplexity for dev inf


# Task 2 - LINEAR INTERPOLATION

In [28]:
def linear_interpolation(data, unigram_vocab_dict, bigram_vocab_dict, trigram_vocab_dict, unigram_token_prob, bigram_token_prob, trigram_token_prob, lambda1, lambda2, lambda3):
  processed_data_uni = process_data(data, unigram_vocab_dict)
  processed_data_bi = process_data_bigram(data, unigram_vocab_dict)
  processed_data_tri = process_data_trigram(data, unigram_vocab_dict)

  linear_interpolation_mle = 0
  total_words = 0

  for i in range(0, len(processed_data_uni)):
    for j in range (0, len(processed_data_uni[i])-1):
      total_words += 1

      if(processed_data_uni[i][j+1] in unigram_token_prob.keys()):
        uni_prob = unigram_token_prob[processed_data_uni[i][j+1]]
      else:
        uni_prob = 0
      if(processed_data_bi[i][j] in bigram_token_prob.keys()):
        bi_prob = bigram_token_prob[processed_data_bi[i][j]]
      else:
        bi_prob = 0
      if(processed_data_tri[i][j] in trigram_token_prob.keys()):
        tri_prob = trigram_token_prob[processed_data_tri[i][j]]
      else:
        tri_prob = 0
      linear_interpolation_mle += math.log2((uni_prob * lambda1) + (bi_prob * lambda2) + (tri_prob * lambda3))
  return linear_interpolation_mle, total_words

* λ1 = 0.3, λ2 = 0.3, λ3 = 0.4
λ1 = 0.4, λ2 = 0.3, λ3 = 0.2
λ1 = 0.2, λ2 = 0.3, λ3 = 0.4
λ1 = 0.3, λ2 = 0.2, λ3 = 0.4
λ1 = 0.4, λ2 = 0.4, λ3 = 0.2
λ1 = 0.5, λ2 = 0.3, λ3 = 0.2

In [29]:
#Model 1
#Perplexity of train with lambda values λ1 = 0.4, λ2 = 0.3, λ3 = 0.2
train_1, total_words = linear_interpolation(train_data, unigram_vocab_dict, bigram_vocab_dict, trigram_vocab_dict, unigram_token_prob, bigram_token_prob, trigram_token_prob, 0.4, 0.3, 0.2)
train_perp_1 = perplexity(total_words, train_1)
print("Perplexity of train with lambda values 0.4, 0.3, 0.2: ", train_perp_1)

#Perplexity of dev with lambda values λ1 = 0.4, λ2 = 0.3, λ3 = 0.2
dev_1, total_words1 = linear_interpolation(dev_data, unigram_vocab_dict, bigram_vocab_dict, trigram_vocab_dict, unigram_token_prob, bigram_token_prob, trigram_token_prob, 0.4, 0.3, 0.2)
dev_perp_1 = perplexity(total_words1, dev_1)
print("Perplexity of dev with lambda values 0.4, 0.3, 0.2:   ", dev_perp_1, "\n")

Perplexity of train with lambda values 0.4, 0.3, 0.2:  25.151237165008716
Perplexity of dev with lambda values 0.4, 0.3, 0.2:    297.2434258749143 



In [30]:
#Model 2
#Perplexity of train with lambda values λ1 = 0.2, λ2 = 0.3, λ3 = 0.4
train_1, total_words = linear_interpolation(train_data, unigram_vocab_dict, bigram_vocab_dict, trigram_vocab_dict, unigram_token_prob, bigram_token_prob, trigram_token_prob, 0.2, 0.3, 0.4)
train_perp_1 = perplexity(total_words, train_1)
print("Perplexity of train with lambda values 0.2, 0.3, 0.4: ", train_perp_1)

#Perplexity of dev with lambda values λ1 = 0.4, λ2 = 0.3, λ3 = 0.2
dev_1, total_words1 = linear_interpolation(dev_data, unigram_vocab_dict, bigram_vocab_dict, trigram_vocab_dict, unigram_token_prob, bigram_token_prob, trigram_token_prob, 0.2, 0.3, 0.4)
dev_perp_1 = perplexity(total_words1, dev_1)
print("Perplexity of dev with lambda values 0.2, 0.3, 0.4:   ", dev_perp_1, "\n")

Perplexity of train with lambda values 0.2, 0.3, 0.4:  15.296423833195202
Perplexity of dev with lambda values 0.2, 0.3, 0.4:    315.343570028335 



In [31]:
#Model 3
#Perplexity of train with lambda values λ1 = 0.3, λ2 = 0.2, λ3 = 0.4
train_1, total_words = linear_interpolation(train_data, unigram_vocab_dict, bigram_vocab_dict, trigram_vocab_dict, unigram_token_prob, bigram_token_prob, trigram_token_prob, 0.3, 0.2, 0.4)
train_perp_1 = perplexity(total_words, train_1)
print("Perplexity of train with lambda values 0.4, 0.3, 0.2: ", train_perp_1)

#Perplexity of dev with lambda values λ1 = 0.3, λ2 = 0.2, λ3 = 0.4
dev_1, total_words1 = linear_interpolation(dev_data, unigram_vocab_dict, bigram_vocab_dict, trigram_vocab_dict, unigram_token_prob, bigram_token_prob, trigram_token_prob, 0.3, 0.2, 0.4)
dev_perp_1 = perplexity(total_words1, dev_1)
print("Perplexity of dev with lambda values 0.4, 0.3, 0.2:   ", dev_perp_1, "\n")

Perplexity of train with lambda values 0.4, 0.3, 0.2:  16.07853285728428
Perplexity of dev with lambda values 0.4, 0.3, 0.2:    327.73955214115443 



In [32]:
#Model 4
#Perplexity of train with lambda values λ1 = 0.4, λ2 = 0.4, λ3 = 0.2
train_1, total_words = linear_interpolation(train_data, unigram_vocab_dict, bigram_vocab_dict, trigram_vocab_dict, unigram_token_prob, bigram_token_prob, trigram_token_prob, 0.4, 0.4, 0.2)
train_perp_1 = perplexity(total_words, train_1)
print("Perplexity of train with lambda values 0.4, 0.4, 0.2: ", train_perp_1)

#Perplexity of dev with lambda values λ1 = 0.4, λ2 = 0.4, λ3 = 0.2
dev_1, total_words1 = linear_interpolation(dev_data, unigram_vocab_dict, bigram_vocab_dict, trigram_vocab_dict, unigram_token_prob, bigram_token_prob, trigram_token_prob, 0.4, 0.4, 0.2)
dev_perp_1 = perplexity(total_words1, dev_1)
print("Perplexity of dev with lambda values 0.4, 0.4, 0.2:   ", dev_perp_1, "\n")

Perplexity of train with lambda values 0.4, 0.4, 0.2:  23.286791883635704
Perplexity of dev with lambda values 0.4, 0.4, 0.2:    259.4630274937098 



In [33]:
#Model 5
#Perplexity of train with lambda values λ1 = 0.5, λ2 = 0.3, λ3 = 0.2
train_1, total_words = linear_interpolation(train_data, unigram_vocab_dict, bigram_vocab_dict, trigram_vocab_dict, unigram_token_prob, bigram_token_prob, trigram_token_prob, 0.5, 0.3, 0.2)
train_perp_1 = perplexity(total_words, train_1)
print("Perplexity of train with lambda values 0.5, 0.3, 0.2: ", train_perp_1)

#Perplexity of dev with lambda values λ1 = 0.4, λ2 = 0.4, λ3 = 0.2
dev_1, total_words1 = linear_interpolation(dev_data, unigram_vocab_dict, bigram_vocab_dict, trigram_vocab_dict, unigram_token_prob, bigram_token_prob, trigram_token_prob, 0.5, 0.3, 0.2)
dev_perp_1 = perplexity(total_words1, dev_1)
print("Perplexity of dev with lambda values 0.5, 0.3, 0.2:   ", dev_perp_1, "\n")

Perplexity of train with lambda values 0.5, 0.3, 0.2:  24.81205153217311
Perplexity of dev with lambda values 0.5, 0.3, 0.2:    274.43782125162784 



In [34]:
#Report the training and development perplexity for the values λ1 = 0.3, λ2 = 0.3, λ3 = 0.4.
#Perplexity of train with lambda values λ1 = 0.3, λ2 = 0.3, λ3 = 0.4
train_1, total_words = linear_interpolation(train_data, unigram_vocab_dict, bigram_vocab_dict, trigram_vocab_dict, unigram_token_prob, bigram_token_prob, trigram_token_prob, 0.3, 0.3, 0.4)
train_perp_1 = perplexity(total_words, train_1)
print("Perplexity of train with lambda values 0.3, 0.3, 0.4: ", train_perp_1)

#Perplexity of dev with lambda values λ1 = 0.4, λ2 = 0.4, λ3 = 0.2
dev_1, total_words1 = linear_interpolation(dev_data, unigram_vocab_dict, bigram_vocab_dict, trigram_vocab_dict, unigram_token_prob, bigram_token_prob, trigram_token_prob, 0.3, 0.3, 0.4)
dev_perp_1 = perplexity(total_words1, dev_1)
print("Perplexity of dev with lambda values 0.3, 0.3, 0.4:   ", dev_perp_1, "\n")

Perplexity of train with lambda values 0.3, 0.3, 0.4:  15.13822057991001
Perplexity of dev with lambda values 0.3, 0.3, 0.4:    278.8498077733136 

