In [32]:
import io, sys, math, re
from collections import defaultdict
import numpy as np

In [33]:
# dataloader

def load_data(filename):
    fin = io.open(filename, 'r', encoding='utf-8')
    data = []
    vocab = defaultdict(lambda:0)
    for line in fin:
        sentence = line.split()
        data.append(sentence)
        for word in sentence:
            vocab[word] += 1
    return data, vocab

In [34]:
data, vocab = load_data("train2.txt")

In [35]:
data[:3]

[['<s>', 'i', 'liked', 'your', 'idea', 'and', 'adopted', 'it', '.', '</s>'],
 ['<s>', 'you', 'are', 'wrong', ',', 'however', '.', '</s>'],
 ['<s>', 'how', 'soon', 'will', 'this', 'laundry', 'be', 'ready', '?', '</s>']]

In [36]:
vocab

defaultdict(<function __main__.load_data.<locals>.<lambda>()>,
            {'<s>': 180000,
             'i': 40771,
             'liked': 123,
             'your': 6741,
             'idea': 528,
             'and': 11784,
             'adopted': 36,
             'it': 16913,
             '.': 165951,
             '</s>': 180000,
             'you': 25651,
             'are': 8776,
             'wrong': 564,
             ',': 27562,
             'however': 181,
             'how': 3195,
             'soon': 971,
             'will': 6205,
             'this': 10554,
             'laundry': 31,
             'be': 7764,
             'ready': 256,
             '?': 15744,
             'is': 31094,
             'said': 1561,
             'that': 14245,
             'nobody': 275,
             'has': 5686,
             'solved': 58,
             'the': 78598,
             'problem': 826,
             'yet': 523,
             'our': 2858,
             'project': 180,
             'crashed': 

In [37]:
def remove_rare_words(data, vocab, mincount):
    ## FILL CODE
    # replace words in data that are not in the vocab 
    # or have a count that is below mincount
    data_with_unk = data[:]
    
    for i in range(len(data)):
        for word in range(len(data[i])):
            if vocab[data[i][word]] < mincount:
                data_with_unk[i][word] = "unk"
                
    return data_with_unk

In [38]:
# LOAD DATA

train_data, vocab = load_data("train2.txt")
## FILL CODE 
# If you have a Out of Vocabulary error (OOV) 
# call the function "remove_rare_words" to replace 
# rare words with <unk> in the dataset
remove_rare_words(train_data, vocab, 100)

print("load validation set")
valid_data, _ = load_data("valid2.txt")
remove_rare_words(valid_data, vocab, 100)

# If you have a Out of Vocabulary error (OOV) 
# call the function "remove_rare_words" to replace 
# OOV with <unk> in the dataset

load validation set


[['<s>',
  'the',
  'unk',
  'horse',
  'was',
  'finally',
  'broken',
  'by',
  'the',
  'patient',
  '<unk>',
  '.',
  '</s>'],
 ['<s>', 'john', 'moved', 'forward', 'to', 'the', 'gate', '.', '</s>'],
 ['<s>', 'many', 'thanks', 'for', 'this', 'wonderful', 'trip', '.', '</s>'],
 ['<s>', 'never', 'give', 'up', 'till', 'the', 'very', 'end', '.', '</s>'],
 ['<s>', 'use', 'your', 'head', 'for', 'a', 'change', '.', '</s>'],
 ['<s>',
  '<unk>',
  'is',
  'a',
  'unk',
  'condition',
  'that',
  '<unk>',
  'the',
  '<unk>',
  '.',
  '</s>'],
 ['<s>',
  'i',
  'learned',
  'to',
  'study',
  'and',
  'play',
  'like',
  'my',
  'japanese',
  'friends',
  '.',
  '</s>'],
 ['<s>',
  'i',
  'don',
  "'t",
  'care',
  'where',
  'we',
  'eat',
  'dinner',
  '.',
  'it',
  "'s",
  'unk',
  'up',
  'to',
  'you',
  '.',
  '</s>'],
 ['<s>', 'the', 'price', 'rose', '.', '</s>'],
 ['<s>',
  'i',
  'entered',
  'a',
  'restaurant',
  'and',
  'had',
  'lunch',
  '.',
  '</s>'],
 ['<s>', 'tom', 'unk', '

In [39]:
# Function to build a bigram model

def build_bigram(data):
    unigram_counts = defaultdict(lambda:0)
    bigram_counts  = defaultdict(lambda: defaultdict(lambda: 0.0))
    total_number_words = 0

    ## FILL CODE
    # Store the unigram and bigram counts as well as the total 
    # number of words in the dataset
    
    #unigram
    for sentence in data:
        for idx, word in enumerate(sentence):
            unigram_counts[word] += 1
            total_number_words+=1
            if idx < len(sentence)-1:
                bigram_counts[word][sentence[idx+1]] += 1
        

    unigram_prob = defaultdict(lambda:0)
    bigram_prob = defaultdict(lambda: defaultdict(lambda: 0.0))

    ## FILL CODE
    # Build unigram and bigram probabilities from counts
    
    for sentence in data:
        for idx, word in enumerate(sentence):
            unigram_prob[word] = (1.*unigram_counts[word])/total_number_words
            if idx<len(sentence) -1:
                bigram_prob[word][sentence[idx+1]] = (1.*bigram_counts[word][sentence[idx+1]])/unigram_counts[word]

    return {'bigram': bigram_prob, 'unigram': unigram_prob}

In [40]:
# RUN TO BUILD BIGRAM MODEL

print("build bigram model")
model = build_bigram(train_data)
model

build bigram model


{'bigram': defaultdict(<function __main__.build_bigram.<locals>.<lambda>()>,
             {'<s>': defaultdict(<function __main__.build_bigram.<locals>.<lambda>.<locals>.<lambda>()>,
                          {'i': 0.15040555555555554,
                           'you': 0.029344444444444444,
                           'how': 0.009222222222222222,
                           'it': 0.03631111111111111,
                           'our': 0.003672222222222222,
                           'two': 0.0007833333333333334,
                           'one': 0.0022444444444444443,
                           'the': 0.10031666666666667,
                           'she': 0.04811666666666667,
                           'this': 0.021966666666666666,
                           'unk': 0.028788888888888888,
                           'god': 0.00035555555555555557,
                           'can': 0.0048222222222222225,
                           'don': 0.007633333333333333,
                           'he': 0.

# Perplexity

In NLP, the perplexity metric is a way to capture the degree of 'uncertainty' a model has in predicting(assigning probabilities to)some text. The lower the perplexity, the higher the probability hence the better the model. Related to [Shannon's entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory))

In [41]:
def get_prob(model, w1, w2):
    assert model["unigram"][w2] != 0, "Out of Vocabulary word!"
    ## FILL CODE
    # Should return the probability of the bigram (w1w2) if it exists
    # Else it return the probility of unigram (w2) multiply by 0.4
    
    prob = model["bigram"][w1][w2]
    
    if prob == 0:
        prob = 0.4 * model["unigram"][w2]
    
    return prob

def perplexity(model, data):
    ## FILL CODE
    # follow the formula in the slides
    # call the function get_prob to get P(w2 | w1)
    T = 0
    log_sum = 0
    
    for sentence in data:
        prev_word = sentence[0]
        for word in sentence[1:]:
            log_sum += np.log(get_prob(model, prev_word, word))
            prev_word = word
        T+= len(sentence)
    perp = -(log_sum/T)
    return perp

In [42]:
# COMPUTE PERPLEXITY ON VALIDATION SET
print("The perplexity is", perplexity(model, valid_data))

The perplexity is 3.238405910059498


In [43]:
def generate(model):
    sentence = ["<s>"]
    ## FILL CODE
    # generate a sentence. A sentence starts with a <s> and ends with a </s>
    # Possiblly a use function is:
    #   np.random.choice(x, 1, p = y)
    # where x is a list of things to sample from
    # and y is a list of probability (of the same length as x)
    p = sentence[0]
    bigram = model["bigram"]
    
    while True:
        p = np.random.choice(list(bigram[p].keys()), 1, p = list(bigram[p].values()))[0]
        sentence.append(p)
        
        if p == "</s>":break
    return sentence

In [44]:
# GENERATE A SENTENCE FROM THE MODEL

print("Generated sentence: ",generate(model))

Generated sentence:  ['<s>', 'tom', 'had', 'no', 'problem', '.', '</s>']
