# Lab 3, part 1: N-gram Language Models

Welcome to the third session of the NLP course. Today we will explore different approaches to language modelling. This session is divided into two parts. First, we will look at some classic approaches known as N-gram models. In the second part, we will use neural networks to model corpus text.

## Download WikiText-2 dataset

In [None]:
# Fetch the three WikiText-2 splits into the working directory.
# -nc (--no-clobber) skips files that already exist, so re-running this cell
# does not create train.txt.1, valid.txt.1, ... next to the originals.
! wget -nc https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/train.txt
! wget -nc https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt
! wget -nc https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/test.txt

## Helper function

In [None]:
from collections import Counter, defaultdict
import math
import copy
import random
import operator

def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    return [item for sublist in l for item in sublist]

# some helper functions
def prepare_data(filename):
    """Read a text corpus and tokenise it into lower-cased sentences.

    Every non-blank line becomes one sentence: it is stripped, lower-cased,
    split on whitespace and terminated with the end-of-sentence token '</s>'.

    Args:
        filename: path of the text file to read (decoded as UTF-8).

    Returns:
        A ``(vocab, data)`` tuple: ``vocab`` is the set of distinct tokens
        (including '</s>' when at least one sentence exists) and ``data`` is
        the list of sentences, each a list of tokens.
    """
    # The context manager closes the file even if reading fails; the explicit
    # encoding keeps the result independent of the platform default.
    with open(filename, encoding="utf-8") as corpus_file:
        data = [line.strip().lower().split() + ['</s>']
                for line in corpus_file if line.strip()]
    vocab = {token for sentence in data for token in sentence}
    return vocab, data

## N-Gram LM
A language model assigns a probability to each possible next word given a history of previous words (context). 

$P(w_t|w_{t-1}, w_{t-2}, ... , w_1)$

### Markov Assumption
Since reliably estimating the probability of a word given its entire history is not feasible (most long histories never occur in the training corpus), the Markov Assumption is introduced.

It assumes that each next word only depends on the previous K words (in an N-Gram language model, K = N-1).
- Unigram: $P(w_t|w_{t-1}, w_{t-2}, ... , w_1) = P(w_t)$
- Bigram: $P(w_t|w_{t-1}, w_{t-2}, ... , w_1) = P(w_t|w_{t-1}) $
- Trigram: $P(w_t|w_{t-1}, w_{t-2}, ... , w_1) = P(w_t|w_{t-1}, w_{t-2})$

For an N-Gram language model, the probability is calculated by counting the frequency:

$P(w_t|w_{t-1}, w_{t-2}, ... ,w_{t-N+1}) = \frac{C(w_t, w_{t-1}, w_{t-2}, ... ,w_{t-N+1} )}{C(w_{t-1}, w_{t-2}, ... ,w_{t-N+1})}$

In [None]:
class NGramLM():
    """Count-based N-gram language model with optional add-k smoothing.

    Sentences are lists of tokens. For N > 1 every sentence is left-padded
    with N-1 '<s>' tokens, and a context is stored as its tokens joined by a
    single space.

    Layout of ``prob`` (and of ``counts``):
        N == 1:  prob[next_word]
        N  > 1:  prob[context][next_word]
    """

    def __init__(self, N):
        self.N = N
        self.vocab = set()
        self.data = []
        self.prob = {}
        self.counts = defaultdict(Counter)
        self.smoothing_k = 0

    def train(self, vocab, data, smoothing_k=0):
        """Estimate the model from ``data``.

        Args:
            vocab: set of known tokens; it is copied, never modified.
            data: list of sentences, each a list of tokens.
            smoothing_k: the k of add-k smoothing (0 disables smoothing).
        """
        # Copy, so adding '<s>' below does not leak into the caller's set
        # (the same vocab object is typically reused to train several models).
        self.vocab = set(vocab)
        self.data = data
        self.smoothing_k = smoothing_k

        if self.N == 1:
            self.counts = Counter(word for sentence in data for word in sentence)
            self.prob = self.get_prob(self.counts)
        else:
            self.vocab.add('<s>')
            counts = self.count_ngram()
            self.prob = {context: self.get_prob(counter)
                         for context, counter in counts.items()}

    def count_ngram(self):
        """Count next-word occurrences per context over ``self.data``.

        Returns the ``counts[context][word]`` mapping and also stores it in
        ``self.counts``.
        """
        counts = defaultdict(Counter)
        padding = (self.N - 1) * ['<s>']
        for sentence in self.data:
            padded = padding + sentence
            for i in range(len(padded) - self.N + 1):
                context = " ".join(padded[i:i + self.N - 1])
                word = padded[i + self.N - 1]
                counts[context][word] += 1

        self.counts = counts
        return counts

    def get_prob(self, counter):
        """Normalise a counter into probabilities, applying add-k smoothing."""
        total = float(sum(counter.values()))
        k = self.smoothing_k
        denominator = total + len(self.vocab) * k
        return {word: (count + k) / denominator for word, count in counter.items()}

    def get_ngram_logprob(self, word, seq_len, context=""):
        """Return log P(word | context) divided by ``seq_len``.

        An unseen word or context gets probability 1/|V|, which avoids the
        log of zero while still penalising the unseen event.
        """
        if self.N == 1 and word in self.prob:
            prob = self.prob[word]
        elif self.N > 1 and not self._is_unseen_ngram(context, word):
            prob = self.prob[context][word]
        else:
            prob = 1 / len(self.vocab)
        return math.log(prob) / seq_len

    def get_ngram_prob(self, word, context=""):
        """Return P(word | context); ``context`` is ignored when N == 1.

        Seen events return the stored estimate. An unseen event returns the
        add-k share when smoothing is on and ``word`` is in the vocabulary,
        and 0 otherwise.
        """
        if self.N == 1 and word in self.prob:
            return self.prob[word]
        if self.N > 1 and not self._is_unseen_ngram(context, word):
            return self.prob[context][word]

        if word in self.vocab and self.smoothing_k > 0:
            if self.N == 1:
                # Unigram counts are a flat Counter: there is no per-context entry.
                context_total = sum(self.counts.values())
            else:
                # .get() avoids inserting an empty Counter for unseen contexts.
                seen = self.counts.get(context)
                context_total = sum(seen.values()) if seen else 0
            return self.smoothing_k / (context_total + self.smoothing_k * len(self.vocab))

        # unseen word or context, no smoothing
        return 0

    def perplexity(self, test_data):
        """Return the perplexity of ``test_data``.

        Perplexity is measured per sentence and averaged (in log space) over
        all sentences. Each sentence is padded with N-1 '<s>' tokens exactly as
        during training, so every word of the sentence is scored, including
        the first N-1 words and sentences shorter than N.

        Raises:
            ZeroDivisionError: if ``test_data`` is empty.
        """
        log_ppl = 0
        padding = (self.N - 1) * ['<s>']
        for sentence in test_data:
            seq_len = len(sentence)
            padded = padding + sentence
            for i in range(seq_len):
                context = " ".join(padded[i:i + self.N - 1])
                word = padded[i + self.N - 1]
                log_ppl += self.get_ngram_logprob(word=word, seq_len=seq_len, context=context)

        log_ppl /= len(test_data)
        return math.exp(-log_ppl)

    def _is_unseen_ngram(self, context, word):
        """Tell whether (context, word) has no stored probability (N > 1)."""
        return context not in self.prob or word not in self.prob[context]

    def _pad_and_split(self, context):
        """Prefix ``context`` with N-1 '<s>' tokens and split it into tokens."""
        return ((self.N - 1) * '<s> ' + context).split()

    def _context_window(self, tokens):
        """Return the last N-1 tokens (an empty list for a unigram model)."""
        return tokens[-(self.N - 1):] if self.N > 1 else []

    def _candidates(self, ngram_context):
        """Return the next-word distribution for a context, or None if unknown."""
        if self.N == 1:
            # A unigram model has a single, context-free distribution.
            return self.prob or None
        return self.prob.get(ngram_context)

    def generate_next(self, context, k):
        """Print the ``k`` most probable next words after ``context``."""
        tokens = self._pad_and_split(context)
        candidates = self._candidates(" ".join(self._context_window(tokens)))

        if candidates is None:
            print("Unseen context!")
            return

        ranked = sorted(candidates.items(), key=operator.itemgetter(1), reverse=True)
        shown_context = " ".join(tokens[self.N - 1:])
        for word, prob in ranked[:max(k, 0)]:
            print(shown_context + " " + word + "\t P={}".format(prob))

    def generate_next_n(self, context, n):
        """Extend ``context`` by up to ``n`` words with greedy search and print it.

        Generation stops early as soon as the current context is unseen.
        """
        tokens = self._pad_and_split(context)
        window = self._context_window(tokens)

        for _ in range(n):
            candidates = self._candidates(" ".join(window))
            if not candidates:
                break
            most_likely_next = max(candidates.items(), key=operator.itemgetter(1))[0]
            tokens.append(most_likely_next)
            if self.N > 1:
                window = window[1:] + [most_likely_next]
        print(" ".join(tokens[self.N - 1:]))
    

## Train with toy dataset

At this step, let's train a Bigram language model on the toy dataset.

In [None]:
# Toy corpus: three short sentences that share a few words
corpus = [
    "I like ice cream",
    "I like chocolate",
    "I hate beans",
]

# Lower-case and whitespace-tokenise each sentence, drop blank ones, and close
# every sentence with the end-of-sentence token
data = [tokens + ['</s>']
        for tokens in (sentence.lower().split() for sentence in corpus)
        if tokens]
vocab = set(flatten(data))
print(data)
print(vocab)

In [None]:
def print_probability(lm):
    """Print P(word | context) for every ordered pair of vocabulary tokens.

    Both ``context`` and ``word`` range over ``lm.vocab``; each probability
    comes from ``lm.get_ngram_prob(word, context)``. Tokens are visited in
    sorted order so the table is identical from run to run (iterating the raw
    set would follow the per-process string hash order).
    """
    tokens = sorted(lm.vocab)
    for context in tokens:
        for word in tokens:
            prob = lm.get_ngram_prob(word, context)
            print("P({}\t|{}) = {}".format(word, context, prob))
        print("--------------------------")

## Smoothing
The smoothing approach is used to deal with the sparsity problem in the N-Gram LM.
The probability mass is shifted towards the less frequent words.

For example, with an add-1 smoothing, the probability is calculated as:

$$P(w_t | context) = \frac{C(w_t, context)+1}{C(context)+|V|}$$

Q1: What is the disadvantage of smoothing?

A: If there are too many words with zero counts, the frequent words have to give up more of their probability mass, which might lead to higher perplexity on the test set.

In [None]:
# Bigram model estimated on the toy corpus, without smoothing (k = 0)
lm = NGramLM(N=2)
lm.train(vocab=vocab, data=data, smoothing_k=0)

######################################################
# Q2: try with add-1 smoothing and see the probability
######################################################
# lm.train(vocab, data, smoothing_k=1)

# Show P(word | context) for every pair of vocabulary tokens
print_probability(lm)

## Train on WikiText-2 dataset and evaluate perplexity
### Evaluating perplexity

Q3: Why do we need to calculate log perplexity?

A:

$ PPL(W) = P(w_1, w_2, ... , w_n)^{-\frac{1}{n}}$

$ log(PPL(W)) = -\frac{1}{n}\sum^n_{k=1}log(P(w_k|w_1, w_2, ... , w_{k-1}))$

In [None]:
# Load the three splits; only the vocabulary of the training split is kept
vocab, train_data = prepare_data(filename='train.txt')
_, valid_data = prepare_data(filename='valid.txt')
_, test_data = prepare_data(filename='test.txt')

# Size of the training vocabulary
print(len(vocab))

In [None]:
# Trigram model on the WikiText-2 training split (no smoothing)
lm = NGramLM(N=3)
lm.train(vocab=vocab, data=train_data)

In [None]:
# Perplexity on the validation split, then on the test split (one per line)
print(*(lm.perplexity(split) for split in (valid_data, test_data)), sep="\n")

## Generating sentences
With a pre-trained N-Gram language model, we can predict possible next words given context.

In [None]:
# Reference sentence from the validation split that the context is taken from
print(" ".join(valid_data[12]))

# A trigram model conditions on the last two words only, i.e. ["where", "they"]
context = "the eggs hatch at night , and the larvae swim to the water surface where they"

# Print the 3 most probable continuations of the context
lm.generate_next(context=context, k=3)


In [None]:
# Contexts with fewer than N-1 words work as well, including the empty one
contexts = ["the eggs", "the", ""]

for context in contexts:
    lm.generate_next(context=context, k=3)
    print("---")

In [None]:
context = "the eggs hatch at night , and the larvae swim to the water surface where they"

# Greedy search: extend the context by (at most) 10 words
lm.generate_next_n(context=context, n=10)

# Greedy search is not a good method in practice: a wrong prediction in an
# early step propagates its error to all following predictions
lm.generate_next_n(context=context, n=20)

## Effect of N

Q4: Why does the perplexity increase on the validation and test data when N is large?

A:

In [None]:
# Train a unigram up to a 5-gram model and compare perplexity on all splits
for n in range(1, 6):
    lm = NGramLM(N=n)
    lm.train(vocab=vocab, data=train_data)

    print("************************")
    train_ppl = lm.perplexity(train_data)
    print("{}-gram LM perplexity on train set: {}".format(n, train_ppl))
    valid_ppl = lm.perplexity(valid_data)
    print("{}-gram LM perplexity on valid set: {}".format(n, valid_ppl))
    test_ppl = lm.perplexity(test_data)
    print("{}-gram LM perplexity on test  set: {}".format(n, test_ppl))

## Interpolation
In interpolation, we mix the probability estimates from all the n-gram estimators to alleviate the sparsity problem.

In [None]:
class InterpolateNGramLM(NGramLM):
    """Linear interpolation of the N-gram, (N-1)-gram, ..., unigram models.

    The mixed probability is sum_i lambdas[i] * P_i(word | context_i), where
    lambdas[0] weights the highest-order model and context_i is the part of
    the context that model i can use (its last order-1 words).
    """

    def __init__(self, N):
        super(InterpolateNGramLM, self).__init__(N)
        self.ngram_lms = []
        self.lambdas = []

    def train(self, vocab, data, smoothing_k=0, lambdas=None):
        """Train one NGramLM per order N, N-1, ..., 1.

        Args:
            vocab: set of known tokens; it is copied, never modified.
            data: list of sentences, each a list of tokens.
            smoothing_k: add-k smoothing constant passed to every sub-model.
            lambdas: N mixing weights, highest order first, summing to 1.

        Raises:
            ValueError: if ``lambdas`` does not hold N weights summing to 1.
        """
        lambdas = [] if lambdas is None else list(lambdas)
        if len(lambdas) != self.N:
            raise ValueError(
                "expected {} interpolation weights, got {}".format(self.N, len(lambdas)))
        # abs() matters: without it any weights summing to less than 1 pass.
        if abs(sum(lambdas) - 1) > 1e-9:
            raise ValueError(
                "interpolation weights must sum to 1, got {}".format(sum(lambdas)))

        self.vocab = set(vocab)
        self.lambdas = lambdas
        # Start from scratch so that retraining does not keep stale sub-models.
        self.ngram_lms = []

        for order in range(self.N, 0, -1):
            lm = NGramLM(order)
            print("Training {}-gram language model".format(order))
            # Each sub-model gets its own copy of the vocabulary so that none
            # of them can modify the caller's set.
            lm.train(set(vocab), data, smoothing_k)
            self.ngram_lms.append(lm)

    def get_ngram_logprob(self, word, seq_len, context=""):
        """Return the log of the interpolated P(word | context) over ``seq_len``.

        If every component assigns probability 0, 1/|V| is used instead so the
        logarithm stays defined.
        """
        context_words = context.split()
        prob = 0
        for coef, lm in zip(self.lambdas, self.ngram_lms):
            usable = lm.N - 1
            # A slice [-0:] would return the whole context, so the unigram
            # model (usable == 0) is given an explicit empty context.
            cutted_context = " ".join(context_words[-usable:]) if usable > 0 else ""
            prob += coef * lm.get_ngram_prob(context=cutted_context, word=word)

        if prob <= 0:
            prob = 1 / len(self.vocab)
        return math.log(prob) / seq_len
        

In [None]:
###################################################
# Q5: tune your coefficients to decrease perplexity
###################################################
# One weight per order, highest order first: trigram, bigram, unigram
ilm = InterpolateNGramLM(N=3)
ilm.train(vocab=vocab, data=train_data, lambdas=[0.5, 0.4, 0.1])

In [None]:
# Perplexity of the interpolated model on the validation split, then the test split
print(*(ilm.perplexity(split) for split in (valid_data, test_data)), sep="\n")