# Statistical Language Modeling with NLTK

#### Exercise 1

- Load Shakespeare's Hamlet from Gutenberg corpus
    - lowercase it

- Extract padded unigrams and bigrams

- Using NgramCounter
    - get total number of ngrams
    - get count of unigram `the`
    - get count of bigram `of the`
    
|                     | Count  | 
|---------------------|---------|
| Ngrams      | 84038     | 
| Unigram *the* | 993|
| Bigram *of the*     |59 |

In [None]:
from nltk.corpus import gutenberg
# NgramCounter is exported by nltk.lm, not by nltk.lm.preprocessing.
from nltk.lm import NgramCounter
from nltk.lm.preprocessing import padded_everygram_pipeline

# Hamlet as a sequence of sentences, each sentence being a sequence of tokens.
hamlet = gutenberg.sents('shakespeare-hamlet.txt')

print(len(hamlet))  # number of sentences
print(hamlet[0])
# Lowercase every token with str.lower() so counts do not depend on casing.
hamlet_lowercase = [[w.lower() for w in sent] for sent in hamlet]
print(hamlet_lowercase[0])

In [None]:
# Extract padded unigrams and bigrams (everygrams up to order 2) per sentence.
# NOTE: the returned iterators are lazy and can only be consumed once.
padded_ngrams, flat_text = padded_everygram_pipeline(2, hamlet_lowercase)

In [None]:
# Count all n-grams produced by the pipeline (this consumes padded_ngrams).
counter = NgramCounter(padded_ngrams)

In [None]:
print(counter.N())  # total number of n-grams
print(counter['the'])  # count of the unigram "the"
print(counter[['of']]['the'])  # count of the bigram "of the": index by context, then by word

#### Exercise 2
- lookup in vocabulary
    - "trento is the capital city of trentino"
- update vocabulary with "trento is the capital city of trentino"
    - do the lookup again to see the effect
- experiment with changing the cut-off value from `1` to `10`
    - do the lookup again to see the effect

In [None]:
from nltk.lm import Vocabulary

# All word tokens of Hamlet as one flat sequence.
hamlet_words = gutenberg.words('shakespeare-hamlet.txt')

# lowercase
hamlet_words = [w.lower() for w in hamlet_words]

sentence = "trento is the capital city of trentino".split()

# Default cut-off: no unk_cutoff is passed, so NLTK's default (1) applies.
vocab = Vocabulary(hamlet_words)
# Lookup before the update ...
print(list(vocab.lookup(sentence)))
# ... and again after adding the sentence's words to the vocabulary counts.
vocab.update(sentence)
print(list(vocab.lookup(sentence)))


In [None]:
# Run the same lookup / update / lookup experiment for cut-off 1 and cut-off 10.
for cutoff in (1, 10):
    vocab = Vocabulary(hamlet_words, unk_cutoff=cutoff)
    # Before the update: how the sentence maps with Hamlet-only counts.
    print(list(vocab.lookup(sentence)))
    # After the update: each word of the sentence has been counted once more.
    vocab.update(sentence)
    print(list(vocab.lookup(sentence)))

#### Exercise 3
Implement a function to compute score of a sequence (i.e. Chain Rule)

- arguments:
    - Language Model
    - List of Tokens

- functionality
    - extracts ngrams w.r.t. LM order (`lm.order`)
    - scores each ngram w.r.t. LM (`lm.score` or `lm.logscore`)
        - mind that `score` takes care of OOV by converting to `<UNK>` already
    - computes the overall score using chain rule
        - mind the difference between `score` and `logscore`

- compute the scores of the sentences below
    - compute padded and unpadded sequence scores

In [None]:
# Sentences whose chain-rule scores are computed in this exercise.
test_sents = [
    "the king is dead",
    "the tzar is dead",
    "the tragedie of hamlet is good",
]


In [None]:
from itertools import chain
from nltk.lm.preprocessing import padded_everygram_pipeline, everygrams
def chain_rule(lm, sentence, log=True, pad=True):
    """Score a sequence with the chain rule under an n-gram language model.

    The sequence is cut into n-grams of exactly ``lm.order`` tokens; each
    n-gram contributes the (log-)score of its last token given the tokens
    that precede it inside the n-gram.

    Args:
        lm: Language model exposing ``order``, ``score`` and ``logscore``.
        sentence: Either a whitespace-separated string or an iterable of
            already-split tokens.
        log: If True, sum ``lm.logscore`` values; if False, multiply
            ``lm.score`` values.
        pad: If True, extract n-grams through ``padded_everygram_pipeline``
            (padded sequence); otherwise use plain ``everygrams``.

    Returns:
        The accumulated score. When no n-gram of length ``lm.order`` can be
        extracted, the neutral element is returned (0 if ``log`` else 1).
    """
    order = lm.order
    # The exercise asks for a list of tokens; plain strings are still accepted.
    tokens = sentence.split() if isinstance(sentence, str) else list(sentence)

    if pad:
        ngrams, _ = padded_everygram_pipeline(order, [tokens])
        ngrams = chain.from_iterable(ngrams)  # flatten the per-sentence generators
    else:
        ngrams = everygrams(tokens, max_len=order)

    # Neutral element: 0 for a sum of log-scores, 1 for a product of scores.
    total_score = 0 if log else 1
    scorer = lm.logscore if log else lm.score

    for ngram in ngrams:
        # Everygrams also contain the lower orders; keep full-order n-grams only.
        if len(ngram) != order:
            continue
        # Last token is the predicted word, everything before it is the context.
        step = scorer(ngram[-1], ngram[:-1])
        if log:
            total_score += step  # log-scores add up
        else:
            total_score *= step  # raw scores multiply

    return total_score

# NOTE: "Add your model" is a placeholder. chain_rule reads lm.order and calls
# lm.score / lm.logscore, so pass a language model object here before running.
for sent in test_sents:
    print(sent, chain_rule("Add your model", sent, log=True, pad=True))

#### Exercise 4
Compute entropy and perplexity of the `MLE` models on the bigrams of the test sentences below, treating them as a test set.

- experiment with the two test sets
- experiment with OOVs (with vs without)



In [None]:
# First test set for the entropy / perplexity experiments.
test_sents1 = [
    "the king is dead",
    "the emperor is dead",
    "may the force be with you",
]
# Second test set for the entropy / perplexity experiments.
test_sents2 = [
    "the king is dead",
    "welcome to you",
    "how are you",
]

In [None]:
from nltk.lm.preprocessing import padded_everygram_pipeline, flatten
from nltk.lm import Vocabulary, MLE

# Load data
hamlet_sents = [[w.lower() for w in sent] for sent in gutenberg.sents('shakespeare-hamlet.txt')]
# Materialise the flattened words: a bare flatten(...) is a one-shot iterator
# that Vocabulary would exhaust, leaving hamlet_words empty for later cells.
hamlet_words = list(flatten(hamlet_sents))
# Compute vocab
lex = Vocabulary(hamlet_words, unk_cutoff=2)
# Handling OOV: pass every training sentence through the vocabulary lookup
hamlet_oov_sents = [list(lex.lookup(sent)) for sent in hamlet_sents]
padded_ngrams_oov, flat_text_oov = padded_everygram_pipeline(2, hamlet_oov_sents)
# Train the model
lm_oov = MLE(2)
lm_oov.fit(padded_ngrams_oov, flat_text_oov)

# Compute PPL and entropy with OOV on the selected test set
# (switch to test_sents1 to evaluate the other one)
test_set = test_sents2
ngrams, flat_text = padded_everygram_pipeline(lm_oov.order, [lex.lookup(sent.split()) for sent in test_set])
# Generators are one-use only! Keep the full-order n-grams in a list so that
# both metrics below can reuse them without rebuilding the pipeline.
test_ngrams = [x for x in flatten(ngrams) if len(x) == lm_oov.order]

ppl = lm_oov.perplexity(test_ngrams)
print('PPL:', ppl)
cross_entropy = lm_oov.entropy(test_ngrams)
print('Cross Entropy :', cross_entropy)
# Perplexity recomputed by hand as 2 ** cross-entropy
print('\t PPL:', pow(2, cross_entropy))


##### Perplexity (PPL): how it works inside

In [None]:
import math
import numpy as np

def compute_ppl(model, data):
    """Compute perplexity of ``model`` over the sentences in ``data``.

    Each sentence (a whitespace-separated string) is split, run through
    ``padded_everygram_pipeline`` at ``model.order``, and every n-gram of
    exactly that order is scored with ``model.logscore`` (last token given
    the preceding ones). The result is 2 raised to the negated mean of all
    collected log-scores.
    """
    order = model.order
    log_scores = []
    for sentence in data:
        per_sentence, _ = padded_everygram_pipeline(order, [sentence.split()])
        for everygram_stream in per_sentence:
            # Lower-order everygrams are skipped; only full-order n-grams count.
            log_scores.extend(
                model.logscore(gram[-1], gram[:-1])
                for gram in everygram_stream
                if len(gram) == order
            )

    mean_log_score = np.asarray(log_scores).mean()
    return math.pow(2.0, (-1 * mean_log_score))

# NOTE: "Add your model" is a placeholder. compute_ppl reads model.order and
# calls model.logscore, so pass a language model object here before running.
compute_ppl("Add your model", test_sents2)    