In [1]:
! pip install wget
import wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp37-none-any.whl size=9681 sha256=fde5d92b8bdce2ce7ae9f134e3b4d9a2429030caa7e97c550cdfd92a4c75a68b
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [2]:
url = 'https://raw.githubusercontent.com/dirkhovy/NLPclass/master/data/moby_dick.txt'
wget.download(url, 'moby_dick.txt')

'moby_dick.txt'

# Language Models

Let's start with a simple, Laplace-smoothed trigram model:

In [3]:
from collections import defaultdict
import numpy as np
import nltk

smoothing = 0.001 # a factor helps the model to deal with unknown words
START = '_***_'
STOP = '_STOP_'

# map from (u, v) to w = (w|u,v)
# a dict that counts how many times we see a trigram, we will count from 0.001 not 0
counts = defaultdict(lambda: defaultdict(lambda: smoothing))

# fit data on corpus
corpus = [line.strip().split() for line in open('moby_dick.txt')]

# collect counts for MLE
for sentence in corpus:
    # include special tokens for start and the end of sentence
    tokens = [START, START] + sentence + [STOP]
    for u, v, w in nltk.ngrams(tokens, 3):
        counts[(u, v)][w] += 1

def logP(u, v, w): 
    """
    compute the log probability of a trigram
    (u,v,w) => P(w|u,v) = c(u,v,w) / SUM(c(u,v,*))
    """
    return np.log(counts[(u, v)][w]) - np.log(sum(counts[(u, v)].values()))

def sentence_logP(S):
    """
    score a sentence in log likelihood with chain rule
    S: list(str)
    """
    tokens = [START, START] + S + [STOP]
    return sum([logP(u, v, w) for u, v, w in nltk.ngrams(tokens, 3)])

In [4]:
counts[('because','he')]

defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
            {'always': 1.001,
             'could': 1.001,
             'had': 2.001,
             'happens': 1.001,
             'is': 1.001,
             'knows': 1.001,
             'seemed': 1.001,
             'treated': 1.001,
             'tucks': 1.001,
             'wanted': 1.001,
             'was': 2.001})

We can now score arbitrary sentences:

In [5]:
sentence_logP('Captain Ahab is a man .'.split())

-27.92672048112014

In [6]:
counts[('you','are')]

defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
            {',': 1.001,
             '.': 1.001,
             'all': 1.001,
             'an': 1.001,
             'but': 1.001,
             'close': 1.001,
             'dead': 1.001,
             'determined': 1.001,
             'eating': 1.001,
             'experienced': 1.001,
             'goin': 1.001,
             'heavy': 1.001,
             'in': 3.001,
             'just': 1.001,
             'mistaken': 1.001,
             'now': 3.001,
             'only': 1.001,
             'pitched': 1.001,
             'quick': 1.001,
             'skylarking': 1.001,
             'speaking': 1.001,
             'struck': 1.001,
             'telling': 1.001,
             'that': 1.001,
             'the': 1.001})

In [7]:
sum(counts[('you','are')].values())

29.02500000000001

## Activity
Implement the perplexity measure for a given corpus, and try it with two LM with different smoothing parameters.

$$perplexity = 2^{-\sum_{x \in X} p(x) \log p(x)}$$

In [8]:
def get_perplexity(corpus):
    """
    perplexity = 2^-entropy(X)
    entropy = -sum(p(x) *log(p(x)))
    """
    entropy = 0.0
    for sentence in corpus:
        sentence_log_prob = sentence_logP(sentence)
        sentence_entropy = np.exp(sentence_log_prob) * sentence_log_prob
        entropy += sentence_entropy
        
    perplexity = 2 ** -entropy
    return perplexity

print(get_perplexity(corpus))
#4.12 means the model will choose 4 words !!!   , when inferring a new word 

4.118431257864399


## Generation

We can re-use the counts to generate language:

In [9]:
def generate():
    result = [START, START]
    next_word = sample_next_word(result[-2], result[-1])
    result.append(next_word)
    while next_word != STOP:
        next_word = sample_next_word(result[-2], result[-1])
        result.append(next_word)
    
    return ' '.join(result[2:-1])

def sample_next_word(u, v):
    """
    sample a word w based on the history (u, v) --> the length of history is 2
    """
    # separate word and their counts into separate variables
    keys, values = zip(*counts[(u, v)].items())
    
    # normalize the counts into a probability distribution
    values = np.array(values)
    values /= values.sum() # create probability distro
     
    # this is the meat of the function
    sample = np.random.multinomial(1, values) # pick one position
    
    return keys[np.argmax(sample)]

In [10]:
keys , values = zip(*counts[(START,START)].items())
#keys, values #these are the possible start words with their counts
keys , values = zip(*counts[('you','are')].items())
values = np.array(values)
values /= values.sum()
values # the probability distribution for the keys

array([0.10339363, 0.03448751, 0.03448751, 0.03448751, 0.03448751,
       0.03448751, 0.03448751, 0.03448751, 0.03448751, 0.03448751,
       0.03448751, 0.03448751, 0.10339363, 0.03448751, 0.03448751,
       0.03448751, 0.03448751, 0.03448751, 0.03448751, 0.03448751,
       0.03448751, 0.03448751, 0.03448751, 0.03448751, 0.03448751])

In [11]:
sample = np.random.multinomial(1, values) 
sample

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0])

In [12]:
keys[np.argmax(sample)]

'speaking'

In [13]:
for i in range(50):
    keys, values = zip(*counts[('you','are')].items())
    values = np.array(values)
    values /= values.sum()
    sample = np.random.multinomial(1, values)
    print(keys[np.argmax(sample)])

an
close
in
pitched
goin
eating
only
the
in
now
now
mistaken
dead
but
goin
but
,
now
determined
experienced
mistaken
pitched
all
.
only
dead
struck
all
only
pitched
dead
goin
skylarking
eating
heavy
an
in
but
eating
in
but
now
speaking
dead
.
in
in
in
close
.


In [14]:
sample_next_word('as', 'a'), counts[('as', 'a')]

('pile',
 defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'Commodore': 1.001,
              'Dish': 1.001,
              'Latin': 1.001,
              'Roman': 1.001,
              'backwoodsman': 1.001,
              'bat': 1.001,
              'birch': 1.001,
              'body': 2.001,
              'candidate': 1.001,
              'cat': 1.001,
              'civilized': 1.001,
              'clam': 1.001,
              'clock': 1.001,
              'coffin': 1.001,
              'conceited': 1.001,
              'cook': 1.001,
              'corpse': 1.001,
              'country': 1.001,
              'cricket': 1.001,
              'crucible': 1.001,
              'dead': 1.001,
              'dinnerless': 1.001,
              'dragon': 1.001,
              'drawing': 1.001,
              'dromedary': 1.001,
              'fin': 1.001,
              'flavorish': 1.001,
              'fly': 1.001,
              'foreshadowing': 1.001,
              'fr

We can now generate non-sensical sentences:

In [15]:
print(generate())

It is by the stranger ' s nails are divided into wrought nails and cut nails ; so that there ' s the word with a pealing exultation and joy of Jonah teaches to all sperm whales , the Commodore ' s stern came into conspicuous relief .


## Exercise

Modify generate to take any number of initial words.

In [16]:
def generate_any(words=''):
    result = [START, START]
    for word in words.split():
      result.append(word)
    next_word = sample_next_word(result[-2], result[-1])
    result.append(next_word)
    while next_word != STOP:
        next_word = sample_next_word(result[-2], result[-1])
        result.append(next_word)
    
    return ' '.join(result[2:-1])

In [17]:
print(generate_any('I want'))

I want it ; but till you say aye to me as a sailor , but that will point as true as any .


In [18]:
print(generate_any('I will'))

I will here venture upon a thwart of his companions had mounted to its latter formations exceed in size , varying from fifteen to twenty - six arms , sir ."


In [19]:
print(generate_any('my'))

my spine , all the Pacific , and directions from Mrs . Hussey wore a red cotton velvet vest and the great White Whale fully incites the hearts of mountains bathed in their beds ; the headlong , sled - like sea heaved up their anchors with that almost every twenty - four hours , when -- THERE SHE BLOWS !-- the deck - table are thus cannibally carving each other better than royal blood there .


In [20]:
print(generate_any())

By all accounts Tarshish could have been hunted by man ), but in the tub oarsman ( him seated by the old man , his body , there are gestures in it .


## Exercise

Extend the code above to arbitray $n$-gram sizes. Use another corpus to try it with $n=4$.

It might be helpful to use a `class` for the LM, make the smoothing a parameter, `counts` a class property, and add a function `fit()`.

In [37]:
# Your code here

class LM():

  def __init__(self, smoothing, n_grams_size):
    self.n_grams_size_ = n_grams_size
    self.counts = defaultdict(lambda: defaultdict(lambda: smoothing))

  def fit(self,document):
    self.corpus = [line.strip().split() for line in document]
    for sentence in self.corpus:
      tokens = [START] * (self.n_grams_size_ -1 ) + sentence + [STOP]
      for n in nltk.ngrams(tokens, self.n_grams_size_):
        self.counts[n[:-1]][n[-1]] += 1

  def sample_next_word(self,nwords):
    keys, values = zip(*self.counts[tuple(nwords)].items())
    values = np.array(values)
    values /= values.sum() 
    
    sample = np.random.multinomial(1, values) # pick one position
    
    return keys[np.argmax(sample)]

  def generate(self, words=''):
    result = [START] * (self.n_grams_size_ - 1)
    for word in words:
      result.append(word)
    next_word = self.sample_next_word(nwords = result[- (self.n_grams_size_ - 1):])
    result.append(next_word)
    while next_word != STOP:
        next_word = self.sample_next_word(nwords = result[- (self.n_grams_size_ - 1):])
        result.append(next_word)
    
    return ' '.join(result[(self.n_grams_size_ - 1):-1])


In [38]:
import wget
url = 'https://raw.githubusercontent.com/dirkhovy/NLPclass/master/data/tweets_en.txt'
wget.download(url, 'tweets_en.txt')
tweets = [line.strip() for line in open('tweets_en.txt', encoding='utf8')]
 
lm = LM(smoothing=0.001, n_grams_size=4)
lm.fit(document=tweets)
print(np.unique([lm.generate(words=["Trump","should","think","about"]) 
 for _ in range(10)]))

['Trump should think about going and buying some food!!'
 'Trump should think about going to Bulgaria in October #willibeabletolivethere'
 'Trump should think about going to the gym anyway, dedication or wit?'
 'Trump should think about going to the match #buzzzzn'
 'Trump should think about going to uni to study something']
