<a href="https://colab.research.google.com/github/BFCC/BFCC.github.io/blob/main/Calabresi_N_grams_lab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import re

import nltk

nltk.download('punkt')
from nltk import word_tokenize
from nltk import sent_tokenize

from nltk.util import bigrams
from nltk.lm.preprocessing import padded_everygram_pipeline

import requests

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:

r = requests.get(r'https://www.gutenberg.org/cache/epub/64317/pg64317.txt')
great_gatsby = r.text

for char in ["\n", "\r", "\d", "\t"]:
    great_gatsby = great_gatsby.replace(char, " ")

print(great_gatsby[:100])

﻿The Project Gutenberg eBook of The Great Gatsby        This ebook is for the use of anyone anywhere


In [5]:
great_gatsby = great_gatsby[983:]

In [6]:
def sample_clean_text(text: str):
    text = text.lower()

    text = re.sub(r"[^\w\s]", "", text)

    tokens = nltk.word_tokenize(text)

    return tokens

sample_tokens = sample_clean_text(text = great_gatsby)

print(sample_tokens[:50])

['then', 'wear', 'the', 'gold', 'hat', 'if', 'that', 'will', 'move', 'her', 'if', 'you', 'can', 'bounce', 'high', 'bounce', 'for', 'her', 'too', 'till', 'she', 'cry', 'lover', 'goldhatted', 'highbouncing', 'lover', 'i', 'must', 'have', 'you', 'thomas', 'parke', 'dinvilliers', 'i', 'in', 'my', 'younger', 'and', 'more', 'vulnerable', 'years', 'my', 'father', 'gave', 'me', 'some', 'advice', 'that', 'ive', 'been']


In [21]:
my_bigrams = bigrams(sample_tokens)

list(my_bigrams)[:10]

[('then', 'wear'),
 ('wear', 'the'),
 ('the', 'gold'),
 ('gold', 'hat'),
 ('hat', 'if'),
 ('if', 'that'),
 ('that', 'will'),
 ('will', 'move'),
 ('move', 'her'),
 ('her', 'if')]

In [22]:
n = 2
text = great_gatsby

In [9]:
sentences = nltk.sent_tokenize(text)

tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

tokenized_text = [[word.lower() for word in sent] for sent in tokenized_sentences]

print(tokenized_text[0])

['then', 'wear', 'the', 'gold', 'hat', ',', 'if', 'that', 'will', 'move', 'her', ';', 'if', 'you', 'can', 'bounce', 'high', ',', 'bounce', 'for', 'her', 'too', ',', 'till', 'she', 'cry', '“', 'lover', ',', 'gold-hatted', ',', 'high-bouncing', 'lover', ',', 'i', 'must', 'have', 'you', '!', '”', 'thomas', 'parke', 'd', '’', 'invilliers', 'i', 'in', 'my', 'younger', 'and', 'more', 'vulnerable', 'years', 'my', 'father', 'gave', 'me', 'some', 'advice', 'that', 'i', '’', 've', 'been', 'turning', 'over', 'in', 'my', 'mind', 'ever', 'since', '.']


In [10]:
print(text[:10])

 Then wear


In [11]:
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

In [12]:
from nltk.lm import MLE

lm = MLE(n)

In [13]:
len(lm.vocab)

0

In [14]:
lm.fit(train_data, padded_sents)
len(lm.vocab)

6953

In [15]:
 print(lm.vocab.lookup(tokenized_text[0]))

('then', 'wear', 'the', 'gold', 'hat', ',', 'if', 'that', 'will', 'move', 'her', ';', 'if', 'you', 'can', 'bounce', 'high', ',', 'bounce', 'for', 'her', 'too', ',', 'till', 'she', 'cry', '“', 'lover', ',', 'gold-hatted', ',', 'high-bouncing', 'lover', ',', 'i', 'must', 'have', 'you', '!', '”', 'thomas', 'parke', 'd', '’', 'invilliers', 'i', 'in', 'my', 'younger', 'and', 'more', 'vulnerable', 'years', 'my', 'father', 'gave', 'me', 'some', 'advice', 'that', 'i', '’', 've', 'been', 'turning', 'over', 'in', 'my', 'mind', 'ever', 'since', '.')


In [16]:
print(lm.vocab.lookup('then wear the gold hat iphone .'.split()))

('then', 'wear', 'the', 'gold', 'hat', '<UNK>', '.')


Unknown!

It works!

In [17]:
print(lm.counts['daisy'])

183


In [18]:
lm.score('daisy')

0.0026549057726065954

In [19]:
print(lm.counts[['daisy']]['and'])
lm.score('and', 'daisy'.split())

14


0.07650273224043716

In [20]:
lm.score("<UNK>")

0.0

That's not good: the model is too accurate.  We would have to add some randomness to make it function better.

In [23]:
print(lm.generate(20, text_seed= 'daisy', random_seed=42))

['other', ',', 'but', 'he', 'turned', 'to', 'their', 'cars', 'blocking', 'the', 'dull', 'light', ',', 'and', 'separated', 'only', 'building', 'in', 'the', 'adventitious']


In [24]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(lm, num_words, text_seed, random_seed=42):
    """
    :param model: An ngram language model from `nltk.lm.model`.
    :param num_words: Max no. of words to generate.
    :param random_seed: Seed value for random.
    """
    content = []
    for token in lm.generate(num_words, text_seed=text_seed, random_seed=random_seed):
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)

In [25]:
generate_sent(lm, 20, text_seed='daisy', random_seed = 42)

'other, but he turned to their cars blocking the dull light, and separated only building in the adventitious'

In [27]:

print(lm.generate(20, text_seed= 'golden', random_seed=42))

['forty', '.', '</s>', 'and', 'the', 'rain', 'with', 'a', 'little', ',', 'as', 'it', ',', 'and', 'separated', 'only', 'building', 'in', 'the', 'adventitious']


In [31]:
generate_sent(lm, 20, text_seed='golden', random_seed = 42)

'forty.'

In [32]:
print(lm.generate(20, text_seed= 'tennis', random_seed=42))

['names', ')', '596-1887', '.', '</s>', 'ridges', 'and', 'aware', 'of', 'a', 'drugstore', 'nowadays.', '”', 'he', 'spoke', 'rapidly', '.', '</s>', 'thing', ',']


In [33]:
generate_sent(lm, 20, text_seed='tennis', random_seed = 42)

'names) 596-1887.'