# Demo 22

## Movie Reviews

In [None]:
import nltk
import pandas as pd

# Corpus reader for the NLTK movie-review corpus.
moview_reviews = nltk.corpus.movie_reviews

# Pair every file id with a boolean gold label: True when the id starts with "pos".
review_files = [
    (file_id, file_id.startswith("pos"))
    for file_id in moview_reviews.fileids()
]

# One row per review file; the two tuple positions are named at construction time.
df = pd.DataFrame(review_files, columns=["file_name", "gold-label"])


def read_mov_review(f_name):
    """Return the full text of one movie-review file.

    Args:
        f_name: File id understood by the ``moview_reviews`` corpus reader.

    Returns:
        Whatever ``read()`` yields for the opened corpus file.
    """
    # Close the stream as soon as it has been read instead of leaving one
    # open handle per review for the garbage collector to clean up.
    with moview_reviews.open(f_name) as review_stream:
        return review_stream.read()

# Load the text of every review file listed in df.
df['review_text'] = df['file_name'].apply(read_mov_review)

# Shuffle all rows. The fixed seed keeps the row order (and therefore every
# later .iloc[0]) identical across Restart & Run All.
df = df.sample(frac=1, random_state=22)

## n-grams

In [None]:
from nltk import ngrams
# review_text holds a plain string, so ngrams() iterates over it character by character.
first_review_text = df['review_text'].iloc[0]
bigrams = list(ngrams(first_review_text, 2))
bigrams[:10]

**Question:** What happened?
<details>
<summary>Hint</summary>
    Let's look at the contextual help for ngrams.
</details>

<details>
<summary>Solution</summary>
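    ngrams expects a sequence of tokens (for example a list of words). We passed a raw string, so it produced bigrams of characters instead of bigrams of words.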
</details>

In [None]:
# skip cells

In [None]:
def tokenize(review):
    """Split a review into one flat list of word tokens.

    The text is first cut into sentences with ``nltk.sent_tokenize`` and each
    sentence is then tokenized with ``nltk.word_tokenize``.

    Args:
        review: The review text as a single string.

    Returns:
        A list of token strings, in reading order.
    """
    # Collect the tokens directly rather than joining them into one string
    # and splitting that string again.
    return [
        token
        for sentence in nltk.sent_tokenize(review)
        for token in nltk.word_tokenize(sentence)
    ]

# Tokenize every review; each cell of cleaned_text holds that review's token list.
df['cleaned_text'] = df['review_text'].map(tokenize)

In [None]:
# Same call as before, but on a list of tokens, so each bigram is a pair of words.
first_review_tokens = df['cleaned_text'].iloc[0]
bigrams = list(ngrams(first_review_tokens, 2))
bigrams[:10]

**Question:** How could we get tri-grams from the first review?

In [None]:
# skip

In [None]:
# Only the window size changes: 3 consecutive tokens per n-gram.
trigrams = list(ngrams(df['cleaned_text'].iloc[0], 3))
trigrams[:10]

**Question:** How could we get 25-grams from the first review?

In [None]:
# A window of 25 consecutive tokens; show just the first one.
gram25 = list(ngrams(df['cleaned_text'].iloc[0], 25))
gram25[:1]

### Google n-gram viewer

https://books.google.com/ngrams

(back to slides)

### n-grams as features in DTM

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer with its default settings, fitted on the raw review strings.
vectorizer = CountVectorizer()
unigram_dtm = vectorizer.fit_transform(df['review_text'])
unigram_dtm

In [None]:
# Number of features in the fitted vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
len(vectorizer.get_feature_names_out())

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
vectorizer.get_feature_names_out()

**Assignment:** 
Looking at the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) for CountVectorizer, how can we extract n-grams as features?

In [None]:
#skip

In [None]:
# ngram_range=(2, 2): both the lower and the upper n-gram size are 2, so only bigrams become features.
two_gram_vectorizer = CountVectorizer(ngram_range=(2, 2))
bigram_dtm = two_gram_vectorizer.fit_transform(df['review_text'])
bigram_dtm

**Question:** How many more features do we have now?

In [None]:
# Number of bigram features.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
len(two_gram_vectorizer.get_feature_names_out())

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
two_gram_vectorizer.get_feature_names_out()

Let's see what happens if we include both unigrams and bigrams as features.

In [None]:
# ngram_range=(1, 2): n-gram sizes from 1 up to 2, i.e. unigrams and bigrams together.
uni_bi_gram_vectorizer = CountVectorizer(ngram_range=(1, 2))
uni_bi_gram_dtm = uni_bi_gram_vectorizer.fit_transform(df['review_text'])
uni_bi_gram_dtm

In [None]:
# Number of unigram + bigram features.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
len(uni_bi_gram_vectorizer.get_feature_names_out())

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
uni_bi_gram_vectorizer.get_feature_names_out()

**Question:**
How can we determine the most probable n-grams?

(ask class before going back to slides)


(back to slides)

## Language Modeling

Let's now look at the complete works of Shakespeare.

In [None]:
!wget https://norvig.com/ngrams/shakespeare.txt
!mv shakespeare.txt data

This corpus was already tokenized:
    
>The complete works of Shakespeare, tokenized so that there is a space between words and punctuation. From John DeNero. https://norvig.com/ngrams/

In [None]:
# Read the corpus one line at a time; the with-block closes the file afterwards.
# Each non-blank line becomes a list of whitespace-separated tokens.
with open("data/shakespeare.txt") as shakespeare_file:
    corpus = [
        line_tokens
        for line_tokens in (line.split() for line in shakespeare_file)
        if line_tokens
    ]
corpus[:10]

This is code from the textbook. It computes the log probability of a sentence under a trigram model trained on Shakespeare.

In [None]:
from collections import defaultdict 
import numpy as np
import nltk

# Add-k smoothing constant: every (context, word) entry starts from this pseudo-count.
smoothing = 0.001
# counts[(u, v)][w] == smoothing + number of times w followed the two-word context (u, v).
counts = defaultdict(lambda: defaultdict(lambda: smoothing))
# Every word type seen in the corpus, plus the end-of-sentence marker.
vocabulary = {'STOP'}

for sentence in corpus:
    vocabulary.update(sentence)
    # Two '*' pads give the first real word a full context; 'STOP' marks the sentence end.
    tokens = ['*', '*'] + sentence + ['STOP']
    for u, v, w in nltk.ngrams(tokens, 3):
        counts[(u, v)][w] += 1

# One extra slot reserves probability mass for words that never occur in the corpus.
vocab_size = len(vocabulary) + 1


def logP(u, v, w):
    """Return the add-k smoothed log probability of ``w`` following ``(u, v)``.

    Computes log((c(u, v, w) + k) / (c(u, v) + k * V)) where ``k`` is
    ``smoothing`` and ``V`` is ``vocab_size``.

    Args:
        u: First word of the context.
        v: Second word of the context.
        w: The word whose probability is requested.

    Returns:
        The log probability as a NumPy float.
    """
    # Read with .get(): indexing the nested defaultdicts would insert new
    # entries and silently change the model on every query.
    followers = counts.get((u, v), {})
    # Stored values already include one `smoothing` per seen follower.
    numerator = followers.get(w, smoothing)
    # Add `smoothing` for every vocabulary slot not seen after this context, so
    # the distribution is normalised over the whole vocabulary and an unseen
    # context yields a uniform (low) probability rather than probability 1.
    denominator = sum(followers.values()) + smoothing * (vocab_size - len(followers))
    return np.log(numerator) - np.log(denominator)


def sentence_logP(S):
    """Return the log probability of a whole sentence under the trigram model.

    Args:
        S: The sentence as a list of token strings.

    Returns:
        The sum of ``logP`` over every trigram of the padded sentence.
    """
    tokens = ['*', '*'] + S + ['STOP']
    return sum(logP(u, v, w) for u, v, w in nltk.ngrams(tokens, 3))

Now we can compute the log probability of a sentence being written by Shakespeare.

*Note:* In the equation we were dividing, so when we use logs we end up subtracting. Log probabilities are never positive, so the larger the number (the closer to zero), the more likely it is that the sentence was written by Shakespeare.

In [None]:
# Score the first sentence of the corpus with sentence_logP.
sentence_logP(corpus[0])

NYTimes headline from today: 
> President Biden and Prime Minister Boris Johnson will emphasize a vision of recovery from the pandemic that builds on the special relationship.

In [None]:
nytimes_headline = "President Biden and Prime Minister Boris Johnson will emphasize a vision of recovery from the pandemic that builds on the “special relationship."
# The Shakespeare corpus has punctuation split off from words, so tokenize the headline
# the same way; a plain .split() would leave tokens such as "relationship." that can
# never match anything in the corpus.
sentence_logP(nltk.word_tokenize(nytimes_headline))

### Language Models to generate text

We can now use the language model (these probabilities) to generate new Shakespeare text.

In [None]:
def sample_next_word(u, v):
    """Draw one word to follow the context ``(u, v)``.

    Each candidate is drawn with probability proportional to the value stored
    for it in ``counts[(u, v)]``.

    Args:
        u: First word of the context.
        v: Second word of the context.

    Returns:
        One of the keys stored under ``counts[(u, v)]``.

    Raises:
        ValueError: If nothing is stored for the context ``(u, v)``.
    """
    # .get() avoids inserting an empty entry into the defaultdict for a context
    # that was never observed.
    followers = counts.get((u, v))
    if not followers:
        raise ValueError(
            "no continuation observed for context ({!r}, {!r})".format(u, v)
        )
    keys = list(followers)
    weights = np.array([followers[key] for key in keys], dtype=float)
    probabilities = weights / weights.sum()
    # A single multinomial trial gives a one-hot vector; argmax finds the drawn index.
    return keys[np.argmax(np.random.multinomial(1, probabilities))]

def generate():
    """Sample a sentence word by word until 'STOP' is drawn.

    Returns:
        The sampled words joined by single spaces, without the two leading
        '*' pads and without the final 'STOP'.
    """
    history = ['*', '*']
    while True:
        # The two most recent entries form the context for the next draw.
        drawn = sample_next_word(history[-2], history[-1])
        history.append(drawn)
        if drawn == 'STOP':
            break
    return ' '.join(history[2:-1])

In [None]:
# Sample one sentence; the draws are random, so each run gives a different result.
generate()

In [None]:
def generate_from(word):
    """Sample a sentence that begins with ``word``.

    The first draw uses the context ``('*', word)``; sampling then continues
    until 'STOP' is drawn.

    Args:
        word: The word the generated sentence starts with.

    Returns:
        ``word`` followed by the sampled words, joined by single spaces and
        without the final 'STOP'.
    """
    # Two pads followed by the seed word, so the slice below keeps the seed.
    history = ['*', '*', word]
    while True:
        drawn = sample_next_word(history[-2], history[-1])
        history.append(drawn)
        if drawn == 'STOP':
            break
    return ' '.join(history[2:-1])

In [None]:
# Sample a sentence that starts with "Oh".
generate_from("Oh")

Let's look at all the possible words we could get following "Oh"

In [None]:
# Entries stored under the context ("*", "Oh"): each candidate next word with its stored value.
counts[("*", "Oh")].items()

In [None]:
# Sample a sentence that starts with "Hamlet".
generate_from("Hamlet")

Let's sample some more together

(back to slides)

## Finding Common Phrases

In [None]:
# Rebuild each tokenized corpus line as a single space-separated string.
documents = list(map(" ".join, corpus))

In [None]:
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.corpus import stopwords

# This set must be defined here: the filter below uses it, and on a fresh kernel the
# cell would otherwise fail with a NameError.
stopwords_ = set(stopwords.words('english'))

# Lowercase before filtering so capitalised stopwords ("The", "And") are removed too,
# then drop stopwords and tokens of one or two characters.
words = [
    word
    for document in documents
    for word in map(str.lower, document.split())
    if len(word) > 2 and word not in stopwords_
]

finder = BigramCollocationFinder.from_words(words)
bgm = BigramAssocMeasures()
# score_ngrams yields (bigram, score) pairs; the score here is bgm.mi_like.
collocations = dict(finder.score_ngrams(bgm.mi_like))

# Show only the first entries instead of dumping the whole dictionary into the output.
list(collocations.items())[:25]