# Demo 05

In [1]:
import nltk
import spacy
import pandas as pd
import os
from tqdm import tqdm

import numpy as np

## Types vs Tokens

In [2]:
speech = "We refuse to believe that there are insufficient funds in the great vaults \
of opportunity of this nation. And so we've come to cash this check, a check that \
will give us upon demand the riches of freedom and the security of justice"

In [3]:
speech

"We refuse to believe that there are insufficient funds in the great vaults of opportunity of this nation. And so we've come to cash this check, a check that will give us upon demand the riches of freedom and the security of justice"

In [None]:
nltk

In [4]:
tokens = nltk.tokenize.word_tokenize(speech)
" ".join(tokens)

"We refuse to believe that there are insufficient funds in the great vaults of opportunity of this nation . And so we 've come to cash this check , a check that will give us upon demand the riches of freedom and the security of justice"

In [5]:
f"Number of tokens is {len(tokens)}, number of types is {len(set(tokens))}"

'Number of tokens is 46, number of types is 37'

In [6]:
nltk.tokenize.sent_tokenize(speech)

['We refuse to believe that there are insufficient funds in the great vaults of opportunity of this nation.',
 "And so we've come to cash this check, a check that will give us upon demand the riches of freedom and the security of justice"]

### Duplicate types?

**Question:** Can you find any duplicate types in our vocabulary?

In [7]:
vocab = set(tokens)
" ".join(vocab)

"security And we insufficient vaults give justice believe cash come to refuse and freedom We this us riches in upon great the that a so are funds . of check opportunity , will nation 've there demand"

<details>
<summary>Answer</summary>
    <b>"We"</b> and <b>"we"</b>

</details>

Do we want to treat these as different types?

**Question:** What solution would you suggest? 

<details>
<summary>Solution</summary>
<b>Lowercase</b>

</details>

In [None]:
# Solution is below in code

In [8]:
nltk.tokenize.word_tokenize(speech.lower())

['we',
 'refuse',
 'to',
 'believe',
 'that',
 'there',
 'are',
 'insufficient',
 'funds',
 'in',
 'the',
 'great',
 'vaults',
 'of',
 'opportunity',
 'of',
 'this',
 'nation',
 '.',
 'and',
 'so',
 'we',
 "'ve",
 'come',
 'to',
 'cash',
 'this',
 'check',
 ',',
 'a',
 'check',
 'that',
 'will',
 'give',
 'us',
 'upon',
 'demand',
 'the',
 'riches',
 'of',
 'freedom',
 'and',
 'the',
 'security',
 'of',
 'justice']

In [9]:
lower_tokens = nltk.tokenize.word_tokenize(speech.lower())
f"Number of lowered tokens is {len(lower_tokens)}, number of types is {len(set(lower_tokens))}"

'Number of lowered tokens is 46, number of types is 35'

In [12]:
freqdist = nltk.FreqDist(lower_tokens)
freqdist

FreqDist({'of': 4, 'the': 3, 'we': 2, 'to': 2, 'that': 2, 'this': 2, 'and': 2, 'check': 2, 'refuse': 1, 'believe': 1, ...})

In [15]:
freqdist.most_common(2)

[('of', 4), ('the', 3)]

(back to slides)

## Lemmatization

In [16]:
import nltk

In [17]:
lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatizer.lemmatize("transformed", "v") # The NLTK WordNet Lemmatizer needs to know the part of speech tag

'transform'

#### Go, Goes, Went, Gone, Going

**Question:** What do you think the lemma for these terms should be? 

In [18]:
lemmatizer.lemmatize("go"), lemmatizer.lemmatize("goes")

('go', 'go')

In [19]:
lemmatizer.lemmatize("went", "v"), lemmatizer.lemmatize("gone", "v"), lemmatizer.lemmatize("going", "v")

('go', 'go', 'go')

In [21]:
lemmatizer.lemmatize("leaves")

'leaf'

In [22]:
lemmatizer.lemmatize("leaves", "v")

'leave'

(back to slides)
## Stemming

In [23]:
# Build an English Snowball stemmer and stem a plural noun.
snowball_stemmer = nltk.stem.SnowballStemmer("english") # Closely related to nltk's PorterStemmer
snowball_stemmer.stem("babies")

# Note the "y" -> "i" in the result: a stem is not necessarily a dictionary word

'babi'

### constitutional, constitutionality, ...

In [24]:
snowball_stemmer.stem("constitution"), snowball_stemmer.stem("constitutions"), snowball_stemmer.stem("constitutional"), snowball_stemmer.stem("constitutionality"), snowball_stemmer.stem("constitutionalism")

('constitut', 'constitut', 'constitut', 'constitut', 'constitut')

### Relat

In [25]:
snowball_stemmer.stem("relativity"), snowball_stemmer.stem("relative")

('relat', 'relat')

In [28]:
snowball_stemmer.stem("constitution"), lemmatizer.lemmatize("constitution", "n")

('constitut', 'constitution')

In [30]:
snowball_stemmer.stem("babies"), lemmatizer.lemmatize("babies")

('babi', 'baby')

(back to slides)
## Stopwords

In [31]:
" ".join(nltk.corpus.stopwords.words('english'))

"i me my myself we our ours ourselves you you're you've you'll you'd your yours yourself yourselves he him his himself she she's her hers herself it it's its itself they them their theirs themselves what which who whom this that that'll these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don don't should should've now d ll m o re ve y ain aren aren't couldn couldn't didn didn't doesn doesn't hadn hadn't hasn hasn't haven haven't isn isn't ma mightn mightn't mustn mustn't needn needn't shan shan't shouldn shouldn't wasn wasn't weren weren't won won't wouldn wouldn't"

**Question:** What do we notice about these words?

In [32]:
nltk.corpus.stopwords.words('spanish')

['de',
 'la',
 'que',
 'el',
 'en',
 'y',
 'a',
 'los',
 'del',
 'se',
 'las',
 'por',
 'un',
 'para',
 'con',
 'no',
 'una',
 'su',
 'al',
 'lo',
 'como',
 'más',
 'pero',
 'sus',
 'le',
 'ya',
 'o',
 'este',
 'sí',
 'porque',
 'esta',
 'entre',
 'cuando',
 'muy',
 'sin',
 'sobre',
 'también',
 'me',
 'hasta',
 'hay',
 'donde',
 'quien',
 'desde',
 'todo',
 'nos',
 'durante',
 'todos',
 'uno',
 'les',
 'ni',
 'contra',
 'otros',
 'ese',
 'eso',
 'ante',
 'ellos',
 'e',
 'esto',
 'mí',
 'antes',
 'algunos',
 'qué',
 'unos',
 'yo',
 'otro',
 'otras',
 'otra',
 'él',
 'tanto',
 'esa',
 'estos',
 'mucho',
 'quienes',
 'nada',
 'muchos',
 'cual',
 'poco',
 'ella',
 'estar',
 'estas',
 'algunas',
 'algo',
 'nosotros',
 'mi',
 'mis',
 'tú',
 'te',
 'ti',
 'tu',
 'tus',
 'ellas',
 'nosotras',
 'vosotros',
 'vosotras',
 'os',
 'mío',
 'mía',
 'míos',
 'mías',
 'tuyo',
 'tuya',
 'tuyos',
 'tuyas',
 'suyo',
 'suya',
 'suyos',
 'suyas',
 'nuestro',
 'nuestra',
 'nuestros',
 'nuestras',
 'vuestro'

In [41]:
nltk_en_stop_words = nltk.corpus.stopwords.words('english')
print(" ".join(tokens))
" ".join([word for word in tokens if word.lower() not in nltk_en_stop_words])

We refuse to believe that there are insufficient funds in the great vaults of opportunity of this nation . And so we 've come to cash this check , a check that will give us upon demand the riches of freedom and the security of justice


"refuse believe insufficient funds great vaults opportunity nation . 've come cash check , check give us upon demand riches freedom security justice"

In [40]:
np.all([stopword.islower() for stopword in nltk_en_stop_words])

True

(back to demo)
## Part of Speech Tagging

In [42]:
nltk.pos_tag(speech)

TypeError: tokens: expected a list of strings, got a string

**Question:** What does this error mean?

In [47]:
speech_tokenized = [word for word in tokens if word.lower() not in nltk_en_stop_words]

In [48]:
nltk.pos_tag(speech_tokenized)

[('refuse', 'NN'),
 ('believe', 'VBP'),
 ('insufficient', 'NN'),
 ('funds', 'NNS'),
 ('great', 'JJ'),
 ('vaults', 'NNS'),
 ('opportunity', 'NN'),
 ('nation', 'NN'),
 ('.', '.'),
 ("'ve", 'VBP'),
 ('come', 'VBN'),
 ('cash', 'NN'),
 ('check', 'NN'),
 (',', ','),
 ('check', 'VB'),
 ('give', 'VBP'),
 ('us', 'PRP'),
 ('upon', 'IN'),
 ('demand', 'NN'),
 ('riches', 'NNS'),
 ('freedom', 'VBP'),
 ('security', 'NN'),
 ('justice', 'NN')]

Let's look at another tagset

In [49]:
nltk.pos_tag(speech_tokenized, tagset='universal')

[('refuse', 'NOUN'),
 ('believe', 'VERB'),
 ('insufficient', 'NOUN'),
 ('funds', 'NOUN'),
 ('great', 'ADJ'),
 ('vaults', 'NOUN'),
 ('opportunity', 'NOUN'),
 ('nation', 'NOUN'),
 ('.', '.'),
 ("'ve", 'VERB'),
 ('come', 'VERB'),
 ('cash', 'NOUN'),
 ('check', 'NOUN'),
 (',', '.'),
 ('check', 'VERB'),
 ('give', 'VERB'),
 ('us', 'PRON'),
 ('upon', 'ADP'),
 ('demand', 'NOUN'),
 ('riches', 'NOUN'),
 ('freedom', 'VERB'),
 ('security', 'NOUN'),
 ('justice', 'NOUN')]

Tutorial 2.1 will further explore differences between these sets

### Tricky examples

***time flies like an arrow***

**Question:** What should the POS tags here be?

- time: 
- flies:
- like: 
- an:
- arrow:

Let's see what nltk tells us

In [50]:
nltk.pos_tag(nltk.word_tokenize("time flies like an arrow"), tagset='universal')

[('time', 'NOUN'),
 ('flies', 'NOUN'),
 ('like', 'ADP'),
 ('an', 'DET'),
 ('arrow', 'NOUN')]

**Question:** Do we agree?

Tutorial 2.1 will focus on the difference between these

(back to slides)

## Dependency Parsing

### Spacy

In [51]:
import spacy
nlp = spacy.load('en_core_web_sm')

You might need to run 

> !python3 -m spacy download en_core_web_sm

In [52]:
doc = nlp(speech)
doc, type(doc)

(We refuse to believe that there are insufficient funds in the great vaults of opportunity of this nation. And so we've come to cash this check, a check that will give us upon demand the riches of freedom and the security of justice,
 spacy.tokens.doc.Doc)

In [53]:
list(doc.sents)[0]

We refuse to believe that there are insufficient funds in the great vaults of opportunity of this nation.

Tutorial 2.1 will go into details about the spacy `Doc` object

In [54]:
from spacy import displacy
displacy.render(list(doc.sents)[0], style="dep")

In [55]:
for tok in list(doc.sents)[0]:
    print(tok.text, tok.dep_.upper(), tok.head)

We NSUBJ refuse
refuse ROOT refuse
to AUX believe
believe XCOMP refuse
that MARK are
there EXPL are
are CCOMP believe
insufficient AMOD funds
funds ATTR are
in PREP funds
the DET vaults
great AMOD vaults
vaults POBJ in
of PREP vaults
opportunity POBJ of
of PREP opportunity
this DET nation
nation POBJ of
. PUNCT refuse


Spacy dependency parse labels are explained [here](https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md)

(back to slides)

## Named Entity Recognition

In [56]:
example_doc = nlp("Monday, October 30, Hillary Clinton will present her book in Chicago at the University of Chicago.")
example_doc

Monday, October 30, Hillary Clinton will present her book in Chicago at the University of Chicago.

**Question:** How do we get the entities?

In [57]:
example_ents = example_doc.ents
example_ents

(Monday, October 30, Hillary Clinton, Chicago, the University of Chicago)

**Question:** Let's get the text of the entities and the label of the entity

<details>
<summary>Solution</summary>
<b>[(ent.text, ent.label_) for ent in example_doc.ents]</b>

</details>

In [59]:
len(example_ents)

4

In [61]:
type(example_ents[0])

spacy.tokens.span.Span

In [67]:
for ent in example_ents:
    print(ent.label_, ent.text)

DATE Monday, October 30
PERSON Hillary Clinton
GPE Chicago
ORG the University of Chicago


### Entities in Dracula

I downloaded Dracula from Project Gutenberg: https://www.gutenberg.org/ebooks/345

In [None]:
!ls data/Dracula.txt

The next line will take about 2 minutes

In [None]:
%%time 
doc = nlp(open("data/Dracula.txt").read())

I ran a [tool](https://github.com/JonathanReeve/chapterize) developed by Jonathan Reeve that splits novels from Project Gutenberg into files for each chapter.

[Jonathan](https://jonreeve.com/) is a Computational literary analyst here at Columbia.

In [None]:
!ls data/Dracula-chapters

In [None]:
%%time

DRACULA_PATH = "data/Dracula-chapters/"

chapter2doc = {}
for file in tqdm(os.listdir(DRACULA_PATH)):
    chapter_id = file.split(".")[0]
    chapter2doc[chapter_id] = nlp(open(DRACULA_PATH + file).read())

In [None]:
chapter2doc.keys()

In [None]:
type(chapter2doc['01'])

In [None]:
# Collect every named entity spaCy found in chapter '01' as two parallel lists
# (surface text and entity label), then tabulate them and peek at 10 random rows.
chapter_one_ents = chapter2doc['01'].ents
texts = [entity.text for entity in chapter_one_ents]
labels = [entity.label_ for entity in chapter_one_ents]

ents_df = pd.DataFrame({'text': texts, 'label': labels})
ents_df.sample(10)

**Question:** What labels do we see the most in the first Chapter?

In [None]:
ents_df['label'].value_counts()

**Question:** What person is mentioned the most in the first chapter?

In [None]:
# Count mentions per person name. Selecting the `text` column first gives a
# Series indexed by the name alone, instead of counting (text, label) row
# tuples under a MultiIndex.
ents_df.loc[ents_df['label'] == 'PERSON', 'text'].value_counts()

**Question:** Who is mentioned the most throughout the entire book?

In [None]:
# Build one row per named entity across all chapters: (text, label, chapter).
# A comprehension is used so no loop variable leaks into the notebook namespace;
# in particular the global `doc` (the full-novel parse) is no longer overwritten
# by the last chapter's Doc.
ent_rows = [
    (ent.text, ent.label_, chapter_id)
    for chapter_id, chapter_doc in chapter2doc.items()
    for ent in chapter_doc.ents
]

ents_df = pd.DataFrame(ent_rows, columns=['text', 'label', 'chapter'])
ents_df.sample(10)

In [None]:
ents_df.sort_values(by='chapter')

In [None]:
lucy_mentions_df = ents_df[ents_df['text'] == 'Lucy']
lucy_mentions_df

In [None]:
lucy_mentions_df['label'].value_counts()

In [None]:
# The label column is no longer needed once we have inspected it above.
# errors='ignore' keeps this cell idempotent: re-running it after `label` has
# already been dropped is a no-op instead of a KeyError.
lucy_mentions_df = lucy_mentions_df.drop(columns=['label'], errors='ignore')
lucy_mentions_df

In [None]:
lucy_mentions_df['chapter'].value_counts().plot(kind='line')

**Question:** What don't we like about this graph?

In [None]:
# value_counts() orders by frequency; sort_index() puts the chapters back in
# reading order so the line runs left-to-right through the novel.
ax = lucy_mentions_df['chapter'].value_counts().sort_index().plot(kind='line')
ax.set_title("Number of times Lucy is mentioned per chapter")
ax.set_xlabel("Chapter Number")
# Label the y axis (previously a second set_xlabel call overwrote the x label).
ax.set_ylabel("Number of Lucy mentions");

**Question:** Does this figure make sense based on the novel?


#### Plotting most common characters in Dracula

**Question:** Who are the 50 most commonly mentioned characters in Dracula?


<details>
<summary>Solution</summary>
<b>ents_df[ents_df['label'] == 'PERSON']['text'].value_counts().head(50)</b>

</details>

In [None]:
# write code here to determine that based on ents_df

Let's query the dataframe to find all rows that have been tagged as a PERSON and 
save the result from the query in `person_df`


<details>
<summary>Solution</summary>
<b>ents_df[ents_df['label'] == 'PERSON']</b>

</details>

In [None]:
person_df = ...
person_df

Now let's determine how many times each person was mentioned in each chapter.

We want to make a new dataframe where the indices are the chapters and the columns represent the counts of how many times a specific character was mentioned in the chapter.

In [None]:
# One row per chapter, one column per entity text. aggfunc=len counts the rows
# in each (chapter, text) group; fill_value=0 fills combinations that never occur.
# NOTE(review): no `values=` is passed, so presumably the remaining column
# (`label`) is what gets aggregated, giving ('label', <name>) MultiIndex columns --
# which would be why a later cell indexes pv_table['label']. Confirm against
# person_df's actual columns.
pv_table = pd.pivot_table(person_df, index=['chapter'],
                    columns=['text'], aggfunc=len, fill_value=0)
pv_table

In [None]:
pv_table.reset_index()

Let's plot just the 10 most frequently mentioned characters

In [None]:
person_df['text'].value_counts().head(10) # first find the 10 most frequently mentioned characters

In [None]:
ten_freq_people = person_df['text'].value_counts().index[:10] # Lets get their names
ten_freq_people

In [None]:
pv_table['label'][ten_freq_people].plot(kind='line') # Query the pivot table and then plot the result 

Let's make subplots as well.