# Presidential Speech similarity (cont)

In [1]:
from datetime import datetime
import pandas as pd
import pickle
import os

from collections import Counter
from tabulate import tabulate


In [2]:
#!python -m spacy download en_core_web_sm
#!python -m spacy download en_core_web_lg

In [3]:
#Load our pickle
root_dir = os.getcwd()

output_path = os.path.join(root_dir, "Output")

file_name = os.path.join(output_path,"president_speeches.pkl")

df = pd.read_pickle(file_name)

In [4]:
df.head()

Unnamed: 0,filename,president,title,pub_date,speech
0,adams_speeches_000.txt,adams,Special Session Message to Congress,"May 16, 1797",The personal inconveniences to the members of ...
1,adams_speeches_001.txt,adams,Inaugural Address,"March 4, 1797","When it was first perceived, in early times, t..."
2,adams_speeches_002.txt,adams,Second Annual Message,"December 8, 1798",Gentlemen of the Senate and Gentlemen of the H...
3,adams_speeches_003.txt,adams,"Proclamation of Day of Fasting, Humiliation an...","March 23, 1798",As the safety and prosperity of nations ultima...
4,adams_speeches_004.txt,adams,Third Annual Message,"December 3, 1799",It is with peculiar satisfaction that I meet t...


In [5]:
df.title

0                    Special Session Message to Congress
1                                      Inaugural Address
2                                  Second Annual Message
3      Proclamation of Day of Fasting, Humiliation an...
4                                   Third Annual Message
                             ...                        
957                       Wilson&#8217;s Fourteen Points
958                                 Sixth Annual Message
959                                 Fifth Annual Message
960                               Seventh Annual Message
961                                Eighth Annual Message
Name: title, Length: 962, dtype: object

In [6]:
import string
import spacy

from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

nlp = spacy.load('en_core_web_lg')  #<< switch to the 'large' model with embeddings


## Digging deeper on named entities NER

While it is possible to make/add custom entity lists, we'll not do that today.

https://towardsdatascience.com/custom-named-entity-recognition-using-spacy-7140ebbb3718


In [7]:
#pick a speech and look at some entities ()
print(df.pub_date[5])

speech = nlp(df.speech[5])

entities=[(token, token.label_, token.label) for token in speech.ents]  

entities[0:5]


November 22, 1797


[(Philadelphia, 'GPE', 384),
 (the National Legislature, 'ORG', 383),
 (Congress, 'ORG', 383),
 (Congress, 'ORG', 383),
 (Constitution, 'LAW', 390)]

In [24]:
spacy.explain("GPE")  #aka geo-political entities

'Countries, cities, states'

In [9]:
from spacy import displacy

#spaCy.displacy helps visualize embeddings and such
displacy.render(speech, style = "ent", jupyter = True)

In [10]:
#select one speech (Presidend Adams Inagural Speech)
adams_inaug = df[ (df['president'] == 'adams') & (df['title'] == 'Inaugural Address') ]["speech"].values[0]

adams_inaug

"When it was first perceived, in early times, that no middle course for America remained between unlimited submission to a foreign legislature and a total independence of its claims, men of reflection were less apprehensive of danger from the formidable power of fleets and armies they must determine to resist than from those contests and dissensions which would certainly arise concerning the forms of government to be instituted over the whole and over the parts of this extensive country. Relying, however, on the purity of their intentions, the justice of their cause, and the integrity and intelligence of the people, under an overruling Providence which had so signally protected this country from the first, the representatives of this nation, then consisting of little more than half its present number, not only broke to pieces the chains which were forging and the rod of iron that was lifted up, but frankly cut asunder the ties which had bound them, and launched into an ocean of uncerta

In [11]:
#pick a speech
adams_inaug = df[ (df['president'] == 'adams') & (df['title'] == 'Inaugural Address') ]["speech"].values[0]

#nlp process the speech
speech = nlp(adams_inaug)


In [12]:
#what type of entities are in this speech?
entity_types = ((ent.text, ent.label_) for ent in speech.ents)

print(tabulate(entity_types, headers=['Entity', 'Entity Type']))

Entity                                 Entity Type
-------------------------------------  -------------
first                                  ORDINAL
America                                GPE
Providence                             GPE
first                                  ORDINAL
more than half                         CARDINAL
the Revolutionary war                  EVENT
Confederation                          NORP
Batavian                               NORP
Helvetic                               ORG
a single day                           DATE
Congress                               ORG
States                                 GPE
States                                 GPE
America                                GPE
happy Constitution of Government       LAW
first                                  ORDINAL
the Constitution of the United States  LAW
States                                 GPE
Senate                                 ORG
Congress                               ORG
State         

#### Question

Are there any tokens/words that are not part of the spacy model vocabulary from that speech?

In [13]:
outs = []          #OOV is 'out of vocabulary'

for token in speech:

    if  token.is_oov:
        outs.append(token)    

print(outs)

[number, consequences?universal, unexampled, meliorate, engraven]



#### Question

How can we use all those named entities?  

How can we use differnet parts of speech?

In [14]:
#                            LEMMA                             POS = NOUN
noun_counter = Counter(token.lemma_ for token in speech if token.pos_ == 'NOUN')

print(tabulate(noun_counter.most_common(10), headers=['Noun Lemmas', 'Count']))


Noun Lemmas      Count
-------------  -------
people              20
nation              20
government          11
country             10
honor                7
power                6
mind                 6
citizen              6
justice              5
year                 5


#### Question
How many words in the spacy model embeddings vector?

In [15]:
print("There are {:,} word embeddings in this model.".format((speech.vocab.vectors.size)))

There are 205,449,300 word embeddings in this model.


In [16]:
print(spacy.__version__)

2.2.1


### JSON 

In [17]:
#nlp keeps a lot of info on the document and can output that in json (easier to read)
import json

json_s = speech.to_json()   #not available in spacy < 2.2

with open("SpeechData.json", "w") as speech_data_file:
    json.dump(json_s, speech_data_file, indent=4) 
    


#### Question
How many sentences are in this speech?

In [18]:
print("There are {} senteneces in this speech".format(len(list(speech.sents))))


There are 37 senteneces in this speech


#### Question
How many Part of Speech (POS) tags are in this speech?  What are they?

In [19]:
speech_tags = []

for s in speech.sents:
    for token in s:
        speech_tags.append(str(token.pos_))

        
tag_counter = Counter(t for t in speech_tags)

print(tabulate(tag_counter.most_common(15), headers=['POS (token.pos_)','Count']))

POS (token.pos_)      Count
------------------  -------
NOUN                    544
ADP                     355
DET                     308
PUNCT                   261
ADJ                     216
PRON                    179
VERB                    165
CCONJ                   164
AUX                     132
ADV                      73
PROPN                    65
SCONJ                    60
PART                     56
SPACE                    14
NUM                       4


#### Question
Spacy has several POS attributes.  What is the differnce between the __.pos___ and __.tag___?

In [20]:
#Tabulate parts of speech

speech_tags = []

for s in speech.sents:
    for token in s:
        speech_tags.append(str(token.tag_))

        
tag_counter = Counter(t for t in speech_tags)

print(tabulate(tag_counter.most_common(15), headers=['POS (token.tag_)','Count']))

POS (token.tag_)      Count
------------------  -------
IN                      413
NN                      380
DT                      302
JJ                      210
,                       202
NNS                     168
CC                      164
VB                       88
PRP                      76
VBN                      72
RB                       70
PRP$                     61
NNP                      60
TO                       39
MD                       38


#### Question
How many verbs are in this speech?

In [21]:
from collections import Counter
from tabulate import tabulate

verb_counter = Counter(token.lemma_ for token in speech if token.pos_ == 'VERB')

print(tabulate(verb_counter.most_common(5), headers=['Verbs', 'Count']))

Verbs        Count
---------  -------
support          4
feel             3
consider         3
form             3
establish        3


# Semantic Textual Similarity


The default estimate is cosine similarity using an average of word vectors. [spaCy]

In [22]:
#Just a quick re-cap on how to compare two documents
doc  = nlp("Here is some text to encode.")
doc2 = nlp("Here are some words to encode.")
doc3 = nlp("Encoding words is what this does!")
doc4 = nlp("This workshop is boring because there is no MATLAB code!")
doc5 = nlp("My car needs gas.")
doc6 = nlp("Space is the final frontier.")

print(doc.similarity(doc2))
print(doc.similarity(doc3))
print(doc2.similarity(doc3))
print(doc.similarity(doc4))
print(doc.similarity(doc5))
print(doc.similarity(doc6))



0.9446589176740247
0.8688087578391293
0.8495116864187342
0.8182720256258303
0.6536331310895203
0.7311156590042915


__References:__

SCIBERT: A Pretrained Language Model for Scientific Text
https://arxiv.org/pdf/1903.10676.pdf

Text Similarity Estimation Based on Word Embeddings and Matrix
Norms for Targeted Marketing https://www.aclweb.org/anthology/N19-1181.pdf