In [1]:
from datetime import datetime
import pandas as pd
import pickle
import os

from collections import Counter
from tabulate import tabulate

import spacy

from spacy.lang.en import English

# Load the 'en_core_web_lg' spaCy pipeline once; later cells call `nlp(text)`
# to turn raw speech text into parsed documents.
nlp = spacy.load('en_core_web_lg')


In [2]:
# One-time setup: uncomment and run this line if the 'en_core_web_lg' model
# loaded by spacy.load(...) in the first cell is not installed yet.
#!python -m spacy download en_core_web_lg

In [3]:
# The corpus is expected at <current working directory>/Output/president_speeches.pkl.
root_dir = os.getcwd()
output_path = os.path.join(root_dir, "Output")

file_name = os.path.join(output_path,"president_speeches.pkl")

# NOTE: unpickling can execute arbitrary code, so only load a pickle file that
# you produced yourself or otherwise trust.
df = pd.read_pickle(file_name)
# Preview the first rows (filename, president, title, pub_date, speech).
df.head()

Unnamed: 0,filename,president,title,pub_date,speech
0,adams_speeches_000.txt,adams,Special Session Message to Congress,"May 16, 1797",The personal inconveniences to the members of ...
1,adams_speeches_001.txt,adams,Inaugural Address,"March 4, 1797","When it was first perceived, in early times, t..."
2,adams_speeches_002.txt,adams,Second Annual Message,"December 8, 1798",Gentlemen of the Senate and Gentlemen of the H...
3,adams_speeches_003.txt,adams,"Proclamation of Day of Fasting, Humiliation an...","March 23, 1798",As the safety and prosperity of nations ultima...
4,adams_speeches_004.txt,adams,Third Annual Message,"December 3, 1799",It is with peculiar satisfaction that I meet t...


In [4]:
# Per-column summary of the speech table (count / unique / top / freq).
df.describe()

Unnamed: 0,filename,president,title,pub_date,speech
count,962,962,962,962,962
unique,962,43,690,948,962
top,grant_speeches_031.txt,lbjohnson,State of the Union Address,"July 22, 1920","Thank you, Governor. Thank you all very much f..."
freq,1,71,40,5,1


In [5]:
# Size of the full speech-by-speech grid: n * n ordered pairs, which includes
# each speech paired with itself (unique unordered pairs would be n*(n-1)/2).
# NOTE(review): "speechs" in the message is a typo for "speeches"; it is left
# untouched here so the recorded output below still matches the code.
print(
    "There are {:,} possible combinations from {} speechs".format(
        len(df) ** 2,
        len(df),
    )
)

There are 925,444 possible combinations from 962 speechs


In [6]:
# Tally how many speeches each president contributes to the corpus.
pres_counter = Counter(df['president'])

# Plain-text table of the 20 presidents with the most speeches.
print(
    tabulate(
        pres_counter.most_common(20),
        headers=['Pres', 'Speech Count'],
    )
)

Pres           Speech Count
-----------  --------------
lbjohnson                71
reagan                   59
fdroosevelt              49
obama                    48
kennedy                  45
clinton                  39
gwbush                   39
grant                    32
wilson                   32
cleveland                31
johnson                  31
hoover                   29
jackson                  26
polk                     25
jefferson                24
bush                     23
nixon                    23
carter                   22
madison                  22
roosevelt                22


In [7]:
# Count how often each speech title occurs, to see which titles
# (e.g. the various 'Inaugural' ones) are shared across presidents.
title_counter = Counter(df['title'])

# Plain-text table of the 10 most frequent titles.
print(
    tabulate(
        title_counter.most_common(10),
        headers=['Title', 'Speech Count'],
    )
)

Title                         Speech Count
--------------------------  --------------
State of the Union Address              40
First Annual Message                    25
Inaugural Address                       22
Second Annual Message                   22
Third Annual Message                    21
Fourth Annual Message                   19
First Inaugural Address                 15
Second Inaugural Address                15
Press Conference                        10
Farewell Address                         7


In [8]:
# Rows whose title is exactly 'Inaugural Address' (titles such as
# 'First Inaugural Address' are not matched by this equality test).
df.loc[df['title'] == 'Inaugural Address']

Unnamed: 0,filename,president,title,pub_date,speech
1,adams_speeches_001.txt,adams,Inaugural Address,"March 4, 1797","When it was first perceived, in early times, t..."
23,bharrison_speeches_003.txt,bharrison,Inaugural Address,"March 4, 1889",\nFellow-Citizens:\n\nThere is no constitution...
37,buchanan_speeches_001.txt,buchanan,Inaugural Address,"March 4, 1857",\nFellow-Citizens:\n\nI appear before you this...
51,bush_speeches_001.txt,bush,Inaugural Address,"January 20, 1989","Mr. Chief Justice, Mr. President, Vice Preside..."
77,carter_speeches_004.txt,carter,Inaugural Address,"January 20, 1977","For myself and for our Nation, I want to thank..."
168,coolidge_speeches_003.txt,coolidge,Inaugural Address,"March 4, 1925",\nMy Countrymen:\n\nNo one can contemplate cur...
253,garfield_speeches_000.txt,garfield,Inaugural Address,"March 4, 1881",Fellow-Citizens:\nWe stand to-day upon an emin...
325,harding_speeches_000.txt,harding,Inaugural Address,"March 4, 1921",\nMy Countrymen:\n\nWhen one surveys the world...
343,harrison_speeches_000.txt,harrison,Inaugural Address,"March 4, 1841",Called from a retirement which I had supposed ...
346,hayes_speeches_002.txt,hayes,Inaugural Address,"March 5, 1877",\nFellow-Citizens:\n\nWe have assembled to rep...


In [9]:
def inaugural_address(president):
    """Return the text of a president's speech titled exactly 'Inaugural Address'.

    The lookup runs against the notebook-level ``df`` and filters on the
    ``president`` and ``title`` columns. Titles are compared for exact
    equality, so rows titled 'First Inaugural Address' or
    'Second Inaugural Address' are not matched.

    Parameters:
        president: value to match in the ``president`` column (e.g. 'bush').

    Returns:
        The ``speech`` value of the first matching row.

    Raises:
        IndexError: if no row matches. This is the same exception type the
            bare ``.values[0]`` lookup raised, but the message now names the
            president that was not found.
    """
    matches = df.loc[
        (df['president'] == president) & (df['title'] == 'Inaugural Address'),
        'speech',
    ]
    if matches.empty:
        raise IndexError(
            "No speech titled 'Inaugural Address' found for president {!r}".format(president)
        )
    # Keep the first match when several rows qualify.
    return matches.values[0]


bush_inaug = inaugural_address('bush')

obama_inaug = inaugural_address('obama')

vanburen_inaug = inaugural_address('vanburen')


In [10]:
# Run each inaugural address through the loaded spaCy pipeline; the resulting
# documents are what the similarity checks below operate on.
bush_doc, obama_doc, vanburen_doc = (
    nlp(text) for text in (bush_inaug, obama_inaug, vanburen_inaug)
)

#### Question
What is the semantic textual similarity between two documents?

In [11]:
# Similarity score between the Bush and Obama inaugural addresses.
# NOTE(review): spaCy documents the default Doc.similarity as a cosine
# similarity over averaged token vectors, which would explain why every pair
# of long speeches in this notebook scores close to 1.0 — confirm before
# reading much into small differences between scores.
bush_doc.similarity(obama_doc)

0.99559560656699

In [12]:
# Similarity score between the Bush and Van Buren inaugural addresses.
bush_doc.similarity(vanburen_doc)

0.982152854157184

In [13]:
# Rows for the speeches that share the Watergate-investigations title.
df.loc[df['title'] == 'Address to the Nation About the Watergate Investigations']

Unnamed: 0,filename,president,title,pub_date,speech
671,nixon_speeches_016.txt,nixon,Address to the Nation About the Watergate Inve...,"August 15, 1973",Good evening:\nNow that most of the major witn...
672,nixon_speeches_017.txt,nixon,Address to the Nation About the Watergate Inve...,"April 30, 1973",Good evening:\nI want to talk to you tonight f...


In [14]:
# Pull the two Watergate addresses by filename: 017 is the earlier speech
# (April 30, 1973) and 016 the later one (August 15, 1973).
nixon_before = df.loc[df['filename'] == 'nixon_speeches_017.txt', 'speech'].values[0]
nixon_after = df.loc[df['filename'] == 'nixon_speeches_016.txt', 'speech'].values[0]

# Parse both texts with the spaCy pipeline (earlier speech first).
nixon_before_doc, nixon_after_doc = nlp(nixon_before), nlp(nixon_after)

# Similarity score between the later and the earlier address.
nixon_after_doc.similarity(nixon_before_doc)

0.9985841489396466

In [15]:
# Similarity score between the later Watergate address and Van Buren's
# inaugural address (presumably a cross-topic baseline — confirm intent).
nixon_after_doc.similarity(vanburen_doc)

0.989283579814654

#### Question
Where do we go from here?

Should we compare every sentence from both speeches?