In [1]:
from __future__ import print_function
import urllib.request, json, urllib.parse , requests, sys
import pandas as pd
import random

## 1. Data Gathering

### 1.1 Extract Speeches

In [2]:
# Presidents whose speeches are assigned to the Republican group.
# Membership is tested with an exact string match against the "president"
# field of each speech record, so spelling must match the API's names.
# NOTE(review): Abraham Lincoln is not in this list — confirm whether that
# omission is intentional.
republican_list = [
    'Chester A. Arthur',
    'George H. W. Bush',
    'George W. Bush',
    'Calvin Coolidge',
    'Dwight D. Eisenhower',
    'Gerald Ford',
    'James A. Garfield',
    'Ulysses S. Grant',
    'Warren G. Harding',
    'Benjamin Harrison',
    'Rutherford B. Hayes',
    'Herbert Hoover',
    'William McKinley',
    'Richard M. Nixon',
    'Ronald Reagan',
    'Theodore Roosevelt',
    'William Taft',
    'Donald Trump'
]

# Presidents whose speeches are assigned to the Democratic group (same exact
# string match). A president who appears in neither list is left out of both
# party frames.
democratic_list = [
    'Joe Biden',
    'James Buchanan',
    'Jimmy Carter',
    'Grover Cleveland',
    'Bill Clinton',
    'Andrew Jackson',
    'Andrew Johnson',
    'Lyndon B. Johnson',
    'John F. Kennedy',
    'Barack Obama',
    'Franklin Pierce',
    'James K. Polk',
    'Franklin D. Roosevelt',
    'Harry S. Truman',
    'Martin Van Buren',
    'Woodrow Wilson'
]

In [3]:
endpoint = "https://api.millercenter.org/speeches"

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would hang the notebook indefinitely.
REQUEST_TIMEOUT = 30

# First page of results.
r = requests.post(url=endpoint, timeout=REQUEST_TIMEOUT)
r.raise_for_status()  # surface HTTP errors here instead of failing in .json()
data = r.json()
party_speeches = list(data["Items"])

# The API is paginated: as long as a response carries 'LastEvaluatedKey',
# send its president/doc_name back as the continuation parameters to get the
# next page, and accumulate every page's "Items".
while 'LastEvaluatedKey' in data:
    parameters = {"continue_president": data['LastEvaluatedKey']['president'],
                  "continue_doc_name": data['LastEvaluatedKey']['doc_name']}
    r = requests.post(url=endpoint, params=parameters, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    data = r.json()
    party_speeches += data["Items"]


### 1.2 Stratify and Random Sample for each Party

In [4]:
# Build a DataFrame from the accumulated speech records.
party_speeches_df = pd.DataFrame(party_speeches)
# Preview the first rows to check the columns.
party_speeches_df.head()

Unnamed: 0,doc_name,date,transcript,president,title
0,april-18-1977-address-nation-energy,1977-04-18,Good evening.\r\nTonight I want to have an unp...,Jimmy Carter,"April 18, 1977: Address to the Nation on Energy"
1,april-25-1980-statement-iran-rescue-mission,1980-04-25,"Late yesterday, I cancelled a carefully planne...",Jimmy Carter,"April 25, 1980: Statement on the Iran Rescue M..."
2,august-14-1980-acceptance-speech-democratic-na...,1980-08-14,"Fellow Democrats, fellow citizens:\r\n\r\nI th...",Jimmy Carter,"August 14, 1980: Acceptance Speech at the Demo..."
3,december-15-1978-speech-establishing-diplomati...,1978-12-15,Good evening.\r\nI would like to read a joint ...,Jimmy Carter,"December 15, 1978: Speech on Establishing Dipl..."
4,february-2-1977-report-american-people-energy,1977-02-02,Good evening.\r\nTomorrow will be two weeks si...,Jimmy Carter,"February 2, 1977: Report to the American Peopl..."


In [5]:
# Split the speeches by party with boolean masks instead of appending one row
# at a time: DataFrame.append was removed in pandas 2.0 and re-copies the
# whole frame on every call.
republican_mask = party_speeches_df["president"].isin(republican_list)
# Mirror the original if/elif: a speech already classified as Republican is
# never also classified as Democratic.
democratic_mask = party_speeches_df["president"].isin(democratic_list) & ~republican_mask

# Renumber rows 0..n-1 so positions and index labels coincide in each frame.
republican_speeches_df = party_speeches_df[republican_mask].reset_index(drop=True)
democratic_speeches_df = party_speeches_df[democratic_mask].reset_index(drop=True)

In [8]:
# Number of speeches drawn (without replacement) from each party.
SAMPLE_SIZE = 50

# Both frames are indexed 0..len-1, so range(len(df)) covers every row.
# The previous range(len(df) - 1) silently excluded the last speech of each
# party from ever being sampled.
# random.sample raises ValueError if a party has fewer than SAMPLE_SIZE rows.
# NOTE(review): no random.seed(...) is set in the visible cells, so a fresh
# run draws a different sample — consider seeding for reproducibility.
republican_rand_indices = random.sample(range(len(republican_speeches_df)), SAMPLE_SIZE)
democratic_rand_indices = random.sample(range(len(democratic_speeches_df)), SAMPLE_SIZE)

In [9]:
# Keep only the sampled rows of each party. One vectorised index mask
# replaces dropping rows one by one from the frame being iterated, which
# re-scanned the index list for every row. Row order and the original index
# labels are preserved; .copy() gives an independent frame that is safe to
# modify later.
republican_speeches_df = republican_speeches_df[
    republican_speeches_df.index.isin(republican_rand_indices)
].copy()

democratic_speeches_df = democratic_speeches_df[
    democratic_speeches_df.index.isin(democratic_rand_indices)
].copy()

In [12]:
# Take a copy of the sampled Democratic speeches so that later changes to
# rs_democratic_speeches_df do not alter democratic_speeches_df.
rs_democratic_speeches_df = democratic_speeches_df.copy()
# Preview the first three sampled rows.
rs_democratic_speeches_df.head(3)

Unnamed: 0,doc_name,date,transcript,president,title
2,august-14-1980-acceptance-speech-democratic-na...,1980-08-14,"Fellow Democrats, fellow citizens:\r\n\r\nI th...",Jimmy Carter,"August 14, 1980: Acceptance Speech at the Demo..."
7,january-20-1977-inaugural-address,1977-01-20,"For myself and for our Nation, I want to thank...",Jimmy Carter,"January 20, 1977: Inaugural Address"
13,may-22-1977-university-notre-dame-commencement,1977-05-22,To Father Hesburgh and the great faculty of No...,Jimmy Carter,"May 22, 1977: University of Notre Dame Commenc..."


In [13]:
# Take a copy of the sampled Republican speeches so that later changes to
# rs_republican_speeches_df do not alter republican_speeches_df.
rs_republican_speeches_df = republican_speeches_df.copy()
# Preview the first three sampled rows.
rs_republican_speeches_df.head(3)

Unnamed: 0,doc_name,date,transcript,president,title
4,december-5-1898-second-annual-message,1898-12-05,To the Senate and House of Representatives: No...,William McKinley,"December 5, 1898: Second Annual Message"
21,august-23-1984-republican-national-convention,1984-08-23,"Mr. Chairman, Mr. Vice President, delegates to...",Ronald Reagan,"August 23, 1984: Republican National Convention"
25,december-23-1981-address-nation-christmas-and-...,1981-12-23,"Good evening.\r\n\r\nAt Christmas time, every ...",Ronald Reagan,"December 23, 1981: Address to the Nation on Ch..."


## 2. NLP Analysis

### 2.1 Preprocessing

In [14]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import nltk
from nltk.stem import WordNetLemmatizer

In [15]:
# ASCII punctuation plus the typographic apostrophe (U+2019), as one string.
punctuation_strings = string.punctuation+("’")
# spaCy's English stop-word set, materialised as a list.
spacy_stop_words = list(STOP_WORDS)
# English language object created without loading a trained model.
nlp = English()
# Trained small English pipeline loaded from the "en_core_web_sm" package.
nlp_lemm = spacy.load("en_core_web_sm")

In [16]:
# Remove stop words from text and calculate average # of words per sentence.
def clean_text(text):
    """Lemmatize ``text`` and keep only alphabetic, non-stop-word tokens.

    The text is split into sentences with ``nltk.sent_tokenize``; each sentence
    is processed by the ``nlp_lemm`` spaCy pipeline and every kept token is
    replaced by its lower-cased lemma.

    Args:
        text: Raw transcript. Anything that is not a non-blank string
            (``None``, ``NaN``, ``""``, whitespace only) is treated as empty
            instead of raising.

    Returns:
        dict with two keys:
            "Word Tokens": flat list of the kept lower-cased lemmas, in order.
            "Average Sentence Length": number of kept tokens divided by the
                number of sentences. It is computed after filtering, so stop
                words and non-alphabetic tokens do not count. ``0.0`` when
                there are no sentences.
    """
    # Missing or blank transcripts have no sentences; returning early avoids
    # the division by zero below.
    if not isinstance(text, str) or not text.strip():
        return {"Word Tokens": [], "Average Sentence Length": 0.0}

    tokenized_sentences = nltk.sent_tokenize(text)
    # Set membership is O(1); the module-level stop-word collection is a list.
    stop_words = set(spacy_stop_words)

    word_tokens = []
    for tokenized_sentence in tokenized_sentences:
        for word in nlp_lemm(tokenized_sentence):
            token_text = word.text
            # isalpha() already rejects punctuation, digits and whitespace
            # tokens, so no separate punctuation/digit checks are needed.
            if not token_text.isalpha():
                continue
            if token_text.lower() in stop_words or nlp.vocab[token_text].is_stop:
                continue
            word_tokens.append(word.lemma_.lower())

    sentence_count = len(tokenized_sentences)
    average_length = len(word_tokens) / sentence_count if sentence_count else 0.0
    return {"Word Tokens": word_tokens, "Average Sentence Length": average_length}

In [17]:
def clean_df_text(df):
    """Add cleaned-token and sentence-length columns to ``df`` in place.

    Runs ``clean_text`` on every value of the "transcript" column and stores
    the results in two new columns:
        "clean transcript": the list of word tokens for that row.
        "avg_sentence_length": the average sentence length for that row.

    The frame is modified directly; nothing is returned.
    """
    results = [clean_text(transcript) for transcript in df["transcript"]]
    df["clean transcript"] = [result["Word Tokens"] for result in results]
    df["avg_sentence_length"] = [result["Average Sentence Length"] for result in results]

In [18]:
# Add the "clean transcript" and "avg_sentence_length" columns to both
# sampled frames (clean_df_text modifies its argument in place).
clean_df_text(rs_republican_speeches_df)
clean_df_text(rs_democratic_speeches_df)

In [19]:
# Display the sampled Democratic speeches together with the new columns.
rs_democratic_speeches_df

Unnamed: 0,doc_name,date,transcript,president,title,clean transcript,avg_sentence_length
2,august-14-1980-acceptance-speech-democratic-na...,1980-08-14,"Fellow Democrats, fellow citizens:\r\n\r\nI th...",Jimmy Carter,"August 14, 1980: Acceptance Speech at the Demo...","[fellow, democrats, fellow, citizen, thank, no...",8.079681
7,january-20-1977-inaugural-address,1977-01-20,"For myself and for our Nation, I want to thank...",Jimmy Carter,"January 20, 1977: Inaugural Address","[nation, want, thank, predecessor, heal, land,...",9.711538
13,may-22-1977-university-notre-dame-commencement,1977-05-22,To Father Hesburgh and the great faculty of No...,Jimmy Carter,"May 22, 1977: University of Notre Dame Commenc...","[father, hesburgh, great, faculty, notre, dame...",8.813665
23,april-16-1945-first-speech-congress,1945-04-16,"Mr. Speaker, Mr. President, Members of the Con...",Harry S. Truman,"April 16, 1945: First Speech to Congress","[speaker, president, member, congress, heavy, ...",7.452174
26,january-15-1953-farewell-address,1953-01-15,My fellow Americans:\r\nI am happy to have thi...,Harry S. Truman,"January 15, 1953: Farewell Address","[fellow, americans, happy, opportunity, talk, ...",6.980676
51,february-24-2009-address-joint-session-congress,2009-02-24,"Madam Speaker, Mr. Vice President, Members of ...",Barack Obama,"February 24, 2009: Address Before a Joint Sess...","[madam, speaker, vice, president, members, con...",9.073171
56,january-20-2009-inaugural-address,2009-01-20,I stand here today humbled by the task before ...,Barack Obama,"January 20, 2009: Inaugural Address","[stand, today, humble, task, grateful, trust, ...",9.449541
58,january-21-2013-second-inaugural-address,2013-01-21,"\r\nVice President Biden, Mr. Chief Justice,\r...",Barack Obama,"January 21, 2013: Second Inaugural Address","[vice, president, biden, chief, justice, membe...",8.409091
60,january-25-2011-2011-state-union-address,2011-01-25,"Mr. Speaker, Mr. Vice President, members of Co...",Barack Obama,"January 25, 2011: 2011 State of the Union Address","[speaker, vice, president, member, congress, d...",6.395445
64,january-29-2013-remarks-immigration-reform,2013-01-29,Thank you! (Applause.) Thank you! Thank you...,Barack Obama,"January 29, 2013: Remarks on Immigration Reform","[thank, applause, thank, thank, applause, good...",5.131474
