In [25]:
from __future__ import print_function
import urllib.request, json, urllib.parse , requests, sys
import pandas as pd
import random

## 1. Data Gathering

### 1.1 Extract Speeches

In [14]:
# Presidents whose speeches are assigned to the Republican group.
# Membership is tested with an exact string match against each speech's
# "president" value, so every spelling here must match that field exactly.
# Speeches by presidents that appear in neither list are left out of both groups.
# NOTE(review): Abraham Lincoln is not in this list — confirm whether the
# omission is intentional.
republican_list = [
    'Chester A. Arthur',
    'George H. W. Bush',
    'George W. Bush',
    'Calvin Coolidge',
    'Dwight D. Eisenhower',
    'Gerald Ford',
    'James A. Garfield',
    'Ulysses S. Grant',
    'Warren G. Harding',
    'Benjamin Harrison',
    'Rutherford B. Hayes',
    'Herbert Hoover',
    'William McKinley',
    'Richard M. Nixon',
    'Ronald Reagan',
    'Theodore Roosevelt',
    'William Taft',
    'Donald Trump'
]

# Presidents whose speeches are assigned to the Democratic group
# (same exact-match rule as the Republican list).
democratic_list = [
    'Joe Biden',
    'James Buchanan',
    'Jimmy Carter',
    'Grover Cleveland',
    'Bill Clinton',
    'Andrew Jackson',
    'Andrew Johnson',
    'Lyndon B. Johnson',
    'John F. Kennedy',
    'Barack Obama',
    'Franklin Pierce',
    'James K. Polk',
    'Franklin D. Roosevelt',
    'Harry S. Truman',
    'Martin Van Buren',
    'Woodrow Wilson'
]

In [103]:
# Download every speech record from the Miller Center API.
# The endpoint is paginated: while a response carries 'LastEvaluatedKey', its
# 'president' and 'doc_name' values are sent back as the continuation
# parameters of the next request.
endpoint = "https://api.millercenter.org/speeches"
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout, a stalled connection hangs the cell forever

r = requests.post(url=endpoint, timeout=REQUEST_TIMEOUT_SECONDS)
r.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error body
data = r.json()
# Copy the first page so later pages do not mutate the list inside `data`.
party_speeches = list(data["Items"])

while 'LastEvaluatedKey' in data:
    parameters = {"continue_president": data['LastEvaluatedKey']['president'],
                  "continue_doc_name": data['LastEvaluatedKey']['doc_name']}
    r = requests.post(url=endpoint, params=parameters, timeout=REQUEST_TIMEOUT_SECONDS)
    r.raise_for_status()
    data = r.json()
    party_speeches.extend(data["Items"])


### 1.2 Stratify by Party and Randomly Sample Each Group

In [204]:
# Put all fetched speech records into a single table and preview the first rows.
party_speeches_df = pd.DataFrame(data=party_speeches)
party_speeches_df.head(n=5)

Unnamed: 0,doc_name,date,transcript,president,title
0,april-18-1977-address-nation-energy,1977-04-18,Good evening.\r\nTonight I want to have an unp...,Jimmy Carter,"April 18, 1977: Address to the Nation on Energy"
1,april-25-1980-statement-iran-rescue-mission,1980-04-25,"Late yesterday, I cancelled a carefully planne...",Jimmy Carter,"April 25, 1980: Statement on the Iran Rescue M..."
2,august-14-1980-acceptance-speech-democratic-na...,1980-08-14,"Fellow Democrats, fellow citizens:\r\n\r\nI th...",Jimmy Carter,"August 14, 1980: Acceptance Speech at the Demo..."
3,december-15-1978-speech-establishing-diplomati...,1978-12-15,Good evening.\r\nI would like to read a joint ...,Jimmy Carter,"December 15, 1978: Speech on Establishing Dipl..."
4,february-2-1977-report-american-people-energy,1977-02-02,Good evening.\r\nTomorrow will be two weeks si...,Jimmy Carter,"February 2, 1977: Report to the American Peopl..."
...,...,...,...,...,...
1033,december-4-1883-third-annual-message,1883-12-04,To the Congress of the United States: At the t...,Chester A. Arthur,"December 4, 1883: Third Annual Message"
1034,december-6-1881-first-annual-message,1881-12-06,To the Senate and House of Representatives of ...,Chester A. Arthur,"December 6, 1881: First Annual Message"
1035,july-1-1882-veto-safety-regulations-bill,1882-07-01,To the House of Representatives of the United ...,Chester A. Arthur,"July 1, 1882: Veto of Safety Regulations Bill"
1036,july-1-1884-message-regarding-settlement-india...,1884-07-01,By the President of the United States of Ameri...,Chester A. Arthur,"July 1, 1884: Message Regarding Settlement on ..."


In [217]:
# Split the speeches by party with boolean masks instead of appending row by
# row: DataFrame.append was removed in pandas 2.0, and growing a frame one row
# at a time copies it on every iteration.
is_republican = party_speeches_df["president"].isin(republican_list)
# Exclude Republican matches so a name present in both lists is counted once,
# matching if/elif precedence (Republican first).
is_democratic = party_speeches_df["president"].isin(democratic_list) & ~is_republican

# Speeches by presidents in neither list are left out of both frames.
# The index is reset so labels run 0..n-1 within each party.
republican_speeches_df = party_speeches_df[is_republican].reset_index(drop=True)
democratic_speeches_df = party_speeches_df[is_democratic].reset_index(drop=True)

In [219]:
# Draw the row positions to keep for each party, without replacement.
SAMPLE_SIZE = 50  # speeches drawn per party
SAMPLE_SEED = 42  # fixed so a fresh kernel reproduces the same sample

# A dedicated generator keeps the draw reproducible without touching the
# global `random` state.
sampler = random.Random(SAMPLE_SEED)

# The sample size is capped at the number of available speeches: a rejection
# loop that waits for 50 distinct indices never terminates on a smaller frame.
republican_rand_indices = sampler.sample(
    range(len(republican_speeches_df)),
    min(SAMPLE_SIZE, len(republican_speeches_df)),
)
democratic_rand_indices = sampler.sample(
    range(len(democratic_speeches_df)),
    min(SAMPLE_SIZE, len(democratic_speeches_df)),
)

In [220]:
# Keep only the sampled rows of each party. A single mask selection replaces
# dropping rows one at a time from a frame that is being iterated over.
# The original index labels are kept on the surviving rows.
republican_speeches_df = republican_speeches_df.loc[
    republican_speeches_df.index.isin(republican_rand_indices)
]
democratic_speeches_df = democratic_speeches_df.loc[
    democratic_speeches_df.index.isin(democratic_rand_indices)
]

In [275]:
# Work on an independent copy of the sampled Democratic speeches and preview it.
rs_democratic_speeches_df = democratic_speeches_df.copy(deep=True)
rs_democratic_speeches_df.head(n=3)

Unnamed: 0,doc_name,date,transcript,president,title
10,january-4-1980-speech-afghanistan,1980-01-04,\r\n\r\nI come to you this evening to discuss...,Jimmy Carter,"January 4, 1980: Speech on Afghanistan"
27,january-20-1949-inaugural-address,1949-01-20,"Mr. Vice President, Mr. Chief Justice, fellow ...",Harry S. Truman,"January 20, 1949: Inaugural Address"
32,march-15-1952-columbia-scholastic-press-associ...,1952-03-15,"Dr. Murphy, distinguished guests, Mr. Mayor, d...",Harry S. Truman,"March 15, 1952: Columbia Scholastic Press Asso..."


In [276]:
# Work on an independent copy of the sampled Republican speeches and preview it.
rs_republican_speeches_df = republican_speeches_df.copy(deep=True)
rs_republican_speeches_df.head(n=3)

Unnamed: 0,doc_name,date,transcript,president,title
2,april-23-1898-proclamation-calling-military-vo...,1898-04-23,By the President of the United States of Ameri...,William McKinley,"April 23, 1898: Proclamation Calling for Milit..."
8,july-24-1897-message-regarding-banking-and-cur...,1897-07-24,To the Congress of the United States:\nIn my m...,William McKinley,"July 24, 1897: Message Regarding Banking and C..."
9,march-15-1897-message-regarding-special-sessio...,1897-03-15,To the Congress of the United States:\nRegrett...,William McKinley,"March 15, 1897: Message Regarding Special Sess..."


## 2. NLP Analysis

### 2.1 Preprocessing

In [293]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import nltk
from nltk.stem import WordNetLemmatizer

In [295]:
# ASCII punctuation plus the typographic apostrophe, kept as one string.
punctuation_strings = "".join([string.punctuation, "’"])
# spaCy's English stop-word collection, materialised as a list.
spacy_stop_words = [*STOP_WORDS]
# Blank English pipeline; clean_text uses its vocab for `is_stop` lookups.
nlp = English()
# Loaded "en_core_web_sm" pipeline; clean_text uses it to tokenize and lemmatize.
nlp_lemm = spacy.load("en_core_web_sm")

In [306]:
def clean_text(text):
    """Tokenize, filter and lemmatize a transcript.

    The text is split into sentences with NLTK; each sentence is run through
    the spaCy pipeline ``nlp_lemm``. A token is kept only when it is purely
    alphabetic and is not a stop word (according to both ``spacy_stop_words``
    and the ``nlp`` vocab). Kept tokens are stored as lower-cased lemmas.

    Args:
        text: Raw transcript. Anything that is not a non-blank string
            (``None``, NaN, ``""``, whitespace only) is treated as empty.

    Returns:
        dict with two keys:
            "Word Tokens": list of lower-cased lemmas, in document order.
            "Average Sentence Length": number of *kept* tokens divided by the
                number of sentences (0.0 for empty input). Stop words and
                non-alphabetic tokens do not count towards this average.
    """
    empty_result = {"Word Tokens": [], "Average Sentence Length": 0.0}

    # Guard against missing or blank transcripts: they would otherwise raise a
    # TypeError in the tokenizer or a ZeroDivisionError in the average below.
    if not isinstance(text, str) or not text.strip():
        return empty_result

    tokenized_sentences = nltk.sent_tokenize(text)
    if not tokenized_sentences:
        return empty_result

    # Set membership is O(1); the module-level stop-word list is O(n) per lookup.
    stop_words = set(spacy_stop_words)

    word_tokens = []
    for tokenized_sentence in tokenized_sentences:
        for token in nlp_lemm(tokenized_sentence):
            raw = str(token)
            # isalpha() already rejects punctuation and digit tokens, so no
            # separate punctuation or digit checks are needed.
            if not raw.isalpha():
                continue
            if raw.lower() in stop_words or nlp.vocab[raw].is_stop:
                continue
            word_tokens.append(token.lemma_.lower())

    return {
        "Word Tokens": word_tokens,
        "Average Sentence Length": len(word_tokens) / len(tokenized_sentences),
    }

In [307]:
def clean_df_text(df):
    """Attach cleaned-token and sentence-length columns to ``df`` in place.

    Runs ``clean_text`` on the "transcript" value of every row and stores the
    results in two new columns:

    * "clean transcript": the "Word Tokens" list for that row.
    * "avg_sentence_length": the "Average Sentence Length" value for that row.

    The frame is modified in place; nothing is returned.
    """
    cleaned_rows = [clean_text(record["transcript"]) for _, record in df.iterrows()]
    df["clean transcript"] = [result["Word Tokens"] for result in cleaned_rows]
    df["avg_sentence_length"] = [
        result["Average Sentence Length"] for result in cleaned_rows
    ]

In [308]:
# Add the cleaned-token and sentence-length columns to both party samples (in place).
for party_sample_df in (rs_republican_speeches_df, rs_democratic_speeches_df):
    clean_df_text(party_sample_df)

In [311]:
# Display the Democratic sample, which now carries the "clean transcript" and
# "avg_sentence_length" columns added by clean_df_text.
rs_democratic_speeches_df

Unnamed: 0,doc_name,date,transcript,president,title,clean transcript,avg_sentence_length
10,january-4-1980-speech-afghanistan,1980-01-04,\r\n\r\nI come to you this evening to discuss...,Jimmy Carter,"January 4, 1980: Speech on Afghanistan","[come, evening, discuss, extremely, important,...",10.896552
27,january-20-1949-inaugural-address,1949-01-20,"Mr. Vice President, Mr. Chief Justice, fellow ...",Harry S. Truman,"January 20, 1949: Inaugural Address","[vice, president, chief, justice, fellow, citi...",8.898305
32,march-15-1952-columbia-scholastic-press-associ...,1952-03-15,"Dr. Murphy, distinguished guests, Mr. Mayor, d...",Harry S. Truman,"March 15, 1952: Columbia Scholastic Press Asso...","[murphy, distinguished, guest, mayor, delegate...",6.198113
39,september-18-1948-whistlestop-tour-chariton-iowa,1948-09-18,"I appreciate that introduction very much, and ...",Harry S. Truman,"September 18, 1948: Whistlestop Tour in Charit...","[appreciate, introduction, think, good, prophe...",7.268293
40,september-18-1948-whistlestop-tour-trenton-mis...,1948-09-18,It certainly is a pleasure. This is the first ...,Harry S. Truman,"September 18, 1948: Whistlestop Tour in Trento...","[certainly, pleasure, missouri, town, stop, le...",7.228571
53,february-9-2010-news-conference-congressional-...,2010-02-09,"THE PRESIDENT: Hello, everybody. I am glad t...",Barack Obama,"February 9, 2010: News Conference on Congressi...","[president, hello, everybody, glad, brave, wea...",7.803089
96,december-24-1943-fireside-chat-27-tehran-and-c...,1943-12-24,My Friends:\r\nI have recently (just) returned...,Franklin D. Roosevelt,"December 24, 1943: Fireside Chat 27: On the Te...","[friend, recently, return, extensive, journeyi...",10.61194
97,december-29-1940-fireside-chat-16-arsenal-demo...,1940-12-29,My friends:\r\nThis is not a fireside chat on ...,Franklin D. Roosevelt,"December 29, 1940: Fireside Chat 16: On the ""A...","[friend, fireside, chat, war, talk, national, ...",8.631579
108,july-19-1940-democratic-national-convention,1940-07-19,Members of the Convention-my friends:\r\nIt is...,Franklin D. Roosevelt,"July 19, 1940: Democratic National Convention","[member, convention, friend, late, feel, speak...",10.664234
118,june-5-1944-fireside-chat-29-fall-rome,1944-06-05,"My Friends:\r\nYesterday, on June fourth, 1944...",Franklin D. Roosevelt,"June 5, 1944: Fireside Chat 29: On the Fall of...","[friend, yesterday, june, fourth, rome, fall, ...",9.272727
