
# Homework

http://www.presidency.ucsb.edu/sou.php

### Scrape all of SOTU speeches. Store them as dictionary objects with each key as "lastname_firstname_date" and the value as the text (raw) of each speech. Upload these data to Harvard GDrive (n.b. $\infty$ space), Dropbox, or Github and post a link to that on Canvas. Also include your IPython notebook.

(n.b. In your script, your files should be saving to a place where they are automatically uploaded. This is good practice for getting large datasets and not overloading your storage. The `os.chdir` command changes the work directory to another folder alongside choosing to have that folder auto upload.)

In [6]:
from bs4 import BeautifulSoup,SoupStrainer
import re
import requests
from urllib.request import urlopen

import pickle
import json
import os

urls = []

# we first take the URL 
url = "http://www.presidency.ucsb.edu/sou.php"

# and then we request it, as if we are loading a web page.
req = requests.get(url,timeout=20) #delay to prevent timeout
req.status_code #200 means we have gotten it correctly

req.html = req.text #extract the text from the request

#we grab all td elements with the 'ver12' class, which contains all the links to relevant speeches
soup = BeautifulSoup(req.html,'lxml', parse_only=SoupStrainer("td", {"class": "ver12"}))
#grab all anchor elements
x = soup.findAll("a")

#iterate through all the links; grab only links that point internally
for tr in x:
    links = tr.get('href')
    if "presidency.ucsb.edu" in links:
        urls.append(links)  

#print sample of urls
for i in range(1,10):
    print(urls[i])

http://www.presidency.ucsb.edu/ws/index.php?pid=128921
http://www.presidency.ucsb.edu/ws/index.php?pid=102826
http://www.presidency.ucsb.edu/ws/index.php?pid=104596
http://www.presidency.ucsb.edu/ws/index.php?pid=108031
http://www.presidency.ucsb.edu/ws/index.php?pid=111174
http://www.presidency.ucsb.edu/ws/index.php?pid=85753
http://www.presidency.ucsb.edu/ws/index.php?pid=87433
http://www.presidency.ucsb.edu/ws/index.php?pid=88928
http://www.presidency.ucsb.edu/ws/index.php?pid=99000


In [9]:
#load date parser which parses string dates to datetime objects
!pip install python-dateutil
from dateutil import parser

#convenience function that eliminates escape characters in speech text
def filter_non_printable(str):
  return ''.join([c for c in str if ord(c) > 31 or ord(c) == 9])



In [10]:
import pandas as pd

#create dictionary 
speeches = {}
#speech_df = pd.DataFrame(columns=['first_name','last_name','date','text'])
 
#iterate through all urls
for url in urls:
    req = requests.get(url, timeout = 10)
    req.html = req.text
    soup = BeautifulSoup(req.html,"html.parser")

    #grabs page titles, gets text to the left of colon, splits based on spaces
    try:
        name = soup.title.text.split(":")[0].split(" ")
    except:
        print("no name content found; moving to next value")
        continue
    
    #if the list is of length three, there's a middle name!
    if len(name) == 3:
        first_name = name[0]
        middle_name = name[1]
        last_name = name[2]
        
    #otherwise, grab first and last name
    elif len(name) == 2:
        first_name = name[0]
        last_name = name[1]
    
    #grabs all text elements of class 'displaytext'
    text = soup.select('.displaytext')[0].get_text()
    text = "{0}".format(text)
    #eliminates escape characters
    text = filter_non_printable(text)

    #grabs date elements from doc date class
    date = soup.select('.docdate')[0].get_text()
    
    #nice automatic parser to datetime objects
    dt = parser.parse(date)
    dt = str(dt).split(" ")[0].replace("-", "_")

    #confirm everything is working by printing to screen
    print(url, last_name + "_" + first_name + "_"+ dt, text[0:50])
    speeches[last_name + "_" + first_name + "_"+ dt] = text
    #speech_df = speech_df.append([first_name, last_name, dt, text], ignore_index=True)

http://www.presidency.ucsb.edu/ws/index.php?pid=123408 Trump_Donald_2017_02_28 Thank you very much. Mr. Speaker, Mr. Vice Preside


http://www.presidency.ucsb.edu/ws/index.php?pid=128921 Trump_Donald_2018_01_30 Mr. Speaker, Mr. Vice President, Members of Congre
http://www.presidency.ucsb.edu/ws/index.php?pid=102826 Obama_Barack_2013_02_12 Please, everybody, have a seat. Mr. Speaker, Mr. V
http://www.presidency.ucsb.edu/ws/index.php?pid=104596 Obama_Barack_2014_01_28 The President. Mr. Speaker, Mr. Vice President, Me
http://www.presidency.ucsb.edu/ws/index.php?pid=108031 Obama_Barack_2015_01_20 The President. Mr. Speaker, Mr. Vice President, Me
http://www.presidency.ucsb.edu/ws/index.php?pid=111174 Obama_Barack_2016_01_12 Thank you. Mr. Speaker, Mr. Vice President, Member
http://www.presidency.ucsb.edu/ws/index.php?pid=85753 Obama_Barack_2009_02_24 Madam Speaker, Mr. Vice President, Members of Cong
http://www.presidency.ucsb.edu/ws/index.php?pid=87433 Obama_Barack_2010_01_27 Madam Speaker, Vice President Biden, Members of Co
http://www.presidency.ucsb.edu/ws/index.php?pid=88928 Obama_Barack_2011_01_25 Mr. Speaker, M

http://www.presidency.ucsb.edu/ws/index.php?pid=11162 Eisenhower_Dwight_1958_01_09 Mr. President, Mr. Speaker, Members of the 85th Co
http://www.presidency.ucsb.edu/ws/index.php?pid=11685 Eisenhower_Dwight_1959_01_09 [Delivered in person before a joint session] Mr. P
http://www.presidency.ucsb.edu/ws/index.php?pid=12061 Eisenhower_Dwight_1960_01_07 [Delivered in person before a joint session] Mr. P
http://www.presidency.ucsb.edu/ws/index.php?pid=12074 Eisenhower_Dwight_1961_01_12 To the Congress of the United States: Once again i
http://www.presidency.ucsb.edu/ws/index.php?pid=12074 Eisenhower_Dwight_1961_01_12 To the Congress of the United States: Once again i
http://www.presidency.ucsb.edu/ws/index.php?pid=9829 Eisenhower_Dwight_1953_02_02 Mr. President, Mr. Speaker, Members of the Eighty-
http://www.presidency.ucsb.edu/ws/index.php?pid=10096 Eisenhower_Dwight_1954_01_07 Mr. President, Mr. Speaker, Members of the Eighty-
http://www.presidency.ucsb.edu/ws/index.php?pid=10416 Eisenhowe

http://www.presidency.ucsb.edu/ws/index.php?pid=29548 Roosevelt_Theodore_1907_12_03  To the Senate and House of Representatives: No na
http://www.presidency.ucsb.edu/ws/index.php?pid=29549 Roosevelt_Theodore_1908_12_08  To the Senate and House of Representatives:  FINA
http://www.presidency.ucsb.edu/ws/index.php?pid=29542 Roosevelt_Theodore_1901_12_03 To the Senate and House of Representatives: The Co
http://www.presidency.ucsb.edu/ws/index.php?pid=29543 Roosevelt_Theodore_1902_12_02  To the Senate and House of Representatives: We st
http://www.presidency.ucsb.edu/ws/index.php?pid=29544 Roosevelt_Theodore_1903_12_07  To the Senate and House of Representatives: The c
http://www.presidency.ucsb.edu/ws/index.php?pid=29545 Roosevelt_Theodore_1904_12_06  To the Senate and House of Representatives: The N
http://www.presidency.ucsb.edu/ws/index.php?pid=29538 McKinley_William_1897_12_06  To the Senate and House of Representatives: It gi
http://www.presidency.ucsb.edu/ws/index.php?pid=29539 McK

http://www.presidency.ucsb.edu/ws/index.php?pid=29483 Tyler_John_1841_12_07  To the Senate and House of Representatives of the
http://www.presidency.ucsb.edu/ws/index.php?pid=29484 Tyler_John_1842_12_06  To the Senate and House of Representatives of the
http://www.presidency.ucsb.edu/ws/index.php?pid=29647 Tyler_John_1843_12_05  To the Senate and House of Representatives of the
http://www.presidency.ucsb.edu/ws/index.php?pid=29485 Tyler_John_1844_12_03  To the Senate and House of Representatives of the
http://www.presidency.ucsb.edu/ws/index.php?pid=29479 Buren_Martin_1837_12_05  Fellow-Citizens of the Senate and House of Repres
http://www.presidency.ucsb.edu/ws/index.php?pid=29480 Buren_Martin_1838_12_03  Fellow-Citizens of the Senate and House of Repres
http://www.presidency.ucsb.edu/ws/index.php?pid=29481 Buren_Martin_1839_12_02  Fellow-Citizens of the Senate and House of Repres
http://www.presidency.ucsb.edu/ws/index.php?pid=29482 Buren_Martin_1840_12_05  Fellow-Citizens of the Sen

In [17]:
#we can pickle this, although this is just for demonstrative purposes because its not really necessary
import pickle
output = open('data.pkl', 'wb')
pickle.dump(speeches, output)
output.close()

In [14]:
#convert dictionary to pandas df
df = pd.DataFrame(list(speeches.items()), columns=['name','text'])

#confirm it works
df.head

<bound method NDFrame.head of                              name  \
0         Trump_Donald_2017_02_28   
1         Trump_Donald_2018_01_30   
2         Obama_Barack_2013_02_12   
3         Obama_Barack_2014_01_28   
4         Obama_Barack_2015_01_20   
5         Obama_Barack_2016_01_12   
6         Obama_Barack_2009_02_24   
7         Obama_Barack_2010_01_27   
8         Obama_Barack_2011_01_25   
9         Obama_Barack_2012_01_24   
10         Bush_George_2005_02_02   
11         Bush_George_2006_01_31   
12         Bush_George_2007_01_23   
13         Bush_George_2008_01_28   
14         Bush_George_2001_02_27   
15         Bush_George_2002_01_29   
16         Bush_George_2003_01_28   
17         Bush_George_2004_01_20   
18     Clinton_William_1997_02_04   
19     Clinton_William_1998_01_27   
20     Clinton_William_1999_01_19   
21     Clinton_William_2000_01_27   
22     Clinton_William_1993_02_17   
23     Clinton_William_1994_01_25   
24     Clinton_William_1995_01_24   
25     C

In [16]:
#write to excel (not .csv because commas and tabs in text)
writer = pd.ExcelWriter('output_sotu.xlsx')
df.to_excel(writer,'Sheet1')
writer.save()