In [1]:
#Operating system
from pathlib import Path

#Print outputs
from pprint import pprint

#Accessing APIs and URLs
import requests

#Fetch the URL
from bs4 import BeautifulSoup 
from urllib.request import urlopen

#Regular expression
import re

#Extract contents from PDF files
import pdfplumber

#Data processing
import pandas as pd

In [2]:
#Define URL for web scraping
url = "https://www.americanrhetoric.com/barackobamaspeeches.htm"
html = urlopen(url).read()

#Cook the soup
soup = BeautifulSoup(html, 'html.parser')

In [3]:
#Extract all PDF links from the website
pdf_links = []
for link in soup.find_all('a'):
    current_link = link.get('href')
    if current_link.endswith('pdf'):
        pdf_links.append(r"https://www.americanrhetoric.com/" + current_link)
#print the number of PDFs
print("Total number of PDFs:", len(pdf_links))

Total number of PDFs: 436


In [4]:
#Print first 5 rows of links to check
pprint(pdf_links[:5])

['https://www.americanrhetoric.com/speeches/PDFFiles/Barack%20Obama%20-%202004%20DNC%20Address.pdf',
 'https://www.americanrhetoric.com/speeches/PDFFiles/Barack%20Obama%20-%20Senate%20Speech%20on%20Ohio%20Electoral%20Vote.pdf',
 'https://www.americanrhetoric.com/speeches/PDFFiles/Barack%20Obama%20-%20Knox%20College%20Commencement.pdf',
 'https://www.americanrhetoric.com/speeches/PDFFiles/Barack%20Obama%20-%20Rosa%20Parks.pdf',
 'https://www.americanrhetoric.com/speeches/PDFFiles/Barack%20Obama%20-%20Senate%20Floor%20Speech%20on%20the%20Patriot%20Act.pdf']


In [5]:
#Function of getting titles of speech PDFs
def get_pdf_name(link):
    pdfname = Path(link).parts[-1]
    pat = re.compile(r"%20")
    pdfname = pat.sub(r"_", pdfname)#Using regular expression to replace "%20" with "_"
    if pdfname.count('-') == 1:
        pdfname = pdfname.split("-")[1].strip()[1:]
    return pdfname

#Function of downloading speech PDFs    
def get_pdf(name,content):
    pdf = open(Path.cwd() / "pdf_files" / name, 'wb')
    pdf.write(content)
    return pdf.close()

#The downloaded pdf files are stored in the path named "pdf_files" under the working path

In [6]:
#Get all PDF files
i=1
for my_link in pdf_links:

    #Avoid the 403 error
    headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0"}
    #Make a request to the url page, and return the status code
    response = requests.get(my_link, headers=headers)
        
    #Use function 'get_pdf_name' to get the titles of PDF files
    pdfname = get_pdf_name(my_link)
    
    #Use function 'get_pdf‘ to download PDF files 
    get_pdf(pdfname,response.content)
          
print("All PDF files downloaded")

All PDF files downloaded


In [7]:
#Create a list of PDF paths
pdf_dir = Path.cwd()/ "pdf_files"
pdfpaths = list(pdf_dir.glob('*.pdf'))  

In [8]:
#Create a blank dictionary named "speeches"
speeches= {'date': [], 'title': [], 'content':[]}

In [9]:
#Function of getting the date of a speech
def get_pdf_date(first_page):
    pattern = re.compile(r"\d{1,2}\s\w{1,10}\s\d{4}")
    date = pattern.findall(first_page)
    date = ''.join(date)
    return date

#Function of text processing of the first page of a speech(deleting the page header and footer)
def first_page_processing(first_page):
    first_page = re.sub(".*Page\s..","", first_page)
    first_page = re.sub("\d{1,2}\s\w{1,10},?\s\d{4}.*","",first_page)
    first_page = re.sub("\n","", first_page)
    first_page = re.sub(".*audio]?","",first_page)
    first_page = re.sub(".*Delivered","",first_page,flags=re.I)
    return first_page

#Function of text processing of other pages of a speech(deleting page headers and footers)
def other_page_processing(page):
    page = re.sub(".*Page\s..","",page)
    page = re.sub("\n","",page)
    page = re.sub("AA RRmmeerriiccaann hheettoorriicc\.\.ccoomm","",page)
    page = re.sub("AAmmeerriiccaannRRhheettoorriicc\.\.ccoomm","",page)
    page = re.sub("\s{2,10}","",page)
    return page

#The header and footer formats of the first page and other pages are quite different and processed with different regular expressions

In [10]:
#Write the contents of speech PDFs to the dictionary "speeches"
for pdfpath in pdfpaths:    
    pdf = pdfplumber.open(pdfpath)
    full_text = ''
    other_pages = ''
    
    for i in range(len(pdf.pages)):     
        if i == 0:
            #Extract the raw text of the first page
            first_page = pdf.pages[0].extract_text()
            
            #Get the date of a speech from the first page 
            date = get_pdf_date(first_page)
            
            #Write the date to the key "date" 
            speeches['date'].append(date)
            
            #Text processing of the first page
            first_page = first_page_processing(first_page)
            
        else:
            #Extract the raw text of the first page
            page = pdf.pages[i].extract_text()
            
            #Text processing of the first page
            page = other_page_processing(page)
            other_pages = other_pages + page
            
    #Get the full text of speeches after processed    
    full_text = first_page + other_pages
    #Write the full text to the key "content" 
    speeches['content'].append(full_text)
    
    #Get the title of the speeches and write to the key "title"
    title = pdfpath.parts[-1].split(".")[0]    
    speeches['title'].append(title)

#The process of extracting speeches from PDFs is done
print("Finished extracting speeches from PDF files")

Finished extracting speeches from PDF files


In [11]:
#Convert dictionary "speeches" into dataframe "speech_df" 
speech_df = pd.DataFrame(speeches)

In [12]:
#Obviously,this dataframe had certain 'date' data missing.
#Find out the following dates need to be corrected after visually checking
print(speech_df.loc[[15,155,185,241,279,308,322],['date','title']])

    date                                  title
15                                   AIPAC_2012
155       Go_Presidential_Election_Outcome_2016
185                           Indian_Parliament
241                        MH_Flight_17_Downing
279                            NY_NJ_Explosions
308            Post_Iran_Nuclear_Accord_Presser
322          Recovery_and_Reinvestment_Act_2016


In [13]:
#Correct the missing dates
speech_df.loc[speech_df.title=='AIPAC_2012','date'] = '4 March 2012'
speech_df.loc[speech_df.title=='Go_Presidential_Election_Outcome_2016','date'] = '9 November 2016'
speech_df.loc[speech_df.title=='Indian_Parliament','date'] = '8 November 2010'
speech_df.loc[speech_df.title=='MH_Flight_17_Downing','date'] = '18 July 2014'
speech_df.loc[speech_df.title=='Post_Iran_Nuclear_Accord_Presser','date'] = '19 September 2016'
speech_df.loc[speech_df.title=='NY_NJ_Explosions','date'] = '26 February 2016'
speech_df.loc[speech_df.title=='Recovery_and_Reinvestment_Act_2016','date'] = '15 July 2015'

In [14]:
#Check
print(speech_df.loc[[15,155,185,241,279,308,322],['date','title']])

                  date                                  title
15        4 March 2012                             AIPAC_2012
155    9 November 2016  Go_Presidential_Election_Outcome_2016
185    8 November 2010                      Indian_Parliament
241       18 July 2014                   MH_Flight_17_Downing
279   26 February 2016                       NY_NJ_Explosions
308  19 September 2016       Post_Iran_Nuclear_Accord_Presser
322       15 July 2015     Recovery_and_Reinvestment_Act_2016


In [15]:
#Change the format of 'date' as the datetime formate
speech_df['date'] = pd.to_datetime(speech_df['date'], dayfirst=True, format="%d %B %Y")
speech_df.head()

Unnamed: 0,date,title,content
0,2004-07-27,2004_DNC_Address,"On behalf of the great state of Illinois, cro..."
1,2010-02-01,2010_Budget_to_Congress,"Good morning, everybody. This morning, I sent..."
2,2016-09-11,911_After_15_Years,"Good morning. Scripture tells us, “Let not s..."
3,2009-09-11,911_Pentagon_Memorial_2009,"Secretary Gates, Admiral Mullen and members o..."
4,2010-09-11,911_Pentagon_Memorial_2010,"Secretary Gates, Admiral Mullen and members o..."


In [16]:
#Because of President Obama's tenure，only select contents whose dates are during the period from '2009-1-20' to '2017-1-10'.
speech_df_selected = speech_df[(speech_df['date'] > '2009-1-1') &
          (speech_df['date'] <= '2017-1-10')]

In [17]:
#Print the first five rows of "speech_df"
speech_df_selected.head()

Unnamed: 0,date,title,content
1,2010-02-01,2010_Budget_to_Congress,"Good morning, everybody. This morning, I sent..."
2,2016-09-11,911_After_15_Years,"Good morning. Scripture tells us, “Let not s..."
3,2009-09-11,911_Pentagon_Memorial_2009,"Secretary Gates, Admiral Mullen and members o..."
4,2010-09-11,911_Pentagon_Memorial_2010,"Secretary Gates, Admiral Mullen and members o..."
5,2011-03-18,Address_on_Libya,"Good afternoon, everybody.I want to take thi..."


In [18]:
#Print the number of speeches after selecting 
speech_df_selected.shape[0]

405

In [19]:
#Write the data frame to a txt file named "raw_speeches_df.txt"
speech_df_selected.to_csv(Path.cwd()/"raw_speeches_df.txt", header= True, index= False) 