# Washington's Farewell Address Word Frequency Analysis

### Using Web Scraping + Regular Expressions + Pandas

In [1]:
from bs4 import BeautifulSoup
import requests

In [36]:
# Getting and Reading Webpage
# Downloads the page at `url` and parses the returned HTML into `soup`
# so that later cells can pull the paragraphs out of it.

url = 'https://avalon.law.yale.edu/18th_century/washing.asp'

# A timeout keeps this cell from hanging indefinitely if the server stalls.
page = requests.get(url, timeout=30)

# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
page.raise_for_status()

soup = BeautifulSoup(page.text,'html')

In [3]:
# Prints the entire parsed document to inspect what was downloaded
print(soup)

<html>
<head>
<link href="../css/site.css" rel="stylesheet" type="text/css"/>
<title>Avalon Project - Washington's Farewell Address 1796</title>
</head>
<body>
<div class="HeaderContainer">
<ul class="HeaderTopTools">
<li class="Search">
<table border="0" cellpadding="0" cellspacing="0">
<form action="https://www.google.com/search?q=searchsite:avalon.law.yale.edu" id="form" method="get" name="form" target="_blank">
<input name="domains" type="hidden" value="yale.edu"/>
<input name="sitesearch" type="hidden" value="avalon.law.yale.edu/"/>
<tr>
<td><input alt="Search Avalon" class="SiteSearchBox" name="q" onblur="if (this.value==''){this.value='Search Avalon'}" onfocus="if (this.value=='Search Avalon'){this.value=''}" type="text" value="Search Avalon"/></td>
<!--<td><a href="javascript:this.form.submit();"><img src="http://www.law.yale.edu/library/searchGo.gif" alt="Begin Search" /></a></td>-->
<td><input alt="Submit Search" class="go" src="/images/searchGo.gif" style="margin-left:5px;" 

In [37]:
# Finds every <p> element in the parsed page and displays the matches
soup.find_all('p')

[<p>Friends and Citizens: </p>,
 <p>The period for a new election of a citizen to administer the executive government of the United States being not far distant, and the time actually arrived when your thoughts must be employed in designating the person who is to be clothed with that important trust, it appears to me proper, especially as it may conduce to a more distinct expression of the public voice, that I should now apprise you of the resolution I have formed, to decline being considered among the number of those out of whom a choice is to be made. </p>,
 <p>I beg you, at the same time, to do me the justice to be assured that this resolution has not been taken without a strict regard to all the considerations appertaining to the relation which binds a dutiful citizen to his country; and that in withdrawing the tender of service, which silence in my situation might imply, I am influenced by no diminution of zeal for your future interest, no deficiency of grateful respect for your p

In [38]:
# Saves all <p> elements found in the page; the next cell shows the result
# is a bs4 ResultSet, which is iterated like a list further down
washington_speech = soup.find_all('p')

In [8]:
# Checks what kind of object find_all() returned
type(washington_speech)

bs4.element.ResultSet

In [39]:
# Collects the text content of each paragraph tag into a list of strings,
# then prints that list
speech_combined = [paragraph.get_text() for paragraph in washington_speech]

print(speech_combined)



In [10]:
# Confirms the extracted paragraph texts are held in a plain Python list
type(speech_combined)

list

In [42]:
# Previews the paragraphs joined into one string, separated by single spaces
' '.join(speech_combined)



In [43]:
# Saves combined text as a string: every paragraph's text joined with a
# single space between consecutive paragraphs

string_speech = ' '.join(speech_combined)

In [16]:
# Shows only the first 500 characters as a sanity check of the joined text
string_speech[:500]

'Friends and Citizens:  The period for a new election of a citizen to administer the executive government of the United States being not far distant, and the time actually arrived when your thoughts must be employed in designating the person who is to be clothed with that important trust, it appears to me proper, especially as it may conduce to a more distinct expression of the public voice, that I should now apprise you of the resolution I have formed, to decline being considered among the numbe'

In [17]:
# Previews the text with each '\r\n' line break replaced by a space
# (the result is only displayed here, not stored)
string_speech.replace('\r\n', ' ')



In [46]:
# Replaces each '\r\n' (carriage return + line feed) line break with one blank space.
# Note: only the exact '\r\n' sequence is replaced; a lone '\n' or '\r' is left as-is.
string_speech_cleaned = string_speech.replace('\r\n', ' ')

In [19]:
# Displays the text after the line-break replacement
string_speech_cleaned



In [20]:
import re

In [47]:
# Deletes every character that is neither a word character (\w) nor
# whitespace (\s), i.e. punctuation, and prints the stripped text
punctuation_pattern = re.compile(r'[^\w\s]')
speech_no_punc = punctuation_pattern.sub('', string_speech_cleaned)

print(speech_no_punc)



In [48]:
# Converts text to lowercase so that the same word in different
# capitalisations is counted as one word
speech_lower = speech_no_punc.lower()

In [49]:
# Splits text into words.
# str.split() with no arguments splits on runs of whitespace and ignores
# leading/trailing whitespace, so no empty-string "word" lands in the list.
# re.split(r'\s+', ...) would return '' at either end whenever the text
# starts or ends with whitespace, and that '' would then be counted as a word.
speech_broken_out = speech_lower.split()

In [26]:
# Displays the list of words to inspect the result of the split
speech_broken_out

['friends',
 'and',
 'citizens',
 'the',
 'period',
 'for',
 'a',
 'new',
 'election',
 'of',
 'a',
 'citizen',
 'to',
 'administer',
 'the',
 'executive',
 'government',
 'of',
 'the',
 'united',
 'states',
 'being',
 'not',
 'far',
 'distant',
 'and',
 'the',
 'time',
 'actually',
 'arrived',
 'when',
 'your',
 'thoughts',
 'must',
 'be',
 'employed',
 'in',
 'designating',
 'the',
 'person',
 'who',
 'is',
 'to',
 'be',
 'clothed',
 'with',
 'that',
 'important',
 'trust',
 'it',
 'appears',
 'to',
 'me',
 'proper',
 'especially',
 'as',
 'it',
 'may',
 'conduce',
 'to',
 'a',
 'more',
 'distinct',
 'expression',
 'of',
 'the',
 'public',
 'voice',
 'that',
 'i',
 'should',
 'now',
 'apprise',
 'you',
 'of',
 'the',
 'resolution',
 'i',
 'have',
 'formed',
 'to',
 'decline',
 'being',
 'considered',
 'among',
 'the',
 'number',
 'of',
 'those',
 'out',
 'of',
 'whom',
 'a',
 'choice',
 'is',
 'to',
 'be',
 'made',
 'i',
 'beg',
 'you',
 'at',
 'the',
 'same',
 'time',
 'to',
 'do',


In [27]:
import pandas as pd

In [50]:
# Counts word frequency and displays results.
# NOTE: despite the name `df`, the value stored here is the output of
# value_counts(), which the displayed result shows to be a Series of int64
# counts listed from the most frequent word down to the least frequent.
df = pd.DataFrame(speech_broken_out).value_counts()

df

the          417
of           344
to           275
and          210
in           125
            ... 
fortunate      1
fortress       1
fortify        1
forms          1
insult         1
Name: count, Length: 1698, dtype: int64

In [51]:
# Saves word counts to a CSV file with a 'Word' column and a 'Counts' column.
# The file is written relative to the current working directory so the cell
# runs on any machine, not only one where a specific user's Desktop folder exists.
output_csv = 'GWachington_Speech_Counts.csv'
df.to_csv(output_csv, header = ['Counts'], index_label = 'Word')