# 1. Data Collection
a) Scrapes the website and downloads all of Obama's speeches as PDFs [378 speeches downloaded]

b) Converts the PDFs into text files (.txt)

c) Renames each file to the date on which the speech was given

In [None]:

import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Index page that links to every speech PDF.
url = "https://www.americanrhetoric.com/barackobamaspeeches.htm"
# Browser-like User-Agent sent with every request
# (presumably the site rejects the default client UA -- unverified).
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}

# Directory that receives the downloaded PDFs; created on first run.
folder_location = r'webscraping_final'
os.makedirs(folder_location, exist_ok=True)

# Seconds to wait on the server before giving up instead of hanging forever.
request_timeout = 30

response = requests.get(url, headers=headers, timeout=request_timeout)
# Fail loudly if the index page itself could not be fetched.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
print(response.text)

# Every anchor whose href ends in ".pdf" is treated as a speech to download.
for link in soup.select("a[href$='.pdf']"):
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    pdf_response = requests.get(urljoin(url, link['href']), headers=headers,
                                timeout=request_timeout)
    if not pdf_response.ok:
        # Do not store an HTML error page under a .pdf name; skip this link
        # and keep downloading the rest.
        print("Skipping %s (HTTP %s)" % (link['href'], pdf_response.status_code))
        continue
    # Open the file only after the download succeeded, so a failed request
    # never leaves an empty file behind.
    with open(filename, 'wb') as f:
        f.write(pdf_response.content)

In [None]:
import os

from tika import parser
# Folder holding the downloaded PDFs, and folder that receives one .txt per PDF.
directory = r'webscraping_final'
folder_location = r'textdata_final'
os.makedirs(folder_location, exist_ok=True)

for filename in os.listdir(directory):
    print (filename)
    input_file = os.path.join(directory, filename)
    file_data = parser.from_file(input_file)
    text = file_data['content']
    if text is None:
        # Tika can return no content (e.g. for a file without extractable
        # text); writing None would raise TypeError, so skip the file.
        print("No text extracted from %s; skipping" % filename)
        continue
    output_filepath = os.path.join(folder_location, filename + '.txt')
    # "w" rather than "a": re-running this cell must overwrite the previous
    # output instead of appending a second copy of the speech.
    with open(output_filepath, "w") as f:
        f.write(text)


In [None]:
import os
from pathlib import Path
import re
import glob
import datetime
list_of_months =  ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
path_to_speeches = "/Users/dhruv_batra/Desktop/SentimentAnalysis_Final/textdata_final/"
date_of_speeches_list = []


def parse_delivery_date(line):
    """Extract the speech date from a line that starts with "Delivered ".

    The date is the text between the 10-character "Delivered " prefix and the
    first comma, and is expected to look like "20 January 2009". When the line
    has no comma, the remainder of the line is used (the original slice would
    otherwise silently drop the last character of the year).

    Returns a ``(date_text, sortable_date)`` tuple, where ``sortable_date`` is
    "year-month-day" with a numeric, non-padded month, or ``None`` when the
    text is not "<day> <full month name> <year>".
    """
    end_index = line.find(",")
    if end_index == -1:
        date_of_speech = line[10:].strip()
    else:
        date_of_speech = line[10:end_index]

    components = date_of_speech.split(" ")
    if len(components) != 3 or components[1] not in list_of_months:
        return None

    day, month_name, year = components
    # Position in list_of_months gives the month number (January -> 1).
    month_number = list_of_months.index(month_name) + 1
    return date_of_speech, "%s-%s-%s" % (year, month_number, day)


# Dates already used for a file name; only one speech is kept per date.
seen_dates = set()

for file in glob.glob(path_to_speeches + "*.txt"):
    with open(file, "r") as speech_handle:
        speech_lines = speech_handle.read().split("\n")

    for line in speech_lines:
        # All the dates for the speeches come after the word "Delivered".
        if not line.startswith(("Delivered", "delivered")):
            continue
        parsed = parse_delivery_date(line)
        if parsed is not None:
            date_of_speech, modified_date = parsed
            if date_of_speech not in seen_dates:
                seen_dates.add(date_of_speech)
                date_of_speeches_list.append(date_of_speech)
                # The target is a relative path, so the renamed file lands in
                # the current working directory, not in path_to_speeches.
                os.rename(file, "%s.txt" % modified_date)
        # Only the first "Delivered" line of a speech is considered.
        break

print(len(date_of_speeches_list)) #number of speeches to be used for sentiment analysis

# 2. Data Preparation
 a) Text Standardization: Expanding Contractions 
 
 b) Tokenization
 
 c) Lemmatization
 
 d) TF-IDF (term frequency–inverse document frequency) weighting

In [9]:
# Data Preparation: Cleans each speech by removing stop words & lemmatizing words to make sure they are of the same base
# Then, vectorized each speech and used inverse document frequency to get the relative importance (weight) of each word
# Lastly, used sklearn's train-test-split and multiple regression model to get the sentiment scores

# Maps a contraction to its expanded form. Each speech has its contractions
# replaced with these expansions before tokenization.
# Keys are lower-case because the lookup lower-cases each word before testing
# membership; a mixed-case key would never match.
contractions = {
"can't": "cannot",
"can't've": "cannot have",
"could've": "could have",
"couldn't": "could not",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "I would",
"i'll": "I will",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"so've": "so have",
"that's": "that is",
"that'll": "that will",
"there's": "there is",
"they'd've": "they would have",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
# Was "who shall / who will": a list of alternatives, not usable replacement text.
"who'll": "who will",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"would've": "would have",
"wouldn't": "would not",
"y'all": "you all",
"you'll": "you will",
"you're": "you are",
"you've": "you have",
# Was keyed as "Biden's", which the lower-cased lookup could never reach.
"biden's": "Biden is",
}

import nltk
import numpy as np
from nltk.corpus import twitter_samples 
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import sklearn.model_selection as model_selection
from sklearn.naive_bayes import GaussianNB
from nltk import FreqDist
import glob
from pathlib import Path
from textblob import TextBlob
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import re  # local import: this cell's own import list does not bring in re

text_document_list = []  # one cleaned, space-joined string per speech
sentiment_list = [] #polarity associated with each speech from textblob
date_speeches = []  #Adds the date for every speech
speeches = glob.glob("*.txt")
path_to_speeches = "/Users/dhruv_batra/Desktop/SentimentAnalysis_Final/"

# --- One-time setup (hoisted out of the per-speech loop) ---------------------

# Lower-cased view of the contraction table so matching is case-insensitive.
contraction_lookup = {key.lower(): value for key, value in contractions.items()}
# Longest keys first so "can't've" is tried before "can't"; \b keeps a short
# contraction from matching inside a longer word.
contraction_pattern = re.compile(
    r"\b(?:"
    + "|".join(re.escape(key) for key in sorted(contraction_lookup, key=len, reverse=True))
    + r")\b",
    re.IGNORECASE,
)


def expand_contractions(text):
    """Return *text* with every known contraction replaced by its expansion.

    Unlike a per-word ``str.replace`` over whitespace-split words, this also
    expands contractions that are followed by punctuation and does not touch
    text inside longer words.
    """
    return contraction_pattern.sub(
        lambda match: contraction_lookup.get(match.group(0).lower(), match.group(0)),
        text,
    )


tokenizer = nltk.RegexpTokenizer(r"\w+") #Only contains alphanumeric characters (no punctuation)
wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def lemmatizer(speech_tokens):
    """Lemmatize tokens using their part-of-speech tag.

    Tags starting with "NN" are lemmatized as nouns, tags starting with "VB"
    as verbs, and everything else as adjectives. Returns a new list of the
    same length as *speech_tokens*.
    """
    new_speech = []
    for word, tag in pos_tag(speech_tokens):
        if tag.startswith("NN"):
            pos = "n"
        elif tag.startswith("VB"):
            pos = "v"
        else:
            pos = "a"
        new_speech.append(wordnet_lemmatizer.lemmatize(word, pos))
    return new_speech


# --- Per-speech cleaning -----------------------------------------------------

speech_files = glob.glob(path_to_speeches + "*.txt")
if not speech_files:
    # Without this the failure surfaces later as an opaque vectorizer error.
    raise FileNotFoundError("No .txt speeches found in %s" % path_to_speeches)

#Goes through each speech and adds the sentiment analysis score through TextBlob into the sentiment list
#Used TextBlob as the true y values for my ML Multiple Regression Model
for file in speech_files:
    with open(file, "r") as speech_handle:
        speech = speech_handle.read()

    # The last (up to) 10 characters of the file stem are used as the date.
    date_speeches.append(Path(file).stem[-10:])
    # Polarity is computed on the raw text, before any cleaning.
    sentiment_list.append(TextBlob(speech).sentiment.polarity*1000) #multiplied sentiment by 1000 for easier readability

    speech = expand_contractions(speech)
    speech_tokens = tokenizer.tokenize(speech) #Tokenization
    speech_new = lemmatizer(speech_tokens)

    # Drop stop words (compared case-insensitively) and lower-case the rest.
    filtered_speech_lower = [
        word.lower() for word in speech_new if word.lower() not in stop_words
    ]
    text_document_list.append(" ".join(filtered_speech_lower))

# --- Vectorization -----------------------------------------------------------

#Vectorized every speech in the text_document_list
# The original fitted two identical CountVectorizers and the TF-IDF transformer
# twice; each is now fitted once. The extra names are retained as aliases in
# case other cells reference them.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(text_document_list)
count_vect = vectorizer
X_tf = counts

#Inverse Document Frequency to account for the commonality of each word in the speech
vectorize = TfidfTransformer()
X_tfidf = vectorize.fit_transform(X_tf)
freq = X_tfidf

X_final = X_tfidf.toarray()
print(X_final) #2D Array of all speeches where each row vector contains the vectorized form for each speech

y = np.array(sentiment_list) #list of all TextBlob sentiment scores associated with each speech



[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.05388055 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.01869618 0.         ... 0.         0.         0.        ]
 [0.         0.01229029 0.         ... 0.         0.         0.        ]
 [0.         0.00581359 0.         ... 0.         0.         0.        ]]


# 3. Model Selection

a) Train-Test-Split

b) Algorithm: scikit-learn's multiple linear regression

c) Train Model

In [10]:
# Hold out 30% of the speeches for testing; the other 70% train the model.
# NOTE: no random_state is passed, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.30)

# Show each partition together with its shape.
for label, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(label + ": ")
    print(split)
    print(split.shape)

# Fit the multiple linear regression on the training partition.
reg = LinearRegression()
reg.fit(X_train, y_train)

#print(reg.intercept_)


X_train: 
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.01892081 0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.00958603 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
(264, 18362)
X_test: 
[[0.         0.00436393 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.01998476 0.         ... 0.         0.         0.        ]
 ...
 [0.         0.00986762 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.01070469 0.         ... 0.         0.         0.        ]]
(114, 18362)
y_train: 
[162.82793009 147.11978719 163.28601631 227.06924804 160.

# 4. Model Prediction


In [11]:
# Predicted sentiment scores for the held-out test rows, from the fitted
# regression model `reg`.
y_pred = reg.predict(X_test) 
print(y_pred)


[174.37145783 208.13019988 164.82097978 169.89747709 163.13086534
  33.94876413 156.80559488 158.43474028 145.65844563 112.82191165
 174.53749218 176.6004885  140.92973892 122.50231813 120.36073818
 189.50211067 156.63132228 157.48369974 194.04381745 129.50675586
 146.15590428 153.83735728 162.10118942 144.69530086 184.68885759
 145.90054443 136.58569602 147.74855583 170.02228489 112.63456564
 143.51092049 187.26079168 145.868321   154.78304373 165.59974727
 178.28334784 103.90290525 143.32396627 165.31102472 125.87412298
 117.71391728 150.59389454 164.17129641 156.48708751 149.29647926
 156.33373707 159.84762553 123.5308424  144.57133215 147.00421158
 108.09753862 134.02975642 177.30511407 156.14698963 185.5974083
 168.56781002 153.97353689 132.16608253 151.47204194 169.02334474
 108.61612056 131.64173123 142.2318709  165.19784782 144.36280603
 125.02584254 155.55925031 145.2050085  164.0570392  161.46904377
 112.82130607 156.92620685 106.80954276 121.95641804 135.70091071
 128.837031

# 5. Model Evaluation


In [12]:
# Coefficient of determination (R^2) on the held-out test set: the proportion
# of the variance in y_test that the model's predictions account for.
r2_score(y_test, y_pred)

0.4172999582528957

# 6. Plotting Results 

a) Dictionary of dates and sentiment scores

b) Ordering Results By Date

c) Plotting Time Series Graph with Bokeh

d) Bar Graph of Most Common Words in Speeches

In [13]:
#Creating a dictionary with keys as Dates and values as sentiment scores

import sys
print(len(date_speeches))

# Model-predicted sentiment for every speech, computed in one vectorized call
# instead of one predict() call per row. Note this covers all rows of X_final,
# including those the model was trained on.
list_of_sentiments = reg.predict(X_final)
# Plain Python floats, one per speech, in the same order as date_speeches.
list_of_sentiments_final = list_of_sentiments.tolist()

#dictionary w/ date and sentiment on that speech date
# If two speeches share a date string, the later one overwrites the earlier.
sentiment_dict = dict(zip(date_speeches, list_of_sentiments_final))
print(sentiment_dict)


print(len(sentiment_dict))


378
{'2014-4-17': 112.12210122266778, '2012-7-23': 162.10118941742115, '2011-10-16': 137.1709238986727, '2011-3-28': 107.40216561645155, '2010-3-28': 184.13966192537632, '2009-9-9': 168.63667365620415, '2011-6-24': 227.06924803591306, '2016-2-25': 121.95641804224525, '2015-1-27': 184.68885759111942, '2015-11-13': 130.210803689064, '2013-2-12': 144.5713321519807, '2009-7-7': 144.6953008637631, '2007-3-13': 16.56572800137093, '2009-9-8': 158.91911212187648, '2014-9-3': 195.22328878850746, '2010-4-08': 133.97710891259285, '2016-9-11': 241.3559754851911, '2013-11-14': 137.03699078699054, '2012-10-23': 164.994358185534, '2013-12-5': 280.96590909090725, '2012-4-3': 130.94023726728665, '2012-7-20': 126.85941043083817, '2014-4-28': 145.39889149011668, '2012-9-25': 62.29560038070571, '2008-3-18': 96.59849221073682, '2011-5-1': 152.18413803069774, '2013-8-9': 161.67997282885418, '2010-4-22': 102.04703575645183, '2016-7-16': 176.23467952581808, '2011-1-12': 178.96632553314674, '2014-8-14': 123.58

In [14]:
#Ordering the Dictionary by date
from datetime import datetime

#Ordering the Dictionary by date
# Drop the one malformed key: it has a five-digit year, which the "%Y-%m-%d"
# format used below cannot parse. pop() with a default (rather than del) keeps
# this cell safe to re-run after the key is already gone.
sentiment_dict.pop("20105-1-9", None)

# One {"Date", "Sentiment"} record per speech.
list_sentiment_dict = [
    {"Date": date, "Sentiment": score} for date, score in sentiment_dict.items()
]

# Sort chronologically by parsing the date string; the keys are not
# zero-padded, so a plain string sort would order them incorrectly.
list_sentiment_dict.sort(key=lambda entry: datetime.strptime(entry['Date'], "%Y-%m-%d"))
print(list_sentiment_dict)
print(len(list_sentiment_dict))

[{'Date': '2004-7-27', 'Sentiment': 151.4720419418783}, {'Date': '2005-4-21', 'Sentiment': 195.2692069302812}, {'Date': '2005-6-4', 'Sentiment': 149.29647925833515}, {'Date': '2005-10-25', 'Sentiment': 33.94876412732188}, {'Date': '2005-12-15', 'Sentiment': 85.39229024943307}, {'Date': '2006-7-20', 'Sentiment': 128.2761813083912}, {'Date': '2007-3-13', 'Sentiment': 16.56572800137093}, {'Date': '2007-5-23', 'Sentiment': 86.14532985892122}, {'Date': '2007-11-10', 'Sentiment': 106.37597782334728}, {'Date': '2008-1-3', 'Sentiment': 79.28408509053617}, {'Date': '2008-1-20', 'Sentiment': 90.45173425608228}, {'Date': '2008-3-18', 'Sentiment': 96.59849221073682}, {'Date': '2008-10-1', 'Sentiment': 97.87122441952036}, {'Date': '2008-12-1', 'Sentiment': 160.11610532887116}, {'Date': '2009-1-8', 'Sentiment': 117.15121999212887}, {'Date': '2009-1-20', 'Sentiment': 132.166082527467}, {'Date': '2009-1-24', 'Sentiment': 134.52540778490123}, {'Date': '2009-1-26', 'Sentiment': 158.67961296447734}, {'Da

In [15]:
#Plotting the data points in a time-series graph using Bokeh

from bokeh.plotting import figure, show
from bokeh.io import output_notebook
import numpy as np
import pandas as pd
from pandas import DataFrame
from bokeh.io import output_file

# Split the date-sorted records into parallel x (date string) and y (score) lists.
x_dates = [record['Date'] for record in list_sentiment_dict]
y_sentiment = [record['Sentiment'] for record in list_sentiment_dict]

# Parse the date strings and use them as a datetime index for the x axis.
dates = [datetime.strptime(stamp, '%Y-%m-%d').date() for stamp in x_dates]
df = DataFrame(dates, columns=['Dates'])
df.index = pd.to_datetime(df['Dates'])
df.index.name = 'Dates'
print(df.index)

# Wide figure with a datetime x axis: a dashed red line through the scores,
# plus a black marker on every speech.
p = figure(
    x_axis_type="datetime",
    plot_height=800,
    plot_width=5000,
    x_axis_label="Date of Speech",
    y_axis_label="Sentiment Score (Multiplied by 1000)",
    title="Sentiment Analysis Score for Obama's Speeches",
)
p.line(df.index, y_sentiment, line_width=1, color="red", line_dash="dashed")
p.circle(df.index, y_sentiment, size=5, fill_color="black", color="black")

# Enlarge the title and both axis labels.
p.title.text_font_size = '30pt'
for axis in (p.xaxis, p.yaxis):
    axis.axis_label_text_font_size = "15pt"

# Write the plot to a standalone HTML file and display it.
output_file("SentimentAnalysisScore.html")
show(p)

DatetimeIndex(['2004-07-27', '2005-04-21', '2005-06-04', '2005-10-25',
               '2005-12-15', '2006-07-20', '2007-03-13', '2007-05-23',
               '2007-11-10', '2008-01-03',
               ...
               '2016-12-06', '2016-12-16', '2016-12-27', '2017-01-04',
               '2017-01-06', '2017-01-10', '2017-01-12', '2017-01-17',
               '2017-01-18', '2017-01-20'],
              dtype='datetime64[ns]', name='Dates', length=377, freq=None)


In [16]:
#Bar Graph of most frequent words in speech

from nltk import FreqDist
from bokeh.io import output_notebook
# Render Bokeh output inline in the notebook.
output_notebook()

# Pool the cleaned text of every speech and count each word.
total_text = " ".join(text_document_list)
freq_dist_pos = FreqDist(total_text.split())

# Ten most frequent words as (word, count) pairs.
freq_dist = freq_dist_pos.most_common(10)
print(freq_dist)

# Separate the pairs into bar labels and bar heights.
words_list = [pair[0] for pair in freq_dist]
frequency_list = [pair[1] for pair in freq_dist]

# Categorical x axis: one red bar per word.
h = figure(
    title="Frequency of most common words in President Obama's Speeches",
    x_axis_label="Words",
    y_axis_label="Frequency",
    x_range=words_list,
)
h.vbar(x=words_list, top=frequency_list, color="red", width=0.5)

show(h)


[('people', 6613), ('go', 6176), ('make', 6051), ('get', 5485), ('us', 4234), ('work', 4160), ('say', 4011), ('president', 3865), ('one', 3594), ('year', 3547)]
