# 1. Data Collection
a) Scrapes the website and downloads all of Obama's speeches as PDFs (378 speeches downloaded)

b) Converts the PDFs into text files (.txt)

c) Renames each text file to the date on which the speech was delivered

In [None]:

import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Index page that links to every speech PDF.
url = "https://www.americanrhetoric.com/barackobamaspeeches.htm"
# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}

# Folder that receives the downloaded PDFs; created on first run.
folder_location = r'webscraping_final'
os.makedirs(folder_location, exist_ok=True)

# Fail loudly if the index page itself cannot be fetched, instead of parsing
# an error page and silently downloading nothing.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Every anchor whose href ends in ".pdf" is treated as a speech.
for link in soup.select("a[href$='.pdf']"):
    pdf_url = urljoin(url, link['href'])  # hrefs may be relative to the index page
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    pdf_response = requests.get(pdf_url, headers=headers, timeout=30)
    if not pdf_response.ok:
        # Do not save an HTML error page under a .pdf name.
        print("Skipping %s (HTTP %s)" % (pdf_url, pdf_response.status_code))
        continue
    with open(filename, 'wb') as f:
        f.write(pdf_response.content)

In [None]:
import os

from tika import parser
# Convert every downloaded PDF into a plain-text file with Apache Tika.
directory = r'webscraping_final'      # input: PDFs written by the scraping cell
folder_location = r'textdata_final'   # output: one .txt per PDF
os.makedirs(folder_location, exist_ok=True)

for filename in os.listdir(directory):
    print(filename)
    input_file = os.path.join(directory, filename)
    file_data = parser.from_file(input_file)
    text = file_data['content']
    if text is None:
        # No extractable text was returned; writing None would raise TypeError.
        print("No text extracted from %s, skipping" % filename)
        continue
    # The output name keeps the original name and appends ".txt".
    output_filepath = os.path.join(folder_location, filename + '.txt')
    # "w" rather than "a": re-running the cell must replace the transcript,
    # not append a second copy of it.
    with open(output_filepath, "w") as f:
        f.write(text)


In [None]:
import os
from pathlib import Path
import re
import glob
import datetime
# Full English month names, in calendar order (index + 1 == month number).
list_of_months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
path_to_speeches = "/Users/dhruv_batra/Desktop/SentimentAnalysis_Final/textdata_final/"
date_of_speeches_list = []  # raw "day Month year" strings that have already been used for a rename


def parse_delivered_date(line):
    """Extract the speech date from a "Delivered ..." transcript line.

    Args:
        line: One line of a converted speech transcript.

    Returns:
        A ``(date_of_speech, modified_date)`` tuple, where ``date_of_speech``
        is the raw "day Month year" text and ``modified_date`` is the
        "year-month-day" string (month as an unpadded number) used as the new
        file name. ``None`` when the line does not start with
        "Delivered"/"delivered" or the text after it is not a three-part date
        containing a full English month name.
    """
    if not line.startswith(("Delivered", "delivered")):
        return None
    # The date follows "Delivered " (10 characters) and ends at the first
    # comma. partition() keeps the whole remainder when there is no comma;
    # slicing with find()'s -1 would silently drop the last digit of the year.
    date_of_speech = line[10:].partition(",")[0]
    components = date_of_speech.split(" ")
    if len(components) != 3 or components[1] not in list_of_months:
        return None
    day, month_name, year = components
    # Position in the list gives the month number without relying on the
    # process locale (as strptime("%B") does).
    month_number = list_of_months.index(month_name) + 1
    return date_of_speech, "%s-%s-%s" % (year, month_number, day)


for file in glob.glob(path_to_speeches + "*.txt"):
    with open(file, "r") as speech_handle:
        speech_lines = speech_handle.read().split("\n")
    for line in speech_lines:
        if not line.startswith(("Delivered", "delivered")):
            continue
        parsed = parse_delivered_date(line)
        if parsed is not None:
            date_of_speech, modified_date = parsed
            if date_of_speech not in date_of_speeches_list:  # only want one speech for every date
                date_of_speeches_list.append(date_of_speech)
                # The target is a bare file name, so it resolves against the
                # current working directory rather than the source folder.
                os.rename(file, '%s.txt' % modified_date)
        # Only the first "Delivered" line of a transcript is considered.
        break
print(len(date_of_speeches_list))  # number of speeches to be used for sentiment analysis

# 2. Data Preparation
 a) Text Standardization: Expanding Contractions 
 
 b) Tokenization
 
 c) Lemmatization
 
 d) TF-IDF weighting

In [None]:
# Data Preparation: cleans each speech by expanding contractions, lemmatizing words to their base form and removing stop words.
# Then vectorizes each speech and applies inverse document frequency to get the relative importance (weight) of each word.
# Lastly, uses sklearn's train_test_split and a multiple linear regression model to get the sentiment scores.

# Contraction -> expanded form. Each contraction found in a speech is replaced
# by its expansion before tokenization.
# Keys must be lowercase: the lookup is done on the lower-cased word.
contractions = {
    "can't": "cannot",
    "can't've": "cannot have",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "so've": "so have",
    "that's": "that is",
    "that'll": "that will",
    "there's": "there is",
    "they'd've": "they would have",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    # A single expansion: "who shall / who will" would inject both phrases
    # (and a stray slash) into the speech text.
    "who'll": "who will",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "y'all": "you all",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    # Lowercase key so the lower-cased lookup can actually reach this entry.
    "biden's": "Biden is",
}

import nltk
import numpy as np
from nltk.corpus import twitter_samples 
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import sklearn.model_selection as model_selection
from sklearn.naive_bayes import GaussianNB
from nltk import FreqDist
import glob
from pathlib import Path
from textblob import TextBlob
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import re

text_document_list = []  # one cleaned, space-joined string per speech
sentiment_list = []  # polarity associated with each speech from TextBlob
date_speeches = []  # date of every speech, taken from the end of its file name
speeches = glob.glob("*.txt")
path_to_speeches = "/Users/dhruv_batra/Desktop/SentimentAnalysis_Final/"

# Built once: none of these depend on the individual speech.
tokenizer = nltk.RegexpTokenizer(r"\w+")  # only word characters survive (no punctuation)
stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

# Lower-cased lookup plus one case-insensitive pattern over all contractions.
# Longest keys first so "can't've" wins over "can't"; the look-arounds stop a
# key from matching in the middle of a longer word.
contraction_lookup = {key.lower(): value for key, value in contractions.items()}
contraction_pattern = re.compile(
    r"(?<!\w)(?:%s)(?!\w)" % "|".join(
        re.escape(key) for key in sorted(contraction_lookup, key=len, reverse=True)
    ),
    re.IGNORECASE,
)


def expand_contractions(text):
    """Return *text* with every known contraction replaced by its expansion.

    Unlike a whitespace split, the pattern also matches contractions that sit
    next to punctuation ("don't," or "(it's").
    """
    # The dictionary keys use the ASCII apostrophe; typographic apostrophes
    # (U+2019) are normalised first so they can match too.
    text = text.replace("\u2019", "'")
    return contraction_pattern.sub(
        lambda match: contraction_lookup[match.group(0).lower()], text
    )


def lemmatizer(speech_tokens):
    """Lemmatize tokens using their part-of-speech tag.

    Tags starting with "NN" are lemmatized as nouns, tags starting with "VB"
    as verbs, and everything else as adjectives.

    Args:
        speech_tokens: List of word tokens from one speech.

    Returns:
        List of lemmatized tokens, in the same order.
    """
    new_speech = []
    for word, tag in pos_tag(speech_tokens):
        if tag.startswith("NN"):
            pos = "n"
        elif tag.startswith("VB"):
            pos = "v"
        else:
            pos = "a"
        new_speech.append(wordnet_lemmatizer.lemmatize(word, pos))
    return new_speech


# Goes through each speech and adds its TextBlob sentiment score to sentiment_list.
# The TextBlob scores are used as the true y values for the multiple regression model.
# sorted() makes the row order of the feature matrix the same on every run.
for file in sorted(glob.glob(path_to_speeches + "*.txt")):
    with open(file, "r") as speech_handle:
        speech = speech_handle.read()
    date_speeches.append(Path(file).stem[-10:])
    # Polarity is computed on the raw text and multiplied by 1000 for readability.
    sentiment_list.append(TextBlob(speech).sentiment.polarity*1000)

    # Expand contractions -> tokenize -> lemmatize -> drop stop words -> lowercase.
    speech_tokens = tokenizer.tokenize(expand_contractions(speech))
    filtered_speech_lower = [
        word.lower()
        for word in lemmatizer(speech_tokens)
        if word.lower() not in stop_words
    ]

    text_document_list.append(" ".join(filtered_speech_lower))

# Bag-of-words counts for every speech in text_document_list.
# The corpus is vectorised once; it was previously fitted and transformed
# twice with two identical CountVectorizer instances.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(text_document_list)

# Inverse document frequency to account for how common each word is across speeches.
vectorize = TfidfTransformer()
freq = vectorize.fit_transform(counts)

# Earlier names kept bound to the same objects so cells that refer to them keep working.
count_vect = vectorizer
X_tf = counts
X_tfidf = freq

X_final = X_tfidf.toarray()
print(X_final)  # 2D array: one row per speech, one TF-IDF weight per vocabulary word

y = np.array(sentiment_list)  # TextBlob sentiment score associated with each speech



# 3. Model Selection

a) Train-Test-Split

b) Algorithm: scikit-learn's Multiple Linear Regression

c) Train Model

In [None]:
# 70% Training and 30% Testing
# A fixed random_state makes the split reproducible, so the R^2 value and the
# plot do not change every time the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.30, random_state=42
)

# Show each split together with its shape.
for split_name, split_values in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(split_name + ": ")
    print(split_values)
    print(split_values.shape)

# Training Linear Regression (Multiple Regression): TF-IDF features -> TextBlob score
reg = LinearRegression().fit(X_train, y_train)

#print(reg.intercept_)


# 4. Model Prediction


In [None]:
# Predict a sentiment score for every speech in the held-out test split
# using the fitted linear regression model, then show the predictions.
y_pred = reg.predict(X_test) 
print(y_pred)


# 5. Model Evaluation


In [None]:
# R^2 (coefficient of determination) of the test-set predictions: the share of
# variation in y_test that is explained by the predictors in the model.
# y_test holds TextBlob scores, so this measures agreement with TextBlob.
r2_score(y_test, y_pred)

# 6. Plotting Results 

a) Dictionary of dates and sentiment scores

b) Ordering Results By Date

c) Plotting Time Series Graph with Bokeh

In [None]:
#Creating a dictionary with keys as Dates and values as sentiment scores

import sys
print(len(date_speeches))

# Score every speech with the fitted model in one vectorised call instead of
# one predict() call per row.
predictions = reg.predict(X_final)
list_of_sentiments = list(predictions.reshape(-1, 1))  # one 1-element array per speech
list_of_sentiments_final = predictions.tolist()  # plain floats, same order as date_speeches

# Dictionary of speech date -> model sentiment for that date.
# When two files share a date, dict() keeps only the last score for it.
sentiment_dict = dict(zip(date_speeches, list_of_sentiments_final))
print(sentiment_dict)

print(len(sentiment_dict))


In [None]:
#Ordering the Dictionary by date
# "20105-1-9" has a five-digit year, which the "%Y-%m-%d" format below does not
# accept, so the entry is dropped. pop() with a default keeps the cell
# re-runnable: `del` raised KeyError once the key was already gone.
sentiment_dict.pop("20105-1-9", None)
from datetime import datetime

# One {"Date", "Sentiment"} record per speech, ordered chronologically.
list_sentiment_dict = sorted(
    ({"Date": key, "Sentiment": value} for key, value in sentiment_dict.items()),
    key=lambda record: datetime.strptime(record['Date'], "%Y-%m-%d"),
)
print(list_sentiment_dict)
print(len(list_sentiment_dict))

In [None]:
#Plotting the data points in a time-series graph using Bokeh

from bokeh.plotting import figure, show
from bokeh.io import output_notebook
import numpy as np
import pandas as pd
from pandas import DataFrame
# Split the date-ordered records into parallel series for the two axes.
y_sentiment = [record['Sentiment'] for record in list_sentiment_dict]
x_dates = [record['Date'] for record in list_sentiment_dict]

# Parse the "year-month-day" strings into date objects.
dates = [datetime.strptime(date_text, '%Y-%m-%d').date() for date_text in x_dates]

# The frame only exists to provide a datetime index for the x-axis.
df = DataFrame(dates, columns=['Dates'])
df.index = pd.to_datetime(df['Dates'])
df.index.name = 'Dates'

print(df.index)

# Wide time-series figure: dashed red line through the scores, black dot per speech.
p = figure(
    x_axis_type="datetime",
    plot_height=800,
    plot_width=5000,
    x_axis_label="Date of Speech",
    y_axis_label="Sentiment Score (Multiplied by 1000)",
    title="Sentiment Analysis Score for Obama's Speeches",
)
p.line(df.index, y_sentiment, line_width=1, color="red", line_dash="dashed")
p.circle(df.index, y_sentiment, size=5, fill_color="black", color="black")

# Enlarge the title and axis labels to suit the very wide canvas.
p.title.text_font_size = '30pt'
for axis in (p.xaxis, p.yaxis):
    axis.axis_label_text_font_size = "15pt"
show(p)