# 1. Data Collection
(Don't need to run these cells if files already downloaded from the first code file)

a) Scrapes the website and downloads all of Obama's speeches as PDF's [378 Speeches Downloaded]

b) Converts PDF's into text files (.txt) 

In [None]:
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "https://www.americanrhetoric.com/barackobamaspeeches.htm"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}

folder_location = r'webscraping_final'
if not os.path.exists(folder_location):
   os.mkdir(folder_location)

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
print(response.text)
for link in soup.select("a[href$='.pdf']"):
   filename = os.path.join(folder_location,link['href'].split('/')[-1])
   with open(filename, 'wb') as f:
      f.write(requests.get(urljoin(url,link['href']), headers=headers).content)

In [None]:
import os

from tika import parser
directory = r'webscraping_final'
folder_location = r'textdata_final'
if not os.path.exists(folder_location):
	os.mkdir(folder_location)
for filename in os.listdir(directory):
    print (filename)
    input_file= directory + "/" + filename
    file_data = parser.from_file(input_file)
    text = file_data['content']
    output_filepath= folder_location + "/" + filename + '.txt'
    f = open(output_filepath, "a")
    f.write(text)
    f.close()

# 2. Data Preparation
 a) Text Standardization: Expanding Contractions 
 
 b) Tokenization
 
 c) Lemmatization
 
 d) TFIDF

In [1]:
# Data Preparation: Cleans each speech by removing stop words & lemmatizing words to make sure they are of the same base
# Then, vectorized each speech and used inverse document frequency to get the relative importance (weight) of each word
# Lastly, used sklearn's train-test-split and Gaussian Naive Bayes Classifier

contractions = {
"can't": "cannot",
"can't've": "cannot have",
"could've": "could have",
"couldn't": "could not",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "I would",
"i'll": "I will",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"so've": "so have",
"that's": "that is",
"that'll": "that will",
"there's": "there is",
"they'd've": "they would have",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who shall / who will",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"would've": "would have",
"wouldn't": "would not",
"y'all": "you all",
"you'll": "you will",
"you're": "you are",
"you've": "you have",
"Biden's": "Biden is",
} #Dictionary with contractions: For each speech, I removed the contractions and made it to 2 separate words

import nltk
import numpy as np
from nltk.corpus import twitter_samples 
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import sklearn.model_selection as model_selection
from sklearn.naive_bayes import GaussianNB
from nltk import FreqDist
import glob
from pathlib import Path
from textblob import TextBlob
from sklearn.neighbors import KNeighborsClassifier

text_document_list = []
sentiment_list = [] #polarity associated with each speech from textblob
date_speeches = []  #Adds the date for every speech
speeches = glob.glob("*.txt")
path_to_speeches = "/Users/dhruv_batra/Desktop/SentimentAnalysis_Final/"

#Goes through each speech and adds the sentiment analysis score through TextBlob into the sentiment list
#Used TextBlob as the true y values for my Gaussian Naive Bayes Model
for file in glob.glob(path_to_speeches + "*.txt"):
    speech = open(file,"r").read()
    date_speeches.append(Path(file).stem[-10:])
    if(TextBlob(speech).sentiment.polarity*1000 <= -200): #multiplied sentiment by 1000 for easier readability
        sentiment_list.append("Very Negative")
    elif(-200 < TextBlob(speech).sentiment.polarity*1000 < 0):
        sentiment_list.append("Negative")
    elif(TextBlob(speech).sentiment.polarity*1000 == 0):
        sentiment_list.append("Neutral")
    elif(0 < TextBlob(speech).sentiment.polarity*1000 <= 250):
        sentiment_list.append("Positive")
    elif(TextBlob(speech).sentiment.polarity*1000 > 250):
        sentiment_list.append("Very Positive")

    for word in speech.split():
        if word.lower() in contractions: #replaces all contractions as 2 words
            speech = speech.replace(word, contractions[word.lower()])

    tokenizer = nltk.RegexpTokenizer(r"\w+") #Only contains alphanumeric characters (no punctuation)
    speech_tokens = tokenizer.tokenize(speech) #Tokenization

#Changes form of words:Lemmatizing
    def lemmatizer(speech_tokens):
        new_speech = []
        for word, tag in pos_tag(speech_tokens):
            if (tag[0:2] == "NN"):
                pos = "n"
            elif (tag[0:2] == "VB"):
                pos = "v"
            else:
                pos = "a"
            new_speech.append(WordNetLemmatizer().lemmatize(word,pos))
        return new_speech

#Removes unneccessary stuff/noise from Data: removing stop words
    stop_words = set(stopwords.words('english'))
    speech_new = lemmatizer(speech_tokens)
    speech_minus_stopwords = []

    for word in speech_new:
        if word.lower() not in stop_words:
            speech_minus_stopwords.append(word)

    filtered_speech = speech_minus_stopwords
    filtered_speech_lower = []
    for word in filtered_speech:
        filtered_speech_lower.append(word.lower()) #Added all of the words that were not stop words into a list
    
    speech_str = " ".join(filtered_speech_lower) #joined the words of each speech in the list to a string
    text_document_list.append(speech_str)

#Vectorized every speech in the text_document_list
vectorizer = CountVectorizer()
vectorizer.fit(text_document_list)


counts = vectorizer.transform(text_document_list)


count_vect = CountVectorizer()
X_tf = count_vect.fit_transform(text_document_list)

#Inverse Document Frequency to account for the commonality of each word in the speech
vectorize = TfidfTransformer()
vectorize.fit(counts)
freq = vectorize.transform(counts)

X_tfidf = vectorize.fit_transform(X_tf)

X_final = X_tfidf.toarray()
print(X_final) #2D Array of all speeches where each row vector contains the vectorized form for each speech

y = np.array(sentiment_list) #list of all TextBlob sentiment scores associated with each speech




[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.05388055 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.01869618 0.         ... 0.         0.         0.        ]
 [0.         0.01229029 0.         ... 0.         0.         0.        ]
 [0.         0.00581359 0.         ... 0.         0.         0.        ]]


# 3. Model Selection

a) Train-Test-Split

b) Algorithm: SciKit Learn's Gaussian Naive Bayes

c) Train Model

In [2]:
# 70% Training and 30% Testing
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.30)
print("X_train: ")
print(X_train)
print(X_train.shape)

print("X_test: ")
print(X_test)
print(X_test.shape)

print("y_train: ")
print(y_train)
print(y_train.shape)

print("y_test: ")
print(y_test)
print(y_test.shape)

#Gaussian Naive Bayes Model
gnb = GaussianNB()

X_train: 
[[0.         0.01312801 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.0215482  0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.00499305 0.         ... 0.         0.         0.        ]]
(264, 18362)
X_test: 
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.01406815 0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
(114, 18362)
y_train: 
['Positive' 'Positive' 'Positive' 'Positive' 'Positive' '

# 4. Model Prediction

In [3]:
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print(y_pred)

['Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Positive' 'Posit

# 5. Model Evaluation

In [4]:
#Determines the number of points that have been labeled incorrectly based on y_test

print("Number of mislabeled points out of a total %d points : %d"
       % (X_test.shape[0], (y_test != y_pred).sum()))
print("Percentage Correct w/ Gaussian Naive Bayes Classifier: " + str((1-((y_test != y_pred).sum())/X_test.shape[0])*100) + "%")


Number of mislabeled points out of a total 114 points : 1
Percentage Correct w/ Gaussian Naive Bayes Classifier: 99.12280701754386%
