In [76]:
import re
import PyPDF2

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer, WordNetLemmatizer

import pandas as pd
import json
import pprint

import seaborn as sns
import matplotlib.pyplot as plt 

%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BennettCelichowski\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Package stopwords is already up-to-date!


In [77]:
# CREATE PDF OBJECTS AND BASIC PARAMS

# Open the source PDF in binary mode. Without this line `pdf_obj` only exists
# if it was left over from an earlier kernel session, and the cell fails with
# NameError after Restart & Run All.
# NOTE(review): the handle is intentionally left open; the reader is used by
# later cells and presumably reads pages from the stream on demand — confirm
# before closing it here.
pdf_obj = open('CIM-02-American-Casino.pdf','rb')
pdf_Reader = PyPDF2.PdfFileReader(pdf_obj)

# Total page count; drives the extraction and cleaning loops in later cells.
num_pages = pdf_Reader.numPages

print(num_pages)



54


In [78]:
# EXTRACT TEXT FROM EVERY PAGE

# One extracted-text string per page, in page order.
text_list = [
    pdf_Reader.getPage(page_index).extractText()
    for page_index in range(num_pages)
]

In [79]:
# Number of page texts collected by the extraction loop.
len(text_list)

54

In [80]:
# CLEAN TEXT

stop_words = stopwords.words('english')
# Set copy for O(1) membership tests inside the loop; `stop_words` itself is
# left as a list because other cells read it.
stop_word_set = set(stop_words)

# LEMMATIZER
# NOTE(review): WordNetLemmatizer needs the NLTK 'wordnet' corpus, while the
# import cell only downloads 'stopwords' — confirm it is installed.
lemmatizer = WordNetLemmatizer()

# CLEANING LOOP

clean_data = []

# Iterate the extracted pages directly instead of indexing by page count.
for raw_page in text_list:
    # REPLACE EVERY NON-LETTER CHARACTER (PUNCTUATION, DIGITS) WITH A SPACE
    letters_only = re.sub('[^a-zA-Z]',' ',raw_page)
    # MAKE EVERYTHING LOWERCASE AND SPLIT INTO TOKENS
    tokens = letters_only.lower().split()
    # DROP STOPWORDS AND LEMMATIZE WHAT REMAINS
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_word_set]
    # RE-JOIN THE REMAINING WORDS INTO ONE CLEANED STRING PER PAGE
    clean_data.append(' '.join(lemmas))



In [81]:
# SEPARATE WORDS

# Flatten the per-page cleaned strings into one list of individual words.
clean_words = [
    token
    for cleaned_page in clean_data
    for token in cleaned_page.split()
]




In [82]:
# SEPARATE WORDS & PAGE #'s


def text_by_page(data):
    """Extract the text of every page of a PDF reader into a one-column DataFrame.

    Parameters
    ----------
    data : object
        The PDF reader to pull pages from. It must expose ``numPages``,
        ``getPage(index)`` and ``getPageNumber(page)``, and its pages must
        expose ``extractText()`` (the interface this notebook uses on its
        PyPDF2 reader). The reader passed in is the one that is read; no
        module-level reader or page count is consulted.

    Returns
    -------
    pandas.DataFrame
        One row per page, indexed by the page number reported by the reader,
        with the extracted text in column ``0``.
    """
    clean_page = {}

    for page_index in range(data.numPages):
        # Fetch the page once and reuse it for both the text and its number.
        page_obj = data.getPage(page_index)
        clean_page[data.getPageNumber(page_obj)] = page_obj.extractText()

    # A dict of scalars needs an explicit index to build a frame; transposing
    # turns that single row into one row per page.
    return pd.DataFrame(clean_page, index=[0]).transpose()

# Raw (uncleaned) text of every page, one row per page in column 0.
page_text = text_by_page(pdf_Reader)

# (rows, columns) of the per-page frame.
page_text.shape


(54, 1)

In [83]:
# CLEAN PAGE DATA

# Cleaned page strings from the most recent clean_pages() call; kept at module
# level because a later cell passes this list to Vaderize_pages().
clean_page_data = []

def clean_pages(data):
    """Clean the raw text of each page and return it as a Page/text DataFrame.

    Each page string is reduced to letters only, lower-cased, split into
    tokens, stripped of stopwords (module-level ``stop_words``) and lemmatized
    (module-level ``lemmatizer``), then re-joined with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column ``0`` holds one raw text string per page.

    Returns
    -------
    pandas.DataFrame
        Columns ``Page`` (0-based row position) and ``text`` (cleaned string).
        Both columns are present even when ``data`` has no rows.

    Side effects
    ------------
    Replaces the contents of the module-level ``clean_page_data`` list with
    the cleaned strings of this call, so repeated calls do not accumulate
    rows from earlier calls.
    """
    cleaned = []

    for raw_text in data[0]:
        # REPLACE EVERY NON-LETTER CHARACTER (PUNCTUATION, DIGITS) WITH A SPACE
        letters_only = re.sub('[^a-zA-Z]',' ',raw_text)
        # MAKE EVERYTHING LOWERCASE AND SPLIT INTO TOKENS
        tokens = letters_only.lower().split()
        # DROP STOPWORDS AND LEMMATIZE WHAT REMAINS
        lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        # RE-JOIN THE REMAINING WORDS INTO ONE CLEANED STRING PER PAGE
        cleaned.append(' '.join(lemmas))

    # Mirror this call's result into the shared list in place, so existing
    # references to the list see the new contents.
    clean_page_data[:] = cleaned

    clean_page_df = (
        pd.DataFrame({'text': cleaned})
        .reset_index()
        .rename(columns={'index': 'Page'})
    )

    return clean_page_df

# Run the cleaner over the per-page text frame.
# NOTE(review): this rebinds the name `clean_pages` from the function to the
# DataFrame it returns, so the function cannot be called again until its `def`
# is re-executed. Later cells read `clean_pages` as the DataFrame.
clean_pages = clean_pages(page_text)

# (rows, columns) of the cleaned frame.
clean_pages.shape


(54, 2)

In [84]:
from collections import Counter

def count_common_words(lst,count):
    """Return the `count` most frequent items of `lst` as (item, frequency) pairs.

    The pairs are ordered from most to least frequent, exactly as produced by
    ``collections.Counter.most_common``.
    """
    frequency_table = Counter(lst)
    return frequency_table.most_common(count)


# The 30 most frequent cleaned words as (word, count) pairs, loaded into a DataFrame.
most_common_words = pd.DataFrame(count_common_words(clean_words,30))

In [85]:
# IMPORT AFINN MODEL TO EVALUATE POSITIVE / NEGATIVE SENTIMENT OF THE PAGE TEXT

from afinn import Afinn


# SET AFINN DEFAULT AS ENGLISH
afinn = Afinn(language = 'en')

In [86]:
def AFINN(data):
    """Compute a length-normalized AFINN sentiment score for every page.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``text`` column holding one string per page.

    Returns
    -------
    pandas.DataFrame
        Columns ``Page`` (0-based row position) and ``AFINN SCORE`` (the
        module-level ``afinn`` scorer's result divided by the page's word
        count). A page with no words scores ``0.0``.
    """
    afinn_lst = []

    for page_str in data['text']:
        word_count = len(page_str.split())

        # A page that is empty after cleaning has nothing to score; report 0.0
        # instead of dividing by zero.
        if word_count == 0:
            afinn_lst.append(0.0)
            continue

        # NORMALIZE FOR PAGE LENGTH
        afinn_lst.append(afinn.score(page_str) / word_count)

    afinn_df = (
        pd.DataFrame({'AFINN SCORE': afinn_lst})
        .reset_index()
        .rename(columns={'index': 'Page'})
    )

    return afinn_df

# Score every cleaned page; `clean_pages` here is the DataFrame produced by the
# cleaning cell (the name was rebound from the function to its result).
AFINN_df = AFINN(clean_pages)

AFINN_df

Unnamed: 0,Page,AFINN SCORE
0,0,0.0
1,1,0.050595
2,2,0.0
3,3,0.0
4,4,0.080357
5,5,0.040323
6,6,-0.098765
7,7,0.0
8,8,0.132743
9,9,0.025907


In [87]:
# VADER SENTIMENT  - IMPORT LIBS

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Shared analyzer instance; the Vaderize_* functions below call its polarity_scores().
analyzer = SentimentIntensityAnalyzer()

In [88]:
# APPLY VADER SENTIMENT

# Text -> VADER scores from the most recent Vaderize_words() call; kept at
# module level because other code in this notebook reads it.
Vader_dict = {}

def Vaderize_words(data):
    """Score each string in `data` with VADER and return one row per string.

    Parameters
    ----------
    data : iterable of str
        One text string per page, in page order.

    Returns
    -------
    pandas.DataFrame
        Column ``Page`` (0-based position in `data`), column ``text`` (the
        input string), followed by one column per key of the dict returned by
        the module-level ``analyzer.polarity_scores``. Every input string gets
        its own row, including strings that repeat, so ``Page`` always matches
        the position in `data`.

    Side effects
    ------------
    Replaces the contents of the module-level ``Vader_dict`` with this call's
    text -> scores mapping (repeated strings share one entry there).
    """
    # Start clean so rows from an earlier call cannot leak into this result.
    Vader_dict.clear()

    records = []
    for page_str in data:
        Vader_sentiment = analyzer.polarity_scores(page_str)
        Vader_dict[page_str] = Vader_sentiment
        # Row-wise records (not a dict keyed by text) keep duplicate pages.
        records.append({'text': page_str, **Vader_sentiment})

    Vader_df = (
        pd.DataFrame(records)
        .reset_index()
        .rename(columns={'index': 'Page'})
    )

    return Vader_df

# VADER scores for each cleaned page string.
# The outer pd.DataFrame(...) wraps a value that Vaderize_words already returns as a DataFrame.
Vader_page = pd.DataFrame(Vaderize_words(clean_pages['text']))

Vader_page.head()


Unnamed: 0,Page,text,neg,neu,pos,compound
0,0,book number issued confidential information me...,0.0,0.822,0.178,0.0772
1,1,bear stearns co inc v confidential confidentia...,0.018,0.83,0.153,0.9915
2,2,bear stearns co inc v confidential table conte...,0.0,1.0,0.0,0.0
3,3,section executive summary,0.0,1.0,0.0,0.0
4,4,bear stearns co inc confidential executive sum...,0.0,0.825,0.175,0.9888


In [89]:
# JOIN AFINN AND VADER

# Left join: keep every VADER row and attach the AFINN score with the same Page value.
Sentiment_df = pd.merge(Vader_page, AFINN_df, on='Page', how='left')
Sentiment_df.head()

Unnamed: 0,Page,text,neg,neu,pos,compound,AFINN SCORE
0,0,book number issued confidential information me...,0.0,0.822,0.178,0.0772,0.0
1,1,bear stearns co inc v confidential confidentia...,0.018,0.83,0.153,0.9915,0.050595
2,2,bear stearns co inc v confidential table conte...,0.0,1.0,0.0,0.0,0.0
3,3,section executive summary,0.0,1.0,0.0,0.0,0.0
4,4,bear stearns co inc confidential executive sum...,0.0,0.825,0.175,0.9888,0.080357


In [92]:
# Per-page mean of each sentiment measure, rounded to 4 decimals and ordered
# from the highest to the lowest VADER compound score.
Sentiment_pivot = (
    pd.pivot_table(
        data=Sentiment_df,
        index='Page',
        values=['neg', 'pos', 'AFINN SCORE', 'compound'],
        aggfunc='mean',
    )
    .round(4)
    .sort_values(by='compound', ascending=False)
)

Sentiment_pivot

Unnamed: 0_level_0,AFINN SCORE,compound,neg,pos
Page,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
21,0.0985,0.9935,0.0,0.171
22,0.0601,0.992,0.02,0.176
35,0.0979,0.9919,0.025,0.218
1,0.0506,0.9915,0.018,0.153
23,0.068,0.9914,0.027,0.214
53,0.0596,0.9897,0.025,0.2
18,0.0708,0.9893,0.0,0.155
4,0.0804,0.9888,0.0,0.175
13,0.1193,0.9882,0.007,0.21
11,0.1875,0.9869,0.012,0.302


In [91]:
# ID PAGES WITH MOST +/- SENTIMENT


# Text -> VADER scores from the most recent Vaderize_pages() call.
Vader_page_dict = {}

def Vaderize_pages(data):
    """Score each page string in `data` with VADER, indexed by the page text.

    Parameters
    ----------
    data : iterable of str
        One text string per page, in page order. The strings passed in are
        the ones that get scored.

    Returns
    -------
    pandas.DataFrame
        One row per input string, indexed by that string, with one column per
        key of the dict returned by the module-level
        ``analyzer.polarity_scores``. Repeated strings keep separate rows.

    Side effects
    ------------
    Replaces the contents of the module-level ``Vader_page_dict`` with this
    call's text -> scores mapping.
    """
    page_strings = list(data)

    # Start clean so entries from an earlier call cannot linger.
    Vader_page_dict.clear()

    score_rows = []
    for page_str in page_strings:
        Vader_sentiment = analyzer.polarity_scores(page_str)
        Vader_page_dict[page_str] = Vader_sentiment
        score_rows.append(Vader_sentiment)

    # Build the frame from this call's own scores (not from Vader_dict, which
    # belongs to Vaderize_words).
    Vader_page_df = pd.DataFrame(score_rows, index=page_strings)
    return Vader_page_df

# Shape of the page-level VADER frame; called with the cleaned page strings collected by clean_pages().
Vaderize_pages(clean_page_data).shape



(54, 4)