### The objective is to do Sentiment analysis on the company filings reports

In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import requests
import re
import urllib.request
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

### The data is about company filings.

### This data is taken from 'EDGAR' (Electronic Data Gathering, Analysis and Retrieval system) of U.S Security and  exchange commision

In [2]:
data=pd.read_excel('cik_list.xlsx')[:5]

In [3]:
data.head()

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt


In [4]:
data['SECFNAME']=data['SECFNAME'].apply(lambda x:'https://www.sec.gov/Archives/'+x)

In [5]:
data.head(3)

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,https://www.sec.gov/Archives/edgar/data/3662/0...
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...


#### These Types of financial reports have their own stopwords that are provided on the website

### Creating An array of stopwords Given in different text files

In [6]:
fp1=open('StopWords_Auditor.txt')
fp2=open('StopWords_Currencies.txt')
fp3=open('StopWords_DatesandNumbers.txt')
fp4=open('StopWords_Generic.txt')
fp5=open('StopWords_GenericLong.txt')
fp6=open('StopWords_Geographic.txt')
fp7=open('StopWords_Names.txt')

In [7]:
d1=fp1.read()
d2=fp2.read()
d3=fp3.read()
d4=fp4.read()
d5=fp5.read()
d6=fp6.read()
d7=fp7.read()

In [8]:
s1=pd.Series(d1.split('\n')).apply(lambda x:x.lower().strip()).tolist()
s2=pd.Series(d2.split('\n'))
s21=s2.apply(lambda x:x.split('|')[0].lower().strip()).tolist()
s22=s2.apply(lambda x:x.split('|')[-1].lower().strip()).tolist()
s2=s21+s22
s3=pd.Series(d3.split('\n'))
s3=s3.apply(lambda x:x.split('|')[0].lower().strip()).tolist()
s4=pd.Series(d4.split('\n')).apply(lambda x:x.lower().strip()).tolist()
s5=pd.Series(d5.split('\n')).apply(lambda x:x.strip().lower()).tolist()
s6=pd.Series(d6.split('\n'))
s6=s6.apply(lambda x:x.split('|')[0].strip().lower()).tolist()
s7=pd.Series(d7.split('\n'))
s7=s7.apply(lambda x:x.split('|')[0].strip().lower()).tolist()

In [9]:
s_words=s1+s2+s3+s4+s5+s6+s7

In [10]:
stop_words=[]
for i in s_words:
    if i:
        stop_words.append(i)

In [11]:
stop_words

['ernst',
 'young',
 'deloitte',
 'touche',
 'kpmg',
 'pricewaterhousecoopers',
 'pricewaterhouse',
 'coopers',
 'afghani',
 'ariary',
 'baht',
 'balboa',
 'birr',
 'bolivar',
 'boliviano',
 'cedi',
 'colon',
 'córdoba',
 'dalasi',
 'denar',
 'dinar',
 'dirham',
 'dobra',
 'dong',
 'dram',
 'escudo',
 'euro',
 'florin',
 'forint',
 'gourde',
 'guarani',
 'gulden',
 'hryvnia',
 'kina',
 'kip',
 'konvertibilna marka',
 'koruna',
 'krona',
 'krone',
 'kroon',
 'kuna',
 'kwacha',
 'kwanza',
 'kyat',
 'lari',
 'lats',
 'lek',
 'lempira',
 'leone',
 'leu',
 'lev',
 'lilangeni',
 'lira',
 'litas',
 'loti',
 'manat',
 'metical',
 'naira',
 'nakfa',
 'new lira',
 'new sheqel',
 'ngultrum',
 'nuevo sol',
 'ouguiya',
 'pataca',
 'peso',
 'pound',
 'pula',
 'quetzal',
 'rand',
 'real',
 'renminbi',
 'rial',
 'riel',
 'ringgit',
 'riyal',
 'ruble',
 'rufiyaa',
 'rupee',
 'rupee',
 'rupiah',
 'shilling',
 'som',
 'somoni',
 'special drawing rights',
 'taka',
 'tala',
 'tenge',
 'tugrik',
 'vatu',
 '

### There is a file which contains the dictionary of positive , negative, syllables, uncertain_words, constraining words to categorize them from the reports.

In [12]:
words=pd.read_csv('LoughranMcDonald_MasterDictionary_2020.csv')

In [13]:
po_words=words[words['Positive']>0]['Word'].apply(lambda x:x.lower()).tolist()
ne_words=words[words['Positive']>0]['Word'].apply(lambda x:x.lower()).tolist()
com_words=words[words['Syllables']>2]['Word'].apply(lambda x:x.lower()).tolist()

In [14]:
uncer_words=pd.read_excel('uncertainty_dictionary.xlsx')['Word'].apply(lambda x:x.lower()).tolist()
const_words=pd.read_excel('constraining_dictionary.xlsx')['Word'].apply(lambda x:x.lower()).tolist()

### Now the Objective is to add some columns in the data containing the information of every report like , postive_score, negative_score, avg_sentence_length,fog_index, word_count,uncertainty_score,polarity_score etc.

In [28]:
port=PorterStemmer()
wnet=WordNetLemmatizer()

In [39]:
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
headers = {"User-Agent": user_agent}
po_count=[]
ne_count=[]
co_count=[]
avg_sen_lengths=[]
fog_indices=[]
percentage_co_words=[]
word_count=[]
uncertainty_score=[]
constraining_score=[]
polarity_score=[]
positive_word_proportion=[]
negative_word_proportion=[]
uncertainty_word_proportion=[]
constraining_word_proportion=[]
constraining_words_whole_report=[]
c=1
print(c)
for url in data['SECFNAME']:
    resp = requests.get(url, headers=headers)
    df=(resp.text).lower()
    text_1=word_tokenize(df)
    print('1st')
    clean_text_1=[]
    for i in text_1:
        res=re.sub(r'[^\w\s]',"",i)
        if res!="":
            clean_text_1.append(res)
    word_count.append(len(clean_text_1))
    clean_text=clean_text_1.copy()
    print('2nd')
    clean_text_2=[]
    for i in clean_text:
        if i.lower() in stop_words:
            clean_text.remove(i)
        else:
            clean_text_2.append(wnet.lemmatize(port.stem(i)))
            
    po=0
    ne=0
    co=0
    uncer=0
    const=0
    for j in clean_text_2:
        if j in po_words:
            po+=1
        elif j in ne_words:
            ne+=1
        elif j in com_words:
            co+=1
        elif j in uncer_words:
            uncer+=1
        elif j in const_words:
            const+=1       
    p_score=(po-ne)/((po+ne)+0.000001)
    polarity_score.append(p_score)
    print('3rd')        
    po_count.append(po)
    ne_count.append(ne)
    co_count.append(co)
    uncertainty_score.append(uncer)
    constraining_score.append(const)
    positive_word_proportion.append(po/len(clean_text_2))
    negative_word_proportion.append(ne/len(clean_text_2))
    uncertainty_word_proportion.append(uncer/len(clean_text_2))
    constraining_word_proportion.append(const/len(clean_text_2))
    constraining_words_whole_report.append(const)
    percentage_co_words.append(co/len(clean_text_2))

    sentences=df.split('.')
    avg_sen_len=len(clean_text_2)//len(sentences)
    avg_sen_lengths.append(avg_sen_len)
    per_co_words=co/len(clean_text_2)
    fog_index=round(0.4*(avg_sen_len + per_co_words),2)
    fog_indices.append(fog_index)
    c+=1

1
1st
2nd
3rd
4th
1st
2nd
3rd
4th
1st
2nd
3rd
4th
1st
2nd
3rd
4th
1st
2nd
3rd
4th


In [42]:
data['positive_score']=po_count
data['negative_score']=ne_count
data['polarity_score']=polarity_score
data['average_setence_length']=avg_sen_lengths
data['percentage_of_complex_words']=percentage_co_words
data['fog_index']=fog_indices
data['complex_word_count']=co_count
data['word_count']=word_count
data['uncertainty_score']=uncertainty_score
data['constraining_score']=constraining_score
data['positive_word_proportion']=positive_word_proportion
data['negative_word_proportion']=negative_word_proportion
data['uncertainty_word_proportion']=uncertainty_word_proportion
data['constraining_word_proportion']=constraining_word_proportion
data['constraining_words_whole_report']=constraining_words_whole_report

In [43]:
data

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME,positive_score,negative_score,polarity_score,average_setence_length,fog_index,complex_word_count,word_count,uncertainty_score,constraining_score,positive_word_proportion,negative_word_proportion,uncertainty_word_proportion,constraining_word_proportion,constraining_words_whole_report
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,https://www.sec.gov/Archives/edgar/data/3662/0...,49,0,1.0,0,767.4,3837,170952,32,339,24.5,0.0,16.0,169.5,339
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,31,0,1.0,0,463.2,2316,112214,25,190,15.5,0.0,12.5,95.0,190
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,0,0,0.0,0,2.6,13,905,0,0,0.0,0.0,0.0,0.0,0
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,https://www.sec.gov/Archives/edgar/data/3662/0...,47,0,1.0,0,362.6,1813,84741,29,172,23.5,0.0,14.5,86.0,172
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,0,0,0.0,0,3.0,15,1129,1,0,0.0,0.0,0.5,0.0,0
