<h1 align='center'> Data Extraction and Text Analysis <br><br>Blackcoffer Consulting</h1>

In [1]:
#----------------Importing Basic packages---------------#
import pandas as pd
import numpy as np

In [2]:
#----------------Loading Data---------------#
df_cik = pd.read_excel("./cik_list.xlsx")
rows, cols = df_cik.shape
df_cik.head(5)

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt


In [3]:
link = 'https://www.sec.gov/Archives/'
df_cik.SECFNAME = link + df_cik.SECFNAME
df_cik.head()

0    https://www.sec.gov/Archives/edgar/data/3662/0...
1    https://www.sec.gov/Archives/edgar/data/3662/0...
2    https://www.sec.gov/Archives/edgar/data/3662/0...
3    https://www.sec.gov/Archives/edgar/data/3662/0...
4    https://www.sec.gov/Archives/edgar/data/3662/0...
Name: SECFNAME, dtype: object

In [4]:
#----------------Reading Dictornary---------------#

c_dict = set(pd.read_excel('./constraining_dictionary.xlsx',index_col = 0).index)
u_dict = set(pd.read_excel('./uncertainty_dictionary.xlsx', index_col = 0).index)

<h1>Text preprocessing</h1>

In [5]:
#----------------Importing Preprocessing packages---------------#
!pip install nltk
!pip install bs4
import requests
import re, string, unicodedata
import nltk
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer



In [6]:
nltk.download('stopwords')
nltk.download('punkt')
set_stoper = set(w.upper() for w in stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
!pip install glob2

import glob
path = "StopWords*.txt"
glob.glob(path)
for filename in glob.glob(path):
    with open(filename, 'r') as f:
        html = f.read()
        html = re.sub(r"\s+\|\s+[\w]*" , "", html)        
        set_stoper.update(html.upper().split())



In [8]:
from nltk.corpus import cmudict
nltk.download('cmudict')
d = cmudict.dict()

def vowel(word):
    count = 0
    vowels = 'aeiouy'
    word = word.lower()
    if word[0] in vowels:
        count +=1
    for index in range(1,len(word)):
        if word[index] in vowels and word[index-1] not in vowels:
            count +=1
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count+=1
    if count == 0:
        count +=1
    return count

def notvowel(word):
    try:
        return max([len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]])
    except KeyError:
        #if no word in cmudict
        return vowel(word)

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [9]:
def strip_html(html):
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(html):
    return re.sub('\[[^]]*\]', '', html)

def remove_digits(html):
    return re.sub('[\d%/$]', '', html)

def denoise_text(html):
    html = strip_html(html)
    html = remove_between_square_brackets(html)
    html = remove_digits(html)
    return html

def remove_non_ascii(total_words):
    """Remove non-ASCII characters from list of tokenized total_words"""
    new_words = []
    for word in total_words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_upper_case(total_words):
    """Convert all characters to lowercase from list of tokenized total_words"""
    new_words = []
    for word in total_words:
        new_word = word.upper()
        new_words.append(new_word)
    return new_words

def remove_punctuation(total_words):
    """Remove punctuation from list of tokenized total_words"""
    new_words = []
    for word in total_words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words


def remove_stopwords(total_words):
    """Remove stop total_words from list of tokenized total_words"""
    new_words = []
    for word in total_words:
        if word not in set_stoper:
            new_words.append(word)
    return new_words

def stem_words(total_words):
    """Stem total_words in list of tokenized total_words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in total_words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(total_words):
    """Lemmatize verbs in list of tokenized total_words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in total_words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(total_words):
    total_words = remove_non_ascii(total_words)
    total_words = to_upper_case(total_words)
    total_words = remove_punctuation(total_words)
    total_words = remove_stopwords(total_words)
    return total_words

def stem_and_lemmatize(total_words):
    stems = stem_words(total_words)
    lemmas = lemmatize_verbs(total_words)
    return stems, lemmas

In [10]:
# section names
MDA = "Management's Discussion and Analysis"
QQDMR = "Quantitative and Qualitative Disclosures about Market Risk"
RF = "Risk Factors"
section_name = ['MDA','QQDMR',"RF"]
section = [MDA.upper(),QQDMR.upper(),RF.upper()]
variables = ['positive_score','negative_score','polarity_score','average_sentence_length', 'percentage_of_complex_words',\
                   'fog_index','complex_word_count','word_count','uncertainty_score','constraining_score', 'positive_word_proportion',\
                   'negative_word_proportion', 'uncertainty_word_proportion', 'constraining_word_proportion' ]

In [11]:
import itertools

constrain_word_report = pd.Series(name='constrain_word_report')

df_col = [sec.lower() + '_' + var for sec,var in itertools.product(section_name,variables) ]
df = pd.DataFrame(columns=df_col)

  constrain_word_report = pd.Series(name='constrain_word_report')


In [12]:
df_cik.loc[64]

CIK                                                      4962
CONAME                                    AMERICAN EXPRESS CO
FYRMO                                                  201407
FDATE                                     2014-07-30 00:00:00
FORM                                                     10-Q
SECFNAME    https://www.sec.gov/Archives/edgar/data/4962/0...
Name: 64, dtype: object

In [13]:
# # saving all forms locally 
import os

for i in range(rows):   
    html = requests.get(df_cik.SECFNAME[i]).text
    file_name = 'form' + str(i)
    completeName = os.path.join('forms', file_name)
    f = open(completeName, 'a+', encoding="utf-8")
    f.write(html)
    f.close()

In [14]:
for i in range(rows):
    file_name = './forms/form' + str(i)
    html = open(file_name,'r').read()
    print('reading', end = ' ')
                
    
    #constrain_word_report
    constraining_words_whole_report_count = 0
    for word in denoise_text(html).split():
        if word in c_dict:
            constraining_words_whole_report_count += 1
        constrain_word_report.loc[i] = constraining_words_whole_report_count

    df.loc[i] = np.zeros(42)
    # other variable per sections
    for j in range(3):
        if i in [63,64]:
            continue
        print(i,j,sep= '|', end = ' ')
        exp = r".*(?P<start>ITEM [\d]\. " + re.escape(section[j]) + r")(?P<MDA>.*)(?P<body>[\s\S]*)(?P<end>ITEM \d|SIGNATURES)"
        regexp = re.compile(exp)
        s = regexp.search(html)
        
        if s:
            data = s.group('body')
            html = denoise_text(data)
            sent_list = sent_tokenize(html)
            sentence_length = len(sent_list)

            sample = html.split()
            sample = normalize(sample)
            word_count = len(sample)
            complex_word_count = 0
            
            for word in sample:
                if notvowel(word.lower()) > 2:
                    complex_word_count += 1
            
            average_sentence_length = word_count/sentence_length
            percentage_of_complex_words = complex_word_count/word_count
            fog_index = 0.4 * (average_sentence_length + percentage_of_complex_words)
            
            positive_score = 0
            negative_score = 0
            uncertainty_score = 0
            constraining_score = 0
            for word in sample:
                if word in u_dict:
                    uncertainty_score += 1
                if word in c_dict:
                    constraining_score += 1
            polarity_score = (positive_score-negative_score)/(positive_score + negative_score + .000001)
            positive_word_proportion = positive_score/word_count
            negative_word_proportion = negative_score/word_count
            uncertainty_word_proportion = uncertainty_score/word_count
            constraining_word_proportion = constraining_score/word_count
            
            df.loc[i][section_name[j].lower() + "_positive_score"] = positive_score
            df.loc[i][section_name[j].lower() + "_negative_score"] = negative_score
            df.loc[i][section_name[j].lower() + "_polarity_score"] = polarity_score
            df.loc[i][section_name[j].lower() + "_average_sentence_length"] = average_sentence_length
            df.loc[i][section_name[j].lower() + "_percentage_of_complex_words"] = percentage_of_complex_words
            df.loc[i][section_name[j].lower() + "_fog_index"] = fog_index
            df.loc[i][section_name[j].lower() + "_complex_word_count"] = complex_word_count
            df.loc[i][section_name[j].lower() + "_word_count"] = word_count
            df.loc[i][section_name[j].lower() + "_uncertainty_score"] = uncertainty_score
            df.loc[i][section_name[j].lower() + "_constraining_score"] = constraining_score
            df.loc[i][section_name[j].lower() + "_positive_word_proportion"] = positive_word_proportion
            df.loc[i][section_name[j].lower() + "_negative_word_proportion"] = negative_word_proportion
            df.loc[i][section_name[j].lower() + "_uncertainty_word_proportion"] = uncertainty_word_proportion
            df.loc[i][section_name[j].lower() + "_constraining_word_proportion"] = constraining_word_proportion

reading
0|0 0|1 0|2 reading
1|0 1|1 1|2 reading
2|0 2|1 2|2 reading
3|0 3|1 3|2 reading
4|0 4|1 4|2 reading
5|0 5|1 5|2 reading
6|0 6|1 6|2 reading
7|0 7|1 7|2 reading
8|0 8|1 8|2 reading
9|0 9|1 9|2 reading
10|0 10|1 10|2 reading
11|0 11|1 11|2 reading
12|0 12|1 12|2 reading
13|0 13|1 13|2 reading
14|0 14|1 14|2 reading
15|0 15|1 15|2 reading
16|0 16|1 16|2 reading
17|0 17|1 17|2 reading
18|0 18|1 18|2 reading
19|0 19|1 19|2 reading
20|0 20|1 20|2 reading
21|0 21|1 21|2 reading
22|0 22|1 22|2 reading
23|0 23|1 23|2 reading
24|0 24|1 24|2 reading
25|0 25|1 25|2 reading
26|0 26|1 26|2 reading
27|0 27|1 27|2 reading
28|0 28|1 28|2 reading
29|0 29|1 29|2 reading
30|0 30|1 30|2 reading
31|0 31|1 31|2 reading
32|0 32|1 32|2 reading
33|0 33|1 33|2 reading
34|0 34|1 34|2 reading
35|0 35|1 35|2 reading
36|0 36|1 36|2 reading
37|0 37|1 37|2 reading
38|0 38|1 38|2 reading
39|0 39|1 39|2 reading
40|0 40|1 40|2 reading
41|0 41|1 41|2 reading
42|0 42|1 42|2 reading
43|0 43|1 43|2 reading
44|0 44|1 

In [15]:
for i in range(rows):
    print(i)
    file_name = './forms/form' + str(i)
    html = open(file_name,'r').read()
    print('reading..', end = ' ')
    
    #constrain_word_report
    constrain_word_report.loc[i] = 0
    constraining_words_whole_report_count = 0
    for word in denoise_text(html).split():
        if word in c_dict:
            constraining_words_whole_report_count += 1
    constrain_word_report.loc[i] = constraining_words_whole_report_count

0
reading.. 1
reading.. 2
reading.. 3
reading.. 4
reading.. 5
reading.. 6
reading.. 7
reading.. 8
reading.. 9
reading.. 10
reading.. 11
reading.. 12
reading.. 13
reading.. 14
reading.. 15
reading.. 16
reading.. 17
reading.. 18
reading.. 19
reading.. 20
reading.. 21
reading.. 22
reading.. 23
reading.. 24
reading.. 25
reading.. 26
reading.. 27
reading.. 28
reading.. 29
reading.. 30
reading.. 31
reading.. 32
reading.. 33
reading.. 34
reading.. 35
reading.. 36
reading.. 37
reading.. 38
reading.. 39
reading.. 40
reading.. 41
reading.. 42
reading.. 43
reading.. 44
reading.. 45
reading.. 46
reading.. 47
reading.. 48
reading.. 49
reading.. 50
reading.. 51
reading.. 52
reading.. 53
reading.. 54
reading.. 55
reading.. 56
reading.. 57
reading.. 58
reading.. 59
reading.. 60
reading.. 61
reading.. 62
reading.. 63
reading.. 64
reading.. 65
reading.. 66
reading.. 67
reading.. 68
reading.. 69
reading.. 70
reading.. 71
reading.. 72
reading.. 73
reading.. 74
reading.. 75
reading.. 76
reading.. 77
readin

In [16]:
df = pd.concat([df_cik,df,constrain_word_report], axis = 1)
df.shape
df.to_csv('output.csv', index=False)