# Data Extraction and Text Analysis
## Blackcoffer Consulting

In [1]:
# Importing libraries
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from nltk.tokenize import RegexpTokenizer, sent_tokenize
import numpy as np

In [2]:
# Filepath locations
stop_words_file = 'StopWords_Generic.txt'
positive_words_file = pd.read_excel("LoughranMcDonald_SentimentWordLists_2018.xlsx", "Positive")
nagitive_words_file = pd.read_excel("LoughranMcDonald_SentimentWordLists_2018.xlsx", "Negative")
uncertainty_dictionary_file = pd.read_excel('uncertainty_dictionary.xlsx')
constraining_dictionary_file = pd.read_excel('constraining_dictionary.xlsx')
cik_list_file = pd.read_excel('cik_list.xlsx')

In [3]:
# Creating stop words list
with open(stop_words_file ,'r') as stop_words:
    stop_words = stop_words.read().lower()
stop_words_list = stop_words.split('\n')
stop_words_list[-1:] = []

In [4]:
# Tokenizer
def tokenizer(text):
    text = text.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    filtered_words = list(filter(lambda token: token not in stop_words_list, tokens))
    return filtered_words

In [5]:
# Creating positive words list
positive_words_list = [word.lower() for word in positive_words_file.iloc[:,0].tolist()]

In [6]:
# Creating negative words list
negative_words_list = [word.lower() for word in nagitive_words_file.iloc[:,0].tolist()]

In [7]:
# Creating uncertain words list
uncertain_words_list = [word.lower() for word in uncertainty_dictionary_file.iloc[:,0].tolist()]

In [8]:
# Creating constraining words list
constraining_words_list = [word.lower() for word in constraining_dictionary_file.iloc[:,0].tolist()]

In [9]:
# Text extraction regex patterns
mda_regex = r"item[^a-zA-Z\n]*\d\s*\.\s*management\'s discussion and analysis.*?^\s*item[^a-zA-Z\n]*\d\s*\.*"
qqd_regex = r"item[^a-zA-Z\n]*\d[a-z]?\.?\s*Quantitative and Qualitative Disclosures about " \
            r"Market Risk.*?^\s*item\s*\d\s*"
riskfactor_regex = r"item[^a-zA-Z\n]*\d[a-z]?\.?\s*Risk Factors.*?^\s*item\s*\d\s*"
html_regex = re.compile(r'<.*?>')

## Section 1: Positive score, negative score, polarity score

In [10]:
# Calculating positive score 
def positive_score(text):
    psitive_words = 0
    raw_token = tokenizer(text)
    for word in raw_token:
        if word in positive_words_list:
            psitive_words  += 1
    return psitive_words

In [11]:
# Calculating Negative score
def negative_score(text):
    negetive_words=0
    raw_token = tokenizer(text)
    for word in raw_token:
        if word in negative_words_list:
            negetive_words +=1
    return negetive_words

In [12]:
# Calculating polarity score
def polarity_score(positive_score, negative_score):
    pol_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    return pol_score

## Section 2: Analysis of Readability -  Average Sentence Length, percentage of complex words, fog index

In [13]:
# Calculating Average sentence length 
# Average Sentence Length = the number of words / the number of sentences
     
def average_sentence_length(text):
    sentence_list = sent_tokenize(text)
    tokens = tokenizer(text)
    total_words_count = len(tokens)
    total_sentences = len(sentence_list)
    average_sent = 0
    if total_sentences != 0:
        average_sent = total_words_count / total_sentences
    average_sent_length= average_sent
    return round(average_sent_length)

In [14]:
# Calculating percentage of complex word 
# Percentage of Complex words = the number of complex words / the number of words 

def percentage_complex_words(text):
    tokens = tokenizer(text)
    complex_words = 0
    complex_words_percentage = 0
    for word in tokens:
        vowels=0
        if word.endswith(('es','ed')):
            pass
        else:
            for w in word:
                if(w=='a' or w=='e' or w=='i' or w=='o' or w=='u'):
                    vowels += 1
            if(vowels > 2):
                complex_words += 1
    if len(tokens) != 0:
        complex_words_percentage = complex_words/len(tokens)
    return complex_words_percentage

In [15]:
# calculating Fog Index 
# Fog Index = 0.4 * (Average Sentence Length + Percentage of Complex words)

def fog_index(average_sentence_length, percentage_complex_words):
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)
    return fog_index


## Section 4: Complex word count

In [16]:
# Counting complex words
def complex_words_count(text):
    tokens = tokenizer(text)
    complex_words = 0
    
    for word in tokens:
        vowels=0
        if word.endswith(('es','ed')):
            pass
        else:
            for w in word:
                if(w=='a' or w=='e' or w=='i' or w=='o' or w=='u'):
                    vowels += 1
            if(vowels > 2):
                complex_words += 1
    return complex_words

## Section 5: Word count

In [17]:
#Counting total words

def total_words_count(text):
    tokens = tokenizer(text)
    return len(tokens)

In [18]:
# calculating uncertainty_score

def uncertainty_score(text):
    uncertain_words =0
    raw_token = tokenizer(text)
    for word in raw_token:
        if word in uncertain_words_list:
            uncertain_words +=1
    uncertain_words_score = uncertain_words 
    
    return uncertain_words_score

In [19]:
# calculating constraining score

def constraining_score(text):
    constrain_word =0
    raw_token = tokenizer(text)
    for word in raw_token:
        if word in constraining_words_list:
            constrain_word +=1
    constrain_words_score = constrain_word 
    
    return constrain_words_score

In [20]:
# Calculating positive word proportion

def positive_words_prop(positive_score,word_count):
    positive_words_proportion = 0
    if word_count !=0:
        positive_words_proportion = positive_score / word_count
        
    return positive_words_proportion

In [21]:
# Calculating negative word proportion

def negative_words_prop(negative_score,word_count):
    negative_word_proportion = 0
    if word_count !=0:
        negative_word_proportion = negative_score / word_count
        
    return negative_word_proportion

In [22]:
# Calculating uncertain word proportion

def uncertain_words_prop(uncertain_score,word_count):
    uncertain_word_proportion = 0
    if word_count !=0:
        uncertain_word_proportion = uncertain_score / word_count
        
    return uncertain_word_proportion

In [23]:
# Calculating constraining word proportion

def constraining_words_prop(constraining_score,word_count):
    constraining_word_proportion = 0
    if word_count !=0:
        constraining_word_proportion = constraining_score / word_count
        
    return constraining_word_proportion

In [24]:
# calculating Constraining words for whole report

def constrain_words_whole(mdaText,qqdmrText,rfText):
    whole_doc = mdaText + qqdmrText + rfText
    constrain_word_whole =0
    raw_token = tokenizer(whole_doc)
    for word in raw_token:
        if word in constraining_words_list:
            constrain_word_whole +=1
    sum_constrain_word_whole = constrain_word_whole 
    return sum_constrain_word_whole

## Reading edgar files and extracting useful text.

In [25]:
def extract_text(cik_list_file):
    extraxted_text=[]
    processed_files = 0
    total_files = cik_list_file.shape[0]
    for url in cik_list_file.iloc[:,5]:
        resultdict = dict()
        html_doc = requests.get("https://www.sec.gov/Archives/" + url).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        text = soup.get_text()
        text = re.sub(html_regex,'',text)
        text = text.replace('&nbsp;','')
        text = re.sub(r'&#\d+;', '', text)
        matches_mda = re.findall(mda_regex, text, re.IGNORECASE | re.DOTALL | re.MULTILINE)
        if matches_mda:
            result = max(matches_mda, key=len)
            result = str(result).replace('\n', '')
            resultdict['mda_extract'] = result
        else:
            resultdict['mda_extract'] = ""
        match_qqd = re.findall(qqd_regex, text, re.IGNORECASE | re.DOTALL | re.MULTILINE)
        if match_qqd:
            result_qqd = max(match_qqd, key=len)
            result_qqd = str(result_qqd).replace('\n','')
            resultdict['qqd_extract']= result_qqd
        else:
            resultdict['qqd_extract'] = ""
        match_riskfactor = re.findall(riskfactor_regex, text, re.IGNORECASE | re.DOTALL | re.MULTILINE)
        if match_riskfactor:
            result_riskfactor = max(match_riskfactor, key=len)
            result_riskfactor = str(result_riskfactor).replace('\n', '')
            resultdict['riskfactor_extract'] = result_riskfactor
        else:
            resultdict['riskfactor_extract'] = ""
        extraxted_text.append(resultdict)
        processed_files+=1
        print("Processed files: " , processed_files , "/" , total_files)
        
    return(extraxted_text)

In [26]:
text_data =  extract_text(cik_list_file)

Processed files:  1 / 152
Processed files:  2 / 152
Processed files:  3 / 152
Processed files:  4 / 152
Processed files:  5 / 152
Processed files:  6 / 152
Processed files:  7 / 152
Processed files:  8 / 152
Processed files:  9 / 152
Processed files:  10 / 152
Processed files:  11 / 152
Processed files:  12 / 152
Processed files:  13 / 152
Processed files:  14 / 152
Processed files:  15 / 152
Processed files:  16 / 152
Processed files:  17 / 152
Processed files:  18 / 152
Processed files:  19 / 152
Processed files:  20 / 152
Processed files:  21 / 152
Processed files:  22 / 152
Processed files:  23 / 152
Processed files:  24 / 152
Processed files:  25 / 152
Processed files:  26 / 152
Processed files:  27 / 152
Processed files:  28 / 152
Processed files:  29 / 152
Processed files:  30 / 152
Processed files:  31 / 152
Processed files:  32 / 152
Processed files:  33 / 152
Processed files:  34 / 152
Processed files:  35 / 152
Processed files:  36 / 152
Processed files:  37 / 152
Processed 

## Creating a dataframe and writing required information.

In [27]:
data_frame = pd.DataFrame(text_data)
data_frame['CIK'] = cik_list_file['CIK']
data_frame['CONAME'] = cik_list_file['CONAME']
data_frame['FYRMO'] = cik_list_file['FYRMO']
data_frame['FDATE'] = cik_list_file['FDATE']
data_frame['FORM'] = cik_list_file['FORM']
data_frame['SECFNAME'] = cik_list_file['SECFNAME']

#--------------------------------------------Management's Discussion and Analysis----------------------------------------------#
data_frame['mda_positive_score'] = data_frame.mda_extract.apply(positive_score)
data_frame['mda_negative_score'] = data_frame.mda_extract.apply(negative_score)
data_frame['mda_polarity_score'] = np.vectorize(polarity_score)(data_frame['mda_positive_score'],data_frame['mda_negative_score'])
data_frame['mda_average_sentence_length'] = data_frame.mda_extract.apply(average_sentence_length)
data_frame['mda_percentage_of_complex_words'] = data_frame.mda_extract.apply(percentage_complex_words)
data_frame['mda_fog_index'] = np.vectorize(fog_index)(data_frame['mda_average_sentence_length'],data_frame['mda_percentage_of_complex_words'])
data_frame['mda_complex_word_count']= data_frame.mda_extract.apply(complex_words_count)
data_frame['mda_word_count'] = data_frame.mda_extract.apply(total_words_count)
data_frame['mda_uncertainty_score']=data_frame.mda_extract.apply(uncertainty_score)
data_frame['mda_constraining_score'] = data_frame.mda_extract.apply(constraining_score)
data_frame['mda_positive_word_proportion'] = np.vectorize(positive_words_prop)(data_frame['mda_positive_score'],data_frame['mda_word_count'])
data_frame['mda_negative_word_proportion'] = np.vectorize(negative_words_prop)(data_frame['mda_negative_score'],data_frame['mda_word_count'])
data_frame['mda_uncertainty_word_proportion'] = np.vectorize(uncertain_words_prop)(data_frame['mda_uncertainty_score'],data_frame['mda_word_count'])
data_frame['mda_constraining_word_proportion'] = np.vectorize(constraining_words_prop)(data_frame['mda_constraining_score'],data_frame['mda_word_count'])

#--------------------------------Quantitative and Qualitative Disclosures about Market Risk------------------------------------#
data_frame['qqdmr_positive_score'] = data_frame.qqd_extract.apply(positive_score)
data_frame['qqdmr_negative_score'] = data_frame.qqd_extract.apply(negative_score)
data_frame['qqdmr_polarity_score'] = np.vectorize(polarity_score)(data_frame['qqdmr_positive_score'],data_frame['qqdmr_negative_score'])
data_frame['qqdmr_average_sentence_length'] = data_frame.qqd_extract.apply(average_sentence_length)
data_frame['qqdmr_percentage_of_complex_words'] = data_frame.qqd_extract.apply(percentage_complex_words)
data_frame['qqdmr_fog_index'] = np.vectorize(fog_index)(data_frame['qqdmr_average_sentence_length'],data_frame['qqdmr_percentage_of_complex_words'])
data_frame['qqdmr_complex_word_count']= data_frame.qqd_extract.apply(complex_words_count)
data_frame['qqdmr_word_count'] = data_frame.qqd_extract.apply(total_words_count)
data_frame['qqdmr_uncertainty_score']=data_frame.qqd_extract.apply(uncertainty_score)
data_frame['qqdmr_constraining_score'] = data_frame.qqd_extract.apply(constraining_score)
data_frame['qqdmr_positive_word_proportion'] = np.vectorize(positive_words_prop)(data_frame['qqdmr_positive_score'],data_frame['qqdmr_word_count'])
data_frame['qqdmr_negative_word_proportion'] = np.vectorize(negative_words_prop)(data_frame['qqdmr_negative_score'],data_frame['qqdmr_word_count'])
data_frame['qqdmr_uncertainty_word_proportion'] = np.vectorize(uncertain_words_prop)(data_frame['qqdmr_uncertainty_score'],data_frame['qqdmr_word_count'])
data_frame['qqdmr_constraining_word_proportion'] = np.vectorize(constraining_words_prop)(data_frame['qqdmr_constraining_score'],data_frame['qqdmr_word_count'])

#-------------------------------------------------Risk Factors------------------------------------------------------------------#
data_frame['rf_positive_score'] = data_frame.riskfactor_extract.apply(positive_score)
data_frame['rf_negative_score'] = data_frame.riskfactor_extract.apply(negative_score)
data_frame['rf_polarity_score'] = np.vectorize(polarity_score)(data_frame['rf_positive_score'],data_frame['rf_negative_score'])
data_frame['rf_average_sentence_length'] = data_frame.riskfactor_extract.apply(average_sentence_length)
data_frame['rf_percentage_of_complex_words'] = data_frame.riskfactor_extract.apply(percentage_complex_words)
data_frame['rf_fog_index'] = np.vectorize(fog_index)(data_frame['rf_average_sentence_length'],data_frame['rf_percentage_of_complex_words'])
data_frame['rf_complex_word_count']= data_frame.riskfactor_extract.apply(complex_words_count)
data_frame['rf_word_count'] = data_frame.riskfactor_extract.apply(total_words_count)
data_frame['rf_uncertainty_score']=data_frame.riskfactor_extract.apply(uncertainty_score)
data_frame['rf_constraining_score'] = data_frame.riskfactor_extract.apply(constraining_score)
data_frame['rf_positive_word_proportion'] = np.vectorize(positive_words_prop)(data_frame['rf_positive_score'],data_frame['rf_word_count'])
data_frame['rf_negative_word_proportion'] = np.vectorize(negative_words_prop)(data_frame['rf_negative_score'],data_frame['rf_word_count'])
data_frame['rf_uncertainty_word_proportion'] = np.vectorize(uncertain_words_prop)(data_frame['rf_uncertainty_score'],data_frame['rf_word_count'])
data_frame['rf_constraining_word_proportion'] = np.vectorize(constraining_words_prop)(data_frame['rf_constraining_score'],data_frame['rf_word_count'])


data_frame['constraining_words_whole_report'] = np.vectorize(constrain_words_whole)(data_frame['mda_extract'],data_frame['qqd_extract'],data_frame['riskfactor_extract'])

input_text_col = ['mda_extract','qqd_extract','riskfactor_extract']
output = data_frame.drop(input_text_col,1)

## Output

In [28]:
output

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME,mda_positive_score,mda_negative_score,mda_polarity_score,mda_average_sentence_length,...,rf_fog_index,rf_complex_word_count,rf_word_count,rf_uncertainty_score,rf_constraining_score,rf_positive_word_proportion,rf_negative_word_proportion,rf_uncertainty_word_proportion,rf_constraining_word_proportion,constraining_words_whole_report
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt,17,61,-0.564103,24,...,0.000000,0,0,0,0,0,0,0,0,10
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt,9,46,-0.672727,30,...,0.000000,0,0,0,0,0,0,0,0,2
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt,0,0,0.000000,0,...,0.000000,0,0,0,0,0,0,0,0,0
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt,40,119,-0.496855,23,...,0.000000,0,0,0,0,0,0,0,0,39
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt,0,0,0.000000,0,...,0.000000,0,0,0,0,0,0,0,0,0
5,3662,SUNBEAM CORP/FL/,199811,1998-11-25,10-Q/A,edgar/data/3662/0000950170-98-002278.txt,18,63,-0.555556,23,...,0.000000,0,0,0,0,0,0,0,0,23
6,3662,SUNBEAM CORP/FL/,199812,1998-12-22,10-Q,edgar/data/3662/0000950170-98-002401.txt,39,106,-0.462069,22,...,0.000000,0,0,0,0,0,0,0,0,34
7,3662,SUNBEAM CORP/FL/,199812,1998-12-22,10-Q,edgar/data/3662/0000950170-98-002402.txt,37,102,-0.467626,22,...,0.000000,0,0,0,0,0,0,0,0,32
8,3662,SUNBEAM CORP/FL/,199903,1999-03-31,NT 10-K,edgar/data/3662/0000950172-99-000362.txt,0,0,0.000000,0,...,0.000000,0,0,0,0,0,0,0,0,0
9,3662,SUNBEAM CORP/FL/,199905,1999-05-11,10-K,edgar/data/3662/0000950170-99-000775.txt,70,271,-0.589443,23,...,0.000000,0,0,0,0,0,0,0,0,74


## Writing to csv file

In [30]:
output.to_csv('text_analysis_output.csv', sep=',', encoding='utf-8')