# Author: Arsh Dinesh Vijayvargiya

## *INDEX*

### 1. *[*Data Loading*](#first)
### 2. *[*Loading Web Data*](#second)
### 3. *[*Creating Utility Functions*](#third)
### 4. *[*Solution*](#fourth)


<a class="anchor" id="first"></a>
## Importing Libraries

In [1]:
from string import punctuation
import nltk
import pandas as pd
import numpy as np
import re
import requests


## Importing Data and Lists 


In [2]:
# Data Loading

data = pd.read_excel(r'Documents\Blackcoffer\cik_list.xlsx')
master_dict = pd.read_excel(r'Documents\Blackcoffer\Master_dictionary.xlsx')

stop_words = []
with open('Documents\Blackcoffer\StopWords_GenericLong.txt') as words:
    for word in words: 
        stop_words.append(word.replace('\n',''))

In [3]:
#positive and negative word list:
positive_words = [words.lower() for words in master_dict.Word[master_dict.Positive > 0]]

negative_words = [words.lower() for words in master_dict.Word[master_dict.Negative > 0]]

In [4]:
uncertain = pd.read_excel(r'Documents\Blackcoffer\uncertainty_dictionary.xlsx')
uncertain_words = [word for word in uncertain.Word]
uncertain_words[0]

'ABEYANCE'

In [5]:
constrain = pd.read_excel(r'Documents\Blackcoffer\constraining_dictionary.xlsx')
constrain_words = [word for word in constrain.Word]
constrain_words[:5]

['ABIDE', 'ABIDING', 'BOUND', 'BOUNDED', 'COMMIT']

<a class="anchor" id="second"></a>
## Getting Data from Web

In [6]:
# defining a function to extract document from web
path_adder = 'https://www.sec.gov/Archives/'


def find_mda(file_content,result_dict):
    mda_regex = r"item[^a-zA-Z\n]*\d\s*\.\s*management\'s discussion and analysis.*?^\s*item[^a-zA-Z\n]*\d\s*\.*"
    matches_mda = re.findall(mda_regex, file_content, re.IGNORECASE | re.DOTALL | re.MULTILINE)
    if matches_mda:
        result = max(matches_mda, key=len)
        result = str(result).replace('\n', '')
        result_dict['mda_extract'] = result
    else:
        result_dict['mda_extract'] = ""
    return result_dict


def find_qqdmr(file_content,result_dict):
    qqdmr_regex = r"item[^a-zA-Z\n]*\d[a-z]?\.?\s*Quantitative and Qualitative Disclosures about Market Risk.*?^\s*item\s*\d\s*"
    match_qqdmr = re.findall(qqdmr_regex, file_content, re.IGNORECASE | re.DOTALL | re.MULTILINE)
    if match_qqdmr:
        result_qqdmr = max(match_qqdmr, key=len)
        result_qqdmr = str(result_qqdmr).replace('\n','')
        result_dict['qqdmr_extract']= result_qqdmr
    else:
        result_dict['qqdmr_extract'] = ""
    return result_dict


def find_rf(file_content,result_dict):
    rf_regex = r"item[^a-zA-Z\n]*\d[a-z]?\.?\s*Risk Factors.*?^\s*item\s*\d\s*"
    match_rf = re.findall(rf_regex, file_content, re.IGNORECASE | re.DOTALL | re.MULTILINE)
    if match_rf:
        result_rf = max(match_rf, key=len)
        result_rf = str(result_rf).replace('\n', '')
        result_dict['rf_extract'] = result_rf
    else:
        result_dict['rf_extract'] = ""
    return result_dict


def get_file(path):
    comp_path = path_adder+path
#     removing HTML tags
    html_regex = re.compile(r'<.*?>')
    result_dict = {}
    content = requests.get(comp_path).text
    file_content = content.lower()
    del content
    file_content = re.sub(html_regex,'',file_content)
    file_content = file_content.replace('&nbsp;','')
    file_content = re.sub(r'&#\d+;', '', file_content)
    find_mda(file_content,result_dict)
    find_qqdmr(file_content,result_dict)
    find_rf(file_content,result_dict)
    del file_content
    return result_dict


In [7]:
#testing functions
path = data.SECFNAME[0]
first = get_file(path)
first

 'qqdmr_extract': '',
 'rf_extract': ''}

<a class="anchor" id="third"></a>
## Creating Utility Functions

In [8]:
#removing stop words
#returns tokenised words
def simplify(content):
    if content == '':
        return ''
    content_word = nltk.word_tokenize(content)
    tokenised_word = [token for token in content_word if token not in stop_words and list(punctuation)]
    return tokenised_word

In [9]:
def word_count(content):
    if content == '':
        return 0
    tokenized_word = simplify(content)
    return len(tokenized_word)

In [10]:
def pos_score(content):
    if content == '':
        return 0
    tokenised_content = simplify(content)
    return sum([1 for words in tokenised_content if words in positive_words])

In [11]:
def neg_score(content):
    if content == '':
        return 0
    tokenised_content = simplify(content)
    return sum([1 for words in tokenised_content if words in negative_words])

In [12]:
#here we aren't gonna call pos_score or neg_score function inside rather just use the pre-computed value
def polarity_score(pos_score,neg_score):
    return (pos_score-neg_score)/(pos_score+neg_score + 0.000001)

In [13]:
def avg_sent_len(word_count,content):
    if content == '':
        return 0
    content_sent = nltk.sent_tokenize(content)
    sent_len = len(content_sent)
    return (word_count/sent_len)

In [14]:
def complex_word_count(content): 
    if content == '':
        return 0
    tokenised_word = simplify(content)
    complex_word = 0
    vowel = ['a','e','i','o','u']
    for word in tokenised_word:
        if word.endswith(('es','ed')):
            pass
        else:
            vowel_count = sum([1 for w in word if w in vowel])
            if vowel_count > 2:
                complex_word += 1       
    return complex_word
            
    

In [15]:
def complex_word_percent(complex_count,word_count):
    if word_count == 0:
        return 0
    return round((complex_count/word_count)*100,4)

In [16]:
def fog_index(avg_sent_length,complex_word_per):
    return 0.4*(avg_sent_length + complex_word_per)

In [17]:
def uncertain_score(content):
    if content == '':
        return 0
    tokenised_content = simplify(content)
    return sum([1 for words in tokenised_content if words in uncertain_words])

In [18]:
def constrain_score(content):
    if content == '':
        return 0
    tokenised_content = simplify(content)
    return sum([1 for words in tokenised_content if words in constrain_words])

In [19]:
def pos_word_prop(pos_score,word_count):
    return pos_score/word_count if word_count != 0 else 0

In [20]:
def neg_word_prop(neg_score,word_count):
    return neg_score/word_count if word_count != 0 else 0

In [21]:
def cons_word_prop(cons_score,word_count):
    return cons_score/word_count if word_count != 0 else 0

In [22]:
def uncertain_word_prop(uncer_score,word_count):
    return uncer_score/word_count if word_count != 0 else 0

In [23]:
def constrain_word_whole(mda_word_count,qqdmr_word_count,rf_word_count):
    return mda_word_count+qqdmr_word_count+rf_word_count
    

<a class="anchor" id="fourth"></a>
## Solution

In [24]:
extract_data = []
for path in data.SECFNAME:
    extract_data.append(get_file(path))
    
extract_df = pd.DataFrame(extract_data)
out_df = pd.concat([data,extract_df],axis=1)

In [25]:
# Working on MDA data

out_df['mda_positive_score'] = out_df.mda_extract.apply(pos_score) #correct
out_df['mda_negative_score'] = out_df.mda_extract.apply(neg_score) # correct
out_df['mda_polarity_score'] = np.vectorize(polarity_score)(out_df['mda_positive_score'],out_df['mda_negative_score'])
out_df['mda_word_count'] = out_df.mda_extract.apply(word_count)
out_df['mda_average_sentence_length'] = np.vectorize(avg_sent_len)(out_df['mda_word_count'],out_df['mda_extract'])
out_df['mda_complex_word_count']= out_df.mda_extract.apply(complex_word_count)
out_df['mda_percentage_of_complex_words'] = np.vectorize(complex_word_percent)(out_df['mda_complex_word_count'],out_df['mda_word_count'])
out_df['mda_fog_index'] = np.vectorize(fog_index)(out_df['mda_average_sentence_length'],out_df['mda_percentage_of_complex_words'])
out_df['mda_uncertainty_score']=out_df.mda_extract.apply(uncertain_score)
out_df['mda_constraining_score'] = out_df.mda_extract.apply(constrain_score)
out_df['mda_positive_word_proportion'] = np.vectorize(pos_word_prop)(out_df['mda_positive_score'],out_df['mda_word_count'])
out_df['mda_negative_word_proportion'] = np.vectorize(neg_word_prop)(out_df['mda_negative_score'],out_df['mda_word_count'])
out_df['mda_uncertainty_word_proportion'] = np.vectorize(uncertain_word_prop)(out_df['mda_uncertainty_score'],out_df['mda_word_count'])
out_df['mda_constraining_word_proportion'] = np.vectorize(cons_word_prop)(out_df['mda_constraining_score'],out_df['mda_word_count'])

In [26]:
# Working on QQDMR data

out_df['qqdmr_positive_score'] = out_df.qqdmr_extract.apply(pos_score) #correct
out_df['qqdmr_negative_score'] = out_df.qqdmr_extract.apply(neg_score) # correct
out_df['qqdmr_polarity_score'] = np.vectorize(polarity_score)(out_df['qqdmr_positive_score'],out_df['qqdmr_negative_score'])
out_df['qqdmr_word_count'] = out_df.qqdmr_extract.apply(word_count)
out_df['qqdmr_average_sentence_length'] = np.vectorize(avg_sent_len)(out_df['qqdmr_word_count'],out_df['qqdmr_extract'])
out_df['qqdmr_complex_word_count']= out_df.qqdmr_extract.apply(complex_word_count)
out_df['qqdmr_percentage_of_complex_words'] = np.vectorize(complex_word_percent)(out_df['qqdmr_complex_word_count'],out_df['qqdmr_word_count'])
out_df['qqdmr_fog_index'] = np.vectorize(fog_index)(out_df['qqdmr_average_sentence_length'],out_df['qqdmr_percentage_of_complex_words'])
out_df['qqdmr_uncertainty_score']=out_df.qqdmr_extract.apply(uncertain_score)
out_df['qqdmr_constraining_score'] = out_df.qqdmr_extract.apply(constrain_score)
out_df['qqdmr_positive_word_proportion'] = np.vectorize(pos_word_prop)(out_df['qqdmr_positive_score'],out_df['qqdmr_word_count'])
out_df['qqdmr_negative_word_proportion'] = np.vectorize(neg_word_prop)(out_df['qqdmr_negative_score'],out_df['qqdmr_word_count'])
out_df['qqdmr_uncertainty_word_proportion'] = np.vectorize(uncertain_word_prop)(out_df['qqdmr_uncertainty_score'],out_df['qqdmr_word_count'])
out_df['qqdmr_constraining_word_proportion'] = np.vectorize(cons_word_prop)(out_df['qqdmr_constraining_score'],out_df['qqdmr_word_count'])

In [27]:
# Working on Risk Factor data

out_df['rf_positive_score'] = out_df.rf_extract.apply(pos_score) #correct
out_df['rf_negative_score'] = out_df.rf_extract.apply(neg_score) # correct
out_df['rf_polarity_score'] = np.vectorize(polarity_score)(out_df['rf_positive_score'],out_df['rf_negative_score'])
out_df['rf_word_count'] = out_df.rf_extract.apply(word_count)
out_df['rf_average_sentence_length'] = np.vectorize(avg_sent_len)(out_df['rf_word_count'],out_df['rf_extract'])
out_df['rf_complex_word_count']= out_df.rf_extract.apply(complex_word_count)
out_df['rf_percentage_of_complex_words'] = np.vectorize(complex_word_percent)(out_df['rf_complex_word_count'],out_df['rf_word_count'])
out_df['rf_fog_index'] = np.vectorize(fog_index)(out_df['rf_average_sentence_length'],out_df['rf_percentage_of_complex_words'])
out_df['rf_uncertainty_score']=out_df.rf_extract.apply(uncertain_score)
out_df['rf_constraining_score'] = out_df.rf_extract.apply(constrain_score)
out_df['rf_positive_word_proportion'] = np.vectorize(pos_word_prop)(out_df['rf_positive_score'],out_df['rf_word_count'])
out_df['rf_negative_word_proportion'] = np.vectorize(neg_word_prop)(out_df['rf_negative_score'],out_df['rf_word_count'])
out_df['rf_uncertainty_word_proportion'] = np.vectorize(uncertain_word_prop)(out_df['rf_uncertainty_score'],out_df['rf_word_count'])
out_df['rf_constraining_word_proportion'] = np.vectorize(cons_word_prop)(out_df['rf_constraining_score'],out_df['rf_word_count'])

In [28]:
out_df['constraining_words_whole_report'] = np.vectorize(constrain_word_whole)(out_df['mda_constraining_score'],out_df['qqdmr_constraining_score'],out_df['rf_constraining_score'])

In [29]:
out_df.head(5)

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME,mda_extract,qqdmr_extract,rf_extract,mda_positive_score,...,rf_complex_word_count,rf_percentage_of_complex_words,rf_fog_index,rf_uncertainty_score,rf_constraining_score,rf_positive_word_proportion,rf_negative_word_proportion,rf_uncertainty_word_proportion,rf_constraining_word_proportion,constraining_words_whole_report
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt,,,,0,...,0,0,0.0,0,0,0,0,0,0,0
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt,,,,0,...,0,0,0.0,0,0,0,0,0,0,0
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt,,,,0,...,0,0,0.0,0,0,0,0,0,0,0
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt,item 7. management's discussion and analysis o...,,,38,...,0,0,0.0,0,0,0,0,0,0,0
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt,,,,0,...,0,0,0.0,0,0,0,0,0,0,0


In [30]:
out_df.shape

(152, 52)

In [31]:
final_output = out_df.drop(['mda_extract','qqdmr_extract','rf_extract'], axis =1)

In [32]:
final_output.tail(10)

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME,mda_positive_score,mda_negative_score,mda_polarity_score,mda_word_count,...,rf_complex_word_count,rf_percentage_of_complex_words,rf_fog_index,rf_uncertainty_score,rf_constraining_score,rf_positive_word_proportion,rf_negative_word_proportion,rf_uncertainty_word_proportion,rf_constraining_word_proportion,constraining_words_whole_report
142,11860,BETHLEHEM STEEL CORP /DE/,200003,2000-03-09,10-K,edgar/data/11860/0000011860-00-000019.txt,0,0,0.0,0,...,0,0,0.0,0,0,0,0,0,0,0
143,11860,BETHLEHEM STEEL CORP /DE/,200005,2000-05-03,10-Q,edgar/data/11860/0000011860-00-000022.txt,0,0,0.0,0,...,0,0,0.0,0,0,0,0,0,0,0
144,11860,BETHLEHEM STEEL CORP /DE/,200006,2000-06-28,10-K/A,edgar/data/11860/0000011860-00-000025.txt,0,0,0.0,0,...,0,0,0.0,0,0,0,0,0,0,0
145,11860,BETHLEHEM STEEL CORP /DE/,200007,2000-07-26,10-Q,edgar/data/11860/0000011860-00-000028.txt,0,0,0.0,0,...,0,0,0.0,0,0,0,0,0,0,0
146,11860,BETHLEHEM STEEL CORP /DE/,200010,2000-10-25,10-Q,edgar/data/11860/0000011860-00-000038.txt,0,0,0.0,0,...,0,0,0.0,0,0,0,0,0,0,0
147,12239,SPHERIX INC,200704,2007-04-02,10-K,edgar/data/12239/0001104659-07-024804.txt,0,0,0.0,0,...,0,0,0.0,0,0,0,0,0,0,0
148,12239,SPHERIX INC,200705,2007-05-16,NT 10-Q,edgar/data/12239/0001104659-07-040463.txt,0,0,0.0,0,...,0,0,0.0,0,0,0,0,0,0,0
149,12239,SPHERIX INC,200705,2007-05-18,10-Q,edgar/data/12239/0001104659-07-041441.txt,0,0,0.0,0,...,0,0,0.0,0,0,0,0,0,0,0
150,12239,SPHERIX INC,200705,2007-05-23,10-K/A,edgar/data/12239/0001104659-07-042333.txt,0,0,0.0,0,...,555,45,24.0,0,0,0,0,0,0,0
151,12239,SPHERIX INC,200708,2007-08-14,10-Q,edgar/data/12239/0001104659-07-062470.txt,0,0,0.0,0,...,23,38,23.2,0,0,0,0,0,0,0


## Saving Output File

In [33]:
final_output.to_csv('Output.csv')