In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import stats
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import re
from bs4 import BeautifulSoup as BS
import urllib.request
import os
from os.path import join, getsize
import nltk
from nltk import word_tokenize, sent_tokenize, tokenize
import estnltk

p = pd.read_csv('../data/share_repurchase_paragraphs.csv')
f = pd.read_csv('../data/nc_validation_filings.csv')

# _Identify share repurchase activity in 10-K/10-Q docs_

> ##### For clarity, there won’t necessarily be separate paragraphs for each of those 7 data points we’ve identified (e.g., Share Repurchase Authorization Date, Share Repurchase Authorization, Share Repurchase Intention, etc.) - you could find each of those 7 data points all in a single paragraph depending on the disclosure in the filing. So if it made sense to, you could look at the problem in two stages - first, identifying those paragraphs generally that are about share repurchases, and second, classifying the data inside the paragraph into those 7 categories.

> ##### For the deliverable, please prepare a csv file with the data points that are included in share_repurchase_paragraphs.csv

> ##### For errors, we’d prefer to err on the side of a false positive. I think those will also make for easier feedback than negative cases.

> ##### Other Relevant Files
> #####  1.  nc_training_filings.zip, which contains the HTML filings for each of the items in the training set CSV we sent over (which I believe is a set of 500). That CSV includes accession_number, and all the HTML filings in that archive are named <accession_number>.html .
> #####  2.  nc_validation_filings.zip, which contains the HTML filings for an extra 100 filings that aren’t in the training set. These are in case your students want to be able to try their models out on some unmarked data - we’ll provide feedback on those if they do.
> #####  3.  nc_validation_filings.csv, which contains the ticker and accession number for the filings in #2.

In [2]:
# I've been tasked with the data engineering process to generate the data to be used in our model. 
p.head(10)

Unnamed: 0,ticker,accession_number,data_key_friendly_name,text,data_value,reported_data_value,reported_units,paragraph_text
0,A,0001090872-17-000018,Share Repurchase Authorization Date,"May 28, 2015",20150528,20200000.0,ones,"On May 28, 2015 we announced that our board ..."
1,A,0001090872-17-000018,Share Repurchase Authorization,The 2015 share repurchase program authorizes t...,1140000000,1.14,billions,"On May 28, 2015 we announced that our board ..."
2,A,0001090872-17-000018,Share Repurchase Intention,remaining authorization to repurchase up to,610000000,610.0,millions,"Table of Contents 2016, upon the completion ..."
3,A,0001090872-17-000018,Share Repurchase Count,repurchased,4100000,4.1,millions,"Table of Contents 2016, upon the completion ..."
4,A,0001090872-17-000018,Amount Spent on Share Repurchases,repurchased,194000000,194.0,millions,"Table of Contents 2016, upon the completion ..."
5,AAL,0000006201-18-000009,Share Repurchase Authorization Date,July 2014,20140715,20100000.0,ones,4. Share Repurchase Programs and Dividends S...
6,AAL,0000006201-18-000009,Share Repurchase Authorization,share repurchase programs aggregating,11000000000,11.0,billions,4. Share Repurchase Programs and Dividends S...
7,AAL,0000006201-18-000009,Share Repurchase Intention,remained unused under a repurchase program,450000000,450.0,millions,4. Share Repurchase Programs and Dividends S...
8,AAL,0000006201-18-000009,Share Repurchase Count,repurchased,33900000,33.9,millions,"During the year ended December 31, 2017, we ..."
9,AAL,0000006201-18-000009,Amount Spent on Share Repurchases,repurchased,1600000000,1.6,billions,"During the year ended December 31, 2017, we ..."


In [3]:
p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1781 entries, 0 to 1780
Data columns (total 8 columns):
ticker                    1781 non-null object
accession_number          1781 non-null object
data_key_friendly_name    1781 non-null object
text                      1781 non-null object
data_value                1781 non-null int64
reported_data_value       1781 non-null float64
reported_units            1781 non-null object
paragraph_text            1781 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 111.4+ KB


In [4]:
p.paragraph_text[0]

'On May 28, 2015 we  announced that our board  of directors had approved a  new share repurchase program  (the   "2015 repurchase program"). The 2015 share repurchase program authorizes the purchase of up to $1.14  billion   of our common stock at the company\'s discretion  through and including November 1, 2018. The 2015  repurchase   program does  not require  the company  to  acquire a  specific number  of  shares and  may be  suspended  or   discontinued at any time. During the year ended October 31,                                                        95'

## _Parcing Item 8 up to the exhibts from a sample filing_

In [5]:
# Starting with just one html file in order to manually parse all the elements before item 8 out of file. 
# It does not look like these html files will be parceable with beautifulsoup. 
filename = '0000002969-17-000039.html'
request = urllib.request.Request('file:///C:/Users/carmijh0/Desktop/Data_Science/Q8/data/nc_training_filings/'+filename)
result = urllib.request.urlopen(request)
resulttext = result.read()

In [6]:
# Results in all the text from the html file
soup_test = BS(resulttext, 'html.parser')
soup_test.prettify();

In [7]:
print(type(soup_test))

<class 'bs4.BeautifulSoup'>


In [8]:
# Trying soup.get_text() to make a massive string and use regex to separate item 8
soup_text = soup_test.get_text()

In [9]:
# ITEM 8 begins here
soup_text.find('FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA')

281149

In [10]:
# Stopping at the exhibts 
soup_text.find('EXHIBITS AND FINANCIAL STATEMENT SCHEDULES')

528264

In [11]:
soup_text_parsed = soup_text[281149:528264]

In [12]:
print(type(soup_text_parsed))

<class 'str'>


In [13]:
len(soup_text_parsed)

247115

## _Identifying share repurchase paragraphs via texttiling tokenization_

In [14]:
# Word tokenization
test_tok_word = word_tokenize(soup_text_parsed)

In [15]:
# 4 instances of repurchase in this filling.
test_tok_word.count('repurchase')

4

In [16]:
test_tok_word.count('ASR')

0

In [17]:
keyword_list = ['repurchase', 'ASR']
if any(word in test_tok_word for word in keyword_list):
    print('found one of em')

found one of em


In [18]:
# Attempting to use the Estonian Natural Language Processing... Seems to be some sort of install issue. Can't recognize "Tokenizer()."
# https://github.com/estnltk/estnltk
# tokenizer = estnltk.Tokenizer()
# test_tok_para = tokenizer.tokenize(soup_text_parsed) 

In [19]:
# Can I tokenize the sample str into paragraphs? 
ttt = nltk.tokenize.TextTilingTokenizer()
para_list = ttt.tokenize(soup_text_parsed)

In [20]:
print(type(para_list))

<class 'list'>


In [21]:
# Identifying specific paragraph where the repurchase is mentioned
counter = 0
for p in para_list:
    p_token = word_tokenize(p)
    if "repurchase" in p_token:
        counter += 1
        print(counter)

1


In [22]:
# Remove paragraphs from the list that don't contain repurchase
approved = ['repurchase','ASR']
refined_list = [para for para in para_list if any(instance in para for instance in approved)]

In [23]:
# I'll need to remove '/n's' and any other html elements from the final list of paragraphs.
refined_list

['\n\n\n18. CAPITAL STOCK\nCommon Stock\nAuthorized common stock consists of 300\nmillion shares with a par value of $1 per share. As of 30 September 2017, 249 million shares were issued, with 218 million\noutstanding.\nOn 15 September 2011, the Board of Directors authorized the repurchase of up\nto $1,000 of our outstanding common stock. We repurchase\nshares pursuant to Rules 10b5-1 and 10b-18 under the Securities Exchange Act of 1934, as amended, through\nrepurchase agreements established with several brokers. We did not purchase any of our outstanding shares\nduring fiscal year 2017. At 30 September 2017,\n$485.3 in share repurchase authorization remains.\nThe following table reflects the changes in common shares:']

## _Creating a pipeline to isolate all paragraph instances of repurchase to create a test dataset for our model_

In [24]:
def find_repurchase_paragraphs(paragraph_list):
    approved = ['repurchase','ASR']
    relevant_list = [para for para in paragraph_list if any(instance in para for instance in approved)]
    return relevant_list

In [25]:
def split_into_paragraphs(file):
    t = nltk.tokenize.TextTilingTokenizer()
    p_list = t.tokenize(file)
    return p_list

In [54]:
def get_soup(path):
    request = urllib.request.Request(path)
    result = urllib.request.urlopen(request)
    resulttext = result.read()
    soup = BS(resulttext, 'html.parser')
    soup = soup.get_text()
    return soup

In [72]:
# path = 'file:///C:/Users/carmijh0/Desktop/Data_Science/Q8/data_1/nc_training_filings_test/0000002969-17-000039.html'

# testing = get_soup(path)
# print(testing)

In [77]:
para_list_test = []
path = 'C:/Users/carmijh0/Desktop/Data_Science/Q8/data_1'
prefix = 'file:///'

for root, dirs, files in os.walk(path, topdown=True):
    for f in files:
#         print(prefix + os.path.join(root, f))
        s = get_soup(prefix + os.path.join(root, f))
        print(s)
        

#         soup_tokenized = split_into_paragraphs(soup)
#         p_list = find_repurchase_paragraph(soup_tokenized)
#         para_list_test= para_list_test.append(p_list)







Document
Exhibit
Exhibit
Document
Document
Document
Exhibit
Exhibit
Exhibit
Exhibit
Exhibit
Exhibit
Exhibit

/*<![CDATA[*/
 body.c653 {font-family:Times New Roman;font-size:10pt;}
 table.c652 {padding-bottom:16px;font-family:Times New Roman; font-size:10pt;}
 div.c651 {font-family: inherit; font-size: 10pt; line-height: 120%; padding-left: 48px}
 td.c650 {width:108px;}
 div.c649 {line-height:120%;padding-top:12px;text-indent:32px;font-size:10pt;}
 div.c648 {font-family: inherit; font-size: 10pt; line-height: 120%; padding-top: 12px; text-align: center}
 table.c647 {font-family:Times New Roman;font-size:10pt;width:44.3359375%;border-collapse:collapse;text-align:left;margin-left:auto;margin-right:0;}
 div.c646 {line-height:120%;padding-top:12px;text-indent:106px;font-size:10pt;}
 div.c645 {line-height:120%;padding-left:24px;font-size:10pt;}
 div.c644 {line-height:120%;padding-left:24px;text-indent:0px;}
 span.c643 {font-family:inherit;font-size:10pt;padding-right:24px;}
 table.c642

NameError: name 's' is not defined