### Load Packages

In [47]:
import nltk
import numpy as np
import pandas as pd
import pickle
import pprint
import project_helper
import project_tests
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.edge.service import Service as EdgeService
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.common.by import By
from datetime import datetime
from selenium.webdriver import EdgeOptions
from parser_10KQ import get_word_list
from itertools import islice
# options = EdgeOptions()
# options.add_argument("--headless")
# options.add_argument("--window-size=%s" % "3840, 2160")

### Get S&P500 CIKs
Filter the sp500_constituents CSV, keeping only companies whose index membership ended after 2017-01-01 (companies that left the index earlier are dropped).
Use the permnos of the remaining constituents to filter sp500_data and build a dictionary mapping tickers to
CIKs.

In [48]:
# Load the constituent list and keep only rows whose membership ended after
# 2017-01-01. "ending" is read as text, so this is a string comparison against
# an ISO-formatted date.
sp500_constituents = (
    pd.read_csv("sp500_constituents.csv", dtype={"permno": int}, index_col=0)
    .loc[lambda frame: frame["ending"] > "2017-01-01"]
)
sp500_constituents

Unnamed: 0,permno,start,ending
6,10104,1989-08-03,2022-03-31
7,10107,1994-06-07,2022-03-31
11,10138,1999-10-13,2022-03-31
12,10145,1925-12-31,2022-03-31
28,10299,2000-04-03,2017-03-10
...,...,...,...
2008,93096,2012-12-03,2022-03-31
2009,93132,2018-10-11,2022-03-31
2011,93246,2021-03-22,2022-03-31
2013,93429,2017-03-01,2022-03-31


In [55]:
# Build the ticker -> CIK lookup for the constituents kept above.
#
# Only the three columns that are actually used are parsed (usecols), and the
# frame is built with a non-mutating chain instead of calling inplace=True
# methods on a boolean-filtered slice, so re-running the cell is idempotent.
sp500_data = (
    pd.read_csv(
        "sp500_w_addl_id_with_cik.csv",
        usecols=["ticker", "permno", "cik"],
        dtype={"cik": str, "permno": int},  # keep CIKs as text
    )
    .loc[:, ["ticker", "permno", "cik"]]  # fix column order regardless of file order
    .set_index("ticker")
    # Keep only securities whose permno is in the filtered constituent list.
    .loc[lambda frame: frame["permno"].isin(sp500_constituents["permno"])]
    # NOTE(review): drop_duplicates compares the columns (permno, cik) only and
    # ignores the ticker index, so several tickers sharing one permno/cik pair
    # collapse to the first one — confirm this is intended.
    .drop_duplicates()
    .dropna()
)
# Mapping of ticker -> CIK string; if a ticker occurs more than once the last row wins.
cik_lookup = sp500_data["cik"].to_dict()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [60]:
# Preview the first 20 ticker -> CIK entries of the lookup.
dict(islice(cik_lookup.items(), 0, 20))


{'NWL': '0000814453',
 'BBY': '0000764478',
 'AIV': '0000922864',
 'AXP': '0000004962',
 'TIF': '0000098246',
 'BAC': '0000070858',
 'TGT': '0000027419',
 'CVS': '0000064803',
 'WFMI': '0000865436',
 'ECL': '0000031462',
 'PBCT': '0001378946',
 'TAP': '0000024545',
 'FLIR': '0000354908',
 'TDC': '0000816761',
 'DHI': '0000882184',
 'GWW': '0000277135',
 'WAT': '0001000697',
 'CERN': '0000804753',
 'SCG': '0000754737',
 'PEP': '0000077476'}

In [59]:
# Number of tickers in the lookup.
len(cik_lookup)

608

{'NWL': '0000814453', 'BBY': '0000764478', 'AIV': '0000922864'}

In [14]:
def chunks(data, SIZE=100):
    """
    Yield consecutive sub-dictionaries of `data`, each holding at most SIZE items.

    Parameters
    ----------
    data : dict-like
        Mapping to split; it must support len(), iteration over its keys and
        lookup by key.
    SIZE : int
        Maximum number of items per yielded dictionary.

    Yields
    ------
    dict
        The next batch of key/value pairs, in iteration order of `data`.
    """
    key_iter = iter(data)
    # One pass per batch; the shared iterator makes every pass resume where
    # the previous one stopped.
    for _ in range(0, len(data), SIZE):
        batch = {}
        for key in islice(key_iter, SIZE):
            batch[key] = data[key]
        yield batch

In [15]:
# Print the lookup in batches of at most 3 ticker -> CIK pairs.
for item in chunks(cik_lookup, 3):
    print(item)

{'NWL': '0000814453', 'BBY': '0000764478', 'AIV': '0000922864'}


In [16]:
from bs4 import BeautifulSoup
import requests

sec_api = project_helper.SecAPI()
example_ticker = "AMZN"

# Headers sent with every EDGAR request.
headers = {'Host': 'www.sec.gov', 'Connection': 'close',
           'Accept': 'application/json, text/javascript, */*; q=0.01',
           'X-Requested-With': 'XMLHttpRequest',
           'User-Agent': 'ruizhuoj@andrew.cmu.edu'
           }
endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"
base_url_sec = r"https://www.sec.gov"


def _fetch_filing_rows(cik, form_type, count, min_date=None):
    """
    Query the EDGAR company-browse page and list the filings it shows.

    Parameters
    ----------
    cik : str
        Company CIK passed as the 'CIK' query parameter.
    form_type : str
        Value of the 'type' query parameter (e.g. '10-k' or '10-Q').
    count : str
        Value of the 'count' query parameter (number of rows requested).
    min_date : datetime.date, optional
        When given, rows with a filing date earlier than this are dropped.

    Returns
    -------
    list of (str, str, str) or None
        (index page url, filing type, filing date) for every usable row, in
        page order. None when the request failed or the page contained no
        'tableFile2' results table; a message is printed in that case.
    """
    params = {'action': 'getcompany',
              'CIK': cik,
              'type': form_type,
              'dateb': '20220101',
              'owner': 'exclude',
              'start': '',
              'output': '',
              'count': count}
    try:
        # A timeout keeps one stalled connection from hanging the whole loop.
        response = requests.get(url=endpoint, params=params, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print("  {} request for CIK {} failed: {}".format(form_type, cik, exc))
        return None
    if not response.ok:
        print("  {} request for CIK {} returned HTTP {}".format(form_type, cik, response.status_code))
        return None

    table = BeautifulSoup(response.content, 'html.parser').find('table', class_='tableFile2')
    if table is None:
        # Without this guard the old code raised IndexError on `find_all(...)[0]`.
        print("  no {} results table found for CIK {}".format(form_type, cik))
        return None

    filings = []
    for row in table.find_all('tr'):
        cols = row.find_all('td')
        # Rows without enough <td> cells (e.g. a header row) carry no filing.
        if len(cols) < 4:
            continue
        filing_type = cols[0].text.strip()
        filing_date = cols[3].text.strip()
        if min_date is not None and datetime.strptime(filing_date, '%Y-%m-%d').date() < min_date:
            continue
        doc_link = cols[1].find('a', {'href': True, 'id': 'documentsbutton'})
        if doc_link is None:
            # No documents link in this row, so there is nothing to download.
            continue
        filings.append((base_url_sec + doc_link['href'], filing_type, filing_date))
    return filings


# 10-K rows filed before this date are skipped; 10-Q rows are not date-filtered.
earliest_10k_date = datetime.strptime("2017", '%Y').date()

# sec_data[ticker] -> list of (index url, filing type, filing date): 10-K rows first, then 10-Q rows.
sec_data = {}
for ticker, cik in cik_lookup.items():
    rows_10k = _fetch_filing_rows(cik, '10-k', '10', min_date=earliest_10k_date)
    rows_10q = _fetch_filing_rows(cik, '10-Q', '20')
    sec_data[ticker] = (rows_10k or []) + (rows_10q or [])
    # Only report success when both index pages were retrieved and parsed.
    if rows_10k is not None and rows_10q is not None:
        print(ticker, "request successful")

NWL request successful
BBY request successful
AIV request successful


### Download 10-Ks and 10-Qs
As you can see, `sec_data` holds a list of URLs for each ticker. Each URL points to an index page that contains metadata about a filing. Since we don't care about the metadata, we'll download the filing itself by rewriting each index URL into the URL of the full-text filing.

In [17]:
import re
# Patterns matching the opening and closing <DOCUMENT> tags
doc_start_pattern = re.compile(r'<DOCUMENT>')
doc_end_pattern = re.compile(r'</DOCUMENT>')
# Pattern matching a <TYPE> tag followed by one or more characters, stopping at the end of the line
type_pattern = re.compile(r'<TYPE>[^\n]+')

In [18]:

# Download the full-text submission behind every 10-K / 10-Q index page listed
# in sec_data, stored as fillings_by_ticker[ticker][filing date], then pickle
# the whole dictionary to disk.
fillings_by_ticker = {}
# NOTE(review): `browser` is not used anywhere in this cell — confirm whether
# later code needs it.
browser = webdriver.Edge(service=EdgeService(EdgeChromiumDriverManager().install()))
for ticker, data in sec_data.items():
    downloaded = {}
    fillings_by_ticker[ticker] = downloaded
    progress = tqdm(data, desc='Downloading {} Fillings'.format(ticker), unit='filling')
    for index_url, file_type, file_date in progress:
        print(index_url, file_type, file_date)
        if file_type not in ('10-K', '10-Q'):
            # Any other form type (for example an amendment) is not downloaded.
            continue
        # Turn ".../<accession>-index.htm" (or "-index.html") into ".../<accession>.txt".
        submission_url = index_url.replace('-index.htm', '.txt').replace('.txtl', '.txt')
        downloaded[file_date] = sec_api.get(submission_url)
with open('fillings_by_ticker_dict', 'wb') as pickle_file:
    pickle.dump(fillings_by_ticker, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

Downloading NWL Fillings:   4%|▍         | 1/25 [00:00<00:03,  7.35filling/s]

https://www.sec.gov/Archives/edgar/data/814453/000081445321000050/0000814453-21-000050-index.htm 10-K 2021-02-19
https://www.sec.gov/Archives/edgar/data/814453/000081445320000078/0000814453-20-000078-index.htm 10-K 2020-03-02


Downloading NWL Fillings:  12%|█▏        | 3/25 [00:00<00:02,  7.50filling/s]

https://www.sec.gov/Archives/edgar/data/814453/000119312519061714/0001193125-19-061714-index.htm 10-K 2019-03-04
https://www.sec.gov/Archives/edgar/data/814453/000119312518067603/0001193125-18-067603-index.htm 10-K 2018-03-01


Downloading NWL Fillings:  20%|██        | 5/25 [00:00<00:02,  7.86filling/s]

https://www.sec.gov/Archives/edgar/data/814453/000081445317000027/0000814453-17-000027-index.htm 10-K 2017-03-01
https://www.sec.gov/Archives/edgar/data/814453/000081445321000151/0000814453-21-000151-index.htm 10-Q 2021-10-29


Downloading NWL Fillings:  28%|██▊       | 7/25 [00:01<00:03,  5.01filling/s]

https://www.sec.gov/Archives/edgar/data/814453/000081445321000120/0000814453-21-000120-index.htm 10-Q 2021-07-30
https://www.sec.gov/Archives/edgar/data/814453/000081445321000072/0000814453-21-000072-index.htm 10-Q 2021-04-30
https://www.sec.gov/Archives/edgar/data/814453/000081445320000230/0000814453-20-000230-index.htm 10-Q 2020-10-30


Downloading NWL Fillings:  40%|████      | 10/25 [00:01<00:02,  6.97filling/s]

https://www.sec.gov/Archives/edgar/data/814453/000081445320000215/0000814453-20-000215-index.htm 10-Q 2020-08-05
https://www.sec.gov/Archives/edgar/data/814453/000081445320000114/0000814453-20-000114-index.htm 10-Q 2020-05-01


Downloading NWL Fillings:  48%|████▊     | 12/25 [00:02<00:02,  4.40filling/s]

https://www.sec.gov/Archives/edgar/data/814453/000081445319000111/0000814453-19-000111-index.htm 10-Q 2019-11-04
https://www.sec.gov/Archives/edgar/data/814453/000162828019009873/0001628280-19-009873-index.htm 10-Q 2019-08-02


Downloading NWL Fillings:  56%|█████▌    | 14/25 [00:02<00:01,  5.78filling/s]

https://www.sec.gov/Archives/edgar/data/814453/000119312519141536/0001193125-19-141536-index.htm 10-Q 2019-05-08
https://www.sec.gov/Archives/edgar/data/814453/000119312518323823/0001193125-18-323823-index.htm 10-Q 2018-11-09
https://www.sec.gov/Archives/edgar/data/814453/000119312518244528/0001193125-18-244528-index.htm 10-Q 2018-08-09


Downloading NWL Fillings:  72%|███████▏  | 18/25 [00:03<00:01,  5.77filling/s]

https://www.sec.gov/Archives/edgar/data/814453/000119312518159187/0001193125-18-159187-index.htm 10-Q 2018-05-10
https://www.sec.gov/Archives/edgar/data/814453/000119312517337345/0001193125-17-337345-index.htm 10-Q 2017-11-08
https://www.sec.gov/Archives/edgar/data/814453/000119312517252867/0001193125-17-252867-index.htm 10-Q 2017-08-09


Downloading NWL Fillings:  80%|████████  | 20/25 [00:03<00:00,  7.04filling/s]

https://www.sec.gov/Archives/edgar/data/814453/000119312517166044/0001193125-17-166044-index.htm 10-Q 2017-05-10
https://www.sec.gov/Archives/edgar/data/814453/000081445316000273/0000814453-16-000273-index.htm 10-Q 2016-11-08


Downloading NWL Fillings:  88%|████████▊ | 22/25 [00:04<00:00,  4.55filling/s]

https://www.sec.gov/Archives/edgar/data/814453/000081445316000258/0000814453-16-000258-index.htm 10-Q 2016-08-09
https://www.sec.gov/Archives/edgar/data/814453/000081445316000202/0000814453-16-000202-index.htm 10-Q 2016-05-09


Downloading NWL Fillings: 100%|██████████| 25/25 [00:04<00:00,  5.48filling/s]

https://www.sec.gov/Archives/edgar/data/814453/000081445315000131/0000814453-15-000131-index.htm 10-Q 2015-11-09
https://www.sec.gov/Archives/edgar/data/814453/000081445315000108/0000814453-15-000108-index.htm 10-Q 2015-08-07



Downloading BBY Fillings:   0%|          | 0/25 [00:00<?, ?filling/s]

https://www.sec.gov/Archives/edgar/data/764478/000076447821000024/0000764478-21-000024-index.htm 10-K 2021-03-19


Downloading BBY Fillings:   8%|▊         | 2/25 [00:00<00:08,  2.80filling/s]

https://www.sec.gov/Archives/edgar/data/764478/000076447820000017/0000764478-20-000017-index.htm 10-K 2020-03-23
https://www.sec.gov/Archives/edgar/data/764478/000076447819000009/0000764478-19-000009-index.htm 10-K 2019-03-28


Downloading BBY Fillings:  16%|█▌        | 4/25 [00:01<00:04,  5.17filling/s]

https://www.sec.gov/Archives/edgar/data/764478/000076447818000013/0000764478-18-000013-index.htm 10-K 2018-04-02
https://www.sec.gov/Archives/edgar/data/764478/000076447817000008/0000764478-17-000008-index.htm 10-K 2017-03-24


Downloading BBY Fillings:  20%|██        | 5/25 [00:01<00:03,  5.93filling/s]

https://www.sec.gov/Archives/edgar/data/764478/000076447821000068/0000764478-21-000068-index.htm 10-Q 2021-12-03


Downloading BBY Fillings:  28%|██▊       | 7/25 [00:01<00:03,  4.90filling/s]

https://www.sec.gov/Archives/edgar/data/764478/000076447821000060/0000764478-21-000060-index.htm 10-Q 2021-08-31
https://www.sec.gov/Archives/edgar/data/764478/000076447821000039/0000764478-21-000039-index.htm 10-Q 2021-06-04
https://www.sec.gov/Archives/edgar/data/764478/000076447820000062/0000764478-20-000062-index.htm 10-Q 2020-11-30


Downloading BBY Fillings:  40%|████      | 10/25 [00:01<00:02,  7.13filling/s]

https://www.sec.gov/Archives/edgar/data/764478/000076447820000054/0000764478-20-000054-index.htm 10-Q 2020-08-31
https://www.sec.gov/Archives/edgar/data/764478/000076447820000029/0000764478-20-000029-index.htm 10-Q 2020-05-27


Downloading BBY Fillings:  48%|████▊     | 12/25 [00:02<00:02,  4.36filling/s]

https://www.sec.gov/Archives/edgar/data/764478/000076447819000057/0000764478-19-000057-index.htm 10-Q 2019-12-06
https://www.sec.gov/Archives/edgar/data/764478/000076447819000042/0000764478-19-000042-index.htm 10-Q 2019-09-06


Downloading BBY Fillings:  60%|██████    | 15/25 [00:03<00:01,  6.81filling/s]

https://www.sec.gov/Archives/edgar/data/764478/000076447819000028/0000764478-19-000028-index.htm 10-Q 2019-06-07
https://www.sec.gov/Archives/edgar/data/764478/000076447818000053/0000764478-18-000053-index.htm 10-Q 2018-12-07
https://www.sec.gov/Archives/edgar/data/764478/000076447818000043/0000764478-18-000043-index.htm 10-Q 2018-09-10


Downloading BBY Fillings:  72%|███████▏  | 18/25 [00:03<00:01,  5.35filling/s]

https://www.sec.gov/Archives/edgar/data/764478/000076447818000024/0000764478-18-000024-index.htm 10-Q 2018-06-08
https://www.sec.gov/Archives/edgar/data/764478/000076447817000039/0000764478-17-000039-index.htm 10-Q 2017-12-01


Downloading BBY Fillings:  80%|████████  | 20/25 [00:03<00:00,  6.79filling/s]

https://www.sec.gov/Archives/edgar/data/764478/000076447817000032/0000764478-17-000032-index.htm 10-Q 2017-09-05
https://www.sec.gov/Archives/edgar/data/764478/000076447817000018/0000764478-17-000018-index.htm 10-Q 2017-06-05
https://www.sec.gov/Archives/edgar/data/764478/000076447816000093/0000764478-16-000093-index.htm 10-Q 2016-12-02


Downloading BBY Fillings:  88%|████████▊ | 22/25 [00:04<00:00,  4.37filling/s]

https://www.sec.gov/Archives/edgar/data/764478/000076447816000088/0000764478-16-000088-index.htm 10-Q 2016-09-02
https://www.sec.gov/Archives/edgar/data/764478/000076447816000075/0000764478-16-000075-index.htm 10-Q 2016-06-09


Downloading BBY Fillings: 100%|██████████| 25/25 [00:05<00:00,  6.48filling/s]

https://www.sec.gov/Archives/edgar/data/764478/000076447815000051/0000764478-15-000051-index.htm 10-Q 2015-12-04
https://www.sec.gov/Archives/edgar/data/764478/000076447815000042/0000764478-15-000042-index.htm 10-Q 2015-09-04


Downloading BBY Fillings: 100%|██████████| 25/25 [00:05<00:00,  4.96filling/s]
Downloading AIV Fillings:   0%|          | 0/26 [00:00<?, ?filling/s]

https://www.sec.gov/Archives/edgar/data/922864/000156459021022517/0001564590-21-022517-index.htm 10-K/A 2021-04-30
https://www.sec.gov/Archives/edgar/data/922864/000156459021012671/0001564590-21-012671-index.htm 10-K 2021-03-12


Downloading AIV Fillings:  12%|█▏        | 3/26 [00:00<00:05,  4.22filling/s]

https://www.sec.gov/Archives/edgar/data/922864/000156459020006053/0001564590-20-006053-index.htm 10-K 2020-02-24
https://www.sec.gov/Archives/edgar/data/922864/000092286419000007/0000922864-19-000007-index.htm 10-K 2019-02-20


Downloading AIV Fillings:  19%|█▉        | 5/26 [00:00<00:03,  5.83filling/s]

https://www.sec.gov/Archives/edgar/data/922864/000092286418000006/0000922864-18-000006-index.htm 10-K 2018-03-01
https://www.sec.gov/Archives/edgar/data/922864/000092286417000006/0000922864-17-000006-index.htm 10-K 2017-02-24


Downloading AIV Fillings:  23%|██▎       | 6/26 [00:01<00:03,  6.46filling/s]

https://www.sec.gov/Archives/edgar/data/922864/000156459021055777/0001564590-21-055777-index.htm 10-Q 2021-11-09


Downloading AIV Fillings:  31%|███       | 8/26 [00:01<00:03,  4.81filling/s]

https://www.sec.gov/Archives/edgar/data/922864/000156459021044377/0001564590-21-044377-index.htm 10-Q 2021-08-16
https://www.sec.gov/Archives/edgar/data/922864/000156459021028429/0001564590-21-028429-index.htm 10-Q 2021-05-17
https://www.sec.gov/Archives/edgar/data/922864/000156459020049394/0001564590-20-049394-index.htm 10-Q 2020-11-02


Downloading AIV Fillings:  42%|████▏     | 11/26 [00:01<00:02,  7.06filling/s]

https://www.sec.gov/Archives/edgar/data/922864/000156459020036228/0001564590-20-036228-index.htm 10-Q 2020-08-04
https://www.sec.gov/Archives/edgar/data/922864/000156459020023679/0001564590-20-023679-index.htm 10-Q 2020-05-11


Downloading AIV Fillings:  54%|█████▍    | 14/26 [00:02<00:02,  5.34filling/s]

https://www.sec.gov/Archives/edgar/data/922864/000156459019039821/0001564590-19-039821-index.htm 10-Q 2019-11-04
https://www.sec.gov/Archives/edgar/data/922864/000156459019031035/0001564590-19-031035-index.htm 10-Q 2019-08-09


Downloading AIV Fillings:  58%|█████▊    | 15/26 [00:02<00:01,  5.97filling/s]

https://www.sec.gov/Archives/edgar/data/922864/000092286419000024/0000922864-19-000024-index.htm 10-Q 2019-05-06
https://www.sec.gov/Archives/edgar/data/922864/000092286418000047/0000922864-18-000047-index.htm 10-Q 2018-11-05
https://www.sec.gov/Archives/edgar/data/922864/000092286418000037/0000922864-18-000037-index.htm 10-Q 2018-08-07


Downloading AIV Fillings:  73%|███████▎  | 19/26 [00:03<00:01,  5.45filling/s]

https://www.sec.gov/Archives/edgar/data/922864/000092286418000021/0000922864-18-000021-index.htm 10-Q 2018-05-08
https://www.sec.gov/Archives/edgar/data/922864/000092286417000043/0000922864-17-000043-index.htm 10-Q 2017-11-01


Downloading AIV Fillings:  77%|███████▋  | 20/26 [00:03<00:01,  5.99filling/s]

https://www.sec.gov/Archives/edgar/data/922864/000092286417000035/0000922864-17-000035-index.htm 10-Q 2017-08-02
https://www.sec.gov/Archives/edgar/data/922864/000092286417000016/0000922864-17-000016-index.htm 10-Q 2017-05-05


Downloading AIV Fillings:  81%|████████  | 21/26 [00:04<00:00,  6.57filling/s]

https://www.sec.gov/Archives/edgar/data/922864/000092286416000124/0000922864-16-000124-index.htm 10-Q 2016-10-28


Downloading AIV Fillings:  88%|████████▊ | 23/26 [00:04<00:00,  4.17filling/s]

https://www.sec.gov/Archives/edgar/data/922864/000092286416000112/0000922864-16-000112-index.htm 10-Q 2016-07-29
https://www.sec.gov/Archives/edgar/data/922864/000092286416000089/0000922864-16-000089-index.htm 10-Q 2016-04-29


Downloading AIV Fillings:  96%|█████████▌| 25/26 [00:05<00:00,  5.52filling/s]

https://www.sec.gov/Archives/edgar/data/922864/000092286415000054/0000922864-15-000054-index.htm 10-Q 2015-11-05
https://www.sec.gov/Archives/edgar/data/922864/000092286415000033/0000922864-15-000033-index.htm 10-Q 2015-07-31


Downloading AIV Fillings: 100%|██████████| 26/26 [00:05<00:00,  5.03filling/s]


In [19]:
# Reload the pickled downloads from disk.
# pickle.load can execute arbitrary code, so only load a file this notebook wrote itself.
with open('fillings_by_ticker_dict', 'rb') as handle:
    fillings_by_ticker = pickle.load(handle)

In [46]:
# Show the first 2000 characters of one downloaded filing as a sanity check.
print(fillings_by_ticker["AIV"]["2016-10-28"][:2000])

<SEC-DOCUMENT>0000922864-16-000124.txt : 20161028
<SEC-HEADER>0000922864-16-000124.hdr.sgml : 20161028
<ACCEPTANCE-DATETIME>20161028152849
ACCESSION NUMBER:		0000922864-16-000124
CONFORMED SUBMISSION TYPE:	10-Q
PUBLIC DOCUMENT COUNT:		56
CONFORMED PERIOD OF REPORT:	20160930
FILED AS OF DATE:		20161028
DATE AS OF CHANGE:		20161028

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			APARTMENT INVESTMENT & MANAGEMENT CO
		CENTRAL INDEX KEY:			0000922864
		STANDARD INDUSTRIAL CLASSIFICATION:	REAL ESTATE INVESTMENT TRUSTS [6798]
		IRS NUMBER:				841259577
		STATE OF INCORPORATION:			MD
		FISCAL YEAR END:			1231

	FILING VALUES:
		FORM TYPE:		10-Q
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-13232
		FILM NUMBER:		161958364

	BUSINESS ADDRESS:	
		STREET 1:		4582 SOUTH ULSTER STREET
		STREET 2:		SUITE 1100
		CITY:			DENVER
		STATE:			CO
		ZIP:			80237
		BUSINESS PHONE:		3037578101

	MAIL ADDRESS:	
		STREET 1:		4582 SOUTH ULSTER STREET
		STREET 2:		SUITE 1100
		CITY:			DENVER
		STATE:			CO
		ZI

In [20]:
# ten_ks_by_ticker = {}
#
# for ticker, filling_documents in filling_documents_by_ticker.items():
#     ten_ks_by_ticker[ticker] = []
#     for file_date, documents in filling_documents.items():
#         for document in documents:
#             if get_document_type(document) == '10-k':
#                 ten_ks_by_ticker[ticker].append({
#                     'cik': cik_lookup[ticker],
#                     'file': document,
#                     'file_date': file_date})
#
#
# project_helper.print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['cik', 'file', 'file_date'])
#

In [21]:
# raw_10k = raw_fillings_by_ticker["AMZN"]["2019-02-01"]
# import re
# # Regex to find <DOCUMENT> tags
# doc_start_pattern = re.compile(r'<DOCUMENT>')
# doc_end_pattern = re.compile(r'</DOCUMENT>')
# # Regex to find a <TYPE> tag followed by any characters, terminating at the end of the line
# type_pattern = re.compile(r'<TYPE>[^\n]+')
# # Create 3 lists with the span indices for each regex
#
# ### There are many <Document> Tags in this text file, each as specific exhibit like 10-K, EX-10.17 etc
# ### First filter will give us document tag start <end> and document tag end's <start>
# ### We will use this to later grab content in between these tags
# doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
# doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]
#
# ### The type pattern matches <TYPE> followed by one or more characters that are not a newline ([^\n]+),
# ### i.e. everything up to the end of that line. Each match is <TYPE> followed by a section name like '10-K'.
# ### findall returns these matches as a list of strings; the line below strips the leading <TYPE> to keep
# ### only the section names, e.g. '10-K'.
# doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]
# document = {}
#
# # Create a loop to go through each section type and save only the 10-K section in the dictionary
# for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
#     if doc_type == '10-K':
#         document[doc_type] = raw_10k[doc_start:doc_end]
#

In [22]:
# # display excerpt the document
# document['10-K'][0:500]
#

In [23]:
# # Write the regex
# regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')
#

In [24]:
# matches = regex.finditer(document['10-K'])
#
# # Create the dataframe
# test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])
#
# test_df.columns = ['item', 'start', 'end']
# test_df['item'] = test_df.item.str.lower()
#
# # Display the dataframe
# test_df.head()
#

In [25]:
# # Get rid of unnecessary characters from the dataframe
# test_df.replace('&#160;',' ',regex=True,inplace=True)
# test_df.replace('&nbsp;',' ',regex=True,inplace=True)
# test_df.replace(' ','',regex=True,inplace=True)
# test_df.replace('\.','',regex=True,inplace=True)
# test_df.replace('>','',regex=True,inplace=True)
#

In [26]:
# pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')
# pos_dat.set_index('item', inplace=True)
#

In [27]:
# pos_dat
#

In [28]:
# document['10-K']
# #Get Item 1a
# item_1a_raw = document['10-K'][pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item1b']]
# #item_1b_raw = document['10-K'][pos_dat['start'].loc['item1b']:pos_dat['start'].loc['item7a']]
# item_7_raw = document['10-K'][pos_dat['start'].loc['item7']:pos_dat['start'].loc['item7a']]
# item_7a_raw = document['10-K'][pos_dat['start'].loc['item7a']:pos_dat['start'].loc['item8']]
# #item_8_raw = document['10-K'][pos_dat['start'].loc['item8']:]
#

In [29]:
# "a"+" "+"b"
#

In [30]:
# item_content = BeautifulSoup(item_1a_raw + item_7_raw + item_7a_raw, 'lxml')
# print(item_content.get_text("\n\n"))
#

In [31]:
# len(item_content.get_text("\n\n"))
#

### Get Documents
With these filings downloaded, we want to break them into their associated documents. These documents are sectioned off in the filings with the tags `<DOCUMENT>` for the start of each document and `</DOCUMENT>` for the end of each document. There's no overlap between these documents, so each `</DOCUMENT>` tag should come after its `<DOCUMENT>` tag with no other `<DOCUMENT>` tag in between.

Implement `get_documents` to return a list of these documents from a filing. Make sure not to include the tags in the returned document text.

In [32]:
# import re
#
#
# def get_documents(text):
#     """
#     Extract the documents from the text
#
#     Parameters
#     ----------
#     text : str
#         The text with the document strings inside
#
#     Returns
#     -------
#     extracted_docs : list of str
#         The document strings found in `text`
#     """
#
#     # TODO: Implement
#     extracted_docs = []
#
#     doc_start_pattern = re.compile(r'<DOCUMENT>')
#     doc_end_pattern = re.compile(r'</DOCUMENT>')
#
#     doc_start_is = [x.end() for x in doc_start_pattern.finditer(text)]
#     doc_end_is = [x.start() for x in doc_end_pattern.finditer(text)]
#
#     for doc_start_i, doc_end_i in zip(doc_start_is, doc_end_is):
#             extracted_docs.append(text[doc_start_i:doc_end_i])
#
#     return extracted_docs
#
#
# project_tests.test_get_documents(get_documents)
#

With the `get_documents` function implemented, let's extract all the documents.

In [33]:
# filling_documents_by_ticker = {}
#
# for ticker, raw_fillings in raw_fillings_by_ticker.items():
#     filling_documents_by_ticker[ticker] = {}
#     for file_date, filling in tqdm(raw_fillings.items(), desc='Getting Documents from {} Fillings'.format(ticker), unit='filling'):
#         filling_documents_by_ticker[ticker][file_date] = get_documents(filling)
#
#
# print('\n\n'.join([
#     'Document {} Filed on {}:\n{}...'.format(doc_i, file_date, doc[:200])
#     for file_date, docs in filling_documents_by_ticker[example_ticker].items()
#     for doc_i, doc in enumerate(docs)][:3]))
#

### Get Document Types
Now that we have all the documents, we want to find the 10-k form in this 10-k filing. Implement the `get_document_type` function to return the type of document given. The document type is located on a line with the `<TYPE>` tag. For example, a form of type "TEST" would have the line `<TYPE>TEST`. Make sure to return the type as lowercase, so this example would be returned as "test".

In [34]:
# def get_document_type(doc):
#     type_pattern = re.compile(r'<TYPE>[^\n]+')
#
#     doc_type = type_pattern.findall(doc)[0][len('<TYPE>'):]
#
#     return doc_type.lower()
#
#
# project_tests.test_get_document_type(get_document_type)
#

With the `get_document_type` function, we'll filter out all non 10-k documents.

In [35]:
# ten_ks_by_ticker = {}
#
# for ticker, filling_documents in filling_documents_by_ticker.items():
#     ten_ks_by_ticker[ticker] = []
#     for file_date, documents in filling_documents.items():
#         for document in documents:
#             if get_document_type(document) == '10-k':
#                 ten_ks_by_ticker[ticker].append({
#                     'cik': cik_lookup[ticker],
#                     'file': document,
#                     'file_date': file_date})
#
#
# project_helper.print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['cik', 'file', 'file_date'])
#

In [36]:
# def remove_html_tags(text):
#     text = BeautifulSoup(text, 'html.parser').get_text()
#
#     return text
#
#
# def clean_text(text):
#     text = text.lower()
#     text = remove_html_tags(text)
#
#     return text
#

Using the `clean_text` function, we'll clean up all the documents.

In [37]:
# for ticker, ten_ks in ten_ks_by_ticker.items():
#     for ten_k in tqdm(ten_ks, desc='Cleaning {} 10-Ks'.format(ticker), unit='10-K'):
#         ten_k['file_clean'] = clean_text(ten_k['file'])
#
#
# project_helper.print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['file_clean'])
#

### Lemmatize
With the text cleaned up, it's time to distill the verbs down. Implement the `lemmatize_words` function to lemmatize verbs in the list of words provided.

In [38]:
# from nltk.stem import WordNetLemmatizer
# from nltk.corpus import wordnet
#
#
# def lemmatize_words(words):
#     """
#     Lemmatize words
#
#     Parameters
#     ----------
#     words : list of str
#         List of words
#
#     Returns
#     -------
#     lemmatized_words : list of str
#         List of lemmatized words
#     """
#
#     # TODO: Implement
#     lemmatized_words = [WordNetLemmatizer().lemmatize(word, 'v') for word in words]
#
#     return lemmatized_words
#
#
# project_tests.test_lemmatize_words(lemmatize_words)
#

With the `lemmatize_words` function implemented, let's lemmatize all the data.

In [39]:
# ten_ks[0]["file_clean"] = item_content.get_text("\n\n")
#

In [40]:
# word_pattern = re.compile('\w+')
#
# for ticker, ten_ks in ten_ks_by_ticker.items():
#     for ten_k in tqdm(ten_ks, desc='Lemmatize {} 10-Ks'.format(ticker), unit='10-K'):
#         ten_k['file_lemma'] = lemmatize_words(word_pattern.findall(ten_k['file_clean']))
#
#
# project_helper.print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['file_lemma'])
#

### Remove Stopwords

In [41]:
# from nltk.corpus import stopwords
#
#
# lemma_english_stopwords = lemmatize_words(stopwords.words('english'))
#
# for ticker, ten_ks in ten_ks_by_ticker.items():
#     for ten_k in tqdm(ten_ks, desc='Remove Stop Words for {} 10-Ks'.format(ticker), unit='10-K'):
#         ten_k['file_lemma'] = [word for word in ten_k['file_lemma'] if word not in lemma_english_stopwords]
#
#
# print('Stop Words Removed')
# if ("7A" in ten_ks[0]['file_lemma']):
#     print("Element Exists")
#

In [42]:
# ten_ks[0]['file_lemma']
#

In [43]:
# len(ten_ks[0]['file_lemma'])