### Load Packages

In [34]:
import nltk
import numpy as np
import pandas as pd
import pickle
import pprint
import project_helper
import project_tests
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.edge.service import Service as EdgeService
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.common.by import By
from datetime import datetime
from selenium.webdriver import EdgeOptions
from parser_10KQ import get_word_list
from itertools import islice
# Edge launch options: headless mode with an explicit window size.
# NOTE(review): in the cells visible in this notebook, `options` is never passed to
# `webdriver.Edge(...)`, so these flags do not appear to take effect — confirm intent.
options = EdgeOptions()
options.add_argument("--headless")
options.add_argument("--window-size=%s" % "3840, 2160")

### Get S&P500 CIKs
Filter the sp500_constituents csv by removing companies that left the index before 2017 (only rows whose `ending` date is after 2017-01-01 are kept).
Use the sp500_constituents permnos to filter sp500_data and get a dictionary of tickers and
CIKs.

In [3]:
# Read the constituent history (the first CSV column becomes the index) and keep
# only rows whose "ending" value is greater than "2017-01-01". No date parsing is
# requested, so the comparison is done on the values exactly as read from the CSV.
sp500_constituents = (
    pd.read_csv("sp500_constituents.csv", dtype={"permno": int}, index_col=0)
    .loc[lambda frame: frame["ending"] > "2017-01-01"]
)
sp500_constituents

Unnamed: 0,permno,start,ending
6,10104,1989-08-03,2022-03-31
7,10107,1994-06-07,2022-03-31
11,10138,1999-10-13,2022-03-31
12,10145,1925-12-31,2022-03-31
28,10299,2000-04-03,2017-03-10
...,...,...,...
2008,93096,2012-12-03,2022-03-31
2009,93132,2018-10-11,2022-03-31
2011,93246,2021-03-22,2022-03-31
2013,93429,2017-03-01,2022-03-31


In [35]:
# Build the ticker -> CIK lookup:
#   1. read the identifier file, keeping CIKs as strings,
#   2. keep ticker/permno/cik and index the rows by ticker,
#   3. keep only permnos present in the filtered constituent table,
#   4. drop duplicate (permno, cik) rows and rows with missing values.
sp500_data = (
    pd.read_csv("sp500_w_addl_id_with_cik.csv", dtype={"cik": str, "permno": int})
    [["ticker", "permno", "cik"]]
    .set_index("ticker")
    .loc[lambda frame: frame["permno"].isin(sp500_constituents["permno"])]
    .drop_duplicates()
    .dropna()
)
cik_lookup = sp500_data["cik"].to_dict()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [37]:
# Keep only the first 10 ticker -> CIK pairs (in the dict's iteration order).
cik_lookup = {ticker: cik for ticker, cik in islice(cik_lookup.items(), 10)}
cik_lookup

{'NWL': '0000814453',
 'BBY': '0000764478',
 'AIV': '0000922864',
 'AXP': '0000004962',
 'TIF': '0000098246',
 'BAC': '0000070858',
 'TGT': '0000027419',
 'CVS': '0000064803',
 'WFMI': '0000865436',
 'ECL': '0000031462'}

In [39]:
def chunks(data, SIZE=100):
    """
    Split a mapping into consecutive smaller dicts.

    Parameters
    ----------
    data : dict
        Mapping to split; it must support ``len()``, iteration over keys and
        lookup by key.
    SIZE : int
        Maximum number of items per yielded dict.

    Yields
    ------
    dict
        The next group of at most ``SIZE`` key/value pairs, in iteration order.
    """
    key_iter = iter(data)
    # One pass per batch; the shared iterator remembers where the last batch ended.
    for _ in range(0, len(data), SIZE):
        batch = {}
        for key in islice(key_iter, SIZE):
            batch[key] = data[key]
        yield batch

In [40]:
# Show the batching: print cik_lookup in groups of at most 3 entries.
for batch in chunks(cik_lookup, SIZE=3):
    print(batch)

{'NWL': '0000814453', 'BBY': '0000764478', 'AIV': '0000922864'}
{'AXP': '0000004962', 'TIF': '0000098246', 'BAC': '0000070858'}
{'TGT': '0000027419', 'CVS': '0000064803', 'WFMI': '0000865436'}
{'ECL': '0000031462'}


In [6]:
from bs4 import BeautifulSoup
import requests

sec_api = project_helper.SecAPI()
example_ticker = "AMZN"
# ticker -> list of (filing index URL, filing type, filing date) tuples
sec_data = {ticker: [] for ticker in cik_lookup}
headers = {'Host': 'www.sec.gov', 'Connection': 'close',
           'Accept': 'application/json, text/javascript, */*; q=0.01',
           'X-Requested-With': 'XMLHttpRequest',
           'User-Agent': 'ruizhuoj@andrew.cmu.edu'
           }
endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"
base_url_sec = r"https://www.sec.gov"
# 10-K filings dated before this day are skipped (parsed once, not once per row).
ten_k_cutoff = datetime.strptime("2017", '%Y').date()


def _fetch_filing_index(cik, form_type, count, earliest_date=None):
    """
    Query the EDGAR company browse page and list the filings it shows.

    Parameters
    ----------
    cik : str
        Company CIK sent as the 'CIK' query parameter.
    form_type : str
        Value of the 'type' query parameter (e.g. '10-k', '10-Q').
    count : str
        Value of the 'count' query parameter.
    earliest_date : datetime.date or None
        When given, rows whose filing date is before this day are skipped.

    Returns
    -------
    list of (str, str, str)
        (filing index URL, filing type, filing date) for each kept table row.
        Empty when the response contains no 'tableFile2' table.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    param_dict = {'action': 'getcompany',
                  'CIK': cik,
                  'type': form_type,
                  'dateb': '20220101',
                  'owner': 'exclude',
                  'start': '',
                  'output': '',
                  'count': count}
    # A timeout keeps one stalled request from hanging the whole loop.
    response = requests.get(url=endpoint, params=param_dict, headers=headers, timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    doc_tables = soup.find_all('table', class_='tableFile2')
    if not doc_tables:
        # No results table on the page: report it and return nothing rather than
        # crashing with an IndexError.
        print("No filings table found for CIK", cik, "type", form_type)
        return []

    filings = []
    for row in doc_tables[0].find_all('tr'):
        cols = row.find_all('td')
        # Rows without data cells (e.g. the header row) carry no filing.
        if len(cols) < 4:
            continue
        filing_type = cols[0].text.strip()
        filing_date = cols[3].text.strip()
        if (earliest_date is not None
                and datetime.strptime(filing_date, '%Y-%m-%d').date() < earliest_date):
            continue
        filing_doc_href = cols[1].find('a', {'href': True, 'id': 'documentsbutton'})
        if filing_doc_href is None:
            # Row has no documents link, so there is nothing to download for it.
            continue
        filings.append((base_url_sec + filing_doc_href['href'], filing_type, filing_date))
    return filings


for ticker in cik_lookup:
    # Get 10-Ks (only those filed on or after the cutoff).
    sec_data[ticker].extend(_fetch_filing_index(cik_lookup[ticker], '10-k', '10', ten_k_cutoff))
    # Get 10-Qs.
    # NOTE(review): no date cutoff is applied to 10-Qs, unlike 10-Ks; this keeps the
    # existing behaviour — confirm whether the same cutoff was intended.
    sec_data[ticker].extend(_fetch_filing_index(cik_lookup[ticker], '10-Q', '20'))
    print(ticker, "request successful")

AMZN request successful


### Download 10-Ks and 10-Qs
As you see, this is a list of urls. These urls point to a file that contains metadata related to each filing. Since we don't care about the metadata, we'll pull the filing itself by replacing each index url with the url of the filing's text file.

In [7]:
import re

# Patterns for the opening and closing <DOCUMENT> tags.
doc_start_pattern, doc_end_pattern = (
    re.compile(tag) for tag in (r'<DOCUMENT>', r'</DOCUMENT>')
)
# Pattern for a <TYPE> tag followed by one or more characters, stopping at the
# end of the line (the newline itself is not part of the match).
type_pattern = re.compile(r'<TYPE>[^\n]+')

In [8]:

# ticker -> {filing date -> word list returned by get_word_list}
fillings_by_ticker = {}
# NOTE(review): the `options` object configured next to the imports is not passed
# here, so the browser is started with default options — confirm whether headless
# mode was intended.
browser = webdriver.Edge(service=EdgeService(EdgeChromiumDriverManager().install()))
try:
    for ticker, data in sec_data.items():
        fillings_by_ticker[ticker] = {}
        for index_url, file_type, file_date in tqdm(data, desc='Downloading {} Fillings'.format(ticker), unit='filling'):
            print(index_url, file_type, file_date)
            if (file_type == '10-K' or file_type == '10-Q'):
                # Turn the filing index URL into the URL of the filing's .txt file;
                # the second replace covers index URLs ending in ".html".
                file_url = index_url.replace('-index.htm', '.txt').replace('.txtl', '.txt')
                browser.get(file_url)
                raw_file = browser.find_element(By.TAG_NAME, "body").text
                file_lemmaR = get_word_list(raw_file)
                fillings_by_ticker[ticker][file_date] = file_lemmaR
finally:
    # Always shut down the browser and its driver process, even when a download
    # fails part-way; otherwise they keep running after the cell ends.
    browser.quit()
# Persist the result so later cells can reload it without re-downloading.
with open('fillings_by_ticker_dict', 'wb') as handle:
    pickle.dump(fillings_by_ticker, handle, protocol=pickle.HIGHEST_PROTOCOL)

Downloading AMZN Fillings:   0%|          | 0/25 [00:00<?, ?filling/s]

https://www.sec.gov/Archives/edgar/data/1018724/000101872421000004/0001018724-21-000004-index.htm 10-K 2021-02-03


Downloading AMZN Fillings:   4%|▍         | 1/25 [00:07<03:10,  7.95s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872420000004/0001018724-20-000004-index.htm 10-K 2020-01-31


Downloading AMZN Fillings:   8%|▊         | 2/25 [00:14<02:41,  7.04s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872419000004/0001018724-19-000004-index.htm 10-K 2019-02-01


Downloading AMZN Fillings:  12%|█▏        | 3/25 [00:19<02:15,  6.14s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872418000005/0001018724-18-000005-index.htm 10-K 2018-02-02


Downloading AMZN Fillings:  16%|█▌        | 4/25 [00:24<02:02,  5.83s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872417000011/0001018724-17-000011-index.htm 10-K 2017-02-10


Downloading AMZN Fillings:  20%|██        | 5/25 [00:29<01:51,  5.56s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872421000028/0001018724-21-000028-index.htm 10-Q 2021-10-29


Downloading AMZN Fillings:  24%|██▍       | 6/25 [00:33<01:31,  4.81s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872421000020/0001018724-21-000020-index.htm 10-Q 2021-07-30


Downloading AMZN Fillings:  28%|██▊       | 7/25 [00:36<01:18,  4.38s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872421000010/0001018724-21-000010-index.htm 10-Q 2021-04-30


Downloading AMZN Fillings:  32%|███▏      | 8/25 [00:39<01:06,  3.91s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872420000030/0001018724-20-000030-index.htm 10-Q 2020-10-30


Downloading AMZN Fillings:  36%|███▌      | 9/25 [00:42<00:59,  3.72s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872420000021/0001018724-20-000021-index.htm 10-Q 2020-07-31


Downloading AMZN Fillings:  40%|████      | 10/25 [00:46<00:55,  3.72s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872420000010/0001018724-20-000010-index.htm 10-Q 2020-05-01


Downloading AMZN Fillings:  44%|████▍     | 11/25 [00:49<00:48,  3.46s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872419000089/0001018724-19-000089-index.htm 10-Q 2019-10-25


Downloading AMZN Fillings:  48%|████▊     | 12/25 [00:52<00:44,  3.43s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872419000071/0001018724-19-000071-index.htm 10-Q 2019-07-26


Downloading AMZN Fillings:  52%|█████▏    | 13/25 [00:56<00:40,  3.41s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872419000043/0001018724-19-000043-index.htm 10-Q 2019-04-26


Downloading AMZN Fillings:  56%|█████▌    | 14/25 [00:58<00:33,  3.08s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872418000159/0001018724-18-000159-index.htm 10-Q 2018-10-26


Downloading AMZN Fillings:  60%|██████    | 15/25 [01:01<00:31,  3.11s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872418000108/0001018724-18-000108-index.htm 10-Q 2018-07-27


Downloading AMZN Fillings:  64%|██████▍   | 16/25 [01:04<00:28,  3.12s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872418000072/0001018724-18-000072-index.htm 10-Q 2018-04-27


Downloading AMZN Fillings:  68%|██████▊   | 17/25 [01:07<00:22,  2.84s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872417000135/0001018724-17-000135-index.htm 10-Q 2017-10-27


Downloading AMZN Fillings:  72%|███████▏  | 18/25 [01:09<00:20,  2.86s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872417000100/0001018724-17-000100-index.htm 10-Q 2017-07-28


Downloading AMZN Fillings:  76%|███████▌  | 19/25 [01:12<00:16,  2.82s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872417000051/0001018724-17-000051-index.htm 10-Q 2017-04-28


Downloading AMZN Fillings:  80%|████████  | 20/25 [01:14<00:12,  2.52s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872416000324/0001018724-16-000324-index.htm 10-Q 2016-10-28


Downloading AMZN Fillings:  84%|████████▍ | 21/25 [01:17<00:10,  2.55s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872416000286/0001018724-16-000286-index.htm 10-Q 2016-07-29


Downloading AMZN Fillings:  88%|████████▊ | 22/25 [01:19<00:07,  2.65s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872416000227/0001018724-16-000227-index.htm 10-Q 2016-04-29


Downloading AMZN Fillings:  92%|█████████▏| 23/25 [01:21<00:04,  2.44s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872415000126/0001018724-15-000126-index.htm 10-Q 2015-10-23


Downloading AMZN Fillings:  96%|█████████▌| 24/25 [01:24<00:02,  2.47s/filling]

https://www.sec.gov/Archives/edgar/data/1018724/000101872415000087/0001018724-15-000087-index.htm 10-Q 2015-07-24


Downloading AMZN Fillings: 100%|██████████| 25/25 [01:26<00:00,  3.47s/filling]


In [33]:
# Reload the pickled filings dictionary from disk.
# Caution: pickle.load can execute arbitrary code, so only load files you created.
with open('fillings_by_ticker_dict', mode='rb') as pickle_file:
    fillings_by_ticker = pickle.load(pickle_file)

{'AMZN': {'2021-02-03': ['Table',
   'Contents',
   'Item',
   '1A',
   'Risk',
   'Factors',
   'Please',
   'carefully',
   'consider',
   'follow',
   'discussion',
   'significant',
   'factor',
   'events',
   'uncertainties',
   'make',
   'investment',
   'securities',
   'risky',
   'The',
   'events',
   'consequences',
   'discuss',
   'risk',
   'factor',
   'could',
   'circumstances',
   'may',
   'may',
   'able',
   'accurately',
   'predict',
   'recognize',
   'control',
   'material',
   'adverse',
   'effect',
   'business',
   'growth',
   'reputation',
   'prospect',
   'financial',
   'condition',
   'operate',
   'result',
   'include',
   'components',
   'financial',
   'result',
   'cash',
   'flow',
   'liquidity',
   'stock',
   'price',
   'These',
   'risk',
   'factor',
   'identify',
   'risk',
   'face',
   'operations',
   'could',
   'also',
   'affect',
   'factor',
   'events',
   'uncertainties',
   'presently',
   'know',
   'us',
   'currently',


In [9]:
# ten_ks_by_ticker = {}
#
# for ticker, filling_documents in filling_documents_by_ticker.items():
#     ten_ks_by_ticker[ticker] = []
#     for file_date, documents in filling_documents.items():
#         for document in documents:
#             if get_document_type(document) == '10-k':
#                 ten_ks_by_ticker[ticker].append({
#                     'cik': cik_lookup[ticker],
#                     'file': document,
#                     'file_date': file_date})
#
#
# project_helper.print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['cik', 'file', 'file_date'])
#

In [10]:
# raw_10k = raw_fillings_by_ticker["AMZN"]["2019-02-01"]
# import re
# # Regex to find <DOCUMENT> tags
# doc_start_pattern = re.compile(r'<DOCUMENT>')
# doc_end_pattern = re.compile(r'</DOCUMENT>')
# # Regex to find <TYPE> tag prceeding any characters, terminating at new line
# type_pattern = re.compile(r'<TYPE>[^\n]+')
# # Create 3 lists with the span idices for each regex
#
# ### There are many <Document> Tags in this text file, each as specific exhibit like 10-K, EX-10.17 etc
# ### First filter will give us document tag start <end> and document tag end's <start>
# ### We will use this to later grab content in between these tags
# doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
# doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]
#
# ### Type filter is interesting, it looks for <TYPE> with Not flag as new line, ie terminare there, with + sign
# ### to look for any char afterwards until new line \n. This will give us <TYPE> followed Section Name like '10-K'
# ### Once we have have this, it returns String Array, below line will with find content after <TYPE> ie, '10-K'
# ### as section names
# doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]
# document = {}
#
# # Create a loop to go through each section type and save only the 10-K section in the dictionary
# for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
#     if doc_type == '10-K':
#         document[doc_type] = raw_10k[doc_start:doc_end]
#

In [11]:
# # display excerpt the document
# document['10-K'][0:500]
#

In [12]:
# # Write the regex
# regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')
#

In [13]:
# matches = regex.finditer(document['10-K'])
#
# # Create the dataframe
# test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])
#
# test_df.columns = ['item', 'start', 'end']
# test_df['item'] = test_df.item.str.lower()
#
# # Display the dataframe
# test_df.head()
#

In [14]:
# # Get rid of unnesesary charcters from the dataframe
# test_df.replace('&#160;',' ',regex=True,inplace=True)
# test_df.replace('&nbsp;',' ',regex=True,inplace=True)
# test_df.replace(' ','',regex=True,inplace=True)
# test_df.replace('\.','',regex=True,inplace=True)
# test_df.replace('>','',regex=True,inplace=True)
#

In [15]:
# pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')
# pos_dat.set_index('item', inplace=True)
#

In [16]:
# pos_dat
#

In [17]:
# document['10-K']
# #Get Item 1a
# item_1a_raw = document['10-K'][pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item1b']]
# #item_1b_raw = document['10-K'][pos_dat['start'].loc['item1b']:pos_dat['start'].loc['item7a']]
# item_7_raw = document['10-K'][pos_dat['start'].loc['item7']:pos_dat['start'].loc['item7a']]
# item_7a_raw = document['10-K'][pos_dat['start'].loc['item7a']:pos_dat['start'].loc['item8']]
# #item_8_raw = document['10-K'][pos_dat['start'].loc['item8']:]
#

In [18]:
# "a"+" "+"b"
#

In [19]:
# item_content = BeautifulSoup(item_1a_raw + item_7_raw + item_7a_raw, 'lxml')
# print(item_content.get_text("\n\n"))
#

In [20]:
# len(item_content.get_text("\n\n"))
#

### Get Documents
With these filings downloaded, we want to break them into their associated documents. These documents are sectioned off in the filings with the tags `<DOCUMENT>` for the start of each document and `</DOCUMENT>` for the end of each document. There's no overlap between these documents, so each `</DOCUMENT>` tag should come after its `<DOCUMENT>` tag with no other `<DOCUMENT>` tag in between.

Implement `get_documents` to return a list of these documents from a filing. Make sure not to include the tags in the returned document text.

In [21]:
# import re
#
#
# def get_documents(text):
#     """
#     Extract the documents from the text
#
#     Parameters
#     ----------
#     text : str
#         The text with the document strings inside
#
#     Returns
#     -------
#     extracted_docs : list of str
#         The document strings found in `text`
#     """
#
#     # TODO: Implement
#     extracted_docs = []
#
#     doc_start_pattern = re.compile(r'<DOCUMENT>')
#     doc_end_pattern = re.compile(r'</DOCUMENT>')
#
#     doc_start_is = [x.end() for x in doc_start_pattern.finditer(text)]
#     doc_end_is = [x.start() for x in doc_end_pattern.finditer(text)]
#
#     for doc_start_i, doc_end_i in zip(doc_start_is, doc_end_is):
#             extracted_docs.append(text[doc_start_i:doc_end_i])
#
#     return extracted_docs
#
#
# project_tests.test_get_documents(get_documents)
#

With the `get_documents` function implemented, let's extract all the documents.

In [22]:
# filling_documents_by_ticker = {}
#
# for ticker, raw_fillings in raw_fillings_by_ticker.items():
#     filling_documents_by_ticker[ticker] = {}
#     for file_date, filling in tqdm(raw_fillings.items(), desc='Getting Documents from {} Fillings'.format(ticker), unit='filling'):
#         filling_documents_by_ticker[ticker][file_date] = get_documents(filling)
#
#
# print('\n\n'.join([
#     'Document {} Filed on {}:\n{}...'.format(doc_i, file_date, doc[:200])
#     for file_date, docs in filling_documents_by_ticker[example_ticker].items()
#     for doc_i, doc in enumerate(docs)][:3]))
#

### Get Document Types
Now that we have all the documents, we want to find the 10-k form in this 10-k filing. Implement the `get_document_type` function to return the type of document given. The document type is located on a line with the `<TYPE>` tag. For example, a form of type "TEST" would have the line `<TYPE>TEST`. Make sure to return the type as lowercase, so this example would be returned as "test".

In [23]:
# def get_document_type(doc):
#     type_pattern = re.compile(r'<TYPE>[^\n]+')
#
#     doc_type = type_pattern.findall(doc)[0][len('<TYPE>'):]
#
#     return doc_type.lower()
#
#
# project_tests.test_get_document_type(get_document_type)
#

With the `get_document_type` function, we'll filter out all non 10-k documents.

In [24]:
# ten_ks_by_ticker = {}
#
# for ticker, filling_documents in filling_documents_by_ticker.items():
#     ten_ks_by_ticker[ticker] = []
#     for file_date, documents in filling_documents.items():
#         for document in documents:
#             if get_document_type(document) == '10-k':
#                 ten_ks_by_ticker[ticker].append({
#                     'cik': cik_lookup[ticker],
#                     'file': document,
#                     'file_date': file_date})
#
#
# project_helper.print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['cik', 'file', 'file_date'])
#

In [25]:
# def remove_html_tags(text):
#     text = BeautifulSoup(text, 'html.parser').get_text()
#
#     return text
#
#
# def clean_text(text):
#     text = text.lower()
#     text = remove_html_tags(text)
#
#     return text
#

Using the `clean_text` function, we'll clean up all the documents.

In [26]:
# for ticker, ten_ks in ten_ks_by_ticker.items():
#     for ten_k in tqdm(ten_ks, desc='Cleaning {} 10-Ks'.format(ticker), unit='10-K'):
#         ten_k['file_clean'] = clean_text(ten_k['file'])
#
#
# project_helper.print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['file_clean'])
#

### Lemmatize
With the text cleaned up, it's time to distill the verbs down. Implement the `lemmatize_words` function to lemmatize verbs in the list of words provided.

In [27]:
# from nltk.stem import WordNetLemmatizer
# from nltk.corpus import wordnet
#
#
# def lemmatize_words(words):
#     """
#     Lemmatize words
#
#     Parameters
#     ----------
#     words : list of str
#         List of words
#
#     Returns
#     -------
#     lemmatized_words : list of str
#         List of lemmatized words
#     """
#
#     # TODO: Implement
#     lemmatized_words = [WordNetLemmatizer().lemmatize(word, 'v') for word in words]
#
#     return lemmatized_words
#
#
# project_tests.test_lemmatize_words(lemmatize_words)
#

With the `lemmatize_words` function implemented, let's lemmatize all the data.

In [28]:
# ten_ks[0]["file_clean"] = item_content.get_text("\n\n")
#

In [29]:
# word_pattern = re.compile('\w+')
#
# for ticker, ten_ks in ten_ks_by_ticker.items():
#     for ten_k in tqdm(ten_ks, desc='Lemmatize {} 10-Ks'.format(ticker), unit='10-K'):
#         ten_k['file_lemma'] = lemmatize_words(word_pattern.findall(ten_k['file_clean']))
#
#
# project_helper.print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['file_lemma'])
#

### Remove Stopwords

In [30]:
# from nltk.corpus import stopwords
#
#
# lemma_english_stopwords = lemmatize_words(stopwords.words('english'))
#
# for ticker, ten_ks in ten_ks_by_ticker.items():
#     for ten_k in tqdm(ten_ks, desc='Remove Stop Words for {} 10-Ks'.format(ticker), unit='10-K'):
#         ten_k['file_lemma'] = [word for word in ten_k['file_lemma'] if word not in lemma_english_stopwords]
#
#
# print('Stop Words Removed')
# if ("7A" in ten_ks[0]['file_lemma']):
#     print("Element Exists")
#

In [31]:
# ten_ks[0]['file_lemma']
#

In [32]:
# len(ten_ks[0]['file_lemma'])