# SEC EDGAR - Filings' Data Extraction Part 3

## Import Libraries

In [1]:
# import necessary libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import time
import re
import os
import unicodedata

# note: the requests library and the BeautifulSoup library will be used for web scraping
# note: the numpy library will be used to replace 'nan' values in various dataframes
# note: the unicodedata library will be used to get the normal form of unicode strings

## Read Datasets to Pandas Dataframe

In [2]:
# load filing_page.csv into a pandas dataframe
filing_page_df = pd.read_csv(filepath_or_buffer='filing_page.csv')
# preview the first rows before any values are replaced
display(filing_page_df.head())
# swap missing ('nan') values for the 'N/A' placeholder string
filing_page_df = filing_page_df.replace(to_replace=np.nan, value='N/A')

Unnamed: 0,CIK,Ticker_Symbol,Company_Name,Data_Date,Filing_Date,Filing_Type,Filing_Url,MDA_Placement,Filing_Check,Filing_Report_Url
0,1750,AIR,AAR CORP,2012-05-31,2012-07-19,10-K,https://www.sec.gov/Archives/edgar/data/1750/0...,10-K,True,https://www.sec.gov/Archives/edgar/data/1750/0...
1,1750,AIR,AAR CORP,2013-05-31,2013-07-26,10-K,https://www.sec.gov/Archives/edgar/data/1750/0...,10-K,True,https://www.sec.gov/Archives/edgar/data/1750/0...
2,1750,AIR,AAR CORP,2014-05-31,2014-07-17,10-K,https://www.sec.gov/Archives/edgar/data/1750/0...,10-K,True,https://www.sec.gov/Archives/edgar/data/1750/0...
3,1750,AIR,AAR CORP,2016-05-31,2016-07-13,10-K,https://www.sec.gov/Archives/edgar/data/1750/0...,10-K,True,https://www.sec.gov/Archives/edgar/data/1750/0...
4,1750,AIR,AAR CORP,2019-05-31,2019-07-18,10-K,https://www.sec.gov/Archives/edgar/data/1750/0...,10-K,True,https://www.sec.gov/Archives/edgar/data/1750/0...


In [3]:
# load filing_10k_mda_df.csv into a pandas dataframe
filing_10k_mda_df = pd.read_csv(filepath_or_buffer='filing_10k_mda_df.csv')
# preview the first rows before any values are replaced
display(filing_10k_mda_df.head())
# swap missing ('nan') values for the 'N/A' placeholder string
filing_10k_mda_df = filing_10k_mda_df.replace(to_replace=np.nan, value='N/A')

Unnamed: 0,CIK,Ticker_Symbol,Company_Name,Data_Date,Filing_Date,Filing_Type,Filing_Url,MDA_Placement,Filing_Check,Filing_Report_Url,...,MDA_Page_Number_Start_Tag,MDA_Page_Number_Start_Prev_Tag,MDA_Page_Number_End_Tag,MDA_Page_Number_End_Prev_Tag,MDA_Page_Number_Start_Match,MDA_Page_Number_Start_Prev_Match,MDA_Page_Number_End_Match,MDA_Page_Number_End_Prev_Match,MDA_Start_Position,MDA_End_Position
0,1750,AIR,AAR CORP,2012-05-31,2012-07-19,10-K,https://www.sec.gov/Archives/edgar/data/1750/0...,10-K,True,https://www.sec.gov/Archives/edgar/data/1750/0...,...,"<font size=""2"">17</font>","<font size=""2"">16</font>","<font size=""2"">29</font>","<font size=""2"">28</font>",True,True,True,True,176725.0,275637.0
1,1750,AIR,AAR CORP,2013-05-31,2013-07-26,10-K,https://www.sec.gov/Archives/edgar/data/1750/0...,10-K,True,https://www.sec.gov/Archives/edgar/data/1750/0...,...,"<font size=""2"">18</font>","<font size=""2"">17</font>","<font size=""2"">27</font>","<font size=""2"">26</font>",True,True,True,True,186736.0,272962.0
2,1750,AIR,AAR CORP,2014-05-31,2014-07-17,10-K,https://www.sec.gov/Archives/edgar/data/1750/0...,10-K,True,https://www.sec.gov/Archives/edgar/data/1750/0...,...,"<font size=""2"">19</font>","<font size=""2"">18</font>","<font size=""2"">29</font>","<font size=""2"">28</font>",True,True,True,True,229060.0,405617.0
3,1750,AIR,AAR CORP,2016-05-31,2016-07-13,10-K,https://www.sec.gov/Archives/edgar/data/1750/0...,10-K,True,https://www.sec.gov/Archives/edgar/data/1750/0...,...,"<font size=""2"">20</font>","<font size=""2"">19</font>","<font size=""2"">33</font>","<font size=""2"">32</font>",True,True,True,True,291382.0,482986.0
4,1750,AIR,AAR CORP,2019-05-31,2019-07-18,10-K,https://www.sec.gov/Archives/edgar/data/1750/0...,10-K,True,https://www.sec.gov/Archives/edgar/data/1750/0...,...,"<font size=""2"">21</font>","<font size=""2"">20</font>","<font size=""2"">36</font>","<font size=""2"">35</font>",True,True,True,True,226707.0,426552.0


In [4]:
# load filing_20f_ofp_df.csv into a pandas dataframe
filing_20f_ofp_df = pd.read_csv(filepath_or_buffer='filing_20f_ofp_df.csv')
# preview the first rows before any values are replaced
display(filing_20f_ofp_df.head())
# swap missing ('nan') values for the 'N/A' placeholder string
filing_20f_ofp_df = filing_20f_ofp_df.replace(to_replace=np.nan, value='N/A')

Unnamed: 0,CIK,Ticker_Symbol,Company_Name,Data_Date,Filing_Date,Filing_Type,Filing_Url,MDA_Placement,Filing_Check,Filing_Report_Url,...,OFP_Page_Number_Start_Tag,OFP_Page_Number_Start_Prev_Tag,OFP_Page_Number_End_Tag,OFP_Page_Number_End_Prev_Tag,OFP_Page_Number_Start_Match,OFP_Page_Number_Start_Prev_Match,OFP_Page_Number_End_Match,OFP_Page_Number_End_Prev_Match,OFP_Start_Position,OFP_End_Position
0,911971,TK,TEEKAY CORP,2011-12-31,2012-04-25,20-F,https://www.sec.gov/Archives/edgar/data/911971...,20-F,True,https://www.sec.gov/Archives/edgar/data/911971...,...,"<font size=""2"" style=""font-family:Times New Ro...","<font size=""2"" style=""font-family:Times New Ro...","<font size=""2"" style=""font-family:Times New Ro...","<font size=""2"" style=""font-family:Times New Ro...",True,True,True,True,565576.0,1154045.0
1,911971,TK,TEEKAY CORP,2012-12-31,2013-04-29,20-F,https://www.sec.gov/Archives/edgar/data/911971...,20-F,True,https://www.sec.gov/Archives/edgar/data/911971...,...,"<font size=""2"" style=""font-family:Times New Ro...","<font size=""2"" style=""font-family:Times New Ro...","<font size=""2"" style=""font-family:Times New Ro...","<font size=""2"" style=""font-family:Times New Ro...",True,True,True,True,568217.0,1177746.0
2,911971,TK,TEEKAY CORP,2013-12-31,2014-04-28,20-F,https://www.sec.gov/Archives/edgar/data/911971...,20-F,True,https://www.sec.gov/Archives/edgar/data/911971...,...,"<p align=""center"" style=""margin-top:0pt; margi...","<p align=""center"" style=""margin-top:0pt; margi...","<p align=""center"" style=""margin-top:0pt; margi...","<p align=""center"" style=""margin-top:0pt; margi...",True,True,True,True,483881.0,979510.0
3,911971,TK,TEEKAY CORP,2014-12-31,2015-04-29,20-F,https://www.sec.gov/Archives/edgar/data/911971...,20-F,True,https://www.sec.gov/Archives/edgar/data/911971...,...,"<p align=""center"" style=""margin-top:0pt; margi...","<p align=""center"" style=""margin-top:0pt; margi...","<p align=""center"" style=""margin-top:0pt; margi...","<p align=""center"" style=""margin-top:0pt; margi...",True,True,True,True,497565.0,1096371.0
4,911971,TK,TEEKAY CORP,2015-12-31,2016-04-26,20-F,https://www.sec.gov/Archives/edgar/data/911971...,20-F,True,https://www.sec.gov/Archives/edgar/data/911971...,...,"<p align=""center"" style=""margin-top:0pt; margi...","<p align=""center"" style=""margin-top:0pt; margi...","<p align=""center"" style=""margin-top:0pt; margi...","<p align=""center"" style=""margin-top:0pt; margi...",True,True,True,True,576580.0,1288894.0


In [5]:
# note: there is no need to read filing_40f_mda_df.csv into dataframe as observations are all 'N/A'

## Define Master Dataframe to Store Details of Extracted Filings

In [6]:
# note: the master dataframe keeps track of which filings have been extracted
# note: it also records the name of the text file written for each filing

# build an empty master dataframe with one column per stored filing detail
master_filing_df = pd.DataFrame(
    columns=[
        'CIK',
        'Ticker_Symbol',
        'Company_Name',
        'Data_Date',
        'Filing_Date',
        'Filing_Type',
        'MDA_Placement',
        'File_Name',
    ]
)

## Define Function to Complete Initial Preprocessing on HTML

In [7]:
# note: in order to save the text from html to a text file, the following preprocessing steps are required:
# - remove tables made up of more than 10% digits, as these are most likely less meaningful to the sentiment of the text
# - get textual data from html code
# - restore c1 control characters to the windows-1252 characters they stand for
# - get the normal form of the unicode string (text)
# - remove all non-ascii characters
# - replace new line encoding '\n' with space

# define function to complete preprocessing outlined above
def file_preprocessing(soup, digit_threshold=0.1):
    """Turn a parsed HTML document into a single line of ASCII text.

    Parameters
    ----------
    soup : object
        Parsed document exposing ``find_all('table')`` and ``get_text()``
        (a BeautifulSoup object in this notebook). Each returned table must
        expose ``get_text()`` and ``extract()``. The document is modified in
        place: mostly-numeric tables are removed from it.
    digit_threshold : float, optional
        A table is removed when the share of digit characters in its text is
        strictly greater than this value. Empty tables are always removed.
        Defaults to 0.1.

    Returns
    -------
    str
        The document text with C1 control characters mapped to their
        windows-1252 characters, NFKD-normalised, stripped of every remaining
        non-ASCII character, and with each '\\n' replaced by a space.
    """
    def table_digit_percentage(tablestring):
        # share of characters that are digits; an empty table counts as fully numeric
        if len(tablestring) > 0:
            return sum(char.isdigit() for char in tablestring) / len(tablestring)
        return 1

    def to_windows_1252(match):
        # reinterpret the code point as a windows-1252 byte
        try:
            return bytes([ord(match.group(0))]).decode('windows-1252')
        except UnicodeDecodeError:
            # No character at the corresponding code point: remove it.
            return ''

    # extract tables with numerical content above the threshold
    for table in soup.find_all('table'):
        if table_digit_percentage(table.get_text()) > digit_threshold:
            table.extract()

    # get soup text
    soup_text = soup.get_text()
    # restore c1 control characters (U+0080-U+009F) to windows-1252 characters;
    # this has to run before the ascii filter below, which would otherwise drop
    # them first and leave nothing to restore
    soup_text = re.sub(r'[\u0080-\u009f]', to_windows_1252, soup_text)
    # get normal form of unicode string and remove all non-ascii characters
    soup_text = unicodedata.normalize('NFKD', soup_text).encode('ascii', 'ignore').decode('ascii')
    # replace '\n' with space
    return soup_text.replace('\n', ' ')

## Create Directory for Filing Text Files

In [8]:
# note: the filing text files are stored in their own directory, created here when missing
if not os.path.exists('Filing Text Files'):
    os.mkdir('Filing Text Files')
    print('folder created')
else:
    print('folder already exists!')

folder already exists!


## Change Working Directory to Filing Text Files Folder

In [9]:
# note: change directory path to 'Filing Text Files Folder'
# note: the current directory is checked explicitly so that re-running this cell is harmless,
#       while a genuinely missing folder still raises instead of being reported as 'already in directory!'
if os.path.basename(os.getcwd()) == 'Filing Text Files':
    print('already in directory!')
else:
    os.chdir('Filing Text Files')
    print('directory changed')

directory changed


## Define Function to Write Filing Text to Text File

In [10]:
# note: define function to write filing text to text file in Filing Text Files directory
def write_text_file(file_name, text):
    """Write ``text`` to ``file_name``, replacing any existing content.

    Parameters
    ----------
    file_name : str
        Path of the file to write; relative paths resolve against the current
        working directory.
    text : str
        Content to store in the file.
    """
    # the context manager closes the file even when write() raises
    with open(file_name, 'w') as text_file:
        text_file.write(text)

## 10-K Filings - Retrieve MD&A Narrative and Save to Text File

In [11]:
# note: retrieve management's discussion and analysis narrative for observations in filing_10k_mda_df
# note: save narratives to text file
# note: filings whose text file already exists are recorded without being downloaded again
# note(review): the SEC asks automated clients to declare a User-Agent header; none is sent here - confirm whether one is needed

# start execution time
start_time = time.time()

# columns stored in master_filing_df for every extracted filing
master_columns = ['CIK', 'Ticker_Symbol', 'Company_Name', 'Data_Date', 'Filing_Date',
                  'Filing_Type', 'MDA_Placement', 'File_Name']
# rows collected during the loop and added to master_filing_df once at the end
# (DataFrame.append was removed in pandas 2.0, and growing a dataframe row by row is quadratic)
mda_records = []

# loop through observations in filing_10k_mda_df
for idx, row in filing_10k_mda_df.iterrows():
    print('-------------------------------------------------------------------------')
    print(idx, row['Ticker_Symbol'])
    if row['MDA_Start_Position']=='N/A' or row['MDA_End_Position']=='N/A':
        print('cannot retrieve md&a narrative')
    else:
        file_name = row['Ticker_Symbol'] + '_' + row['Filing_Date'] + '.txt'
        record = [row['CIK'], row['Ticker_Symbol'], row['Company_Name'],
                  row['Data_Date'], row['Filing_Date'], row['Filing_Type'],
                  row['MDA_Placement'], file_name]
        if os.path.exists(file_name):
            # narrative already saved: record the filing without requesting it again
            mda_records.append(record)
            print('file already exists!')
        else:
            try:
                # the timeout stops a stalled connection from blocking the whole loop
                filing_request = requests.get(url=row['Filing_Report_Url'], timeout=60)
                # an error page must not be saved as if it were the narrative
                filing_request.raise_for_status()
            except requests.RequestException as request_error:
                print('cannot retrieve md&a narrative:', request_error)
            else:
                filing_soup = BeautifulSoup(filing_request.content, 'lxml')
                mda_start = int(row['MDA_Start_Position'])
                mda_end = int(row['MDA_End_Position'])
                # the positions index into the string form of the lxml-parsed document
                filing_soup_mda_string = str(filing_soup)[mda_start:mda_end]
                filing_soup_mda = BeautifulSoup(filing_soup_mda_string, 'lxml')
                filing_mda_text = file_preprocessing(filing_soup_mda)
                write_text_file(file_name, filing_mda_text)
                mda_records.append(record)
                print('md&a narrative written to text file')
            finally:
                # note: the SEC strictly states that developers should not exceed 10 requests per second
                # delay next request to EDGAR by 1 second (only needed after a request was made)
                time.sleep(1)
    print('\n')

# add the collected rows to master_filing_df and reset its index
new_rows = pd.DataFrame(mda_records, columns=master_columns)
non_empty_frames = [frame for frame in (master_filing_df, new_rows) if not frame.empty]
if non_empty_frames:
    master_filing_df = pd.concat(non_empty_frames, ignore_index=True)

# end execution time
end_time = time.time()

filing_10k_time = end_time - start_time

print('total time taken:', filing_10k_time)

-------------------------------------------------------------------------
0 AIR
file already exists!


-------------------------------------------------------------------------
1 AIR
file already exists!


-------------------------------------------------------------------------
2 AIR
file already exists!


-------------------------------------------------------------------------
3 AIR
file already exists!


-------------------------------------------------------------------------
4 AIR
file already exists!


-------------------------------------------------------------------------
5 HES
file already exists!


-------------------------------------------------------------------------
6 HES
file already exists!


-------------------------------------------------------------------------
7 HES
file already exists!


-------------------------------------------------------------------------
8 HES
cannot retrieve md&a narrative


---------------------------------------------------------------

file already exists!


-------------------------------------------------------------------------
77 CSS
cannot retrieve md&a narrative


-------------------------------------------------------------------------
78 CMC
file already exists!


-------------------------------------------------------------------------
79 CMC
file already exists!


-------------------------------------------------------------------------
80 TAP
file already exists!


-------------------------------------------------------------------------
81 TAP
file already exists!


-------------------------------------------------------------------------
82 TAP
file already exists!


-------------------------------------------------------------------------
83 TAP
file already exists!


-------------------------------------------------------------------------
84 GLW
file already exists!


-------------------------------------------------------------------------
85 GLW
cannot retrieve md&a narrative


---------------------

-------------------------------------------------------------------------
152 GPX
file already exists!


-------------------------------------------------------------------------
153 NR
file already exists!


-------------------------------------------------------------------------
154 NR
cannot retrieve md&a narrative


-------------------------------------------------------------------------
155 NL
cannot retrieve md&a narrative


-------------------------------------------------------------------------
156 NL
cannot retrieve md&a narrative


-------------------------------------------------------------------------
157 NBL
file already exists!


-------------------------------------------------------------------------
158 MOV
file already exists!


-------------------------------------------------------------------------
159 MOV
file already exists!


-------------------------------------------------------------------------
160 OII
file already exists!


-----------------------------

-------------------------------------------------------------------------
227 MRO
cannot retrieve md&a narrative


-------------------------------------------------------------------------
228 S
file already exists!


-------------------------------------------------------------------------
229 S
file already exists!


-------------------------------------------------------------------------
230 S
file already exists!


-------------------------------------------------------------------------
231 S
file already exists!


-------------------------------------------------------------------------
232 S
file already exists!


-------------------------------------------------------------------------
233 UVV
cannot retrieve md&a narrative


-------------------------------------------------------------------------
234 UVV
file already exists!


-------------------------------------------------------------------------
235 UVV
file already exists!


---------------------------------------------

-------------------------------------------------------------------------
302 RGS
cannot retrieve md&a narrative


-------------------------------------------------------------------------
303 RGS
cannot retrieve md&a narrative


-------------------------------------------------------------------------
304 RGS
cannot retrieve md&a narrative


-------------------------------------------------------------------------
305 RGS
file already exists!


-------------------------------------------------------------------------
306 MUR
cannot retrieve md&a narrative


-------------------------------------------------------------------------
307 MUR
cannot retrieve md&a narrative


-------------------------------------------------------------------------
308 MUR
cannot retrieve md&a narrative


-------------------------------------------------------------------------
309 MUR
cannot retrieve md&a narrative


-------------------------------------------------------------------------
310 MUR
cannot r

-------------------------------------------------------------------------
376 ELY
file already exists!


-------------------------------------------------------------------------
377 TTI
cannot retrieve md&a narrative


-------------------------------------------------------------------------
378 TTI
cannot retrieve md&a narrative


-------------------------------------------------------------------------
379 TTI
cannot retrieve md&a narrative


-------------------------------------------------------------------------
380 TTI
cannot retrieve md&a narrative


-------------------------------------------------------------------------
381 MHK
file already exists!


-------------------------------------------------------------------------
382 MHK
cannot retrieve md&a narrative


-------------------------------------------------------------------------
383 CGA
cannot retrieve md&a narrative


-------------------------------------------------------------------------
384 CGA
file already exist

-------------------------------------------------------------------------
452 KEM
cannot retrieve md&a narrative


-------------------------------------------------------------------------
453 KEM
cannot retrieve md&a narrative


-------------------------------------------------------------------------
454 KEM
cannot retrieve md&a narrative


-------------------------------------------------------------------------
455 KEM
cannot retrieve md&a narrative


-------------------------------------------------------------------------
456 SM
cannot retrieve md&a narrative


-------------------------------------------------------------------------
457 SM
file already exists!


-------------------------------------------------------------------------
458 SM
cannot retrieve md&a narrative


-------------------------------------------------------------------------
459 SM
file already exists!


-------------------------------------------------------------------------
460 MD
file already exists!




-------------------------------------------------------------------------
527 DO
cannot retrieve md&a narrative


-------------------------------------------------------------------------
528 DO
cannot retrieve md&a narrative


-------------------------------------------------------------------------
529 SGU
file already exists!


-------------------------------------------------------------------------
530 SGU
file already exists!


-------------------------------------------------------------------------
531 SGU
file already exists!


-------------------------------------------------------------------------
532 SGU
file already exists!


-------------------------------------------------------------------------
533 AEE
cannot retrieve md&a narrative


-------------------------------------------------------------------------
534 AEE
cannot retrieve md&a narrative


-------------------------------------------------------------------------
535 PCG
cannot retrieve md&a narrative


-------

-------------------------------------------------------------------------
603 TDS
cannot retrieve md&a narrative


-------------------------------------------------------------------------
604 RLH
file already exists!


-------------------------------------------------------------------------
605 RLH
file already exists!


-------------------------------------------------------------------------
606 RLH
cannot retrieve md&a narrative


-------------------------------------------------------------------------
607 RLH
file already exists!


-------------------------------------------------------------------------
608 RLH
file already exists!


-------------------------------------------------------------------------
609 RLH
file already exists!


-------------------------------------------------------------------------
610 RLH
file already exists!


-------------------------------------------------------------------------
611 HZO
file already exists!


-----------------------------------

file already exists!


-------------------------------------------------------------------------
679 HOS
file already exists!


-------------------------------------------------------------------------
680 HOS
cannot retrieve md&a narrative


-------------------------------------------------------------------------
681 HOS
file already exists!


-------------------------------------------------------------------------
682 HOS
file already exists!


-------------------------------------------------------------------------
683 CEQP
file already exists!


-------------------------------------------------------------------------
684 CEQP
file already exists!


-------------------------------------------------------------------------
685 CEQP
cannot retrieve md&a narrative


-------------------------------------------------------------------------
686 CEQP
file already exists!


-------------------------------------------------------------------------
687 CEQP
file already exists!


-------

file already exists!


-------------------------------------------------------------------------
755 SHO
cannot retrieve md&a narrative


-------------------------------------------------------------------------
756 ORA
cannot retrieve md&a narrative


-------------------------------------------------------------------------
757 ARC
file already exists!


-------------------------------------------------------------------------
758 ARC
cannot retrieve md&a narrative


-------------------------------------------------------------------------
759 ARC
file already exists!


-------------------------------------------------------------------------
760 ARC
file already exists!


-------------------------------------------------------------------------
761 GTT
file already exists!


-------------------------------------------------------------------------
762 SEM
file already exists!


-------------------------------------------------------------------------
763 CPS
file already exists!


--

-------------------------------------------------------------------------
831 RFP
cannot retrieve md&a narrative


-------------------------------------------------------------------------
832 RFP
cannot retrieve md&a narrative


-------------------------------------------------------------------------
833 RFP
cannot retrieve md&a narrative


-------------------------------------------------------------------------
834 RFP
cannot retrieve md&a narrative


-------------------------------------------------------------------------
835 RFP
cannot retrieve md&a narrative


-------------------------------------------------------------------------
836 DHX
cannot retrieve md&a narrative


-------------------------------------------------------------------------
837 DHX
cannot retrieve md&a narrative


-------------------------------------------------------------------------
838 FET
file already exists!


-------------------------------------------------------------------------
839 FET
file alr

file already exists!


-------------------------------------------------------------------------
906 GM
file already exists!


-------------------------------------------------------------------------
907 QUAD
file already exists!


-------------------------------------------------------------------------
908 QUAD
file already exists!


-------------------------------------------------------------------------
909 QUAD
file already exists!


-------------------------------------------------------------------------
910 QUAD
file already exists!


-------------------------------------------------------------------------
911 EXPR
file already exists!


-------------------------------------------------------------------------
912 EXPR
file already exists!


-------------------------------------------------------------------------
913 TOWR
file already exists!


-------------------------------------------------------------------------
914 OAS
cannot retrieve md&a narrative


----------------

-------------------------------------------------------------------------
980 ALEX
file already exists!


-------------------------------------------------------------------------
981 NGVC
file already exists!


-------------------------------------------------------------------------
982 HCR
cannot retrieve md&a narrative


-------------------------------------------------------------------------
983 HCR
file already exists!


-------------------------------------------------------------------------
984 SMLP
file already exists!


-------------------------------------------------------------------------
985 SMLP
file already exists!


-------------------------------------------------------------------------
986 MPLX
cannot retrieve md&a narrative


-------------------------------------------------------------------------
987 SUN
file already exists!


-------------------------------------------------------------------------
988 SUN
file already exists!


------------------------------

-------------------------------------------------------------------------
1055 VSTO
file already exists!


-------------------------------------------------------------------------
1056 JAX
file already exists!


-------------------------------------------------------------------------
1057 QSR
file already exists!


-------------------------------------------------------------------------
1058 SUM
file already exists!


-------------------------------------------------------------------------
1059 EVH
file already exists!


-------------------------------------------------------------------------
1060 TGE
cannot retrieve md&a narrative


-------------------------------------------------------------------------
1061 BNED
file already exists!


-------------------------------------------------------------------------
1062 BNED
cannot retrieve md&a narrative


-------------------------------------------------------------------------
1063 AGR
file already exists!


-----------------------

## 20-F Filings - Retrieve OFP Narrative and Save to Text File

In [12]:
# note: retrieve operating and financial review and prospects narrative for observations in filing_20f_ofp_df
# note: save narratives to text file
# note: filings whose text file already exists are recorded without being downloaded again
# note(review): the SEC asks automated clients to declare a User-Agent header; none is sent here - confirm whether one is needed

# start execution time
start_time = time.time()

# columns stored in master_filing_df for every extracted filing
master_columns = ['CIK', 'Ticker_Symbol', 'Company_Name', 'Data_Date', 'Filing_Date',
                  'Filing_Type', 'MDA_Placement', 'File_Name']
# rows collected during the loop and added to master_filing_df once at the end
# (DataFrame.append was removed in pandas 2.0, and growing a dataframe row by row is quadratic)
ofp_records = []

# loop through observations in filing_20f_ofp_df
for idx, row in filing_20f_ofp_df.iterrows():
    print('-------------------------------------------------------------------------')
    print(idx, row['Ticker_Symbol'])
    if row['OFP_Start_Position']=='N/A' or row['OFP_End_Position']=='N/A':
        print('cannot retrieve ofp narrative')
    else:
        file_name = row['Ticker_Symbol'] + '_' + row['Filing_Date'] + '.txt'
        record = [row['CIK'], row['Ticker_Symbol'], row['Company_Name'],
                  row['Data_Date'], row['Filing_Date'], row['Filing_Type'],
                  row['MDA_Placement'], file_name]
        if os.path.exists(file_name):
            # narrative already saved: record the filing without requesting it again
            ofp_records.append(record)
            print('file already exists!')
        else:
            try:
                # the timeout stops a stalled connection from blocking the whole loop
                filing_request = requests.get(url=row['Filing_Report_Url'], timeout=60)
                # an error page must not be saved as if it were the narrative
                filing_request.raise_for_status()
            except requests.RequestException as request_error:
                print('cannot retrieve ofp narrative:', request_error)
            else:
                filing_soup = BeautifulSoup(filing_request.content, 'lxml')
                ofp_start = int(row['OFP_Start_Position'])
                ofp_end = int(row['OFP_End_Position'])
                # the positions index into the string form of the lxml-parsed document
                filing_soup_ofp_string = str(filing_soup)[ofp_start:ofp_end]
                filing_soup_ofp = BeautifulSoup(filing_soup_ofp_string, 'lxml')
                filing_ofp_text = file_preprocessing(filing_soup_ofp)
                write_text_file(file_name, filing_ofp_text)
                ofp_records.append(record)
                print('ofp narrative written to text file')
            finally:
                # note: the SEC strictly states that developers should not exceed 10 requests per second
                # delay next request to EDGAR by 1 second (only needed after a request was made)
                time.sleep(1)
    print('\n')

# add the collected rows to master_filing_df and reset its index
new_rows = pd.DataFrame(ofp_records, columns=master_columns)
non_empty_frames = [frame for frame in (master_filing_df, new_rows) if not frame.empty]
if non_empty_frames:
    master_filing_df = pd.concat(non_empty_frames, ignore_index=True)

# end execution time
end_time = time.time()

filing_20f_time = end_time - start_time

print('total time taken:', filing_20f_time)

-------------------------------------------------------------------------
0 TK
file already exists!


-------------------------------------------------------------------------
1 TK
file already exists!


-------------------------------------------------------------------------
2 TK
file already exists!


-------------------------------------------------------------------------
3 TK
file already exists!


-------------------------------------------------------------------------
4 TK
file already exists!


-------------------------------------------------------------------------
5 TK
file already exists!


-------------------------------------------------------------------------
6 TK
file already exists!


-------------------------------------------------------------------------
7 TK
file already exists!


-------------------------------------------------------------------------
8 FRO
cannot retrieve ofp narrative


------------------------------------------------------------------------

-------------------------------------------------------------------------
79 DAC
cannot retrieve ofp narrative


-------------------------------------------------------------------------
80 DAC
cannot retrieve ofp narrative


-------------------------------------------------------------------------
81 DAC
file already exists!


-------------------------------------------------------------------------
82 DAC
cannot retrieve ofp narrative


-------------------------------------------------------------------------
83 DAC
cannot retrieve ofp narrative


-------------------------------------------------------------------------
84 DAC
cannot retrieve ofp narrative


-------------------------------------------------------------------------
85 DAC
cannot retrieve ofp narrative


-------------------------------------------------------------------------
86 TOO
file already exists!


-------------------------------------------------------------------------
87 TOO
file already exists!


----------

-------------------------------------------------------------------------
155 CMRE
cannot retrieve ofp narrative


-------------------------------------------------------------------------
156 CMRE
cannot retrieve ofp narrative


-------------------------------------------------------------------------
157 PACD
file already exists!


-------------------------------------------------------------------------
158 PACD
cannot retrieve ofp narrative


-------------------------------------------------------------------------
159 PACD
cannot retrieve ofp narrative


-------------------------------------------------------------------------
160 PACD
cannot retrieve ofp narrative


-------------------------------------------------------------------------
161 PACD
cannot retrieve ofp narrative


-------------------------------------------------------------------------
162 PACD
file already exists!


-------------------------------------------------------------------------
163 EROS
file already ex

## Other Filings - Retrieve MD&A Narrative and Save to Text File

In [13]:
# note: retrieve filings for which the management's discussion and analysis narrative is stored in separate exhibits
# note: save narratives to text file
# note: this cell adds one row per filing to master_filing_df every time it runs, so re-running it
#       without rebuilding master_filing_df first will duplicate those rows

# store filings for which the management's discussion and analysis narrative is stored in separate exhibits
filing_other_mda_df = filing_page_df.query('MDA_Placement!="N/A" & MDA_Placement!="10-K" & MDA_Placement!="10-KT" & MDA_Placement!="20-F" & MDA_Placement!="40-F"')

# columns of the rows that are added to master_filing_df
master_filing_columns = ['CIK', 'Ticker_Symbol', 'Company_Name', 'Data_Date',
                         'Filing_Date', 'Filing_Type', 'MDA_Placement', 'File_Name']

# collect one row per filing and add them to master_filing_df in a single step after the loop
# note: DataFrame.append was removed in pandas 2.0, and growing a dataframe row by row is quadratic
filing_other_rows = []

# start execution time
start_time = time.time()

# loop through observations in filing_other_mda_df
for idx, row in filing_other_mda_df.iterrows():
    print('-------------------------------------------------------------------------')
    print(idx, row['Ticker_Symbol'])
    file_name = row['Ticker_Symbol'] + '_' + row['Filing_Date'] + '.txt'
    # the filing is recorded whether or not its text file has to be (re)written
    filing_other_rows.append([row['CIK'], row['Ticker_Symbol'], row['Company_Name'],
                              row['Data_Date'], row['Filing_Date'], row['Filing_Type'],
                              row['MDA_Placement'], file_name])
    if os.path.exists(file_name):
        # the narrative is already on disk: skip the download and parsing, whose result would be discarded
        print('file already exists!')
    else:
        filing_request = requests.get(url=row['Filing_Report_Url'])
        # NOTE(review): the response status is not checked, so an error page would be parsed and saved
        #               like a filing (and later reported as 'file already exists!') - consider checking it
        filing_soup_mda = BeautifulSoup(filing_request.content, 'lxml')
        filing_mda_text = file_preprocessing(filing_soup_mda)
        write_text_file(file_name, filing_mda_text)
        print('md&a narrative written to text file')
        # note: the SEC strictly states that developers should not exceed 10 requests per second
        # delay next request to EDGAR by 1 second (only needed when a request was actually made)
        time.sleep(1)
    print('\n')

# add the collected rows to master_filing_df in one concatenation
if filing_other_rows:
    filing_other_observations = pd.DataFrame(filing_other_rows, columns=master_filing_columns)
    master_filing_df = pd.concat([master_filing_df, filing_other_observations])

# reset index for master_filing_df
master_filing_df = master_filing_df.reset_index(drop=True)

# end execution time
end_time = time.time()

filing_other_time = end_time - start_time

print('total time taken:', filing_other_time)

-------------------------------------------------------------------------
265 SU
file already exists!


-------------------------------------------------------------------------
292 KGC
file already exists!


-------------------------------------------------------------------------
293 KGC
file already exists!


-------------------------------------------------------------------------
294 KGC
file already exists!


-------------------------------------------------------------------------
295 KGC
file already exists!


-------------------------------------------------------------------------
296 KGC
file already exists!


-------------------------------------------------------------------------
297 KGC
file already exists!


-------------------------------------------------------------------------
298 KGC
file already exists!


-------------------------------------------------------------------------
299 KGC
file already exists!


--------------------------------------------------------

-------------------------------------------------------------------------
868 EXK
file already exists!


-------------------------------------------------------------------------
895 AG
file already exists!


-------------------------------------------------------------------------
896 AG
file already exists!


-------------------------------------------------------------------------
909 HBM
file already exists!


-------------------------------------------------------------------------
910 HBM
file already exists!


-------------------------------------------------------------------------
911 HBM
file already exists!


-------------------------------------------------------------------------
912 HBM
file already exists!


-------------------------------------------------------------------------
913 HBM
file already exists!


-------------------------------------------------------------------------
914 HBM
file already exists!


---------------------------------------------------------

## Save master_filing_df to CSV File

In [14]:
# step out of the current working directory into its parent
# (the 'Prototype Two Notebooks' folder, according to the original note)
os.chdir('..')

# write master_filing_df to disk as csv, leaving the dataframe index out of the file
master_filing_csv = 'master_filing_df.csv'
master_filing_df.to_csv(master_filing_csv, index=False)