# Chapter 12: Sample Notebook

This notebook contains all code from Chapter 12: _Collecting Data from the Internet_.

## 12.2 EDGAR Data

### 12.2.1 EDGAR Index Files

In [None]:
import os
import urllib.request
from pathlib import Path

def get_index(start_year: int, end_year: int, down_direct: str):
    """Download the quarterly SEC EDGAR master index files.

    For every year from start_year through end_year (inclusive) and
    for each of the four quarters, ``master.idx`` is fetched from the
    EDGAR full-index archive and stored inside down_direct under the
    name ``master<year><qtr>.idx`` (for example ``master20181.idx``).

    Parameters:
    start_year -> First year to download
    end_year -> Last year to download (inclusive)
    down_direct -> Directory to download files to; it is created
                   (including parents) when it does not exist

    Returns None.  Any error raised by urllib.request.urlretrieve
    (network failure, HTTP error) propagates to the caller.
    """
    print('Downloading Index Files')
    # Make sure the download folder exists before writing into it.
    os.makedirs(down_direct, exist_ok=True)
    # NOTE(review): urlretrieve sends urllib's default User-Agent;
    # confirm that sec.gov accepts it for automated downloads.
    for year in range(start_year, end_year + 1):
        for qtr in range(1, 5):
            # Location of this quarter's index on EDGAR.
            url = ('https://www.sec.gov/Archives/edgar/full-index/'
                   + str(year) + '/' + 'QTR' + str(qtr) + '/master.idx')
            # Build the target with os.path.join so the file is
            # placed INSIDE down_direct whether or not the caller
            # supplied a trailing path separator.
            dl_file = os.path.join(
                down_direct, 'master' + str(year) + str(qtr) + '.idx')
            urllib.request.urlretrieve(url, dl_file)
            # Report the file that was just written.
            print('Downloaded', dl_file, end='\n')
    print('Downloading of Index Files Complete')
    return

# Folder that receives the downloaded index files:
# <home directory>/edgar/indexfiles
down_direct = str(Path.home() / 'edgar' / 'indexfiles')
# Run get_index to fetch the index files for 2018 and 2019
# into that folder.
get_index(start_year=2018, end_year=2019, down_direct=down_direct)

### 12.2.2 Download SEC Filings

In [None]:
import urllib.request
import shutil
import os
import re
from pathlib import Path

def get_files(start_year: int, end_year: int,
              reform: str,
              inddirect: str, odirect: str,
              max_per_qtr=4):
    """
    Downloads SEC filings whose index line matches a form regex.

    For each year/quarter the matching ``master<year><qtr>.idx`` file
    in inddirect is read (it is downloaded first when it is not
    readable).  Every line matching reform that also contains an
    ``edgar/data/.../<name>.txt`` path is downloaded into
    ``<odirect>/<year>/<name>.txt`` unless that file already exists.

    start_year -> First Year to download
    end_year -> Last Year to download (inclusive)
    reform -> Regex to specify forms to be downloaded
              (compiled case-insensitively)
    inddirect -> Directory containing index files
    odirect -> Directory the filings will be downloaded to
    max_per_qtr -> Maximum number of NEW filings downloaded per
                   quarter (default 4, the original sample limit).
                   Pass None to download every matching filing.

    Returns None.  Download errors from urllib propagate.
    """

    print('Downloading Filings')

    # Regex to identify the form to download.
    re_formtype = re.compile(reform, re.IGNORECASE)
    # Regex to extract file name information from a line.
    # Group 1 is the path below /Archives/, group 2 just the
    # file name.
    re_fullfilename = re.compile(r"\|(edgar/data.*\/([\d-]+\.txt))", re.IGNORECASE)

    for year in range(start_year, end_year + 1):
        # One output folder per year.
        download_path = os.path.join(odirect, str(year))
        os.makedirs(download_path, exist_ok=True)

        for qtr in range(1, 5):
            # Name of index file to be read.
            dl_file = os.path.join(inddirect, 'master' + str(year) + str(qtr) + '.idx')

            # Fetch the index file when it is not available locally.
            if not os.access(dl_file, os.R_OK):
                index_dir = os.path.dirname(dl_file)
                if index_dir:
                    # urlretrieve cannot create missing folders.
                    os.makedirs(index_dir, exist_ok=True)
                url = 'https://www.sec.gov/Archives/edgar/full-index/' + str(year) + '/' + 'QTR' + str(qtr) + '/master.idx'
                urllib.request.urlretrieve(url, dl_file)

            # Counts only files actually downloaded this quarter;
            # filings that are already on disk do not count.
            downloaded = 0

            # latin-1 maps every byte to a character, so a stray
            # non-UTF-8 byte in an index line can no longer abort
            # the run; both patterns above only look at ASCII text.
            with open(dl_file, 'r', encoding='latin-1') as f:
                for line in f:
                    # Stop reading once the quota is reached.
                    if max_per_qtr is not None and downloaded >= max_per_qtr:
                        break
                    # Skip lines for other form types.
                    if not re_formtype.search(line):
                        continue
                    matches = re_fullfilename.search(line)
                    if not matches:
                        continue
                    # URL of the filing and local target file.
                    url = 'https://www.sec.gov/Archives/' + matches.group(1)
                    outfile = os.path.join(download_path, matches.group(2))
                    # Skip filings that were already downloaded.
                    if os.path.isfile(outfile) and os.access(outfile, os.R_OK):
                        continue
                    print("Downloading:" + str(outfile), end='\n')
                    urllib.request.urlretrieve(url, outfile)
                    downloaded += 1
    print('Downloading of Filings Complete', end='\n')
    return
                                                       
# Regular expression selecting the filings to download.  It is
# searched for in each pipe-delimited index line and compiled
# case-insensitively by get_files.  The pattern below covers
# 10-K, 10K, 10-KSB, 10-KSB40 and 10-K405.  It is a raw string
# so that "\|" and "\s" reach the regex engine unchanged instead
# of being read as (invalid) Python string escapes.
reform = r'(\|10-?k(sb|sb40|405)?\s*\|)'

# Specify location of the index files.
inddirect = os.path.join(Path.home(), 'edgar', 'indexfiles')

# Specify where to download filings to
odirect = os.path.join(Path.home(), 'edgar', '10K')

# Execute the get_files function
get_files(2018, 2019, reform, inddirect, odirect)

### 12.2.3 Read Filings

In [None]:
import os
import re
import pandas as pd
import glob

def process_header(start_year: int, end_year: int,
                   filings: str, outfile: str):
    """ Extracts header information from 10-K filings.

    Every ``<filings>/<year>/*.txt`` file is read line by line up to
    the ``</SEC-HEADER>`` tag.  The first match of each header
    pattern is kept; fields that are never found keep the sentinel
    value -99.  Files in which no ``<SEC-DOCUMENT>...txt`` name is
    found are left out of the result.

    Parameters:
    start_year -> First Year to process
    end_year -> Last Year to process (inclusive)
    filings -> Directory containing files to process
    outfile -> CSV file output

    Returns the pandas DataFrame that was written to outfile, with
    the columns file, cik, report_date, file_date, name, sic, hlink.
    """

    # One regular expression per variable we want to extract.  The
    # key (e.g. "cik") is also the column name in the output.
    # "hlink" is not searched for in the filing text; it is applied
    # to the extracted file name to build a link to EDGAR.
    edgar_vars = {
        "file": re.compile(r'<SEC-DOCUMENT>(.*\.txt)', re.IGNORECASE),
        "cik": re.compile(r'^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d{10})', re.IGNORECASE),
        "report_date": re.compile(r'^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d{8})', re.IGNORECASE),
        "file_date": re.compile(r'^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d{8})', re.IGNORECASE),
        "name": re.compile(r'^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.+)', re.IGNORECASE),
        "sic": re.compile(r'^\s*STANDARD\s*INDUSTRIAL\s*CLASSIFICATION:.*?(\d{4})', re.IGNORECASE),
        "hlink": re.compile(r'(.*?(([0]*(\d+))\-(\d{2})\-(\d{6})))', re.IGNORECASE),
    }
    # '</SEC-HEADER>' marks the end of the header section; nothing
    # after that line is read.
    regex_endheader = re.compile(r'</SEC-HEADER>', re.IGNORECASE)

    # Patterns that are matched against the lines of the filing.
    header_patterns = {k: v for k, v in edgar_vars.items() if k != "hlink"}

    # Rows are collected in a list and turned into a DataFrame once
    # at the end instead of growing the frame one row at a time.
    rows = []

    for year in range(start_year, end_year + 1):
        # All .txt files in this year's folder.
        path = os.path.join(filings, str(year), '*.txt')
        for filename in glob.glob(path):
            # Start with every field set to -99 so each key is
            # defined even when its value cannot be found.
            header_vars = dict.fromkeys(edgar_vars, -99)

            # errors='replace' keeps a stray undecodable byte from
            # aborting the whole run; "with" closes the file even
            # if processing fails.
            with open(filename, 'r', encoding='utf-8', errors='replace') as f:
                for line in f:
                    for k, v in header_patterns.items():
                        # Only the first occurrence is recorded.
                        if header_vars[k] != -99:
                            continue
                        match = v.search(line)
                        if match:
                            header_vars[k] = match.group(1)
                    # Stop at the end of the header section.
                    if regex_endheader.search(line):
                        break

            # Without a file name there is nothing to report.
            if header_vars['file'] == -99:
                continue

            # Build the link to the filing index page on EDGAR.
            # This needs the CIK as well; when it was not found the
            # link stays at -99 rather than failing on the sentinel.
            match = edgar_vars['hlink'].search(header_vars['file'])
            if match and header_vars['cik'] != -99:
                header_vars['hlink'] = (
                    'http://www.sec.gov/Archives/edgar/data/'
                    + header_vars['cik'].lstrip('0') + '/'
                    + match.group(3) + match.group(5) + match.group(6) + '/'
                    + match.group(2) + '-index.htm'
                )
            rows.append(header_vars)

    # Column order follows the order of the keys in edgar_vars.
    eframe = pd.DataFrame(rows, columns=list(edgar_vars))

    # Write to csv file
    eframe.to_csv(outfile, sep=",", encoding='utf-8')
    print(f'Header File: {outfile} created')
    return eframe

# Folder holding the downloaded filings that will be processed.
filings = str(Path.home() / 'edgar' / '10K')
# CSV file the extracted header data is written to; change the
# file name here if you want a different output file.
outfile = str(Path.home() / 'edgar' / 'filingsoutput.csv')

# Process the 2018 and 2019 filings and show the first rows.
edgar_dat = process_header(start_year=2018, end_year=2019,
                           filings=filings, outfile=outfile)
edgar_dat.head()

## 12.3 Web Scraping

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from dateutil import parser as dateparse
import re
import os
import urllib.request

def get_aaers(start_year: int, end_year: int,
              down_folder: str, csv_file: str):
    """Scrape the SEC's yearly AAER listings and download each release.

    For every year from start_year through end_year (inclusive) the
    listing page is fetched and parsed.  Each table row whose first
    cell holds a hyperlink becomes one record, and the linked
    document is saved as ``<down_folder>/<aaer_number>.pdf`` or
    ``.htm`` unless that file already exists.

    Parameters:
    start_year -> First year to process
    end_year -> Last year to process (inclusive)
    down_folder -> Folder the AAER documents are downloaded to;
                   created when it does not exist
    csv_file -> CSV file the list of AAERs is written to

    Returns a pandas DataFrame with the columns aaer_number, date
    (formatted YYYYMMDD), defendant and link.  HTTP errors on a
    listing page raise requests.HTTPError; download errors from
    urllib propagate.
    """
    from urllib.parse import urljoin

    column_names = ['aaer_number', 'date', 'defendant', 'link']
    # Records are gathered here and converted to a DataFrame once.
    records = []

    # Neither urlretrieve nor to_csv creates missing folders.
    os.makedirs(down_folder, exist_ok=True)
    csv_dir = os.path.dirname(csv_file)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)

    # loop through each year of AAERS
    for year in range(start_year, end_year + 1):
        print(year)
        # Define the URL to get
        url = 'https://www.sec.gov/divisions/enforce/friactions/friactions' + str(year) + '.shtml'
        # Download the page; fail early on an HTTP error instead of
        # parsing an error page, and do not wait forever.
        # NOTE(review): confirm sec.gov accepts the default
        # User-Agent sent by requests/urllib for automated access.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # parse the HTML.
        soup = BeautifulSoup(response.text, 'lxml')
        # The AAER table is the 5th table in the document prior to
        # 2016. It is the first table after that.
        idx = 0 if year > 2015 else 4
        table = soup.find_all('table')[idx]
        # loop through each row - each AAER
        for row in table.find_all('tr'):
            columns = row.find_all('td')
            # Number, date and defendant are read from the first
            # three cells; skip header or spacer rows.
            if len(columns) < 3:
                continue
            # The first cell must contain a hyperlink.
            anchor = columns[0].find('a')
            href = anchor.get('href') if anchor else None
            if not href:
                continue
            # Resolve the href against the page URL.  Root-relative
            # hrefs give the same result as prefixing the host;
            # absolute hrefs are kept as they are.
            link = urljoin(url, str(href))
            # Keep only the digits of the AAER number
            # (e.g., "AAER-1209" -> "1209").
            digits = re.findall(r"\d+", anchor.get_text())
            if not digits:
                continue
            aaer_num = digits[0]
            # Parse the date and normalise it to YYYYMMDD.
            dt = dateparse.parse(str(columns[1].contents[0]))
            aaerdate = dt.strftime('%Y%m%d')
            # Store plain strings rather than bs4 objects, which
            # would keep the whole parsed page alive.
            defendant = str(columns[2].contents[0])
            records.append([aaer_num, aaerdate, defendant, link])
            # Decide the file extension from the link.
            ftype = '.pdf' if 'pdf' in link else '.htm'
            # name of file to download to
            dl_file = os.path.join(down_folder, str(aaer_num) + ftype)

            print(dl_file)

            # download the file if it has not already been
            # downloaded.
            if not (os.path.isfile(dl_file) and os.access(dl_file, os.R_OK)):
                urllib.request.urlretrieve(link, dl_file)

    aaer_table = pd.DataFrame(records, columns=column_names)
    # output the pandas table to a csv file
    aaer_table.to_csv(csv_file, index=False)
    return aaer_table

# Folder the AAER documents are saved in: <home directory>/aaers
down_folder = str(Path.home() / 'aaers')
# CSV file, inside that folder, that will list all AAERs.
csv_file = str(Path(down_folder) / 'aaers.csv')
# Download the AAERs for 1999-2019 and build the pandas table.
aaers = get_aaers(start_year=1999, end_year=2019,
                  down_folder=down_folder, csv_file=csv_file)
# Display the table.
aaers