# Scrape Financial Metrics
### This will output dataframes containing financial metrics

In [2]:
# import our libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
# Root of the SEC site; the directory index reports its path relative to it.
base_url = r"https://www.sec.gov"

# SEC's fair-access guidance asks automated clients to declare who they are.
# Replace the placeholder with your own name and contact e-mail.
sec_headers = {"User-Agent": "Your Name your.email@example.com"}

# Convert the full-text submission URL into the filing's JSON directory index:
# strip the dashes from the accession number and swap ".txt" for "/index.json".
normal_url = r"https://www.sec.gov/Archives/edgar/data/1265107/0001265107-19-000004.txt"
normal_url = normal_url.replace('-', '').replace('.txt', '/index.json')

# The converted URL *is* the 10-K landing page index, so reuse it rather than
# keeping a second hard-coded copy of the same address in sync.
documents_url = normal_url

# Request the index, fail loudly on an HTTP error, then decode the JSON body.
response = requests.get(documents_url, headers=sec_headers, timeout=30)
response.raise_for_status()
content = response.json()

# Stays None when the filing has no FilingSummary.xml, so that case can be
# reported clearly instead of surfacing later as a NameError.
xml_summary = None

for file in content['directory']['item']:

    # Grab the filing summary and build the URL it can be downloaded from.
    if file['name'] == 'FilingSummary.xml':

        xml_summary = base_url + content['directory']['name'] + "/" + file['name']

        print('-' * 100)
        print('File Name: ' + file['name'])
        print('File Path: ' + xml_summary)

if xml_summary is None:
    raise LookupError('FilingSummary.xml is not listed in ' + documents_url)

----------------------------------------------------------------------------------------------------
File Name: FilingSummary.xml
File Path: https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/FilingSummary.xml


In [4]:
# Show the FilingSummary.xml URL located by the previous cell.
print(xml_summary)

https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/FilingSummary.xml


# Grab the XML Filing Summary
TODO: automate building the filing URL instead of hard-coding it.

In [5]:
# Root of the SEC site; the directory index reports its path relative to it.
base_url = r"https://www.sec.gov"

# SEC's fair-access guidance asks automated clients to declare who they are.
# Replace the placeholder with your own name and contact e-mail.
sec_headers = {"User-Agent": "Your Name your.email@example.com"}

# Full-text submission URL of the filing to scrape. Swap in another filing's
# ".txt" URL (e.g. .../edgar/data/1265107/0001265107-19-000004.txt) to target
# a different report.
normal_url = r"https://www.sec.gov/Archives/edgar/data/2488/0001628280-21-001185.txt"

# Convert it into the filing's JSON directory index: strip the dashes from the
# accession number and swap ".txt" for "/index.json".
normal_url = normal_url.replace('-', '').replace('.txt', '/index.json')

# The converted URL is the landing page index of the filing.
documents_url = normal_url

# Request the index, fail loudly on an HTTP error, then decode the JSON body.
response = requests.get(documents_url, headers=sec_headers, timeout=30)
response.raise_for_status()
content = response.json()

# Stays None when the filing has no FilingSummary.xml, so that case can be
# reported clearly instead of surfacing later as a NameError.
xml_summary = None

for file in content['directory']['item']:

    # Grab the filing summary and build the URL it can be downloaded from.
    if file['name'] == 'FilingSummary.xml':

        xml_summary = base_url + content['directory']['name'] + "/" + file['name']
        print(xml_summary)

        print('-' * 100)
        print('File Name: ' + file['name'])
        print('File Path: ' + xml_summary)

if xml_summary is None:
    raise LookupError('FilingSummary.xml is not listed in ' + documents_url)

https://www.sec.gov/Archives/edgar/data/2488/000162828021001185/FilingSummary.xml
----------------------------------------------------------------------------------------------------
File Name: FilingSummary.xml
File Path: https://www.sec.gov/Archives/edgar/data/2488/000162828021001185/FilingSummary.xml


In [8]:
# Full-text (.txt) submission URL of the same filing, located inside the
# dash-less accession directory. Not referenced by the cells visible here.
test_url = r"https://www.sec.gov/Archives/edgar/data/2488/000162828021001185/0001628280-21-001185.txt"


In [9]:
# Directory of the filing: each report's HTML file name is appended to it.
base_url = xml_summary.replace('FilingSummary.xml', '')

# SEC's fair-access guidance asks automated clients to declare who they are.
# Replace the placeholder with your own name and contact e-mail.
sec_headers = {"User-Agent": "Your Name your.email@example.com"}

# Request the filing summary and fail loudly on an HTTP error.
response = requests.get(xml_summary, headers=sec_headers, timeout=30)
response.raise_for_status()
content = response.content

# 'lxml' is an HTML parser, so tag names are lowercased while parsing; that is
# why every lookup below uses all-lowercase names.
soup = BeautifulSoup(content, 'lxml')

# The 'myreports' tag contains all the individual reports submitted.
reports = soup.find('myreports')
if reports is None:
    raise ValueError('No myreports element found in ' + xml_summary)

# Master list holding one dictionary per report.
master_reports = []

# Skip the final entry: per the original author it raises an error
# (presumably it lacks some of the fields read below -- verify).
for report in reports.find_all('report')[:-1]:

    # Read each field once and keep it in a dictionary.
    report_dict = {
        'name_short': report.shortname.text,
        'name_long': report.longname.text,
        'position': report.position.text,
        'category': report.menucategory.text,
        'url': base_url + report.htmlfilename.text,
    }

    # append the dictionary to the master list.
    master_reports.append(report_dict)

    # print the info to the user.
    print('-'*100)
    print(report_dict['url'])
    print(report_dict['name_long'])
    print(report_dict['name_short'])
    print(report_dict['category'])
    print(report_dict['position'])

----------------------------------------------------------------------------------------------------
https://www.sec.gov/Archives/edgar/data/2488/000162828021001185/R1.htm
0001001 - Document - Cover Page
Cover Page
Cover
1
----------------------------------------------------------------------------------------------------
https://www.sec.gov/Archives/edgar/data/2488/000162828021001185/R2.htm
1001002 - Statement - Consolidated Statements of Operations
Consolidated Statements of Operations
Statements
2
----------------------------------------------------------------------------------------------------
https://www.sec.gov/Archives/edgar/data/2488/000162828021001185/R3.htm
1002003 - Statement - Consolidated Statements of Comprehensive Income
Consolidated Statements of Comprehensive Income
Statements
3
----------------------------------------------------------------------------------------------------
https://www.sec.gov/Archives/edgar/data/2488/000162828021001185/R4.htm
1003004 - Statement

In [10]:
# create the list to hold the statement urls
statements_url = []

for report_dict in master_reports:
    
    # define the statements we want to look for.
    item1 = r"Consolidated Balance Sheets"
    item2 = r"Consolidated Statements of Comprehensive Income"
    item3 = r"Consolidated Statements of Cash Flows"
    item4 = r"Consolidated Statements of Operations"
    
    # store them in a list.
    report_list = [item1, item2, item3, item4]
    
    # if the short name can be found in the report list.
    if report_dict['name_short'] in report_list:
        
        # print some info and store it in the statements url.
        print('-'*100)
        print(report_dict['name_short'])
        print(report_dict['url'])
        
        statements_url.append(report_dict['url'])

----------------------------------------------------------------------------------------------------
Consolidated Statements of Operations
https://www.sec.gov/Archives/edgar/data/2488/000162828021001185/R2.htm
----------------------------------------------------------------------------------------------------
Consolidated Statements of Comprehensive Income
https://www.sec.gov/Archives/edgar/data/2488/000162828021001185/R3.htm
----------------------------------------------------------------------------------------------------
Consolidated Balance Sheets
https://www.sec.gov/Archives/edgar/data/2488/000162828021001185/R4.htm
----------------------------------------------------------------------------------------------------
Consolidated Statements of Cash Flows
https://www.sec.gov/Archives/edgar/data/2488/000162828021001185/R7.htm


# Scraping Financial Statements

In [14]:
# let's assume we want all the statements in a single data set.
statements_data = []

# loop through each statement url
for statement in statements_url:

    # define a dictionary that will store the different parts of the statement.
    statement_data = {}
    statement_data['headers'] = []
    statement_data['sections'] = []
    statement_data['data'] = []
    
    # request the statement file content
    content = requests.get(statement).content
    report_soup = BeautifulSoup(content, 'html')

    # find all the rows, figure out what type of row it is, parse the elements, and store in the statement file list.
    for index, row in enumerate(report_soup.table.find_all('tr')):
        
        # first let's get all the elements.
        cols = row.find_all('td')
        
        # if it's a regular row and not a section or a table header
        if (len(row.find_all('th')) == 0 and len(row.find_all('strong')) == 0): 
            reg_row = [ele.text.strip() for ele in cols]
            statement_data['data'].append(reg_row)
            
        # if it's a regular row and a section but not a table header
        elif (len(row.find_all('th')) == 0 and len(row.find_all('strong')) != 0):
            sec_row = cols[0].text.strip()
            statement_data['sections'].append(sec_row)
            
        # finally if it's not any of those it must be a header
        elif (len(row.find_all('th')) != 0):            
            hed_row = [ele.text.strip() for ele in row.find_all('th')]
            statement_data['headers'].append(hed_row)
            
        else:            
            print('We encountered an error.')

    # append it to the master list.
    statements_data.append(statement_data)

# Converting to DataFrames

In [48]:
def produce_df(i, statements=None):
    """Turn one scraped statement into a numeric DataFrame.

    Args:
        i: Position of the statement in ``statements``.
        statements: List of parsed statements, each a dict with a 'headers'
            list (one list of strings per header row) and a 'data' list (one
            list of cell strings per row, row label first). Defaults to the
            module-level ``statements_data``.

    Returns:
        A float DataFrame indexed by the first cell of every data row (index
        name 'Category'), with the header labels as column names.

    Raises:
        ValueError: If a cell is still non-numeric after cleaning, or the
            header row has fewer labels than there are value columns.
    """
    if statements is None:
        statements = statements_data

    # Grab the proper components
    header_rows = statements[i]['headers']
    income_data = statements[i]['data']

    # Put the data in a DataFrame
    income_df = pd.DataFrame(income_data)

    # The first column holds the row labels: make it the index, name it, and
    # drop the now-redundant column.
    income_df.index = income_df[0]
    income_df.index.name = 'Category'
    income_df = income_df.drop(0, axis=1)

    # Strip '$', ',' and ')'; turn '(' into a minus sign so "(47)" becomes
    # "-47"; mark empty cells as NaN so the float conversion accepts them.
    income_df = (
        income_df.replace(r'[$,)]', '', regex=True)
                 .replace(r'[(]', '-', regex=True)
                 .replace('', 'NaN')
    )

    # convert string to float
    income_df = income_df.astype(float)

    # Use the second header row when there is one, otherwise fall back to the
    # only header row instead of failing with an IndexError.
    income_header = header_rows[1] if len(header_rows) > 1 else header_rows[0]

    # A header row may carry extra leading cells (presumably the statement
    # title when there is a single header row); keep one label per value column.
    extra = len(income_header) - len(income_df.columns)
    if extra > 0:
        income_header = income_header[extra:]

    # Change the column headers
    income_df.columns = income_header

    # To persist the result, call e.g. .to_csv('income_state.csv') on it.
    return income_df

In [49]:
# Statement 0 is the first URL collected above: Consolidated Statements of Operations.
produce_df(0)

Unnamed: 0_level_0,"Dec. 26, 2020","Dec. 28, 2019","Dec. 29, 2018"
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Net revenue,9763.0,6731.0,6475.0
Cost of sales,5416.0,3863.0,4028.0
Gross profit,4347.0,2868.0,2447.0
Research and development,1983.0,1547.0,1434.0
"Marketing, general and administrative",995.0,750.0,562.0
Operating income,1369.0,631.0,451.0
Interest expense,-47.0,-94.0,-121.0
"Other expense, net",-47.0,-165.0,0.0
Income before income taxes and equity income (loss),1275.0,372.0,330.0
Income tax provision (benefit),-1210.0,31.0,-9.0


In [50]:
# Statement 1 is the second URL collected above: Consolidated Statements of Comprehensive Income.
produce_df(1)

Unnamed: 0_level_0,"Dec. 26, 2020","Dec. 28, 2019","Dec. 29, 2018"
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Net income,2490.0,341.0,337.0
Net change in unrealized gains (losses) on cash flow hedges,17.0,8.0,-14.0
Total comprehensive income,2507.0,349.0,325.0
"Cumulative Effect, Period of Adoption, Adjustment | Debt Investment",,,
Total comprehensive income,0.0,0.0,2.0


In [51]:
# Statement 3 is the fourth URL collected above: Consolidated Statements of Cash Flows.
produce_df(3)

Unnamed: 0_level_0,"Dec. 26, 2020","Dec. 28, 2019","Dec. 29, 2018"
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Net income,2490.0,341.0,337.0
Depreciation and amortization,312.0,222.0,170.0
Stock-based compensation,274.0,197.0,137.0
Amortization of debt discount and issuance costs,14.0,30.0,38.0
Amortization of operating lease right-of-use assets,42.0,36.0,0.0
"Loss on debt redemption, repurchase and conversion",54.0,176.0,12.0
Loss on sale/disposal of property and equipment,33.0,42.0,27.0
Impairment of technology licenses,0.0,0.0,45.0
Deferred income taxes,-1223.0,-7.0,-4.0
Other,6.0,-2.0,-1.0
