# Introduction

In [19]:
# Author: Alessandro Fassina
# Content: this code extracts data from the 10-K filings of three companies: Microsoft, Apple, and Google. The data is then stored in a .csv file for further analysis.
# Inputs
# - API key for the SEC EDGAR database: 'd6f32483c70c26c69ef8cae4010108a9667647438edd7c4d9876157d2ec3b82c'
# - URL links to the 10-K filings of Microsoft, Apple, and Google.
# Output
# - A .csv file containing the extracted data from the 10-K filings of the three companies.

In [20]:
# %pip install -q sec-api

In [21]:
%run ./99_helper_functions.py

# 1. Basic Imports and Setup

In [22]:
import pandas as pd 
from sec_api import XbrlApi

In [23]:
API_KEY = 'd6f32483c70c26c69ef8cae4010108a9667647438edd7c4d9876157d2ec3b82c'

In [24]:

xbrlApi = XbrlApi(API_KEY)

In [25]:
# Lists of items to extract

# Income Statement
# Operating expenses = Gross Profit - Operating Income
items1 = ['RevenueFromContractWithCustomerExcludingAssessedTax', # Total Revenue
          'CostOfGoodsAndServicesSold', # Cost of goods sold
          'GrossProfit', # Gross Profit
          'OperatingIncomeLoss', # Operating Income
          'NetIncomeLoss' # Net Income
          ]

# Balance Sheet
# Long-term assets = Total Assets - Current Assets
# Long-term liabilities = Total Liabilities - Current Liabilities
# Equity = Assets - Liabilities
items2 = ['AssetsCurrent', # Current Assets
          'LiabilitiesCurrent', # Current Liabilities
          'Assets', # Total Assets
          'Liabilities', # Total Liabilities
          'LiabilitiesAndStockholdersEquity' # Total Liabilities and Shareholders' Equity
          ]

# Cash Flow Statement
items3 = ['NetCashProvidedByUsedInOperatingActivities', # Cash from operating activities
          'NetCashProvidedByUsedInInvestingActivities', # Cash from investing activities
          'NetCashProvidedByUsedInFinancingActivities' # Cash from financing activities
          ]

# 2. Load Data

In [26]:
# URL of Microsoft's 10-K filings
url_10k_msft = 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0000789019/000095017024087843/msft-20240630.htm'
url_10k_msft_2023 = 'https://www.sec.gov/ix?doc=/Archives/edgar/data/789019/000095017023035122/msft-20230630.htm'
url_10k_aapl = 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0000320193/000032019324000123/aapl-20240928.htm'
url_10k_aapl_2023 = 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930.htm'
url_10k_tsla = 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0001318605/000162828025003063/tsla-20241231.htm'
url_10k_tsla_2023 = 'https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231.htm'

# Convert the XBRL data to JSON format
msft_xbrl_json = xbrlApi.xbrl_to_json(htm_url=url_10k_msft)
msft_xbrl_json_2023 = xbrlApi.xbrl_to_json(htm_url=url_10k_msft_2023)
aapl_xbrl_json = xbrlApi.xbrl_to_json(htm_url=url_10k_aapl)
aapl_xbrl_json_2023 = xbrlApi.xbrl_to_json(htm_url=url_10k_aapl_2023)
tsla_xbrl_json = xbrlApi.xbrl_to_json(htm_url=url_10k_tsla)
tsla_xbrl_json_2023 = xbrlApi.xbrl_to_json(htm_url=url_10k_tsla_2023)

In [27]:
print("Keys of income statement dictionary in XBRL from Microsoft's 10-K filing")
print('--------------------------------------------------------------------')
print(*list(msft_xbrl_json['StatementsOfIncome'].keys()), sep='\n')

Keys of income statement dictionary in XBRL from Microsoft's 10-K filing
--------------------------------------------------------------------
RevenueFromContractWithCustomerExcludingAssessedTax
CostOfGoodsAndServicesSold
GrossProfit
ResearchAndDevelopmentExpense
SellingAndMarketingExpense
GeneralAndAdministrativeExpense
OperatingIncomeLoss
NonoperatingIncomeExpense
IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest
IncomeTaxExpenseBenefit
NetIncomeLoss
EarningsPerShareBasic
EarningsPerShareDiluted
WeightedAverageNumberOfSharesOutstandingBasic
WeightedAverageNumberOfDilutedSharesOutstanding


In [28]:
print("Keys of balance sheets dictionary in XBRL from Microsoft's 10-K filing")
print('--------------------------------------------------------------------')
print(*list(msft_xbrl_json['BalanceSheets'].keys()), sep='\n')

Keys of balance sheets dictionary in XBRL from Microsoft's 10-K filing
--------------------------------------------------------------------
CashAndCashEquivalentsAtCarryingValue
ShortTermInvestments
CashCashEquivalentsAndShortTermInvestments
AccountsReceivableNetCurrent
InventoryNet
OtherAssetsCurrent
AssetsCurrent
PropertyPlantAndEquipmentNet
OperatingLeaseRightOfUseAsset
LongTermInvestments
Goodwill
FiniteLivedIntangibleAssetsNet
OtherAssetsNoncurrent
Assets
AccountsPayableCurrent
CommercialPaper
LongTermDebtCurrent
EmployeeRelatedLiabilitiesCurrent
AccruedIncomeTaxesCurrent
ContractWithCustomerLiabilityCurrent
OtherLiabilitiesCurrent
LiabilitiesCurrent
LongTermDebtNoncurrent
AccruedIncomeTaxesNoncurrent
ContractWithCustomerLiabilityNoncurrent
DeferredIncomeTaxLiabilitiesNet
OperatingLeaseLiabilityNoncurrent
OtherLiabilitiesNoncurrent
Liabilities
CommitmentsAndContingencies
CommonStocksIncludingAdditionalPaidInCapital
RetainedEarningsAccumulatedDeficit
AccumulatedOtherComprehensiveIn

In [29]:
print("Keys of Cash Flows dictionary in XBRL from Microsoft's 10-K filing")
print('--------------------------------------------------------------------')
print(*list(msft_xbrl_json['StatementsOfCashFlows'].keys()), sep='\n')

Keys of Cash Flows dictionary in XBRL from Microsoft's 10-K filing
--------------------------------------------------------------------
NetIncomeLoss
DepreciationAmortizationAndOther
ShareBasedCompensation
GainLossOnInvestmentsAndDerivativeInstruments
DeferredIncomeTaxExpenseBenefit
IncreaseDecreaseInAccountsReceivable
IncreaseDecreaseInInventories
IncreaseDecreaseInOtherCurrentAssets
IncreaseDecreaseInOtherNoncurrentAssets
IncreaseDecreaseInAccountsPayable
IncreaseDecreaseInContractWithCustomerLiability
IncreaseDecreaseInAccruedIncomeTaxesPayable
IncreaseDecreaseInOtherCurrentLiabilities
IncreaseDecreaseInOtherNoncurrentLiabilities
NetCashProvidedByUsedInOperatingActivities
ProceedsFromRepaymentsOfShortTermDebtMaturingInThreeMonthsOrLess
ProceedsFromDebtMaturingInMoreThanThreeMonths
RepaymentsOfDebtMaturingInMoreThanThreeMonths
ProceedsFromIssuanceOfCommonStock
PaymentsForRepurchaseOfCommonStock
PaymentsOfDividendsCommonStock
ProceedsFromPaymentsForOtherFinancingActivities
NetCashProv

In [30]:
import re

In [31]:
for i in range(len(list(msft_xbrl_json['StatementsOfCashFlows'].keys()))):
    if re.search('NetCashProvidedByUsedIn', list(msft_xbrl_json['StatementsOfCashFlows'].keys())[i]):
        print(list(msft_xbrl_json['StatementsOfCashFlows'].keys())[i])

NetCashProvidedByUsedInOperatingActivities
NetCashProvidedByUsedInFinancingActivities
NetCashProvidedByUsedInInvestingActivities


In [47]:
income_statement_msft = get_income_statement(msft_xbrl_json)
income_statement_aapl = get_income_statement(aapl_xbrl_json)
income_statement_tsla = get_income_statement(tsla_xbrl_json)

income_statement_msft = income_statement_msft.loc[[item for item in items1 if item in income_statement_msft.index]]
income_statement_aapl = income_statement_aapl.loc[[item for item in items1 if item in income_statement_aapl.index]]
income_statement_tsla = income_statement_tsla.loc[[item for item in items1 if item in income_statement_tsla.index]]

# reset index and name it
income_statement_msft = income_statement_msft.reset_index().rename(columns={'index': 'Variables'})
income_statement_aapl = income_statement_aapl.reset_index().rename(columns={'index': 'Variables'})
income_statement_tsla = income_statement_tsla.reset_index().rename(columns={'index': 'Variables'})

# pivot 
income_statement_msft_melt = income_statement_msft.melt(id_vars=["Variables"], var_name="Date", value_name="Value")
income_statement_aapl_melt = income_statement_aapl.melt(id_vars=["Variables"], var_name="Date", value_name="Value")
income_statement_tsla_melt = income_statement_tsla.melt(id_vars=["Variables"], var_name="Date", value_name="Value")

# Keep only last part of Date string column (i.e., keep only the date of the filing)
income_statement_msft_melt['Date'] = income_statement_msft_melt['Date'].apply(lambda x: x.split('_')[1])
income_statement_aapl_melt['Date'] = income_statement_aapl_melt['Date'].apply(lambda x: x.split('_')[1])
income_statement_tsla_melt['Date'] = income_statement_tsla_melt['Date'].apply(lambda x: x.split('_')[1])

# Remove '-' from Date column values and keep only year and month
income_statement_msft_melt['Date'] = income_statement_msft_melt['Date'].apply(lambda x: x[:-2].replace('-', ''))
income_statement_aapl_melt['Date'] = income_statement_aapl_melt['Date'].apply(lambda x: x[:-2].replace('-', ''))
income_statement_tsla_melt['Date'] = income_statement_tsla_melt['Date'].apply(lambda x: x[:-2].replace('-', ''))

print("Income statement of Microsoft's 2023 10-K filing as dataframe")
print('---------------------------------------------------------')
income_statement_msft_melt

Income statement of Microsoft's 2023 10-K filing as dataframe
---------------------------------------------------------


Unnamed: 0,Variables,Date,Value
0,RevenueFromContractWithCustomerExcludingAssess...,202406,245122000000
1,CostOfGoodsAndServicesSold,202406,74114000000
2,GrossProfit,202406,171008000000
3,OperatingIncomeLoss,202406,109433000000
4,NetIncomeLoss,202406,88136000000
5,RevenueFromContractWithCustomerExcludingAssess...,202306,211915000000
6,CostOfGoodsAndServicesSold,202306,65863000000
7,GrossProfit,202306,146052000000
8,OperatingIncomeLoss,202306,88523000000
9,NetIncomeLoss,202306,72361000000


In [46]:
# get balance sheets for each company
balance_sheets_msft = get_balance_sheets(msft_xbrl_json)
balance_sheets_aapl = get_balance_sheets(aapl_xbrl_json)
balance_sheets_tsla = get_balance_sheets(tsla_xbrl_json)

balance_sheets_msft_2023 = get_balance_sheets(msft_xbrl_json_2023)
balance_sheets_aapl_2023 = get_balance_sheets(aapl_xbrl_json_2023)
balance_sheets_tsla_2023 = get_balance_sheets(tsla_xbrl_json_2023)

# join 2024 and 2023 dataframes for each company
balance_sheets_msft = pd.concat([balance_sheets_msft, balance_sheets_msft_2023], axis=1)
balance_sheets_aapl = pd.concat([balance_sheets_aapl, balance_sheets_aapl_2023], axis=1)    
balance_sheets_tsla = pd.concat([balance_sheets_tsla, balance_sheets_tsla_2023], axis=1)

# select only those items that we want
balance_sheets_msft = balance_sheets_msft.loc[items2]
balance_sheets_aapl = balance_sheets_aapl.loc[items2]
balance_sheets_tsla = balance_sheets_tsla.loc[items2]

# drop columns with null values if any
balance_sheets_msft = balance_sheets_msft.dropna(axis=1, how='all')
balance_sheets_aapl = balance_sheets_aapl.dropna(axis=1, how='all')
balance_sheets_tsla = balance_sheets_tsla.dropna(axis=1, how='all')

# drop duplicate columns if any
balance_sheets_msft = balance_sheets_msft.loc[:,~balance_sheets_msft.columns.duplicated()]
balance_sheets_aapl = balance_sheets_aapl.loc[:,~balance_sheets_aapl.columns.duplicated()]
balance_sheets_tsla = balance_sheets_tsla.loc[:,~balance_sheets_tsla.columns.duplicated()]

# reset index and name it
balance_sheets_msft = balance_sheets_msft.reset_index().rename(columns={'index': 'Variables'})
balance_sheets_aapl = balance_sheets_aapl.reset_index().rename(columns={'index': 'Variables'})
balance_sheets_tsla = balance_sheets_tsla.reset_index().rename(columns={'index': 'Variables'})

# pivot 
balance_sheets_msft_melt = balance_sheets_msft.melt(id_vars=["Variables"], var_name="Date", value_name="Value")
balance_sheets_aapl_melt = balance_sheets_aapl.melt(id_vars=["Variables"], var_name="Date", value_name="Value")
balance_sheets_tsla_melt = balance_sheets_tsla.melt(id_vars=["Variables"], var_name="Date", value_name="Value")

# Remove '-' from Date column values and keep only year and month
balance_sheets_msft_melt['Date'] = balance_sheets_msft_melt['Date'].apply(lambda x: x[:-2].replace('-', ''))
balance_sheets_aapl_melt['Date'] = balance_sheets_aapl_melt['Date'].apply(lambda x: x[:-2].replace('-', ''))
balance_sheets_tsla_melt['Date'] = balance_sheets_tsla_melt['Date'].apply(lambda x: x[:-2].replace('-', ''))

print("Balance Sheets of Microsoft's 2023 10-K filing as dataframe")
print('---------------------------------------------------------')
balance_sheets_msft_melt

Balance Sheets of Microsoft's 2023 10-K filing as dataframe
---------------------------------------------------------


Unnamed: 0,Variables,Date,Value
0,AssetsCurrent,202306,184257000000
1,LiabilitiesCurrent,202306,104149000000
2,Assets,202306,411976000000
3,Liabilities,202306,205753000000
4,LiabilitiesAndStockholdersEquity,202306,411976000000
5,AssetsCurrent,202406,159734000000
6,LiabilitiesCurrent,202406,125286000000
7,Assets,202406,512163000000
8,Liabilities,202406,243686000000
9,LiabilitiesAndStockholdersEquity,202406,512163000000


In [44]:
cashflow_msft = get_cashflow(msft_xbrl_json)
cashflow_aapl = get_cashflow(aapl_xbrl_json)
cashflow_tsla = get_cashflow(tsla_xbrl_json)

cashflow_msft_2023 = get_cashflow(msft_xbrl_json_2023)
cashflow_aapl_2023 = get_cashflow(aapl_xbrl_json_2023)
cashflow_tsla_2023 = get_cashflow(tsla_xbrl_json_2023)

cashflow_msft = pd.concat([cashflow_msft, cashflow_msft_2023], axis=1)
cashflow_aapl = pd.concat([cashflow_aapl, cashflow_aapl_2023], axis=1)    
cashflow_tsla = pd.concat([cashflow_tsla, cashflow_tsla_2023], axis=1)

# Get only the items we are interested in
cashflow_msft = cashflow_msft.loc[items3]
cashflow_aapl = cashflow_aapl.loc[items3]
cashflow_tsla = cashflow_tsla.loc[items3]

# drop columns with null values if any
balance_sheets_msft = balance_sheets_msft.dropna(axis=1, how='all')
balance_sheets_aapl = balance_sheets_aapl.dropna(axis=1, how='all')
balance_sheets_tsla = balance_sheets_tsla.dropna(axis=1, how='all')

# drop duplicate columns if any
balance_sheets_msft = balance_sheets_msft.loc[:,~balance_sheets_msft.columns.duplicated()]
balance_sheets_aapl = balance_sheets_aapl.loc[:,~balance_sheets_aapl.columns.duplicated()]
balance_sheets_tsla = balance_sheets_tsla.loc[:,~balance_sheets_tsla.columns.duplicated()]

# reset index and name it
cashflow_msft = cashflow_msft.reset_index().rename(columns={'index': 'Variables'})
cashflow_aapl = cashflow_aapl.reset_index().rename(columns={'index': 'Variables'})
cashflow_tsla = cashflow_tsla.reset_index().rename(columns={'index': 'Variables'})

# pivot 
cashflow_msft_melt = cashflow_msft.melt(id_vars=["Variables"], var_name="Date", value_name="Value")
cashflow_aapl_melt = cashflow_aapl.melt(id_vars=["Variables"], var_name="Date", value_name="Value")
cashflow_tsla_melt = cashflow_tsla.melt(id_vars=["Variables"], var_name="Date", value_name="Value")

cashflow_msft_melt = cashflow_msft_melt.dropna()
cashflow_aapl_melt = cashflow_aapl_melt.dropna()
cashflow_tsla_melt = cashflow_tsla_melt.dropna()

# Keep only last part of Date string column (i.e., keep only the date of the filing)
# Keep only the last nine characters of the Date string column (i.e., keep only the date of the filing)
# Example: '2021-07-01-2022-06-30' -> '20230630'
cashflow_msft_melt['Date'] = cashflow_msft_melt['Date'].apply(lambda x: x[-10:-2].replace('-', ''))
cashflow_aapl_melt['Date'] = cashflow_aapl_melt['Date'].apply(lambda x: x[-10:-2].replace('-', ''))
cashflow_tsla_melt['Date'] = cashflow_tsla_melt['Date'].apply(lambda x: x[-10:-2].replace('-', ''))

print("Cash Flow of Microsoft's 2023 10-K filing as dataframe")
print('---------------------------------------------------------')
cashflow_msft_melt

Cash Flow of Microsoft's 2023 10-K filing as dataframe
---------------------------------------------------------


Unnamed: 0,Variables,Date,Value
3,NetCashProvidedByUsedInOperatingActivities,202206,89035000000
4,NetCashProvidedByUsedInInvestingActivities,202206,-30311000000
5,NetCashProvidedByUsedInFinancingActivities,202206,-58876000000
9,NetCashProvidedByUsedInOperatingActivities,202306,87582000000
10,NetCashProvidedByUsedInInvestingActivities,202306,-22680000000
11,NetCashProvidedByUsedInFinancingActivities,202306,-43935000000
15,NetCashProvidedByUsedInOperatingActivities,202406,118548000000
16,NetCashProvidedByUsedInInvestingActivities,202406,-96970000000
17,NetCashProvidedByUsedInFinancingActivities,202406,-37757000000
24,NetCashProvidedByUsedInOperatingActivities,202106,76740000000
