In [2]:
!pip install -q sec-api

In [3]:
API_KEY = 'd6f32483c70c26c69ef8cae4010108a9667647438edd7c4d9876157d2ec3b82c'

In [4]:
from sec_api import XbrlApi

xbrlApi = XbrlApi(API_KEY)

In [5]:
# URL of Microsoft's 10-K filings
url_10k_msft = 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0000789019/000095017024087843/msft-20240630.htm'
url_10k_aapl = 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0000320193/000032019324000123/aapl-20240928.htm'
url_10k_tsla = 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0001318605/000162828025003063/tsla-20241231.htm'

msft_xbrl_json = xbrlApi.xbrl_to_json(htm_url=url_10k_msft)
aapl_xbrl_json = xbrlApi.xbrl_to_json(htm_url=url_10k_aapl)
tsla_xbrl_json = xbrlApi.xbrl_to_json(htm_url=url_10k_tsla)

In [12]:
print("Keys of income statement dictionary in XBRL from Microsoft's 10-K filing")
print('--------------------------------------------------------------------')
print(*list(msft_xbrl_json['StatementsOfIncome'].keys()), sep='\n')

Keys of income statement dictionary in XBRL from Microsoft's 10-K filing
--------------------------------------------------------------------
RevenueFromContractWithCustomerExcludingAssessedTax
CostOfGoodsAndServicesSold
GrossProfit
ResearchAndDevelopmentExpense
SellingAndMarketingExpense
GeneralAndAdministrativeExpense
OperatingIncomeLoss
NonoperatingIncomeExpense
IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest
IncomeTaxExpenseBenefit
NetIncomeLoss
EarningsPerShareBasic
EarningsPerShareDiluted
WeightedAverageNumberOfSharesOutstandingBasic
WeightedAverageNumberOfDilutedSharesOutstanding


In [14]:
import pandas as pd 

# convert XBRL-JSON of income statement to pandas dataframe
def get_income_statement(xbrl_json):
    income_statement_store = {}

    # iterate over each US GAAP item in the income statement
    for usGaapItem in xbrl_json['StatementsOfIncome']:
        values = []
        indicies = []

        for fact in xbrl_json['StatementsOfIncome'][usGaapItem]:
            # only consider items without segment. not required for our analysis.
            if 'segment' not in fact:
                index = fact['period']['startDate'] + '-' + fact['period']['endDate']
                # ensure no index duplicates are created
                if index not in indicies:
                    values.append(fact['value'])
                    indicies.append(index)                    

        income_statement_store[usGaapItem] = pd.Series(values, index=indicies) 

    income_statement = pd.DataFrame(income_statement_store)
    # switch columns and rows so that US GAAP items are rows and each column header represents a date range
    return income_statement.T 


income_statement_msft = get_income_statement(msft_xbrl_json)
income_statement_aapl = get_income_statement(aapl_xbrl_json)
income_statement_tsla = get_income_statement(tsla_xbrl_json)

income_statement_msft = income_statement_msft.loc[['RevenueFromContractWithCustomerExcludingAssessedTax', 'NetIncomeLoss']]
income_statement_aapl = income_statement_aapl.loc[['RevenueFromContractWithCustomerExcludingAssessedTax', 'NetIncomeLoss']]
income_statement_tsla = income_statement_tsla.loc[['RevenueFromContractWithCustomerExcludingAssessedTax', 'NetIncomeLoss']]

# reset index and name it
income_statement_msft = income_statement_msft.reset_index().rename(columns={'index': 'Variables'})
income_statement_aapl = income_statement_aapl.reset_index().rename(columns={'index': 'Variables'})
income_statement_tsla = income_statement_tsla.reset_index().rename(columns={'index': 'Variables'})

# pivot 
income_statement_msft_melt = income_statement_msft.melt(id_vars=["Variables"], var_name="Date", value_name="Value")
income_statement_aapl_melt = income_statement_aapl.melt(id_vars=["Variables"], var_name="Date", value_name="Value")
income_statement_tsla_melt = income_statement_tsla.melt(id_vars=["Variables"], var_name="Date", value_name="Value")

print("Income statement of Microsoft's 2023 10-K filing as dataframe")
print('---------------------------------------------------------')
income_statement_msft_melt

Income statement of Microsoft's 2023 10-K filing as dataframe
---------------------------------------------------------


Unnamed: 0,2023-07-01-2024-06-30,2022-07-01-2023-06-30,2021-07-01-2022-06-30
RevenueFromContractWithCustomerExcludingAssessedTax,245122000000.0,211915000000.0,198270000000.0
CostOfGoodsAndServicesSold,74114000000.0,65863000000.0,62650000000.0
GrossProfit,171008000000.0,146052000000.0,135620000000.0
ResearchAndDevelopmentExpense,29510000000.0,27195000000.0,24512000000.0
SellingAndMarketingExpense,24456000000.0,22759000000.0,21825000000.0
GeneralAndAdministrativeExpense,7609000000.0,7575000000.0,5900000000.0
OperatingIncomeLoss,109433000000.0,88523000000.0,83383000000.0
NonoperatingIncomeExpense,-1646000000.0,788000000.0,333000000.0
IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest,107787000000.0,89311000000.0,83716000000.0
IncomeTaxExpenseBenefit,19651000000.0,16950000000.0,10978000000.0


In [17]:
print("Keys of balance sheets dictionary in XBRL from Microsoft's 10-K filing")
print('--------------------------------------------------------------------')
print(*list(msft_xbrl_json['BalanceSheets'].keys()), sep='\n')

Keys of balance sheets dictionary in XBRL from Microsoft's 10-K filing
--------------------------------------------------------------------
CashAndCashEquivalentsAtCarryingValue
ShortTermInvestments
CashCashEquivalentsAndShortTermInvestments
AccountsReceivableNetCurrent
InventoryNet
OtherAssetsCurrent
AssetsCurrent
PropertyPlantAndEquipmentNet
OperatingLeaseRightOfUseAsset
LongTermInvestments
Goodwill
FiniteLivedIntangibleAssetsNet
OtherAssetsNoncurrent
Assets
AccountsPayableCurrent
CommercialPaper
LongTermDebtCurrent
EmployeeRelatedLiabilitiesCurrent
AccruedIncomeTaxesCurrent
ContractWithCustomerLiabilityCurrent
OtherLiabilitiesCurrent
LiabilitiesCurrent
LongTermDebtNoncurrent
AccruedIncomeTaxesNoncurrent
ContractWithCustomerLiabilityNoncurrent
DeferredIncomeTaxLiabilitiesNet
OperatingLeaseLiabilityNoncurrent
OtherLiabilitiesNoncurrent
Liabilities
CommitmentsAndContingencies
CommonStocksIncludingAdditionalPaidInCapital
RetainedEarningsAccumulatedDeficit
AccumulatedOtherComprehensiveIn

In [14]:
# convert XBRL-JSON of Balance sheets to pandas dataframe
def get_balance_sheets(xbrl_json):
    balance_sheets_store = {}

    # iterate over each US GAAP item in the income statement
    for usGaapItem in xbrl_json['BalanceSheets']:
        values = []
        indicies = []

        for fact in xbrl_json['BalanceSheets'][usGaapItem]:
            # only consider items without segment. not required for our analysis.
            if 'segment' not in fact:
                index = fact['period']['startDate'] + '-' + fact['period']['endDate']
                # ensure no index duplicates are created
                if index not in indicies:
                    values.append(fact['value'])
                    indicies.append(index)                    

        balance_sheets_store[usGaapItem] = pd.Series(values, index=indicies) 

    balance_sheets = pd.DataFrame(balance_sheets_store)
    # switch columns and rows so that US GAAP items are rows and each column header represents a date range
    return balance_sheets.T 


balance_sheets_msft = get_balance_sheets(msft_xbrl_json)
balance_sheets_aapl = get_balance_sheets(aapl_xbrl_json)
balance_sheets_tsla = get_balance_sheets(tsla_xbrl_json)

balance_sheets_msft = income_statement_msft.loc[['Assets', 'Liabilities']]
balance_sheets_aapl = income_statement_aapl.loc[['Assets', 'Liabilities']]
balance_sheets_tsla = income_statement_tsla.loc[['Assets', 'Liabilities']]

# reset index and name it
income_statement_msft = income_statement_msft.reset_index().rename(columns={'index': 'Variables'})
income_statement_aapl = income_statement_aapl.reset_index().rename(columns={'index': 'Variables'})
income_statement_tsla = income_statement_tsla.reset_index().rename(columns={'index': 'Variables'})

# pivot 
income_statement_msft_melt = income_statement_msft.melt(id_vars=["Variables"], var_name="Date", value_name="Value")
income_statement_aapl_melt = income_statement_aapl.melt(id_vars=["Variables"], var_name="Date", value_name="Value")
income_statement_tsla_melt = income_statement_tsla.melt(id_vars=["Variables"], var_name="Date", value_name="Value")

print("Income statement of Microsoft's 2023 10-K filing as dataframe")
print('---------------------------------------------------------')
income_statement_msft_melt


print("Balance Sheets of Microsoft's 2023 10-K filing as dataframe")
print('---------------------------------------------------------')
balance_sheets_msft

Income statement of Microsoft's 2023 10-K filing as dataframe
---------------------------------------------------------


Unnamed: 0,2023-07-01-2024-06-30,2022-07-01-2023-06-30,2021-07-01-2022-06-30
RevenueFromContractWithCustomerExcludingAssessedTax,245122000000.0,211915000000.0,198270000000.0
CostOfGoodsAndServicesSold,74114000000.0,65863000000.0,62650000000.0
GrossProfit,171008000000.0,146052000000.0,135620000000.0
ResearchAndDevelopmentExpense,29510000000.0,27195000000.0,24512000000.0
SellingAndMarketingExpense,24456000000.0,22759000000.0,21825000000.0
GeneralAndAdministrativeExpense,7609000000.0,7575000000.0,5900000000.0
OperatingIncomeLoss,109433000000.0,88523000000.0,83383000000.0
NonoperatingIncomeExpense,-1646000000.0,788000000.0,333000000.0
IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest,107787000000.0,89311000000.0,83716000000.0
IncomeTaxExpenseBenefit,19651000000.0,16950000000.0,10978000000.0
