# Introduction

In [1]:
# Author: Alessandro Fassina
# Content: this code extracts data from the 10-K filings of three companies: Microsoft, Apple, and Tesla. The data is then stored in a .csv file for further analysis.
# Inputs
# - API key for the sec-api.io XBRL-to-JSON service (a secret: never write its value in the notebook)
# - URL links to the 10-K filings (configured for MSFT, AAPL, TSLA, META and NVDA; MSFT, AAPL and TSLA are processed).
# Output
# - A .csv file containing the extracted data from the 10-K filings of the three companies.

In [2]:
# %pip install -q sec-api

In [3]:
# Execute the helper script inside this kernel so the names it defines become
# available to the cells below.
# NOTE(review): presumably this is where get_income_statement,
# get_balance_sheets and get_cashflow come from -- confirm against the script.
%run ./99_helper_functions.py

# 1. Basic Imports and Setup

In [4]:
import os
import warnings

import pandas as pd
from sec_api import XbrlApi

In [5]:
def read_api_key(environ, env_var='SEC_API_KEY'):
    """Return the sec-api.io key stored under ``env_var`` in ``environ``.

    Parameters
    ----------
    environ : mapping
        Mapping of environment variable names to values (e.g. ``os.environ``).
    env_var : str
        Name of the variable that holds the key.

    Returns
    -------
    str
        The key with surrounding whitespace removed, or ``''`` when the
        variable is missing.
    """
    return environ.get(env_var, '').strip()


# The key is a secret: it is read from the environment instead of being
# written in the notebook, where it would leak through version control and
# shared copies.
# NOTE(review): any key that was ever committed in plain text should be
# treated as compromised and rotated.
API_KEY = read_api_key(os.environ)
if not API_KEY:
    warnings.warn(
        "SEC_API_KEY is not set; export it before starting Jupyter, "
        "otherwise requests to sec-api.io will be rejected."
    )

In [6]:

# Client object built from the API key; the extraction cell calls
# xbrlApi.xbrl_to_json(htm_url=...) on it once per filing URL.
xbrlApi = XbrlApi(API_KEY)

In [None]:
# XBRL concept names to keep from each financial statement.

# --- Income statement -------------------------------------------------------
# Derived downstream: Operating expenses = Gross Profit - Operating Income
items1 = [
    'RevenueFromContractWithCustomerExcludingAssessedTax',  # total revenue
    'CostOfGoodsAndServicesSold',                           # cost of goods sold
    'GrossProfit',                                          # gross profit
    'OperatingIncomeLoss',                                  # operating income
    'NetIncomeLoss',                                        # net income
]

# --- Balance sheet ----------------------------------------------------------
# Derived downstream:
#   Long-term assets      = Total Assets - Current Assets
#   Long-term liabilities = Total Liabilities - Current Liabilities
#   Equity                = Assets - Liabilities
items2 = [
    'AssetsCurrent',                     # current assets
    'LiabilitiesCurrent',                # current liabilities
    'Assets',                            # total assets
    'Liabilities',                       # total liabilities
    'LiabilitiesAndStockholdersEquity',  # total liabilities and shareholders' equity
]

# --- Cash flow statement ----------------------------------------------------
items3 = [
    'NetCashProvidedByUsedInOperatingActivities',  # cash from operating activities
    'NetCashProvidedByUsedInInvestingActivities',  # cash from investing activities
    'NetCashProvidedByUsedInFinancingActivities',  # cash from financing activities
]

# Fiscal years and tickers of interest
years = list(range(2020, 2026))
tickers = ['MSFT', 'AAPL', 'TSLA', 'META', 'NVDA']

# Filing URLs, organised as {fiscal year: {ticker: URL of the 10-K document}}.
# An empty string marks a filing whose URL has not been filled in; the
# extraction loop skips those entries.
# URLs appear in two forms: the inline viewer ('/ix?doc=/Archives/...') and the
# direct '/Archives/...' path.
# NVDA's fiscal year ends in late January, so each NVDA filing is listed under
# the preceding calendar year (see the per-line notes).
dict_years_tkr_url =  {
    2025: {
        'MSFT': 'https://www.sec.gov/Archives/edgar/data/789019/000095017025100235/msft-20250630.htm',
        'AAPL': '',
        'TSLA': '',
        'META': '',
        'NVDA': ''
    },
    2024: {
        'MSFT': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0000789019/000095017024087843/msft-20240630.htm',
        'AAPL': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0000320193/000032019324000123/aapl-20240928.htm',
        'TSLA': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0001318605/000162828025003063/tsla-20241231.htm',
        'META': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0001326801/000132680125000017/meta-20241231.htm',
        'NVDA': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581025000023/nvda-20250126.htm' # fiscal year refers to 2024
    },
    2023: {
        'MSFT': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/789019/000095017023035122/msft-20230630.htm',
        'AAPL': 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930.htm',
        'TSLA': 'https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231.htm',
        'META': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0001326801/000132680124000012/meta-20231231.htm',
        'NVDA': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581024000029/nvda-20240128.htm' # fiscal year refers to 2023
    },
    2022: {
        'MSFT': 'https://www.sec.gov/Archives/edgar/data/789019/000156459022026876/msft-10k_20220630.htm',
        'AAPL': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0000320193/000032019322000108/aapl-20220924.htm',
        'TSLA': 'https://www.sec.gov/Archives/edgar/data/1318605/000095017023001409/tsla-20221231.htm',
        'META': 'https://www.sec.gov/Archives/edgar/data/1326801/000132680123000013/meta-20221231.htm',
        'NVDA': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581023000017/nvda-20230129.htm' # fiscal year refers to 2022
    },
    2021: {
        'MSFT': 'https://www.sec.gov/Archives/edgar/data/789019/000156459021039151/msft-10k_20210630.htm',
        'AAPL': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0000320193/000032019321000105/aapl-20210925.htm',
        'TSLA': 'https://www.sec.gov/Archives/edgar/data/1318605/000095017022000796/tsla-20211231.htm',
        'META': 'https://www.sec.gov/Archives/edgar/data/1326801/000132680122000018/fb-20211231.htm',
        'NVDA': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581022000036/nvda-20220130.htm' # fiscal year refers to 2021
    },
    2020: {
        'MSFT': 'https://www.sec.gov/Archives/edgar/data/789019/000156459020034944/msft-10k_20200630.htm',
        'AAPL': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0000320193/000032019320000096/aapl-20200926.htm',
        'TSLA': 'https://www.sec.gov/Archives/edgar/data/1318605/000156459021004599/tsla-10k_20201231.htm',
        'META': '',
        'NVDA': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581021000010/nvda-20210131.htm' # fiscal year refers to 2020
    }
}

# 2. Extract Data

In [25]:
# Convert every configured filing to JSON.
# Results are stored flat under '<TICKER>_<YEAR>' keys (e.g. 'MSFT_2025'),
# which is the layout the later cells of this notebook index into.
dict_xbrl_json = {}
for year, tkr_urls in dict_years_tkr_url.items():
    print(f'Extracting data for year: {year}')
    for tkr, url in tkr_urls.items():
        if not url:
            print(f'No URL for {tkr} in {year}, skipping...')
            continue
        # One API request per filing; the free sec-api.io plan caps the number
        # of requests, so filings fetched before a failure stay in the dict.
        dict_xbrl_json[f'{tkr}_{year}'] = xbrlApi.xbrl_to_json(htm_url=url)

# Show only the section names of one filing rather than dumping the whole JSON
list(dict_xbrl_json['MSFT_2025'].keys())

Extracting data for year: 2025
No URL for AAPL in 2025, skipping...
No URL for TSLA in 2025, skipping...
No URL for META in 2025, skipping...
No URL for NVDA in 2025, skipping...
Extracting data for year: 2024


Exception: API error: 429 - {"status":429,"error":"You send a lot of requests. We like that. But you exceeded the free query limit of 100 requests. Upgrade your account to get unlimited access. Visit sec-api.io for more."}

In [10]:
# # URL of Microsoft's 10-K filings


# url_10k_msft_2024 = 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0000789019/000095017024087843/msft-20240630.htm'
# url_10k_aapl_2024 = 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0000320193/000032019324000123/aapl-20240928.htm'
# url_10k_tsla_2024 = 'https://www.sec.gov/ix?doc=/Archives/edgar/data/0001318605/000162828025003063/tsla-20241231.htm'

# url_10k_msft_2023 = 'https://www.sec.gov/ix?doc=/Archives/edgar/data/789019/000095017023035122/msft-20230630.htm'
# url_10k_aapl_2023 = 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930.htm'
# url_10k_tsla_2023 = 'https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231.htm'


# # Convert the XBRL data to JSON format
# msft_xbrl_json = xbrlApi.xbrl_to_json(htm_url=url_10k_msft)
# msft_xbrl_json_2023 = xbrlApi.xbrl_to_json(htm_url=url_10k_msft_2023)
# aapl_xbrl_json = xbrlApi.xbrl_to_json(htm_url=url_10k_aapl)
# aapl_xbrl_json_2023 = xbrlApi.xbrl_to_json(htm_url=url_10k_aapl_2023)
# tsla_xbrl_json = xbrlApi.xbrl_to_json(htm_url=url_10k_tsla)
# tsla_xbrl_json_2023 = xbrlApi.xbrl_to_json(htm_url=url_10k_tsla_2023)

In [11]:
# List the line items available in the income statement of the 'MSFT_2025' entry
print(
    "Keys of income statement dictionary in XBRL from Microsoft's 10-K filing",
    '--------------------------------------------------------------------',
    sep='\n',
)
print('\n'.join(dict_xbrl_json['MSFT_2025']['StatementsOfIncome'].keys()))

Keys of income statement dictionary in XBRL from Microsoft's 10-K filing
--------------------------------------------------------------------
RevenueFromContractWithCustomerExcludingAssessedTax
CostOfGoodsAndServicesSold
GrossProfit
ResearchAndDevelopmentExpense
SellingAndMarketingExpense
GeneralAndAdministrativeExpense
OperatingIncomeLoss
NonoperatingIncomeExpense
IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest
IncomeTaxExpenseBenefit
NetIncomeLoss
EarningsPerShareBasic
EarningsPerShareDiluted
WeightedAverageNumberOfSharesOutstandingBasic
WeightedAverageNumberOfDilutedSharesOutstanding


In [12]:
# List the line items available in the balance sheet of the 'MSFT_2025' entry
print(
    "Keys of balance sheets dictionary in XBRL from Microsoft's 10-K filing",
    '--------------------------------------------------------------------',
    sep='\n',
)
print('\n'.join(dict_xbrl_json['MSFT_2025']['BalanceSheets'].keys()))

Keys of balance sheets dictionary in XBRL from Microsoft's 10-K filing
--------------------------------------------------------------------
CashAndCashEquivalentsAtCarryingValue
ShortTermInvestments
CashCashEquivalentsAndShortTermInvestments
AccountsReceivableNetCurrent
InventoryNet
OtherAssetsCurrent
AssetsCurrent
PropertyPlantAndEquipmentNet
OperatingLeaseRightOfUseAsset
LongTermInvestments
Goodwill
FiniteLivedIntangibleAssetsNet
OtherAssetsNoncurrent
Assets
AccountsPayableCurrent
CommercialPaper
LongTermDebtCurrent
EmployeeRelatedLiabilitiesCurrent
AccruedIncomeTaxesCurrent
ContractWithCustomerLiabilityCurrent
OtherLiabilitiesCurrent
LiabilitiesCurrent
LongTermDebtNoncurrent
AccruedIncomeTaxesNoncurrent
ContractWithCustomerLiabilityNoncurrent
DeferredIncomeTaxLiabilitiesNet
OperatingLeaseLiabilityNoncurrent
OtherLiabilitiesNoncurrent
Liabilities
CommitmentsAndContingencies
CommonStocksIncludingAdditionalPaidInCapital
RetainedEarningsAccumulatedDeficit
AccumulatedOtherComprehensiveIn

In [13]:
# List the line items available in the cash flow statement of the 'MSFT_2025' entry
print(
    "Keys of Cash Flows dictionary in XBRL from Microsoft's 10-K filing",
    '--------------------------------------------------------------------',
    sep='\n',
)
print('\n'.join(dict_xbrl_json['MSFT_2025']['StatementsOfCashFlows'].keys()))

Keys of Cash Flows dictionary in XBRL from Microsoft's 10-K filing
--------------------------------------------------------------------
NetIncomeLoss
DepreciationAmortizationAndOther
ShareBasedCompensation
GainLossOnInvestmentsAndDerivativeInstruments
DeferredIncomeTaxExpenseBenefit
IncreaseDecreaseInAccountsReceivable
IncreaseDecreaseInInventories
IncreaseDecreaseInOtherCurrentAssets
IncreaseDecreaseInOtherNoncurrentAssets
IncreaseDecreaseInAccountsPayable
IncreaseDecreaseInContractWithCustomerLiability
IncreaseDecreaseInAccruedIncomeTaxesPayable
IncreaseDecreaseInOtherCurrentLiabilities
IncreaseDecreaseInOtherNoncurrentLiabilities
NetCashProvidedByUsedInOperatingActivities
ProceedsFromRepaymentsOfShortTermDebtMaturingInThreeMonthsOrLess
ProceedsFromDebtMaturingInMoreThanThreeMonths
RepaymentsOfDebtMaturingInMoreThanThreeMonths
ProceedsFromIssuanceOfCommonStock
PaymentsForRepurchaseOfCommonStock
PaymentsOfDividendsCommonStock
ProceedsFromPaymentsForOtherFinancingActivities
NetCashProv

In [14]:
import re

In [22]:
# Look for capital-expenditure line items in Microsoft's cash flow statement.
# The keys listed above are CamelCase concept names, so the upper-case literal
# 'CAPEX' matches none of them; search case-insensitively and include the
# us-gaap element normally used for purchases of property and equipment.
# NOTE(review): confirm the element name against the filing's key list.
capex_pattern = re.compile(
    r'capex|capitalexpenditure|PaymentsToAcquirePropertyPlantAndEquipment',
    re.IGNORECASE,
)
# Iterate over the keys directly instead of rebuilding the key list per index
for item_name in dict_xbrl_json['MSFT_2025']['StatementsOfCashFlows']:
    if capex_pattern.search(item_name):
        print(item_name)

In [None]:
# for i in range(len(list(msft_xbrl_json['StatementsOfCashFlows'].keys()))):
#     if re.search('NetCashProvidedByUsedIn', list(msft_xbrl_json['StatementsOfCashFlows'].keys())[i]):
#         print(list(msft_xbrl_json['StatementsOfCashFlows'].keys())[i])

NetCashProvidedByUsedInOperatingActivities
NetCashProvidedByUsedInFinancingActivities
NetCashProvidedByUsedInInvestingActivities


In [None]:


# Income statements of MSFT, AAPL and TSLA, taken from their 2024 filings.
# dict_xbrl_json is indexed by '<TICKER>_<YEAR>', as in the inspection cells above.
msft_xbrl_json = dict_xbrl_json['MSFT_2024']
aapl_xbrl_json = dict_xbrl_json['AAPL_2024']
tsla_xbrl_json = dict_xbrl_json['TSLA_2024']


def tidy_income_statement(xbrl_json, items=None):
    """Turn one filing into a wide and a long income-statement table.

    Parameters
    ----------
    xbrl_json : dict
        JSON of one filing, as stored in ``dict_xbrl_json``.
    items : list of str, optional
        Line items to keep, in this order. Defaults to ``items1``. Items the
        filing does not report are skipped instead of raising ``KeyError``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``wide``: one row per item (column 'Variables') and one column per
        reporting period. ``long``: columns 'Variables', 'Date' and 'Value',
        with 'Date' reduced to 'YYYYMM'.
    """
    items = items1 if items is None else items
    wide = get_income_statement(xbrl_json)
    wide = wide.loc[[item for item in items if item in wide.index]]
    wide = wide.reset_index().rename(columns={'index': 'Variables'})

    # Unpivot: one row per (item, period)
    long = wide.melt(id_vars=['Variables'], var_name='Date', value_name='Value')
    # Period labels are split on '_' and the second part is kept; dropping its
    # last two characters and the dashes leaves 'YYYYMM'.
    # NOTE(review): assumes labels look like '<start>_<end>' with ISO dates --
    # confirm against get_income_statement.
    long['Date'] = long['Date'].apply(lambda period: period.split('_')[1][:-2].replace('-', ''))
    return wide, long


income_statement_msft, income_statement_msft_melt = tidy_income_statement(msft_xbrl_json)
income_statement_aapl, income_statement_aapl_melt = tidy_income_statement(aapl_xbrl_json)
income_statement_tsla, income_statement_tsla_melt = tidy_income_statement(tsla_xbrl_json)

print("Income statement of Microsoft's 2024 10-K filing as dataframe")
print('---------------------------------------------------------')
income_statement_msft_melt

Income statement of Microsoft's 2023 10-K filing as dataframe
---------------------------------------------------------


Unnamed: 0,Variables,Date,Value
0,RevenueFromContractWithCustomerExcludingAssess...,202406,245122000000
1,CostOfGoodsAndServicesSold,202406,74114000000
2,GrossProfit,202406,171008000000
3,OperatingIncomeLoss,202406,109433000000
4,NetIncomeLoss,202406,88136000000
5,RevenueFromContractWithCustomerExcludingAssess...,202306,211915000000
6,CostOfGoodsAndServicesSold,202306,65863000000
7,GrossProfit,202306,146052000000
8,OperatingIncomeLoss,202306,88523000000
9,NetIncomeLoss,202306,72361000000


In [73]:
# Balance sheets of MSFT, AAPL and TSLA, combining the 2024 and 2023 filings.
def tidy_balance_sheets(xbrl_jsons, items=None):
    """Combine the balance sheets of several filings of one company.

    Parameters
    ----------
    xbrl_jsons : list of dict
        Filings of the same company, as stored in ``dict_xbrl_json``.
    items : list of str, optional
        Line items to keep, in this order. Defaults to ``items2``. Items that
        are not reported are skipped instead of raising ``KeyError``, matching
        the income statement cell.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``wide``: one row per item (column 'Variables') and one column per
        balance sheet date. ``long``: columns 'Variables', 'Date' and 'Value',
        with 'Date' reduced to 'YYYYMM'.
    """
    items = items2 if items is None else items
    # Side-by-side join of the filings; consecutive filings repeat some dates
    wide = pd.concat([get_balance_sheets(xbrl_json) for xbrl_json in xbrl_jsons], axis=1)
    wide = wide.loc[[item for item in items if item in wide.index]]
    # Drop dates with no value for any selected item, then repeated dates
    wide = wide.dropna(axis=1, how='all')
    wide = wide.loc[:, ~wide.columns.duplicated()]
    wide = wide.reset_index().rename(columns={'index': 'Variables'})

    # Unpivot: one row per (item, date)
    long = wide.melt(id_vars=['Variables'], var_name='Date', value_name='Value')
    # 'YYYY-MM-DD' -> 'YYYYMM'
    long['Date'] = long['Date'].apply(lambda day: day[:-2].replace('-', ''))
    return wide, long


balance_sheets_msft, balance_sheets_msft_melt = tidy_balance_sheets(
    [dict_xbrl_json['MSFT_2024'], dict_xbrl_json['MSFT_2023']])
balance_sheets_aapl, balance_sheets_aapl_melt = tidy_balance_sheets(
    [dict_xbrl_json['AAPL_2024'], dict_xbrl_json['AAPL_2023']])
balance_sheets_tsla, balance_sheets_tsla_melt = tidy_balance_sheets(
    [dict_xbrl_json['TSLA_2024'], dict_xbrl_json['TSLA_2023']])

print("Balance Sheets of Microsoft's 2024 and 2023 10-K filings as dataframe")
print('---------------------------------------------------------')
balance_sheets_msft_melt

Balance Sheets of Microsoft's 2023 10-K filing as dataframe
---------------------------------------------------------


Unnamed: 0,Variables,Date,Value
0,AssetsCurrent,202306,184257000000
1,LiabilitiesCurrent,202306,104149000000
2,Assets,202306,411976000000
3,Liabilities,202306,205753000000
4,LiabilitiesAndStockholdersEquity,202306,411976000000
5,AssetsCurrent,202406,159734000000
6,LiabilitiesCurrent,202406,125286000000
7,Assets,202406,512163000000
8,Liabilities,202406,243686000000
9,LiabilitiesAndStockholdersEquity,202406,512163000000


In [74]:
# Cash flow statements of MSFT, AAPL and TSLA, combining the 2024 and 2023 filings.
def tidy_cashflow(xbrl_jsons, items=None):
    """Combine the cash flow statements of several filings of one company.

    Parameters
    ----------
    xbrl_jsons : list of dict
        Filings of the same company, as stored in ``dict_xbrl_json``.
    items : list of str, optional
        Line items to keep, in this order. Defaults to ``items3``. Items that
        are not reported are skipped instead of raising ``KeyError``, matching
        the income statement cell.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``wide``: one row per item (column 'Variables') and one column per
        reporting period. ``long``: columns 'Variables', 'Date' and 'Value'
        without missing values, with 'Date' reduced to the 'YYYYMM' of the
        period end.
    """
    items = items3 if items is None else items
    # Side-by-side join of the filings; consecutive filings repeat some periods
    wide = pd.concat([get_cashflow(xbrl_json) for xbrl_json in xbrl_jsons], axis=1)
    wide = wide.loc[[item for item in items if item in wide.index]]
    # Drop periods with no value for any selected item, then repeated periods
    wide = wide.dropna(axis=1, how='all')
    wide = wide.loc[:, ~wide.columns.duplicated()]
    wide = wide.reset_index().rename(columns={'index': 'Variables'})

    # Unpivot: one row per (item, period), keeping only reported values
    long = wide.melt(id_vars=['Variables'], var_name='Date', value_name='Value')
    long = long.dropna()
    # Keep the year and month of the period end date, i.e. the last ten
    # characters minus the day: '2021-07-01-2022-06-30' -> '202206'
    long['Date'] = long['Date'].apply(lambda period: period[-10:-2].replace('-', ''))
    return wide, long


cashflow_msft, cashflow_msft_melt = tidy_cashflow(
    [dict_xbrl_json['MSFT_2024'], dict_xbrl_json['MSFT_2023']])
cashflow_aapl, cashflow_aapl_melt = tidy_cashflow(
    [dict_xbrl_json['AAPL_2024'], dict_xbrl_json['AAPL_2023']])
cashflow_tsla, cashflow_tsla_melt = tidy_cashflow(
    [dict_xbrl_json['TSLA_2024'], dict_xbrl_json['TSLA_2023']])

print("Cash Flow of Microsoft's 2024 and 2023 10-K filings as dataframe")
print('---------------------------------------------------------')
cashflow_msft_melt

Cash Flow of Microsoft's 2023 10-K filing as dataframe
---------------------------------------------------------


Unnamed: 0,Variables,Date,Value
0,NetCashProvidedByUsedInOperatingActivities,202206,89035000000
1,NetCashProvidedByUsedInInvestingActivities,202206,-30311000000
2,NetCashProvidedByUsedInFinancingActivities,202206,-58876000000
3,NetCashProvidedByUsedInOperatingActivities,202306,87582000000
4,NetCashProvidedByUsedInInvestingActivities,202306,-22680000000
5,NetCashProvidedByUsedInFinancingActivities,202306,-43935000000
6,NetCashProvidedByUsedInOperatingActivities,202406,118548000000
7,NetCashProvidedByUsedInInvestingActivities,202406,-96970000000
8,NetCashProvidedByUsedInFinancingActivities,202406,-37757000000
9,NetCashProvidedByUsedInOperatingActivities,202106,76740000000


In [75]:
# Wide balance sheet of Microsoft: one row per item, one column per date
balance_sheets_msft

Unnamed: 0,Variables,2023-06-30,2024-06-30,2022-06-30
0,AssetsCurrent,184257000000,159734000000,169684000000
1,LiabilitiesCurrent,104149000000,125286000000,95082000000
2,Assets,411976000000,512163000000,364840000000
3,Liabilities,205753000000,243686000000,198298000000
4,LiabilitiesAndStockholdersEquity,411976000000,512163000000,364840000000


In [76]:
# Sanity check: the long-format income statement is a pandas DataFrame
type(income_statement_msft_melt)

pandas.core.frame.DataFrame

In [77]:
# Same type check as the previous cell (redundant; kept as a scratch check)
type(income_statement_msft_melt)

pandas.core.frame.DataFrame

In [89]:
# Long-format frames of each company: (income statement, balance sheet, cash flow)
frames_by_ticker = {
    'MSFT': (income_statement_msft_melt, balance_sheets_msft_melt, cashflow_msft_melt),
    'AAPL': (income_statement_aapl_melt, balance_sheets_aapl_melt, cashflow_aapl_melt),
    'TSLA': (income_statement_tsla_melt, balance_sheets_tsla_melt, cashflow_tsla_melt),
}

# Label every frame with the ticker it belongs to
for ticker_symbol, statement_frames in frames_by_ticker.items():
    for statement_frame in statement_frames:
        statement_frame['Ticker'] = ticker_symbol

# Stack the companies on top of each other, one frame per statement type
income_statement = pd.concat([frames[0] for frames in frames_by_ticker.values()], axis=0)
balance_sheets = pd.concat([frames[1] for frames in frames_by_ticker.values()], axis=0)
cashflow = pd.concat([frames[2] for frames in frames_by_ticker.values()], axis=0)

# Reshape so that each variable becomes a column keyed by (Ticker, Date)
merge_keys = ['Ticker', 'Date']


def variables_to_columns(long_frame):
    """Pivot a long statement frame into one column per variable."""
    return long_frame.pivot(index=merge_keys, columns='Variables', values='Value').reset_index()


income_statement_pivot = variables_to_columns(income_statement)
balance_sheets_pivot = variables_to_columns(balance_sheets)
cashflow_pivot = variables_to_columns(cashflow)

# Inner joins: only (Ticker, Date) pairs present in all three statements survive
df_merged = income_statement_pivot.merge(balance_sheets_pivot, on=merge_keys, how='inner')
df_merged = df_merged.merge(cashflow_pivot, on=merge_keys, how='inner')

# Tickers in alphabetical order, most recent period first within each ticker
df_merged = df_merged.sort_values(by=merge_keys, ascending=[True, False])

print("Merged dataframe of Income Statement, Balance Sheets, and Cash Flow")
print('---------------------------------------------------------')
df_merged.head(10)

Merged dataframe of Income Statement, Balance Sheets, and Cash Flow
---------------------------------------------------------


Variables,Ticker,Date,CostOfGoodsAndServicesSold,GrossProfit,NetIncomeLoss,OperatingIncomeLoss,RevenueFromContractWithCustomerExcludingAssessedTax,Assets,AssetsCurrent,Liabilities,LiabilitiesAndStockholdersEquity,LiabilitiesCurrent,NetCashProvidedByUsedInFinancingActivities,NetCashProvidedByUsedInInvestingActivities,NetCashProvidedByUsedInOperatingActivities
2,AAPL,202409,210352000000.0,180683000000,93736000000,123216000000,391035000000,364980000000,152987000000,308030000000,364980000000,176392000000,-121983000000,2935000000,118254000000
1,AAPL,202309,214137000000.0,169148000000,96995000000,114301000000,383285000000,352583000000,143566000000,290437000000,352583000000,145308000000,-108488000000,3705000000,110543000000
0,AAPL,202209,223546000000.0,170782000000,99803000000,119437000000,394328000000,352755000000,135405000000,302083000000,352755000000,153982000000,-110749000000,-22354000000,122151000000
5,MSFT,202406,74114000000.0,171008000000,88136000000,109433000000,245122000000,512163000000,159734000000,243686000000,512163000000,125286000000,-37757000000,-96970000000,118548000000
4,MSFT,202306,65863000000.0,146052000000,72361000000,88523000000,211915000000,411976000000,184257000000,205753000000,411976000000,104149000000,-43935000000,-22680000000,87582000000
3,MSFT,202206,62650000000.0,135620000000,72738000000,83383000000,198270000000,364840000000,169684000000,198298000000,364840000000,95082000000,-58876000000,-30311000000,89035000000
8,TSLA,202412,,17450000000,7091000000,7076000000,97690000000,122070000000,58360000000,48390000000,122070000000,28821000000,3853000000,-18787000000,14923000000
7,TSLA,202312,,17660000000,14997000000,8891000000,96773000000,106618000000,49616000000,43009000000,106618000000,28748000000,2589000000,-15584000000,13256000000
6,TSLA,202212,,20853000000,12556000000,13656000000,81462000000,82338000000,40917000000,36440000000,82338000000,26709000000,-3527000000,-11973000000,14724000000


In [90]:
# Write the merged statements to a CSV file in the working directory,
# without the DataFrame index
df_merged.to_csv('financial_statements.csv', index=False)