## Import Libraries

In [1]:
import yfinance as yf
import pandas as pd

## Data Filepaths

In [2]:
# You can customize your data locations here:

sp500_all_financials = './data/sp500_all_financials.csv'
sp500_companies = './data/sp500_companies.csv'
sp_500_income_statements = './data/sp500_income_statements.csv'
sp_500_balance_sheets = './data/sp500_balance_sheets.csv'
sp_500_cash_flows = './data/sp500_cash_flows.csv'
sp_500_financials = './data/sp500_financials.xlsx'

## Fetch List of S&P Ticker Symbols

In [3]:
# Fetch the S&P 500 list from Wikipedia

sp_500_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
sp_500_companies = pd.read_html(sp_500_url)[0]

## Extract and Transform S&P 500 Data

In [4]:
# Obtain a list of all the S&P 500 ticker symbols 
tickers = sp_500_companies['Symbol'].tolist()

# Function to rearrange columns: Move 'Symbol' column to the front
def move_symbol_to_front(df):
    if 'Symbol' in df.columns:
        cols = ['Symbol'] + [col for col in df.columns if col != 'Symbol']
        return df[cols]
    return df

# Initialize empty DataFrames to store individual financial statements if needed
income_statements = pd.DataFrame()
balance_sheets = pd.DataFrame()
cash_flows = pd.DataFrame()
all_financials = pd.DataFrame()

# Iterate over each ticker symbol
for tick in tickers:
    try:
        # Fetch financial statements
        ticker = yf.Ticker(tick)
        income_statement = ticker.financials.T[0:1]
        balance_sheet = ticker.balance_sheet.T[0:1]
        cash_flow = ticker.cashflow.T[0:1]

        # Add ticker symbol as a column
        for statement in (income_statement, balance_sheet, cash_flow):
            statement['Symbol'] = tick

        # Fetch the quarter date from the income_statement
        quarter = income_statement.index.values[0]
        
        # Reset index for each DataFrame and move 'Symbol' to front
        income_statement = move_symbol_to_front(income_statement.reset_index(drop=True))
        balance_sheet = move_symbol_to_front(balance_sheet.reset_index(drop=True))
        cash_flow = move_symbol_to_front(cash_flow.reset_index(drop=True))

        # Merge the financial statements into one row per ticker
        temp_merged = pd.merge(income_statement, balance_sheet, on='Symbol')
        full_merged = pd.merge(temp_merged, cash_flow, on='Symbol')
        full_merged['Quarter'] = quarter  # Add 'Quarter' after merging

        # Append the merged row to the main DataFrame
        all_financials = pd.concat([all_financials, full_merged], axis=0)

        # Collect individual statements if needed
        income_statements = pd.concat([income_statements, income_statement], axis=0)
        balance_sheets = pd.concat([balance_sheets, balance_sheet], axis=0)
        cash_flows = pd.concat([cash_flows, cash_flow], axis=0)

        print(f'Completed {tick}')
    except Exception as e:
        print(f"Failed to fetch data for {tick}: {e}")

# Merge the final DataFrame with the original S&P 500 companies list and reset index
all_financials = pd.merge(sp_500_companies, all_financials, on='Symbol')
all_financials.reset_index(drop=True, inplace=True)
cols = ['Quarter'] + [col for col in all_financials.columns if col != 'Quarter']
all_financials = all_financials[cols]

Completed MMM
Completed AOS
Completed ABT
Completed ABBV
Completed ACN
Completed ADBE
Completed AMD
Completed AES
Completed AFL
Completed A
Completed APD
Completed ABNB
Completed AKAM
Completed ALB
Completed ARE
Completed ALGN
Completed ALLE
Completed LNT
Completed ALL
Completed GOOGL
Completed GOOG
Completed MO
Completed AMZN
Completed AMCR
Completed AEE
Completed AAL
Completed AEP
Completed AXP
Completed AIG
Completed AMT
Completed AWK
Completed AMP
Completed AME
Completed AMGN
Completed APH
Completed ADI
Completed ANSS
Completed AON
Completed APA
Completed AAPL
Completed AMAT
Completed APTV
Completed ACGL
Completed ADM
Completed ANET
Completed AJG
Completed AIZ
Completed T
Completed ATO
Completed ADSK
Completed ADP
Completed AZO
Completed AVB
Completed AVY
Completed AXON
Completed BKR
Completed BALL
Completed BAC
Completed BK
Completed BBWI
Completed BAX
Completed BDX
Failed to fetch data for BRK.B: index 0 is out of bounds for axis 0 with size 0
Completed BBY
Completed BIO
Complete

## Load / Save Financial Data to Use Later

In [5]:
# Optionally, save to CSV
all_financials.to_csv('./data/sp500_all_financials.csv', index=False)
sp_500_companies.to_csv('./data/sp500_companies.csv', index=False)
income_statements.to_csv('./data/sp500_income_statements.csv', index=False)
balance_sheets.to_csv('./data/sp500_balance_sheets.csv', index=False)
cash_flows.to_csv('./data/sp500_cash_flows.csv', index=False)
print("Data saved to CSV files.")

# Use a Pandas Excel writer using XlsxWriter as the engine
with pd.ExcelWriter('./data/sp500_financials.xlsx', engine='xlsxwriter') as writer:
    sp_500_companies.to_excel(writer, sheet_name='S&P 500 Companies', index=False)
    income_statements.to_excel(writer, sheet_name='Income Statements', index=False)
    balance_sheets.to_excel(writer, sheet_name='Balance Sheets', index=False)
    cash_flows.to_excel(writer, sheet_name='Cash Flows', index=False)

print("Data saved to Excel file with multiple sheets.")

Data saved to CSV files.


ModuleNotFoundError: No module named 'xlsxwriter'

## Display the Financial Data in DataFrames

In [None]:
# Display sections of the DataFrames
print("All Financials:\n", all_financials.head(2))
print("S&P 500 Companies:\n", sp_500_companies.head(2))
print("Income Statements:\n", income_statements.head(2))
print("Balance Sheets:\n", balance_sheets.head(2))
print("Cash Flows:\n", cash_flows.head(2))