Notebook for Extracting & Saving Raw Data

First:
- Loads daily share prices for all US companies using the SimFin API.
- Saves the data to data/RAW/stock_prices.csv.
- Then reads the file back and does a basic inspection (head(), describe(), isna().mean()).

Then:
- Downloads annual US income statements (load_income()).
- Saves them to data/RAW/us_income_statements.csv.
- Downloads metadata about US companies (load_companies()) and saves it to data/RAW/us_companies_list.csv.

In [1]:
import os
from dotenv import load_dotenv
import requests
import json
import pandas as pd
import simfin as sf
from simfin.names import CLOSE, OPEN, HIGH, LOW, VOLUME, TICKER
from simfin.names import *

In [2]:
# Pull the variables defined in the local .env file into the process environment
load_dotenv()

# SimFin API token, read from the API_KEY environment variable (None if it is not set)
api_key = os.environ.get('API_KEY')

In [3]:
# Create the raw-data output folder up front; this is a no-op when it already exists
os.makedirs(name="data/RAW", exist_ok=True)


In [4]:
def load_data(save_path="data/RAW/stock_prices.csv"):
    """
    Download daily share prices for all US companies from SimFin and
    write them to a CSV file.

    Parameters
    ----------
    save_path : str, optional
        Destination CSV file. Defaults to "data/RAW/stock_prices.csv",
        relative to the notebook's working directory. Missing parent
        directories are created.

    Returns
    -------
    None
        The data is written to ``save_path``; nothing is returned.

    Notes
    -----
    Reads the notebook-level ``api_key`` variable and configures SimFin
    globally through ``sf.set_api_key`` and ``sf.set_data_dir``.
    """

    # Configure SimFin with the key that was read from the environment
    sf.set_api_key(api_key)

    # Directory SimFin uses to store the datasets it downloads
    sf.set_data_dir("~/simfin_data/")

    # Load daily stock prices for all US companies
    print("📥 Downloading US stock market data...")
    df_prices = sf.load_shareprices(market="us", variant="daily")

    # Move the index levels into regular columns so they are still written
    # to the CSV when saving with index=False
    df_prices = df_prices.reset_index()

    # Create the target folder here so the function does not depend on
    # another cell having created it first
    target_dir = os.path.dirname(save_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Save raw data to CSV
    df_prices.to_csv(save_path, index=False)

    print(f"Data saved to: {save_path}")

# Run the function
load_data()

📥 Downloading US stock market data...
Dataset "us-shareprices-daily" on disk (0 days old).
- Loading from disk ... 

  df = pd.read_csv(path, sep=';', header=0,


Done!
✅ Data saved to: data/RAW/stock_prices.csv


In [5]:
# Reload the raw prices from disk to confirm the CSV written above is readable
prices_csv = "data/RAW/stock_prices.csv"
df_prices = pd.read_csv(prices_csv)

# Preview the top rows as a quick sanity check
df_prices.head()

Unnamed: 0,Ticker,Date,SimFinId,Open,High,Low,Close,Adj. Close,Volume,Dividend,Shares Outstanding
0,A,2019-04-25,45846,75.26,77.04,74.94,76.72,73.68,1481436,,317515869.0
1,A,2019-04-26,45846,76.98,77.46,76.3,77.42,74.36,1608922,,317515869.0
2,A,2019-04-29,45846,77.47,78.44,77.32,77.33,74.27,2177700,,317515869.0
3,A,2019-04-30,45846,77.44,78.59,77.38,78.5,75.39,1726239,,317000000.0
4,A,2019-05-01,45846,78.49,78.92,77.28,77.47,74.4,1078572,,317000000.0


In [6]:
# Summary statistics (count, mean, std, quartiles, extremes) for the numeric columns
price_summary = df_prices.describe()
price_summary

Unnamed: 0,SimFinId,Open,High,Low,Close,Adj. Close,Volume,Dividend,Shares Outstanding
count,5875518.0,5875518.0,5875518.0,5875518.0,5875518.0,5875518.0,5875518.0,35897.0,5330811.0
mean,5948873.0,16166.88,16265.36,16032.82,16124.13,16121.96,1783404.0,0.454983,598440300000.0
std,6099526.0,1221082.0,1224253.0,1216641.0,1219356.0,1219356.0,28716900.0,7.993318,61698420000000.0
min,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,627774.0,7.8,8.01,7.58,7.8,7.33,35967.0,0.12,18106150.0
50%,1322470.0,20.26,20.7,19.85,20.25,18.83,243547.0,0.25,49043250.0
75%,11035980.0,51.72,52.65,50.75,51.69,48.74,938488.5,0.45,131356200.0
max,18589410.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,18489980000.0,1500.0,6667887000000000.0


In [7]:
# Share of missing values per column.
# About 99% of the Dividend column is null across all tickers.
null_share = df_prices.isna().mean()
null_share

Ticker                0.000000
Date                  0.000000
SimFinId              0.000000
Open                  0.000000
High                  0.000000
Low                   0.000000
Close                 0.000000
Adj. Close            0.000000
Volume                0.000000
Dividend              0.993890
Shares Outstanding    0.092708
dtype: float64

In [8]:
# Download annual income statements for all US companies.
# The returned frame is indexed by ticker and report date (a MultiIndex).
df_income = sf.load_income(variant='annual', market='us')

# Save to CSV. The ticker and report date live in the index, so move them
# into regular columns first; writing with index=False alone would silently
# drop both keys from the file.
df_income.reset_index().to_csv("data/RAW/us_income_statements.csv", index=False)

# Sanity check: print Revenue and Net Income for Microsoft (ticker MSFT).
print(df_income.loc['MSFT', [REVENUE, NET_INCOME]])

Dataset "us-income-annual" on disk (0 days old).
- Loading from disk ... Done!


  df = pd.read_csv(path, sep=';', header=0,


                  Revenue   Net Income
Report Date                           
2019-06-30   1.258430e+11  39240000000
2020-06-30   1.430150e+11  44281000000
2021-06-30   1.680880e+11  61271000000
2022-06-30   1.982700e+11  72738000000
2023-06-30   2.119150e+11  72361000000


In [9]:
# Print the frame's index range, column dtypes and non-null counts
df_income.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 17555 entries, ('A', Timestamp('2019-10-31 00:00:00')) to ('ZYXI', Timestamp('2023-12-31 00:00:00'))
Data columns (total 26 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   SimFinId                                  17555 non-null  int64         
 1   Currency                                  17555 non-null  object        
 2   Fiscal Year                               17555 non-null  int64         
 3   Fiscal Period                             17555 non-null  object        
 4   Publish Date                              17555 non-null  datetime64[ns]
 5   Restated Date                             17555 non-null  datetime64[ns]
 6   Shares (Basic)                            17403 non-null  float64       
 7   Shares (Diluted)                          17276 non-null  float64       
 8   Revenue                        

In [10]:
# Download the list of all US companies with metadata
df_companies = sf.load_companies(market='us')

# Save to CSV. SimFin's documented default indexes this frame by ticker
# (NOTE(review): confirm with df_companies.index.name), so move the index
# into a regular column first; index=False alone would drop it from the file.
df_companies.reset_index().to_csv("data/RAW/us_companies_list.csv", index=False)

Dataset "us-companies" on disk (0 days old).
- Loading from disk ... Done!


  df = pd.read_csv(path, sep=';', header=0,
