# Fundamentals Data

Download quarterly fundamentals from Financial Modeling Prep (FMP):
- Key metrics (P/E, EV/EBITDA, etc.)
- Financial ratios (ROE, margins, etc.)
- Financial growth (YoY revenue/earnings growth)

In [1]:
import os
import time
import random
from pathlib import Path

import pandas as pd
import requests
from tqdm.auto import tqdm

# Never hardcode the API key in the notebook: it ends up in version control and
# in every shared copy of the file. The key must be supplied by the environment
# instead (e.g. `export FMP_API_KEY=...` before starting Jupyter). Any key that
# was previously committed here should be treated as leaked and rotated.
if not os.environ.get("FMP_API_KEY"):
    raise RuntimeError(
        "FMP_API_KEY is not set; export it in the environment before running this notebook"
    )

In [2]:
# Checkpoint interval for the download loop, counted in symbols processed.
SAVE_EVERY: int = 500
# Request budget (calls per minute) handed to the rate limiter.
MAX_CALLS_PER_MINUTE: int = 700
# Root URL of the FMP API; endpoint paths are appended to it.
FMP_BASE_URL: str = "https://financialmodelingprep.com/stable"

In [3]:
# Read the symbol universe and reduce it to a de-duplicated list of ticker
# strings (rows with a missing symbol are dropped first).
universe = pd.read_parquet("data/universe.pqt")
symbols = (
    universe["symbol"]
    .dropna()
    .astype(str)
    .unique()
    .tolist()
)
print(f"Loaded {len(symbols):,} symbols")

Loaded 6,060 symbols


In [4]:
class RateLimiter:
    """Client-side throttle that spaces calls at least ``60 / max_per_minute`` seconds apart.

    Call :meth:`wait` immediately before each rate-limited operation; it sleeps
    for whatever is left of the minimum interval since the previous call.
    The limiter keeps no lock, so it is meant for single-threaded use.

    Args:
        max_per_minute: Maximum number of calls allowed per minute. Must be
            positive.

    Raises:
        ValueError: If ``max_per_minute`` is zero or negative.
    """

    def __init__(self, max_per_minute: int = 240):
        if max_per_minute <= 0:
            raise ValueError(f"max_per_minute must be positive, got {max_per_minute!r}")
        self.max_per_minute = max_per_minute
        # -inf means "never called": the first wait() returns immediately
        # regardless of where the monotonic clock happens to start.
        self._last_call = float("-inf")

    @property
    def min_interval(self) -> float:
        """Minimum number of seconds between two consecutive calls."""
        return 60.0 / self.max_per_minute

    def wait(self) -> None:
        """Block until at least ``min_interval`` seconds have passed since the last call."""
        # time.monotonic() cannot go backwards, unlike the wall clock, so clock
        # adjustments (NTP, DST, manual changes) never stall or disable the throttle.
        remaining = self.min_interval - (time.monotonic() - self._last_call)
        if remaining > 0:
            time.sleep(remaining)
        self._last_call = time.monotonic()


def request_json(
    session: requests.Session,
    path: str,
    params: dict,
    max_retries: int = 8,
) -> list | dict:
    """GET ``FMP_BASE_URL + path`` and return the decoded JSON body.

    Transient failures -- HTTP 429/500/502/503/504, connection errors and
    timeouts -- are retried with exponential backoff plus random jitter,
    capped at 60 seconds per wait.

    Args:
        session: Session used to issue the request.
        path: Endpoint path appended to ``FMP_BASE_URL`` (e.g. ``"/ratios"``).
        params: Query-string parameters forwarded to ``session.get``.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The parsed JSON payload of the first HTTP 200 response.

    Raises:
        requests.HTTPError: On a 4xx/5xx response that is not retryable.
        RuntimeError: When every attempt ended in a retryable failure.
    """
    retryable_statuses = (429, 500, 502, 503, 504)
    url = f"{FMP_BASE_URL}{path}"
    last_problem = "no attempt was made"
    for attempt in range(max_retries):
        try:
            resp = session.get(url, params=params, timeout=30)
        except (requests.ConnectionError, requests.Timeout) as exc:
            # Keep only the exception type: the message of these errors can
            # embed the request URL, query string (and API key) included.
            last_problem = type(exc).__name__
        else:
            if resp.status_code == 200:
                return resp.json()
            if resp.status_code not in retryable_statuses and 400 <= resp.status_code < 600:
                # Same condition under which resp.raise_for_status() raises, but
                # the message names the bare endpoint URL so the apikey query
                # parameter does not end up in logs or notebook output.
                raise requests.HTTPError(
                    f"HTTP {resp.status_code} for url: {url}", response=resp
                )
            last_problem = f"HTTP {resp.status_code}"
        # Back off before the next attempt; sleeping after the final attempt
        # would only delay the error below.
        if attempt < max_retries - 1:
            time.sleep(min(60, (2**attempt) + random.random()))
    raise RuntimeError(
        f"Failed after {max_retries} retries: {url} (last problem: {last_problem})"
    )

In [5]:
def fetch_fundamentals(
    session: requests.Session,
    symbol: str,
    api_key: str,
    rate_limiter: RateLimiter,
    limit: int = 100,  # ~25 years of quarterly data
) -> dict[str, pd.DataFrame]:
    """Download the three quarterly fundamentals datasets for one symbol.

    Issues one request per endpoint, calling ``rate_limiter.wait()`` before
    each of them.

    Args:
        session: Session passed through to ``request_json``.
        symbol: Ticker to query; also written into a ``symbol`` column.
        api_key: Value sent as the ``apikey`` query parameter.
        rate_limiter: Throttle consulted before every request.
        limit: Value sent as the ``limit`` query parameter.

    Returns:
        A dict with the keys ``"key_metrics"``, ``"ratios"`` and ``"growth"``.
        Each value is a DataFrame built from the response, or an empty
        DataFrame when the response was empty.
    """
    endpoint_by_name = (
        ("key_metrics", "/key-metrics"),
        ("ratios", "/ratios"),
        ("growth", "/financial-growth"),
    )
    base_query = {
        "symbol": symbol,
        "period": "quarter",
        "limit": limit,
        "apikey": api_key,
    }

    frames = {}
    for dataset, endpoint in endpoint_by_name:
        rate_limiter.wait()
        # Hand each request its own copy of the query parameters.
        payload = request_json(session, endpoint, dict(base_query))
        if not payload:
            frames[dataset] = pd.DataFrame()
            continue
        frame = pd.DataFrame(payload)
        frame["symbol"] = symbol
        frames[dataset] = frame

    return frames

In [6]:
# Locations of the resume marker and of the three output datasets.
DATA_DIR = Path("data")
PROGRESS_PATH = DATA_DIR / "fundamentals_progress.txt"

METRICS_PATH = DATA_DIR / "key_metrics.pqt"
RATIOS_PATH = DATA_DIR / "ratios.pqt"
GROWTH_PATH = DATA_DIR / "growth.pqt"

# The progress file holds one completed symbol per line; blank entries are ignored.
done_symbols = set()
if not PROGRESS_PATH.exists():
    print("No existing progress found")
else:
    done_symbols = {
        entry for entry in PROGRESS_PATH.read_text().strip().split("\n") if entry != ""
    }
    print(f"Found {len(done_symbols):,} completed symbols")

# Load existing data if resuming
def load_existing(path: Path) -> pd.DataFrame | None:
    if path.exists():
        return pd.read_parquet(path)
    return None

# Previously saved frames; each is None when its file has not been written yet.
existing_metrics, existing_ratios, existing_growth = map(
    load_existing, (METRICS_PATH, RATIOS_PATH, GROWTH_PATH)
)

# Skip every symbol already recorded as done.
symbols_to_fetch = [ticker for ticker in symbols if ticker not in done_symbols]
print(f"{len(symbols_to_fetch):,} symbols to fetch")

Found 3,000 completed symbols
3,060 symbols to fetch


In [7]:
def coerce_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Run ``pd.to_numeric`` over every column except the known text columns.

    The columns ``symbol``, ``date``, ``fiscalYear``, ``period`` and
    ``reportedCurrency`` are left untouched. All other columns are converted
    with ``errors="coerce"``, so values that cannot be parsed become NaN.

    The frame is modified in place and the same object is returned.
    """
    text_columns = {"symbol", "date", "fiscalYear", "period", "reportedCurrency"}
    numeric_candidates = [name for name in df.columns if name not in text_columns]
    for name in numeric_candidates:
        df[name] = pd.to_numeric(df[name], errors="coerce")
    return df

def _save_checkpoint(
    path: Path,
    saved: pd.DataFrame | None,
    batch: list[pd.DataFrame],
) -> pd.DataFrame | None:
    """Append ``batch`` to ``saved``, write the result to ``path`` and return it.

    When ``batch`` is empty nothing is written and ``saved`` is returned as is.
    """
    if not batch:
        return saved
    frames = batch if saved is None else [saved, *batch]
    combined = coerce_numeric_columns(pd.concat(frames, ignore_index=True))
    combined.to_parquet(path, index=False)
    return combined


if not symbols_to_fetch:
    print("Nothing to fetch")
else:
    api_key = os.environ["FMP_API_KEY"]
    rate_limiter = RateLimiter(max_per_minute=MAX_CALLS_PER_MINUTE)

    # One entry per dataset, keyed like the dict returned by fetch_fundamentals.
    output_paths = {
        "key_metrics": METRICS_PATH,
        "ratios": RATIOS_PATH,
        "growth": GROWTH_PATH,
    }
    # Everything already on disk (None until a dataset is first written).
    saved_frames = {
        "key_metrics": existing_metrics,
        "ratios": existing_ratios,
        "growth": existing_growth,
    }
    # Frames fetched since the last checkpoint.
    pending = {name: [] for name in output_paths}
    newly_done = []

    n_total = len(symbols_to_fetch)
    n_errors = 0

    print(f"Fetching fundamentals for {n_total:,} symbols (3 API calls each)")
    print("-" * 60)

    # The context manager closes the session's pooled connections when the
    # loop finishes or is interrupted.
    with requests.Session() as session:
        for i, symbol in enumerate(tqdm(symbols_to_fetch, desc="Symbols")):
            try:
                results = fetch_fundamentals(session, symbol, api_key, rate_limiter)
            except Exception as e:
                # Best effort: a failed symbol is not marked done, so the next
                # run retries it. Mask the API key in case the error text
                # quotes the request URL.
                n_errors += 1
                message = str(e).replace(api_key, "***") if api_key else str(e)
                tqdm.write(f"Error fetching {symbol}: {message}")
            else:
                for name in output_paths:
                    if not results[name].empty:
                        pending[name].append(results[name])
                newly_done.append(symbol)

            # Save checkpoint. This check runs for failed symbols too, so a
            # failure on a checkpoint index (or on the very last symbol) can
            # no longer skip the save and lose the batch fetched before it.
            if (i + 1) % SAVE_EVERY == 0 or (i + 1) == n_total:
                for name, path in output_paths.items():
                    saved_frames[name] = _save_checkpoint(
                        path, saved_frames[name], pending[name]
                    )
                    pending[name] = []

                # Update progress only after the data files were written.
                done_symbols.update(newly_done)
                PROGRESS_PATH.write_text("\n".join(sorted(done_symbols)))
                newly_done = []

                pct = (i + 1) / n_total * 100
                tqdm.write(f"Checkpoint: {i + 1:,}/{n_total:,} ({pct:.1f}%)")

    print("-" * 60)
    print(f"Done! Errors: {n_errors}")

Fetching fundamentals for 3,060 symbols (3 API calls each)
------------------------------------------------------------


Symbols:   0%|          | 0/3060 [00:00<?, ?it/s]

  combined_metrics = pd.concat(all_metrics, ignore_index=True)


Checkpoint: 500/3,060 (16.3%)


  combined_metrics = pd.concat(all_metrics, ignore_index=True)


Checkpoint: 1,000/3,060 (32.7%)


  combined_metrics = pd.concat(all_metrics, ignore_index=True)


Checkpoint: 1,500/3,060 (49.0%)


  combined_metrics = pd.concat(all_metrics, ignore_index=True)


Checkpoint: 2,000/3,060 (65.4%)


  combined_metrics = pd.concat(all_metrics, ignore_index=True)


Checkpoint: 2,500/3,060 (81.7%)


  combined_metrics = pd.concat(all_metrics, ignore_index=True)


Checkpoint: 3,000/3,060 (98.0%)


  combined_metrics = pd.concat(all_metrics, ignore_index=True)


Checkpoint: 3,060/3,060 (100.0%)
------------------------------------------------------------
Done! Errors: 0


In [8]:
# Verify outputs
metrics = pd.read_parquet(METRICS_PATH)
ratios = pd.read_parquet(RATIOS_PATH)
growth = pd.read_parquet(GROWTH_PATH)

for label, frame in (("Key metrics", metrics), ("Ratios", ratios), ("Growth", growth)):
    print(f"{label}: {len(frame):,} rows, {frame['symbol'].nunique():,} symbols")

# Update universe to only include symbols with fundamentals.
# This overwrites the file the notebook reads its symbol list from, so keep a
# one-time byte-for-byte copy of the unfiltered universe first. Without it the
# dropped symbols cannot be recovered.
UNIVERSE_PATH = Path("data/universe.pqt")
UNIVERSE_BACKUP_PATH = Path("data/universe_unfiltered.pqt")
if not UNIVERSE_BACKUP_PATH.exists():
    UNIVERSE_BACKUP_PATH.write_bytes(UNIVERSE_PATH.read_bytes())
    print(f"Backed up unfiltered universe to {UNIVERSE_BACKUP_PATH}")

fund_symbols = set(metrics["symbol"].unique())
universe = pd.read_parquet(UNIVERSE_PATH)
n_before = len(universe)
universe_filtered = universe[universe["symbol"].isin(fund_symbols)]
n_after = len(universe_filtered)

print(f"\nFiltering universe: {n_before:,} -> {n_after:,} symbols")
universe_filtered.to_parquet(UNIVERSE_PATH, index=False)
print("Saved filtered universe.pqt")

Key metrics: 307,009 rows, 5,564 symbols
Ratios: 307,009 rows, 5,564 symbols
Growth: 307,009 rows, 5,564 symbols

Filtering universe: 6,060 -> 5,564 symbols
Saved filtered universe.pqt


In [9]:
# Header line followed by the full list of key-metrics column names.
print("Key metrics columns:", list(metrics.columns), sep="\n")

Key metrics columns:
['symbol', 'date', 'fiscalYear', 'period', 'reportedCurrency', 'marketCap', 'enterpriseValue', 'evToSales', 'evToOperatingCashFlow', 'evToFreeCashFlow', 'evToEBITDA', 'netDebtToEBITDA', 'currentRatio', 'incomeQuality', 'grahamNumber', 'grahamNetNet', 'taxBurden', 'interestBurden', 'workingCapital', 'investedCapital', 'returnOnAssets', 'operatingReturnOnAssets', 'returnOnTangibleAssets', 'returnOnEquity', 'returnOnInvestedCapital', 'returnOnCapitalEmployed', 'earningsYield', 'freeCashFlowYield', 'capexToOperatingCashFlow', 'capexToDepreciation', 'capexToRevenue', 'salesGeneralAndAdministrativeToRevenue', 'researchAndDevelopementToRevenue', 'stockBasedCompensationToRevenue', 'intangiblesToTotalAssets', 'averageReceivables', 'averagePayables', 'averageInventory', 'daysOfSalesOutstanding', 'daysOfPayablesOutstanding', 'daysOfInventoryOutstanding', 'operatingCycle', 'cashConversionCycle', 'freeCashFlowToEquity', 'freeCashFlowToFirm', 'tangibleAssetValue', 'netCurrentAss

In [10]:
# Header line followed by the full list of ratios column names.
print("Ratios columns:", list(ratios.columns), sep="\n")

Ratios columns:
['symbol', 'date', 'fiscalYear', 'period', 'reportedCurrency', 'grossProfitMargin', 'ebitMargin', 'ebitdaMargin', 'operatingProfitMargin', 'pretaxProfitMargin', 'continuousOperationsProfitMargin', 'netProfitMargin', 'bottomLineProfitMargin', 'receivablesTurnover', 'payablesTurnover', 'inventoryTurnover', 'fixedAssetTurnover', 'assetTurnover', 'currentRatio', 'quickRatio', 'solvencyRatio', 'cashRatio', 'priceToEarningsRatio', 'priceToEarningsGrowthRatio', 'forwardPriceToEarningsGrowthRatio', 'priceToBookRatio', 'priceToSalesRatio', 'priceToFreeCashFlowRatio', 'priceToOperatingCashFlowRatio', 'debtToAssetsRatio', 'debtToEquityRatio', 'debtToCapitalRatio', 'longTermDebtToCapitalRatio', 'financialLeverageRatio', 'workingCapitalTurnoverRatio', 'operatingCashFlowRatio', 'operatingCashFlowSalesRatio', 'freeCashFlowOperatingCashFlowRatio', 'debtServiceCoverageRatio', 'interestCoverageRatio', 'shortTermOperatingCashFlowCoverageRatio', 'operatingCashFlowCoverageRatio', 'capitalEx

In [11]:
# Header line followed by the full list of growth column names.
print("Growth columns:", list(growth.columns), sep="\n")

Growth columns:
['symbol', 'date', 'fiscalYear', 'period', 'reportedCurrency', 'revenueGrowth', 'grossProfitGrowth', 'ebitgrowth', 'operatingIncomeGrowth', 'netIncomeGrowth', 'epsgrowth', 'epsdilutedGrowth', 'weightedAverageSharesGrowth', 'weightedAverageSharesDilutedGrowth', 'dividendsPerShareGrowth', 'operatingCashFlowGrowth', 'receivablesGrowth', 'inventoryGrowth', 'assetGrowth', 'bookValueperShareGrowth', 'debtGrowth', 'rdexpenseGrowth', 'sgaexpensesGrowth', 'freeCashFlowGrowth', 'tenYRevenueGrowthPerShare', 'fiveYRevenueGrowthPerShare', 'threeYRevenueGrowthPerShare', 'tenYOperatingCFGrowthPerShare', 'fiveYOperatingCFGrowthPerShare', 'threeYOperatingCFGrowthPerShare', 'tenYNetIncomeGrowthPerShare', 'fiveYNetIncomeGrowthPerShare', 'threeYNetIncomeGrowthPerShare', 'tenYShareholdersEquityGrowthPerShare', 'fiveYShareholdersEquityGrowthPerShare', 'threeYShareholdersEquityGrowthPerShare', 'tenYDividendperShareGrowthPerShare', 'fiveYDividendperShareGrowthPerShare', 'threeYDividendperShare

In [12]:
# Preview the first five rows of the key-metrics table.
metrics.head(n=5)

Unnamed: 0,symbol,date,fiscalYear,period,reportedCurrency,marketCap,enterpriseValue,evToSales,evToOperatingCashFlow,evToFreeCashFlow,...,averageInventory,daysOfSalesOutstanding,daysOfPayablesOutstanding,daysOfInventoryOutstanding,operatingCycle,cashConversionCycle,freeCashFlowToEquity,freeCashFlowToFirm,tangibleAssetValue,netCurrentAssetValue
0,AACB,2025-06-30,2025,Q2,USD,280071000.0,279830609.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,211191429.0,-12297540.0
1,AACB,2025-03-31,2025,Q1,USD,53145855.4,52789729.4,0.0,-101.868393,-101.868393,...,0.0,0.0,0.0,0.0,0.0,0.0,-162089.0,0.0,209084416.0,-12115144.0
2,AACB,2024-07-31,2024,Q2,USD,0.0,10420.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2776.0,-241432.0
3,AACBU,2025-06-30,2025,Q2,USD,280071000.0,279830609.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,211191429.0,-12297540.0
4,AACBU,2025-03-31,2025,Q1,USD,53145855.4,52789729.4,0.0,-101.868393,-101.868393,...,0.0,0.0,0.0,0.0,0.0,0.0,-162089.0,0.0,209084416.0,-12115144.0
