# Fundamentals Data

Download quarterly fundamentals from Financial Modeling Prep (FMP):
- Key metrics (P/E, EV/EBITDA, etc.)
- Financial ratios (ROE, margins, etc.)
- Financial growth (YoY revenue/earnings growth)

In [1]:
import os
import time
import random
from pathlib import Path

import pandas as pd
import requests
from tqdm.auto import tqdm

# The API key must come from the environment (or an interactive prompt), never
# from the notebook source: anything committed here is visible to everyone the
# notebook is shared with. A key that has ever been stored in notebook source
# should be treated as leaked and rotated.
if not os.environ.get("FMP_API_KEY"):
    from getpass import getpass

    os.environ["FMP_API_KEY"] = getpass("FMP API key: ")

In [2]:
# Upper bound on API requests per minute; handed to the RateLimiter below.
MAX_CALLS_PER_MINUTE = 700

# Number of symbols processed between checkpoints written to disk.
SAVE_EVERY = 500

# Root of every endpoint path requested from FMP.
FMP_BASE_URL = "https://financialmodelingprep.com/stable"

In [3]:
# Read the universe table and reduce it to a de-duplicated list of ticker strings.
universe = pd.read_parquet("data/universe.pqt")
symbol_column = universe["symbol"].dropna().astype(str)
symbols = symbol_column.unique().tolist()
print(f"Loaded {len(symbols):,} symbols")

Loaded 6,060 symbols


In [4]:
class RateLimiter:
    """Space out calls so that at most ``max_per_minute`` happen per minute.

    The limiter enforces a minimum gap of ``60 / max_per_minute`` seconds
    between consecutive ``wait()`` returns. It is a simple single-caller
    throttle: no locking is done around the shared timestamp.

    Args:
        max_per_minute: Maximum number of calls allowed per minute; must be
            positive.

    Raises:
        ValueError: If ``max_per_minute`` is zero or negative.
    """

    def __init__(self, max_per_minute: int = 240):
        if max_per_minute <= 0:
            raise ValueError(
                f"max_per_minute must be positive, got {max_per_minute!r}"
            )
        self.max_per_minute = max_per_minute
        # None until the first call, so the very first wait() never sleeps.
        self._last_call = None

    @property
    def min_interval(self) -> float:
        """Minimum number of seconds between two consecutive calls."""
        return 60.0 / self.max_per_minute

    def wait(self) -> None:
        """Block until at least ``min_interval`` has passed since the last call."""
        if self._last_call is not None:
            # Monotonic clock: immune to wall-clock adjustments (NTP, DST),
            # which could otherwise produce negative or huge elapsed times.
            remaining = self.min_interval - (time.monotonic() - self._last_call)
            if remaining > 0:
                time.sleep(remaining)
        self._last_call = time.monotonic()


def request_json(
    session: requests.Session,
    path: str,
    params: dict,
    max_retries: int = 8,
) -> list | dict:
    """GET ``FMP_BASE_URL + path`` and return the decoded JSON body.

    Transient failures (HTTP 429/5xx, connection errors, timeouts) are retried
    with exponential backoff plus jitter, capped at 60 seconds per wait.

    Args:
        session: Session used to issue the request.
        path: Endpoint path appended to ``FMP_BASE_URL``.
        params: Query parameters sent with the request.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The parsed JSON payload of the first HTTP 200 response.

    Raises:
        requests.HTTPError: On a non-retryable HTTP error status. The message
            contains the URL without its query string.
        RuntimeError: When every attempt failed with a retryable problem.
    """
    url = f"{FMP_BASE_URL}{path}"
    retryable_statuses = (429, 500, 502, 503, 504)

    for attempt in range(max_retries):
        try:
            resp = session.get(url, params=params, timeout=30)
        except (requests.ConnectionError, requests.Timeout):
            # Network hiccup: treat like a retryable status and back off below.
            pass
        else:
            if resp.status_code == 200:
                return resp.json()
            if resp.status_code >= 400 and resp.status_code not in retryable_statuses:
                # Build the message by hand: raise_for_status() embeds the full
                # request URL, query string included, and ``params`` carries
                # the API key, which would then end up in logs and outputs.
                raise requests.HTTPError(
                    f"{resp.status_code} {resp.reason} for {url}", response=resp
                )

        # No point sleeping after the final attempt; fail right away instead.
        if attempt < max_retries - 1:
            time.sleep(min(60, (2**attempt) + random.random()))

    raise RuntimeError(f"Failed after {max_retries} retries: {url}")

In [5]:
def fetch_fundamentals(
    session: requests.Session,
    symbol: str,
    api_key: str,
    rate_limiter: RateLimiter,
) -> dict[str, pd.DataFrame]:
    """Download the three quarterly fundamentals datasets for one symbol.

    One throttled request is made per dataset. The returned dict always has
    the keys ``"key_metrics"``, ``"ratios"`` and ``"growth"``; a dataset with
    an empty response maps to an empty DataFrame, otherwise to the response
    rows with a ``symbol`` column set to ``symbol``.
    """
    routes = (
        ("key_metrics", "/key-metrics"),
        ("ratios", "/ratios"),
        ("growth", "/financial-growth"),
    )
    # Identical for all three endpoints, so build it once.
    query = {"symbol": symbol, "period": "quarter", "apikey": api_key}

    frames = {}
    for label, route in routes:
        rate_limiter.wait()
        payload = request_json(session, route, query)
        if not payload:
            frames[label] = pd.DataFrame()
            continue
        frame = pd.DataFrame(payload)
        frame["symbol"] = symbol
        frames[label] = frame

    return frames

In [6]:
# Check for existing progress
# Output locations: one parquet file per dataset, plus a plain-text ledger
# holding one completed symbol per line.
DATA_DIR = Path("data")

METRICS_PATH = DATA_DIR / "key_metrics.pqt"
RATIOS_PATH = DATA_DIR / "ratios.pqt"
GROWTH_PATH = DATA_DIR / "growth.pqt"

PROGRESS_PATH = DATA_DIR / "fundamentals_progress.txt"

if not PROGRESS_PATH.exists():
    done_symbols = set()
    print("No existing progress found")
else:
    recorded_lines = PROGRESS_PATH.read_text().strip().split("\n")
    # An empty ledger yields a single "" entry; leave it out of the set.
    done_symbols = {entry for entry in recorded_lines if entry != ""}
    print(f"Found {len(done_symbols):,} completed symbols")

# Load existing data if resuming
def load_existing(path: Path) -> pd.DataFrame | None:
    if path.exists():
        return pd.read_parquet(path)
    return None

def _completed_rows(frame: pd.DataFrame | None) -> pd.DataFrame | None:
    """Keep only the rows whose symbol is recorded in ``done_symbols``.

    Symbols missing from the progress file are fetched again below. If their
    old rows were kept (for example after a run that was interrupted between
    writing the data and writing the progress file), the refetched rows would
    be appended on top of them and the symbol would appear twice.

    Returns ``None`` when there is no frame or nothing is left after filtering,
    so that no empty frame takes part in later concatenation.
    """
    if frame is None or "symbol" not in frame.columns:
        return frame
    kept = frame[frame["symbol"].isin(done_symbols)]
    if kept.empty:
        return None
    return kept.reset_index(drop=True)


existing_metrics = _completed_rows(load_existing(METRICS_PATH))
existing_ratios = _completed_rows(load_existing(RATIOS_PATH))
existing_growth = _completed_rows(load_existing(GROWTH_PATH))

symbols_to_fetch = [s for s in symbols if s not in done_symbols]
print(f"{len(symbols_to_fetch):,} symbols to fetch")

No existing progress found
6,060 symbols to fetch


In [None]:
def _flush_batch(accumulated: list, batch: list, path: Path) -> list:
    """Persist ``accumulated + batch`` to ``path`` and return the new accumulator.

    An empty ``batch`` is a no-op: nothing is written and ``accumulated`` is
    returned as is. Otherwise all frames are concatenated, written to a
    sibling ``*.tmp`` file and then moved over ``path``, so the previous file
    is only replaced once the new one has been written completely.
    """
    if not batch:
        return accumulated
    combined = pd.concat([*accumulated, *batch], ignore_index=True)
    tmp_path = path.with_name(path.name + ".tmp")
    combined.to_parquet(tmp_path, index=False)
    tmp_path.replace(path)
    return [combined]


if len(symbols_to_fetch) == 0:
    print("Nothing to fetch")
else:
    api_key = os.environ["FMP_API_KEY"]
    rate_limiter = RateLimiter(max_per_minute=MAX_CALLS_PER_MINUTE)

    # Destination file per dataset; the keys match those of fetch_fundamentals().
    targets = {
        "key_metrics": METRICS_PATH,
        "ratios": RATIOS_PATH,
        "growth": GROWTH_PATH,
    }
    # Frames already on disk (when resuming) seed each accumulator.
    accumulated = {
        "key_metrics": [existing_metrics] if existing_metrics is not None else [],
        "ratios": [existing_ratios] if existing_ratios is not None else [],
        "growth": [existing_growth] if existing_growth is not None else [],
    }
    # Frames fetched since the last checkpoint.
    batches = {name: [] for name in targets}
    newly_done = []

    n_total = len(symbols_to_fetch)
    n_errors = 0

    print(f"Fetching fundamentals for {n_total:,} symbols (3 API calls each)")
    print("-" * 60)

    # The context manager closes the session's connections when the loop ends,
    # whether it finishes normally or is interrupted.
    with requests.Session() as session:
        for i, symbol in enumerate(tqdm(symbols_to_fetch, desc="Symbols")):
            try:
                results = fetch_fundamentals(session, symbol, api_key, rate_limiter)
            except Exception as e:
                # Best effort: record the failure and move on. The symbol is
                # not marked done, so a later run retries it.
                n_errors += 1
                message = str(e)
                if api_key:
                    # Request errors may quote the full URL, key included.
                    message = message.replace(api_key, "***")
                tqdm.write(f"Error fetching {symbol}: {message}")
            else:
                for name in targets:
                    frame = results[name]
                    if not frame.empty:
                        batches[name].append(frame)
                newly_done.append(symbol)

            # The checkpoint must run even when this symbol failed; otherwise a
            # failure on a checkpoint index, or on the very last symbol, would
            # leave the pending batch unsaved.
            if (i + 1) % SAVE_EVERY == 0 or (i + 1) == n_total:
                for name, path in targets.items():
                    accumulated[name] = _flush_batch(
                        accumulated[name], batches[name], path
                    )
                    batches[name] = []

                # Progress is recorded only after the data has been written.
                done_symbols.update(newly_done)
                PROGRESS_PATH.write_text("\n".join(sorted(done_symbols)))
                newly_done = []

                pct = (i + 1) / n_total * 100
                tqdm.write(f"Checkpoint: {i + 1:,}/{n_total:,} ({pct:.1f}%)")

    print("-" * 60)
    print(f"Done! Errors: {n_errors}")

Fetching fundamentals for 6,060 symbols (3 API calls each)
------------------------------------------------------------


Symbols:   0%|          | 0/6060 [00:00<?, ?it/s]

  combined_metrics = pd.concat(all_metrics, ignore_index=True)


In [None]:
# Verify outputs
# Read back what the fetch loop wrote and summarise row and symbol counts.
metrics, ratios, growth = (
    pd.read_parquet(saved_path)
    for saved_path in (METRICS_PATH, RATIOS_PATH, GROWTH_PATH)
)

print(f"Key metrics: {len(metrics):,} rows, {metrics['symbol'].nunique():,} symbols")
print(f"Ratios: {len(ratios):,} rows, {ratios['symbol'].nunique():,} symbols")
print(f"Growth: {len(growth):,} rows, {growth['symbol'].nunique():,} symbols")

In [None]:
# Column names of the key-metrics table, as a plain list.
print("Key metrics columns:")
print([*metrics.columns])

In [None]:
# Column names of the ratios table, as a plain list.
print("Ratios columns:")
print([*ratios.columns])

In [None]:
# Column names of the growth table, as a plain list.
print("Growth columns:")
print([*growth.columns])

In [None]:
# Preview the first five rows of the key-metrics table.
metrics.head(5)