In [1]:
import os
import time
import random
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
import requests
from tqdm.auto import tqdm

# SECURITY: the API key must never be stored in the notebook source (it ends up
# in version control and in every shared copy). Any key that was previously
# committed here should be treated as compromised and rotated.
# Preferred: export FMP_API_KEY before starting the kernel. For interactive
# sessions we fall back to a hidden prompt; non-interactive runs must set the
# environment variable.
if not os.environ.get("FMP_API_KEY"):
    from getpass import getpass

    os.environ["FMP_API_KEY"] = getpass("FMP API key: ")

In [2]:
# --- Run configuration -------------------------------------------------------
# Everything a reader may want to tune lives in this cell.

# How many years of history the date window reaches back.
YEARS = 5

# Request budget handed to the rate limiter (calls per minute).
MAX_CALLS_PER_MINUTE = 700

# Save checkpoint every N symbols.
SAVE_EVERY = 500

# Root URL that request paths are appended to.
FMP_BASE_URL = "https://financialmodelingprep.com/stable"

In [3]:
# Read the universe table and reduce its "symbol" column to a list of
# distinct, non-null symbol strings (first-seen order is kept by unique()).
universe = pd.read_parquet("data/universe.pqt")
symbol_column = universe["symbol"].dropna()
symbols = symbol_column.astype(str).unique().tolist()
print(f"Loaded {len(symbols):,} symbols")

Loaded 6,060 symbols


In [4]:
# Query window: ends now and reaches back YEARS years, counted as
# 365.25 days per year and truncated to a whole number of days.
end_date = datetime.now()
lookback = timedelta(days=int(365.25 * YEARS))
start_date = end_date - lookback

# Both bounds are formatted as YYYY-MM-DD strings.
date_from, date_to = (d.strftime("%Y-%m-%d") for d in (start_date, end_date))

print(f"Date range: {date_from} to {date_to}")

Date range: 2020-12-21 to 2025-12-21


In [5]:
class RateLimiter:
    """Blocking throttle that spaces successive calls evenly in time.

    Each call to :meth:`wait` returns no sooner than ``min_interval`` seconds
    after the previous call to :meth:`wait` returned. The class holds no lock,
    so it is intended for use from a single thread.

    Args:
        max_per_minute: Maximum number of calls allowed per minute; must be
            greater than zero.

    Raises:
        ValueError: If ``max_per_minute`` is not greater than zero.
    """

    def __init__(self, max_per_minute: int = 240):
        if max_per_minute <= 0:
            raise ValueError(f"max_per_minute must be positive, got {max_per_minute!r}")
        self.max_per_minute = max_per_minute
        # None means "no call made yet", so the very first wait() never sleeps.
        self._last_call = None

    @property
    def min_interval(self) -> float:
        """Minimum number of seconds between two consecutive calls."""
        return 60.0 / self.max_per_minute

    def wait(self) -> None:
        """Sleep just long enough to honour ``min_interval``, then record the call."""
        if self._last_call is not None:
            # Use the monotonic clock: the wall clock (time.time) can be stepped
            # backwards or forwards, which would produce a huge sleep or none.
            elapsed = time.monotonic() - self._last_call
            remaining = self.min_interval - elapsed
            if remaining > 0:
                time.sleep(remaining)
        self._last_call = time.monotonic()


def request_json(
    session: requests.Session,
    path: str,
    params: dict,
    max_retries: int = 8,
) -> list | dict:
    """GET ``FMP_BASE_URL + path`` and return the decoded JSON body.

    Responses with status 429/500/502/503/504, as well as connection errors
    and timeouts, are retried with exponential backoff plus random jitter
    (each pause capped at 60 seconds). No pause is made after the final
    attempt.

    Args:
        session: Session used to issue the request.
        path: Path appended verbatim to ``FMP_BASE_URL``.
        params: Query-string parameters passed to ``session.get``.
        max_retries: Maximum number of attempts.

    Returns:
        Whatever ``resp.json()`` yields for a 200 response.

    Raises:
        requests.HTTPError: On a non-retryable status >= 400. The message is
            built from the URL *without* the query string so that credentials
            passed in ``params`` are not echoed into logs by this function.
        requests.ConnectionError, requests.Timeout: If the final attempt fails
            at the transport level (the original exception is re-raised).
            NOTE(review): these messages may still contain the full request
            URL — confirm before logging them verbatim.
        RuntimeError: On an unexpected non-error status other than 200, or
            when every attempt returned a retryable status.
    """
    url = f"{FMP_BASE_URL}{path}"
    retryable_statuses = (429, 500, 502, 503, 504)

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            resp = session.get(url, params=params, timeout=30)
        except (requests.ConnectionError, requests.Timeout):
            # Transient network failure: retry, but surface the original
            # exception once the attempts are used up.
            if is_last_attempt:
                raise
        else:
            if resp.status_code == 200:
                return resp.json()
            if resp.status_code not in retryable_statuses:
                if resp.status_code >= 400:
                    # Equivalent to raise_for_status(), minus the query string
                    # (which carries the API key) in the message.
                    raise requests.HTTPError(
                        f"{resp.status_code} {resp.reason} for url: {url}",
                        response=resp,
                    )
                # e.g. 204: retrying immediately would not help.
                raise RuntimeError(f"Unexpected status {resp.status_code}: {url}")

        if not is_last_attempt:
            sleep_time = min(60, (2**attempt) + random.random())
            time.sleep(sleep_time)

    raise RuntimeError(f"Failed after {max_retries} retries: {url}")

In [6]:
def fetch_prices(session: requests.Session, symbol: str, date_from: str, date_to: str, api_key: str) -> pd.DataFrame:
    """Download the historical end-of-day price rows for one symbol.

    Calls ``request_json`` on ``/historical-price-eod/full`` with the symbol,
    the ``date_from``/``date_to`` bounds and the API key. A falsy payload
    yields an empty DataFrame; otherwise the payload is loaded into a
    DataFrame whose ``symbol`` column is set to ``symbol``.
    """
    query = {"symbol": symbol, "from": date_from, "to": date_to, "apikey": api_key}
    payload = request_json(session, "/historical-price-eod/full", query)

    if payload:
        # Tag every row with the requested symbol.
        return pd.DataFrame(payload).assign(symbol=symbol)
    return pd.DataFrame()

In [7]:
# Resume support: where prices are stored and where finished symbols are listed
PRICES_PATH = Path("data/prices.pqt")
PROGRESS_PATH = Path("data/prices_progress.txt")

done_symbols = set()
if not PROGRESS_PATH.exists():
    print("No existing progress found")
else:
    # The progress file holds one symbol per line; blank entries are dropped
    done_symbols.update(PROGRESS_PATH.read_text().strip().split("\n"))
    done_symbols.discard("")
    print(f"Found {len(done_symbols):,} completed symbols")

# Saved prices are only reloaded when there is recorded progress to resume from
existing_df = None
if PRICES_PATH.exists() and done_symbols:
    existing_df = pd.read_parquet(PRICES_PATH)
    print(f"Loaded {len(existing_df):,} existing price records")

# Keep the order of `symbols`, skipping everything already marked as done
symbols_to_fetch = [sym for sym in symbols if sym not in done_symbols]
print(f"{len(symbols_to_fetch):,} symbols to fetch")

No existing progress found
6,060 symbols to fetch


In [8]:
# Fetch every outstanding symbol, checkpointing the combined prices and the
# list of completed symbols every SAVE_EVERY symbols and at the very end.
if len(symbols_to_fetch) == 0:
    print("Nothing to fetch")
else:
    api_key = os.environ["FMP_API_KEY"]
    rate_limiter = RateLimiter(max_per_minute=MAX_CALLS_PER_MINUTE)

    all_dfs = [existing_df] if existing_df is not None else []
    batch_dfs = []
    newly_done = []

    n_total = len(symbols_to_fetch)
    n_errors = 0
    # Number of rows currently persisted; tracked separately so the checkpoint
    # message works even when no new data has been written yet.
    n_saved = len(existing_df) if existing_df is not None else 0

    print(f"Fetching prices for {n_total:,} symbols")
    print("-" * 60)

    # The context manager closes the session's connections when the loop ends.
    with requests.Session() as session:
        for i, symbol in enumerate(tqdm(symbols_to_fetch, desc="Symbols")):
            rate_limiter.wait()

            try:
                df = fetch_prices(session, symbol, date_from, date_to, api_key)
            except Exception as e:
                # Failed symbols are not marked as done, so a later run retries
                # them. Do NOT `continue` here: the checkpoint below must still
                # run when the failing symbol falls on a checkpoint boundary
                # (in particular when it is the last symbol).
                n_errors += 1
                tqdm.write(f"Error fetching {symbol}: {e}")
            else:
                if not df.empty:
                    batch_dfs.append(df)
                newly_done.append(symbol)

            # Save checkpoint
            if (i + 1) % SAVE_EVERY == 0 or (i + 1) == n_total:
                if batch_dfs:
                    all_dfs.extend(batch_dfs)
                    combined = pd.concat(all_dfs, ignore_index=True)
                    combined.to_parquet(PRICES_PATH, index=False)
                    batch_dfs = []
                    all_dfs = [combined]  # Keep only combined for next iteration
                    n_saved = len(combined)

                # Update progress file (after the data, so a symbol is only
                # listed as done once its rows have been written)
                done_symbols.update(newly_done)
                PROGRESS_PATH.write_text("\n".join(sorted(done_symbols)))
                newly_done = []

                pct = (i + 1) / n_total * 100
                tqdm.write(f"Checkpoint: {i + 1:,}/{n_total:,} ({pct:.1f}%) - {n_saved:,} records saved")

    print("-" * 60)
    print(f"Done! Errors: {n_errors}")

Fetching prices for 6,060 symbols
------------------------------------------------------------


Symbols:   0%|          | 0/6060 [00:00<?, ?it/s]

Checkpoint: 500/6,060 (8.3%) - 476,805 records saved
Checkpoint: 1,000/6,060 (16.5%) - 965,317 records saved
Checkpoint: 1,500/6,060 (24.8%) - 1,461,231 records saved
Checkpoint: 2,000/6,060 (33.0%) - 1,928,348 records saved
Checkpoint: 2,500/6,060 (41.3%) - 2,406,740 records saved
Checkpoint: 3,000/6,060 (49.5%) - 2,885,781 records saved
Checkpoint: 3,500/6,060 (57.8%) - 3,348,024 records saved
Checkpoint: 4,000/6,060 (66.0%) - 3,833,192 records saved
Checkpoint: 4,500/6,060 (74.3%) - 4,341,922 records saved
Checkpoint: 5,000/6,060 (82.5%) - 4,855,062 records saved
Checkpoint: 5,500/6,060 (90.8%) - 5,333,252 records saved
Checkpoint: 6,000/6,060 (99.0%) - 5,825,844 records saved
Checkpoint: 6,060/6,060 (100.0%) - 5,888,410 records saved
------------------------------------------------------------
Done! Errors: 0


In [9]:
# Reload the saved parquet file and report its row count, symbol coverage,
# date span and column names, then preview the first rows.
prices = pd.read_parquet(PRICES_PATH)
for report_line in (
    f"Total records: {len(prices):,}",
    f"Unique symbols: {prices['symbol'].nunique():,}",
    f"Date range: {prices['date'].min()} to {prices['date'].max()}",
    f"\nColumns: {list(prices.columns)}",
):
    print(report_line)
prices.head()

Total records: 5,888,410
Unique symbols: 5,644
Date range: 2020-12-21 to 2025-12-19

Columns: ['symbol', 'date', 'open', 'high', 'low', 'close', 'volume', 'change', 'changePercent', 'vwap']


Unnamed: 0,symbol,date,open,high,low,close,volume,change,changePercent,vwap
0,AACB,2025-12-19,10.32,10.32,10.3,10.3,70900,-0.02,-0.1938,10.31
1,AACB,2025-12-18,10.31,10.31,10.3,10.3,459400,-0.01,-0.096993,10.31
2,AACB,2025-12-17,10.34,10.34,10.29,10.31,3900,-0.03,-0.29014,10.32
3,AACB,2025-12-16,10.31,10.31,10.31,10.31,257800,0.0,0.0,10.31
4,AACB,2025-12-15,10.32,10.33,10.3,10.3,6545,-0.02,-0.1938,10.31


In [10]:
# Summary: average number of rows per distinct symbol, and the on-disk size
# of the prices file in megabytes (bytes / 1e6).
print(
    f"Records per symbol: {len(prices) / prices['symbol'].nunique():.0f} avg",
    f"\nFile size: {PRICES_PATH.stat().st_size / 1e6:.1f} MB",
    sep="\n",
)

Records per symbol: 1043 avg

File size: 136.4 MB
