# Filing Dates

Download actual SEC filing dates from the Financial Modeling Prep (FMP) `income-statement` endpoint.

Each statement includes a `filingDate`, which lets fundamentals be aligned point-in-time on the date they were actually filed instead of assuming a fixed 45-day lag after period end.

In [1]:
import os
import time
import random
from pathlib import Path

import pandas as pd
import requests
from tqdm.auto import tqdm

# Load API key from .env
def load_env_file(path: Path) -> dict[str, str]:
    """Copy ``KEY=VALUE`` pairs from a dotenv-style file into ``os.environ``.

    Blank lines, comment lines (``#``, with or without leading whitespace) and
    lines without ``=`` are skipped. Whitespace around keys and values is
    stripped, and one pair of matching surrounding quotes is removed from the
    value so that ``KEY="abc"`` stores ``abc``. Existing environment variables
    with the same name are overwritten.

    Args:
        path: Location of the file to read. A missing file is not an error.

    Returns:
        The pairs that were written to ``os.environ``.
    """
    loaded: dict[str, str] = {}
    if not path.exists():
        return loaded

    for raw_line in path.read_text().splitlines():
        line = raw_line.strip()
        # Strip first so that indented comments are recognised as comments.
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)  # split once: values may contain "="
        key = key.strip()
        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        if not key:
            # A line such as "=value" has no usable variable name.
            continue
        os.environ[key] = value
        loaded[key] = value
    return loaded


env_path = Path(".env")
load_env_file(env_path)

In [2]:
# Root URL that request paths (e.g. "/income-statement") are appended to.
FMP_BASE_URL: str = "https://financialmodelingprep.com/stable"
# Request budget handed to the rate limiter for the download loop.
MAX_CALLS_PER_MINUTE: int = 700
# Number of symbols processed between on-disk checkpoints.
SAVE_EVERY: int = 500

In [3]:
# Restrict the universe to tickers that already have fundamentals on disk;
# filing dates for anything else would have nothing to align with.
metrics = pd.read_parquet("data/key_metrics.pqt")
unique_symbols = metrics["symbol"].unique()
symbols = unique_symbols.tolist()
print(f"Loaded {len(symbols):,} symbols with fundamentals")

Loaded 5,564 symbols with fundamentals


In [4]:
class RateLimiter:
    """Client-side throttle that spaces successive calls evenly.

    Every ``wait()`` blocks until at least ``60 / max_per_minute`` seconds have
    elapsed since the previous ``wait()`` returned. Elapsed time is measured
    with ``time.monotonic()`` so adjustments to the system clock cannot shorten
    or lengthen the enforced gap. No locking is performed, so an instance
    should be driven from a single sequential loop.

    Args:
        max_per_minute: Maximum number of calls allowed per minute; must be
            positive.

    Raises:
        ValueError: If ``max_per_minute`` is zero or negative.
    """

    def __init__(self, max_per_minute: int = 240):
        if max_per_minute <= 0:
            raise ValueError(
                f"max_per_minute must be positive, got {max_per_minute!r}"
            )
        self.max_per_minute = max_per_minute
        # -inf marks "never called": the first wait() returns immediately no
        # matter where the monotonic clock's origin lies.
        self._last_call = float("-inf")

    @property
    def min_interval(self) -> float:
        """Minimum number of seconds between two consecutive calls."""
        return 60.0 / self.max_per_minute

    def wait(self) -> None:
        """Sleep just long enough to honour ``min_interval``, then record the call."""
        remaining = self.min_interval - (time.monotonic() - self._last_call)
        if remaining > 0:
            time.sleep(remaining)
        self._last_call = time.monotonic()


def request_json(
    session: requests.Session,
    path: str,
    params: dict,
    max_retries: int = 8,
) -> list | dict:
    """GET ``FMP_BASE_URL + path`` and return the decoded JSON body.

    Transient failures (HTTP 429/500/502/503/504, connection errors and
    timeouts) are retried with exponential backoff plus jitter, capped at
    60 seconds. A numeric ``Retry-After`` response header lengthens the pause
    (still capped at 60 seconds). No sleep happens after the final attempt.

    Args:
        session: Session used to issue the request.
        path: Endpoint path appended to ``FMP_BASE_URL``.
        params: Query parameters sent with the request.
        max_retries: Total number of attempts before giving up.

    Returns:
        The parsed JSON payload of the first HTTP 200 response.

    Raises:
        requests.HTTPError: On a non-retryable 4xx/5xx status. The message is
            built from the bare endpoint URL so query parameters (which carry
            the API key) are not echoed into logs.
        requests.ConnectionError, requests.Timeout: If the final attempt fails
            at the transport level.
        RuntimeError: If every attempt hit a retryable status, or the server
            answered with an unexpected non-error, non-200 status.
    """
    retryable_statuses = (429, 500, 502, 503, 504)
    url = f"{FMP_BASE_URL}{path}"

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        backoff = min(60, (2**attempt) + random.random())

        try:
            resp = session.get(url, params=params, timeout=30)
        except (requests.ConnectionError, requests.Timeout):
            # NOTE(review): the library's own message may include the full
            # request URL; callers that log it should redact the API key.
            if is_last_attempt:
                raise
            time.sleep(backoff)
            continue

        if resp.status_code == 200:
            return resp.json()

        if resp.status_code in retryable_statuses:
            if not is_last_attempt:
                delay = backoff
                try:
                    # Only the delta-seconds form is honoured; an HTTP-date
                    # value falls back to the computed backoff.
                    delay = min(60, max(delay, int(resp.headers.get("Retry-After"))))
                except (TypeError, ValueError):
                    pass
                time.sleep(delay)
            continue

        if resp.status_code >= 400:
            # Equivalent to raise_for_status(), minus the query string.
            raise requests.HTTPError(
                f"{resp.status_code} {resp.reason} for url: {url}",
                response=resp,
            )

        # Any other status is neither usable nor worth retrying in a tight loop.
        raise RuntimeError(f"Unexpected status {resp.status_code}: {url}")

    raise RuntimeError(f"Failed after {max_retries} retries: {url}")

In [5]:
def fetch_filing_dates(
    session: requests.Session,
    symbol: str,
    api_key: str,
    rate_limiter: RateLimiter,
    limit: int = 100,  # ~25 years of quarterly data
) -> pd.DataFrame:
    """Fetch quarterly income statements for ``symbol`` and keep the date fields.

    Args:
        session: Session used for the HTTP request.
        symbol: Ticker to query.
        api_key: API key sent as the ``apikey`` query parameter.
        rate_limiter: Throttle consulted once before the request is issued.
        limit: Maximum number of quarterly statements to request.

    Returns:
        One row per returned statement with the columns ``symbol``,
        ``period_end``, ``fiscal_year``, ``period``, ``filing_date`` and
        ``accepted_date``. Fields missing from a statement are ``None``.
        When the API returns nothing the frame is empty but still carries
        these columns.

    Raises:
        RuntimeError: If the API answers with a non-empty payload that is not
            a list of statements (for example an error object).
    """
    columns = [
        "symbol",
        "period_end",
        "fiscal_year",
        "period",
        "filing_date",
        "accepted_date",
    ]

    rate_limiter.wait()
    params = {
        "symbol": symbol,
        "period": "quarter",
        "limit": limit,
        "apikey": api_key,
    }
    data = request_json(session, "/income-statement", params)

    if not data:
        return pd.DataFrame(columns=columns)

    if not isinstance(data, list):
        # Iterating a dict would yield its keys and fail later with an
        # unhelpful AttributeError; surface the payload instead.
        raise RuntimeError(
            f"Unexpected {type(data).__name__} payload for {symbol}: {str(data)[:200]}"
        )

    # Extract only the fields we need
    records = [
        {
            "symbol": symbol,
            "period_end": row.get("date"),
            "fiscal_year": row.get("fiscalYear"),
            "period": row.get("period"),
            "filing_date": row.get("filingDate"),
            "accepted_date": row.get("acceptedDate"),
        }
        for row in data
    ]

    return pd.DataFrame(records, columns=columns)

In [6]:
# Resume support: locate the checkpoint artifacts left by an earlier run
DATA_DIR = Path("data")
PROGRESS_PATH = DATA_DIR / "filing_dates_progress.txt"
OUTPUT_PATH = DATA_DIR / "filing_dates.pqt"

# Symbols already processed, one per line in the progress file
done_symbols = set()
if PROGRESS_PATH.exists():
    progress_lines = PROGRESS_PATH.read_text().strip().split("\n")
    done_symbols = {entry for entry in progress_lines if entry != ""}
    print(f"Found {len(done_symbols):,} completed symbols")
else:
    print("No existing progress found")

# Rows saved so far, or None when starting from scratch
existing_data = pd.read_parquet(OUTPUT_PATH) if OUTPUT_PATH.exists() else None
if existing_data is not None:
    print(f"Loaded {len(existing_data):,} existing rows")

# Preserve the original symbol order while dropping finished ones
symbols_to_fetch = list(filter(lambda sym: sym not in done_symbols, symbols))
print(f"{len(symbols_to_fetch):,} symbols to fetch")

Found 500 completed symbols
Loaded 23,060 existing rows
5,064 symbols to fetch


In [7]:
# Download filing dates for every outstanding symbol, checkpointing to disk
# every SAVE_EVERY symbols and once more after the last one.
if not symbols_to_fetch:
    print("Nothing to fetch")
else:
    api_key = os.environ["FMP_API_KEY"]
    rate_limiter = RateLimiter(max_per_minute=MAX_CALLS_PER_MINUTE)

    # Initialize accumulator
    all_data = [existing_data] if existing_data is not None else []
    batch_data = []
    newly_done = []

    n_total = len(symbols_to_fetch)
    n_errors = 0

    print(f"Fetching filing dates for {n_total:,} symbols")
    print("-" * 60)

    # The context manager releases pooled connections when the loop ends.
    with requests.Session() as session:
        for i, symbol in enumerate(tqdm(symbols_to_fetch, desc="Symbols")):
            try:
                df = fetch_filing_dates(session, symbol, api_key, rate_limiter)
            except Exception as e:
                n_errors += 1
                # Exception text can embed the request URL; never print the key.
                message = str(e).replace(api_key, "***") if api_key else str(e)
                tqdm.write(f"Error fetching {symbol}: {message}")
            else:
                if not df.empty:
                    batch_data.append(df)
                # Symbols with no data are still marked done so they are not
                # requested again on resume; failed symbols are not.
                newly_done.append(symbol)

            # Save checkpoint. This runs even when the current symbol failed,
            # otherwise an error on a boundary (notably the last symbol) would
            # leave the pending batch unsaved.
            if (i + 1) % SAVE_EVERY == 0 or (i + 1) == n_total:
                if batch_data:
                    all_data.extend(batch_data)
                    combined = pd.concat(all_data, ignore_index=True)
                    # Write to a sibling file and swap it in, so an interrupted
                    # write cannot corrupt the previous checkpoint.
                    tmp_path = OUTPUT_PATH.with_name(OUTPUT_PATH.name + ".tmp")
                    combined.to_parquet(tmp_path, index=False)
                    tmp_path.replace(OUTPUT_PATH)
                    batch_data = []
                    all_data = [combined]

                # Update progress only after the data itself is on disk
                done_symbols.update(newly_done)
                PROGRESS_PATH.write_text("\n".join(sorted(done_symbols)))
                newly_done = []

                pct = (i + 1) / n_total * 100
                tqdm.write(f"Checkpoint: {i + 1:,}/{n_total:,} ({pct:.1f}%)")

    print("-" * 60)
    print(f"Done! Errors: {n_errors}")

Fetching filing dates for 5,064 symbols
------------------------------------------------------------


Symbols:   0%|          | 0/5064 [00:00<?, ?it/s]

Checkpoint: 500/5,064 (9.9%)
Checkpoint: 1,000/5,064 (19.7%)
Checkpoint: 1,500/5,064 (29.6%)
Checkpoint: 2,000/5,064 (39.5%)
Checkpoint: 2,500/5,064 (49.4%)
Checkpoint: 3,000/5,064 (59.2%)
Checkpoint: 3,500/5,064 (69.1%)
Checkpoint: 4,000/5,064 (79.0%)
Checkpoint: 4,500/5,064 (88.9%)
Checkpoint: 5,000/5,064 (98.7%)
Checkpoint: 5,064/5,064 (100.0%)
------------------------------------------------------------
Done! Errors: 0


In [8]:
# Reload the saved file and report its size and period coverage
filing_dates = pd.read_parquet(OUTPUT_PATH)
row_count = len(filing_dates)
symbol_count = filing_dates["symbol"].nunique()
print(f"Filing dates: {row_count:,} rows, {symbol_count:,} symbols")
period_ends = filing_dates["period_end"]
print(f"\nDate range: {period_ends.min()} to {period_ends.max()}")

Filing dates: 305,371 rows, 5,476 symbols

Date range: 1985-06-30 to 2025-11-30


In [9]:
# Preview the first ten rows of the saved table
filing_dates.head(10)

Unnamed: 0,symbol,period_end,fiscal_year,period,filing_date,accepted_date
0,AACB,2025-09-30,2025,Q3,2025-11-07,2025-11-06 19:34:54
1,AACB,2025-06-30,2025,Q2,2025-08-07,2025-08-06 21:10:19
2,AACB,2025-03-31,2025,Q1,2025-05-07,2025-05-07 17:16:32
3,AACBU,2025-09-30,2025,Q3,2025-11-07,2025-11-06 19:34:54
4,AACBU,2025-06-30,2025,Q2,2025-08-07,2025-08-06 21:10:19
5,AACBU,2025-03-31,2025,Q1,2025-05-07,2025-05-07 17:16:32
6,AACG,2025-09-30,2025,Q3,2025-11-17,2025-11-17 16:18:33
7,AACG,2025-06-30,2025,Q2,2025-08-06,2025-08-06 17:00:44
8,AACG,2025-03-31,2025,Q1,2025-05-16,2025-05-16 07:00:20
9,AACG,2024-12-31,2024,Q4,2025-04-10,2025-04-10 06:06:11


In [10]:
# Filing lag = whole days between the fiscal period end and the filing date.
# Parsed datetime copies are stored next to the original columns.
for raw_col in ("period_end", "filing_date"):
    filing_dates[f"{raw_col}_dt"] = pd.to_datetime(filing_dates[raw_col])
lag_delta = filing_dates["filing_date_dt"] - filing_dates["period_end_dt"]
filing_dates["filing_lag_days"] = lag_delta.dt.days

lag_days = filing_dates["filing_lag_days"]
print("Filing lag (days from period end to SEC filing):")
print(lag_days.describe())
print("\nPercentiles:")
print(lag_days.quantile([0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]))

Filing lag (days from period end to SEC filing):
count    305363.000000
mean         38.587651
std          89.184467
min        -365.000000
25%          26.000000
50%          38.000000
75%          45.000000
max        8074.000000
Name: filing_lag_days, dtype: float64

Percentiles:
0.10      0.0
0.25     26.0
0.50     38.0
0.75     45.0
0.90     69.0
0.95     86.0
0.99    133.0
Name: filing_lag_days, dtype: float64
