# Missing Data Report


In [1]:
# Install required packages
!pip install pandas numpy ipython


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


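Check for gaps in trading history to plan masking or imputation strategies for transformer inputs. Gaps are measured against a plain Monday–Friday business-day calendar, so exchange holidays count as missing days; a baseline missing ratio of a few percent is therefore typical even for otherwise complete multi-year histories.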


In [2]:
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List

import numpy as np
import pandas as pd
from IPython.display import display


In [3]:
@dataclass
class GapSummary:
    """Per-symbol summary of date coverage, as produced by ``analyse_symbol``.

    One instance describes a single price-history file: the observed date
    range, how many distinct dates it contains, and how that compares with the
    Monday-Friday business days spanned by that range.
    """

    symbol: str  # stem of the source file name (``path.stem``), e.g. 'tcbiw.us'
    category: str  # name of the directory containing the file, e.g. 'Stocks' or 'ETFs'
    start_date: pd.Timestamp  # earliest observed date (time of day removed)
    end_date: pd.Timestamp  # latest observed date (time of day removed)
    total_days: int  # calendar days from start_date to end_date, both inclusive
    observed_points: int  # number of distinct calendar dates present in the file
    expected_business_days: int  # Monday-Friday days counted over the observed range
    missing_business_days: int  # expected business days without an observation; never negative
    coverage_ratio: float  # observed dates relative to expected business days
    worst_gap_days: int  # longest run of business days missing between two consecutive observations


def _count_business_days(start: np.datetime64, end: np.datetime64) -> int:
    """Count Monday-Friday business days in the closed range ``[start, end]``.

    Both endpoints are truncated to whole days before counting. NumPy's default
    business-day calendar is used (Monday-Friday, no holidays), so exchange
    holidays are counted as business days.

    Args:
        start: First day of the range (any datetime64 resolution).
        end: Last day of the range, inclusive (any datetime64 resolution).

    Returns:
        The number of weekdays from ``start`` through ``end``; 0 when the range
        contains only weekend days.
    """
    start_d = start.astype('datetime64[D]')
    end_d = end.astype('datetime64[D]')
    # busday_count works on the half-open interval [begin, end). Moving the end
    # forward by one day includes `end` itself when it is a weekday, without
    # adding a phantom day when `end` falls on a weekend (which `+ 1` did).
    return int(np.busday_count(start_d, end_d + np.timedelta64(1, 'D')))


def analyse_symbol(path: Path) -> GapSummary | None:
    """Summarise the date coverage of one symbol file.

    Reads the CSV at ``path`` (it must contain a ``Date`` column), reduces it
    to its distinct calendar dates and compares them with the Monday-Friday
    business days between the first and the last date.

    Args:
        path: CSV file to analyse. ``path.stem`` becomes the symbol and
            ``path.parent.name`` the category of the summary.

    Returns:
        A ``GapSummary``, or ``None`` when the file cannot be read, is empty,
        holds fewer than two distinct valid dates, or spans no business days.
        Read failures and rows with unusable dates are reported as ``[WARN]``
        lines on stdout instead of raising, so one bad file cannot abort a
        batch run.
    """
    try:
        df = pd.read_csv(path, parse_dates=['Date'])
        # Coerce rather than raise: a malformed date in one row should not
        # crash the whole run (NaT would otherwise make busday_count raise).
        parsed = pd.to_datetime(df['Date'], errors='coerce')
    except Exception as exc:
        print(f'[WARN] Could not process {path}: {exc}')
        return None

    if df.empty:
        return None

    invalid = int(parsed.isna().sum())
    if invalid:
        print(f'[WARN] {path}: ignoring {invalid} row(s) with missing or unparseable dates')

    # Distinct calendar days in ascending order.
    dates = (
        parsed.dropna()
        .dt.normalize()
        .drop_duplicates()
        .sort_values()
        .to_numpy(dtype='datetime64[D]')
    )
    if len(dates) < 2:
        return None

    start = dates[0]
    end = dates[-1]
    expected = _count_business_days(start, end)
    if expected <= 0:
        return None

    observed = len(dates)
    # Only Monday-Friday observations can fill an expected business day;
    # weekend rows must not offset sessions that are genuinely missing.
    observed_business = int(np.count_nonzero(np.is_busday(dates)))

    # Business days strictly between consecutive observations. Starting the
    # count one day after the earlier date is correct whether or not that date
    # is itself a business day; dates are strictly increasing, so the counts
    # are never negative.
    one_day = np.timedelta64(1, 'D')
    missing_between = np.busday_count(dates[:-1] + one_day, dates[1:])
    worst_gap = int(missing_between.max(initial=0))
    total_missing = max(expected - observed_business, 0)

    return GapSummary(
        symbol=path.stem,
        category=path.parent.name,
        start_date=pd.Timestamp(start),
        end_date=pd.Timestamp(end),
        total_days=int((end - start) / one_day) + 1,
        observed_points=observed,
        expected_business_days=expected,
        missing_business_days=total_missing,
        coverage_ratio=observed_business / expected,
        worst_gap_days=worst_gap,
    )


def gather_gaps(files: Iterable[Path]) -> List[GapSummary]:
    """Analyse every file in ``files`` and keep the summaries that were produced.

    Files for which ``analyse_symbol`` returns ``None`` are skipped; the
    remaining summaries keep the order of ``files``.
    """
    attempts = (analyse_symbol(candidate) for candidate in files)
    return [outcome for outcome in attempts if outcome is not None]


def build_dataframe(summaries: Iterable[GapSummary]) -> pd.DataFrame:
    """Turn gap summaries into a DataFrame with two derived ratio columns.

    Adds ``missing_ratio`` (missing business days over expected business days,
    the denominator floored at 1) and ``avg_gap`` (missing business days per
    interval between observed points).

    Raises:
        RuntimeError: If ``summaries`` yields no rows.
    """
    records = [vars(entry) for entry in summaries]
    frame = pd.DataFrame(records)
    if frame.empty:
        raise RuntimeError('No gap summaries produced. Check data path.')

    missing = frame['missing_business_days']
    expected_floor = frame['expected_business_days'].clip(lower=1)
    intervals = frame['observed_points'] - 1

    frame['missing_ratio'] = missing / expected_floor
    frame['avg_gap'] = missing / intervals
    return frame


def print_headlines(df: pd.DataFrame) -> None:
    """Print headline missing-data statistics and show per-category medians.

    Args:
        df: Report frame containing the ``missing_ratio``, ``worst_gap_days``,
            ``coverage_ratio`` and ``category`` columns.
    """
    # Pull the statistics into locals first: reusing the enclosing quote
    # character inside an f-string replacement field is a SyntaxError on
    # Python versions before 3.12.
    missing_ratio = df['missing_ratio']
    median_missing = missing_ratio.median()
    p90_missing = missing_ratio.quantile(0.9)
    median_worst_gap = df['worst_gap_days'].median()

    print('=== Missing Data Snapshot ===')
    print(f'Symbols analysed: {len(df)}')
    # No ' ' sign flag in the format specs: it only padded these non-negative
    # values with a second space after the label.
    print(
        f'Median missing ratio: {median_missing:.4%} '
        f'(90th pct: {p90_missing:.2%})'
    )
    print(f'Median worst gap (business days): {median_worst_gap:.1f}')

    by_category = (
        df.groupby('category')[['missing_ratio', 'worst_gap_days', 'coverage_ratio']]
        .median()
        .sort_index()
    )
    display(by_category)


def display_outliers(df: pd.DataFrame, limit: int) -> None:
    """Show the ``limit`` symbols with the highest and the lowest ``missing_ratio``.

    The worst-coverage table additionally lists each symbol's date range.
    """
    shared_columns = ['symbol', 'category', 'missing_ratio', 'worst_gap_days']
    sections = (
        (
            'Tickers with the largest gaps:',
            df.nlargest(limit, 'missing_ratio'),
            shared_columns + ['start_date', 'end_date'],
        ),
        (
            'Tickers with the cleanest coverage:',
            df.nsmallest(limit, 'missing_ratio'),
            shared_columns,
        ),
    )
    for heading, subset, columns in sections:
        print(heading)
        display(subset[columns])


## Configuration


In [4]:
# Root directory of the price files; the run cell globs '*/*.txt' beneath it,
# i.e. one sub-directory per category holding one text file per symbol.
DATA_ROOT = Path('../stock_data')
# Number of rows shown in each of the worst-coverage / best-coverage tables.
LIMIT = 5
# Set to a Path to also write the full report as CSV; a falsy value skips the export.
OUTPUT_CSV = None  # e.g. Path('outputs/missing_data_report.csv')


## Run Analysis


In [5]:
# Collect <DATA_ROOT>/<category>/<symbol>.txt files; sorting keeps the run order deterministic.
files = sorted(DATA_ROOT.glob('*/*.txt'))
if not files:
    raise RuntimeError(f'No symbol files found under {DATA_ROOT}')

# Files that cannot be read or yield no summary are skipped; read failures
# show up as [WARN] lines in the cell output.
summaries = gather_gaps(files)
df = build_dataframe(summaries)

print_headlines(df)
display_outliers(df, LIMIT)

# Optional CSV export, enabled by setting OUTPUT_CSV in the configuration cell.
if OUTPUT_CSV:
    OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUTPUT_CSV, index=False)
    print(f'[INFO] Missing data report saved to {OUTPUT_CSV}')


[WARN] Could not process ../stock_data/Stocks/accp.us.txt: No columns to parse from file
[WARN] Could not process ../stock_data/Stocks/amrh.us.txt: No columns to parse from file
[WARN] Could not process ../stock_data/Stocks/amrhw.us.txt: No columns to parse from file
[WARN] Could not process ../stock_data/Stocks/asns.us.txt: No columns to parse from file
[WARN] Could not process ../stock_data/Stocks/bbrx.us.txt: No columns to parse from file
[WARN] Could not process ../stock_data/Stocks/bolt.us.txt: No columns to parse from file
[WARN] Could not process ../stock_data/Stocks/boxl.us.txt: No columns to parse from file
[WARN] Could not process ../stock_data/Stocks/bxg.us.txt: No columns to parse from file
[WARN] Could not process ../stock_data/Stocks/ehr.us.txt: No columns to parse from file
[WARN] Could not process ../stock_data/Stocks/fmax.us.txt: No columns to parse from file
[WARN] Could not process ../stock_data/Stocks/gnst.us.txt: No columns to parse from file
[WARN] Could not proce

Unnamed: 0_level_0,missing_ratio,worst_gap_days,coverage_ratio
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ETFs,0.036296,2.0,0.963704
Stocks,0.03468,2.0,0.96532


Tickers with the largest gaps:


Unnamed: 0,symbol,category,missing_ratio,worst_gap_days,start_date,end_date
7584,tcbiw.us,Stocks,0.981132,189,2016-12-06,2017-09-27
3161,dlbl.us,Stocks,0.97093,94,2017-02-14,2017-10-11
6109,oacqu.us,Stocks,0.959459,96,2016-12-13,2017-10-18
3256,dtyl.us,Stocks,0.951754,130,2016-12-21,2017-11-03
5682,mtb_c.us,Stocks,0.949309,54,2016-11-08,2017-09-06


Tickers with the cleanest coverage:


Unnamed: 0,symbol,category,missing_ratio,worst_gap_days
1377,ablx.us,Stocks,0.0,0
1411,acmr.us,Stocks,0.0,0
1555,aieq.us,Stocks,0.0,0
1621,alna.us,Stocks,0.0,0
1626,alp_o-cl.us,Stocks,0.0,0
