# 02 — Exogenous Data Healthcheck

Checks:
- Which CSVs exist
- Time coverage (min/max year)
- Missing value counts (in the `value` column)

This is a quick “does the folder look sane?” check.

In [None]:
from pathlib import Path
import pandas as pd
import yaml
import matplotlib.pyplot as plt

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that is the repo root.

    A directory counts as the repo root when it contains both the file
    ``configs/time.yml`` and the entry ``data/exogenous``. The search begins
    at *start* itself and then walks through each of its parents in order.

    Raises:
        FileNotFoundError: If no directory on the way up matches.
    """
    candidates = (start, *start.parents)
    for directory in candidates:
        # Check the config marker first; skip early when it is absent.
        if not (directory / "configs" / "time.yml").exists():
            continue
        if (directory / "data" / "exogenous").exists():
            return directory
    raise FileNotFoundError("Could not locate repo root containing configs/ and data/exogenous/")

# Locate the repository root starting from the current working directory,
# then derive the config and exogenous-data folders from it.
ROOT = find_repo_root(Path(".").resolve())
CFG = ROOT.joinpath("configs")
DATA = ROOT.joinpath("data", "exogenous")

def load_yaml(path):
    """Open *path* as UTF-8 text and return its contents parsed by ``yaml.safe_load``."""
    with open(path, encoding="utf-8") as stream:
        parsed = yaml.safe_load(stream)
    return parsed


# Build a one-row-per-file health summary of every CSV directly inside DATA:
# row count, number of NaN cells in the `value` column, and min/max of `t`.
# Files lacking a `value` or `t` column get None for the corresponding fields.
summary_columns = ['file', 'rows', 'missing_value_cells', 't_min', 't_max']

csvs = sorted(DATA.glob('*.csv'))
print('CSV files:', len(csvs))
summary = []
for p in csvs:
    df = pd.read_csv(p)
    n = len(df)
    miss = df['value'].isna().sum() if 'value' in df.columns else None
    tmin = tmax = None
    if 't' in df.columns:
        tmin, tmax = df['t'].min(), df['t'].max()
    summary.append({
        'file': p.name,
        'rows': n,
        'missing_value_cells': miss,
        't_min': tmin,
        't_max': tmax,
    })

# Passing the columns explicitly keeps the schema even when no CSVs were found;
# without it an empty frame has no 'file' column and sort_values raises KeyError.
summary_df = pd.DataFrame(summary, columns=summary_columns).sort_values('file')
summary_df.head(20)

In [None]:
# Bare expression: shows the full summary table as this cell's output
# (the cell that builds it only previews the first 20 rows).
summary_df