
# Data Dictionary and Provenance Appendix

This notebook documents inputs, transformations, and outputs for the Treasury inflation basis analysis suite. It produces a markdown appendix (`reports/appendix_data_dictionary.md`) and a provenance YAML file (`exports/analysis_artifacts.yml`) containing SHA256 hashes for reproducibility.



## Imports and helper functions


In [None]:

import hashlib
import pathlib
from typing import Dict

import pandas as pd

# Files written by this notebook itself; both are also tracked in OUTPUT_PATHS.
OUTPUT_MD = pathlib.Path('reports/appendix_data_dictionary.md')
OUTPUT_YAML = pathlib.Path('exports/analysis_artifacts.yml')

# Input datasets summarised in the "Inputs" section of the appendix.
INPUT_PATHS = [
    pathlib.Path(name)
    for name in (
        'data/trace_microstructure_event_panels.csv',
        'data/tenor_liq.csv',
        '_output/strategy3/state_estimates.csv',
        'data/policy/treasury_buybacks_refunding.csv',
        'data/val/bei_ils_wedge_by_tenor.csv',
        '_output/strategy3/variance_decomposition.csv',
        '_output/strategy3/halflife_summary.csv',
    )
]

# Expected analysis artifacts whose presence and hashes are reported.
OUTPUT_PATHS = [
    pathlib.Path(name)
    for name in (
        'reports/microstructure_concentration_results.html',
        'reports/microstructure_concentration_results.csv',
        'reports/policy_intervention_state_space.html',
        'reports/policy_intervention_coeffs.csv',
        'reports/val_wedge_linkage.html',
        'reports/val_wedge_linkage.csv',
        'reports/forecast_comparison.html',
        'reports/forecast_rmsfe.csv',
        'reports/event_irfs_daily.html',
        'reports/event_irfs_daily.csv',
        'reports/strategy3_state_space.html',
        'tables/state_space_variance.csv',
    )
] + [OUTPUT_MD, OUTPUT_YAML]

# Make sure the destination directories exist before anything is written.
OUTPUT_MD.parent.mkdir(parents=True, exist_ok=True)
OUTPUT_YAML.parent.mkdir(parents=True, exist_ok=True)


def sha256(path: pathlib.Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    A path that does not exist yields an empty string instead of raising.
    The file is consumed in 8192-byte reads rather than loaded in one go.
    """
    if path.exists():
        digest = hashlib.sha256()
        with path.open('rb') as stream:
            while True:
                block = stream.read(8192)
                if not block:
                    # An empty read marks end of file.
                    break
                digest.update(block)
        return digest.hexdigest()
    return ''


def summarize_csv(path: pathlib.Path) -> Dict:
    """Summarise a CSV file for the data dictionary.

    Args:
        path: Location of the CSV file to inspect.

    Returns:
        A dict with the keys ``path`` (the path as a string), ``exists``
        (whether the file was found), ``columns`` (list of column names) and
        ``rows`` (number of data rows). A missing file, or a file that exists
        but contains nothing to parse, reports no columns and zero rows.
    """
    summary = {'path': str(path), 'exists': False, 'columns': [], 'rows': 0}
    if not path.exists():
        return summary
    summary['exists'] = True
    try:
        df = pd.read_csv(path)
    except pd.errors.EmptyDataError:
        # A file with no header row (e.g. zero bytes) cannot be parsed; report
        # it as present but empty instead of aborting the whole appendix.
        return summary
    summary['columns'] = df.columns.tolist()
    summary['rows'] = len(df)
    return summary



## Input dataset overview


In [None]:

# Summarise each input dataset, then tabulate the summaries for display.
input_summaries = list(map(summarize_csv, INPUT_PATHS))
inputs_df = pd.DataFrame(input_summaries)
inputs_df



## Output file status

This section lists expected analysis artifacts. Files not yet generated will have empty hashes and `exists = False` until the notebooks are executed end-to-end.


In [None]:

# One record per expected artifact: its path, whether it exists right now, and
# its SHA-256 digest (empty string when the file is absent).
# NOTE: OUTPUT_PATHS also lists the appendix markdown and provenance YAML that
# this notebook writes further down, so their entries here describe whatever
# was on disk before this run rewrites them.
output_records = [
    {
        'path': str(artifact),
        'exists': artifact.exists(),
        'sha256': sha256(artifact) if artifact.exists() else '',
    }
    for artifact in OUTPUT_PATHS
]
outputs_df = pd.DataFrame(output_records)
outputs_df



## Write markdown appendix


In [None]:

# Assemble the appendix as a list of markdown lines: one bullet per input
# dataset (status, row count, column names), then one bullet per expected output.
lines = ['# Appendix: Data Dictionary', '', '## Inputs']
for record in input_summaries:
    status = 'available' if record['exists'] else 'missing'
    # Datasets without readable columns are shown as "n/a".
    cols = ', '.join(record['columns']) if record['columns'] else 'n/a'
    lines.append(f"- **{record['path']}** ({status}, rows={record['rows']}): columns = {cols}")
lines.append('')
lines.append('## Outputs')
for record in output_records:
    status = 'available' if record['exists'] else 'pending'
    lines.append(f"- **{record['path']}** ({status})")
# Write as UTF-8 explicitly: column names come from the CSV headers and may be
# non-ASCII, and the default encoding otherwise depends on the platform locale.
OUTPUT_MD.write_text('\n'.join(lines), encoding='utf-8')
OUTPUT_MD



## Write provenance YAML


In [None]:

def format_yaml_list(items, indent=0):
    """Render records as a YAML block sequence of path/exists/sha256 mappings.

    Args:
        items: Iterable of dicts. Each must provide ``path`` and ``exists``;
            ``sha256`` is optional.
        indent: Number of two-space indentation levels applied to every line.

    Returns:
        The YAML text with lines joined by newlines and no trailing newline;
        an empty string when *items* is empty.

    A record that already carries a ``sha256`` key is trusted as-is, so an
    empty value is emitted as ``null``. The digest is computed from disk only
    when the key is absent. Re-hashing on an empty value would let a file
    created after the record was built appear as ``exists: false`` together
    with a real hash.

    Values are written unquoted, so paths must not contain characters that are
    significant in YAML (for example ``: `` or `` #``).
    """
    spaces = '  ' * indent
    lines = []
    for item in items:
        lines.append(f"{spaces}- path: {item['path']}")
        lines.append(f"{spaces}  exists: {str(item['exists']).lower()}")
        if 'sha256' in item:
            hash_val = item['sha256']
        else:
            hash_val = sha256(pathlib.Path(item['path']))
        if hash_val:
            lines.append(f"{spaces}  sha256: {hash_val}")
        else:
            lines.append(f"{spaces}  sha256: null")
    return '\n'.join(lines)

# Serialise both record lists and combine them under top-level
# `inputs:` / `outputs:` keys.
input_yaml = format_yaml_list(input_summaries)
output_yaml = format_yaml_list(output_records)
yaml_text = f"inputs:\n{input_yaml}\noutputs:\n{output_yaml}\n"
# Write as UTF-8 explicitly so the result does not depend on the platform's
# locale encoding.
OUTPUT_YAML.write_text(yaml_text, encoding='utf-8')
OUTPUT_YAML



## Interpretation

The appendix confirms all datasets reside within the repository and provides a checklist for verifying generated artifacts. Missing hashes (written as `null` in the provenance YAML) indicate steps that require notebook execution before dissemination.
