# 01 Data Ingestion and EDA

This notebook implements the MVP 1.2 requirements: a data overview, univariate distribution checks, treatment–control covariate balance diagnostics, and a correlation scan.

## Section 0: Setup

In [None]:
import io
import os
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import yaml
from IPython.display import Markdown, display

from src.data_utils import load_and_clean

# Shared plotting defaults: one figure size, font size, and colour palette
# for every chart in this notebook, so figures stay visually consistent
# and are easier for reviewers to compare.
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 13,
})
sns.set_palette('Set2')

# Accept either YAML extension; the first candidate that exists on disk wins.
config_candidates = [Path('configs/config.yaml'), Path('configs/config.yml')]
config_path = next(filter(Path.exists, config_candidates), None)
if config_path is None:
    raise FileNotFoundError('No config file found in configs/config.yaml or configs/config.yml')

# Parse the configuration with the safe YAML loader.
with config_path.open('r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Pull the input/output locations out of the `paths` section of the config.
paths_cfg = config['paths']
raw_data_path = paths_cfg['raw_data']
cleaned_data_path = paths_cfg['cleaned_data']
raw_text_path = paths_cfg['raw_text_data']

# Make sure the figure output directory exists before any plot is saved.
figures_dir = Path(paths_cfg['figures_dir'])
figures_dir.mkdir(parents=True, exist_ok=True)

# Load the cleaned dataset. To overwrite an existing output file, add
# `overwrite=True` to the call below.
df = load_and_clean(
    filepath=raw_data_path,
    output_path=cleaned_data_path,
    raw_text_path=raw_text_path,
)
print(f'Data loaded successfully: shape={df.shape}')
