In [3]:
import pandas as pd
from pathlib import Path

RAW = "../data/raw/records_2022.csv"
OUT_CSV = "../data/processed/records_2022_clean.csv"
OUT_JSON = "../data/processed/records_2022_clean.json"

# Free-text columns that receive whitespace and case normalization.
TEXT_COLUMNS = ["category", "unit", "source", "status"]

# Canonical spelling for each known category. Lookups happen after the value
# has been stripped and lower-cased, so padded or upper-case variants never
# reach this table and do not need their own keys.
category_map = {
    "imaging": "imaging",
    "billing": "billing",
    "follow_up": "follow-up",
    "follow-up": "follow-up",
}


def _normalize_text(series):
    """Return *series* stripped and lower-cased, with blank cells marked missing."""
    # Record missing cells before casting: astype(str) would otherwise turn
    # None / pd.NA into literal text ("None", "<NA>") that survives cleaning.
    missing = series.isna()
    text = series.astype(str).str.strip().str.lower()
    # The literal text "nan" is still treated as missing, as it was before.
    missing = missing | text.isin(["", "nan"])
    return text.mask(missing, pd.NA)


def clean_records(df):
    """Return a cleaned copy of a raw records frame.

    The text columns listed in ``TEXT_COLUMNS`` are stripped and lower-cased,
    ``category`` is mapped through ``category_map``, ``date`` is parsed to
    datetimes and ``value`` to numbers. Values that cannot be parsed become
    missing rather than raising.

    Args:
        df: Frame containing at least the ``TEXT_COLUMNS`` plus ``date`` and
            ``value``. It is not modified.

    Returns:
        A new DataFrame with the same columns as ``df``.

    Raises:
        KeyError: If any required column is absent; the message lists all of
            the absent columns.
    """
    required = [*TEXT_COLUMNS, "date", "value"]
    absent = [name for name in required if name not in df.columns]
    if absent:
        raise KeyError(f"input is missing required column(s): {absent}")

    cleaned = df.copy()

    for name in TEXT_COLUMNS:
        cleaned[name] = _normalize_text(cleaned[name])

    cleaned["category"] = cleaned["category"].replace(category_map)

    # errors="coerce" turns unparseable cells into NaT / NaN instead of raising.
    cleaned["date"] = pd.to_datetime(cleaned["date"], errors="coerce")
    cleaned["value"] = pd.to_numeric(cleaned["value"], errors="coerce")

    # Kept from the original pipeline: replace remaining None markers with
    # pd.NA. NOTE(review): the effect on columns outside TEXT_COLUMNS depends
    # on the pandas version - confirm if those columns matter downstream.
    return cleaned.replace({None: pd.NA})


# Guarded so the functions above can be imported and tested without touching
# the filesystem. When run as a script or as a notebook cell, __name__ is
# "__main__", so the pipeline still executes and `df` is still a global.
if __name__ == "__main__":
    # Derive the output directories from the output paths themselves so the
    # two cannot drift apart.
    for target in (OUT_CSV, OUT_JSON):
        Path(target).parent.mkdir(parents=True, exist_ok=True)

    df = clean_records(pd.read_csv(RAW))

    # Save outputs
    df.to_csv(OUT_CSV, index=False)
    # Write dates as ISO-8601 text so the JSON agrees with the CSV; relying on
    # the to_json default would emit epoch milliseconds instead.
    df.to_json(OUT_JSON, orient="records", indent=2, date_format="iso")

    print("Cleaning complete")


Cleaning complete
