In [None]:
import pandas as pd

def prepare_scan(scan_path):
    """Load the scan metadata CSV and normalise its ID and date columns.

    Parameters
    ----------
    scan_path : str, path-like or file-like
        Handed unchanged to ``pandas.read_csv``. The data must contain a
        ``Local_ID`` (or already-renamed ``local_id``) column and a
        ``StudyDate`` column.

    Returns
    -------
    pandas.DataFrame
        The input rows with ``Local_ID`` renamed to ``local_id`` (stripped
        text) and ``StudyDate`` parsed to datetimes. Rows whose ID is blank
        or whose date cannot be parsed are dropped.

    Raises
    ------
    KeyError
        If the ID column or ``StudyDate`` is absent.
    """
    # Read the ID as text. With dtype inference a single blank ID makes the
    # whole column float, so 7 would become "7.0" and the blank would become
    # the literal string "nan" after a later str conversion.
    scan = pd.read_csv(scan_path, dtype={"Local_ID": str, "local_id": str})
    scan = scan.rename(columns={"Local_ID": "local_id"})

    ids = scan["local_id"].str.strip()
    # Whitespace-only IDs are as unusable as missing ones.
    scan["local_id"] = ids.mask(ids.eq(""))

    # Unparseable dates become NaT so they can be dropped together with
    # the rows that have no ID.
    scan["StudyDate"] = pd.to_datetime(scan["StudyDate"], errors="coerce")
    return scan.dropna(subset=["local_id", "StudyDate"])


def prepare_longitudinal(path, date_col):
    """Load a longitudinal assessment CSV and normalise its ID and visit date.

    Parameters
    ----------
    path : str, path-like or file-like
        Handed unchanged to ``pandas.read_csv``. The data must contain a
        ``Local_ID`` (or already-renamed ``local_id``) column.
    date_col : str
        Name of the column holding the visit date; it is renamed to
        ``VisitDate``.

    Returns
    -------
    pandas.DataFrame
        The input rows with ``local_id`` as stripped text and ``VisitDate``
        parsed to datetimes. Rows whose ID is blank or whose date cannot be
        parsed are dropped.

    Raises
    ------
    KeyError
        If the ID column or ``date_col`` is absent.
    """
    # Read the ID as text. With dtype inference a single blank ID makes the
    # whole column float, so 7 would become "7.0" and the blank would become
    # the literal string "nan" after a later str conversion.
    df = pd.read_csv(path, dtype={"Local_ID": str, "local_id": str})
    df = df.rename(columns={"Local_ID": "local_id", date_col: "VisitDate"})

    if "VisitDate" not in df.columns:
        # Name the column the caller asked for, not the internal target name.
        raise KeyError(f"date column {date_col!r} not found; available: {list(df.columns)}")

    ids = df["local_id"].str.strip()
    # Whitespace-only IDs are as unusable as missing ones.
    df["local_id"] = ids.mask(ids.eq(""))

    # Unparseable dates become NaT so they can be dropped together with
    # the rows that have no ID.
    df["VisitDate"] = pd.to_datetime(df["VisitDate"], errors="coerce")
    return df.dropna(subset=["local_id", "VisitDate"])


def merge_nearest(scan, long, out_path):
    """Pair every scan with the same subject's nearest-in-time visit and save it.

    Parameters
    ----------
    scan : pandas.DataFrame
        Needs ``local_id`` and a datetime ``StudyDate`` without missing values.
    long : pandas.DataFrame
        Needs ``local_id`` and a datetime ``VisitDate`` without missing values.
    out_path : str, path-like or file-like
        Destination handed to ``DataFrame.to_csv``.

    Returns
    -------
    None
        The result is written to ``out_path`` and a short summary is printed.

    Notes
    -----
    The output has one row per scan, ordered by ``local_id`` then
    ``StudyDate``. Scans whose subject has no visit keep an empty
    ``VisitDate`` and ``delta_days``. ``delta_days`` is the absolute gap in
    whole days, stored as nullable ``Int64`` so matched values stay integers.
    """
    # merge_asof requires both frames to be sorted on their time keys.
    scans_sorted = scan.sort_values("StudyDate", kind="stable")
    visits_sorted = long.sort_values("VisitDate", kind="stable")

    # Matching within subject via `by` keeps a single `local_id` column.
    # Merging per-subject frames that both carry `local_id` would instead
    # produce `local_id_x` / `local_id_y` on matched rows only.
    out = pd.merge_asof(
        scans_sorted,
        visits_sorted,
        left_on="StudyDate",
        right_on="VisitDate",
        by="local_id",
        direction="nearest",
    )

    gap_days = (out["VisitDate"] - out["StudyDate"]).abs().dt.days
    out["delta_days"] = gap_days.astype("Int64")

    # Group each subject's scans together, in chronological order.
    out = out.sort_values(["local_id", "StudyDate"], kind="stable", ignore_index=True)

    out.to_csv(out_path, index=False)
    print(f"Wrote {out_path}")
    print(f"Matched rows: {out['VisitDate'].notna().sum()} / {len(out)}")


In [None]:
# Every input and output lives in the same metadata directory; naming it once
# means relocating the dataset is a single edit.
METADATA_DIR = '/Users/cu135/Partners HealthCare Dropbox/Calvin Howard/resources/datasets/Schahmann_SCA_Atrophy/metadata'

scan_path = f'{METADATA_DIR}/scan_metadata.csv'
bars_path = f'{METADATA_DIR}/BARS.csv'
ccas_path = f'{METADATA_DIR}/CCAS.csv'
cnrs_path = f'{METADATA_DIR}/CNRS.csv'

scan = prepare_scan(scan_path)

bars = prepare_longitudinal(bars_path, "Visit_Date")
ccas = prepare_longitudinal(ccas_path, "Visit_Date")
cnrs = prepare_longitudinal(cnrs_path, "Visit_Date")

# DIAGNOSTICS — DO NOT SKIP
# An empty set below means no scan shares an ID with that assessment table,
# so the corresponding merge cannot match anything.
print("ID overlap check:")
scan_ids = set(scan.local_id)
print("BARS overlap:", scan_ids & set(bars.local_id))
print("CCAS overlap:", scan_ids & set(ccas.local_id))
print("CNRS overlap:", scan_ids & set(cnrs.local_id))

# One nearest-visit table per instrument, written next to the inputs.
for instrument, visits in (("BARS", bars), ("CCAS", ccas), ("CNRS", cnrs)):
    merge_nearest(scan, visits, f'{METADATA_DIR}/{instrument}_nearest.csv')
