## Master Analytic Dataset Builder

**Description:**  
    This script consolidates all engineered features from the entire pipeline   
    into the single, comprehensive, row-per-admission analytic dataset   
    (FINAL_HAPI_ANALYTIC.csv) required for HAPI model training and evaluation.  

**Key Logic:**  
    - Key ID: hadm_id (hospital admission ID)  
    - Join Type: Left merge on hadm_id, anchoring the merge to the HAPI labels file.  
    - Integrity: Includes error handling for missing feature files and checks for duplicate hadm_id values in the final output.  

**Inputs:** 
    - hospitalwide_hapi_labels.csv (The primary cohort anchor)  
    - All individual feature files listed in the FEATURE_FILES configuration list (most follow the *_feat.csv naming pattern; icu_transfer.csv and feat_notes.csv do not).  

**Output:**
    - FINAL_HAPI_ANALYTIC.csv (The final merged dataset)

In [20]:
import os
from functools import reduce

import numpy as np
import pandas as pd

In [21]:
# Configuration
# Directory that holds the input CSVs and receives the merged output file.
BASE_DIR = r"D:\School\5141"        # <-- adjust if needed
# File name of the merged dataset; it is written inside BASE_DIR.
OUTPUT_NAME = "FINAL_HAPI_ANALYTIC.csv"

# List of feature files to merge (file names are resolved against BASE_DIR).
# Comment out any that you don't have yet.
FEATURE_FILES = [
    "demographics_feat.csv",
    "los_feat.csv",
    "icu_transfer.csv",
    "icu_procedures_feat.csv",
    "io_feat.csv",
    "procedures_feat.csv",
    "vitals_feat.csv",
    "labs_feat.csv",
    "hcpcs_feat.csv",
    "emar_feat.csv",
    "poe_feat.csv",
    "poe_detail_feat.csv",
    "prescriptions_feat.csv",
    "pharmacy_feat.csv",
    "feat_notes.csv",               # NLP features / Braden-like risk
    "diagnoses_feat.csv",           # ICD-based risk factors
    "medications_master_feat.csv",  # merged EMAR/Presc/Pharm/POE
    "hospitalwide_hapi_labels.csv"  # target labels (HAPI_STRUCTURED / UNSTRUCTURED / FINAL)
]



In [22]:

def full_path(fname: str):
    """Return the location of `fname` inside the configured BASE_DIR."""
    location = os.path.join(BASE_DIR, fname)
    return location


def load_feature_table(fname: str):
    """
    Load a single feature table and prepare it for merging on hadm_id.

    The 'hadm_id' column is cast to pandas' nullable Int64 dtype so that
    every table joins on the same key type, and exact duplicate rows are
    dropped.

    Args:
        fname: File name of the CSV, resolved against BASE_DIR.

    Returns:
        DataFrame if loaded successfully, else None. None means the file
        does not exist; a notice is printed so the skip shows up in the
        run log.

    Raises:
        ValueError: If the file has no 'hadm_id' column.
    """
    path = full_path(fname)

    if not os.path.exists(path):
        # Missing inputs are tolerated (not every feature file may exist yet),
        # but report them: a silent skip makes absent features easy to miss.
        print(f"   · Skipping {fname}: file not found at {path}")
        return None

    df = pd.read_csv(path, low_memory=False)

    if "hadm_id" not in df.columns:
        raise ValueError(f"{fname} does not contain 'hadm_id' column. Columns: {list(df.columns)}")

    # Ensure consistent nullable integer type for joins
    df["hadm_id"] = df["hadm_id"].astype("Int64")

    # Drop exact duplicate rows
    before = len(df)
    df = df.drop_duplicates()
    dropped = before - len(df)
    if dropped:
        print(f"   · Dropped {dropped} duplicate rows in {fname}")

    # Rows that still share a hadm_id (but differ elsewhere) would multiply
    # rows when this table is left-joined, so flag them for the user.
    repeated = int(df["hadm_id"].duplicated().sum())
    if repeated:
        print(f"   · Warning: {fname} has {repeated} rows with a repeated hadm_id; "
              "merging will duplicate those admissions")
    return df


def merge_on_hadm_id(dfs: list[pd.DataFrame]):
    """
    Fold a sequence of tables into one by successive left joins on hadm_id.

    The first table in `dfs` is the left side of the first join and the
    running result is the left side of every later one, so the first
    table's hadm_id values decide which admissions appear in the output.
    Every table is expected to carry a 'hadm_id' column.
    """
    return reduce(
        lambda accumulated, incoming: pd.merge(
            accumulated, incoming, how="left", on="hadm_id"
        ),
        dfs,
    )



In [None]:
# Execute
def main():
    """
    Build the row-per-admission analytic dataset and write it to BASE_DIR.

    Steps:
        1. Load every file in FEATURE_FILES that exists on disk.
        2. Move the HAPI labels table to the front so it is the left-join
           anchor (the cohort is defined by the labels).
        3. Left-merge all tables on hadm_id.
        4. Report duplicate hadm_id values in the merged result.
        5. Save the result as OUTPUT_NAME inside BASE_DIR.

    Raises:
        RuntimeError: If none of the files in FEATURE_FILES could be loaded.
    """

    def _is_labels(table: pd.DataFrame) -> bool:
        # The labels table is recognised by its target columns.
        return "HAPI_FINAL" in table.columns or "has_HAPI" in table.columns

    dfs: list[pd.DataFrame] = []

    # Load each feature file; files that are missing come back as None
    for fname in FEATURE_FILES:
        df = load_feature_table(fname)
        if df is not None:
            dfs.append(df)

    if not dfs:
        raise RuntimeError("No feature tables were loaded. "
                           "Check BASE_DIR and FEATURE_FILES list.")

    # Put the labels table FIRST: in a chain of left joins the first table
    # decides which admissions survive, and the cohort must come from the
    # labels. sorted() is stable, so the other tables keep their listed order.
    dfs_sorted = sorted(dfs, key=lambda d: not _is_labels(d))

    if not _is_labels(dfs_sorted[0]):
        print(" Warning: no HAPI labels table was loaded; "
              "the merge is anchored to the first available feature table instead.")

    # Merge everything on hadm_id
    final = merge_on_hadm_id(dfs_sorted)

    # Integrity check: the output is meant to hold one row per admission
    duplicate_rows = int(final["hadm_id"].duplicated().sum())
    if duplicate_rows:
        print(f" Warning: merged dataset has {duplicate_rows} rows with a repeated hadm_id; "
              "at least one input table is not unique per admission.")

    # Save final dataset
    out_path = full_path(OUTPUT_NAME)
    print(f"\n Saving merged analytic dataset to:\n   {out_path}")
    final.to_csv(out_path, index=False)
    print(f" Done. Final shape: {final.shape}")


# Run the build when this file is executed directly rather than imported.
if __name__ == "__main__":
    main()


 Saving merged analytic dataset to:
   D:\School\5141\FINAL_HAPI_ANALYTIC.csv
