# Feature Module: Procedure Features (ICD Codes)


## Overview
This module processes the raw ICD procedure codes to characterize the procedural intensity and complexity of a patient's hospital stay. It aggregates complex, transaction-level data into admission-level summary metrics that are highly predictive of clinical risk.

## Key Features
1.  **Complexity Metrics**
    * Calculates the **Total Number of Procedures** (volume of interventions).
    * Calculates the **Number of Unique Procedures** (proxy for clinical complexity).
2.  **High-Risk Flagging**
    * Scans all procedure codes against a curated list of high-risk interventions (e.g., **Mechanical Ventilation**, **Extracorporeal Circulation**, bowel resections).
    * Creates a binary flag (`has_high_risk_procedure`) used by the model to identify patients requiring intensive support.
3.  **Cohort Aggregation**
    * Outputs a feature table (`procedures_feat.csv`) with one row per hospital admission (`hadm_id`), ready for merging.

**Input**: `procedures_icd.csv`
**Output**: `procedures_feat.csv`

In [None]:
import os
import pandas as pd

In [None]:
# Configuration
#
# Paths to the raw input and the generated feature table, plus the list of
# ICD-9 procedure codes that are treated as "high risk".

import os
import pandas as pd

BASE_DIR = r"D:\School\5141"

# Raw MIMIC-IV procedures file.
# The CSV lives inside a folder that carries the same name as the file,
# hence the repeated path component.
PROC_PATH   = os.path.join(BASE_DIR, "procedures_icd.csv", "procedures_icd.csv")

# Output feature file
OUTPUT_PATH = os.path.join(BASE_DIR, "procedures_feat.csv")

# Small subset of high-risk surgical codes (ICD-9 procedure codes).
#
# The codes are stored WITHOUT the decimal point because that is how the
# `icd_code` column of procedures_icd stores them (e.g. "9671", not "96.71").
# Dotted codes would never match and the high-risk flag would always be 0.
#
# NOTE(review): 45.13 / 45.16 look like ICD-9-CM endoscopy codes rather than
# bowel resections (those are usually 45.6x-45.8x) - confirm the intended codes.
HIGH_RISK_PROC_CODES = [
    "4513", "4516",  # example bowel resections (see review note above)
    "3961",          # extracorporeal circulation
    "9604",          # insertion of endotracheal tube
    "9671", "9672",  # continuous mechanical ventilation
]

In [None]:
# Load function
def load_procedures(path: str) -> pd.DataFrame:
    """
    Load procedures_icd and standardize key columns.

    The ICD code column is located by name (first match among a list of
    known candidates), read as text, renamed to 'icd_code' and stripped of
    surrounding whitespace. Rows without a code are dropped.

    Parameters:
    path : str
        Full path to procedures_icd.csv

    Returns:
    pd.DataFrame
        DataFrame with exactly the columns ['hadm_id', 'icd_code'];
        'hadm_id' is a nullable integer (Int64), 'icd_code' is text.

    Raises:
    ValueError
        If no ICD code column or no 'hadm_id' column is present.
    """
    # Candidate names for the ICD code column, in priority order
    possible_code_cols = ["icd_code", "icd9_code", "icd10_code", "icd_code_seq"]

    # Force the code columns to text: if a file only contains numeric ICD-9
    # codes, type inference would parse them as integers and silently drop
    # leading zeros ("0066" -> 66). Candidates absent from the file are ignored.
    df = pd.read_csv(
        path,
        low_memory=False,
        dtype={col: str for col in possible_code_cols},
    )
    print("Columns:", df.columns.tolist())

    # Try to find the ICD code column
    code_col = next((col for col in possible_code_cols if col in df.columns), None)

    if code_col is None:
        raise ValueError(
            f"Could not find an ICD code column in procedures_icd. "
            f"Columns present: {df.columns.tolist()}"
        )

    if "hadm_id" not in df.columns:
        raise ValueError("Expected 'hadm_id' in procedures_icd but it was not found.")

    # Keep only what we need, under one canonical column name so that no
    # duplicate of the original code column is left behind.
    df = df[["hadm_id", code_col]].rename(columns={code_col: "icd_code"})

    # A missing code would otherwise be turned into the literal string "nan"
    # and be counted as a real procedure downstream.
    df = df.dropna(subset=["icd_code"]).copy()

    # Standardize types
    df["hadm_id"] = df["hadm_id"].astype("Int64")
    df["icd_code"] = df["icd_code"].astype(str).str.strip()

    print("Procedures (cleaned) shape:", df.shape)
    return df



In [None]:
def build_procedure_features(df: pd.DataFrame, high_risk_codes=None) -> pd.DataFrame:
    """
    Build aggregate procedure features per hadm_id.

    Features:
        - num_procedures: number of procedure rows for the admission
        - num_unique_procedures: number of distinct codes for the admission
        - has_any_procedure: 1 for every row of the result (admissions with
          no procedure rows do not appear in `df`, so they are absent here)
        - has_high_risk_procedure: 1 if any code of the admission is in the
          high-risk list, else 0

    Parameters:
    df : pd.DataFrame
        One row per procedure per admission, with columns:
        ['hadm_id', 'icd_code']
    high_risk_codes : iterable of str, optional
        Codes considered high risk. Defaults to the module-level
        HIGH_RISK_PROC_CODES. Codes may be written with or without the
        decimal point ("96.71" or "9671").

    Returns:
    pd.DataFrame
        One row per hadm_id with aggregate features.
    """
    if high_risk_codes is None:
        high_risk_codes = HIGH_RISK_PROC_CODES

    # Compare codes in a dot-free form on both sides: the reference list may
    # be written as "96.71" while the data stores "9671". Without this the
    # flag would never fire.
    high_risk_set = {
        str(code).strip().replace(".", "") for code in high_risk_codes
    }
    normalized_codes = (
        df["icd_code"].astype(str).str.strip().str.replace(".", "", regex=False)
    )

    codes_by_admission = df.groupby("hadm_id")["icd_code"]

    # Total number of procedure rows per admission
    total_count = codes_by_admission.size().rename("num_procedures")

    # Number of unique procedure codes per admission
    unique_count = codes_by_admission.nunique().rename("num_unique_procedures")

    # Has any procedure at all?
    has_any = (total_count > 0).astype(int).rename("has_any_procedure")

    # Any high-risk procedure code?
    high_flag = (
        normalized_codes.isin(high_risk_set)
        .groupby(df["hadm_id"])
        .any()
        .astype(int)
        .rename("has_high_risk_procedure")
    )

    # Combine all features into one DataFrame
    feat = pd.concat([total_count, unique_count, has_any, high_flag], axis=1).reset_index()

    print("Built procedure features. Shape:", feat.shape)
    return feat


def save_features(df: pd.DataFrame, path: str):
    """
    Write the procedure feature table to disk as a CSV file.

    The DataFrame index is not written; only the columns are exported.
    A confirmation line with the destination is printed afterwards.

    Parameters:
    df : pd.DataFrame
        Procedure features with one row per hadm_id.
    path : str
        Destination of the CSV file.
    """
    # index=False: the index carries no information, hadm_id is a column
    df.to_csv(path_or_buf=path, index=False)

    confirmation = f"Saved procedure features to: {path}"
    print(confirmation)


Loading procedures from: D:\School\5141\procedures_icd.csv\procedures_icd.csv
Raw procedures shape: (859655, 6)
Columns: ['subject_id', 'hadm_id', 'seq_num', 'chartdate', 'icd_code', 'icd_version']
Procedures (cleaned) shape: (859655, 2)
Built procedure features. Shape: (287504, 5)
Saved procedure features to: D:\School\5141\procedures_feat.csv


In [None]:
# Execute
# Runs the full pipeline: load the raw procedures file, aggregate it to one
# row per hadm_id, and write the feature table to OUTPUT_PATH.
# `procs` and `proc_feat` are assigned at module level, so they stay
# available for inspection after this cell has run.
if __name__ == "__main__":
    procs = load_procedures(PROC_PATH)
    proc_feat = build_procedure_features(procs)
    save_features(proc_feat, OUTPUT_PATH)
