# ICD Correlation & Risk Factor Discovery Script

## Description
This script performs a statistical correlation analysis on the entire MIMIC-IV diagnosis dataset (`diagnoses_icd.csv`) to discover intrinsic patient comorbidities associated with Hospital-Acquired Pressure Injuries (HAPI).

## Clinical Justification for HAPI Research
To predict pressure injuries accurately, we must identify underlying patient conditions that increase susceptibility. This script identifies risk factors aligned with the Braden Scale conceptual framework:
* **Immobility:** Codes for "Paraplegia" or "Spinal Cord Injury" (Braden Mobility/Activity).
* **Perfusion:** Codes for "Septic Shock" or "Hypotension" (Tissue Tolerance).
* **Nutrition:** Codes for "Malnutrition" or "Weight Loss" (Tissue Tolerance).

**Methodology Note:** This script explicitly *excludes* pressure ulcer diagnosis codes from the feature set during analysis. This prevents "Target Leakage" (using the outcome to predict the outcome) and ensures we find *predictors* rather than labels.

## Inputs & Outputs
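* **Input:** `diagnoses_icd.csv` (all diagnosis codes, one row per admission and code)
* **Input:** `hospitalwide_hapi_labels.csv` (ground-truth targets: `hadm_id` and `HAPI_FINAL`)
* **Input (optional):** `d_icd_diagnoses.csv` (ICD dictionary, used only to add human-readable titles to the printed summary)
* **Output:** `diagnoses_feat.csv` (one row per admission: `hadm_id` plus a 0/1 indicator column for each of the top 20 most correlated diagnosis codes)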

In [None]:
import pandas as pd
import os


In [None]:
# Configuration
# Root folder that holds the MIMIC-IV extracts. Set the HAPI_BASE_DIR
# environment variable to run on another machine; when it is unset the
# original location is used, so existing setups keep working unchanged.
BASE_DIR = os.environ.get("HAPI_BASE_DIR", r"D:\School\5141")

# Input Paths
# The diagnoses and dictionary CSVs each sit inside a folder that carries the
# same name as the file, hence the repeated path component.
DIAGNOSES_PATH = os.path.join(BASE_DIR, "diagnoses_icd.csv", "diagnoses_icd.csv")
LABELS_PATH = os.path.join(BASE_DIR, "hospitalwide_hapi_labels.csv")
DICTIONARY_PATH = os.path.join(BASE_DIR, "d_icd_diagnoses.csv", "d_icd_diagnoses.csv")

# Output Path
OUTPUT_PATH = os.path.join(BASE_DIR, "diagnoses_feat.csv")

# Analysis Parameters
MIN_ADMISSIONS = 500  # Minimum frequency a code needs to be analysed (rarer codes are dropped)
TOP_N = 20            # Number of top risk factors to keep

In [None]:
# Clinical codes for Pressure Ulcers list
# These codes represent the outcome (Pressure Ulcer). They have to be removed from
# the feature set, otherwise the model will "cheat" by seeing the answer.
#
# The ICD-10 list is generated from (site prefix) x (stage suffix) instead of
# being typed out, so a stage cannot be forgotten for a single site.

# L89 sub-categories that take a final stage character, written without the dot
# (e.g. "L8915" + "3" -> "L89153").
_L89_SITE_PREFIXES = [
    "L8900", "L8901", "L8902",                             # L89.0x elbow
    "L8910", "L8911", "L8912", "L8913", "L8914", "L8915",  # L89.1x back
    "L8920", "L8921", "L8922",                             # L89.2x hip
    "L8930", "L8931", "L8932",                             # L89.3x buttock
    "L8950", "L8951", "L8952",                             # L89.5x ankle
    "L8960", "L8961", "L8962",                             # L89.6x heel
    "L8981", "L8989",                                      # L89.8x head / other site
]
# 0 = unstageable, 1-4 = stage, 9 = unspecified stage, and 6 = pressure-induced
# deep tissue damage. The "6" codes were missing from the hand-typed list and
# would otherwise slip through the leakage filter.
_L89_SITE_STAGES = "0123469"

# L89.4x (contiguous site) and L89.9x (unspecified site) have no laterality
# character; their stage digit runs 0-6 (6 = deep tissue damage, also added).
_L89_SHORT_PREFIXES = ["L894", "L899"]
_L89_SHORT_STAGES = "0123456"

icd10_pu_codes = [
    prefix + stage
    for prefix in _L89_SITE_PREFIXES
    for stage in _L89_SITE_STAGES
] + [
    prefix + stage
    for prefix in _L89_SHORT_PREFIXES
    for stage in _L89_SHORT_STAGES
]

# ICD-9 707.0x (pressure ulcer by site) and 707.2x (pressure ulcer stage).
icd9_pu_codes = [
    "7070", "70700", "70701", "70702", "70703", "70704",
    "70705", "70706", "70707", "70709",
    "70720", "70721", "70722", "70723", "70724", "70725",
]

# Combine into a set for fast lookup O(1)
PRESSURE_ULCER_CODES = set(icd10_pu_codes + icd9_pu_codes)

In [None]:
# Loading Functions
def load_data(path=None):
    """
    Loads the full ICD diagnosis dataset from the raw CSV file.

    Operations:
    1. Reads the diagnoses CSV, forcing icd_code to be parsed as text so that
       all-numeric ICD-9 codes keep their leading zeros ("0389" must not become 389).
    2. Standardizes icd_code by converting to uppercase and stripping whitespace.
       Missing codes stay missing instead of turning into the literal string "NAN".
    3. Casts hadm_id (when present) to Int64 to ensure consistent merging with labels later.

    Arguments:
        path (str | None): CSV file to read. Defaults to DIAGNOSES_PATH.

    Returns:
        pd.DataFrame: A dataframe containing all patient diagnosis records.
    """
    if path is None:
        path = DIAGNOSES_PATH

    # dtype is pinned at read time: once pandas has inferred an integer column
    # the leading zeros are already gone and astype(str) cannot restore them.
    df = pd.read_csv(path, dtype={"icd_code": str}, low_memory=False)
    df["icd_code"] = df["icd_code"].str.upper().str.strip()
    if "hadm_id" in df.columns:
        df["hadm_id"] = df["hadm_id"].astype("Int64")
    return df

def load_labels():
    """
    Reads the ground truth Hospital-Acquired Pressure Injury (HAPI) labels.

    Operations:
    1. Fails fast with FileNotFoundError when LABELS_PATH does not exist.
    2. Reads only the hadm_id and HAPI_FINAL columns from the labels CSV.
    3. Converts hadm_id to the nullable Int64 dtype so it lines up with the
       admission ids of the diagnosis dataframe.

    Returns:
        pd.DataFrame: Admission ids together with their HAPI_FINAL value.

    Raises:
        FileNotFoundError: If the labels file is missing.
    """
    labels_file_present = os.path.exists(LABELS_PATH)
    if not labels_file_present:
        raise FileNotFoundError(f"Labels file not found at {LABELS_PATH}")

    wanted_columns = ["hadm_id", "HAPI_FINAL"]
    frame = pd.read_csv(LABELS_PATH, usecols=wanted_columns, low_memory=False)

    admission_ids = frame["hadm_id"].astype("Int64")
    frame["hadm_id"] = admission_ids
    return frame

In [None]:
# Analysis Function
def select_top_features(df, pu_labels, min_admissions=None, top_n=None, excluded_codes=None):
    """
    Performs statistical analysis to identify the top risk factors correlated with HAPI.

    Operations:
    1. Deduplicate: Collapses the diagnoses to one row per (admission, code) so a code
       repeated within one admission is counted once.
    2. Filter: Removes rare ICD codes found in fewer than `min_admissions` admissions.
    3. Pivot: Transforms data from long format to wide (0/1 matrix: rows=admissions,
       cols=diagnoses). Admissions with no remaining code do not appear in the matrix.
    4. Merge: Joins the matrix with the HAPI labels (one label per admission; admissions
       without a label are treated as HAPI_FINAL = 0).
    5. Correlate: Calculates Pearson correlation between every diagnosis and 'HAPI_FINAL'.
    6. Clean: Excludes known Pressure Ulcer codes to prevent target leakage and drops
       codes whose correlation is undefined (NaN), then keeps the `top_n` highest.

    NOTE(review): codes are matched on the icd_code string alone; any icd_version
    column is ignored, so an ICD-9 and an ICD-10 code with identical text are pooled.

    Arguments:
        df (pd.DataFrame): The diagnoses dataframe (needs hadm_id and icd_code).
        pu_labels (pd.DataFrame): The labels dataframe (needs hadm_id and HAPI_FINAL).
        min_admissions (int | None): Frequency threshold. Defaults to MIN_ADMISSIONS.
        top_n (int | None): Number of codes to keep. Defaults to TOP_N.
        excluded_codes (iterable | None): Outcome codes that must never become
            features. Defaults to PRESSURE_ULCER_CODES.

    Returns:
        final_df: A feature matrix with hadm_id plus one 0/1 column per selected code.
        results_df: A summary table (icd_code, correlation), highest correlation first.
            Both frames are returned empty-but-well-formed when nothing qualifies.
    """
    if min_admissions is None:
        min_admissions = MIN_ADMISSIONS
    if top_n is None:
        top_n = TOP_N
    if excluded_codes is None:
        excluded_codes = PRESSURE_ULCER_CODES
    excluded_codes = set(excluded_codes)

    empty_results = pd.DataFrame(columns=["icd_code", "correlation"])

    # One row per (admission, code): makes the frequency below a true admission
    # count and guarantees every matrix cell is exactly 0 or 1.
    pairs = (
        df[["hadm_id", "icd_code"]]
        .dropna(subset=["hadm_id", "icd_code"])
        .drop_duplicates()
    )

    # Filter for common codes
    admissions_per_code = pairs["icd_code"].value_counts()
    common_codes = admissions_per_code[admissions_per_code >= min_admissions].index
    pairs = pairs[pairs["icd_code"].isin(common_codes)]
    if pairs.empty:
        return pd.DataFrame(columns=["hadm_id"]), empty_results

    # Pivot to a compact 0/1 matrix indexed by admission.
    matrix = (
        pairs.groupby(["hadm_id", "icd_code"])
        .size()
        .unstack(fill_value=0)
        .astype("int8")
    )

    # Outcome codes are dropped only after the pivot so that admissions whose
    # sole common code is a pressure-ulcer code still count as (all-zero) rows.
    leak_columns = [code for code in matrix.columns if code in excluded_codes]
    matrix = matrix.drop(columns=leak_columns).reset_index()
    matrix.columns.name = None

    # Merge with Labels. Only the two needed columns are used, and repeated
    # label rows are collapsed (positive if any row is positive) so the left
    # join cannot multiply admissions.
    labels = (
        pu_labels[["hadm_id", "HAPI_FINAL"]]
        .dropna(subset=["hadm_id"])
        .groupby("hadm_id", as_index=False)["HAPI_FINAL"]
        .max()
    )
    matrix = matrix.merge(labels, on="hadm_id", how="left")
    matrix["HAPI_FINAL"] = matrix["HAPI_FINAL"].fillna(0)

    feature_cols = [c for c in matrix.columns if c not in ("hadm_id", "HAPI_FINAL")]
    if not feature_cols:
        return matrix[["hadm_id"]].copy(), empty_results

    # Calculate Correlation. NaN appears when a column or the target is
    # constant; such codes carry no signal and must not fill the top list.
    target = matrix["HAPI_FINAL"].astype(float)
    corrs = (
        matrix[feature_cols]
        .corrwith(target)
        .dropna()
        .sort_values(ascending=False)
    )
    top = corrs.head(top_n)
    top_codes = top.index.tolist()

    results_df = pd.DataFrame({"icd_code": top_codes, "correlation": top.tolist()})

    # Create final feature set
    final_df = matrix[["hadm_id"] + top_codes].copy()

    return final_df, results_df

def add_descriptions_to_results(results_df, dictionary_path=None):
    """
    Adds human-readable descriptions.

    Operations:
    1. Loads the ICD dictionary file (d_icd_diagnoses.csv), reading icd_code as text
       so numeric codes keep their leading zeros.
    2. Standardizes the dictionary codes (uppercase, stripped) to match the analysis format.
    3. Looks up each result code and adds its description as the long_title column.
       When the dictionary lists the same code more than once (for example under
       two ICD versions) the distinct titles are joined with " | ", so the output
       always has exactly one row per input row.

    Arguments:
        results_df (pd.DataFrame): The output from select_top_features containing codes and scores.
        dictionary_path (str | None): Dictionary CSV to read. Defaults to DICTIONARY_PATH.

    Returns:
        pd.DataFrame: A copy of the results with a long_title column (missing for
        codes absent from the dictionary). If the dictionary file does not exist,
        the input dataframe is returned unchanged.
    """
    if dictionary_path is None:
        dictionary_path = DICTIONARY_PATH
    if not os.path.exists(dictionary_path):
        return results_df

    d_icd = pd.read_csv(dictionary_path, dtype={"icd_code": str}, low_memory=False)
    d_icd["icd_code"] = d_icd["icd_code"].str.upper().str.strip()

    # Only the handful of codes present in the results are needed.
    wanted = d_icd[d_icd["icd_code"].isin(results_df["icd_code"])]

    # Collect the distinct titles per code, preserving dictionary order.
    titles_by_code = {}
    for code, title in zip(wanted["icd_code"], wanted["long_title"]):
        if pd.isna(title):
            continue
        title = str(title)
        bucket = titles_by_code.setdefault(code, [])
        if title not in bucket:
            bucket.append(title)
    joined_titles = {code: " | ".join(titles) for code, titles in titles_by_code.items()}

    # A mapping (rather than a merge) cannot add or reorder rows.
    merged = results_df.copy()
    merged["long_title"] = merged["icd_code"].map(joined_titles)
    return merged

In [None]:
# Execute 
if __name__ == "__main__":
    # Load the raw diagnoses and the HAPI ground-truth labels.
    df_all = load_data()
    labels = load_labels()

    # Analyze: per-admission feature matrix plus the ranked correlation table.
    feat_df, corr_results = select_top_features(df_all, labels)

    # Add Descriptions (returned unchanged when the dictionary file is absent).
    final_results = add_descriptions_to_results(corr_results)

    # The header reports the number of codes actually selected instead of a
    # hard-coded 20, so it stays correct when TOP_N changes or fewer codes qualify.
    print(f"\n--- TOP {len(corr_results)} RISK FACTORS (Correlated Conditions) ---")
    print(final_results.to_string(index=False))

    # Save the feature matrix (not the summary table) for downstream modelling.
    print(f"\nSaving top risk factor features to: {OUTPUT_PATH}")
    feat_df.to_csv(OUTPUT_PATH, index=False)
    print("Done.")
