In [27]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import os

In [28]:
# Step 1: Create reference subset
final_data = pd.read_csv("C:/MIMIC-Extract/data/processed_final_data.csv")
vars_anti = [
    "surv_30d", "has_antibiotic", "age", "gender", "icd9_codes",
    "admission_type", "first_careunit", "White blood cell count_mean",
    "Temperature_mean", "Oxygen saturation_mean", "Creatinine_mean",
    "Platelets_mean", "Respiratory rate_mean"
]
ref_anti = final_data[vars_anti].dropna()
print(f"Reference subset shape: {ref_anti.shape}")  

# Encode icd9_codes as binary sepsis indicator (995.91, 995.92)
# sepsis_icd9: binary indicator variable (0 or 1)
# 995.91: "Sepsis" - nhiễm trùng huyết
# 995.92: "Severe sepsis" - nhiễm trùng huyết nặng
ref_anti = ref_anti.copy()
if ref_anti["icd9_codes"].dtype == object:
    ref_anti["sepsis_icd9"] = ref_anti["icd9_codes"].isin(["995.91", "995.92"]).astype(int)
    categorical_vars = ["gender", "admission_type", "first_careunit", "sepsis_icd9"]
else:
    categorical_vars = ["gender", "admission_type", "first_careunit", "icd9_codes"]

# Numerical confounders
numerical_vars = [
    "age", "White blood cell count_mean", "Temperature_mean",
    "Oxygen saturation_mean", "Creatinine_mean", "Platelets_mean",
    "Respiratory rate_mean"
]


Reference subset shape: (33875, 13)


#### ***Purpose: Measuring Initial Imbalance, Propensity Score Modeling, Inverse Probability Weighting (IPW))***

In [29]:
# Step 2: Validation - Propensity Score Balance
print("\n=== Propensity Score Balance ===")
# Prepare data
X = ref_anti[numerical_vars + categorical_vars]
X = pd.get_dummies(X, columns=categorical_vars, drop_first=True)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# SMD before adjustment
treated = X_scaled[ref_anti["has_antibiotic"] == 1]
control = X_scaled[ref_anti["has_antibiotic"] == 0]
smd_before = abs(treated.mean(axis=0) - control.mean(axis=0)) / np.std(X_scaled, axis=0)
print("SMD before adjustment (mean, max):", smd_before.mean(), smd_before.max())

# Propensity score model
ps_model = LogisticRegression(max_iter=1000).fit(X, ref_anti["has_antibiotic"])
ps = ps_model.predict_proba(X)[:, 1]

# IPW weights
weights = ref_anti["has_antibiotic"] / ps + (1 - ref_anti["has_antibiotic"]) / (1 - ps)
weights = np.clip(weights, 0.1, 10)  # Stabilize extreme weights

# SMD after IPW
smd_after = abs(
    np.average(treated, axis=0, weights=weights[ref_anti["has_antibiotic"] == 1]) -
    np.average(control, axis=0, weights=weights[ref_anti["has_antibiotic"] == 0])
)
print("SMD after IPW (mean, max):", smd_after.mean(), smd_after.max())



=== Propensity Score Balance ===
SMD before adjustment (mean, max): 0.11720818823262201 0.30298323141427436
SMD after IPW (mean, max): 0.026332993535682515 0.05691159245566336


#### ***Purpose: Show the overall study by showing that the propensity score approach works well even when considering additional important clinical variables like lactate***

In [30]:
# Step 4: Validation - Sensitivity Analysis with Lactate_mean
print("\n=== Sensitivity Analysis with Lactate_mean ===")
vars_lactate = vars_anti + ["Lactate_mean"]
ref_lactate = final_data[vars_lactate].dropna()
print(f"Lactate subset shape: {ref_lactate.shape}")

# Encode icd9_codes
ref_lactate = ref_lactate.copy()
if ref_lactate["icd9_codes"].dtype == object:
    ref_lactate["sepsis_icd9"] = ref_lactate["icd9_codes"].isin(["995.91", "995.92"]).astype(int)
    lactate_categorical = ["gender", "admission_type", "first_careunit", "sepsis_icd9"]
else:
    lactate_categorical = ["gender", "admission_type", "first_careunit", "icd9_codes"]

# Balance check
X_lactate = ref_lactate[numerical_vars + lactate_categorical + ["Lactate_mean"]]
X_lactate[numerical_vars + ["Lactate_mean"]] = scaler.fit_transform(X_lactate[numerical_vars + ["Lactate_mean"]])
X_lactate = pd.get_dummies(X_lactate, columns=lactate_categorical, drop_first=True)

# SMD for lactate subset
treated_lactate = X_lactate[ref_lactate["has_antibiotic"] == 1]
control_lactate = X_lactate[ref_lactate["has_antibiotic"] == 0]
smd_lactate_before = abs(treated_lactate.mean() - control_lactate.mean()) / X_lactate.std()
print("Lactate SMD before adjustment (mean, max):", smd_lactate_before.mean(), smd_lactate_before.max())

# Propensity score
ps_lactate_model = LogisticRegression(max_iter=5000, solver="saga").fit(X_lactate, ref_lactate["has_antibiotic"])
ps_lactate = ps_lactate_model.predict_proba(X_lactate)[:, 1]
weights_lactate = ref_lactate["has_antibiotic"] / ps_lactate + (1 - ref_lactate["has_antibiotic"]) / (1 - ps_lactate)
weights_lactate = np.clip(weights_lactate, 0.1, 10)

# SMD after IPW
smd_lactate_after = abs(
    treated_lactate.apply(lambda x: np.average(x, weights=weights_lactate[ref_lactate["has_antibiotic"] == 1])) -
    control_lactate.apply(lambda x: np.average(x, weights=weights_lactate[ref_lactate["has_antibiotic"] == 0]))
) / X_lactate.std()
print("Lactate SMD after IPW (mean, max):", smd_lactate_after.mean(), smd_lactate_after.max())

# Subset balance
print("Primary surv_30d rate:", ref_anti["surv_30d"].mean())
print("Lactate surv_30d rate:", ref_lactate["surv_30d"].mean())
print("Primary antibiotic %:", ref_anti["has_antibiotic"].mean())
print("Lactate antibiotic %:", ref_lactate["has_antibiotic"].mean())
print("Primary age mean:", ref_anti["age"].mean())
print("Lactate age mean:", ref_lactate["age"].mean())


=== Sensitivity Analysis with Lactate_mean ===
Lactate subset shape: (20110, 14)
Lactate SMD before adjustment (mean, max): 0.12011222949496798 0.30591923193735143


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_lactate[numerical_vars + ["Lactate_mean"]] = scaler.fit_transform(X_lactate[numerical_vars + ["Lactate_mean"]])


Lactate SMD after IPW (mean, max): 0.01592260419111029 0.03184043232025828
Primary surv_30d rate: 0.8768413284132841
Lactate surv_30d rate: 0.8420188960716062
Primary antibiotic %: 0.16433948339483395
Lactate antibiotic %: 0.2384883142715067
Primary age mean: 75.05608108986637
Lactate age mean: 74.62836856416378


#### ***Purpose: Verifying which variables are actually associated with both treatment (antibiotic use) and outcome (30-day survival). True confounders must influence both. Platelets is the only numerical variable not significantly associated with treatment decisions (p = 0.7139).***

In [31]:
# Step 5: Validation - Statistical Tests
print("\n=== Statistical Tests ===")
# Treatment association
print("Treatment Associations (has_antibiotic):")
for conf in numerical_vars + categorical_vars:
    try:
        if conf in numerical_vars or conf == "sepsis_icd9":
            X = sm.add_constant(ref_anti[conf])
        else:
            X = sm.add_constant(pd.get_dummies(ref_anti[conf], drop_first=True))
        model = sm.Logit(ref_anti["has_antibiotic"], X).fit(disp=0)
        pval = model.pvalues.iloc[1]
        print(f"{conf}: p = {pval:.4f}")
    except Exception as e:
        print(f"{conf}: Skipped ({str(e)})")

# Outcome association
print("\nOutcome Associations (surv_30d, adjusting for has_antibiotic):")
for conf in numerical_vars + categorical_vars:
    try:
        if conf in numerical_vars or conf == "sepsis_icd9":
            X = sm.add_constant(ref_anti[[conf, "has_antibiotic"]])
        else:
            X = sm.add_constant(pd.concat([pd.get_dummies(ref_anti[conf], drop_first=True), 
                                         ref_anti["has_antibiotic"]], axis=1))
        model = sm.Logit(ref_anti["surv_30d"], X).fit(disp=0)
        pval = model.pvalues.iloc[1]
        print(f"{conf}: p = {pval:.4f}")
    except Exception as e:
        print(f"{conf}: Skipped ({str(e)})")


=== Statistical Tests ===
Treatment Associations (has_antibiotic):
age: p = 0.0004
White blood cell count_mean: p = 0.0000
Temperature_mean: p = 0.0000
Oxygen saturation_mean: p = 0.0000
Creatinine_mean: p = 0.0000
Platelets_mean: p = 0.7139
Respiratory rate_mean: p = 0.0000
gender: Skipped (Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).)
admission_type: Skipped (Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).)
first_careunit: Skipped (Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).)
sepsis_icd9: Skipped (Singular matrix)

Outcome Associations (surv_30d, adjusting for has_antibiotic):
age: p = 0.0000
White blood cell count_mean: p = 0.0000
Temperature_mean: p = 0.0000
Oxygen saturation_mean: p = 0.0000
Creatinine_mean: p = 0.0000
Platelets_mean: p = 0.0002
Respiratory rate_mean: p = 0.0000
gender: Skipped (Pandas data cast to numpy dtype of object. Check input data with np.asarray

#### ***Purpose: Removing Platelets_mean from the list of variables: a variable that wasn't strongly linked to treatment decisions***

In [32]:
# Reference subset (same as original)
vars_anti = [
    "surv_30d", "has_antibiotic", "age", "gender", "icd9_codes",
    "admission_type", "first_careunit", "White blood cell count_mean",
    "Temperature_mean", "Oxygen saturation_mean", "Creatinine_mean",
    "Respiratory rate_mean"  # Excluded Platelets_mean
]
ref_anti = final_data[vars_anti].dropna()
print(f"Reference subset shape: {ref_anti.shape}")

# Encode icd9_codes
ref_anti = ref_anti.copy()
if ref_anti["icd9_codes"].dtype == object:
    ref_anti["sepsis_icd9"] = ref_anti["icd9_codes"].isin(["995.91", "995.92"]).astype(int)
    categorical_vars = ["gender", "admission_type", "first_careunit", "sepsis_icd9"]
else:
    categorical_vars = ["gender", "admission_type", "first_careunit", "icd9_codes"]

# Numerical confounders (no Platelets_mean)
numerical_vars = [
    "age", "White blood cell count_mean", "Temperature_mean",
    "Oxygen saturation_mean", "Creatinine_mean", "Respiratory rate_mean"
]

# Propensity Score Balance
print("\n=== Propensity Score Balance (Without Platelets_mean) ===")
X = ref_anti[numerical_vars + categorical_vars]
scaler = StandardScaler()
X[numerical_vars] = scaler.fit_transform(X[numerical_vars])
X = pd.get_dummies(X, columns=categorical_vars, drop_first=True)

# SMD before
treated = X[ref_anti["has_antibiotic"] == 1]
control = X[ref_anti["has_antibiotic"] == 0]
smd_before = abs(treated.mean() - control.mean()) / X.std()
print("SMD before adjustment (mean, max):", smd_before.mean(), smd_before.max())

# Propensity score
ps_model = LogisticRegression(max_iter=5000, solver="saga").fit(X, ref_anti["has_antibiotic"])
ps = ps_model.predict_proba(X)[:, 1]

# IPW weights
weights = ref_anti["has_antibiotic"] / ps + (1 - ref_anti["has_antibiotic"]) / (1 - ps)
weights = np.clip(weights, 0.1, 10)

# SMD after IPW
smd_after = abs(
    treated.apply(lambda x: np.average(x, weights=weights[ref_anti["has_antibiotic"] == 1])) -
    control.apply(lambda x: np.average(x, weights=weights[ref_anti["has_antibiotic"] == 0]))
) / X.std()
print("SMD after IPW (mean, max):", smd_after.mean(), smd_after.max())

Reference subset shape: (33883, 12)

=== Propensity Score Balance (Without Platelets_mean) ===
SMD before adjustment (mean, max): 0.12587399924466128 0.3031979490422918


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_vars] = scaler.fit_transform(X[numerical_vars])


SMD after IPW (mean, max): 0.022418172926089122 0.04051751319644393


#### ***Purpose: Refined Sensitivity Analysis (excluding platelets)  ***

In [33]:
# Reference subset
vars_anti = [
    "surv_30d", "has_antibiotic", "age", "gender", "icd9_codes",
    "admission_type", "first_careunit", "White blood cell count_mean",
    "Temperature_mean", "Oxygen saturation_mean", "Creatinine_mean",
    "Respiratory rate_mean"
]
ref_anti = final_data[vars_anti].dropna()
print(f"Primary subset shape: {ref_anti.shape}")

# Sensitivity analysis
vars_lactate = vars_anti + ["Lactate_mean"]
ref_lactate = final_data[vars_lactate].dropna()
print(f"Lactate subset shape: {ref_lactate.shape}")

ref_lactate = ref_lactate.copy()
if ref_lactate["icd9_codes"].dtype == object:
    ref_lactate["sepsis_icd9"] = ref_lactate["icd9_codes"].isin(["995.91", "995.92"]).astype(int)
    lactate_categorical = ["gender", "admission_type", "first_careunit", "sepsis_icd9"]
else:
    lactate_categorical = ["gender", "admission_type", "first_careunit", "icd9_codes"]

numerical_vars = ["age", "White blood cell count_mean", "Temperature_mean",
                 "Oxygen saturation_mean", "Creatinine_mean", "Respiratory rate_mean"]

X_lactate = ref_lactate[numerical_vars + lactate_categorical + ["Lactate_mean"]]
scaler = StandardScaler()
X_lactate[numerical_vars + ["Lactate_mean"]] = scaler.fit_transform(X_lactate[numerical_vars + ["Lactate_mean"]])
X_lactate = pd.get_dummies(X_lactate, columns=lactate_categorical, drop_first=True)

treated_lactate = X_lactate[ref_lactate["has_antibiotic"] == 1]
control_lactate = X_lactate[ref_lactate["has_antibiotic"] == 0]
smd_lactate_before = abs(treated_lactate.mean() - control_lactate.mean()) / X_lactate.std()
print("Lactate SMD before adjustment (mean, max):", smd_lactate_before.mean(), smd_lactate_before.max())

ps_lactate_model = LogisticRegression(max_iter=5000, solver="saga").fit(X_lactate, ref_lactate["has_antibiotic"])
ps_lactate = ps_lactate_model.predict_proba(X_lactate)[:, 1]
weights_lactate = ref_lactate["has_antibiotic"] / ps_lactate + (1 - ref_lactate["has_antibiotic"]) / (1 - ps_lactate)
weights_lactate = np.clip(weights_lactate, 0.1, 10)

smd_lactate_after = abs(
    treated_lactate.apply(lambda x: np.average(x, weights=weights_lactate[ref_lactate["has_antibiotic"] == 1])) -
    control_lactate.apply(lambda x: np.average(x, weights=weights_lactate[ref_lactate["has_antibiotic"] == 0]))
) / X_lactate.std()
print("Lactate SMD after IPW (mean, max):", smd_lactate_after.mean(), smd_lactate_after.max())
print("Lactate SMD after IPW per variable:\n", pd.Series(smd_lactate_after, index=X_lactate.columns))
print("Primary surv_30d rate:", ref_anti["surv_30d"].mean())
print("Lactate surv_30d rate:", ref_lactate["surv_30d"].mean())
print("Primary antibiotic %:", ref_anti["has_antibiotic"].mean())
print("Lactate antibiotic %:", ref_lactate["has_antibiotic"].mean())

Primary subset shape: (33883, 12)
Lactate subset shape: (20114, 13)
Lactate SMD before adjustment (mean, max): 0.12874402275972927 0.30630448970957536


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_lactate[numerical_vars + ["Lactate_mean"]] = scaler.fit_transform(X_lactate[numerical_vars + ["Lactate_mean"]])


Lactate SMD after IPW (mean, max): 0.014664006905853748 0.03154907069303353
Lactate SMD after IPW per variable:
 age                            0.016008
White blood cell count_mean    0.011897
Temperature_mean               0.016740
Oxygen saturation_mean         0.023419
Creatinine_mean                0.013330
Respiratory rate_mean          0.031549
Lactate_mean                   0.005620
gender_M                       0.011979
admission_type_EMERGENCY       0.001777
admission_type_URGENT          0.001845
first_careunit_CSRU            0.013804
first_careunit_MICU            0.014740
first_careunit_SICU            0.014654
first_careunit_TSICU           0.027934
dtype: float64
Primary surv_30d rate: 0.8767818670129564
Lactate surv_30d rate: 0.8419508799840907
Primary antibiotic %: 0.16433019508308
Lactate antibiotic %: 0.23849060355970966


#### ***Purpose: Statistical Analysis of Categorical Variables - Gender: No significant association (p = 0.7396). This suggests  clinicians don't treat males and females differently regarding antibiotics when accounting for other factors***

In [34]:
vars_anti = [
    "surv_30d", "has_antibiotic", "age", "gender", "icd9_codes",
    "admission_type", "first_careunit", "White blood cell count_mean",
    "Temperature_mean", "Oxygen saturation_mean", "Creatinine_mean",
    "Respiratory rate_mean"
]
ref_anti = final_data[vars_anti].dropna()
print(f"Subset shape: {ref_anti.shape}")

# Step 1.1: Inspect categoricals
print("\nCategorical distributions:")
for conf in ["gender", "admission_type", "first_careunit"]:
    print(f"\n{conf} distribution:")
    print(ref_anti[conf].value_counts(dropna=False))
    print(f"Type: {ref_anti[conf].dtype}")
    print(f"Missing: {ref_anti[conf].isna().sum()}")

# Step 1.2: Test categoricals with fixed encoding
for conf in ["gender", "admission_type", "first_careunit"]:
    try:
        # Ensure categorical type and drop NaNs
        data = ref_anti[[conf, "has_antibiotic", "surv_30d"]].dropna()
        data[conf] = data[conf].astype(str).astype("category")  # Convert to string then category
        
        # Create dummies
        dummies = pd.get_dummies(data[conf], prefix=conf, drop_first=True)
        X = sm.add_constant(dummies.astype(float))  # Ensure numeric
        y_treatment = data["has_antibiotic"].astype(float)
        
        # Treatment association
        model = sm.Logit(y_treatment, X).fit(disp=0, maxiter=1000)
        pvals = model.pvalues[1:]  # Skip constant
        print(f"\n{conf} (treatment):")
        for var, pval in zip(pvals.index, pvals):
            print(f"  {var}: p = {pval:.4f}")
        
        # Outcome association
        X_outcome = sm.add_constant(pd.concat([dummies.astype(float), 
                                             data["has_antibiotic"].astype(float)], axis=1))
        y_outcome = data["surv_30d"].astype(float)
        model_outcome = sm.Logit(y_outcome, X_outcome).fit(disp=0, maxiter=1000)
        pvals_outcome = model_outcome.pvalues[1:len(dummies.columns)+1]  # Skip constant, has_antibiotic
        print(f"{conf} (outcome):")
        for var, pval in zip(pvals_outcome.index, pvals_outcome):
            print(f"  {var}: p = {pval:.4f}")
            
    except Exception as e:
        print(f"\n{conf}: Skipped ({str(e)})")

Subset shape: (33883, 12)

Categorical distributions:

gender distribution:
gender
M    19158
F    14725
Name: count, dtype: int64
Type: object
Missing: 0

admission_type distribution:
admission_type
EMERGENCY    27432
ELECTIVE      5564
URGENT         887
Name: count, dtype: int64
Type: object
Missing: 0

first_careunit distribution:
first_careunit
MICU     12030
CSRU      6835
SICU      5451
CCU       5118
TSICU     4449
Name: count, dtype: int64
Type: object
Missing: 0

gender (treatment):
  gender_M: p = 0.7396
gender (outcome):
  gender_M: p = 0.0000

admission_type (treatment):
  admission_type_EMERGENCY: p = 0.0000
  admission_type_URGENT: p = 0.0258
admission_type (outcome):
  admission_type_EMERGENCY: p = 0.0000
  admission_type_URGENT: p = 0.0000

first_careunit (treatment):
  first_careunit_CSRU: p = 0.0000
  first_careunit_MICU: p = 0.0000
  first_careunit_SICU: p = 0.0000
  first_careunit_TSICU: p = 0.4844
first_careunit (outcome):
  first_careunit_CSRU: p = 0.0000
  first

#### ***Purpose: Removing Gender from the Propensity Score Model - Removing gender has negligible impact on the model's ability to balance confounders.***

In [35]:
# Updated subset without gender
vars_anti_no_gender = [
    "surv_30d", "has_antibiotic", "age", "icd9_codes",
    "admission_type", "first_careunit", "White blood cell count_mean",
    "Temperature_mean", "Oxygen saturation_mean", "Creatinine_mean",
    "Respiratory rate_mean"
]
ref_anti = final_data[vars_anti_no_gender].dropna()
print(f"Reference subset shape (no gender): {ref_anti.shape}")

# Encode icd9_codes
ref_anti = ref_anti.copy()
if ref_anti["icd9_codes"].dtype == object:
    ref_anti["sepsis_icd9"] = ref_anti["icd9_codes"].isin(["995.91", "995.92"]).astype(int)
    categorical_vars = ["admission_type", "first_careunit", "sepsis_icd9"]
else:
    categorical_vars = ["admission_type", "first_careunit", "icd9_codes"]

numerical_vars = [
    "age", "White blood cell count_mean", "Temperature_mean",
    "Oxygen saturation_mean", "Creatinine_mean", "Respiratory rate_mean"
]

# Propensity Score Balance
print("\n=== Propensity Score Balance (Without gender) ===")
X = ref_anti[numerical_vars + categorical_vars]
scaler = StandardScaler()
X[numerical_vars] = scaler.fit_transform(X[numerical_vars])
X = pd.get_dummies(X, columns=categorical_vars, drop_first=True)

# SMD before
treated = X[ref_anti["has_antibiotic"] == 1]
control = X[ref_anti["has_antibiotic"] == 0]
smd_before = abs(treated.mean() - control.mean()) / X.std()
print("SMD before adjustment (mean, max):", smd_before.mean(), smd_before.max())

# Propensity score
ps_model = LogisticRegression(max_iter=5000, solver="saga").fit(X, ref_anti["has_antibiotic"])
ps = ps_model.predict_proba(X)[:, 1]

# IPW weights
weights = ref_anti["has_antibiotic"] / ps + (1 - ref_anti["has_antibiotic"]) / (1 - ps)
weights = np.clip(weights, 0.1, 10)

# SMD after IPW
smd_after = abs(
    treated.apply(lambda x: np.average(x, weights=weights[ref_anti["has_antibiotic"] == 1])) -
    control.apply(lambda x: np.average(x, weights=weights[ref_anti["has_antibiotic"] == 0]))
) / X.std()
print("SMD after IPW (mean, max):", smd_after.mean(), smd_after.max())

Reference subset shape (no gender): (33883, 11)

=== Propensity Score Balance (Without gender) ===
SMD before adjustment (mean, max): 0.13595748442554603 0.3031979490422918


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_vars] = scaler.fit_transform(X[numerical_vars])


SMD after IPW (mean, max): 0.023603327807974684 0.040413689021012646


#### ***Purpose: Understanding the ICD-9 Code Analysis: 038.0 (Streptococcal septicemia), 038.9 (Unspecified septicemia)***

In [36]:
# Updated confounder set (no gender)
vars_anti = [
    "surv_30d", "has_antibiotic", "age", "icd9_codes",
    "admission_type", "first_careunit", "White blood cell count_mean",
    "Temperature_mean", "Oxygen saturation_mean", "Creatinine_mean",
    "Respiratory rate_mean"
]
ref_anti = final_data[vars_anti].dropna()
print(f"Reference subset shape: {ref_anti.shape}")

# Step 1.1: Inspect icd9_codes
print("\nSample icd9_codes:")
print(ref_anti["icd9_codes"].head(10))
print(f"Type: {ref_anti['icd9_codes'].dtype}")
print(f"Unique values (first 20): {ref_anti['icd9_codes'].unique()[:20]}")

# Step 1.2: Update sepsis_icd9
ref_anti = ref_anti.copy()
if ref_anti["icd9_codes"].dtype == object:
    ref_anti["sepsis_icd9"] = ref_anti["icd9_codes"].isin(
        ["995.91", "995.92", "038.0", "038.9"]
    ).astype(int)
else:
    print("Warning: icd9_codes not strings, attempting conversion")
    ref_anti["sepsis_icd9"] = ref_anti["icd9_codes"].astype(str).isin(
        ["995.91", "995.92", "038.0", "038.9"]
    ).astype(int)
print("\nUpdated sepsis_icd9 distribution:")
print(ref_anti["sepsis_icd9"].value_counts())

# Step 1.3: Propensity Score Balance
print("\n=== Propensity Score Balance (Updated sepsis_icd9) ===")
categorical_vars = ["admission_type", "first_careunit", "sepsis_icd9"]
numerical_vars = [
    "age", "White blood cell count_mean", "Temperature_mean",
    "Oxygen saturation_mean", "Creatinine_mean", "Respiratory rate_mean"
]
X = ref_anti[numerical_vars + categorical_vars]
scaler = StandardScaler()
X[numerical_vars] = scaler.fit_transform(X[numerical_vars])
X = pd.get_dummies(X, columns=categorical_vars, drop_first=True)

# SMD before
treated = X[ref_anti["has_antibiotic"] == 1]
control = X[ref_anti["has_antibiotic"] == 0]
smd_before = abs(treated.mean() - control.mean()) / X.std()
print("SMD before adjustment (mean, max):", smd_before.mean(), smd_before.max())

# Propensity score
ps_model = LogisticRegression(max_iter=5000, solver="saga").fit(X, ref_anti["has_antibiotic"])
ps = ps_model.predict_proba(X)[:, 1]

# IPW weights
weights = ref_anti["has_antibiotic"] / ps + (1 - ref_anti["has_antibiotic"]) / (1 - ps)
weights = np.clip(weights, 0.1, 10)

# SMD after IPW
smd_after = abs(
    treated.apply(lambda x: np.average(x, weights=weights[ref_anti["has_antibiotic"] == 1])) -
    control.apply(lambda x: np.average(x, weights=weights[ref_anti["has_antibiotic"] == 0]))
) / X.std()
print("SMD after IPW (mean, max):", smd_after.mean(), smd_after.max())

Reference subset shape: (33883, 11)

Sample icd9_codes:
0    ['0389', '78559', '5849', '4275', '41071', '42...
1    ['042', '1363', '7994', '2763', '7907', '5715'...
2    ['40391', '4440', '9972', '2766', '2767', '285...
3      ['431', '5070', '4280', '5849', '2765', '4019']
4                                             ['1913']
5    ['1570', '57410', '9971', '4275', '99811', '40...
6           ['41401', '4111', '25000', '4019', '2720']
7                   ['7455', '45829', 'V1259', '2724']
8    ['25080', '78039', '29633', 'V5867', 'E9323', ...
9    ['80502', '5990', '5964', 'E8809', '8220', '73...
Name: icd9_codes, dtype: object
Type: object
Unique values (first 20): ["['0389', '78559', '5849', '4275', '41071', '4280', '6826', '4254', '2639']"
 "['042', '1363', '7994', '2763', '7907', '5715', '04111', 'V090', 'E9317']"
 "['40391', '4440', '9972', '2766', '2767', '2859', '2753', 'V1582']"
 "['431', '5070', '4280', '5849', '2765', '4019']" "['1913']"
 "['1570', '57410', '9971', '4275', 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_vars] = scaler.fit_transform(X[numerical_vars])


SMD after IPW (mean, max): 0.023603559986632928 0.04041499282322323


#### ***Purpose: Successful Identification of Sepsis Patients ***

In [37]:
# Confounder set (no gender)
import ast
vars_anti = [
    "surv_30d", "has_antibiotic", "age", "icd9_codes",
    "admission_type", "first_careunit", "White blood cell count_mean",
    "Temperature_mean", "Oxygen saturation_mean", "Creatinine_mean",
    "Respiratory rate_mean"
]
ref_anti = final_data[vars_anti].dropna()
print(f"Reference subset shape: {ref_anti.shape}")

# Step 1.1: Parse icd9_codes
print("\nSample icd9_codes (raw):")
print(ref_anti["icd9_codes"].head(10))

def parse_icd9_codes(codes):
    try:
        # Handle stringified lists (e.g., "['0389', ...]")
        if isinstance(codes, str) and codes.startswith("["):
            codes_list = ast.literal_eval(codes)
            return [str(code).replace(".", "") for code in codes_list]  # Normalize (e.g., '038.9' -> '0389')
        elif isinstance(codes, str):
            return [codes.replace(".", "")]
        elif isinstance(codes, list):
            return [str(code).replace(".", "") for code in codes]
        return []
    except (ValueError, SyntaxError):
        return []

# Apply parsing
ref_anti = ref_anti.copy()
ref_anti["icd9_codes_parsed"] = ref_anti["icd9_codes"].apply(parse_icd9_codes)
print("\nSample icd9_codes (parsed):")
print(ref_anti["icd9_codes_parsed"].head(10))

# Step 1.2: Update sepsis_icd9
sepsis_codes = ["99591", "99592", "0380", "0389"]  # Normalized
ref_anti["sepsis_icd9"] = ref_anti["icd9_codes_parsed"].apply(
    lambda codes: int(any(code in sepsis_codes for code in codes))
)
print("\nUpdated sepsis_icd9 distribution:")
print(ref_anti["sepsis_icd9"].value_counts())
print(f"Prevalence: {ref_anti['sepsis_icd9'].mean():.2%}")

# Step 1.3: Propensity Score Balance
print("\n=== Propensity Score Balance (Fixed sepsis_icd9) ===")
categorical_vars = ["admission_type", "first_careunit", "sepsis_icd9"]
numerical_vars = [
    "age", "White blood cell count_mean", "Temperature_mean",
    "Oxygen saturation_mean", "Creatinine_mean", "Respiratory rate_mean"
]
X = ref_anti[numerical_vars + categorical_vars].copy()  # Avoid SettingWithCopyWarning
scaler = StandardScaler()
X.loc[:, numerical_vars] = scaler.fit_transform(X[numerical_vars])
X = pd.get_dummies(X, columns=categorical_vars, drop_first=True)

# SMD before
treated = X[ref_anti["has_antibiotic"] == 1]
control = X[ref_anti["has_antibiotic"] == 0]
smd_before = abs(treated.mean() - control.mean()) / X.std()
print("SMD before adjustment (mean, max):", smd_before.mean(), smd_before.max())

# Propensity score
ps_model = LogisticRegression(max_iter=5000, solver="saga").fit(X, ref_anti["has_antibiotic"])
ps = ps_model.predict_proba(X)[:, 1]

# IPW weights
weights = ref_anti["has_antibiotic"] / ps + (1 - ref_anti["has_antibiotic"]) / (1 - ps)
weights = np.clip(weights, 0.1, 10)

# SMD after IPW
smd_after = abs(
    treated.apply(lambda x: np.average(x, weights=weights[ref_anti["has_antibiotic"] == 1])) -
    control.apply(lambda x: np.average(x, weights=weights[ref_anti["has_antibiotic"] == 0]))
) / X.std()
print("SMD after IPW (mean, max):", smd_after.mean(), smd_after.max())
print("SMD after IPW (per variable):\n", pd.Series(smd_after, index=X.columns))

Reference subset shape: (33883, 11)

Sample icd9_codes (raw):
0    ['0389', '78559', '5849', '4275', '41071', '42...
1    ['042', '1363', '7994', '2763', '7907', '5715'...
2    ['40391', '4440', '9972', '2766', '2767', '285...
3      ['431', '5070', '4280', '5849', '2765', '4019']
4                                             ['1913']
5    ['1570', '57410', '9971', '4275', '99811', '40...
6           ['41401', '4111', '25000', '4019', '2720']
7                   ['7455', '45829', 'V1259', '2724']
8    ['25080', '78039', '29633', 'V5867', 'E9323', ...
9    ['80502', '5990', '5964', 'E8809', '8220', '73...
Name: icd9_codes, dtype: object

Sample icd9_codes (parsed):
0    [0389, 78559, 5849, 4275, 41071, 4280, 6826, 4...
1    [042, 1363, 7994, 2763, 7907, 5715, 04111, V09...
2    [40391, 4440, 9972, 2766, 2767, 2859, 2753, V1...
3                  [431, 5070, 4280, 5849, 2765, 4019]
4                                               [1913]
5    [1570, 57410, 9971, 4275, 99811, 4019, 5680, 5.

#### ***Purpose: Save the antibiotic reference subset and checking missing values to ensure the complete dataset***

In [38]:
import pandas as pd
import ast

# Define confounder set, including identifiers
vars_anti = [
    "subject_id", "hadm_id", "icustay_id",  # Add identifiers
    "surv_30d", "has_antibiotic", "age", "icd9_codes",
    "admission_type", "first_careunit", "White blood cell count_mean",
    "Temperature_mean", "Oxygen saturation_mean", "Creatinine_mean",
    "Respiratory rate_mean"
]

# Assume final_data is your preprocessed MIMIC-III DataFrame
# Create reference subset (complete cases)
ref_anti = final_data[vars_anti].dropna()
print(f"Reference subset shape: {ref_anti.shape}")

# Verify no missing values
print("\nMissing values per column:")
print(ref_anti.isna().sum())
if ref_anti.isna().sum().sum() == 0:
    print("Confirmed: No missing values in reference subset")
else:
    print("Warning: Missing values detected!")

# Parse icd9_codes for sepsis_icd9
def parse_icd9_codes(codes):
    try:
        if isinstance(codes, str) and codes.startswith("["):
            codes_list = ast.literal_eval(codes)
            return [str(code).replace(".", "") for code in codes_list]
        elif isinstance(codes, str):
            return [codes.replace(".", "")]
        return []
    except:
        return []

ref_anti = ref_anti.copy()
ref_anti["icd9_codes_parsed"] = ref_anti["icd9_codes"].apply(parse_icd9_codes)
sepsis_codes = ["99591", "99592", "0380", "0389"]
ref_anti["sepsis_icd9"] = ref_anti["icd9_codes_parsed"].apply(
    lambda codes: int(any(code in sepsis_codes for code in codes))
)
print(f"\nsepsis_icd9 prevalence: {ref_anti['sepsis_icd9'].mean():.2%}")
print(f"Missing values in sepsis_icd9: {ref_anti['sepsis_icd9'].isna().sum()}")

# Drop temporary column
ref_anti = ref_anti.drop(columns=["icd9_codes_parsed"])

# Save dataset
save_path = "C:/MIMIC-Extract/data/ref_anti_with_ids.csv"  # Updated path
ref_anti.to_csv(save_path, index=False)
print(f"\nReference subset saved to: {save_path}")

# Load and verify
loaded_data = pd.read_csv(save_path)
print(f"\nLoaded dataset shape: {loaded_data.shape}")
print(f"Loaded sepsis_icd9 prevalence: {loaded_data['sepsis_icd9'].mean():.2%}")
print("\nMissing values in loaded dataset:")
print(loaded_data.isna().sum())
if loaded_data.isna().sum().sum() == 0:
    print("Confirmed: No missing values in loaded dataset")
else:
    print("Warning: Missing values in loaded dataset!")
print("Columns in saved dataset:", loaded_data.columns.tolist())

Reference subset shape: (33883, 14)

Missing values per column:
subject_id                     0
hadm_id                        0
icustay_id                     0
surv_30d                       0
has_antibiotic                 0
age                            0
icd9_codes                     0
admission_type                 0
first_careunit                 0
White blood cell count_mean    0
Temperature_mean               0
Oxygen saturation_mean         0
Creatinine_mean                0
Respiratory rate_mean          0
dtype: int64
Confirmed: No missing values in reference subset

sepsis_icd9 prevalence: 8.75%
Missing values in sepsis_icd9: 0

Reference subset saved to: C:/MIMIC-Extract/data/ref_anti_with_ids.csv

Loaded dataset shape: (33883, 15)
Loaded sepsis_icd9 prevalence: 8.75%

Missing values in loaded dataset:
subject_id                     0
hadm_id                        0
icustay_id                     0
surv_30d                       0
has_antibiotic                 0
age 