In [4]:
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OneHotEncoder
from imblearn.combine import SMOTETomek
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd
# Switch read by the plotting code further down; False skips the figure.
visualize = False

# Load the input table from disk.
df = pd.read_csv("../src/dataframe_clean.csv")

# Count the rows in each class of the 30-day readmission label and report it.
class_counts = df['readmission_30'].value_counts()
print(
    "readmission class distribution (0 = unreadmitted, 1 = readmitted):",
    class_counts,
    sep="\n",
)

readmission class distribution (0 = unreadmitted, 1 = readmitted):
readmission_30
0    48142
1     3297
Name: count, dtype: int64


In [5]:
# ---------------------------------------------------------------------------
# Build the model input: split target / features, one-hot encode GENDER,
# and median-impute missing values.
# ---------------------------------------------------------------------------

# Target frame keyed by admission id (only used by the optional export below).
Y_df = df.set_index('HADM_ID')[['readmission_30']]
# Y_df.to_csv("../src/Y_df.csv")

# Features: drop the patient id and every readmission-derived column.
# errors="ignore" tolerates input files in which some of them are absent.
X = df.drop(
    columns=[
        "SUBJECT_ID",
        "readmission_30",
        "readmission_60",
        "readmit_gap",
    ],
    errors="ignore"
)
# NOTE(review): HADM_ID is not in the drop list, so it remains in X and is
# treated as a numeric feature by the imputer and by anything consuming
# X_processed. It looks like an identifier -- confirm whether it should be
# excluded before resampling / modelling.

# Separate the single categorical column from the rest.
X_cat = X[['GENDER']].copy()
X_num = X.drop(columns=['GENDER'])

# One-hot encode GENDER; drop='first' removes the first category so a single
# indicator column remains (GENDER_M in the printed head: 0 -> female, 1 -> male).
ohe = OneHotEncoder(sparse_output=False, drop='first')
X_cat_encoded = ohe.fit_transform(X_cat)
cat_feature_names = ohe.get_feature_names_out(X_cat.columns)
X_cat_encoded_df = pd.DataFrame(X_cat_encoded, columns=cat_feature_names, index=X_cat.index)

# Disabled experiment: 1-99% percentile trimming of numeric outliers.
# NOTE(review): the snippet references `Y`, which is not defined in the cells
# shown here; it would need the target series before it can be re-enabled.
# p_low, p_high = 0.01, 0.99
# lower = X_num.quantile(p_low)
# upper = X_num.quantile(p_high)
#
# # 2) Build mask: keep rows where *all* features are within those percentiles
# mask = ((X_num >= lower) & (X_num <= upper)).all(axis=1)
#
# # 3) Filter
# X_num_trim = X_num.loc[mask]
# X_cat_trim = X_cat.loc[mask]
# Y_trim     = Y.loc[mask]
#
# print(f"Kept {len(X_num_trim)} of {len(X_num)} rows after 1–99% trimming")


# Encoded categorical column first, then the numeric columns.
X_processed = pd.concat([X_cat_encoded_df, X_num], axis=1)
print(X_processed.head())

# Impute missing values in features before resampling.
# fit_transform returns a bare ndarray, so both the column labels and the row
# index are re-attached explicitly; without index= the frame would silently
# get a fresh RangeIndex and could misalign with df if rows were ever
# filtered or re-indexed upstream.
# NOTE(review): the imputer is fit on every row; if a train/test split is
# made downstream, consider fitting it on the training portion only.
imputer = SimpleImputer(strategy='median')
X_processed = pd.DataFrame(
    imputer.fit_transform(X_processed),
    columns=X_processed.columns,
    index=X_processed.index,
)
print("Missing values after imputation:", X_processed.isna().sum().sum())

   GENDER_M   HADM_ID  AGE  Albumin  Bicarbonate    Chloride  Creatinine  \
0       0.0  165315.0   64      NaN    27.500000  104.000000    0.600000   
1       1.0  152223.0   71      NaN    24.000000  111.000000    0.725000   
2       1.0  161859.0   39      NaN    24.000000  105.500000    0.925000   
3       1.0  129635.0   58      3.2    24.333333  104.222222    1.188889   
4       1.0  197661.0   72      NaN    25.625000  103.250000    1.312500   

   Glucose  Hematocrit  Hemoglobin  ...  CSRU  MICU  NICU  SICU  TSICU  \
0      NaN   35.900000   12.200000  ...   0.0   1.0   0.0   0.0    0.0   
1    113.7   25.911111    9.300000  ...   1.0   0.0   0.0   0.0    0.0   
2      NaN   40.866667   14.333333  ...   0.0   0.0   0.0   0.0    0.0   
3      NaN   31.333333   11.042857  ...   0.0   0.0   0.0   0.0    0.0   
4      NaN   35.411111   11.866667  ...   0.0   0.0   0.0   0.0    0.0   

   first_icu_los_hours  last_icu_los_hours  mean_icu_los_hours  \
0            27.450000          

In [6]:
# Rebalance the classes with the combined SMOTE + Tomek-links resampler.
# NOTE(review): resampling is applied to the full table here; confirm that any
# evaluation split downstream is not drawn from the resampled rows.
smk = SMOTETomek(random_state=42)
X_res, y_res = smk.fit_resample(X_processed, df['readmission_30'])

# Attach the resampled label as the last column of a new frame.
df_smt = X_res.assign(readmission_30=y_res)

print(
    "distribution after smote+tomek:",
    df_smt['readmission_30'].value_counts(),
    sep="\n",
)
print(y_res.head())

# Write the resampled table to disk (row index not written), then preview it.
df_smt.to_csv("../src/data_smote.csv", index=False)
print(df_smt.head())
# Optional diagnostic figure: pairwise scatter matrix of the numeric features.
# Only produced when `visualize` is True.
if visualize:
    # Columns with at most one distinct non-null value are removed first
    # (presumably because the KDE diagonal cannot be estimated for a constant
    # column -- TODO confirm).
    const_cols = [
        col
        for col in X_num.columns
        if X_num[col].nunique(dropna=True) <= 1
    ]
    print("Dropping constant columns:", const_cols)
    X_num_plot = X_num.drop(columns=const_cols)
    # Scatter matrix of raw numeric features (no outlier filter).
    # scatter_matrix returns the grid of Axes, not a Figure; the figure it
    # draws on is pyplot's current figure, which the calls below act on.
    scatter_axes = scatter_matrix(
        X_num_plot,
        alpha=0.3,
        figsize=(30, 30),
        diagonal='kde'
    )
    plt.suptitle("Raw Numeric Features (No Outlier Filter)", y=0.92)
    plt.savefig('../figures/scatter_pre_no_outlier_filter.png')
    # Close (not just clear) the figure: plt.clf() leaves the large, now-empty
    # figure registered with pyplot, which keeps its memory alive and can make
    # the notebook render a blank figure.
    plt.close()



distribution after smote+tomek:
readmission_30
0    46951
1    46951
Name: count, dtype: int64
0    0
1    0
2    0
3    0
4    0
Name: readmission_30, dtype: int64
   GENDER_M   HADM_ID   AGE  Albumin  Bicarbonate    Chloride  Creatinine  \
0       0.0  165315.0  64.0      3.2    27.500000  104.000000    0.600000   
1       1.0  152223.0  71.0      3.2    24.000000  111.000000    0.725000   
2       1.0  161859.0  39.0      3.2    24.000000  105.500000    0.925000   
3       1.0  129635.0  58.0      3.2    24.333333  104.222222    1.188889   
4       1.0  197661.0  72.0      3.2    25.625000  103.250000    1.312500   

      Glucose  Hematocrit  Hemoglobin  ...  MICU  NICU  SICU  TSICU  \
0  133.769231   35.900000   12.200000  ...   1.0   0.0   0.0    0.0   
1  113.700000   25.911111    9.300000  ...   0.0   0.0   0.0    0.0   
2  133.769231   40.866667   14.333333  ...   0.0   0.0   0.0    0.0   
3  133.769231   31.333333   11.042857  ...   0.0   0.0   0.0    0.0   
4  133.769231   3