In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTENC, SMOTE
import pandas as pd
import numpy as np

In [2]:
# --- 1. Load Data and handel imbalance ---
df_336 = pd.read_csv("UCI_336.csv")
df_857 = pd.read_csv("UCI_857.csv")
df_ckd = pd.read_csv("Chronic_Kidney_Disease_data.csv")

In [3]:
# --- 2. Harmonization and Feature Engineering ---
# Target Unification and Feature Renaming for D3
df_ckd = df_ckd.rename(columns={'Diagnosis': 'class', 'Age': 'age', 'SerumCreatinine': 'sc', 
                                'HemoglobinLevels': 'hemo', 'ProteinInUrine': 'al'})

# Derived Proxy Features for D3
df_ckd['htn_proxy'] = ((df_ckd['SystolicBP'] >= 140) | (df_ckd['DiastolicBP'] >= 90)).astype(int)
df_ckd['dm_proxy'] = ((df_ckd['FastingBloodSugar'] >= 126) | (df_ckd['HbA1c'] >= 6.5)).astype(int)

# Define Feature Sets
features_m1 = ['age', 'htn', 'dm', 'cad', 'su', 'sg', 'al', 'bp', 'sc', 'hemo', 'rbcc']
features_m2 = ['age', 'htn', 'dm', 'cad', 'su', 'sg', 'al', 'sc', 'gfr', 'hemo', 'rbcc']
features_m3 = ['age', 'htn_proxy', 'dm_proxy', 'sc', 'GFR', 'hemo', 'al',
               'SystolicBP', 'DiastolicBP', 'FastingBloodSugar', 'HbA1c',
               'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
               'CholesterolTriglycerides', 'ACR']

data_dict = {
    'df_336': {'df': df_336, 'features': features_m1, 'target': 'class'},
    'df_857': {'df': df_857, 'features': features_m2, 'target': 'class'},
    'df_ckd': {'df': df_ckd, 'features': features_m3, 'target': 'class'}
}
split_data = {}

In [4]:
# --- 3. Data Splitting and Scaling (for each dataset independently) ---
for name, data in data_dict.items():
    df, features, target = data['df'], data['features'], data['target']
    
    # 3A: Split
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[target], test_size=0.2, random_state=42, stratify=df[target]
    )
    
    # 3B: Handle Class Imbalance (ONLY on training data)
    if name == 'df_336':
        cat_features_336 = ['htn', 'dm', 'cad', 'su', 'al'] 
        cat_indices = [X_train.columns.get_loc(col) for col in cat_features_336 if col in X_train.columns]
        smotenc = SMOTENC(categorical_features=cat_indices, random_state=42)
        X_train, y_train = smotenc.fit_resample(X_train, y_train)
        
    elif name == 'df_857':
        cat_features_857 = ['htn', 'dm', 'cad', 'su', 'rbcc']
        cat_indices = [X_train.columns.get_loc(col) for col in cat_features_857 if col in X_train.columns]
        smotenc = SMOTENC(categorical_features=cat_indices, random_state=42)
        X_train, y_train = smotenc.fit_resample(X_train, y_train)
        
    elif name == 'df_ckd':
        # CKD: all numerical features, use regular SMOTE
        smote = SMOTE(random_state=42)
        X_train, y_train = smote.fit_resample(X_train, y_train)
    
    # Identify numerical columns for scaling
    binary_cols = ['htn', 'dm', 'cad', 'htn_proxy', 'dm_proxy']
    cols_to_scale = [col for col in X_train.columns if col not in binary_cols and X_train[col].dtype in [np.float64, np.int64]]
    
    # Scale
    scaler = StandardScaler()
    X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
    X_train_scaled.loc[:, cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
    X_test_scaled.loc[:, cols_to_scale] = scaler.transform(X_test[cols_to_scale])
    
    split_data[name] = {
        'X_train': X_train_scaled, 'X_test': X_test_scaled, 'y_train': y_train, 'y_test': y_test
    }


In [5]:
# --- 4. Base Model Training (Level 0) with Imbalance Handling ---
common_features = ['age', 'sc', 'hemo']
X_train_ckd = split_data['df_ckd']['X_train']
y_train_ckd = split_data['df_ckd']['y_train']
X_test_ckd = split_data['df_ckd']['X_test']
y_test_ckd = split_data['df_ckd']['y_test']

# M1: SVC (Balanced, trained on D1 common features)
svc_common_balanced = SVC(kernel='rbf', probability=True, random_state=42, class_weight='balanced')
svc_common_balanced.fit(split_data['df_336']['X_train'][common_features], split_data['df_336']['y_train'])

# M2: Decision Tree (Balanced, trained on D2 common features)
dt_common_balanced = DecisionTreeClassifier(random_state=42, class_weight='balanced')
dt_common_balanced.fit(split_data['df_857']['X_train'][common_features], split_data['df_857']['y_train'])

# M3: KNN (trained on D3 full features)
knn_ckd = KNeighborsClassifier(n_neighbors=5)
knn_ckd.fit(X_train_ckd, y_train_ckd)

KNeighborsClassifier()

In [6]:
# --- 5. Generate Meta-Features (Training and Test Sets) ---
X_train_ckd_common = X_train_ckd[common_features]
X_test_ckd_common = X_test_ckd[common_features]

# Training Meta-Features
m1_train_proba = svc_common_balanced.predict_proba(X_train_ckd_common)[:, 1]
m2_train_proba = dt_common_balanced.predict_proba(X_train_ckd_common)[:, 1]
m3_train_proba = knn_ckd.predict_proba(X_train_ckd)[:, 1] 
X_meta_train = pd.DataFrame({'M1_Proba': m1_train_proba, 'M2_Proba': m2_train_proba, 'M3_Proba': m3_train_proba})

# Test Meta-Features
m1_test_proba = svc_common_balanced.predict_proba(X_test_ckd_common)[:, 1]
m2_test_proba = dt_common_balanced.predict_proba(X_test_ckd_common)[:, 1]
m3_test_proba = knn_ckd.predict_proba(X_test_ckd)[:, 1]
X_meta_test = pd.DataFrame({'M1_Proba': m1_test_proba, 'M2_Proba': m2_test_proba, 'M3_Proba': m3_test_proba})


In [7]:
# --- 6. Meta-Learner Training (Level 1) ---
# Tuned and Balanced Logistic Regression
meta_learner_tuned_balanced = LogisticRegression(
    C=1, 
    solver='lbfgs', 
    random_state=42, 
    class_weight='balanced'
)

meta_learner_tuned_balanced.fit(X_meta_train, y_train_ckd)


LogisticRegression(C=1, class_weight='balanced', random_state=42)

In [8]:
# --- 7. Final Evaluation ---
y_pred_proba_final = meta_learner_tuned_balanced.predict_proba(X_meta_test)[:, 1]
y_pred_final = meta_learner_tuned_balanced.predict(X_meta_test)

In [9]:
# Calculate Final Metrics
accuracy_final = accuracy_score(y_test_ckd, y_pred_final)
f1_final = f1_score(y_test_ckd, y_pred_final)
roc_auc_final = roc_auc_score(y_test_ckd, y_pred_proba_final)

# --- Output Results ---
print("\n--- Final Stacked Ensemble Model Performance (on D3 Test Set) ---")
print(f"Base Models: SVC (D1), Decision Tree (D2), KNN (D3)")
print(f"Meta-Learner: Tuned Logistic Regression (C=1, class_weight='balanced')")
print(f"\nAccuracy: {accuracy_final:.4f}")
print(f"F1 Score: {f1_final:.4f}")
print(f"ROC AUC: {roc_auc_final:.4f}")


--- Final Stacked Ensemble Model Performance (on D3 Test Set) ---
Base Models: SVC (D1), Decision Tree (D2), KNN (D3)
Meta-Learner: Tuned Logistic Regression (C=1, class_weight='balanced')

Accuracy: 0.7831
F1 Score: 0.8732
ROC AUC: 0.6974
