In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                           f1_score, confusion_matrix, classification_report, 
                           roc_auc_score, roc_curve)
from sklearn.impute import KNNImputer, SimpleImputer
from ucimlrepo import fetch_ucirepo
import warnings
warnings.filterwarnings('ignore')

In [None]:
# 1. DATA LOADING
print("-"*80)
print("1. LOADING DATA...")
print("-"*80)

# Fetch dataset
print("\nLoading Dataset 1: Chronic Kidney Disease (UCI ID: 336)...")
ckd_dataset1 = fetch_ucirepo(id=336)

# Get features and targets
x1 = ckd_dataset1.data.features.copy()
y1 = ckd_dataset1.data.targets.copy()

print(f"UCI ID 336 - Dataset Shape: {x1.shape}")
print(f"UCI ID 336 - Target Shape: {y1.shape}")
print(f"UCI ID 336 - Features: {list(x1.columns)}")
print(f"UCI ID 336 - Samples: {x1.shape[0]}")
print(f"UCI ID 336 - Target Classes: {y1['class'].unique()}")
print(f"UCI ID 336 - Target Classes Count: \n{y1['class'].value_counts()}")

# Fetch dataset
print("\nLoading Dataset 2: Risk Factor Prediction of CKD (UCI ID: 857)...")
ckd_dataset1 = fetch_ucirepo(id=857)

# Get features and targets
x2 = ckd_dataset1.data.features.copy()
y2 = ckd_dataset1.data.targets.copy()

print(f"UCI ID 857 - Dataset Shape: {x2.shape}")
print(f"UCI ID 857 - Target Shape: {y2.shape}")
print(f"UCI ID 857 - Features: {list(x2.columns)}")
print(f"UCI ID 857 - Samples: {x2.shape[0]}")
print(f"UCI ID 857 - Target Classes: {y2['class'].unique()}")
print(f"UCI ID 857 - Target Classes Count: \n{y2['class'].value_counts()}")

--------------------------------------------------------------------------------
1. LOADING DATA...
--------------------------------------------------------------------------------

Loading Dataset 1: Chronic Kidney Disease (UCI ID: 336)...


In [4]:
# 1.1 Replacing ckd\t to ckd

# Count problematic rows before cleaning
ckd_tab_count = (y1['class'] == 'ckd\t').sum()
whitespace_issues = (y1['class'] != y1['class'].str.strip()).sum()

# Clean the class column by stripping whitespace
y1['class'] = y1['class'].str.strip()

# Verify cleaning worked
print(f"Target Classes: {y1['class'].unique()}")
print(f"Target Classes Count: {y1['class'].value_counts()}")

Target Classes: ['ckd' 'notckd']
Target Classes Count: ckd       250
notckd    150
Name: class, dtype: int64


In [5]:
# 2. DATA ANALYSIS & EXPLORATION
print("-"*80)
print("2. DATA ANALYSIS & EXPLORATION...")
print("-"*80)

# Basic info dataset 1
print("\n--- Dataset 1 Analysis ---")
print(f"- Total samples: {len(x1)}")
print(f"- Total features: {x1.shape[1]}")
print(f"- Target distribution:")
target_counts_336 = y1['class'].value_counts()
for cls, count in target_counts_336.items():
    print(f"  • {cls}: {count} ({count/len(y1)*100:.1f}%)")

# Missing values analysis dataset 1
print("\nMissing Values Analysis:")
missing_info_336 = []
for col in x1.columns:
    missing_count = x1[col].isnull().sum()
    missing_percent = (missing_count / len(x1)) * 100
    missing_info_336.append({
        'Feature': col,
        'Missing_Count': missing_count,
        'Missing_Percent': missing_percent,
        'Data_Type': str(x1[col].dtype)
    })

missing_df_336 = pd.DataFrame(missing_info_336)
missing_df_336 = missing_df_336.sort_values('Missing_Percent', ascending=False)
print(missing_df_336.to_string(index=False))

# Data types analysis dataset 1
print(f"\nData Types Summary:")
numeric_features_336 = []
categorical_features_336 = []

for col in x1.columns:
    if x1[col].dtype in ['int64', 'float64']:
        numeric_features_336.append(col)
    else:
        categorical_features_336.append(col)

print(f"- Numeric features ({len(numeric_features_336)}): {numeric_features_336}")
print(f"- Categorical features ({len(categorical_features_336)}): {categorical_features_336}")

# Basic info dataset 2
print("\n--- Dataset 2 Analysis ---")
print(f"- Total samples: {len(x2)}")
print(f"- Total features: {x2.shape[1]}")
print(f"- Target distribution:")
target_counts_857 = y2['class'].value_counts()
for cls, count in target_counts_857.items():
    print(f"  • {cls}: {count} ({count/len(y2)*100:.1f}%)")

# Missing values analysis dataset 2
print("\nMissing Values Analysis:")
missing_info_857 = []
for col in x2.columns:
    missing_count = x2[col].isnull().sum()
    missing_percent = (missing_count / len(x2)) * 100
    missing_info_857.append({
        'Feature': col,
        'Missing_Count': missing_count,
        'Missing_Percent': missing_percent,
        'Data_Type': str(x2[col].dtype)
    })

missing_df_857 = pd.DataFrame(missing_info_857)
missing_df_857 = missing_df_857.sort_values('Missing_Percent', ascending=False)
print(missing_df_857.to_string(index=False))

# Data types analysis dataset 2
print(f"\nData Types Summary:")
numeric_features_857 = []
categorical_features_857 = []

for col in x2.columns:
    if x2[col].dtype in ['int64', 'float64']:
        numeric_features_857.append(col)
    else:
        categorical_features_857.append(col)

print(f"- Numeric features ({len(numeric_features_857)}): {numeric_features_857}")
print(f"- Categorical features ({len(categorical_features_857)}): {categorical_features_857}")

----------------------------------------
2. DATA ANALYSIS & EXPLORATION...
----------------------------------------

--- Dataset 1 Analysis ---
- Total samples: 400
- Total features: 24
- Target distribution:
  • ckd: 250 (62.5%)
  • notckd: 150 (37.5%)

Missing Values Analysis:
Feature  Missing_Count  Missing_Percent Data_Type
    rbc            152            38.00    object
   rbcc            131            32.75   float64
   wbcc            106            26.50   float64
    pot             88            22.00   float64
    sod             87            21.75   float64
    pcv             71            17.75   float64
     pc             65            16.25    object
   hemo             52            13.00   float64
     su             49            12.25   float64
     sg             47            11.75   float64
     al             46            11.50   float64
    bgr             44            11.00   float64
     bu             19             4.75   float64
     sc             

In [6]:
# 3. DATA CLEANING & PREPROCESSING
print("-"*80)
print("3. DATA CLEANING & PREPROCESSING...")
print("-"*80)

print("\nProcessing Dataset 1...")
# Create a copy for processing
x1_processed = x1.copy()
y1_processed = y1.copy()

y1_processed['class'] = y1_processed['class'].str.strip()

# Binary mappings
binary_mappings_336 = {
    'rbc': {'normal': 0, 'abnormal': 1},
    'pc': {'normal': 0, 'abnormal': 1},
    'pcc': {'notpresent': 0, 'present': 1},
    'ba': {'notpresent': 0, 'present': 1},
    'htn': {'no': 0, 'yes': 1},
    'dm': {'no': 0, 'yes': 1},
    'cad': {'no': 0, 'yes': 1},
    'appet': {'good': 0, 'poor': 1},
    'pe': {'no': 0, 'yes': 1},
    'ane': {'no': 0, 'yes': 1}
}

# Apply binary mappings
for feature, mapping in binary_mappings_336.items():
    if feature in x1_processed.columns:
        x1_processed[feature] = x1_processed[feature].map(mapping)

# Convert ordinal features to numeric
for feature in ['sg', 'al', 'su']:
    if feature in x1_processed.columns:
        x1_processed[feature] = pd.to_numeric(x1_processed[feature], errors='coerce')

# Convert target to binary
y1_processed['class'] = y1_processed['class'].map({'notckd': 0, 'ckd': 1})

print("\nHandling missing values for Dataset 1...")

# Separate features by type for different imputation strategies
numeric_cols_336 = x1_processed.select_dtypes(include=[np.number]).columns
categorical_cols_336 = x1_processed.select_dtypes(exclude=[np.number]).columns

print(f"Numeric columns for imputation: {list(numeric_cols_336)}")
print(f"Categorical columns for imputation: {list(categorical_cols_336)}")

# For numeric features, use KNN imputation
if len(numeric_cols_336) > 0:
    knn_imputer = KNNImputer(n_neighbors=5)
    x1_processed[numeric_cols_336] = knn_imputer.fit_transform(x1_processed[numeric_cols_336])

# For categorical features (if any remaining), use mode imputation
if len(categorical_cols_336) > 0:
    mode_imputer = SimpleImputer(strategy='most_frequent')
    x1_processed[categorical_cols_336] = mode_imputer.fit_transform(x1_processed[categorical_cols_336])

# Feature engineering - create some additional features
print("\nFeature Engineering for Dataset 1...")

# BMI-like indicator using available blood parameters
if all(col in x1_processed.columns for col in ['hemo', 'pcv']):
    x1_processed['hemo_pcv_ratio'] = x1_processed['hemo'] / (x1_processed['pcv'] + 0.001)

# Kidney function indicator
if all(col in x1_processed.columns for col in ['sc', 'bu']):
    x1_processed['kidney_function_score'] = x1_processed['sc'] * x1_processed['bu']

# Blood pressure category
if 'bp' in x1_processed.columns:
    x1_processed['bp_category'] = pd.cut(x1_processed['bp'], 
                                       bins=[0, 90, 120, 140, 200], 
                                       labels=[0, 1, 2, 3])
    x1_processed['bp_category'] = x1_processed['bp_category'].astype(int)
    
# Add dataset identifier
x1_processed['dataset_source'] = 1

print(f"Missing values after imputation: {x1_processed.isnull().sum().sum()}")
print(f"Final feature set for dataset 1: {x1_processed.shape[1]} features")

print("\nProcessing Dataset 2...")
# Create a copy for processing
x2_processed = x2.copy()
y2_processed = y2.copy()

x2_processed.columns = x2_processed.columns.str.lower()
y2_processed.columns = y2_processed.columns.str.lower()

print("\nHandling missing values for Dataset 2...")

# Separate features by type for different imputation strategies
numeric_cols_857 = x2_processed.select_dtypes(include=[np.number]).columns
categorical_cols_857 = x2_processed.select_dtypes(exclude=[np.number]).columns

print(f"Numeric columns for imputation: {list(numeric_cols_857)}")
print(f"Categorical columns for imputation: {list(categorical_cols_857)}")

# For numeric features, use KNN imputation
if len(numeric_cols_857) > 0:
    knn_imputer = KNNImputer(n_neighbors=5)
    x2_processed[numeric_cols_857] = knn_imputer.fit_transform(x2_processed[numeric_cols_857])

# For categorical features (if any remaining), use mode imputation
if len(categorical_cols_857) > 0:
    # Encode categorical variables
    label_encoders = {}
    for col in categorical_cols_857:
        le = LabelEncoder()
        x2_processed[col] = le.fit_transform(x2_processed[col].astype(str))
        label_encoders[col] = le
        
# Convert target to binary (0: no CKD, 1: CKD)
if 'class' in y2_processed.columns:
    unique_classes = y2_processed['class'].unique()
    if len(unique_classes) == 2:
        # Assuming the positive class contains 'ckd' or similar
        y2_processed['class'] = y2_processed['class'].apply(
            lambda x: 1 if str(x).lower() in ['ckd', '1', 'yes'] else 0
        )

# Add dataset identifier
x2_processed['dataset_source'] = 2

print(f"Missing values after imputation: {x2_processed.isnull().sum().sum()}")
print(f"Final feature set: {x2_processed.shape[1]} features")


----------------------------------------
3. DATA CLEANING & PREPROCESSING...
----------------------------------------

Processing Dataset 1...

Handling missing values for Dataset 1...
Numeric columns for imputation: ['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
Categorical columns for imputation: []

Feature Engineering for Dataset 1...
Missing values after imputation: 0
Final feature set for dataset 1: 28 features

Processing Dataset 2...

Handling missing values for Dataset 2...
Numeric columns for imputation: ['bp (diastolic)', 'bp limit', 'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'affected']
Categorical columns for imputation: ['sg', 'al', 'su', 'bgr', 'bu', 'sod', 'sc', 'pot', 'hemo', 'pcv', 'rbcc', 'wbcc', 'grf', 'stage', 'age']
Missing values after imputation: 0
Final feature set: 29 features


In [8]:
# 4. FEATURE ALIGNMENT & DATASET COMBINATION
print("-"*80)
print("4. FEATURE ALIGNMENT & DATASET COMBINATION")
print("-"*80)

# Find common features (excluding dataset_source)
common_features = list(set(x1_processed.columns) & set(x2_processed.columns))
common_features.remove('dataset_source')

print(f"\nCommon features found: {len(common_features)}")
print(f"Features: {common_features[:10]}..." if len(common_features) > 10 else f"Features: {common_features}")

# Create unified dataset with common features + dataset_source
x1_common = x1_processed[common_features + ['dataset_source']].copy()
x2_common = x2_processed[common_features + ['dataset_source']].copy()

# Combine datasets
x_combined = pd.concat([x1_common, x2_common], axis=0, ignore_index=True)
y_combined = pd.concat([y1_processed['class'], y2_processed['class']], axis=0, ignore_index=True)

print(f"\nCombined dataset shape: {x_combined.shape}")
print(f"Total samples: {len(x_combined)}")
print(f"Total features: {x_combined.shape[1]}")
print(f"\nCombined target distribution:")
print(f"  - No CKD (0): {(y_combined == 0).sum()} ({(y_combined == 0).sum()/len(y_combined)*100:.2f}%)")
print(f"  - CKD (1): {(y_combined == 1).sum()} ({(y_combined == 1).sum()/len(y_combined)*100:.2f}%)")

-----------------------------------
4. FEATURE ALIGNMENT & DATASET COMBINATION
-----------------------------------

Common features found: 23
Features: ['age', 'pcv', 'pot', 'hemo', 'wbcc', 'al', 'su', 'bu', 'bgr', 'ba']...

Combined dataset shape: (600, 24)
Total samples: 600
Total features: 24

Combined target distribution:
  - No CKD (0): 222 (37.00%)
  - CKD (1): 378 (63.00%)


In [11]:
# 5. TRAIN/TEST SPLIT
print("-"*80)
print("5. CREATING TRAIN/TEST SPLIT...")
print("-"*80)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    x_combined, y_combined, 
    test_size=0.2, 
    random_state=42, 
    stratify=y_combined
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")
print(f"Train class distribution: CKD={np.sum(y_train)}, No CKD={len(y_train)-np.sum(y_train)}")
print(f"Test class distribution: CKD={np.sum(y_test)}, No CKD={len(y_test)-np.sum(y_test)}")

# Feature scaling
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

-----------------------------------
5. CREATING TRAIN/TEST SPLIT...
-----------------------------------
Training set: 480 samples
Testing set: 120 samples
Train class distribution: CKD=302, No CKD=178
Test class distribution: CKD=76, No CKD=44


In [13]:
# 6. MODEL TRAINING & EVALUATION
print("-"*38)
print("6. MODEL TRAINING & EVALUATION...")
print("-"*38)

# Define models to evaluate
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100, class_weight='balanced'),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100),
    'XGBoost': XGBClassifier(random_state=42, n_estimators=100, eval_metric='logloss'),
    'SVM (RBF)': SVC(random_state=42, probability=True, kernel='rbf', class_weight='balanced'),
    'SVM (Linear)': SVC(random_state=42, probability=True, kernel='linear', class_weight='balanced'),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    'AdaBoost': AdaBoostClassifier(random_state=42, n_estimators=100)
}

# Storage for results
results = []
model_objects = {}
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("Training and evaluating models...")

for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Choose scaled or unscaled data based on model type
    if name in ['Logistic Regression', 'SVM (RBF)', 'SVM (Linear)', 'KNN', 'Naive Bayes']:
        X_train_model = X_train_scaled
        X_test_model = X_test_scaled
    else:
        X_train_model = X_train
        X_test_model = X_test
    
    # Train model
    model.fit(X_train_model, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_model)
    y_pred_proba = model.predict_proba(X_test_model)[:, 1] if hasattr(model, 'predict_proba') else None
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    
    # ROC-AUC if probability predictions available
    roc_auc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None
    
    # Cross-validation score
    cv_scores = cross_val_score(model, X_train_model, y_train, cv=5, scoring='accuracy')
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()
    
    # Specificity (True Negative Rate)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    
    # Store results
    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Specificity': specificity,
        'ROC-AUC': roc_auc,
        'CV_Mean': cv_mean,
        'CV_Std': cv_std
    })
    
    # Store model object
    model_objects[name] = model
    
    print(f"  Accuracy:    {accuracy:.4f}")
    print(f"  Precision:   {precision:.4f}")
    print(f"  Recall:      {recall:.4f}")
    print(f"  F1-Score:    {f1:.4f}")
    print(f"  Specificity: {specificity:.4f}")
    if roc_auc:
        print(f"  ROC-AUC:     {roc_auc:.4f}")
    print(f"  CV Score:    {cv_mean:.4f} ± {cv_std:.4f}")
    print()

--------------------------------------
6. MODEL TRAINING & EVALUATION...
--------------------------------------
Training and evaluating models...

Training Logistic Regression...
  Accuracy:    0.9833
  Precision:   1.0000
  Recall:      0.9737
  F1-Score:    0.9867
  Specificity: 1.0000
  ROC-AUC:     0.9988
  CV Score:    0.9458 ± 0.0267


Training Random Forest...
  Accuracy:    1.0000
  Precision:   1.0000
  Recall:      1.0000
  F1-Score:    1.0000
  Specificity: 1.0000
  ROC-AUC:     1.0000
  CV Score:    0.9917 ± 0.0102


Training Gradient Boosting...
  Accuracy:    0.9917
  Precision:   1.0000
  Recall:      0.9868
  F1-Score:    0.9934
  Specificity: 1.0000
  ROC-AUC:     1.0000
  CV Score:    0.9750 ± 0.0182


Training XGBoost...
  Accuracy:    0.9833
  Precision:   1.0000
  Recall:      0.9737
  F1-Score:    0.9867
  Specificity: 1.0000
  ROC-AUC:     0.9910
  CV Score:    0.9833 ± 0.0169


Training SVM (RBF)...
  Accuracy:    0.6917
  Precision:   0.7097
  Recall:      0.86

In [14]:
# 7. RESULTS SUMMARY

print("-"*80)
print("7. COMPREHENSIVE MODEL PERFORMANCE COMPARISON")
print("-"*80)

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('F1-Score', ascending=False)
print("\n" + results_df.round(4).to_string(index=False))

# Find best models
best_accuracy = results_df.loc[results_df['Accuracy'].idxmax()]
best_f1 = results_df.loc[results_df['F1-Score'].idxmax()]
best_roc = results_df.loc[results_df['ROC-AUC'].idxmax()]

print("\n" + "="*80)
print("BEST PERFORMING MODELS")
print("="*80)
print(f"\nBest Accuracy:  {best_accuracy['Model']} ({best_accuracy['Accuracy']:.4f})")
print(f"Best F1-Score:  {best_f1['Model']} ({best_f1['F1-Score']:.4f})")
print(f"Best ROC-AUC:   {best_roc['Model']} ({best_roc['ROC-AUC']:.4f})")

--------------------------------------
7. COMPREHENSIVE MODEL PERFORMANCE COMPARISON
--------------------------------------

              Model  Accuracy  Precision  Recall  F1-Score  Specificity  ROC-AUC  CV_Mean  CV_Std
      Random Forest    1.0000     1.0000  1.0000    1.0000       1.0000   1.0000   0.9917  0.0102
           AdaBoost    1.0000     1.0000  1.0000    1.0000       1.0000   1.0000   0.9854  0.0106
  Gradient Boosting    0.9917     1.0000  0.9868    0.9934       1.0000   1.0000   0.9750  0.0182
       SVM (Linear)    0.9917     1.0000  0.9868    0.9934       1.0000   0.9997   0.9583  0.0228
      Decision Tree    0.9833     0.9868  0.9868    0.9868       0.9773   0.9821   0.9688  0.0174
Logistic Regression    0.9833     1.0000  0.9737    0.9867       1.0000   0.9988   0.9458  0.0267
            XGBoost    0.9833     1.0000  0.9737    0.9867       1.0000   0.9910   0.9833  0.0169
                KNN    0.9750     1.0000  0.9605    0.9799       1.0000   0.9931   0.9396  