In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                           f1_score, confusion_matrix, classification_report, roc_auc_score)
from sklearn.impute import KNNImputer, SimpleImputer
from ucimlrepo import fetch_ucirepo
import warnings
warnings.filterwarnings('ignore')

In [11]:
# 1. DATA LOADING
print("-"*30)
print("1. LOADING DATA...")
print("-"*30)

# Fetch dataset
chronic_kidney_disease = fetch_ucirepo(id=336)

# Get features and targets
X = chronic_kidney_disease.data.features.copy()
y = chronic_kidney_disease.data.targets.copy()

print(f"Dataset Shape: {X.shape}")
print(f"Target Shape: {y.shape}")
print(f"Features: {list(X.columns)}")
print(f"Target Classes: {y['class'].unique()}")
print(f"Target Classes Count: {y['class'].value_counts()}")

------------------------------
1. LOADING DATA...
------------------------------
Dataset Shape: (400, 24)
Target Shape: (400, 1)
Features: ['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
Target Classes: ['ckd' 'ckd\t' 'notckd']
Target Classes Count: ckd       248
notckd    150
ckd\t       2
Name: class, dtype: int64


In [13]:
# 1.1 Replacing ckd\t to ckd

# Count problematic rows before cleaning
ckd_tab_count = (y['class'] == 'ckd\t').sum()
whitespace_issues = (y['class'] != y['class'].str.strip()).sum()

# Clean the class column by stripping whitespace
y['class'] = y['class'].str.strip()

# Verify cleaning worked
print(f"Target Classes: {y['class'].unique()}")
print(f"Target Classes Count: {y['class'].value_counts()}")

Target Classes: ['ckd' 'notckd']
Target Classes Count: ckd       250
notckd    150
Name: class, dtype: int64


In [14]:
# 2. DATA ANALYSIS & EXPLORATION
print("-"*40)
print("2. DATA ANALYSIS & EXPLORATION...")
print("-"*40)

# Basic info
print("Dataset Info:")
print(f"- Total samples: {len(X)}")
print(f"- Total features: {X.shape[1]}")
print(f"- Target distribution:")
target_counts = y['class'].value_counts()
for cls, count in target_counts.items():
    print(f"  • {cls}: {count} ({count/len(y)*100:.1f}%)")

# Missing values analysis
print("\nMissing Values Analysis:")
missing_info = []
for col in X.columns:
    missing_count = X[col].isnull().sum()
    missing_percent = (missing_count / len(X)) * 100
    missing_info.append({
        'Feature': col,
        'Missing_Count': missing_count,
        'Missing_Percent': missing_percent,
        'Data_Type': str(X[col].dtype)
    })

missing_df = pd.DataFrame(missing_info)
missing_df = missing_df.sort_values('Missing_Percent', ascending=False)
print(missing_df.to_string(index=False))

# Data types analysis
print(f"\nData Types Summary:")
numeric_features = []
categorical_features = []

for col in X.columns:
    if X[col].dtype in ['int64', 'float64']:
        numeric_features.append(col)
    else:
        categorical_features.append(col)

print(f"- Numeric features ({len(numeric_features)}): {numeric_features}")
print(f"- Categorical features ({len(categorical_features)}): {categorical_features}")

----------------------------------------
2. DATA ANALYSIS & EXPLORATION...
----------------------------------------
Dataset Info:
- Total samples: 400
- Total features: 24
- Target distribution:
  • ckd: 250 (62.5%)
  • notckd: 150 (37.5%)

Missing Values Analysis:
Feature  Missing_Count  Missing_Percent Data_Type
    rbc            152            38.00    object
   rbcc            131            32.75   float64
   wbcc            106            26.50   float64
    pot             88            22.00   float64
    sod             87            21.75   float64
    pcv             71            17.75   float64
     pc             65            16.25    object
   hemo             52            13.00   float64
     su             49            12.25   float64
     sg             47            11.75   float64
     al             46            11.50   float64
    bgr             44            11.00   float64
     bu             19             4.75   float64
     sc             17            

In [16]:
# 3. DATA CLEANING & PREPROCESSING
print("-"*40)
print("3. DATA CLEANING & PREPROCESSING...")
print("-"*40)

# Create a copy for processing
X_processed = X.copy()
y_processed = y.copy()

# Binary mappings
binary_mappings = {
    'rbc': {'normal': 0, 'abnormal': 1},
    'pc': {'normal': 0, 'abnormal': 1},
    'pcc': {'notpresent': 0, 'present': 1},
    'ba': {'notpresent': 0, 'present': 1},
    'htn': {'no': 0, 'yes': 1},
    'dm': {'no': 0, 'yes': 1},
    'cad': {'no': 0, 'yes': 1},
    'appet': {'good': 0, 'poor': 1},
    'pe': {'no': 0, 'yes': 1},
    'ane': {'no': 0, 'yes': 1}
}

# Apply binary mappings
for feature, mapping in binary_mappings.items():
    if feature in X_processed.columns:
        X_processed[feature] = X_processed[feature].map(mapping)

# Handle ordinal categorical features
# Specific gravity - convert to numeric
if 'sg' in X_processed.columns:
    X_processed['sg'] = pd.to_numeric(X_processed['sg'], errors='coerce')

# Albumin and Sugar - already numeric
for feature in ['al', 'su']:
    if feature in X_processed.columns:
        X_processed[feature] = pd.to_numeric(X_processed[feature], errors='coerce')

# Convert target to binary
y_processed['class'] = y_processed['class'].map({'notckd': 0, 'ckd': 1})

print("\nHandling missing values...")

# Separate features by type for different imputation strategies
numeric_cols = X_processed.select_dtypes(include=[np.number]).columns
categorical_cols = X_processed.select_dtypes(exclude=[np.number]).columns

print(f"Numeric columns for imputation: {list(numeric_cols)}")
print(f"Categorical columns for imputation: {list(categorical_cols)}")

# For numeric features, use KNN imputation
if len(numeric_cols) > 0:
    knn_imputer = KNNImputer(n_neighbors=5)
    X_processed[numeric_cols] = knn_imputer.fit_transform(X_processed[numeric_cols])

# For categorical features (if any remaining), use mode imputation
if len(categorical_cols) > 0:
    mode_imputer = SimpleImputer(strategy='most_frequent')
    X_processed[categorical_cols] = mode_imputer.fit_transform(X_processed[categorical_cols])

print(f"Missing values after imputation: {X_processed.isnull().sum().sum()}")

# Feature engineering - create some additional features
print("\nFeature Engineering...")

# BMI-like indicator using available blood parameters
if all(col in X_processed.columns for col in ['hemo', 'pcv']):
    X_processed['hemo_pcv_ratio'] = X_processed['hemo'] / (X_processed['pcv'] + 0.001)

# Kidney function indicator
if all(col in X_processed.columns for col in ['sc', 'bu']):
    X_processed['kidney_function_score'] = X_processed['sc'] * X_processed['bu']

# Blood pressure category
if 'bp' in X_processed.columns:
    X_processed['bp_category'] = pd.cut(X_processed['bp'], 
                                       bins=[0, 90, 120, 140, 200], 
                                       labels=[0, 1, 2, 3])
    X_processed['bp_category'] = X_processed['bp_category'].astype(int)

print(f"Final feature set: {X_processed.shape[1]} features")


----------------------------------------
3. DATA CLEANING & PREPROCESSING...
----------------------------------------
Numeric columns for imputation: ['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
Categorical columns for imputation: []
Missing values after imputation: 0

Feature Engineering...
Final feature set: 27 features
Feature engineering completed.


In [17]:
# 4. TRAIN/TEST SPLIT
print("-"*35)
print("4. CREATING TRAIN/TEST SPLIT...")
print("-"*35)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_processed['class'], 
    test_size=0.2, 
    random_state=42, 
    stratify=y_processed['class']
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")
print(f"Train class distribution: {np.bincount(y_train)}")
print(f"Test class distribution: {np.bincount(y_test)}")

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

-----------------------------------
4. CREATING TRAIN/TEST SPLIT...
-----------------------------------
Training set: 320 samples
Testing set: 80 samples
Train class distribution: [120 200]
Test class distribution: [30 50]


In [18]:
# 5. MODEL TRAINING & EVALUATION
print("-"*38)
print("5. MODEL TRAINING & EVALUATION...")
print("-"*38)

# Define models to evaluate
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(random_state=42, probability=True),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

# Storage for results
results = []
model_objects = {}

print("Training and evaluating models...")

for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Choose scaled or unscaled data based on model type
    if name in ['Logistic Regression', 'SVM', 'KNN', 'Naive Bayes']:
        X_train_model = X_train_scaled
        X_test_model = X_test_scaled
    else:
        X_train_model = X_train
        X_test_model = X_test
    
    # Train model
    model.fit(X_train_model, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_model)
    y_pred_proba = model.predict_proba(X_test_model)[:, 1] if hasattr(model, 'predict_proba') else None
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # ROC-AUC if probability predictions available
    roc_auc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None
    
    # Cross-validation score
    cv_scores = cross_val_score(model, X_train_model, y_train, cv=5, scoring='accuracy')
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()
    
    # Store results
    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc,
        'CV_Mean': cv_mean,
        'CV_Std': cv_std
    })
    
    # Store model object
    model_objects[name] = model
    
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print(f"  CV Score: {cv_mean:.4f} ± {cv_std:.4f}")

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('Accuracy', ascending=False)

print("\n" + "="*80)
print("MODEL PERFORMANCE COMPARISON")
print("="*80)
print(results_df.round(4).to_string(index=False))

--------------------------------------
5. MODEL TRAINING & EVALUATION...
--------------------------------------
Training and evaluating models...

Training Logistic Regression...
  Accuracy: 0.9625
  F1-Score: 0.9691
  CV Score: 0.9844 ± 0.0171

Training Random Forest...
  Accuracy: 1.0000
  F1-Score: 1.0000
  CV Score: 0.9906 ± 0.0125

Training Gradient Boosting...
  Accuracy: 0.9750
  F1-Score: 0.9796
  CV Score: 0.9781 ± 0.0234

Training SVM...
  Accuracy: 0.9875
  F1-Score: 0.9899
  CV Score: 0.9875 ± 0.0117

Training KNN...
  Accuracy: 0.9625
  F1-Score: 0.9691
  CV Score: 0.9656 ± 0.0230

Training Naive Bayes...
  Accuracy: 0.9625
  F1-Score: 0.9691
  CV Score: 0.9281 ± 0.0351

Training Decision Tree...
  Accuracy: 0.9125
  F1-Score: 0.9278
  CV Score: 0.9437 ± 0.0272

MODEL PERFORMANCE COMPARISON
              Model  Accuracy  Precision  Recall  F1-Score  ROC-AUC  CV_Mean  CV_Std
      Random Forest    1.0000     1.0000    1.00    1.0000   1.0000   0.9906  0.0125
               