### Import necessary packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### Import the dataset

In [2]:
# Load dataset
df = pd.read_csv("../../data/processed/cleaned_ckd_data.csv")

# Display first few rows
print(df.head())

   bp (Diastolic)  bp limit  sg  al  class  rbc  su  pc  pcc  ba  ...  htn  \
0               2         3   4   4      1    2   5   2    2   2  ...    2   
1               0         0   2   0      0    0   4   0    0   0  ...    0   
2               0         0   0   3      0    0   4   0    0   0  ...    0   
3               0         0   0   5      0    1   4   1    0   1  ...    0   
4               1         1   0   2      0    0   4   0    0   0  ...    0   

   dm  cad  appet  pe  ane  grf  stage  affected  age  
0   2    2      2   2    2   10      0         2    9  
1   0    0      0   0    0   11      1         1    8  
2   0    0      0   0    0   11      1         1    8  
3   0    0      1   0    0    2      1         1    8  
4   0    0      0   0    0    2      1         1    8  

[5 rows x 29 columns]


In [3]:
X = df.drop(columns=["class"]) 
y = df["class"] 

In [4]:
# Normalize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define individual models
rf = RandomForestClassifier(n_estimators=100, random_state=42)
nb = GaussianNB()
svm = SVC(probability=True, random_state=42)  # Enable probability for soft voting

In [5]:
# Hard Voting Classifier
hard_voting_clf = VotingClassifier(estimators=[('rf', rf), ('nb', nb), ('svm', svm)], voting='hard')
hard_voting_clf.fit(X_train, y_train)

# Soft Voting Classifier
soft_voting_clf = VotingClassifier(estimators=[('rf', rf), ('nb', nb), ('svm', svm)], voting='soft')
soft_voting_clf.fit(X_train, y_train)

# Predictions
y_pred_hard = hard_voting_clf.predict(X_test)
y_pred_soft = soft_voting_clf.predict(X_test)


In [6]:
# Evaluate models
def evaluate_model(y_true, y_pred, model_name):
    print(f"=== {model_name} ===")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred):.4f}")
    print(f"F1-score: {f1_score(y_true, y_pred):.4f}")
    print("")

In [7]:
# Compare both approaches
evaluate_model(y_test, y_pred_hard, "Hard Voting")
evaluate_model(y_test, y_pred_soft, "Soft Voting")

=== Hard Voting ===
Accuracy: 1.0000


ValueError: pos_label=1 is not a valid label. It should be one of [0, 2]