In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
# Silence every warning for the rest of the session. NOTE(review): this also
# hides deprecation and convergence warnings raised by later cells.
warnings.filterwarnings('ignore')
# Styling calls are applied in order: the matplotlib 'fivethirtyeight' sheet,
# then seaborn's defaults, then the matplotlib 'ggplot' sheet.
plt.style.use('fivethirtyeight')
sns.set()
plt.style.use('ggplot')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# Load the dataset from a CSV file located next to the notebook.
df = pd.read_csv("kidney_disease.csv")


In [None]:
# Preview only the first rows; asking for 1190 rows dumps the whole frame
# into the notebook output.
df.head()


In [None]:
# Frequency of each raw label in the target column (before any cleaning).
df['classification'].value_counts()


In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Remove the row identifier. Written as a re-bind with errors='ignore' so the
# cell can be re-run safely (an in-place drop raises KeyError the second time).
df = df.drop(columns='id', errors='ignore')

In [None]:
# Confirm the 'id' column is gone.
df.head()

In [None]:
# Descriptive column names, assigned positionally: the list must have exactly
# one entry per remaining column, in the frame's current column order.
# NOTE(review): 'peda_edema' and 'aanemia' look like typos; they are kept
# unchanged here so existing references to these names keep working.
column_names = [
    'age',
    'blood_pressure',
    'specific_gravity',
    'albumin',
    'sugar',
    'red_blood_cells',
    'pus_cell',
    'pus_cell_clumps',
    'bacteria',
    'blood_glucose_random',
    'blood_urea',
    'serum_creatinine',
    'sodium',
    'potassium',
    'haemoglobin',
    'packed_cell_volume',
    'white_blood_cell_count',
    'red_blood_cell_count',
    'hypertension',
    'diabetes_mellitus',
    'coronary_artery_disease',
    'appetite',
    'peda_edema',
    'aanemia',
    'class',
]
df.columns = column_names

In [None]:
# Check the renamed headers.
df.head()


In [None]:
# Summary statistics of the columns pandas currently treats as numeric.
df.describe()


In [None]:
# Column dtypes and non-null counts before the numeric coercion below.
df.info()


In [None]:
# Force the three count/volume columns to numeric; any entry that cannot be
# parsed becomes NaN (errors='coerce') and is imputed later.
for numeric_name in ('packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count'):
    df[numeric_name] = pd.to_numeric(df[numeric_name], errors='coerce')

In [None]:
# Dtypes and non-null counts after the numeric coercion.
df.info()

In [None]:
# Current column index.
df.columns


In [None]:
# Partition the columns by dtype in a single pass: object columns are treated
# as categorical, everything else as numeric. At this point 'class' still
# holds text labels, so it ends up in cat_cols.
cat_cols, num_cols = [], []
for name in df.columns:
    if df[name].dtype == 'object':
        cat_cols.append(name)
    else:
        num_cols.append(name)


In [None]:
# Columns classified as categorical.
cat_cols


In [None]:
# Columns classified as numeric.
num_cols


In [None]:
# List the distinct values of every categorical column to spot dirty labels.
for name in cat_cols:
    distinct_values = df[name].unique()
    print(f"{name} has {distinct_values}")

In [None]:
# Normalise labels that carry stray tabs/spaces. Every result is assigned back
# to the frame: calling replace(..., inplace=True) on df[col] operates on an
# intermediate Series and is not guaranteed to write through to df.
df['diabetes_mellitus'] = df['diabetes_mellitus'].replace(
    to_replace={'\tno': 'no', '\tyes': 'yes', ' yes': 'yes'}
)
df['coronary_artery_disease'] = df['coronary_artery_disease'].replace(to_replace='\tno', value='no')
df['class'] = df['class'].replace(to_replace={'ckd\t': 'ckd', 'notckd': 'not ckd'})

In [None]:
# Re-inspect the three cleaned columns.
cols = ['diabetes_mellitus', 'coronary_artery_disease', 'class']
for name in cols:
    remaining = df[name].unique()
    print(f"{name} has {remaining}")

In [None]:
# Encode the target: 'ckd' -> 0, 'not ckd' -> 1 (so label 1, the "positive"
# class in the metrics further down, means *no* kidney disease).
# The mapping is applied only while the column still holds text; re-running
# the cell on the already-encoded column would otherwise map every value to NaN.
class_codes = {'ckd': 0, 'not ckd': 1}
if not pd.api.types.is_numeric_dtype(df['class']):
    df['class'] = df['class'].map(class_codes)

In [None]:
# Same inspection after the target has been encoded.
cols = ['diabetes_mellitus', 'coronary_artery_disease', 'class']
for name in cols:
    remaining = df[name].unique()
    print(f"{name} has {remaining}")

In [None]:
# Distribution (histogram + KDE) of each numeric column on a 3x5 grid.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its
# documented replacement and draws the same kind of figure.
MAX_PANELS = 14  # same cap as before: at most 14 panels on the 15-slot grid
fig, axes = plt.subplots(3, 5, figsize=(20, 15))
panels = axes.ravel()
plotted = num_cols[:MAX_PANELS]

for ax, column in zip(panels, plotted):
    sns.histplot(df[column], kde=True, stat='density', ax=ax)
    ax.set_xlabel(column)

# Hide the grid slots that received no plot.
for ax in panels[len(plotted):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of value counts for each categorical column on a 3x5 grid
# (at most 14 panels, as before). The counts are computed once per column and
# the import that used to sit inside the loop is gone.
plt.figure(figsize = (20, 15))

for plotnumber, column in enumerate(cat_cols[:14], start=1):
    counts = df[column].value_counts()
    ax = plt.subplot(3, 5, plotnumber)
    sns.barplot(x=counts.index, y=counts, palette='rocket')
    plt.xlabel(column)

plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix over all columns, with text columns turned into integer codes.
#
# This cell runs before missing values are imputed. Feeding NaN-containing
# object columns to LabelEncoder either fails (mixed str/float input) or,
# depending on the scikit-learn version, encodes "missing" as one more
# category, which distorts the correlations. Categorical codes are used
# instead: categories are numbered in sorted order and missing entries stay
# NaN, so DataFrame.corr() skips them.

# Work on a copy so the original frame is untouched.
df_encoded = df.copy()

for col in df_encoded.columns:
    if df_encoded[col].dtype == 'object':
        codes = df_encoded[col].astype('category').cat.codes
        # cat.codes marks missing values with -1; turn those back into NaN.
        df_encoded[col] = codes.where(codes >= 0)

correlation_matrix = df_encoded.corr()

# Last expression of the cell: rendered as a table by the notebook.
correlation_matrix

In [None]:
#EDA

In [None]:
def kde(col):
    """Plot one kernel-density curve of ``col`` per value of ``class``.

    Reads the notebook-level ``df``; draws on a new seaborn FacetGrid
    (height 6, aspect 2) coloured by the ``class`` column and adds a legend.

    Args:
        col: Name of the column in ``df`` to plot.
    """
    facets = sns.FacetGrid(df, hue="class", height=6, aspect=2)
    facets.map(sns.kdeplot, col).add_legend()


In [None]:
# Density of red blood cell count, one curve per class.
kde('red_blood_cell_count')


In [None]:
# Density of white blood cell count, one curve per class.
kde('white_blood_cell_count')


In [None]:
# Data Preprocessing


In [None]:
# checking for missing value
# Missing-value count per column, largest first.
df.isnull().sum().sort_values(ascending=False)

In [None]:
# Missing values in the numeric columns.
df[num_cols].isnull().sum()

In [None]:
# Missing values in the categorical columns.
df[cat_cols].isnull().sum()

In [None]:
# Snapshot of the frame before imputation.
df.head()


In [None]:
# Two imputation methods are used:
# random sampling -> columns with a higher number of null values
# mean/mode -> columns with a lower number of null values

In [None]:
def random_sampling(feature, data=None, random_state=None):
    """Fill the missing entries of one column with randomly drawn observed values.

    The column is modified in place. Donor values are drawn from the column's
    own non-missing entries, without replacement when there are enough of
    them and with replacement otherwise (so a sparsely populated column can
    still be filled).

    Args:
        feature: Name of the column to impute.
        data: DataFrame to modify. Defaults to the notebook-level ``df``.
        random_state: Seed / random state forwarded to ``Series.sample``.
            ``None`` (the default) draws from NumPy's global random state.

    Raises:
        ValueError: If the column has missing entries but no observed value
            to sample from.
    """
    frame = df if data is None else data
    missing = frame[feature].isna()
    n_missing = int(missing.sum())
    if n_missing == 0:
        return

    observed = frame[feature].dropna()
    if observed.empty:
        raise ValueError(f"cannot impute '{feature}': the column has no observed values")

    draws = observed.sample(
        n=n_missing,
        replace=n_missing > len(observed),  # only oversample when unavoidable
        random_state=random_state,
    )
    # Assign by position: the draws carry the donors' index labels, not the
    # labels of the rows being filled.
    frame.loc[missing, feature] = draws.to_numpy()

def impute_mode(feature, data=None):
    """Fill the missing entries of one column with its most frequent value.

    When several values tie for most frequent, the first one returned by
    ``Series.mode()`` is used.

    Args:
        feature: Name of the column to impute.
        data: DataFrame to modify. Defaults to the notebook-level ``df``.

    Raises:
        ValueError: If the column has no observed value, so no mode exists.
    """
    frame = df if data is None else data
    modes = frame[feature].mode()
    if modes.empty:
        raise ValueError(f"cannot impute '{feature}': the column has no observed values")
    frame[feature] = frame[feature].fillna(modes.iloc[0])

In [None]:
# Random-sample imputation for the numeric columns.
# The draws come from NumPy's global random state, so it is seeded first;
# without a seed every run imputes different values and all downstream
# scores change from run to run.
np.random.seed(0)
for col in num_cols:
    random_sampling(col)

In [None]:
# Numeric columns should now report zero missing values.
df[num_cols].isnull().sum()


In [None]:
# The two categorical columns handled by random sampling come first ...
for sampled_col in ('red_blood_cells', 'pus_cell'):
    random_sampling(sampled_col)

# ... then every categorical column is passed through mode imputation
# (a no-op for columns that have no missing values left).
for mode_col in cat_cols:
    impute_mode(mode_col)

In [None]:
# Categorical columns should now report zero missing values.
df[cat_cols].isnull().sum()


In [None]:
# Feature Encoding


In [None]:
# Number of distinct values per categorical column.
for name in cat_cols:
    n_distinct = df[name].nunique()
    print(f"{name} has {n_distinct}")

In [None]:
# Integer-encode every categorical column with a label encoder.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# The single encoder is re-fitted per column, so afterwards it only remembers
# the classes of the last column processed.
encoded_columns = {name: le.fit_transform(df[name]) for name in cat_cols}
for name, codes in encoded_columns.items():
    df[name] = codes

In [None]:
# Frame after imputation and encoding.
df.head()


In [None]:
# Model Building


In [None]:
# Target vector and feature matrix (every column except the target).
y = df['class']
X = df.drop(columns='class')

In [None]:
# Feature matrix.
X


In [None]:
# Target vector.
y


In [None]:
from sklearn.model_selection import train_test_split

# 60/40 train/test split with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.4, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# KNN


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Fit a default k-nearest-neighbours classifier (fit() returns the estimator).
knn = KNeighborsClassifier().fit(X_train, y_train)

# Test-set predictions, reused by every metric below.
y_pred_knn = knn.predict(X_test)

# Accuracy on both splits.
knn_train_acc = accuracy_score(y_train, knn.predict(X_train))
knn_acc = accuracy_score(y_test, y_pred_knn)
print(f"Training Accuracy of KNN is {knn_train_acc}")
print(f"Testing Accuracy of KNN is {knn_acc}")

# Confusion matrix and per-class report.
cm_knn = confusion_matrix(y_test, y_pred_knn)
print(f"Confusion Matrix of KNN is \n {cm_knn}\n")
knn_report = classification_report(y_test, y_pred_knn)
print(f"Classification Report of KNN is \n{knn_report}")

# A 2x2 matrix is laid out as [[TN, FP], [FN, TP]]; label 1 is the positive class.
(TN, FP), (FN, TP) = cm_knn

for caption, count in (
    ("True Positives (TP)", TP),
    ("True Negatives (TN)", TN),
    ("False Positives (FP)", FP),
    ("False Negatives (FN)", FN),
):
    print(f"{caption}: {count}")

# Rates derived from the counts.
actual_positives = TP + FN
actual_negatives = TN + FP
TPR = TP / actual_positives  # True Positive Rate
FNR = FN / actual_positives  # False Negative Rate
TNR = TN / actual_negatives  # True Negative Rate
FPR = FP / actual_negatives  # False Positive Rate

for caption, rate in (
    ("True Positive Rate (TPR)", TPR),
    ("False Negative Rate (FNR)", FNR),
    ("True Negative Rate (TNR)", TNR),
    ("False Positive Rate (FPR)", FPR),
):
    print(f"{caption}: {rate}")

# ROC curve points and AUC from the predicted probability of label 1.
y_prob_knn = knn.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob_knn)
knn_auc = roc_auc_score(y_test, y_prob_knn)
print(f"AUC Score of KNN: {knn_auc}")


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix for KNN, with rows/columns in the order of the classes the
# model was trained on.
cm = confusion_matrix(y_test, knn.predict(X_test), labels=knn.classes_)

# One tick label per class. The previous hard-coded list of three labels did
# not match the two-class matrix.
tick_labels = [f'Class {label}' for label in knn.classes_]

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=tick_labels, yticklabels=tick_labels)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix for KNN')
plt.show()


## Random Forest Classifier


In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Random forest with fixed hyper-parameters. random_state pins the bootstrap
# samples and feature sub-sampling, so re-running the cell reproduces the
# same model and scores (previously every run produced a different forest).
rand_clf = RandomForestClassifier(
    criterion="gini",
    max_depth=10,
    max_features="sqrt",
    min_samples_leaf=1,
    min_samples_split=7,
    n_estimators=400,
    random_state=0,
)
rand_clf.fit(X_train, y_train)

# Predict on training and test sets
y_train_pred = rand_clf.predict(X_train)
y_test_pred = rand_clf.predict(X_test)

# Accuracy on both splits
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy of Random Forest is {train_accuracy}")
print(f"Testing Accuracy of Random Forest is {test_accuracy}")

# Confusion matrix and per-class report
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred)

print(f"Confusion Matrix of Random Forest is \n{conf_matrix}\n")
print(f"Classification Report of Random Forest is \n{class_report}")

# Counts from the matrix (rows = true label, columns = predicted label;
# label 1 is treated as the positive class).
TP = conf_matrix[1, 1]
TN = conf_matrix[0, 0]
FP = conf_matrix[0, 1]
FN = conf_matrix[1, 0]

print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")

# Rates derived from the counts
TPR = TP / (TP + FN)
FNR = FN / (TP + FN)
TNR = TN / (TN + FP)
FPR = FP / (TN + FP)

print(f"True Positive Rate (TPR): {TPR}")
print(f"False Negative Rate (FNR): {FNR}")
print(f"True Negative Rate (TNR): {TNR}")
print(f"False Positive Rate (FPR): {FPR}")

# AUC from the predicted probability of label 1 (binary targets only)
if len(set(y_test)) == 2:
    auc_score = roc_auc_score(y_test, rand_clf.predict_proba(X_test)[:, 1])
    print(f"AUC Score: {auc_score}")
else:
    print("AUC Score: Not applicable for non-binary classification")


In [None]:
from sklearn.metrics import confusion_matrix


# Confusion matrix for the random forest, ordered by the classes it was trained on.
rf_cm = confusion_matrix(y_test, rand_clf.predict(X_test), labels=rand_clf.classes_)

# One tick label per class. The previous hard-coded list of three labels did
# not match the two-class matrix.
tick_labels = [f'Class {label}' for label in rand_clf.classes_]

plt.figure(figsize=(8, 6))
sns.heatmap(rf_cm, annot=True, fmt='d', cmap='Blues', xticklabels=tick_labels, yticklabels=tick_labels)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix for Random Forest Classifier')
plt.show()


## LOGISTIC REGRESSION


In [None]:
# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Logistic regression. The features are not scaled, so the default budget of
# 100 solver iterations is likely too small for the optimiser to converge
# (the resulting warning is hidden by the notebook-wide warning filter).
# A larger max_iter lets the fit run to convergence.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Predict on training and test sets
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# Accuracy on both splits
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy of Logistic Regression is {train_accuracy}")
print(f"Testing Accuracy of Logistic Regression is {test_accuracy}")

# Confusion matrix and per-class report
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred)

print(f"Confusion Matrix of Logistic Regression is \n{conf_matrix}\n")
print(f"Classification Report of Logistic Regression is \n{class_report}")

# Counts from the matrix (rows = true label, columns = predicted label;
# label 1 is treated as the positive class).
TP = conf_matrix[1, 1]
TN = conf_matrix[0, 0]
FP = conf_matrix[0, 1]
FN = conf_matrix[1, 0]

print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")

# Rates derived from the counts
TPR = TP / (TP + FN)
FNR = FN / (TP + FN)
TNR = TN / (TN + FP)
FPR = FP / (TN + FP)

print(f"True Positive Rate (TPR): {TPR}")
print(f"False Negative Rate (FNR): {FNR}")
print(f"True Negative Rate (TNR): {TNR}")
print(f"False Positive Rate (FPR): {FPR}")

# AUC from the predicted probability of label 1 (binary targets only)
if len(set(y_test)) == 2:
    auc_score = roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1])
    print(f"AUC Score: {auc_score}")
else:
    print("AUC Score: Not applicable for non-binary classification")


In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for logistic regression, ordered by the classes it was trained on.
lr_cm = confusion_matrix(y_test, lr.predict(X_test), labels=lr.classes_)

# One tick label per class. The previous hard-coded list of three labels did
# not match the two-class matrix.
tick_labels = [f'Class {label}' for label in lr.classes_]

plt.figure(figsize=(8, 6))
sns.heatmap(lr_cm, annot=True, fmt='d', cmap='Blues', xticklabels=tick_labels, yticklabels=tick_labels)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix for Logistic Regression')
plt.show()


## SVM

In [None]:
# SVM
# Import necessary libraries
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Hyper-parameter grid
parameter_grid = {
    'gamma': [0.0001, 0.001, 0.01, 0.1],
    'C': [0.01, 0.05, 0.1, 0.5, 1, 10, 15, 20]
}

# The search scores candidates by accuracy, which only needs predict().
# Probability calibration is therefore left off during the search: it adds an
# internal cross-validation to every single fit without affecting the
# accuracy-based ranking.
grid_search = GridSearchCV(SVC(), parameter_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Final model with the best parameters. probability=True is needed for
# predict_proba (AUC / ROC); random_state makes that calibration repeatable.
svm = SVC(
    gamma=grid_search.best_params_['gamma'],
    C=grid_search.best_params_['C'],
    probability=True,
    random_state=0,
)
svm.fit(X_train, y_train)

# Predict on training and test sets
y_train_pred = svm.predict(X_train)
y_test_pred = svm.predict(X_test)

# Accuracy on both splits
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy of SVC is {train_accuracy}")
print(f"Testing Accuracy of SVC is {test_accuracy}")

# Confusion matrix and per-class report
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred)

print(f"Confusion Matrix of SVC is \n{conf_matrix}\n")
print(f"Classification Report of SVC is \n{class_report}")

# Counts from the matrix (rows = true label, columns = predicted label;
# label 1 is treated as the positive class).
TP = conf_matrix[1, 1]
TN = conf_matrix[0, 0]
FP = conf_matrix[0, 1]
FN = conf_matrix[1, 0]

print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")

# Rates derived from the counts
TPR = TP / (TP + FN)
FNR = FN / (TP + FN)
TNR = TN / (TN + FP)
FPR = FP / (TN + FP)

print(f"True Positive Rate (TPR): {TPR}")
print(f"False Negative Rate (FNR): {FNR}")
print(f"True Negative Rate (TNR): {TNR}")
print(f"False Positive Rate (FPR): {FPR}")

# AUC from the predicted probability of label 1 (binary targets only)
if len(set(y_test)) == 2:
    auc_score = roc_auc_score(y_test, svm.predict_proba(X_test)[:, 1])
    print(f"AUC Score: {auc_score}")
else:
    print("AUC Score: Not applicable for non-binary classification")


In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of the SVM. Predictions are taken from `svm` directly
# rather than from the shared `y_test_pred` variable, which holds the output
# of whichever model cell happened to run last.
conf_matrix = confusion_matrix(y_test, svm.predict(X_test))

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2)  # NOTE: changes the seaborn theme for all later plots too
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu', cbar=False, annot_kws={"size": 14}, linewidths=.5)
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted Label', fontsize=14)
plt.ylabel('True Label', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()



## Decision Tree

In [None]:
# Import necessary libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt

# Decision tree with default hyper-parameters. random_state fixes the
# tie-breaking between equally good splits, so re-running the cell rebuilds
# the same tree and reproduces the same scores.
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, y_train)

# Predict on training and test sets
y_train_pred = dtc.predict(X_train)
y_test_pred = dtc.predict(X_test)

# Accuracy on both splits
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy of Decision Tree Classifier is {train_accuracy}")
print(f"Testing Accuracy of Decision Tree Classifier is {test_accuracy}")

# Confusion matrix and per-class report
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred)

print(f"Confusion Matrix of Decision Tree Classifier is \n{conf_matrix}\n")
print(f"Classification Report of Decision Tree Classifier is \n{class_report}")

# Counts from the matrix (rows = true label, columns = predicted label;
# label 1 is treated as the positive class).
TP = conf_matrix[1, 1]
TN = conf_matrix[0, 0]
FP = conf_matrix[0, 1]
FN = conf_matrix[1, 0]

print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")

# Rates derived from the counts
TPR = TP / (TP + FN)
FNR = FN / (TP + FN)
TNR = TN / (TN + FP)
FPR = FP / (TN + FP)

print(f"True Positive Rate (TPR): {TPR}")
print(f"False Negative Rate (FNR): {FNR}")
print(f"True Negative Rate (TNR): {TNR}")
print(f"False Positive Rate (FPR): {FPR}")

# AUC from the predicted probability of label 1 (binary targets only)
if len(set(y_test)) == 2:
    auc_score = roc_auc_score(y_test, dtc.predict_proba(X_test)[:, 1])
    print(f"AUC Score: {auc_score}")
else:
    print("AUC Score: Not applicable for non-binary classification")



In [None]:
# Import necessary libraries
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Plot the confusion matrix of the decision tree fitted in the previous cell.
# The tree is deliberately NOT retrained here: a second, independently grown
# tree can differ from the first, in which case this figure would not match
# the metrics printed above.
y_test_pred = dtc.predict(X_test)

# Accuracy on the test split
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Testing Accuracy of Decision Tree Classifier is {test_accuracy}")

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print(f"Confusion Matrix of Decision Tree Classifier is \n{conf_matrix}\n")

# Heatmap of the confusion matrix
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2)  # NOTE: changes the seaborn theme for all later plots too
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu', cbar=False, annot_kws={"size": 14}, linewidths=.5)
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted Label', fontsize=14)
plt.ylabel('True Label', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from sklearn import metrics

# ROC curves of the models fitted above, all on the same test split.
#
# The curves are computed from the already-fitted models and from the classes
# those models actually learned. Binarizing against a hard-coded four-class
# list on this two-class target added constant all-zero columns that are
# trivially "predicted" correctly and inflate the micro-averaged AUC.
models = [
    {'label': 'LR', 'model': lr},
    {'label': 'SVM', 'model': svm},
    {'label': 'KNN', 'model': knn},
    {'label': 'RF', 'model': rand_clf},
    {'label': 'DT', 'model': dtc},
]

plt.figure(figsize=(8, 5))

for m in models:
    fitted = m['model']
    y_score = fitted.predict_proba(X_test)
    classes = fitted.classes_

    if len(classes) == 2:
        # Binary target: ordinary ROC on the probability of the second class.
        fpr, tpr, _ = metrics.roc_curve(y_test, y_score[:, 1], pos_label=classes[1])
    else:
        # Multiclass target: micro-average over the one-vs-rest indicator matrix.
        y_test_bin = label_binarize(y_test, classes=classes)
        fpr, tpr, _ = metrics.roc_curve(y_test_bin.ravel(), y_score.ravel())

    roc_auc = metrics.auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%s - ROC (area = %0.2f)' % (m['label'], roc_auc))

# Chance diagonal
plt.plot([0, 1], [0, 1], 'r--')

# Axes, labels, legend
plt.xlim([-0.01, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1 - Specificity (False Positive Rate)', fontsize=12)
plt.ylabel('Sensitivity (True Positive Rate)', fontsize=12)
plt.title('ROC - Kidney Disease Prediction', fontsize=12)
plt.legend(loc="lower right", fontsize=12)

# Save the figure next to the notebook, then show it
plt.savefig("roc_kidney.jpeg", format='jpeg', dpi=400, bbox_inches='tight')
plt.show()

## Ensemble

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Soft-voting ensemble of random forest, KNN and decision tree.
#
# The ensemble is trained and evaluated on X_train / X_test / y_train / y_test
# from the split made earlier in the notebook, i.e. on the same cleaned data
# as every other model. Re-reading the raw CSV here had three problems:
#   * the 'id' column stayed in the feature matrix (target leakage if the
#     file is ordered by class);
#   * the un-cleaned label 'ckd\t' became a third target class;
#   * df / X_train / X_test / y_train / y_test / knn were silently replaced
#     for all later cells.

# Base models (seeded where the estimator is stochastic)
rf = RandomForestClassifier(random_state=0)
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(random_state=0)

for base_model in (rf, knn, dt):
    base_model.fit(X_train, y_train)

classifiers = {
    'rf': rf,
    'knn': knn,
    'dt': dt
}

# Voting classifier over all three base models. It fits its own clones of
# the estimators; the fitted base models above are used for the decision tree
# report further down.
combo = ('rf', 'knn', 'dt')
models = [classifiers[clf_name] for clf_name in combo]
voting_clf = VotingClassifier(estimators=list(zip(combo, models)), voting='soft')
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)

# Headline metrics of the ensemble (weighted averages over the classes)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

tpr = fnr = tnr = fpr = auc_score = None

if cm.shape == (2, 2):  # Binary classification case
    tn, fp, fn, tp = cm.ravel()
    tpr = tp / (tp + fn)  # True Positive Rate
    fnr = fn / (tp + fn)  # False Negative Rate
    tnr = tn / (tn + fp)  # True Negative Rate
    fpr = fp / (tn + fp)  # False Positive Rate

    y_prob = voting_clf.predict_proba(X_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_prob)
else:  # Multiclass classification case
    auc_score = roc_auc_score(y_test, voting_clf.predict_proba(X_test), multi_class='ovr')

print(f"Combination: {combo}, Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1_score}")
print(f"AUC Score: {auc_score}")

if tpr is not None:
    # Rates of the ensemble itself (previously computed but never reported).
    print(f"Ensemble True Positive Rate (TPR): {tpr}")
    print(f"Ensemble False Negative Rate (FNR): {fnr}")
    print(f"Ensemble True Negative Rate (TNR): {tnr}")
    print(f"Ensemble False Positive Rate (FPR): {fpr}")

# Accuracy of the ensemble on both splits. These lines used to score the
# single decision tree while labelling the result as the ensemble's.
train_accuracy = voting_clf.score(X_train, y_train)
test_accuracy = voting_clf.score(X_test, y_test)

print(f"Training Accuracy of ensemble Classifier is {train_accuracy}")
print(f"Testing Accuracy of ensemble Classifier is {test_accuracy}")

# Reference report for the stand-alone decision tree base model
dt_test_pred = dt.predict(X_test)
conf_matrix = confusion_matrix(y_test, dt_test_pred)
class_report = classification_report(y_test, dt_test_pred)
print("Classification Report for Decision Tree:\n", class_report)

# Counts and rates for the decision tree (binary case only)
if conf_matrix.shape == (2, 2):
    tn, fp, fn, tp = conf_matrix.ravel()
    TPR = tp / (tp + fn)
    FNR = fn / (tp + fn)
    TNR = tn / (tn + fp)
    FPR = fp / (tn + fp)

    print(f"True Positives (TP): {tp}")
    print(f"True Negatives (TN): {tn}")
    print(f"False Positives (FP): {fp}")
    print(f"False Negatives (FN): {fn}")

    print(f"True Positive Rate (TPR): {TPR}")
    print(f"False Negative Rate (FNR): {FNR}")
    print(f"True Negative Rate (TNR): {TNR}")
    print(f"False Positive Rate (FPR): {FPR}")



In [None]:
import matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.preprocessing import label_binarize
from sklearn import metrics

# ROC curves of all models, including the ensemble.
#
# Each model is cloned and re-fitted on the current X_train / y_train, so the
# comparison is valid even if an earlier cell rebuilt the train/test split
# with different feature columns. The class list is taken from the fitted
# model: binarizing against a hard-coded four-class list added constant
# all-zero columns that inflate the micro-averaged AUC.
models = [
    {'label': 'LR', 'model': lr},
    {'label': 'SVM', 'model': svm},
    {'label': 'KNN', 'model': knn},
    {'label': 'RF', 'model': rand_clf},
    {'label': 'DT', 'model': dtc},
    {'label': 'Ensemble', 'model': voting_clf}
]

plt.figure(figsize=(8, 5))

for m in models:
    model = clone(m['model']).fit(X_train, y_train)
    y_score = model.predict_proba(X_test)
    classes = model.classes_

    if len(classes) == 2:
        # Binary target: ordinary ROC on the probability of the second class.
        fpr, tpr, _ = metrics.roc_curve(y_test, y_score[:, 1], pos_label=classes[1])
    else:
        # Multiclass target: micro-average over the one-vs-rest indicator matrix.
        y_test_bin = label_binarize(y_test, classes=classes)
        fpr, tpr, _ = metrics.roc_curve(y_test_bin.ravel(), y_score.ravel())

    roc_auc = metrics.auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%s - ROC (area = %0.2f)' % (m['label'], roc_auc))

# Chance diagonal
plt.plot([0, 1], [0, 1], 'r--')

# Axes, labels, legend
plt.xlim([-0.01, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1 - Specificity (False Positive Rate)', fontsize=12)
plt.ylabel('Sensitivity (True Positive Rate)', fontsize=12)
plt.title('ROC - Kidney Disease Prediction', fontsize=12)
plt.legend(loc="lower right", fontsize=12)

# Save the figure next to the notebook, then show it
plt.savefig("roc_kidney.jpeg", format='jpeg', dpi=400, bbox_inches='tight')
plt.show()


In [None]:
# Grouped bar chart comparing the classifiers on four metrics.
# NOTE(review): the scores below are typed in by hand, not computed from the
# models fitted in this notebook; they need updating whenever results change.
classifiers = {
    'Random Forest': {'precision': 0.98, 'recall': 0.98, 'f1-score': 0.98, 'accuracy': 0.98},
    'KNN': {'precision': 0.79, 'recall': 0.79, 'f1-score': 0.79, 'accuracy': 0.798},
    'SVC': {'precision': 0.90, 'recall': 0.90, 'f1-score': 0.90, 'accuracy': 0.90},
    'Logistic Regression': {'precision': 0.91, 'recall': 0.90, 'f1-score': 0.90, 'accuracy': 0.90},
    'Decision Tree': {'precision': 0.97, 'recall': 0.97, 'f1-score': 0.97, 'accuracy': 0.98},
    'Ensemble': {'precision': 0.995, 'recall': 0.995, 'f1-score':0.995 , 'accuracy': 0.995}
}

classifier_names = list(classifiers.keys())
x = range(len(classifiers))


def _metric_values(metric_key):
    """Return the scores for one metric, in classifier order."""
    return [classifiers[name][metric_key] for name in classifier_names]


precision = _metric_values('precision')
recall = _metric_values('recall')
f1_score = _metric_values('f1-score')
accuracy = _metric_values('accuracy')

fig, ax = plt.subplots(figsize=(10, 6))
bar_width = 0.15

# One bar series per metric; each series is shifted right by one bar width
# relative to the previous one.
index = x
for series_label, series_values in (
    ('Precision', precision),
    ('Recall', recall),
    ('F1-Score', f1_score),
    ('Accuracy', accuracy),
):
    ax.bar(index, series_values, bar_width, label=series_label)
    index = [position + bar_width for position in index]

# Put each tick under the middle of its group of four bars.
group_centres = [position + 1.5 * bar_width for position in x]
ax.set_xlabel('Classifiers', fontsize=12)
ax.set_xticks(group_centres)
ax.set_xticklabels(classifier_names, rotation=45, ha='right')
ax.set_ylabel('Score', fontsize=12)
ax.set_title('Performance Comparison of Classifiers', fontsize=14)
ax.legend()

plt.tight_layout()
plt.show()