# Import

In [1]:
import pandas as pd
import numpy as np

# preprocessing tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import  DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# For model evulation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Load the Dataset

In [2]:
df  = pd.read_csv("/content/kidney_disease.csv")
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd


# Short definitions for each important feature


'age': 'Patient age (years)',


'bp': 'Blood pressure (mm/Hg)',


'sg': 'Urine specific gravity',


'al': 'Albumin in urine (0–5)',


'hemo': 'Hemoglobin level (g/dL)',


'sc': 'Serum creatinine (mg/dL)',


'htn': 'Hypertension (yes/no)',


'dm': 'Diabetes mellitus (yes/no)',


'cad': 'Coronary artery disease (yes/no)',


'appet': 'Appetite status (good/poor)',


'pc': 'Pus cell status (normal/abnormal)',


'classification': 'CKD diagnosis (ckd/notckd)'

In [3]:
important_columns = ['age', 'bp', 'sg', 'al', 'hemo', 'sc', 'htn', 'dm', 'cad', 'appet', 'pc','classification']
df = df[important_columns]
df

Unnamed: 0,age,bp,sg,al,hemo,sc,htn,dm,cad,appet,pc,classification
0,48.0,80.0,1.020,1.0,15.4,1.2,yes,yes,no,good,normal,ckd
1,7.0,50.0,1.020,4.0,11.3,0.8,no,no,no,good,normal,ckd
2,62.0,80.0,1.010,2.0,9.6,1.8,no,yes,no,poor,normal,ckd
3,48.0,70.0,1.005,4.0,11.2,3.8,yes,no,no,poor,abnormal,ckd
4,51.0,80.0,1.010,2.0,11.6,1.4,no,no,no,good,normal,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,15.7,0.5,no,no,no,good,normal,notckd
396,42.0,70.0,1.025,0.0,16.5,1.2,no,no,no,good,normal,notckd
397,12.0,80.0,1.020,0.0,15.8,0.6,no,no,no,good,normal,notckd
398,17.0,60.0,1.025,0.0,14.2,1.0,no,no,no,good,normal,notckd


In [10]:
for col in df.select_dtypes(include='object').columns:
  df[col] = df[col].str.strip().str.replace('\t','', regex=True)

print(df['cad'].value_counts())
print(df['dm'].value_counts())
print(df['classification'].value_counts())

cad
no     364
yes     34
Name: count, dtype: int64
dm
no     261
yes    137
Name: count, dtype: int64
classification
ckd       250
notckd    150
Name: count, dtype: int64


# Data Cleaning and Preprocessing

In [14]:
df.isnull().sum()

Unnamed: 0,0
age,9
bp,12
sg,47
al,46
hemo,52
sc,17
htn,2
dm,2
cad,2
appet,1


# Filling the missing Values

In [16]:
df['age'].fillna(df['age'].median(), inplace=True)
df['bp'].fillna(df['bp'].median(), inplace=True)
df['sg'].fillna(df['sg'].mode()[0], inplace=True)
df['al'].fillna(df['al'].mode()[0], inplace=True)
df['hemo'].fillna(df['hemo'].median(), inplace=True)
df['sc'].fillna(df['sc'].median(), inplace=True)
df['htn'].fillna(df['htn'].mode()[0], inplace=True)
df['dm'].fillna(df['dm'].mode()[0], inplace=True)
df['cad'].fillna(df['cad'].mode()[0], inplace=True)
df['appet'].fillna(df['appet'].mode()[0], inplace=True)
df['pc'].fillna(df['pc'].mode()[0], inplace=True)

df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bp'].fillna(df['bp'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 

Unnamed: 0,0
age,0
bp,0
sg,0
al,0
hemo,0
sc,0
htn,0
dm,0
cad,0
appet,0


# Encoding


In [17]:
df['htn'] = df['htn'].map({'yes':1, 'no':0})
df['pc'] = df['pc'].map({'normal':1,'abnormal':0})
df['dm'] = df['dm'].map({'yes':1, 'no':0})
df['cad'] = df['cad'].map({'yes':1, 'no':0})
df['appet'] = df['appet'].map({'good':1, 'poor':0})
df['classification'] = df['classification'].map({'ckd':1, 'notckd':0})
df

Unnamed: 0,age,bp,sg,al,hemo,sc,htn,dm,cad,appet,pc,classification
0,48.0,80.0,1.020,1.0,15.4,1.2,1,1,0,1,1,1
1,7.0,50.0,1.020,4.0,11.3,0.8,0,0,0,1,1,1
2,62.0,80.0,1.010,2.0,9.6,1.8,0,1,0,0,1,1
3,48.0,70.0,1.005,4.0,11.2,3.8,1,0,0,0,0,1
4,51.0,80.0,1.010,2.0,11.6,1.4,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,15.7,0.5,0,0,0,1,1,0
396,42.0,70.0,1.025,0.0,16.5,1.2,0,0,0,1,1,0
397,12.0,80.0,1.020,0.0,15.8,0.6,0,0,0,1,1,0
398,17.0,60.0,1.025,0.0,14.2,1.0,0,0,0,1,1,0


# Scaling Normalization

In [18]:
# Select numeric columns to normalise
numeric_cols = ['age', 'bp', 'sg', 'al', 'hemo', 'sc']

# Initialize scaler
scaler = MinMaxScaler()

# Fit-transform the numeric columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

df.head()

Unnamed: 0,age,bp,sg,al,hemo,sc,htn,dm,cad,appet,pc,classification
0,0.522727,0.230769,0.75,0.2,0.836735,0.010582,1,1,0,1,1,1
1,0.056818,0.0,0.75,0.8,0.557823,0.005291,0,0,0,1,1,1
2,0.681818,0.230769,0.25,0.4,0.442177,0.018519,0,1,0,0,1,1
3,0.522727,0.153846,0.0,0.8,0.55102,0.044974,1,0,0,0,0,1
4,0.556818,0.230769,0.25,0.4,0.578231,0.013228,0,0,0,1,1,1


# Data Balancing

In [28]:
from imblearn.over_sampling import SMOTE

X = df.drop('classification', axis=1)
y = df['classification']


smote = SMOTE(random_state=42)

X_balanced, y_balanced = smote.fit_resample(X, y)

print(y.value_counts())
print(y_balanced.value_counts())

classification
1    250
0    150
Name: count, dtype: int64
classification
1    250
0    250
Name: count, dtype: int64


# Train Test Split

In [31]:

# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# check the shape
print("Train shape: ", X_train.shape)
print("Test shape: ", X_test.shape)

Train shape:  (400, 11)
Test shape:  (100, 11)


# Training and Testing Multiple Classifier

In [32]:
# Define models

models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(),
    "Random Forest Classifier": RandomForestClassifier(),
    "K Nearest Neighbours": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier()
}


# Train and evaluate each model
for name, model in models.items():
  print("="*50)
  print("Model: ", name)
  # train the model
  model.fit(X_train, y_train)

  # predict on the test data
  y_pred = model.predict(X_test)

  # Calculate the metrices
  accuracy = accuracy_score(y_test, y_pred)
  classification_rep = classification_report(y_test, y_pred)
  conf_matrix = confusion_matrix(y_test, y_pred)

  # Print the results
  print("Accuracy: ", accuracy)
  print("Classification Report: \n", classification_rep)
  print("Confusion Matrix:\n", conf_matrix)

Model:  Logistic Regression
Accuracy:  0.96
Classification Report: 
               precision    recall  f1-score   support

           0       0.93      1.00      0.96        54
           1       1.00      0.91      0.95        46

    accuracy                           0.96       100
   macro avg       0.97      0.96      0.96       100
weighted avg       0.96      0.96      0.96       100

Confusion Matrix:
 [[54  0]
 [ 4 42]]
Model:  Support Vector Machine
Accuracy:  0.97
Classification Report: 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97        54
           1       1.00      0.93      0.97        46

    accuracy                           0.97       100
   macro avg       0.97      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100

Confusion Matrix:
 [[54  0]
 [ 3 43]]
Model:  Random Forest Classifier
Accuracy:  1.0
Classification Report: 
               precision    recall  f1-score   support

 

# Selecting the Best Model

In [33]:
model_gbc = GradientBoostingClassifier()
model_gbc.fit(X_train, y_train)

# predict on the test data
y_pred = model_gbc.predict(X_test)

print("confusion matrix \n: ", confusion_matrix(y_test,y_pred))
print("classification report \n: ", classification_report(y_test, y_pred))


confusion matrix 
:  [[54  0]
 [ 1 45]]
classification report 
:                precision    recall  f1-score   support

           0       0.98      1.00      0.99        54
           1       1.00      0.98      0.99        46

    accuracy                           0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100



# Saving Model, Encoder, Scaler for production

In [35]:
import pickle

pickle.dump(scaler, open("scaler.pkl",'wb'))
pickle.dump(model_gbc,open("model_gbc.pkl",'wb'))

# Inference (Prediction on new data)

In [36]:
# Load the encoder, scaler, and trained model from saved files
scaler = pickle.load(open("/content/scaler.pkl", 'rb'))  # Load the scaler
model_gbc = pickle.load(open("/content/model_gbc.pkl", 'rb'))  # Load the trained model

def predict_chronic_disease(age, bp, sg, al, hemo, sc, htn, dm, cad, appet, pc):
    # Create a DataFrame with input variables, following the correct order
    df_dict = {
        'age': [age],
        'bp': [bp],
        'sg': [sg],
        'al': [al],
        'hemo': [hemo],
        'sc': [sc],
        'htn': [htn],
        'dm': [dm],
        'cad': [cad],
        'appet': [appet],
        'pc': [pc]
    }
    df = pd.DataFrame(df_dict)

    # Encode the categorical columns
    df['htn'] = df['htn'].map({'yes':1, "no":0})
    df['dm'] = df['dm'].map({'yes':1, "no":0})
    df['cad'] = df['cad'].map({'yes':1, "no":0})
    df['appet'] = df['appet'].map({'good':1, "poor":0})
    df['pc'] = df['pc'].map({'normal':1, "abnormal":0})

    # Scale the numeric columns using the previously fitted scaler
    numeric_cols = ['age', 'bp', 'sg', 'al', 'hemo', 'sc']
    df[numeric_cols] = scaler.transform(df[numeric_cols])

    # Make the prediction
    prediction = model_gbc.predict(df)

    # Return the predicted class
    return prediction[0]

# Example usage:
result = predict_chronic_disease(age=30, bp=20, sg=1.020, al=1.0, hemo=15.4, sc=1.2, htn="no", dm="no", cad='no', appet='good', pc='normal')

if result == 1:
    print("The Patient Has CKD....")
else:
    print("The Patient Has not CKD....")

The Patient Has not CKD....
