In [25]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer



# Load the preprocessed dataset
data = pd.read_csv("dataset/kidney_disease.csv")

# Separate features and target variable
X = data.drop(columns=["classification"])
y = data["classification"]

In [4]:
data.dtypes

id                  int64
age               float64
bp                float64
sg                float64
al                float64
su                float64
rbc                object
pc                 object
pcc                object
ba                 object
bgr               float64
bu                float64
sc                float64
sod               float64
pot               float64
hemo              float64
pcv                object
wc                 object
rc                 object
htn                object
dm                 object
cad                object
appet              object
pe                 object
ane                object
classification     object
dtype: object

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [6]:
data.shape

(400, 26)

In [7]:
data.describe(include="all")

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
count,400.0,391.0,388.0,353.0,354.0,351.0,248,335,396,396,...,330.0,295.0,270.0,398,398,398,399,399,399,400
unique,,,,,,,2,2,2,2,...,44.0,92.0,49.0,2,5,3,2,2,2,3
top,,,,,,,normal,normal,notpresent,notpresent,...,41.0,9800.0,5.2,no,no,no,good,no,no,ckd
freq,,,,,,,201,259,354,374,...,21.0,11.0,18.0,251,258,362,317,323,339,248
mean,199.5,51.483376,76.469072,1.017408,1.016949,0.450142,,,,,...,,,,,,,,,,
std,115.614301,17.169714,13.683637,0.005717,1.352679,1.099191,,,,,...,,,,,,,,,,
min,0.0,2.0,50.0,1.005,0.0,0.0,,,,,...,,,,,,,,,,
25%,99.75,42.0,70.0,1.01,0.0,0.0,,,,,...,,,,,,,,,,
50%,199.5,55.0,80.0,1.02,0.0,0.0,,,,,...,,,,,,,,,,
75%,299.25,64.5,80.0,1.02,2.0,0.0,,,,,...,,,,,,,,,,


In [38]:
data.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,-0.216167,0.254214,0.421486,0.076249,-0.380269,1,2,1,1,-0.283841,...,0.569881,-0.206202,0.481295,2,5,2,0,1,1,ckd
1,-2.62783,-1.972476,0.421486,2.363728,-0.380269,1,2,1,1,-0.57237,...,-0.098536,-0.818559,0.090447,1,4,2,0,1,1,ckd
2,0.607327,0.254214,-1.421074,0.838742,2.507853,2,2,1,1,3.676881,...,-0.878356,-0.308261,0.090447,1,5,2,2,1,2,ckd
3,-0.216167,-0.488016,-2.342354,2.363728,-0.380269,2,0,2,1,-0.336301,...,-0.766953,-0.58042,-0.788961,2,4,2,2,2,2,ckd
4,-0.039704,0.254214,-1.421074,0.838742,-0.380269,2,2,1,1,-0.480565,...,-0.432744,-0.376301,-0.104977,1,4,2,0,1,1,ckd


In [27]:
# Handling Missing Values
imputer = SimpleImputer(strategy='most_frequent')  
data[['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo']] = imputer.fit_transform(data[['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo']])


# For categorical columns
categorical_cols = ['rbc', 'pc', 'pcc', 'ba', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
for col in categorical_cols:
    data[col].fillna('missing', inplace=True)  

# Convert object columns to numeric
binary_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
label_encoder = LabelEncoder()
for col in binary_cols:
    data[col] = label_encoder.fit_transform(data[col])

# Convert remaining object columns to numeric
data['pcv'] = pd.to_numeric(data['pcv'], errors='coerce')
data['wc'] = pd.to_numeric(data['wc'], errors='coerce')
data['rc'] = pd.to_numeric(data['rc'], errors='coerce')

# Feature Scaling
scaler = StandardScaler()
numerical_cols = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Drop Unnecessary Columns
data.drop(['id'], axis=1, inplace=True)

print(data.head())

        age        bp        sg        al        su  rbc  pc  pcc  ba  \
0 -0.216167  0.254214  0.421486  0.076249 -0.380269    1   2    1   1   
1 -2.627830 -1.972476  0.421486  2.363728 -0.380269    1   2    1   1   
2  0.607327  0.254214 -1.421074  0.838742  2.507853    2   2    1   1   
3 -0.216167 -0.488016 -2.342354  2.363728 -0.380269    2   0    2   1   
4 -0.039704  0.254214 -1.421074  0.838742 -0.380269    2   2    1   1   

        bgr  ...       pcv        wc        rc  htn  dm  cad  appet  pe  ane  \
0 -0.283841  ...  0.569881 -0.206202  0.481295    2   5    2      0   1    1   
1 -0.572370  ... -0.098536 -0.818559       NaN    1   4    2      0   1    1   
2  3.676881  ... -0.878356 -0.308261       NaN    1   5    2      2   1    2   
3 -0.336301  ... -0.766953 -0.580420 -0.788961    2   4    2      2   2    2   
4 -0.480565  ... -0.432744 -0.376301 -0.104977    1   4    2      0   1    1   

   classification  
0             ckd  
1             ckd  
2             ckd  


In [28]:
# Convert 'rc' column to numeric
data['rc'] = pd.to_numeric(data['rc'], errors='coerce')

# Handling Missing Values for Numerical Columns
numerical_imputer = SimpleImputer(strategy='median')  
data[['pcv', 'wc', 'rc']] = numerical_imputer.fit_transform(data[['pcv', 'wc', 'rc']])

# Display the preprocessed data
print(data.head())


        age        bp        sg        al        su  rbc  pc  pcc  ba  \
0 -0.216167  0.254214  0.421486  0.076249 -0.380269    1   2    1   1   
1 -2.627830 -1.972476  0.421486  2.363728 -0.380269    1   2    1   1   
2  0.607327  0.254214 -1.421074  0.838742  2.507853    2   2    1   1   
3 -0.216167 -0.488016 -2.342354  2.363728 -0.380269    2   0    2   1   
4 -0.039704  0.254214 -1.421074  0.838742 -0.380269    2   2    1   1   

        bgr  ...       pcv        wc        rc  htn  dm  cad  appet  pe  ane  \
0 -0.283841  ...  0.569881 -0.206202  0.481295    2   5    2      0   1    1   
1 -0.572370  ... -0.098536 -0.818559  0.090447    1   4    2      0   1    1   
2  3.676881  ... -0.878356 -0.308261  0.090447    1   5    2      2   1    2   
3 -0.336301  ... -0.766953 -0.580420 -0.788961    2   4    2      2   2    2   
4 -0.480565  ... -0.432744 -0.376301 -0.104977    1   4    2      0   1    1   

   classification  
0             ckd  
1             ckd  
2             ckd  


In [7]:
# # Drop specified columns from features
# X = data.drop(columns=['Diabetes_012', 'Education', 'Income', 'DiffWalk'])

# # Separate the target variable
# y = data['Diabetes_012']

# # Now you can proceed with preprocessing the data, splitting it into training and testing sets, and further analysis.


In [31]:
# Splitting the dataset into features (X) and target variable (y)
X = data.drop('classification', axis=1)
y = data['classification']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predicting on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluating the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [32]:
# Train Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Predicting on the test set using Decision Tree
dt_y_pred = dt_classifier.predict(X_test)

# Evaluate Random Forest Classifier
print("Random Forest Classifier:")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Evaluate Decision Tree Classifier
print("\nDecision Tree Classifier:")
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print("Accuracy:", dt_accuracy)
print("Classification Report:\n", classification_report(y_test, dt_y_pred))

Random Forest Classifier:
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

         ckd       1.00      1.00      1.00        52
      notckd       1.00      1.00      1.00        28

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80


Decision Tree Classifier:
Accuracy: 0.9875
Classification Report:
               precision    recall  f1-score   support

         ckd       1.00      0.98      0.99        52
        ckd	       0.00      0.00      0.00         0
      notckd       1.00      1.00      1.00        28

    accuracy                           0.99        80
   macro avg       0.67      0.66      0.66        80
weighted avg       1.00      0.99      0.99        80



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [33]:
from sklearn.neighbors import KNeighborsClassifier

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a K-Nearest Neighbors Classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

# Predictions for KNN Classifier
knn_y_pred = knn_classifier.predict(X_test)

# Evaluate KNN Classifier
print("K-Nearest Neighbors Classifier:")
print("Accuracy:", accuracy_score(y_test, knn_y_pred))
print("Classification Report:\n", classification_report(y_test, knn_y_pred))


K-Nearest Neighbors Classifier:
Accuracy: 0.9875
Classification Report:
               precision    recall  f1-score   support

         ckd       1.00      0.98      0.99        52
      notckd       0.97      1.00      0.98        28

    accuracy                           0.99        80
   macro avg       0.98      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80



In [34]:
from sklearn.model_selection import GridSearchCV

# Define parameter grids for Random Forest and Decision Tree classifiers
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt_param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Instantiate Random Forest and Decision Tree classifiers
rf_classifier = RandomForestClassifier(random_state=42)
dt_classifier = DecisionTreeClassifier(random_state=42)

# Instantiate GridSearchCV for Random Forest and Decision Tree classifiers
rf_grid_search = GridSearchCV(rf_classifier, rf_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
dt_grid_search = GridSearchCV(dt_classifier, dt_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the models
rf_grid_search.fit(X_train, y_train)
dt_grid_search.fit(X_train, y_train)

# Best parameters for Random Forest
print("Best parameters for Random Forest:", rf_grid_search.best_params_)

# Best parameters for Decision Tree
print("Best parameters for Decision Tree:", dt_grid_search.best_params_)

# Predicting on the test set using best estimators from grid search
rf_y_pred = rf_grid_search.predict(X_test)
dt_y_pred = dt_grid_search.predict(X_test)

# Evaluate Random Forest Classifier
print("\nRandom Forest Classifier:")
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("Accuracy:", rf_accuracy)
print("Classification Report:\n", classification_report(y_test, rf_y_pred))

# Evaluate Decision Tree Classifier
print("\nDecision Tree Classifier:")
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print("Accuracy:", dt_accuracy)
print("Classification Report:\n", classification_report(y_test, dt_y_pred))




Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best parameters for Decision Tree: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2}

Random Forest Classifier:
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

         ckd       1.00      1.00      1.00        52
      notckd       1.00      1.00      1.00        28

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80


Decision Tree Classifier:
Accuracy: 0.975
Classification Report:
               precision    recall  f1-score   support

         ckd       0.98      0.98      0.98        52
      notckd       0.96      0.96      0.96        28

    accuracy                           0.97        80
   macro avg       0.97      0.97      0.97        80
weighted avg       0.97      0.97      0.97 

In [37]:
from sklearn.preprocessing import LabelEncoder

# Encode the target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Instantiate XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42)

# Define parameter grid for XGBoost classifier
xgb_param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# Instantiate GridSearchCV for XGBoost classifier
xgb_grid_search = GridSearchCV(xgb_classifier, xgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
xgb_grid_search.fit(X_train, y_train_encoded)

# Best parameters for XGBoost
print("Best parameters for XGBoost:", xgb_grid_search.best_params_)

# Predicting on the test set using best estimator from grid search
xgb_y_pred = xgb_grid_search.predict(X_test)

# Evaluate XGBoost Classifier
print("\nXGBoost Classifier:")
xgb_accuracy = accuracy_score(y_test_encoded, xgb_y_pred)
print("Accuracy:", xgb_accuracy)
print("Classification Report:\n", classification_report(y_test_encoded, xgb_y_pred))




Best parameters for XGBoost: {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.8}

XGBoost Classifier:
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        52
           2       1.00      1.00      1.00        28

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80



In [None]:



# Load the dataset
data = pd.read_csv("dataset/diabetes_012_health_indicators_BRFSS2015.csv")

# Define and preprocess your features and target variable
X = data.drop(columns=['Diabetes_012', 'Education', 'Income', 'DiffWalk'])
y = data['Diabetes_012']

# Initialize LightGBM Classifier
lgb_classifier = LGBMClassifier(random_state=42)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for LightGBM
param_grid_lgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1]
}

# Initialize LightGBM Classifier
best_lgb_classifier = LGBMClassifier(num_leaves=50, max_depth=7, learning_rate=0.05, n_estimators=300, random_state=42)
best_lgb_classifier.fit(X_train_scaled, y_train)

# Perform GridSearchCV for LightGBM
grid_search_lgb = GridSearchCV(lgb_classifier, param_grid_lgb, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_lgb.fit(X_train_scaled, y_train)

# Get the best parameters for LightGBM
best_params_lgb = grid_search_lgb.best_params_
print("Best Parameters for LightGBM:", best_params_lgb)

# Train the LightGBM classifier with the best parameters
best_lgb_classifier = LGBMClassifier(**best_params_lgb, random_state=42)
best_lgb_classifier.fit(X_train_scaled, y_train)

# Predictions for the best LightGBM Classifier
best_lgb_y_pred = best_lgb_classifier.predict(X_test_scaled)

# Evaluate the best LightGBM Classifier
print("Best LightGBM Classifier:")
print("Accuracy:", accuracy_score(y_test, best_lgb_y_pred))
print("Classification Report:\n", classification_report(y_test, best_lgb_y_pred))


Best Parameters for LightGBM: {'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 200}
Best LightGBM Classifier:
Accuracy: 0.8510525070955535
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.98      0.92     42795
         1.0       0.00      0.00      0.00       944
         2.0       0.56      0.18      0.27      6997

    accuracy                           0.85     50736
   macro avg       0.48      0.39      0.40     50736
weighted avg       0.81      0.85      0.81     50736



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
