In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer



# Load the preprocessed dataset
data = pd.read_csv("dataset/kidney_disease.csv")

# Separate features and target variable
X = data.drop(columns=["classification"])
y = data["classification"]

In [None]:
data.dtypes

In [None]:
data.info()

In [4]:
data.shape

(400, 26)

In [None]:
data.describe(include="all")

In [None]:
data.head()

In [2]:

# Handling Missing Values
imputer = SimpleImputer(strategy='most_frequent')  
data[['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo']] = imputer.fit_transform(data[['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo']])

# For categorical columns
categorical_cols = ['rbc', 'pc', 'pcc', 'ba', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
for col in categorical_cols:
    data[col].fillna('missing', inplace=True)  

# Convert object columns to numeric
binary_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
label_encoder = LabelEncoder()
for col in binary_cols:
    data[col] = label_encoder.fit_transform(data[col])

# Convert remaining object columns to numeric
data['pcv'] = pd.to_numeric(data['pcv'], errors='coerce')
data['wc'] = pd.to_numeric(data['wc'], errors='coerce')
data['rc'] = pd.to_numeric(data['rc'], errors='coerce')

# Feature Scaling
scaler = StandardScaler()
numerical_cols = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Drop Unnecessary Columns
data.drop(['id'], axis=1, inplace=True)

print(data.head())


        age        bp        sg        al        su  rbc  pc  pcc  ba  \
0 -0.216167  0.254214  0.421486  0.076249 -0.380269    1   2    1   1   
1 -2.627830 -1.972476  0.421486  2.363728 -0.380269    1   2    1   1   
2  0.607327  0.254214 -1.421074  0.838742  2.507853    2   2    1   1   
3 -0.216167 -0.488016 -2.342354  2.363728 -0.380269    2   0    2   1   
4 -0.039704  0.254214 -1.421074  0.838742 -0.380269    2   2    1   1   

        bgr  ...       pcv        wc        rc  htn  dm  cad  appet  pe  ane  \
0 -0.283841  ...  0.569881 -0.206202  0.481295    2   5    2      0   1    1   
1 -0.572370  ... -0.098536 -0.818559       NaN    1   4    2      0   1    1   
2  3.676881  ... -0.878356 -0.308261       NaN    1   5    2      2   1    2   
3 -0.336301  ... -0.766953 -0.580420 -0.788961    2   4    2      2   2    2   
4 -0.480565  ... -0.432744 -0.376301 -0.104977    1   4    2      0   1    1   

   classification  
0             ckd  
1             ckd  
2             ckd  


In [3]:
# Convert 'rc' column to numeric
data['rc'] = pd.to_numeric(data['rc'], errors='coerce')

# Handling Missing Values for Numerical Columns
numerical_imputer = SimpleImputer(strategy='median')  
data[['pcv', 'wc', 'rc']] = numerical_imputer.fit_transform(data[['pcv', 'wc', 'rc']])

# Display the preprocessed data
print(data.head())


        age        bp        sg        al        su  rbc  pc  pcc  ba  \
0 -0.216167  0.254214  0.421486  0.076249 -0.380269    1   2    1   1   
1 -2.627830 -1.972476  0.421486  2.363728 -0.380269    1   2    1   1   
2  0.607327  0.254214 -1.421074  0.838742  2.507853    2   2    1   1   
3 -0.216167 -0.488016 -2.342354  2.363728 -0.380269    2   0    2   1   
4 -0.039704  0.254214 -1.421074  0.838742 -0.380269    2   2    1   1   

        bgr  ...       pcv        wc        rc  htn  dm  cad  appet  pe  ane  \
0 -0.283841  ...  0.569881 -0.206202  0.481295    2   5    2      0   1    1   
1 -0.572370  ... -0.098536 -0.818559  0.090447    1   4    2      0   1    1   
2  3.676881  ... -0.878356 -0.308261  0.090447    1   5    2      2   1    2   
3 -0.336301  ... -0.766953 -0.580420 -0.788961    2   4    2      2   2    2   
4 -0.480565  ... -0.432744 -0.376301 -0.104977    1   4    2      0   1    1   

   classification  
0             ckd  
1             ckd  
2             ckd  


In [13]:
# Convert object columns to numeric
label_encoder = LabelEncoder()
categorical_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# Handling Missing Values
imputer = SimpleImputer(strategy='most_frequent')  
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Splitting the dataset into features (X) and target variable (y)
X = data.drop('classification', axis=1)
y = data['classification']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Add Gaussian noise to numerical features
numerical_columns = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']
noise_std = 0.1  # Adjust this value based on the level of noise desired
for col in numerical_columns:
    noise_train = np.random.normal(0, noise_std, size=X_train[col].shape)
    X_train[col] += noise_train
    noise_test = np.random.normal(0, noise_std, size=X_test[col].shape)
    X_test[col] += noise_test

# Ensure that the values stay within bounds (0 and 1 for some features)
X_train = np.clip(X_train, 0, 1)
X_test = np.clip(X_test, 0, 1)

TypeError: can only concatenate str (not "float") to str

In [12]:

# Convert object columns to numeric
label_encoder = LabelEncoder()
categorical_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# Handling Missing Values
imputer = SimpleImputer(strategy='most_frequent')  
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Splitting the dataset into features (X) and target variable (y)
X = data.drop('classification', axis=1)
y = data['classification']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Add more Gaussian noise to numerical features
numerical_columns = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']
noise_std = 0.2  # Increased noise level
for col in numerical_columns:
    noise_train = np.random.normal(0, noise_std, size=X_train[col].shape)
    X_train[col] += noise_train
    noise_test = np.random.normal(0, noise_std, size=X_test[col].shape)
    X_test[col] += noise_test

# Ensure that the values stay within bounds (0 and 1 for some features)
X_train = np.clip(X_train, 0, 1)
X_test = np.clip(X_test, 0, 1)


TypeError: can only concatenate str (not "float") to str

In [2]:
# Load the dataset
data = pd.read_csv("dataset/kidney_disease.csv", na_values=['\t?'])

# Convert object columns to numeric
label_encoder = LabelEncoder()
categorical_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# Handling Missing Values
imputer = SimpleImputer(strategy='most_frequent')  
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Splitting the dataset into features (X) and target variable (y)
X = data.drop('classification', axis=1)
y = data['classification']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Add Gaussian noise to numerical features
numerical_columns = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']
noise_std = 0.2  # Increased noise level

# Convert numerical columns to float
X_train[numerical_columns] = X_train[numerical_columns].astype(float)
X_test[numerical_columns] = X_test[numerical_columns].astype(float)

# Add Gaussian noise to numerical features
for col in numerical_columns:
    noise_train = np.random.normal(0, noise_std, size=X_train[col].shape)
    X_train[col] += noise_train
    noise_test = np.random.normal(0, noise_std, size=X_test[col].shape)
    X_test[col] += noise_test

# Ensure that the values stay within bounds (0 and 1 for some features)
X_train = np.clip(X_train, 0, 1)
X_test = np.clip(X_test, 0, 1)

In [3]:


# Splitting the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predicting on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluating the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.925


In [4]:
# Train Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Predicting on the test set using Decision Tree
dt_y_pred = dt_classifier.predict(X_test)

# Evaluate Random Forest Classifier
print("Random Forest Classifier:")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Evaluate Decision Tree Classifier
print("\nDecision Tree Classifier:")
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print("Accuracy:", dt_accuracy)
print("Classification Report:\n", classification_report(y_test, dt_y_pred))

Random Forest Classifier:
Accuracy: 0.925
Classification Report:
               precision    recall  f1-score   support

         ckd       0.96      0.92      0.94        52
      notckd       0.87      0.93      0.90        28

    accuracy                           0.93        80
   macro avg       0.91      0.93      0.92        80
weighted avg       0.93      0.93      0.93        80


Decision Tree Classifier:
Accuracy: 0.825
Classification Report:
               precision    recall  f1-score   support

         ckd       0.87      0.87      0.87        52
      notckd       0.75      0.75      0.75        28

    accuracy                           0.82        80
   macro avg       0.81      0.81      0.81        80
weighted avg       0.82      0.82      0.82        80



In [5]:
from sklearn.neighbors import KNeighborsClassifier

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a K-Nearest Neighbors Classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

# Predictions for KNN Classifier
knn_y_pred = knn_classifier.predict(X_test)

# Evaluate KNN Classifier
print("K-Nearest Neighbors Classifier:")
print("Accuracy:", accuracy_score(y_test, knn_y_pred))
print("Classification Report:\n", classification_report(y_test, knn_y_pred))


K-Nearest Neighbors Classifier:
Accuracy: 0.925
Classification Report:
               precision    recall  f1-score   support

         ckd       1.00      0.88      0.94        52
      notckd       0.82      1.00      0.90        28

    accuracy                           0.93        80
   macro avg       0.91      0.94      0.92        80
weighted avg       0.94      0.93      0.93        80



In [6]:
# import numpy as np
# from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.impute import SimpleImputer
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import accuracy_score, classification_report
# import pandas as pd



# Define parameter grids for Random Forest and Decision Tree classifiers
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt_param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Instantiate Random Forest and Decision Tree classifiers
rf_classifier = RandomForestClassifier(random_state=42)
dt_classifier = DecisionTreeClassifier(random_state=42)

# Instantiate GridSearchCV for Random Forest and Decision Tree classifiers
rf_grid_search = GridSearchCV(rf_classifier, rf_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
dt_grid_search = GridSearchCV(dt_classifier, dt_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the models
rf_grid_search.fit(X_train, y_train)
dt_grid_search.fit(X_train, y_train)

# Best parameters for Random Forest
print("Best parameters for Random Forest:", rf_grid_search.best_params_)

# Best parameters for Decision Tree
print("Best parameters for Decision Tree:", dt_grid_search.best_params_)

# Predicting on the test set using best estimators from grid search
rf_y_pred = rf_grid_search.predict(X_test)
dt_y_pred = dt_grid_search.predict(X_test)

# Evaluate Random Forest Classifier
print("\nRandom Forest Classifier:")
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("Accuracy:", rf_accuracy)
print("Classification Report:\n", classification_report(y_test, rf_y_pred))

# Evaluate Decision Tree Classifier
print("\nDecision Tree Classifier:")
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print("Accuracy:", dt_accuracy)
print("Classification Report:\n", classification_report(y_test, dt_y_pred))




Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best parameters for Decision Tree: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}

Random Forest Classifier:
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

         ckd       1.00      1.00      1.00        52
      notckd       1.00      1.00      1.00        28

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80


Decision Tree Classifier:
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

         ckd       1.00      1.00      1.00        52
      notckd       1.00      1.00      1.00        28

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00  

In [8]:

# Encode the target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Instantiate XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42)

# Define parameter grid for XGBoost classifier
xgb_param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# Instantiate GridSearchCV for XGBoost classifier
xgb_grid_search = GridSearchCV(xgb_classifier, xgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
xgb_grid_search.fit(X_train, y_train_encoded)

# Best parameters for XGBoost
print("Best parameters for XGBoost:", xgb_grid_search.best_params_)

# Predicting on the test set using best estimator from grid search
xgb_y_pred = xgb_grid_search.predict(X_test)

# Evaluate XGBoost Classifier
print("\nXGBoost Classifier:")
xgb_accuracy = accuracy_score(y_test_encoded, xgb_y_pred)
print("Accuracy:", xgb_accuracy)
print("Classification Report:\n", classification_report(y_test_encoded, xgb_y_pred))




ValueError: 
All the 32805 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
32805 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Zawad\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Python310\lib\site-packages\xgboost\sklearn.py", line 1500, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "c:\Python310\lib\site-packages\xgboost\sklearn.py", line 521, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "c:\Python310\lib\site-packages\xgboost\sklearn.py", line 958, in _create_dmatrix
    return QuantileDMatrix(
  File "c:\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Python310\lib\site-packages\xgboost\core.py", line 1529, in __init__
    self._init(
  File "c:\Python310\lib\site-packages\xgboost\core.py", line 1588, in _init
    it.reraise()
  File "c:\Python310\lib\site-packages\xgboost\core.py", line 576, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "c:\Python310\lib\site-packages\xgboost\core.py", line 557, in _handle_exception
    return fn()
  File "c:\Python310\lib\site-packages\xgboost\core.py", line 641, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "c:\Python310\lib\site-packages\xgboost\data.py", line 1280, in next
    input_data(**self.kwargs)
  File "c:\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Python310\lib\site-packages\xgboost\core.py", line 624, in input_data
    new, cat_codes, feature_names, feature_types = _proxy_transform(
  File "c:\Python310\lib\site-packages\xgboost\data.py", line 1315, in _proxy_transform
    arr, feature_names, feature_types = _transform_pandas_df(
  File "c:\Python310\lib\site-packages\xgboost\data.py", line 490, in _transform_pandas_df
    _invalid_dataframe_dtype(data)
  File "c:\Python310\lib\site-packages\xgboost\data.py", line 308, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:id: object, age: object, bp: object, sg: object, al: object, su: object, rbc: object, pc: object, pcc: object, ba: object, bgr: object, bu: object, sc: object, sod: object, pot: object, hemo: object, pcv: object, wc: object, rc: object, htn: object, dm: object, cad: object, appet: object, pe: object, ane: object


In [None]:
# Initialize LightGBM Classifier
lgb_classifier = LGBMClassifier(random_state=42)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for LightGBM
param_grid_lgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1]
}

# Initialize LightGBM Classifier
best_lgb_classifier = LGBMClassifier(num_leaves=50, max_depth=7, learning_rate=0.05, n_estimators=300, random_state=42)
best_lgb_classifier.fit(X_train_scaled, y_train)

# Perform GridSearchCV for LightGBM
grid_search_lgb = GridSearchCV(lgb_classifier, param_grid_lgb, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_lgb.fit(X_train_scaled, y_train)

# Get the best parameters for LightGBM
best_params_lgb = grid_search_lgb.best_params_
print("Best Parameters for LightGBM:", best_params_lgb)

# Train the LightGBM classifier with the best parameters
best_lgb_classifier = LGBMClassifier(**best_params_lgb, random_state=42)
best_lgb_classifier.fit(X_train_scaled, y_train)

# Predictions for the best LightGBM Classifier
best_lgb_y_pred = best_lgb_classifier.predict(X_test_scaled)

# Evaluate the best LightGBM Classifier
print("Best LightGBM Classifier:")
print("Accuracy:", accuracy_score(y_test, best_lgb_y_pred))
print("Classification Report:\n", classification_report(y_test, best_lgb_y_pred))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000225 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 632
[LightGBM] [Info] Number of data points in the train set: 320, number of used features: 25
[LightGBM] [Info] Start training from score -0.490206
[LightGBM] [Info] Start training from score -5.075174
[LightGBM] [Info] Start training from score -0.964300




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000196 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 632
[LightGBM] [Info] Number of data points in the train set: 320, number of used features: 25
[LightGBM] [Info] Start training from score -0.490206
[LightGBM] [Info] Start training from score -5.075174
[LightGBM] [Info] Start training from score -0.964300
Best Parameters for LightGBM: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000154 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 632
[LightGBM] [Info] Number of data points in the train set: 320, number of used features: 25
[LightGBM] [Info] Start training from score -0.