![](https://www.miskawaanhealth.com/wp-content/uploads/2021/05/chronic-kidney-disease-stages.jpg)

In [None]:
# import necessary libraries like numpy, pandas, pyplot and seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.express as px
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation/future warnings raised by
# pandas and scikit-learn further down — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw kidney-disease records from disk and preview the first rows
dataset_file = 'kidney_disease.csv'
df = pd.read_csv(dataset_file)
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [None]:
# Checking no of rows and columns of the dataset
df.shape

(400, 26)

# **Data Cleaning**

In [None]:
# Remove the 'id' column and preview the remaining columns
df = df.drop(columns=['id'])
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38,6000,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [None]:
# Replace the abbreviated headers with descriptive names.
# The names are applied by position, so the list must stay in the same order as the
# frame's columns (25 entries, target last).
readable_names = [
    'age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar',
    'red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria',
    'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium',
    'potassium', 'haemoglobin', 'packed_cell_volume', 'white_blood_cell_count',
    'red_blood_cell_count', 'hypertension', 'diabetes_mellitus',
    'coronary_artery_disease', 'appetite', 'peda_edema', 'aanemia', 'class',
]
df.columns = readable_names

In [None]:
df.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,aanemia,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38,6000,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [None]:
# checking info of columns and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      391 non-null    float64
 1   blood_pressure           388 non-null    float64
 2   specific_gravity         353 non-null    float64
 3   albumin                  354 non-null    float64
 4   sugar                    351 non-null    float64
 5   red_blood_cells          248 non-null    object 
 6   pus_cell                 335 non-null    object 
 7   pus_cell_clumps          396 non-null    object 
 8   bacteria                 396 non-null    object 
 9   blood_glucose_random     356 non-null    float64
 10  blood_urea               381 non-null    float64
 11  serum_creatinine         383 non-null    float64
 12  sodium                   313 non-null    float64
 13  potassium                312 non-null    float64
 14  haemoglobin              3

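According to the data description:
- The columns `pcv` (packed_cell_volume), `wc` (white_blood_cell_count) and `rc` (red_blood_cell_count) should be numerical, but they are currently stored as `object`, so they need to be converted to a numeric type.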

In [None]:
# packed_cell_volume, white_blood_cell_count and red_blood_cell_count are stored
# as object (text) columns; this helper turns such a column into a numeric one.

def convert_to_num_dtype(df, feature):
    """Convert column ``feature`` of ``df`` to a numeric dtype, in place.

    Entries that cannot be parsed as numbers become NaN (``errors='coerce'``).
    The frame is modified directly and nothing is returned.
    """
    numeric_values = pd.to_numeric(df[feature], errors='coerce')
    df[feature] = numeric_values

In [None]:
# Columns that were read as text but should hold numbers
features1 = ['packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count']

# Coerce each of them to a numeric dtype in place
for text_col in features1:
    convert_to_num_dtype(df, text_col)

In [None]:
# Rechecking the data types
df.dtypes

age                        float64
blood_pressure             float64
specific_gravity           float64
albumin                    float64
sugar                      float64
red_blood_cells             object
pus_cell                    object
pus_cell_clumps             object
bacteria                    object
blood_glucose_random       float64
blood_urea                 float64
serum_creatinine           float64
sodium                     float64
potassium                  float64
haemoglobin                float64
packed_cell_volume         float64
white_blood_cell_count     float64
red_blood_cell_count       float64
hypertension                object
diabetes_mellitus           object
coronary_artery_disease     object
appetite                    object
peda_edema                  object
aanemia                     object
class                       object
dtype: object

In [None]:
# Partition the column names by dtype: object columns are treated as categorical,
# every other dtype as numerical. Column order is preserved in both lists.

cat_cols = []
num_cols = []
for column_name, column_dtype in df.dtypes.items():
    if column_dtype == 'object':
        cat_cols.append(column_name)
    else:
        num_cols.append(column_name)


In [None]:
cat_cols

['red_blood_cells',
 'pus_cell',
 'pus_cell_clumps',
 'bacteria',
 'hypertension',
 'diabetes_mellitus',
 'coronary_artery_disease',
 'appetite',
 'peda_edema',
 'aanemia',
 'class']

In [None]:
num_cols

['age',
 'blood_pressure',
 'specific_gravity',
 'albumin',
 'sugar',
 'blood_glucose_random',
 'blood_urea',
 'serum_creatinine',
 'sodium',
 'potassium',
 'haemoglobin',
 'packed_cell_volume',
 'white_blood_cell_count',
 'red_blood_cell_count']

In [None]:
# Print the distinct values of every categorical column to spot dirty labels

for name in cat_cols:
    distinct_values = df[name].unique()
    print(f'{name} has {distinct_values} values\n')

red_blood_cells has [nan 'normal' 'abnormal'] values

pus_cell has ['normal' 'abnormal' nan] values

pus_cell_clumps has ['notpresent' 'present' nan] values

bacteria has ['notpresent' 'present' nan] values

hypertension has ['yes' 'no' nan] values

diabetes_mellitus has ['yes' 'no' ' yes' '\tno' '\tyes' nan] values

coronary_artery_disease has ['no' 'yes' '\tno' nan] values

appetite has ['good' 'poor' nan] values

peda_edema has ['no' 'yes' nan] values

aanemia has ['no' 'yes' nan] values

class has ['ckd' 'ckd\t' 'notckd'] values



In [None]:
# Normalise labels polluted with stray tab/space characters
# ('\tno', '\tyes', ' yes', 'ckd\t') in the categorical columns.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on a column selection: that chained in-place form is
# deprecated and leaves df unchanged when pandas Copy-on-Write is active.

df['diabetes_mellitus'] = df['diabetes_mellitus'].replace({'\tno':'no','\tyes':'yes',' yes':'yes'})
df['coronary_artery_disease'] = df['coronary_artery_disease'].replace({'\tno':'no'})
df['class'] = df['class'].replace({'ckd\t':'ckd'})

In [None]:
# After the cleanup, list the distinct values of each categorical column again

for name in cat_cols:
    remaining_values = df[name].unique()
    print(f'{name} has {remaining_values} values\n')

red_blood_cells has [nan 'normal' 'abnormal'] values

pus_cell has ['normal' 'abnormal' nan] values

pus_cell_clumps has ['notpresent' 'present' nan] values

bacteria has ['notpresent' 'present' nan] values

hypertension has ['yes' 'no' nan] values

diabetes_mellitus has ['yes' 'no' nan] values

coronary_artery_disease has ['no' 'yes' nan] values

appetite has ['good' 'poor' nan] values

peda_edema has ['no' 'yes' nan] values

aanemia has ['no' 'yes' nan] values

class has ['ckd' 'notckd'] values



In [None]:
df.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,aanemia,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


In [None]:
# Count the missing entries in each column, listed from most to fewest

missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

red_blood_cells            152
red_blood_cell_count       131
white_blood_cell_count     106
potassium                   88
sodium                      87
packed_cell_volume          71
pus_cell                    65
haemoglobin                 52
sugar                       49
specific_gravity            47
albumin                     46
blood_glucose_random        44
blood_urea                  19
serum_creatinine            17
blood_pressure              12
age                          9
bacteria                     4
pus_cell_clumps              4
hypertension                 2
diabetes_mellitus            2
coronary_artery_disease      2
appetite                     1
peda_edema                   1
aanemia                      1
class                        0
dtype: int64

# Feature Engineering

In [None]:
# filling null values, we will use two methods, random sampling for higher null values and
# mean/mode sampling for lower null values

# creating func for imputing random values
def random_value_imputation(col, data=None, random_state=None):
    """Fill the missing entries of one column with values drawn from its observed entries.

    Parameters
    ----------
    col : str
        Name of the column to impute.
    data : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df`` so that
        existing ``random_value_imputation(col)`` calls keep working.
    random_state : int or numpy random generator, optional
        Forwarded to ``Series.sample`` so the draw can be made reproducible.
        With ``None`` the draw uses NumPy's global random state, as before.

    Raises
    ------
    ValueError
        If the column has missing entries but no observed value to draw from.
    """
    frame = df if data is None else data
    missing_mask = frame[col].isna()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        return  # nothing to impute

    observed = frame[col].dropna()
    if observed.empty:
        raise ValueError(f"cannot impute '{col}': the column has no observed values")

    # Draw without replacement whenever possible (the original behaviour); only
    # fall back to drawing with replacement when gaps outnumber observed values,
    # a case in which sampling without replacement would raise.
    need_replacement = n_missing > len(observed)
    random_sample = observed.sample(n_missing, replace=need_replacement, random_state=random_state)

    # Re-label the drawn values with the positions of the gaps so that the
    # index-aligned assignment below fills exactly those rows.
    random_sample.index = frame.index[missing_mask.to_numpy()]
    frame.loc[missing_mask, col] = random_sample

# creating func for imputing most common value(modal value)
def impute_mode(col, data=None):
    """Fill the missing entries of one column with its most frequent value.

    Parameters
    ----------
    col : str
        Name of the column to impute.
    data : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df`` so that
        existing ``impute_mode(col)`` calls keep working.

    Raises
    ------
    ValueError
        If the column holds no observed value, so no mode exists.
    """
    frame = df if data is None else data
    modes = frame[col].mode()
    if modes.empty:
        # mode() ignores missing entries, so an all-missing column yields nothing
        raise ValueError(f"cannot impute '{col}': the column has no observed values")
    # When several values tie, the first one returned by mode() is used
    frame[col] = frame[col].fillna(modes.iloc[0])

In [None]:
# Missing entries per numerical column, largest first

numeric_missing = df[num_cols].isna().sum()
numeric_missing.sort_values(ascending=False)

red_blood_cell_count      131
white_blood_cell_count    106
potassium                  88
sodium                     87
packed_cell_volume         71
haemoglobin                52
sugar                      49
specific_gravity           47
albumin                    46
blood_glucose_random       44
blood_urea                 19
serum_creatinine           17
blood_pressure             12
age                         9
dtype: int64

In [None]:
# filling num_cols null values using random sampling method

# Seed NumPy's global random state first: Series.sample() draws from it when no
# random_state is given, so without a seed the imputed values (and every result
# derived from them) change on each run.
np.random.seed(42)

for col in num_cols:
    random_value_imputation(col)

In [None]:
# Confirm that no numerical column has missing entries left

numeric_missing_after = df[num_cols].isna().sum()
numeric_missing_after.sort_values(ascending=False)

age                       0
blood_pressure            0
specific_gravity          0
albumin                   0
sugar                     0
blood_glucose_random      0
blood_urea                0
serum_creatinine          0
sodium                    0
potassium                 0
haemoglobin               0
packed_cell_volume        0
white_blood_cell_count    0
red_blood_cell_count      0
dtype: int64

In [None]:
# Missing entries per categorical column, largest first

categorical_missing = df[cat_cols].isna().sum()
categorical_missing.sort_values(ascending=False)

red_blood_cells            152
pus_cell                    65
pus_cell_clumps              4
bacteria                     4
hypertension                 2
diabetes_mellitus            2
coronary_artery_disease      2
appetite                     1
peda_edema                   1
aanemia                      1
class                        0
dtype: int64

In [None]:
# filling "red_blood_cells" and "pus_cell" using random sampling method and rest of cat_cols using mode imputation

# Seed NumPy's global random state so the random draws below are reproducible;
# Series.sample() uses that state when no random_state is passed.
np.random.seed(42)

random_value_imputation('red_blood_cells')
random_value_imputation('pus_cell')

# The two columns filled above have no gaps left, so looping over all of
# cat_cols only changes the remaining columns.
for col in cat_cols:
    impute_mode(col)

In [None]:
# Confirm that no categorical column has missing entries left
categorical_missing_after = df[cat_cols].isna().sum()
categorical_missing_after.sort_values(ascending=False)


red_blood_cells            0
pus_cell                   0
pus_cell_clumps            0
bacteria                   0
hypertension               0
diabetes_mellitus          0
coronary_artery_disease    0
appetite                   0
peda_edema                 0
aanemia                    0
class                      0
dtype: int64

In [None]:
# Final check: missing entries per column across the whole dataframe
missing_after_imputation = df.isna().sum()
missing_after_imputation.sort_values(ascending=False)

age                        0
potassium                  0
aanemia                    0
peda_edema                 0
appetite                   0
coronary_artery_disease    0
diabetes_mellitus          0
hypertension               0
red_blood_cell_count       0
white_blood_cell_count     0
packed_cell_volume         0
haemoglobin                0
sodium                     0
blood_pressure             0
serum_creatinine           0
blood_urea                 0
blood_glucose_random       0
bacteria                   0
pus_cell_clumps            0
pus_cell                   0
red_blood_cells            0
sugar                      0
albumin                    0
specific_gravity           0
class                      0
dtype: int64

In [None]:
# Report how many distinct categories each categorical column holds

for name in cat_cols:
    category_count = df[name].nunique()
    print(f'{name} has {category_count} categories\n')

red_blood_cells has 2 categories

pus_cell has 2 categories

pus_cell_clumps has 2 categories

bacteria has 2 categories

hypertension has 2 categories

diabetes_mellitus has 2 categories

coronary_artery_disease has 2 categories

appetite has 2 categories

peda_edema has 2 categories

aanemia has 2 categories

class has 2 categories



In [None]:
# Encode every categorical column as integers with LabelEncoder.
# One fitted encoder is kept per column in `label_encoders`, so the original labels
# can be recovered later via label_encoders[col].inverse_transform(...); refitting a
# single shared encoder would keep only the mapping of the last column.
# LabelEncoder numbers the labels in sorted order, so for the target column
# 'ckd' becomes 0 and 'notckd' becomes 1.
from sklearn.preprocessing import LabelEncoder

label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# check chronic df after transforming cat cols
df.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,aanemia,class
0,48.0,80.0,1.02,1.0,0.0,1,1,0,0,121.0,...,44.0,7800.0,5.2,1,1,0,0,0,0,0
1,7.0,50.0,1.02,4.0,0.0,0,1,0,0,210.0,...,38.0,6000.0,2.8,0,0,0,0,0,0,0
2,62.0,80.0,1.01,2.0,3.0,1,1,0,0,423.0,...,31.0,7500.0,5.8,0,1,0,1,0,1,0
3,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,...,32.0,6700.0,3.9,1,0,0,1,1,1,0
4,51.0,80.0,1.01,2.0,0.0,1,1,0,0,106.0,...,35.0,7300.0,4.6,0,0,0,0,0,0,0


# **Feature Selection**

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Name the target column, then treat every other column as a predictor
dep_col = 'class'
ind_col = [name for name in df.columns if name != dep_col]

In [None]:
# x holds the predictor columns, y the target column
x, y = df[ind_col], df[dep_col]

In [None]:
x.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,aanemia
0,48.0,80.0,1.02,1.0,0.0,1,1,0,0,121.0,...,15.4,44.0,7800.0,5.2,1,1,0,0,0,0
1,7.0,50.0,1.02,4.0,0.0,0,1,0,0,210.0,...,11.3,38.0,6000.0,2.8,0,0,0,0,0,0
2,62.0,80.0,1.01,2.0,3.0,1,1,0,0,423.0,...,9.6,31.0,7500.0,5.8,0,1,0,1,0,1
3,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,...,11.2,32.0,6700.0,3.9,1,0,0,1,1,1
4,51.0,80.0,1.01,2.0,0.0,1,1,0,0,106.0,...,11.6,35.0,7300.0,4.6,0,0,0,0,0,0


In [None]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int64

In [None]:
# Score every predictor against the target with the chi-squared statistic.
# NOTE(review): chi2 is meant for non-negative, count-like features and its score
# scales with the magnitude of the feature, so columns measured in large units can
# dominate the ranking — consider scoring scaled features or another score function.
ordered_rank_feat = SelectKBest(k=20, score_func=chi2)
ordered_feat = ordered_rank_feat.fit(x, y)

In [None]:
ordered_feat.scores_

array([1.05094446e+02, 8.45320701e+01, 5.46290495e-03, 2.37563709e+02,
       1.03951890e+02, 9.36523517e+00, 1.43796356e+01, 2.52000000e+01,
       1.32000000e+01, 2.27416069e+03, 2.40004057e+03, 3.37782021e+02,
       2.75918550e+01, 7.08858656e+00, 1.26207844e+02, 3.36071581e+02,
       1.00360845e+04, 1.88382285e+01, 8.82000000e+01, 8.22000000e+01,
       2.04000000e+01, 4.92000000e+01, 4.56000000e+01, 3.60000000e+01])

In [None]:
# One-column frame holding the chi-squared score of each feature, in column order
datascores = pd.DataFrame({'Score': ordered_feat.scores_})
datascores

Unnamed: 0,Score
0,105.094446
1,84.53207
2,0.005463
3,237.563709
4,103.95189
5,9.365235
6,14.379636
7,25.2
8,13.2
9,2274.160692


In [None]:
# Put the feature names in their own one-column frame so they can be placed
# side by side with the scores
dfcols = pd.DataFrame(data=x.columns)
dfcols

Unnamed: 0,0
0,age
1,blood_pressure
2,specific_gravity
3,albumin
4,sugar
5,red_blood_cells
6,pus_cell
7,pus_cell_clumps
8,bacteria
9,blood_glucose_random


In [None]:
# Join names and scores column-wise, then give the two columns explicit labels
ranked_feat = pd.concat([dfcols, datascores], axis='columns')
ranked_feat = ranked_feat.set_axis(['Features','Score'], axis='columns')
ranked_feat

Unnamed: 0,Features,Score
0,age,105.094446
1,blood_pressure,84.53207
2,specific_gravity,0.005463
3,albumin,237.563709
4,sugar,103.95189
5,red_blood_cells,9.365235
6,pus_cell,14.379636
7,pus_cell_clumps,25.2
8,bacteria,13.2
9,blood_glucose_random,2274.160692


In [None]:
# Show the ten features with the highest scores
ranked_feat.nlargest(n=10, columns='Score')

Unnamed: 0,Features,Score
16,white_blood_cell_count,10036.084487
10,blood_urea,2400.040568
9,blood_glucose_random,2274.160692
11,serum_creatinine,337.782021
15,packed_cell_volume,336.071581
3,albumin,237.563709
14,haemoglobin,126.207844
0,age,105.094446
4,sugar,103.95189
18,hypertension,88.2


In [None]:
# Names of the ten highest-scoring features, as an array
top_features = ranked_feat.nlargest(n=10, columns='Score')
sel_col = top_features['Features'].values

In [None]:
# Keep only the ten selected feature columns
x_new = df.loc[:, sel_col]
x_new.head()

Unnamed: 0,white_blood_cell_count,blood_urea,blood_glucose_random,serum_creatinine,packed_cell_volume,albumin,haemoglobin,age,sugar,hypertension
0,7800.0,36.0,121.0,1.2,44.0,1.0,15.4,48.0,0.0,1
1,6000.0,18.0,210.0,0.8,38.0,4.0,11.3,7.0,0.0,0
2,7500.0,53.0,423.0,1.8,31.0,2.0,9.6,62.0,3.0,0
3,6700.0,56.0,117.0,3.8,32.0,4.0,11.2,48.0,0.0,1
4,7300.0,26.0,106.0,1.4,35.0,2.0,11.6,51.0,0.0,0


In [None]:
# Rescale every selected feature to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows before the train/test split that
# follows, so test-set minima/maxima influence the training data — consider fitting
# on the training portion only.
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
x_new = mms.fit(x_new).transform(x_new)

In [None]:
x_new

array([[0.23140496, 0.0885751 , 0.21153846, ..., 0.52272727, 0.        ,
        1.        ],
       [0.15702479, 0.042362  , 0.4017094 , ..., 0.05681818, 0.        ,
        0.        ],
       [0.21900826, 0.1322208 , 0.85683761, ..., 0.68181818, 0.6       ,
        0.        ],
       ...,
       [0.18181818, 0.06290116, 0.16666667, ..., 0.11363636, 0.        ,
        0.        ],
       [0.20661157, 0.12451861, 0.1965812 , ..., 0.17045455, 0.        ,
        0.        ],
       [0.19008264, 0.042362  , 0.23290598, ..., 0.63636364, 0.        ,
        0.        ]])

# **Train Test Split**

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the split
# identical between runs

from sklearn.model_selection import train_test_split

target = df['class']
x_train, x_test, y_train, y_test = train_test_split(
    x_new, target, test_size=0.30, random_state=13
)

# **Model Building**

In [None]:
# Baseline k-nearest-neighbours classifier with scikit-learn's default settings
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
fitted_knn = knn.fit(x_train,y_train)
train_predictions_knn, test_predictions_knn = (
    fitted_knn.predict(split) for split in (x_train, x_test)
)

# Accuracy on both splits, then the confusion matrix and per-class report on the test split

knn_train_accuracy = accuracy_score(y_train, train_predictions_knn)
knn_test_accuracy = accuracy_score(y_test, test_predictions_knn)
knn_confusion = confusion_matrix(y_test, test_predictions_knn)
knn_report = classification_report(y_test, test_predictions_knn)

print(f'Training Accuracy of KNN is {knn_train_accuracy}')
print(f'Test Accuracy of KNN is {knn_test_accuracy}\n')
print(f'Confusion Matrix : \n{knn_confusion}\n')
print(f'Classification Report : \n{knn_report}')

Training Accuracy of KNN is 0.9571428571428572
Test Accuracy of KNN is 0.9333333333333333

Confusion Matrix : 
[[72  6]
 [ 2 40]]

Classification Report : 
              precision    recall  f1-score   support

           0       0.97      0.92      0.95        78
           1       0.87      0.95      0.91        42

    accuracy                           0.93       120
   macro avg       0.92      0.94      0.93       120
weighted avg       0.94      0.93      0.93       120



In [None]:
# Tune k for KNN: odd neighbour counts from 1 to 19, compared by 5-fold
# cross-validated accuracy on the training split
kn = KNeighborsClassifier()
param_grid = {'n_neighbors' : np.arange(1,20,2)}
grid_search = GridSearchCV(estimator=kn, param_grid=param_grid, scoring='accuracy', cv=5)

# fit() returns the search object itself; its predict() uses the refitted best estimator
fitted_model_kn = grid_search.fit(x_train,y_train)
predictions_kn = fitted_model_kn.predict(x_test)

# Winning parameter combination and the corresponding refitted model
print("Best Parameters: ", grid_search.best_params_)
best_knn = grid_search.best_estimator_

# Accuracy of the best model on both splits, followed by test-split diagnostics
train_accuracy = best_knn.score(x_train, y_train)
test_accuracy = best_knn.score(x_test, y_test)
knn_cv_confusion = confusion_matrix(y_test, predictions_kn)
knn_cv_report = classification_report(y_test, predictions_kn)

print(f"Accuracy on Training Set: {train_accuracy*100:.2f}%")
print(f"Accuracy on Test Set: {test_accuracy*100:.2f}%")
print(f'Confusion Matrix : \n{knn_cv_confusion}\n')
print(f'Classification Report : \n{knn_cv_report}')

Best Parameters:  {'n_neighbors': 5}
Accuracy on Training Set: 95.71%
Accuracy on Test Set: 93.33%
Confusion Matrix : 
[[72  6]
 [ 2 40]]

Classification Report : 
              precision    recall  f1-score   support

           0       0.97      0.92      0.95        78
           1       0.87      0.95      0.91        42

    accuracy                           0.93       120
   macro avg       0.92      0.94      0.93       120
weighted avg       0.94      0.93      0.93       120



In [None]:
# Baseline decision tree with default hyper-parameters
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted tree (and therefore the metrics below)
# identical between runs; 13 matches the seed used for the train/test split.
dtc = DecisionTreeClassifier(random_state=13)
fitted_dtc = dtc.fit(x_train, y_train)
train_predictions_dtc = fitted_dtc.predict(x_train)
test_predictions_dtc = fitted_dtc.predict(x_test)

# accuracy score, confusion matrix and classification report of decision tree

print(f'Training Accuracy of DTC is {accuracy_score(y_train, train_predictions_dtc)}')
print(f'Test Accuracy of DTC is {accuracy_score(y_test, test_predictions_dtc)}\n')
print(f'Confusion Matrix : \n{confusion_matrix(y_test, test_predictions_dtc)}\n')
print(f'Classification Report : \n{classification_report(y_test, test_predictions_dtc)}')

Training Accuracy of DTC is 1.0
Test Accuracy of DTC is 0.9

Confusion Matrix : 
[[74  4]
 [ 8 34]]

Classification Report : 
              precision    recall  f1-score   support

           0       0.90      0.95      0.92        78
           1       0.89      0.81      0.85        42

    accuracy                           0.90       120
   macro avg       0.90      0.88      0.89       120
weighted avg       0.90      0.90      0.90       120



In [None]:
# Finding optimal parameters for Decision Tree with a 5-fold cross-validated grid search
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(1,10),
    # An integer min_samples_split must be at least 2. The grid used to start at 1,
    # so every candidate with that value failed to fit and only wasted search time.
    'min_samples_split': np.arange(2,10),
    'min_samples_leaf': np.arange(1,10),
}
# Fixed random_state so the search picks the same tree on every run
dt = DecisionTreeClassifier(random_state=13)

# Create GridSearchCV object
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='accuracy')

# Fit the model to the data; fit() returns the search object, whose predict()
# uses the best estimator refitted on the whole training split
fitted_model_dt = grid_search.fit(x_train,y_train)
predictions_dt = fitted_model_dt.predict(x_test)

# Print the best parameters found by the grid search
print("Best Parameters: ", grid_search.best_params_)

# Get the best model
best_tree = grid_search.best_estimator_

# Evaluate the best model on the training and test sets
train_accuracy = best_tree.score(x_train, y_train)
test_accuracy = best_tree.score(x_test, y_test)
print(f"Accuracy on Training Set: {train_accuracy*100:.2f}%")
print(f"Accuracy on Test Set: {test_accuracy*100:.2f}%")
print(f'Confusion Matrix : \n{confusion_matrix(y_test, predictions_dt)}\n')
print(f'Classification Report : \n{classification_report(y_test, predictions_dt)}')

Best Parameters:  {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 4}
Accuracy on Training Set: 98.57%
Accuracy on Test Set: 94.17%
Confusion Matrix : 
[[77  1]
 [ 6 36]]

Classification Report : 
              precision    recall  f1-score   support

           0       0.93      0.99      0.96        78
           1       0.97      0.86      0.91        42

    accuracy                           0.94       120
   macro avg       0.95      0.92      0.93       120
weighted avg       0.94      0.94      0.94       120



In [None]:
# import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# max_features='sqrt' replaces the former 'auto' setting: for classifiers 'auto'
# meant the same thing, and recent scikit-learn releases no longer accept it.
# A fixed random_state keeps the forest, and the metrics below, reproducible.
rf = RandomForestClassifier(n_estimators=200, max_features='sqrt', random_state=13)
fitted_rf = rf.fit(x_train,y_train)
train_predictions_rf = fitted_rf.predict(x_train)
test_predictions_rf = fitted_rf.predict(x_test)

# accuracy score, confusion matrix and classification report of random forest

print(f'Training Accuracy of RF is {accuracy_score(y_train, train_predictions_rf)}')
print(f'Test Accuracy of RF is {accuracy_score(y_test, test_predictions_rf)}\n')
print(f'Confusion Matrix : \n{confusion_matrix(y_test, test_predictions_rf)}\n')
print(f'Classification Report : \n{classification_report(y_test, test_predictions_rf)}')

Training Accuracy of RF is 1.0
Test Accuracy of RF is 0.9416666666666667

Confusion Matrix : 
[[76  2]
 [ 5 37]]

Classification Report : 
              precision    recall  f1-score   support

           0       0.94      0.97      0.96        78
           1       0.95      0.88      0.91        42

    accuracy                           0.94       120
   macro avg       0.94      0.93      0.93       120
weighted avg       0.94      0.94      0.94       120



In [None]:
# import xgboost and fit a gradient-boosted tree classifier with default settings
import xgboost as xgb

model = xgb.XGBClassifier()
model.fit(x_train, y_train)

train_pred_xgb = model.predict(x_train)
test_pred_xgb = model.predict(x_test)

# accuracy score, confusion matrix and classification report of XGBoost
# (the printed labels previously said "RF", although this model is XGBoost)

print(f'Training Accuracy of XGBoost is {accuracy_score(y_train, train_pred_xgb)}')
print(f'Test Accuracy of XGBoost is {accuracy_score(y_test, test_pred_xgb)}\n')
print(f'Confusion Matrix : \n{confusion_matrix(y_test, test_pred_xgb)}\n')
print(f'Classification Report : \n{classification_report(y_test, test_pred_xgb)}')

Training Accuracy of RF is 1.0
Test Accuracy of RF is 0.9416666666666667

Confusion Matrix : 
[[73  5]
 [ 2 40]]

Classification Report : 
              precision    recall  f1-score   support

           0       0.97      0.94      0.95        78
           1       0.89      0.95      0.92        42

    accuracy                           0.94       120
   macro avg       0.93      0.94      0.94       120
weighted avg       0.94      0.94      0.94       120

