### Import Relevant Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report

### Read, explore, clean dataset

In [3]:
df= pd.read_csv("kidney_disease.csv")

In [4]:
df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,classification
0,0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,,15.4,44,7800,5.2,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,18.0,0.8,,,11.3,38,6000,,ckd
2,2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,,,9.6,31,7500,,ckd
3,3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,ckd
4,4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,,11.6,35,7300,4.6,ckd


In [5]:
df= df.drop("id", axis=1)
df.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,classification
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,,15.4,44,7800,5.2,ckd
1,7.0,50.0,1.02,4.0,0.0,,18.0,0.8,,,11.3,38,6000,,ckd
2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,,,9.6,31,7500,,ckd
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,ckd
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,,11.6,35,7300,4.6,ckd


In [6]:
df.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo',
       'pcv', 'wc', 'rc', 'classification'],
      dtype='object')

In [164]:
#Split the data into predictors and target

X=df.drop("classification",axis=1)
y= df[["classification"]]

In [28]:
print(X.shape)
print(y.shape)

(400, 14)
(400, 1)


In [29]:
y.isnull().sum()

classification    0
dtype: int64

In [30]:
y.isna().sum()

classification    0
dtype: int64

In [165]:
#Know unique values in target column
y["classification"].unique()


array(['ckd', 'ckd\t', 'notckd'], dtype=object)

In [40]:
y[y["classification"]=="ckd\t"]

Unnamed: 0,classification
37,ckd\t
230,ckd\t


Assuming that ckd\t is a wrong entry, it is removed

In [41]:
y["classification"]=y["classification"].replace(["ckd\t"], "ckd")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y["classification"]=y["classification"].replace(["ckd\t"], "ckd")


In [42]:
y["classification"].unique()

array(['ckd', 'notckd'], dtype=object)

Imputing and one hot encoding in target column

In [43]:
y_si = SimpleImputer(strategy='most_frequent')
y_si.fit(y)

In [None]:
y_ohe= pd.get_dummies(y,"classification", drop_first=True)
y_ohe.head()

Explore X

In [46]:
X.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo',
       'pcv', 'wc', 'rc'],
      dtype='object')

In [60]:
X.isna().sum()

age       9
bp       12
sg       47
al       46
su       49
bgr      44
bu       19
sc       17
sod      87
pot      88
hemo     52
pcv      70
wc      105
rc      130
dtype: int64

In [62]:
X.isnull().sum()

age       9
bp       12
sg       47
al       46
su       49
bgr      44
bu       19
sc       17
sod      87
pot      88
hemo     52
pcv      70
wc      105
rc      130
dtype: int64

In [55]:
X.dtypes

age     float64
bp      float64
sg      float64
al      float64
su      float64
bgr     float64
bu      float64
sc      float64
sod     float64
pot     float64
hemo    float64
pcv      object
wc       object
rc       object
dtype: object

In [59]:
X.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,,15.4,44,7800,5.2
1,7.0,50.0,1.02,4.0,0.0,,18.0,0.8,,,11.3,38,6000,
2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,,,9.6,31,7500,
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,,11.6,35,7300,4.6


Divide to numeric and categorical

In [92]:
X_numeric= X.select_dtypes(include=[np.number])
X_cat= X.select_dtypes(exclude=[np.number])

In [93]:
X_numeric.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo'], dtype='object')

In [94]:
X_cat.columns

Index(['pcv', 'wc', 'rc'], dtype='object')

In [95]:
num_si = SimpleImputer(strategy='mean')
num_si.fit(X_numeric)

In [96]:
X_imp_num = pd.DataFrame(num_si.transform(X_numeric), columns = X_numeric.columns)
X_imp_num.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,137.528754,4.627244,15.4
1,7.0,50.0,1.02,4.0,0.0,148.036517,18.0,0.8,137.528754,4.627244,11.3
2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,137.528754,4.627244,9.6
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,137.528754,4.627244,11.6


In [97]:
scaler = StandardScaler()
scaler.fit(X_imp_num,)

In [98]:
X_sca_imp_num = pd.DataFrame(scaler.transform(X_imp_num), columns = X_imp_num.columns)

In [99]:
cat_si = SimpleImputer(strategy='most_frequent')
cat_si.fit(X_cat)

In [100]:
X_imp_cat = pd.DataFrame(cat_si.transform(X_cat), columns = X_cat.columns)
X_imp_cat.head()

Unnamed: 0,pcv,wc,rc
0,44,7800,5.2
1,38,6000,5.2
2,31,7500,5.2
3,32,6700,3.9
4,35,7300,4.6


It is seen that the columns that were considered as categorical should also be numeric. Hence convert them to numerics

But some wrong entry found in cells of these columns. Hence replacing it by suitable data suitable 

In [69]:
X_imp_cat["pcv"]= pd.to_numeric(X_imp_cat["pcv"])

ValueError: Unable to parse string "	?" at position 66

In [101]:
X_imp_cat["pcv"][66]="44"

In [79]:
X_imp_cat["wc"]= pd.to_numeric(X_imp_cat["wc"])

ValueError: Unable to parse string "	?" at position 185

In [102]:
X_imp_cat["wc"][185]="6000"

In [86]:
X_imp_cat["rc"]= pd.to_numeric(X_imp_cat["rc"])

ValueError: Unable to parse string "	?" at position 162

In [103]:
X_imp_cat["rc"][162]="4.6"

In [166]:
#Covert to numeric data
X_imp_cat["pcv"]= pd.to_numeric(X_imp_cat["pcv"])
X_imp_cat["wc"]= pd.to_numeric(X_imp_cat["wc"])
X_imp_cat["rc"]= pd.to_numeric(X_imp_cat["rc"])

In [105]:
scaler1=StandardScaler()
scaler1.fit(X_imp_cat)

In [108]:
X_sca_imp_cat = pd.DataFrame(scaler1.transform(X_imp_cat), columns = X_imp_cat.columns)

In [109]:
X_final = pd.concat([X_sca_imp_num, X_sca_imp_cat], axis = 1)
X_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     400 non-null    float64
 1   bp      400 non-null    float64
 2   sg      400 non-null    float64
 3   al      400 non-null    float64
 4   su      400 non-null    float64
 5   bgr     400 non-null    float64
 6   bu      400 non-null    float64
 7   sc      400 non-null    float64
 8   sod     400 non-null    float64
 9   pot     400 non-null    float64
 10  hemo    400 non-null    float64
 11  pcv     400 non-null    float64
 12  wc      400 non-null    float64
 13  rc      400 non-null    float64
dtypes: float64(14)
memory usage: 43.9 KB


In [167]:
## Split into train test
X_train, X_test, y_train, y_test = train_test_split(X_final, y_ohe, test_size = 0.2)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(320, 14)
(80, 14)
(320, 1)
(80, 1)


### PCA


In [113]:
pca_model = PCA()

In [122]:
pca_model.fit(X_train)

In [115]:
pca_model.explained_variance_ratio_

array([0.32494665, 0.12186284, 0.09758176, 0.0842209 , 0.07047492,
       0.06183823, 0.05288292, 0.04159894, 0.03699307, 0.03126024,
       0.02633531, 0.02209807, 0.01694184, 0.01096432])

In [123]:
np.cumsum(pca_model.explained_variance_ratio_)

array([0.32494665, 0.44680949, 0.54439124, 0.62861214, 0.69908706,
       0.76092529, 0.81380821, 0.85540715, 0.89240022, 0.92366046,
       0.94999577, 0.97209384, 0.98903568, 1.        ])

In [145]:
grid={"n_components":[4,9,12]}

In [146]:
gcv_pca_model= GridSearchCV(estimator=pca_model, param_grid=grid, verbose=0)

In [147]:
gcv_pca_model.fit(X_train)

In [148]:
gcv_pca_model.best_estimator_

In [150]:
pca_new_model= PCA(n_components=4)

In [151]:
pca_new_model.fit(X_train)

In [152]:
pca_new_model.explained_variance_ratio_

array([0.32494665, 0.12186284, 0.09758176, 0.0842209 ])

In [153]:
np.cumsum(pca_new_model.explained_variance_ratio_)

array([0.32494665, 0.44680949, 0.54439124, 0.62861214])

In [155]:
pca_new_model.get_feature_names_out

<bound method ClassNamePrefixFeaturesOutMixin.get_feature_names_out of PCA(n_components=4)>

### PCA model with  60% variability explained

In [159]:
pca_60= PCA(n_components=0.6, svd_solver="full")

In [161]:
pca_60.fit(X_train)

In [162]:
pca_60.explained_variance_ratio_

array([0.32494665, 0.12186284, 0.09758176, 0.0842209 ])

In [163]:
np.cumsum(pca_60.explained_variance_ratio_)

array([0.32494665, 0.44680949, 0.54439124, 0.62861214])

### Observations

4 Principal components can cover more than 60% of variability