In [218]:
import pandas as pd
import numpy as np
!pip install category_encoders

In [256]:
# Read dataset 
df = pd.read_csv('kidney_disease.csv')

In [257]:
# Describe dataset
df

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd


Dealing with Null Data

In [258]:
def normalization_element(x):
    if type(x) == str:
        x = x.replace('\t', '')
        try:
            x = float(x)
        except Exception as e:
            x = np.nan
    else:
        x = np.nan
    return x

In [259]:
df['bp'] = df['bp'].fillna(value=df['bp'].mean())
df['age'] = df['age'].fillna(value=df['age'].mean())
df['sg'] = df['sg'].fillna(value=df['sg'].mean())
df['al'] = df['al'].fillna(method= 'pad')
df['su'] = df['su'].fillna(method= 'pad')
df['rbc'] = df['rbc'].fillna(df['rbc'].value_counts().idxmax())
df['pc'] = df['pc'].fillna(df['pc'].value_counts().idxmax())
df['pcc'] = df['pcc'].fillna(df['pcc'].value_counts().idxmax())
df['bgr'] = df['bgr'].fillna(value=df['bgr'].mean())
df['bu'] = df['bu'].fillna(value=df['bu'].mean())
df['sc'] = df['sc'].fillna(value=df['sc'].mean())
df['sod'] = df['sod'].fillna(value=df['sod'].mean())
df['pot'] = df['pot'].fillna(value=df['pot'].mean())
df['hemo'] = df['hemo'].fillna(value=df['hemo'].mean())
df['pcv'] = df['pcv'].fillna(method= 'pad')
df['rc'] = df['rc'].apply(normalization_element)
df['rc'] = df['rc'].fillna(value=df['rc'].mean())
df['htn'] = df['htn'].fillna(df['htn'].value_counts().idxmax())
df['dm'] = df['dm'].fillna(df['dm'].value_counts().idxmax())
df['cad'] = df['cad'].fillna(df['cad'].value_counts().idxmax())
df['appet'] = df['appet'].fillna(df['appet'].value_counts().idxmax())
df['pe'] = df['pe'].fillna(df['pe'].value_counts().idxmax())
df['ane'] = df['ane'].fillna(df['ane'].value_counts().idxmax())
df['wc'] = df['wc'].apply(normalization_element)
df['wc'] = df['wc'].fillna(df['wc'].value_counts().idxmax())    
df['ba'] = df['ba'].fillna(df['ba'].value_counts().idxmax())    

Encoding data

In [342]:
# Separating target from data
X = df.iloc[:,:-1]
y= df.iloc[:, -1]

In [343]:
# Define categorical columns
categorical_columns

['rbc', 'pc', 'ba', 'pcc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'wc']

In [347]:
# Encoding X data
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder
encoder=ce.OneHotEncoder(cols=categorical_columns,handle_unknown='return_nan',return_df=True,use_cat_names=True)
X_encoded = encoder.fit_transform(X)

In [348]:
X_encoded

Unnamed: 0,id,age,bp,sg,al,su,rbc_normal,rbc_abnormal,pc_normal,pc_abnormal,...,dm_\tyes,cad_no,cad_yes,cad_\tno,appet_good,appet_poor,pe_no,pe_yes,ane_no,ane_yes
0,0,48.0,80.0,1.020,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,1,7.0,50.0,1.020,4.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,2,62.0,80.0,1.010,2.0,3.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,3,48.0,70.0,1.005,4.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,4,51.0,80.0,1.010,2.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
396,396,42.0,70.0,1.025,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
397,397,12.0,80.0,1.020,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
398,398,17.0,60.0,1.025,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [349]:
# Making maping of columns
maping_y = {'notckd': 0, 'ckd': 1}

In [350]:
def normalize_y(x):
    """
    Function for remove nondesired characters from data
    """
    x = x.replace('\t','')
    return x

In [351]:
# Mapping and cleaning data
y = y.apply(lambda x: maping_y[normalize_y(x)])

In [352]:
# Split data in test and train part
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y)

In [353]:
# Create an instance of the SimpleImputer with the strategy set as median.
from sklearn.impute import SimpleImputer
si_mean = SimpleImputer(strategy='mean')
si_median = SimpleImputer(strategy='median')
si_most_frequent = SimpleImputer(strategy='most_frequent')
si_constant = SimpleImputer(strategy='constant', fill_value=0)

In [354]:
# Import Linear Regression 
from sklearn.linear_model import LinearRegression 

In [355]:
# Create an instance of the InteractiveImputer with the LinearReagression as the estimator model.

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
ii = IterativeImputer(estimator=LinearRegression())

In [356]:
# Create an instance of the KNNImputer.
from sklearn.impute import KNNImputer

knni = KNNImputer()

In [359]:
# Train every imputer on the train data set.
si_mean.fit(X_train)
si_median.fit(X_train)
si_most_frequent.fit(X_train)
si_constant.fit(X_train)
knni.fit(X_train)

In [None]:
from sklearn.metrics import accuracy_score

In [360]:
y_pred = si_mean.predict(X_test)

AttributeError: 'SimpleImputer' object has no attribute 'predict'

In [363]:
si_mean.transform(X_test)

ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: '\t?'