In [135]:
import pandas as pd
pd.set_option('display.max_columns', None)

In [136]:
dataset = pd.read_csv('/content/kidney_disease.csv')

In [137]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [138]:
dataset.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd


In [139]:
dataset.columns

Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [140]:
dataset[["al"]].value_counts()

Unnamed: 0_level_0,count
al,Unnamed: 1_level_1
0.0,199
1.0,44
2.0,43
3.0,43
4.0,24
5.0,1


In [141]:
dataset[["al"]].isnull().sum()

Unnamed: 0,0
al,46


In [142]:
from sklearn.preprocessing import StandardScaler
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["al"]])

In [143]:
# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

In [144]:
# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)

In [145]:
data_imputed

array([[1.        ],
       [4.        ],
       [2.        ],
       [4.        ],
       [2.        ],
       [3.        ],
       [0.        ],
       [2.        ],
       [3.        ],
       [2.        ],
       [2.        ],
       [3.        ],
       [3.        ],
       [1.01694915],
       [3.        ],
       [3.        ],
       [2.        ],
       [1.01694915],
       [0.        ],
       [1.        ],
       [2.        ],
       [1.01694915],
       [4.        ],
       [0.        ],
       [4.        ],
       [0.        ],
       [0.        ],
       [3.        ],
       [1.        ],
       [1.        ],
       [1.01694915],
       [3.        ],
       [1.        ],
       [2.        ],
       [1.        ],
       [2.        ],
       [1.        ],
       [1.01694915],
       [3.        ],
       [2.        ],
       [2.        ],
       [0.        ],
       [0.        ],
       [1.        ],
       [3.        ],
       [3.        ],
       [0.        ],
       [3.   

In [146]:
# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["al"])

In [147]:
imputed_df

Unnamed: 0,al
0,1.0
1,4.0
2,2.0
3,4.0
4,2.0
...,...
395,0.0
396,0.0
397,0.0
398,0.0


In [148]:
imputed_df.value_counts()

Unnamed: 0_level_0,count
al,Unnamed: 1_level_1
0.0,199
1.016949,46
1.0,44
2.0,43
3.0,43
4.0,24
5.0,1


In [149]:
import numpy as np

In [150]:
# Round & clip 'al'
dataset[["al"]] = np.clip(np.round(imputed_df), 0, 5).astype(int)

In [151]:
dataset[["al"]].value_counts()

Unnamed: 0_level_0,count
al,Unnamed: 1_level_1
0,199
1,90
2,43
3,43
4,24
5,1


In [152]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              400 non-null    int64  
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [153]:
from sklearn.preprocessing import StandardScaler
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["age"]])

In [154]:
# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

In [155]:
# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)

In [156]:
data_imputed

array([[48.        ],
       [ 7.        ],
       [62.        ],
       [48.        ],
       [51.        ],
       [60.        ],
       [68.        ],
       [24.        ],
       [52.        ],
       [53.        ],
       [50.        ],
       [63.        ],
       [68.        ],
       [68.        ],
       [68.        ],
       [40.        ],
       [47.        ],
       [47.        ],
       [60.        ],
       [62.        ],
       [61.        ],
       [60.        ],
       [48.        ],
       [21.        ],
       [42.        ],
       [61.        ],
       [75.        ],
       [69.        ],
       [75.        ],
       [68.        ],
       [51.48337596],
       [73.        ],
       [61.        ],
       [60.        ],
       [70.        ],
       [65.        ],
       [76.        ],
       [72.        ],
       [69.        ],
       [82.        ],
       [46.        ],
       [45.        ],
       [47.        ],
       [35.        ],
       [54.        ],
       [54

In [157]:
# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["age"])

In [158]:
imputed_df

Unnamed: 0,age
0,48.0
1,7.0
2,62.0
3,48.0
4,51.0
...,...
395,55.0
396,42.0
397,12.0
398,17.0


In [159]:
dataset[["age"]] =np.round(imputed_df).astype(int)

In [160]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48,80.0,1.020,1,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7,50.0,1.020,4,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62,80.0,1.010,2,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48,70.0,1.005,4,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51,80.0,1.010,2,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80.0,1.020,0,0.0,normal,normal,notpresent,notpresent,140.0,49.0,0.5,150.0,4.9,15.7,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42,70.0,1.025,0,0.0,normal,normal,notpresent,notpresent,75.0,31.0,1.2,141.0,3.5,16.5,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12,80.0,1.020,0,0.0,normal,normal,notpresent,notpresent,100.0,26.0,0.6,137.0,4.4,15.8,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17,60.0,1.025,0,0.0,normal,normal,notpresent,notpresent,114.0,50.0,1.0,135.0,4.9,14.2,51,7200,5.9,no,no,no,good,no,no,notckd


In [161]:
dataset[["su"]].value_counts()

Unnamed: 0_level_0,count
su,Unnamed: 1_level_1
0.0,290
2.0,18
3.0,14
1.0,13
4.0,13
5.0,3


In [162]:
# Round & clip 'al'
dataset[["su"]] = np.clip(np.round(imputed_df), 0, 5).astype(int)


In [163]:
dataset[["su"]].value_counts()

Unnamed: 0_level_0,count
su,Unnamed: 1_level_1
5,397
2,1
3,1
4,1


In [164]:
numeric_cols = [
    'age','bp','sg','al','su','bgr','bu','sc','sod','pot','hemo','pcv','wc','rc'
]

In [165]:
print(dataset[numeric_cols].isnull().sum())  # Expect all zeros


age       0
bp       12
sg       47
al        0
su        0
bgr      44
bu       19
sc       17
sod      87
pot      88
hemo     52
pcv      70
wc      105
rc      130
dtype: int64


In [166]:
# Step 1: Clean invalid values
dataset['bp'] = dataset['bp'].replace(['?', '', ' '], np.nan)
dataset['bp'] = pd.to_numeric(dataset['bp'], errors='coerce')

In [167]:
# Step 3: Scaling and KNN Imputation
scaler = StandardScaler()
bp_scaled = scaler.fit_transform(dataset[['bp']])

imputer = KNNImputer(n_neighbors=3)
bp_imputed_scaled = imputer.fit_transform(bp_scaled)

bp_imputed = scaler.inverse_transform(bp_imputed_scaled)

In [168]:
# Step 4: Replace and round
dataset['bp'] = np.clip(np.round(bp_imputed), 50, 180).astype(int)

In [169]:
print(dataset['bp'].isnull().sum())  # Should be 0
print(dataset['bp'].describe())

0
count    400.000000
mean      76.455000
std       13.476536
min       50.000000
25%       70.000000
50%       78.000000
75%       80.000000
max      180.000000
Name: bp, dtype: float64


In [170]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48,80,1.020,1,5,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7,50,1.020,4,5,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62,80,1.010,2,5,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48,70,1.005,4,5,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51,80,1.010,2,5,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,5,normal,normal,notpresent,notpresent,140.0,49.0,0.5,150.0,4.9,15.7,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42,70,1.025,0,5,normal,normal,notpresent,notpresent,75.0,31.0,1.2,141.0,3.5,16.5,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12,80,1.020,0,5,normal,normal,notpresent,notpresent,100.0,26.0,0.6,137.0,4.4,15.8,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17,60,1.025,0,5,normal,normal,notpresent,notpresent,114.0,50.0,1.0,135.0,4.9,14.2,51,7200,5.9,no,no,no,good,no,no,notckd


In [171]:
# Check summary statistics
print(dataset['sg'].describe())

# Check unique values
print("Unique SG values:", dataset['sg'].unique())

# Count missing values
print("Missing values in sg:", dataset['sg'].isnull().sum())

count    353.000000
mean       1.017408
std        0.005717
min        1.005000
25%        1.010000
50%        1.020000
75%        1.020000
max        1.025000
Name: sg, dtype: float64
Unique SG values: [1.02  1.01  1.005 1.015   nan 1.025]
Missing values in sg: 47


In [172]:

dataset['sg'] = dataset['sg'].replace(['?', '', ' '], np.nan)
dataset['sg'] = pd.to_numeric(dataset['sg'], errors='coerce')

In [173]:
dataset['sg'].isnull().sum()

np.int64(47)

In [174]:
# Step 4.1: Scaling
scaler = StandardScaler()
sg_scaled = scaler.fit_transform(dataset[['sg']])

# Step 4.2: Apply KNN Imputer
imputer = KNNImputer(n_neighbors=3)
sg_imputed_scaled = imputer.fit_transform(sg_scaled)

# Step 4.3: Inverse scaling to original SG range
sg_imputed = scaler.inverse_transform(sg_imputed_scaled)

# Replace original SG column with imputed data
dataset['sg'] = sg_imputed

In [175]:
sg_imputed

array([[1.02      ],
       [1.02      ],
       [1.01      ],
       [1.005     ],
       [1.01      ],
       [1.015     ],
       [1.01      ],
       [1.015     ],
       [1.015     ],
       [1.02      ],
       [1.01      ],
       [1.01      ],
       [1.015     ],
       [1.01740793],
       [1.01      ],
       [1.015     ],
       [1.015     ],
       [1.01740793],
       [1.025     ],
       [1.015     ],
       [1.015     ],
       [1.01740793],
       [1.025     ],
       [1.01      ],
       [1.015     ],
       [1.025     ],
       [1.015     ],
       [1.01      ],
       [1.01740793],
       [1.005     ],
       [1.01740793],
       [1.015     ],
       [1.01      ],
       [1.02      ],
       [1.01      ],
       [1.02      ],
       [1.015     ],
       [1.01740793],
       [1.02      ],
       [1.01      ],
       [1.01      ],
       [1.01      ],
       [1.01      ],
       [1.01      ],
       [1.01      ],
       [1.02      ],
       [1.015     ],
       [1.01 

In [176]:
# Step 5: Round to nearest valid SG level
sg_levels = [1.005, 1.010, 1.015, 1.020, 1.025]
dataset['sg'] = dataset['sg'].apply(lambda x: min(sg_levels, key=lambda v: abs(v - x)))

In [177]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48,80,1.020,1,5,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7,50,1.020,4,5,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62,80,1.010,2,5,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48,70,1.005,4,5,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51,80,1.010,2,5,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,5,normal,normal,notpresent,notpresent,140.0,49.0,0.5,150.0,4.9,15.7,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42,70,1.025,0,5,normal,normal,notpresent,notpresent,75.0,31.0,1.2,141.0,3.5,16.5,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12,80,1.020,0,5,normal,normal,notpresent,notpresent,100.0,26.0,0.6,137.0,4.4,15.8,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17,60,1.025,0,5,normal,normal,notpresent,notpresent,114.0,50.0,1.0,135.0,4.9,14.2,51,7200,5.9,no,no,no,good,no,no,notckd


In [178]:
print(dataset[numeric_cols].isnull().sum())

age       0
bp        0
sg        0
al        0
su        0
bgr      44
bu       19
sc       17
sod      87
pot      88
hemo     52
pcv      70
wc      105
rc      130
dtype: int64


In [179]:
print(dataset['bgr'].describe())

count    356.000000
mean     148.036517
std       79.281714
min       22.000000
25%       99.000000
50%      121.000000
75%      163.000000
max      490.000000
Name: bgr, dtype: float64


In [180]:
# Step 1: Clean placeholders
dataset['bgr'] = dataset['bgr'].replace(['?', '', ' '], np.nan)
dataset['bgr'] = pd.to_numeric(dataset['bgr'], errors='coerce')

In [181]:

# Step 3: Scaling
scaler = StandardScaler()
bgr_scaled = scaler.fit_transform(dataset[['bgr']])

# Step 4: KNN Imputation
imputer = KNNImputer(n_neighbors=3)
bgr_imputed_scaled = imputer.fit_transform(bgr_scaled)

# Step 5: Inverse scaling
bgr_imputed = scaler.inverse_transform(bgr_imputed_scaled)

In [182]:
# Step 6: Replace original column
dataset['bgr'] = bgr_imputed

# Step 7: Clip and round to realistic values
dataset['bgr'] = np.clip(np.round(dataset['bgr']), 50, 500).astype(int)

In [183]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48,80,1.020,1,5,,normal,notpresent,notpresent,121,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7,50,1.020,4,5,,normal,notpresent,notpresent,148,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62,80,1.010,2,5,normal,normal,notpresent,notpresent,423,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48,70,1.005,4,5,normal,abnormal,present,notpresent,117,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51,80,1.010,2,5,normal,normal,notpresent,notpresent,106,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,5,normal,normal,notpresent,notpresent,140,49.0,0.5,150.0,4.9,15.7,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42,70,1.025,0,5,normal,normal,notpresent,notpresent,75,31.0,1.2,141.0,3.5,16.5,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12,80,1.020,0,5,normal,normal,notpresent,notpresent,100,26.0,0.6,137.0,4.4,15.8,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17,60,1.025,0,5,normal,normal,notpresent,notpresent,114,50.0,1.0,135.0,4.9,14.2,51,7200,5.9,no,no,no,good,no,no,notckd


In [184]:
print(dataset[numeric_cols].isnull().sum())

age       0
bp        0
sg        0
al        0
su        0
bgr       0
bu       19
sc       17
sod      87
pot      88
hemo     52
pcv      70
wc      105
rc      130
dtype: int64


In [185]:
# Step 1: Clean placeholders
dataset['bu'] = dataset['bu'].replace(['?', '', ' '], np.nan)
dataset['bu'] = pd.to_numeric(dataset['bu'], errors='coerce')

In [186]:
# Step 3: Scaling
scaler = StandardScaler()
bu_scaled = scaler.fit_transform(dataset[['bu']])

# Step 4: KNN Imputation
imputer = KNNImputer(n_neighbors=3)
bu_imputed_scaled = imputer.fit_transform(bu_scaled)

# Step 5: Inverse scaling
bu_imputed = scaler.inverse_transform(bu_imputed_scaled)

# Step 6: Replace original column
dataset['bu'] = bu_imputed

In [187]:
# Step 7: Clip and round to realistic values
dataset['bu'] = np.clip(np.round(dataset['bu']), 5, 200).astype(int)

In [188]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48,80,1.020,1,5,,normal,notpresent,notpresent,121,36,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7,50,1.020,4,5,,normal,notpresent,notpresent,148,18,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62,80,1.010,2,5,normal,normal,notpresent,notpresent,423,53,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48,70,1.005,4,5,normal,abnormal,present,notpresent,117,56,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51,80,1.010,2,5,normal,normal,notpresent,notpresent,106,26,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,5,normal,normal,notpresent,notpresent,140,49,0.5,150.0,4.9,15.7,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42,70,1.025,0,5,normal,normal,notpresent,notpresent,75,31,1.2,141.0,3.5,16.5,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12,80,1.020,0,5,normal,normal,notpresent,notpresent,100,26,0.6,137.0,4.4,15.8,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17,60,1.025,0,5,normal,normal,notpresent,notpresent,114,50,1.0,135.0,4.9,14.2,51,7200,5.9,no,no,no,good,no,no,notckd


In [189]:

# Step 1: Clean invalid placeholders
dataset['sc'] = dataset['sc'].replace(['?', '', ' '], np.nan)
dataset['sc'] = pd.to_numeric(dataset['sc'], errors='coerce')

In [190]:
# Step 3: Scale data
scaler = StandardScaler()
sc_scaled = scaler.fit_transform(dataset[['sc']])

# Step 4: KNN Imputation
imputer = KNNImputer(n_neighbors=3)
sc_imputed_scaled = imputer.fit_transform(sc_scaled)

# Step 5: Inverse scaling
sc_imputed = scaler.inverse_transform(sc_imputed_scaled)

# Step 6: Replace original column
dataset['sc'] = sc_imputed

In [191]:
# Step 7: Clip to realistic medical range
dataset['sc'] = np.clip(np.round(dataset['sc'], 2), 0.4, 15.0)
dataset['sc'] = dataset['sc'].round(0).astype(int)


In [192]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48,80,1.020,1,5,,normal,notpresent,notpresent,121,36,1,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7,50,1.020,4,5,,normal,notpresent,notpresent,148,18,1,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62,80,1.010,2,5,normal,normal,notpresent,notpresent,423,53,2,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48,70,1.005,4,5,normal,abnormal,present,notpresent,117,56,4,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51,80,1.010,2,5,normal,normal,notpresent,notpresent,106,26,1,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,5,normal,normal,notpresent,notpresent,140,49,0,150.0,4.9,15.7,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42,70,1.025,0,5,normal,normal,notpresent,notpresent,75,31,1,141.0,3.5,16.5,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12,80,1.020,0,5,normal,normal,notpresent,notpresent,100,26,1,137.0,4.4,15.8,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17,60,1.025,0,5,normal,normal,notpresent,notpresent,114,50,1,135.0,4.9,14.2,51,7200,5.9,no,no,no,good,no,no,notckd


In [193]:
# Step 1: Clean invalid placeholders
dataset['sod'] = dataset['sod'].replace(['?', '', ' '], np.nan)
dataset['sod'] = pd.to_numeric(dataset['sod'], errors='coerce')

In [194]:
# Step 3: Scale data
scaler = StandardScaler()
sod_scaled = scaler.fit_transform(dataset[['sod']])

# Step 4: KNN Imputer
imputer = KNNImputer(n_neighbors=3)
sod_imputed_scaled = imputer.fit_transform(sod_scaled)

# Step 5: Inverse scaling
sod_imputed = scaler.inverse_transform(sod_imputed_scaled)

# Step 6: Update dataset
dataset['sod'] = sod_imputed

In [195]:
# Step 7: Optional rounding to integer
dataset['sod'] = dataset['sod'].round(0).astype(int)

In [196]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48,80,1.020,1,5,,normal,notpresent,notpresent,121,36,1,138,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7,50,1.020,4,5,,normal,notpresent,notpresent,148,18,1,138,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62,80,1.010,2,5,normal,normal,notpresent,notpresent,423,53,2,138,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48,70,1.005,4,5,normal,abnormal,present,notpresent,117,56,4,111,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51,80,1.010,2,5,normal,normal,notpresent,notpresent,106,26,1,138,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,5,normal,normal,notpresent,notpresent,140,49,0,150,4.9,15.7,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42,70,1.025,0,5,normal,normal,notpresent,notpresent,75,31,1,141,3.5,16.5,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12,80,1.020,0,5,normal,normal,notpresent,notpresent,100,26,1,137,4.4,15.8,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17,60,1.025,0,5,normal,normal,notpresent,notpresent,114,50,1,135,4.9,14.2,51,7200,5.9,no,no,no,good,no,no,notckd


In [197]:
# Step 1: Replace invalid placeholders
dataset['pot'] = dataset['pot'].replace(['?', '', ' '], np.nan)
dataset['pot'] = pd.to_numeric(dataset['pot'], errors='coerce')

In [198]:
# Step 3: Scaling
scaler = StandardScaler()
pot_scaled = scaler.fit_transform(dataset[['pot']])

# Step 4: KNN Imputation
imputer = KNNImputer(n_neighbors=3)
pot_imputed_scaled = imputer.fit_transform(pot_scaled)

# Step 5: Inverse scaling
pot_imputed = scaler.inverse_transform(pot_imputed_scaled)

# Step 6: Replace back into dataset
dataset['pot'] = pot_imputed

In [199]:

# Step 7: Round to 1 decimal place
dataset['pot'] = dataset['pot'].round(0).astype(int)

In [200]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48,80,1.020,1,5,,normal,notpresent,notpresent,121,36,1,138,5,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7,50,1.020,4,5,,normal,notpresent,notpresent,148,18,1,138,5,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62,80,1.010,2,5,normal,normal,notpresent,notpresent,423,53,2,138,5,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48,70,1.005,4,5,normal,abnormal,present,notpresent,117,56,4,111,2,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51,80,1.010,2,5,normal,normal,notpresent,notpresent,106,26,1,138,5,11.6,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,5,normal,normal,notpresent,notpresent,140,49,0,150,5,15.7,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42,70,1.025,0,5,normal,normal,notpresent,notpresent,75,31,1,141,4,16.5,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12,80,1.020,0,5,normal,normal,notpresent,notpresent,100,26,1,137,4,15.8,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17,60,1.025,0,5,normal,normal,notpresent,notpresent,114,50,1,135,5,14.2,51,7200,5.9,no,no,no,good,no,no,notckd


In [201]:
# Step 1: Clean invalid placeholders
dataset['hemo'] = dataset['hemo'].replace(['?', '', ' '], np.nan)
dataset['hemo'] = pd.to_numeric(dataset['hemo'], errors='coerce')

In [202]:
# Step 3: Scale data
scaler = StandardScaler()
hemo_scaled = scaler.fit_transform(dataset[['hemo']])

# Step 4: KNN Imputation
imputer = KNNImputer(n_neighbors=3)
hemo_imputed_scaled = imputer.fit_transform(hemo_scaled)

# Step 5: Inverse scaling
hemo_imputed = scaler.inverse_transform(hemo_imputed_scaled)

# Step 6: Update dataset
dataset['hemo'] = hemo_imputed

In [203]:
# Step 7: Round to 1 decimal place
dataset['hemo'] = dataset['hemo'].round(0).astype(int)

In [204]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48,80,1.020,1,5,,normal,notpresent,notpresent,121,36,1,138,5,15,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7,50,1.020,4,5,,normal,notpresent,notpresent,148,18,1,138,5,11,38,6000,,no,no,no,good,no,no,ckd
2,2,62,80,1.010,2,5,normal,normal,notpresent,notpresent,423,53,2,138,5,10,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48,70,1.005,4,5,normal,abnormal,present,notpresent,117,56,4,111,2,11,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51,80,1.010,2,5,normal,normal,notpresent,notpresent,106,26,1,138,5,12,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,5,normal,normal,notpresent,notpresent,140,49,0,150,5,16,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42,70,1.025,0,5,normal,normal,notpresent,notpresent,75,31,1,141,4,16,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12,80,1.020,0,5,normal,normal,notpresent,notpresent,100,26,1,137,4,16,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17,60,1.025,0,5,normal,normal,notpresent,notpresent,114,50,1,135,5,14,51,7200,5.9,no,no,no,good,no,no,notckd


In [205]:
print(dataset['rc'].describe())


count     270
unique     49
top       5.2
freq       18
Name: rc, dtype: object


In [206]:
# Step 1: Clean invalid placeholders
dataset['rc'] = dataset['rc'].replace(['?', '', ' '], np.nan)
dataset['rc'] = pd.to_numeric(dataset['rc'], errors='coerce')

In [207]:
# Step 3: Scale the data
scaler = StandardScaler()
rc_scaled = scaler.fit_transform(dataset[['rc']])

# Step 4: Apply KNN Imputer
imputer = KNNImputer(n_neighbors=3)
rc_imputed_scaled = imputer.fit_transform(rc_scaled)

# Step 5: Inverse scale
rc_imputed = scaler.inverse_transform(rc_imputed_scaled)

# Step 6: Replace back into dataset
dataset['rc'] = rc_imputed

In [208]:
# Step 7: Round to 1 decimal place
dataset['rc'] = dataset['rc'].round(0).astype(int)

In [209]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48,80,1.020,1,5,,normal,notpresent,notpresent,121,36,1,138,5,15,44,7800,5,yes,yes,no,good,no,no,ckd
1,1,7,50,1.020,4,5,,normal,notpresent,notpresent,148,18,1,138,5,11,38,6000,5,no,no,no,good,no,no,ckd
2,2,62,80,1.010,2,5,normal,normal,notpresent,notpresent,423,53,2,138,5,10,31,7500,5,no,yes,no,poor,no,yes,ckd
3,3,48,70,1.005,4,5,normal,abnormal,present,notpresent,117,56,4,111,2,11,32,6700,4,yes,no,no,poor,yes,yes,ckd
4,4,51,80,1.010,2,5,normal,normal,notpresent,notpresent,106,26,1,138,5,12,35,7300,5,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,5,normal,normal,notpresent,notpresent,140,49,0,150,5,16,47,6700,5,no,no,no,good,no,no,notckd
396,396,42,70,1.025,0,5,normal,normal,notpresent,notpresent,75,31,1,141,4,16,54,7800,6,no,no,no,good,no,no,notckd
397,397,12,80,1.020,0,5,normal,normal,notpresent,notpresent,100,26,1,137,4,16,49,6600,5,no,no,no,good,no,no,notckd
398,398,17,60,1.025,0,5,normal,normal,notpresent,notpresent,114,50,1,135,5,14,51,7200,6,no,no,no,good,no,no,notckd


In [210]:
print(dataset[numeric_cols].isnull().sum())

age       0
bp        0
sg        0
al        0
su        0
bgr       0
bu        0
sc        0
sod       0
pot       0
hemo      0
pcv      70
wc      105
rc        0
dtype: int64


In [211]:
categorical_cols = [
    'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane'
]

In [212]:
print(dataset[categorical_cols].isnull().sum())

rbc      152
pc        65
pcc        4
ba         4
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
dtype: int64


In [213]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48,80,1.020,1,5,,normal,notpresent,notpresent,121,36,1,138,5,15,44,7800,5,yes,yes,no,good,no,no,ckd
1,1,7,50,1.020,4,5,,normal,notpresent,notpresent,148,18,1,138,5,11,38,6000,5,no,no,no,good,no,no,ckd
2,2,62,80,1.010,2,5,normal,normal,notpresent,notpresent,423,53,2,138,5,10,31,7500,5,no,yes,no,poor,no,yes,ckd
3,3,48,70,1.005,4,5,normal,abnormal,present,notpresent,117,56,4,111,2,11,32,6700,4,yes,no,no,poor,yes,yes,ckd
4,4,51,80,1.010,2,5,normal,normal,notpresent,notpresent,106,26,1,138,5,12,35,7300,5,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,5,normal,normal,notpresent,notpresent,140,49,0,150,5,16,47,6700,5,no,no,no,good,no,no,notckd
396,396,42,70,1.025,0,5,normal,normal,notpresent,notpresent,75,31,1,141,4,16,54,7800,6,no,no,no,good,no,no,notckd
397,397,12,80,1.020,0,5,normal,normal,notpresent,notpresent,100,26,1,137,4,16,49,6600,5,no,no,no,good,no,no,notckd
398,398,17,60,1.025,0,5,normal,normal,notpresent,notpresent,114,50,1,135,5,14,51,7200,6,no,no,no,good,no,no,notckd


In [214]:
dataset['rbc'].value_counts(dropna=False)


Unnamed: 0_level_0,count
rbc,Unnamed: 1_level_1
normal,201
,152
abnormal,47


In [215]:
print('rbc' in dataset.columns)


True


In [216]:
dataset = pd.get_dummies(dataset, columns=["rbc"], drop_first=True, dummy_na=False, dtype=int)

In [217]:
[col for col in dataset.columns if 'rbc' in col.lower()]


['rbc_normal']

In [218]:
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["rbc_normal"]])

# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)

# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["rbc_normal"])

dataset[["rbc_normal"]] =np.round(imputed_df).astype(int)

In [219]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification,rbc_normal
0,0,48,80,1.020,1,5,normal,notpresent,notpresent,121,36,1,138,5,15,44,7800,5,yes,yes,no,good,no,no,ckd,0
1,1,7,50,1.020,4,5,normal,notpresent,notpresent,148,18,1,138,5,11,38,6000,5,no,no,no,good,no,no,ckd,0
2,2,62,80,1.010,2,5,normal,notpresent,notpresent,423,53,2,138,5,10,31,7500,5,no,yes,no,poor,no,yes,ckd,1
3,3,48,70,1.005,4,5,abnormal,present,notpresent,117,56,4,111,2,11,32,6700,4,yes,no,no,poor,yes,yes,ckd,1
4,4,51,80,1.010,2,5,normal,notpresent,notpresent,106,26,1,138,5,12,35,7300,5,no,no,no,good,no,no,ckd,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,5,normal,notpresent,notpresent,140,49,0,150,5,16,47,6700,5,no,no,no,good,no,no,notckd,1
396,396,42,70,1.025,0,5,normal,notpresent,notpresent,75,31,1,141,4,16,54,7800,6,no,no,no,good,no,no,notckd,1
397,397,12,80,1.020,0,5,normal,notpresent,notpresent,100,26,1,137,4,16,49,6600,5,no,no,no,good,no,no,notckd,1
398,398,17,60,1.025,0,5,normal,notpresent,notpresent,114,50,1,135,5,14,51,7200,6,no,no,no,good,no,no,notckd,1


In [220]:
print(dataset['pc'].value_counts(dropna=False))

pc
normal      259
abnormal     76
NaN          65
Name: count, dtype: int64


In [221]:
dataset= pd.get_dummies(dataset, columns=["pc"], drop_first=True, dummy_na=False, dtype=int)

In [222]:
[col for col in dataset.columns if 'pc' in col.lower()]

['pcc', 'pcv', 'pc_normal']

In [223]:
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["pc_normal"]])

# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)

# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["pc_normal"])

dataset[["pc_normal"]] =np.round(imputed_df).astype(int)

In [224]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification,rbc_normal,pc_normal
0,0,48,80,1.020,1,5,notpresent,notpresent,121,36,1,138,5,15,44,7800,5,yes,yes,no,good,no,no,ckd,0,1
1,1,7,50,1.020,4,5,notpresent,notpresent,148,18,1,138,5,11,38,6000,5,no,no,no,good,no,no,ckd,0,1
2,2,62,80,1.010,2,5,notpresent,notpresent,423,53,2,138,5,10,31,7500,5,no,yes,no,poor,no,yes,ckd,1,1
3,3,48,70,1.005,4,5,present,notpresent,117,56,4,111,2,11,32,6700,4,yes,no,no,poor,yes,yes,ckd,1,0
4,4,51,80,1.010,2,5,notpresent,notpresent,106,26,1,138,5,12,35,7300,5,no,no,no,good,no,no,ckd,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,5,notpresent,notpresent,140,49,0,150,5,16,47,6700,5,no,no,no,good,no,no,notckd,1,1
396,396,42,70,1.025,0,5,notpresent,notpresent,75,31,1,141,4,16,54,7800,6,no,no,no,good,no,no,notckd,1,1
397,397,12,80,1.020,0,5,notpresent,notpresent,100,26,1,137,4,16,49,6600,5,no,no,no,good,no,no,notckd,1,1
398,398,17,60,1.025,0,5,notpresent,notpresent,114,50,1,135,5,14,51,7200,6,no,no,no,good,no,no,notckd,1,1


In [225]:
print(dataset['pcc'].value_counts(dropna=False))

pcc
notpresent    354
present        42
NaN             4
Name: count, dtype: int64


In [226]:
dataset= pd.get_dummies(dataset, columns=["pcc"], drop_first=True, dummy_na=False, dtype=int)

In [227]:
[col for col in dataset.columns if 'pcc' in col.lower()]

['pcc_present']

In [228]:
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["pcc_present"]])

# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)

# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["pcc_present"])

dataset[["pcc_present"]] =np.round(imputed_df).astype(int)

In [229]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification,rbc_normal,pc_normal,pcc_present
0,0,48,80,1.020,1,5,notpresent,121,36,1,138,5,15,44,7800,5,yes,yes,no,good,no,no,ckd,0,1,0
1,1,7,50,1.020,4,5,notpresent,148,18,1,138,5,11,38,6000,5,no,no,no,good,no,no,ckd,0,1,0
2,2,62,80,1.010,2,5,notpresent,423,53,2,138,5,10,31,7500,5,no,yes,no,poor,no,yes,ckd,1,1,0
3,3,48,70,1.005,4,5,notpresent,117,56,4,111,2,11,32,6700,4,yes,no,no,poor,yes,yes,ckd,1,0,1
4,4,51,80,1.010,2,5,notpresent,106,26,1,138,5,12,35,7300,5,no,no,no,good,no,no,ckd,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,5,notpresent,140,49,0,150,5,16,47,6700,5,no,no,no,good,no,no,notckd,1,1,0
396,396,42,70,1.025,0,5,notpresent,75,31,1,141,4,16,54,7800,6,no,no,no,good,no,no,notckd,1,1,0
397,397,12,80,1.020,0,5,notpresent,100,26,1,137,4,16,49,6600,5,no,no,no,good,no,no,notckd,1,1,0
398,398,17,60,1.025,0,5,notpresent,114,50,1,135,5,14,51,7200,6,no,no,no,good,no,no,notckd,1,1,0


In [230]:
dataset= pd.get_dummies(dataset, columns=["ba"], drop_first=True, dummy_na=False, dtype=int)

In [231]:
[col for col in dataset.columns if 'ba' in col.lower()]

['ba_present']

In [232]:
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["ba_present"]])

# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)

# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["ba_present"])

dataset[["ba_present"]] =np.round(imputed_df).astype(int)

In [233]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification,rbc_normal,pc_normal,pcc_present,ba_present
0,0,48,80,1.020,1,5,121,36,1,138,5,15,44,7800,5,yes,yes,no,good,no,no,ckd,0,1,0,0
1,1,7,50,1.020,4,5,148,18,1,138,5,11,38,6000,5,no,no,no,good,no,no,ckd,0,1,0,0
2,2,62,80,1.010,2,5,423,53,2,138,5,10,31,7500,5,no,yes,no,poor,no,yes,ckd,1,1,0,0
3,3,48,70,1.005,4,5,117,56,4,111,2,11,32,6700,4,yes,no,no,poor,yes,yes,ckd,1,0,1,0
4,4,51,80,1.010,2,5,106,26,1,138,5,12,35,7300,5,no,no,no,good,no,no,ckd,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,5,140,49,0,150,5,16,47,6700,5,no,no,no,good,no,no,notckd,1,1,0,0
396,396,42,70,1.025,0,5,75,31,1,141,4,16,54,7800,6,no,no,no,good,no,no,notckd,1,1,0,0
397,397,12,80,1.020,0,5,100,26,1,137,4,16,49,6600,5,no,no,no,good,no,no,notckd,1,1,0,0
398,398,17,60,1.025,0,5,114,50,1,135,5,14,51,7200,6,no,no,no,good,no,no,notckd,1,1,0,0


In [234]:
dataset["wc"].unique()

array(['7800', '6000', '7500', '6700', '7300', nan, '6900', '9600',
       '12100', '4500', '12200', '11000', '3800', '11400', '5300', '9200',
       '6200', '8300', '8400', '10300', '9800', '9100', '7900', '6400',
       '8600', '18900', '21600', '4300', '8500', '11300', '7200', '7700',
       '14600', '6300', '\t6200', '7100', '11800', '9400', '5500', '5800',
       '13200', '12500', '5600', '7000', '11900', '10400', '10700',
       '12700', '6800', '6500', '13600', '10200', '9000', '14900', '8200',
       '15200', '5000', '16300', '12400', '\t8400', '10500', '4200',
       '4700', '10900', '8100', '9500', '2200', '12800', '11200', '19100',
       '\t?', '12300', '16700', '2600', '26400', '8800', '7400', '4900',
       '8000', '12000', '15700', '4100', '5700', '11500', '5400', '10800',
       '9900', '5200', '5900', '9300', '9700', '5100', '6600'],
      dtype=object)

In [235]:
dataset['wc'] = dataset['wc'].replace('\t6200', 6200)
dataset['wc'] = dataset['wc'].replace('\t8400', 8400)
dataset['wc'] = dataset['wc'].replace('\t?', 3900)

In [236]:
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["wc"]])

# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)

# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["wc"])

dataset[["wc"]] =np.round(imputed_df).astype(int)

In [237]:
dataset['rc'].unique()

array([5, 4, 3, 2, 6, 8])

In [238]:
dataset['htn'].unique()

array(['yes', 'no', nan], dtype=object)

In [239]:
dataset = pd.get_dummies(dataset, columns=["htn"], drop_first=True, dummy_na=False, dtype=int)

In [240]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,dm,cad,appet,pe,ane,classification,rbc_normal,pc_normal,pcc_present,ba_present,htn_yes
0,0,48,80,1.020,1,5,121,36,1,138,5,15,44,7800,5,yes,no,good,no,no,ckd,0,1,0,0,1
1,1,7,50,1.020,4,5,148,18,1,138,5,11,38,6000,5,no,no,good,no,no,ckd,0,1,0,0,0
2,2,62,80,1.010,2,5,423,53,2,138,5,10,31,7500,5,yes,no,poor,no,yes,ckd,1,1,0,0,0
3,3,48,70,1.005,4,5,117,56,4,111,2,11,32,6700,4,no,no,poor,yes,yes,ckd,1,0,1,0,1
4,4,51,80,1.010,2,5,106,26,1,138,5,12,35,7300,5,no,no,good,no,no,ckd,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,5,140,49,0,150,5,16,47,6700,5,no,no,good,no,no,notckd,1,1,0,0,0
396,396,42,70,1.025,0,5,75,31,1,141,4,16,54,7800,6,no,no,good,no,no,notckd,1,1,0,0,0
397,397,12,80,1.020,0,5,100,26,1,137,4,16,49,6600,5,no,no,good,no,no,notckd,1,1,0,0,0
398,398,17,60,1.025,0,5,114,50,1,135,5,14,51,7200,6,no,no,good,no,no,notckd,1,1,0,0,0


In [241]:
#step 1 standard scaler
from sklearn.preprocessing import StandardScaler
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["htn_yes"]])

# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)


# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["htn_yes"])

dataset[["htn_yes"]] =np.round(imputed_df).astype(int)

In [242]:
dataset["dm"].unique()

array(['yes', 'no', ' yes', '\tno', '\tyes', nan], dtype=object)

In [243]:
dataset["dm"]=dataset["dm"].replace('\tno', 'no')
dataset["dm"]=dataset["dm"].replace('\tyes', 'yes')
dataset["dm"]=dataset["dm"].replace(' yes', 'yes')

In [244]:
dataset = pd.get_dummies(dataset, columns=["dm"], drop_first=True, dummy_na=False, dtype=int)

In [245]:
#step 1 standard scaler
from sklearn.preprocessing import StandardScaler
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["dm_yes"]])

# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)


# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["dm_yes"])

dataset[["dm_yes"]] =np.round(imputed_df).astype(int)

In [246]:
dataset["cad"].unique()

array(['no', 'yes', '\tno', nan], dtype=object)

In [247]:
dataset["cad"]=dataset["cad"].replace('\tno', 'no')

In [248]:
dataset = pd.get_dummies(dataset, columns=["cad"], drop_first=True, dummy_na=False, dtype=int)

In [249]:
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["cad_yes"]])

# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)


# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["cad_yes"])

dataset[["cad_yes"]] =np.round(imputed_df).astype(int)

In [250]:
dataset["appet"].unique()

array(['good', 'poor', nan], dtype=object)

In [251]:
dataset = pd.get_dummies(dataset, columns=["appet"], drop_first=True, dummy_na=False, dtype=int)

In [252]:
from sklearn.preprocessing import StandardScaler
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["appet_poor"]])

# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)


# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["appet_poor"])

dataset[["appet_poor"]] =np.round(imputed_df).astype(int)

In [253]:
dataset["pe"].unique()

array(['no', 'yes', nan], dtype=object)

In [254]:
dataset = pd.get_dummies(dataset, columns=["pe"], drop_first=True, dummy_na=False, dtype=int)

In [255]:
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["pe_yes"]])

# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)


# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["pe_yes"])

dataset[["pe_yes"]] =np.round(imputed_df).astype(int)

In [256]:
dataset["ane"].unique()

array(['no', 'yes', nan], dtype=object)

In [257]:
dataset = pd.get_dummies(dataset, columns=["ane"], drop_first=True, dummy_na=False, dtype=int)

In [258]:
from sklearn.preprocessing import StandardScaler
# Scale features before imputation
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dataset[["ane_yes"]])

# Step 2: Apply KNN Imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
data_imputed = imputer.fit_transform(data_scaled)

# Step 3: Inverse scaling back to original scale
data_imputed = scaler.inverse_transform(data_imputed)


# Step 4: Convert back to DataFrame
imputed_df = pd.DataFrame(data_imputed, columns=["ane_yes"])

dataset[["ane_yes"]] =np.round(imputed_df).astype(int)

In [259]:
dataset

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,classification,rbc_normal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_poor,pe_yes,ane_yes
0,0,48,80,1.020,1,5,121,36,1,138,5,15,44,7800,5,ckd,0,1,0,0,1,1,0,0,0,0
1,1,7,50,1.020,4,5,148,18,1,138,5,11,38,6000,5,ckd,0,1,0,0,0,0,0,0,0,0
2,2,62,80,1.010,2,5,423,53,2,138,5,10,31,7500,5,ckd,1,1,0,0,0,1,0,1,0,1
3,3,48,70,1.005,4,5,117,56,4,111,2,11,32,6700,4,ckd,1,0,1,0,1,0,0,1,1,1
4,4,51,80,1.010,2,5,106,26,1,138,5,12,35,7300,5,ckd,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55,80,1.020,0,5,140,49,0,150,5,16,47,6700,5,notckd,1,1,0,0,0,0,0,0,0,0
396,396,42,70,1.025,0,5,75,31,1,141,4,16,54,7800,6,notckd,1,1,0,0,0,0,0,0,0,0
397,397,12,80,1.020,0,5,100,26,1,137,4,16,49,6600,5,notckd,1,1,0,0,0,0,0,0,0,0
398,398,17,60,1.025,0,5,114,50,1,135,5,14,51,7200,6,notckd,1,1,0,0,0,0,0,0,0,0
