In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv("kidney_disease.csv")
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [3]:
dataset.isnull().sum()

id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [4]:
from Univariate import Univariate
quan, qual = Univariate.quanQual(dataset)

In [5]:
quan

['id', 'age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo']

In [6]:
qual

['rbc',
 'pc',
 'pcc',
 'ba',
 'pcv',
 'wc',
 'rc',
 'htn',
 'dm',
 'cad',
 'appet',
 'pe',
 'ane',
 'classification']

In [7]:
#impute Numerical Columns
import numpy as np
from sklearn.impute import SimpleImputer
from scipy.stats import shapiro

for columnName in quan:
    if dataset[columnName].isnull().sum() > 0:
        skew_val = dataset[columnName].skew()
        col_data = dataset[columnName].dropna() #removes any NaN (missing) values in the specified column - would disrupt the calculation.

        # Check normality with Shapiro-Wilk Test (only if enough data)
        if len(col_data) > 3:
            stat, p = shapiro(col_data)
        else:
            p = 0  # Too little data to test, assume not normal
            
        # Strategy Decision:
        if -1 < skew_val < 1 and p > 0.05:
            strategy = 'mean'
        else:
            strategy = 'median'

        print(f"{columnName}: Skew = {skew_val:.2f}, p = {p} → {strategy}")

        #Apply imputation
        imp = SimpleImputer(missing_values=np.nan, strategy=strategy)
        imp.fit(dataset[[columnName]])
        dataset[[columnName]] = imp.transform(dataset[[columnName]])           

age: Skew = -0.67, p = 3.155049297644778e-08 → median
bp: Skew = 1.61, p = 1.2313409228428595e-17 → median
sg: Skew = -0.17, p = 7.511105213426834e-16 → median
al: Skew = 1.00, p = 5.505137709775377e-23 → median
su: Skew = 2.46, p = 5.4171833666879725e-31 → median
bgr: Skew = 2.01, p = 4.309587713071842e-22 → median
bu: Skew = 2.63, p = 1.0721529229554324e-24 → median
sc: Skew = 7.51, p = 1.0443480469097894e-33 → median
sod: Skew = -7.00, p = 3.7410576559410765e-26 → median
pot: Skew = 11.58, p = 2.4627358176265627e-34 → median
hemo: Skew = -0.34, p = 8.645817119903378e-05 → median


In [8]:
'''
=> skew_val ≈ 0 → Perfect bell curve
=> -1 < skew_val < 0 → Slight left skew
=> 0 < skew_val < 1 → Slight right skew
=> skew_val ≤ -1 or ≥ 1 → Data is heavily skewed
    => If > +1 → right-skewed (positive skew)  
    => If < -1 → left-skewed (negative skew)

=> Mean is good for symmetric/slightly skewed
=> Median is better when skewness is stronger (to avoid influence of outliers)

=> stat - Normality score (closer to 1 = more normal)
=> p-Probability that the data is normal
=> p > 0.05 - Acceptable → Use mean (if not skewed)
=> p ≤ 0.05 - Not normal → Use median
=> If p > 0.05, you fail to reject the null hypothesis → data is likely normal.
=> If p ≤ 0.05, the data is not normally distributed.

=> 3.155049297644778e-08 - 3.155 × 10⁻⁸ = 0.00000003155
'''

'\n=> skew_val ≈ 0 → Perfect bell curve\n=> -1 < skew_val < 0 → Slight left skew\n=> 0 < skew_val < 1 → Slight right skew\n=> skew_val ≤ -1 or ≥ 1 → Data is heavily skewed\n    => If > +1 → right-skewed (positive skew)  \n    => If < -1 → left-skewed (negative skew)\n\n=> Mean is good for symmetric/slightly skewed\n=> Median is better when skewness is stronger (to avoid influence of outliers)\n\n=> stat - Normality score (closer to 1 = more normal)\n=> p-Probability that the data is normal\n=> p > 0.05 - Acceptable → Use mean (if not skewed)\n=> p ≤ 0.05 - Not normal → Use median\n=> If p > 0.05, you fail to reject the null hypothesis → data is likely normal.\n=> If p ≤ 0.05, the data is not normally distributed.\n'

In [9]:
df = pd.DataFrame(dataset, columns = quan) 
df

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
0,0,48.0,80.0,1.020,1.0,0.0,121.0,36.0,1.2,138.0,4.4,15.4
1,1,7.0,50.0,1.020,4.0,0.0,121.0,18.0,0.8,138.0,4.4,11.3
2,2,62.0,80.0,1.010,2.0,3.0,423.0,53.0,1.8,138.0,4.4,9.6
3,3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2
4,4,51.0,80.0,1.010,2.0,0.0,106.0,26.0,1.4,138.0,4.4,11.6
...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,140.0,49.0,0.5,150.0,4.9,15.7
396,396,42.0,70.0,1.025,0.0,0.0,75.0,31.0,1.2,141.0,3.5,16.5
397,397,12.0,80.0,1.020,0.0,0.0,100.0,26.0,0.6,137.0,4.4,15.8
398,398,17.0,60.0,1.025,0.0,0.0,114.0,50.0,1.0,135.0,4.9,14.2


In [10]:
#impute Categorical Columns
for columnName in qual:
    if dataset[columnName].isnull().sum() > 0:
        imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        imp.fit(dataset[[columnName]])
        dataset[[columnName]] = imp.transform(dataset[[columnName]])    

In [11]:
df = pd.DataFrame(dataset, columns = qual) 
df

Unnamed: 0,rbc,pc,pcc,ba,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,normal,normal,notpresent,notpresent,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,normal,normal,notpresent,notpresent,38,6000,5.2,no,no,no,good,no,no,ckd
2,normal,normal,notpresent,notpresent,31,7500,5.2,no,yes,no,poor,no,yes,ckd
3,normal,abnormal,present,notpresent,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,normal,normal,notpresent,notpresent,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,normal,normal,notpresent,notpresent,47,6700,4.9,no,no,no,good,no,no,notckd
396,normal,normal,notpresent,notpresent,54,7800,6.2,no,no,no,good,no,no,notckd
397,normal,normal,notpresent,notpresent,49,6600,5.4,no,no,no,good,no,no,notckd
398,normal,normal,notpresent,notpresent,51,7200,5.9,no,no,no,good,no,no,notckd


In [12]:
dataset.isnull().sum()

id                0
age               0
bp                0
sg                0
al                0
su                0
rbc               0
pc                0
pcc               0
ba                0
bgr               0
bu                0
sc                0
sod               0
pot               0
hemo              0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64

In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
#Find and Replace Outliers
from Univariate import Univariate
descriptive = Univariate.descriptive_Univariate(dataset, quan)
descriptive

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
Mean,199.5,51.5625,76.575,1.017712,0.9,0.395,145.0625,56.693,2.997125,137.63125,4.57725,12.5425
Median,199.5,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
Mode,0.0,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.2,138.0,4.4,12.65
Q1:25%,99.75,42.0,70.0,1.015,0.0,0.0,101.0,27.0,0.9,135.0,4.0,10.875
Q2:50%,199.5,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
Q3:75%,299.25,64.0,80.0,1.02,2.0,0.0,150.0,61.75,2.725,141.0,4.8,14.625
99%,395.01,80.01,110.0,1.025,4.0,4.0,425.22,235.06,18.159,150.0,6.501,17.601
Q4:100%,399.0,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8
IQR,199.5,22.0,10.0,0.005,2.0,0.0,49.0,34.75,1.825,6.0,0.8,3.75
1.5Rule,299.25,33.0,15.0,0.0075,3.0,0.0,73.5,52.125,2.7375,9.0,1.2,5.625


In [15]:
Lesser, Greater = Univariate.FindOutliers(descriptive, quan)
Lesser, Greater

(['age', 'bp', 'sg', 'bgr', 'sod', 'pot', 'hemo'],
 ['bp', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot'])

In [16]:
dataset = Univariate.ReplaceOutliers(dataset, descriptive, Lesser, Greater)
dataset

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.0200,1.0,0.0,normal,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,9.0,55.0,1.0200,4.0,0.0,normal,normal,notpresent,notpresent,...,38,6000,5.2,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.0100,2.0,0.0,normal,normal,notpresent,notpresent,...,31,7500,5.2,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.0075,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.0100,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.0200,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.0250,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.0200,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.0250,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [17]:
descriptive = Univariate.descriptive_Univariate(dataset, quan)
descriptive

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
Mean,199.5,51.6475,75.825,1.017756,0.9,0.0,134.01125,50.277687,2.071406,138.2175,4.38275,12.549
Median,199.5,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
Mode,0.0,55.0,80.0,1.02,0.0,0.0,223.5,113.875,5.4625,138.0,4.4,12.65
Q1:25%,99.75,42.0,70.0,1.015,0.0,0.0,101.0,27.0,0.9,135.0,4.0,10.875
Q2:50%,199.5,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
Q3:75%,299.25,64.0,80.0,1.02,2.0,0.0,150.0,61.75,2.725,141.0,4.8,14.625
99%,395.01,80.01,95.0,1.025,4.0,0.0,223.5,113.875,5.4625,150.0,6.0,17.601
Q4:100%,399.0,90.0,95.0,1.025,5.0,0.0,223.5,113.875,5.4625,150.0,6.0,17.8
IQR,199.5,22.0,10.0,0.005,2.0,0.0,49.0,34.75,1.825,6.0,0.8,3.75
1.5Rule,299.25,33.0,15.0,0.0075,3.0,0.0,73.5,52.125,2.7375,9.0,1.2,5.625


In [18]:
Lesser, Greater = Univariate.FindOutliers(descriptive, quan)
Lesser, Greater

([], [])