In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw medical-conditions table from the CSV next to this notebook,
# then show it as the cell output.
dataset = pd.read_csv(filepath_or_buffer="medical_conditions.csv")
dataset

Unnamed: 0,id,full_name,age,gender,smoking_status,bmi,blood_pressure,glucose_levels,condition
0,1,User0001,,male,Non-Smoker,,,,Pneumonia
1,2,User0002,30.0,male,Non-Smoker,,105.315064,,Diabetic
2,3,User0003,18.0,male,Non-Smoker,35.612486,,,Pneumonia
3,4,User0004,,male,Non-Smoker,,99.119829,,Pneumonia
4,5,User0005,76.0,male,Non-Smoker,,,,Diabetic
...,...,...,...,...,...,...,...,...,...
9995,9996,User9996,,male,Non-Smoker,25.029002,152.540355,137.551451,Pneumonia
9996,9997,User9997,,male,Non-Smoker,27.017487,,,Diabetic
9997,9998,User9998,23.0,male,Smoker,,148.833321,173.931480,Pneumonia
9998,9999,User9999,,female,Non-Smoker,,,,Pneumonia


In [3]:
dataset.dtypes

id                  int64
full_name          object
age               float64
gender             object
smoking_status     object
bmi               float64
blood_pressure    float64
glucose_levels    float64
condition          object
dtype: object

In [4]:
dataset.isnull().sum()

id                   0
full_name            0
age               4555
gender               0
smoking_status       0
bmi               5348
blood_pressure    6234
glucose_levels    5244
condition            0
dtype: int64

#In this dataset, missing values are present only in the numerical columns.

In [5]:
# Univariate is a project-local module (not shown in this notebook).
# Judging by the two outputs displayed below, quanQual returns two lists of
# column names: `quan` for the numeric columns (note that this includes the
# `id` identifier column) and `qual` for the object-dtype columns.
# NOTE(review): the exact dtype rule used for the split lives in Univariate.py - confirm there.
from Univariate import Univariate
quan,qual = Univariate.quanQual(dataset)

In [6]:
quan

['id', 'age', 'bmi', 'blood_pressure', 'glucose_levels']

In [7]:
qual

['full_name', 'gender', 'smoking_status', 'condition']

In [8]:
# Impute missing values in the numerical columns.
from sklearn.impute import SimpleImputer
from scipy.stats import shapiro

# Decision thresholds for choosing between mean and median imputation.
SKEW_SYMMETRIC = 0.5     # |skew| below this: treat the column as symmetric -> mean
SKEW_MODERATE = 1.0      # |skew| below this: let a normality test decide
NORMALITY_ALPHA = 0.05   # Shapiro-Wilk p-value above this: normality is not rejected
SHAPIRO_MAX_N = 5000     # scipy documents the Shapiro-Wilk p-value as possibly inaccurate above this size


def _choose_impute_strategy(values):
    """Pick the SimpleImputer strategy for one column.

    Parameters
    ----------
    values : pandas.Series
        The observed (non-null) values of the column.

    Returns
    -------
    tuple of (float, str)
        The sample skewness of ``values`` and the chosen strategy,
        either ``'mean'`` or ``'median'`` (median is the default).
    """
    skew_val = values.skew()

    # Nearly symmetric distribution: the mean is a fair central value.
    if abs(skew_val) < SKEW_SYMMETRIC:
        return skew_val, 'mean'

    # Moderately skewed: check normality with the Shapiro-Wilk test
    # (only if there is enough data to run it).
    if abs(skew_val) < SKEW_MODERATE and len(values) > 3:
        sample = values
        if len(sample) > SHAPIRO_MAX_N:
            # Test a fixed-seed random subsample so the p-value stays reliable
            # and the decision is reproducible across runs.
            sample = sample.sample(n=SHAPIRO_MAX_N, random_state=0)
        _, p_value = shapiro(sample)
        if p_value > NORMALITY_ALPHA:
            return skew_val, 'mean'

    # Strongly skewed, non-normal, or skewness undefined (NaN): use the median.
    return skew_val, 'median'


for columnName in quan:
    # Columns without missing values need no imputation.
    if dataset[columnName].isnull().sum() == 0:
        continue

    # Drop NaN values so they do not disturb the statistics below.
    col_data = dataset[columnName].dropna()

    # With no observed values there is nothing to impute from; SimpleImputer
    # would discard such a column by default and the write-back below would
    # fail on a shape mismatch, so leave the column untouched.
    if col_data.empty:
        print(f"{columnName}: no observed values, skipped")
        continue

    skew_val, strategy = _choose_impute_strategy(col_data)
    print(f"{columnName}: Skew = {skew_val:.2f}, {strategy}")

    # Apply imputation: only the NaN entries are replaced, observed values stay as they are.
    imp = SimpleImputer(missing_values=np.nan, strategy=strategy)
    dataset[[columnName]] = imp.fit_transform(dataset[[columnName]])

age: Skew = -0.00, mean
bmi: Skew = 0.02, mean
blood_pressure: Skew = 0.00, mean
glucose_levels: Skew = 0.01, mean


In [9]:
dataset

Unnamed: 0,id,full_name,age,gender,smoking_status,bmi,blood_pressure,glucose_levels,condition
0,1,User0001,53.541598,male,Non-Smoker,27.423420,135.209429,135.219608,Pneumonia
1,2,User0002,30.000000,male,Non-Smoker,27.423420,105.315064,135.219608,Diabetic
2,3,User0003,18.000000,male,Non-Smoker,35.612486,135.209429,135.219608,Pneumonia
3,4,User0004,53.541598,male,Non-Smoker,27.423420,99.119829,135.219608,Pneumonia
4,5,User0005,76.000000,male,Non-Smoker,27.423420,135.209429,135.219608,Diabetic
...,...,...,...,...,...,...,...,...,...
9995,9996,User9996,53.541598,male,Non-Smoker,25.029002,152.540355,137.551451,Pneumonia
9996,9997,User9997,53.541598,male,Non-Smoker,27.017487,135.209429,135.219608,Diabetic
9997,9998,User9998,23.000000,male,Smoker,27.423420,148.833321,173.931480,Pneumonia
9998,9999,User9999,53.541598,female,Non-Smoker,27.423420,135.209429,135.219608,Pneumonia


In [10]:
# Numeric-only view of the dataset, restricted to the columns listed in `quan`.
df = pd.DataFrame(data=dataset, columns=quan)
df

Unnamed: 0,id,age,bmi,blood_pressure,glucose_levels
0,1,53.541598,27.423420,135.209429,135.219608
1,2,30.000000,27.423420,105.315064,135.219608
2,3,18.000000,35.612486,135.209429,135.219608
3,4,53.541598,27.423420,99.119829,135.219608
4,5,76.000000,27.423420,135.209429,135.219608
...,...,...,...,...,...
9995,9996,53.541598,25.029002,152.540355,137.551451
9996,9997,53.541598,27.017487,135.209429,135.219608
9997,9998,23.000000,27.423420,148.833321,173.931480
9998,9999,53.541598,27.423420,135.209429,135.219608


In [11]:
dataset.isnull().sum()

id                0
full_name         0
age               0
gender            0
smoking_status    0
bmi               0
blood_pressure    0
glucose_levels    0
condition         0
dtype: int64

In [12]:
#For this dataset, replace only the missing values; leave every observed value as it is.
#Why: in a healthcare dataset, overwriting observed values as well would mean:
#You lose real, valid data.
#You introduce bias by making all values identical or unnaturally clustered.
#Outliers and distribution patterns vanish, and these are often critical in healthcare datasets (e.g., for anomaly detection).

In [13]:
# Persist the cleaned dataset as a CSV file.
# index=False keeps the DataFrame's row numbers out of the file, so they are
# not written as an extra column.
dataset.to_csv(path_or_buf="cleaned_medical_conditions.csv", index=False)