# Cleaning & Normalization
---

## Remove the unused `Unnamed: 0` column

In [60]:
import pandas as pd

# Read the raw CSV and discard the 'Unnamed: 0' column in a single chain.
df = (
    pd.read_csv("../data/raw_data.csv")
    .drop(columns=['Unnamed: 0'])
)

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


## Mark zero placeholders as missing values

In [61]:
import numpy as np
from sklearn.impute import KNNImputer

# In these measurement columns a value of 0 is treated as "missing", so it is
# converted to NaN to make it visible to isnull() and to the imputer.
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_zeros] = df[cols_with_zeros].replace(0, np.nan)

# Per-column count of missing values after the replacement.
df.isnull().sum()


Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
dtype: int64

## KNN Imputation for Missing Values

In [63]:
# An earlier per-column median fill for Glucose, BMI and BloodPressure was
# superseded by the KNN imputation below.

# Columns handed to the imputer (every feature column of the frame).
numeric_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

numeric_df = df[numeric_cols]

# Each NaN is filled from the 5 nearest rows.
# NOTE(review): distances are computed on the unscaled columns, so wide-range
# columns weigh more in the neighbour search -- consider scaling first.
imputer = KNNImputer(n_neighbors=5)

# fit_transform returns a bare ndarray (row order preserved, labels lost).
imputed_data = imputer.fit_transform(numeric_df)

# Rebuild the frame with df's own index: column assignment aligns on index
# labels, so a fresh RangeIndex would misalign rows (or introduce NaN) whenever
# df does not carry the default 0..n-1 index.
df[numeric_cols] = pd.DataFrame(imputed_data, columns=numeric_cols, index=df.index)

## Detect outliers with the IQR rule

In [62]:
# Flag values outside the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR] in every
# numeric column and blank them out as NaN.
# NOTE(review): in document order this cell runs after the KNN imputation cell,
# so the NaNs written here are not imputed by it and end up in the saved CSV --
# confirm the intended cell order.
numeric_cols = df.select_dtypes(include=['number']).columns

# Number of values flagged as outliers, keyed by column name.
outlier_counts = {}

for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR

    # Comparisons against NaN are False, so existing NaNs are not counted.
    is_outlier = (df[col] < lower_limit) | (df[col] > upper_limit)
    outlier_counts[col] = int(is_outlier.sum())

    df.loc[is_outlier, col] = np.nan

# Missing values per column (pre-existing NaNs plus the outliers just masked).
df.isnull().sum()



Pregnancies                   4
Glucose                       5
BloodPressure                49
SkinThickness               230
Insulin                     398
BMI                          19
DiabetesPedigreeFunction     29
Age                           9
dtype: int64

In [65]:
# Destination of the cleaned dataset.
file_path = '../data/clean_data.csv'

# Write the frame as UTF-8 CSV, omitting the index column.
df.to_csv(path_or_buf=file_path, index=False, encoding='utf-8')

print(f"DataFrame successfully saved to {file_path}")

DataFrame successfully saved to ../data/clean_data.csv
