In [1]:
import pandas as pd
df = pd.read_csv("/Users/asishkarthikeyagogineni/Desktop/ML/diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# DATA CLEANING

In [2]:
df.info()

# Example: In diabetes dataset, some columns (like Glucose, BloodPressure, SkinThickness, Insulin, BMI)
# have '0' which is medically impossible → treat as missing.
cols_with_zero_invalid = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
for col in cols_with_zero_invalid:
    df[col] = df[col].replace(0, pd.NA)

df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35.0,,33.6,0.627,50,1
1,1,85,66,29.0,,26.6,0.351,31,0
2,8,183,64,,,23.3,0.672,32,1
3,1,89,66,23.0,94.0,28.1,0.167,21,0
4,0,137,40,35.0,168.0,43.1,2.288,33,1


# HANDLING MISSING VALUES

In [4]:
from sklearn.impute import SimpleImputer
import pandas as pd

# 1. Select only numeric columns
numeric_cols = df.select_dtypes(include=["number"]).columns

# 2. Create imputer
imputer = SimpleImputer(strategy="median")

# 3. Fit + transform only numeric data
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# 4. Verify no missing values
print(df.isna().sum())


Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


# Outlier Removal / Noise Handling

In [5]:
import numpy as np

def winsorize_iqr(frame, cols, k=1.5):
    capped = frame.copy()
    for c in cols:
        q1, q3 = capped[c].quantile(0.25), capped[c].quantile(0.75)
        iqr = q3 - q1
        low, high = q1 - k*iqr, q3 + k*iqr
        capped[c] = capped[c].clip(lower=low, upper=high)
    return capped

df = winsorize_iqr(df, df.columns, k=1.5)
df.describe()


Unnamed: 0,Pregnancies,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0
mean,3.83724,0.458914,33.19987,0.348958
std,3.344157,0.285596,11.628404,0.476951
min,0.0,0.078,21.0,0.0
25%,1.0,0.24375,24.0,0.0
50%,3.0,0.3725,29.0,0.0
75%,6.0,0.62625,41.0,1.0
max,13.5,1.2,66.5,1.0


# Data Transformation

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# 1) pick feature columns (exclude target if present)
feat_cols = df.columns.drop("Outcome", errors="ignore")

# 2) use only numeric columns for scaling
num_cols = df[feat_cols].select_dtypes(include="number").columns

# 3) impute numerics (median) -> removes <NA> so scaler won't crash
df[num_cols] = SimpleImputer(strategy="median").fit_transform(df[num_cols])

# 4) min–max scale numerics
df[num_cols] = MinMaxScaler().fit_transform(df[num_cols])

# (optional) peek
df.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Outcome_Label
0,0.352941,148,72,35.0,,33.6,0.234415,0.483333,1.0,Diabetes
1,0.058824,85,66,29.0,,26.6,0.116567,0.166667,0.0,No Diabetes
2,0.470588,183,64,,,23.3,0.253629,0.183333,1.0,Diabetes
3,0.058824,89,66,23.0,94.0,28.1,0.038002,0.0,0.0,No Diabetes
4,0.0,137,40,35.0,168.0,43.1,0.943638,0.2,1.0,Diabetes
