## Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler



## Load and Understand Dataset

In [None]:
# Read the diabetes CSV into a DataFrame and preview its first six rows.
df = pd.read_csv(filepath_or_buffer="/content/diabetes.csv")
print(df.head(n=6))


In [None]:
# Print a structural summary of the frame: columns, non-null counts, dtypes, memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# Left as the cell's last expression so the notebook renders the table.
df.describe()

In [None]:
# Count the missing (NaN) entries in every column.
missing_per_column = df.isnull().sum()
print(missing_per_column)

## Handle Missing Values

In [None]:
# Impute missing values with the column mean.
# numeric_only=True restricts the means to numeric columns, so this keeps
# working if the frame contains non-numeric columns (a plain df.mean()
# raises TypeError on those in pandas >= 2.0).
df.fillna(df.mean(numeric_only=True), inplace=True)
# NOTE: only one imputation strategy can take effect. Once the mean fill has
# run there are no NaNs left, so a following median fill would be a no-op.
# To impute with the median instead (more robust to outliers), replace
# df.mean(...) above with df.median(numeric_only=True).

## Encoding

In [None]:
# List each column's dtype to see which columns, if any, need encoding.
print(df.dtypes)

In [None]:
#Label Encoding
le = LabelEncoder()
df['Outcome'] = le.fit_transform(df['Outcome'])


#Onehot Encoding
Encoder = OneHotEncoder()
OneHotEncoded = Encoder.fit_transform(df[['Outcome']])
#Here we have Binary categorial Data so we dont need OneHotEncoding
# print(OneHotEncoded)



## Standardize and Normalize Data

In [None]:
# Re-check the column dtypes before selecting the numeric columns to scale.
# Left as the cell's last expression so the notebook renders the result.
df.dtypes

In [None]:
# Standardize the numeric feature columns (zero mean, unit variance).
# 'Outcome' is the encoded class label, not a feature: z-scoring it would
# replace the class codes with arbitrary floats, so it is excluded here.
# errors='ignore' keeps the drop safe if 'Outcome' is not among the
# selected dtypes.
num_cols = (
    df.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop('Outcome', errors='ignore')
)
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
print(df[num_cols])

In [None]:
# Min-max normalization: rescale every column in num_cols to the [0, 1] range.
# Note that `scaler` is rebound here, replacing the previously fitted scaler.
scaler = MinMaxScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])
# Last expression of the cell so the notebook displays the scaled columns.
df[num_cols]

## Outlier Plotting and Removal

In [None]:
# Draw one horizontal box plot per column of df to inspect outliers.
# The subplot count gets its own name: `num_cols` holds the column labels
# used by the scaling cells, and rebinding it to an int here would make
# those cells fail when re-run after this one.
n_plots = len(df.columns)
fig, axs = plt.subplots(n_plots, 1, dpi=95, figsize=(7, 2 * n_plots))
# plt.subplots returns a bare Axes (not an array) when there is only one
# subplot; normalise to a 1-D array so axs[i] works for any column count.
axs = np.atleast_1d(axs)

for i, col in enumerate(df.columns):
    axs[i].boxplot(df[col], vert=False)
    axs[i].set_ylabel(col)

plt.tight_layout()
plt.show()


In [None]:
# Filter on a copy so the original frame stays untouched.
clean_data = df.copy()

# Columns that are screened for outliers.
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'Insulin',
    'Age',
    'BMI',
    'DiabetesPedigreeFunction',
]

# IQR fence per column: keep rows within [q1 - k*IQR, q3 + k*IQR].
# k is 1.5 everywhere except BloodPressure, which uses a tighter 0.75.
# The filter is applied column by column, so each column's quartiles are
# computed on the rows that survived the previous columns.
for col in columns:
    q1, q3 = np.percentile(clean_data[col], [25, 75])
    iqr = q3 - q1

    if col == 'BloodPressure':
        factor = 0.75
    else:
        factor = 1.5

    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Boolean mask of the rows inside the fence (bounds inclusive).
    within_fence = (clean_data[col] >= lower_bound) & (clean_data[col] <= upper_bound)
    clean_data = clean_data[within_fence]

# Report how many rows were dropped.
print("Original shape:", df.shape)
print("Cleaned shape:", clean_data.shape)

In [None]:
# Draw one horizontal box plot per column of clean_data to check the result
# of the outlier filtering.
# The subplot count gets its own name: `num_cols` holds the column labels
# used by the scaling cells, and rebinding it to an int here would make
# those cells fail when re-run after this one.
n_plots = len(clean_data.columns)
fig, axs = plt.subplots(n_plots, 1, dpi=95, figsize=(7, 2 * n_plots))
# plt.subplots returns a bare Axes (not an array) when there is only one
# subplot; normalise to a 1-D array so axs[i] works for any column count.
axs = np.atleast_1d(axs)

for i, col in enumerate(clean_data.columns):
    axs[i].boxplot(clean_data[col], vert=False)
    axs[i].set_ylabel(col)

plt.tight_layout()
plt.show()
