In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
df = pd.read_csv('C:\\Python Intern\\athiraanil\\DIABETES_PROJECT\\raw\\cleaned_diabetes.csv')

cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols] = df[cols].replace(0, np.nan)

print(df.isnull().sum())



Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


In [21]:

df['Glucose'] = df['Glucose'].fillna(df['Glucose'].median())
df['BloodPressure'].fillna(df['BloodPressure'].median())
df['SkinThickness'].fillna(df['SkinThickness'].median())
df['Insulin'].fillna(df['Insulin'].median())
df['BMI'].fillna(df['BMI'].median())


0      0.166619
1     -0.852200
2     -1.332500
3     -0.633881
4      1.549303
         ...   
763    0.064737
764    0.632365
765   -0.910418
766   -0.342790
767   -0.299127
Name: BMI, Length: 768, dtype: float64

In [25]:
cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

for col in cols_with_zero:
    df[col] = df[col].replace(0, df[col].median())
print(df.isnull().sum())  

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


In [26]:
df["Glucose"] = df["Glucose"].fillna(df["Glucose"].median())
df["BloodPressure"] = df["BloodPressure"].fillna(df["BloodPressure"].median())
df["BMI"] = df["BMI"].fillna(df["BMI"].median())
df["Insulin"] = df["Insulin"].fillna(df["Insulin"].median())
df["SkinThickness"] = df["SkinThickness"].fillna(df["SkinThickness"].median())

In [7]:
print(df.isnull().sum()) 


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [27]:
cols_with_nan = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df.fillna({col: df[col].median() for col in cols_with_nan}, inplace=True)



In [28]:
#Feature Creation
# Glucose Cat
df.loc[df['Glucose'] < 70, 'GLUCOSE_CAT'] = 'hypoglycemia'
df.loc[(df['Glucose'] >= 70) & (df['Glucose'] < 100), 'GLUCOSE_CAT'] = 'normal'
df.loc[(df['Glucose'] >= 100) & (df['Glucose'] < 126), 'GLUCOSE_CAT'] = 'impaired'
df.loc[df['Glucose'] >= 126, 'GLUCOSE_CAT'] = 'hyperglycemia'
print(df.groupby("GLUCOSE_CAT")["Outcome"].mean())
print(df["GLUCOSE_CAT"].value_counts())

GLUCOSE_CAT
hyperglycemia    0.592593
hypoglycemia     0.000000
impaired         0.279570
normal           0.077348
Name: Outcome, dtype: float64
GLUCOSE_CAT
hyperglycemia    297
impaired         279
normal           181
hypoglycemia      11
Name: count, dtype: int64


In [29]:
df.loc[df['BMI'] < 16, 'BMI_CAT'] = 'underweight'
df.loc[(df['BMI'] >= 16) & (df['BMI'] < 18.5), 'BMI_CAT'] = 'thin'
df.loc[(df['BMI'] >= 18.5) & (df['BMI'] < 25), 'BMI_CAT'] = 'normal'
df.loc[(df['BMI'] >= 25) & (df['BMI'] < 30), 'BMI_CAT'] = 'overweight'
df.loc[(df['BMI'] >= 30) & (df['BMI'] < 35), 'BMI_CAT'] = 'obese_1'
df.loc[(df['BMI'] >= 35) & (df['BMI'] < 45), 'BMI_CAT'] = 'obese_2'
df.loc[df['BMI'] >= 45, 'BMI_CAT'] = 'obese_3'
print(df.groupby("BMI_CAT")["Outcome"].mean())
print(df["BMI_CAT"].value_counts())

BMI_CAT
normal        0.068627
obese_1       0.438298
obese_2       0.452830
obese_3       0.611111
overweight    0.223464
thin          0.000000
Name: Outcome, dtype: float64
BMI_CAT
obese_1       235
obese_2       212
overweight    179
normal        102
obese_3        36
thin            4
Name: count, dtype: int64


In [30]:
df.loc[(df['Age'] >= 18) & (df['Age'] < 30), 'AGE_CAT'] = "young"
df.loc[(df['Age'] >= 30) & (df['Age'] < 45), 'AGE_CAT'] = "mature"
df.loc[(df['Age'] >= 45) & (df['Age'] < 65), 'AGE_CAT'] = "middle_age"
df.loc[(df['Age'] >= 65), 'AGE_CAT'] = "old"
print(df.groupby("AGE_CAT")["Outcome"].mean())
print(df["AGE_CAT"].value_counts())


AGE_CAT
mature        0.493724
middle_age    0.529915
old           0.250000
young         0.212121
Name: Outcome, dtype: float64
AGE_CAT
young         396
mature        239
middle_age    117
old            16
Name: count, dtype: int64


In [31]:
df.loc[df['BloodPressure'] < 70, 'DIASTOLIC_CAT'] = 'low'
df.loc[(df['BloodPressure'] >= 70) & (df['BloodPressure'] < 90), 'DIASTOLIC_CAT'] = 'normal'
df.loc[df['BloodPressure'] >= 90, 'DIASTOLIC_CAT'] = 'high'
print(df.groupby("DIASTOLIC_CAT")["Outcome"].mean())
print(df["DIASTOLIC_CAT"].value_counts())


DIASTOLIC_CAT
high      0.483333
low       0.247350
normal    0.397647
Name: Outcome, dtype: float64
DIASTOLIC_CAT
normal    425
low       283
high       60
Name: count, dtype: int64


In [32]:
df.loc[(df['Insulin'] < 120), 'INSULIN_CAT'] = "normal"
df.loc[(df['Insulin'] >= 120), 'INSULIN_CAT'] = "abnormal"
print(df.groupby("INSULIN_CAT")["Outcome"].mean())
print(df["INSULIN_CAT"].value_counts())


INSULIN_CAT
abnormal    0.414384
normal      0.141304
Name: Outcome, dtype: float64
INSULIN_CAT
abnormal    584
normal      184
Name: count, dtype: int64


In [33]:
df.loc[(df['Pregnancies'] == 0), 'PREG_CAT'] = "unpregnant"
df.loc[(df['Pregnancies'] > 0) & (df['Pregnancies'] <= 5), 'PREG_CAT'] = "normal"
df.loc[(df['Pregnancies'] > 5) & (df['Pregnancies'] <= 10), 'PREG_CAT'] = "high"
df.loc[(df['Pregnancies'] > 10), 'PREG_CAT'] = "very high"
print(df.groupby("PREG_CAT")["Outcome"].mean())
print(df["PREG_CAT"].value_counts())


PREG_CAT
high          0.491892
normal        0.271689
unpregnant    0.342342
very high     0.588235
Name: Outcome, dtype: float64
PREG_CAT
normal        438
high          185
unpregnant    111
very high      34
Name: count, dtype: int64


In [None]:
# Label encoding 
le = LabelEncoder()
binary_cols = [col for col in df.columns if df[col].dtype == 'object' and df[col].nunique() == 2]
for col in binary_cols:
    df[col] = le.fit_transform(df[col])
print(df[binary_cols].head())

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


In [34]:

# One hot encding
ohe_cols = [col for col in df.columns if df[col].dtype == 'object' and df[col].nunique() > 2]
df = pd.get_dummies(df, columns=ohe_cols, drop_first=True)
print(df.head()) 

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6    148.0           72.0           35.0    125.0  33.6   
1            1     85.0           66.0           29.0    125.0  26.6   
2            8    183.0           64.0           29.0    125.0  23.3   
3            1     89.0           66.0           23.0     94.0  28.1   
4            0    137.0           40.0           35.0    168.0  43.1   

   DiabetesPedigreeFunction  Age  Outcome INSULIN_CAT  ...  \
0                     0.627   50        1    abnormal  ...   
1                     0.351   31        0    abnormal  ...   
2                     0.672   32        1    abnormal  ...   
3                     0.167   21        0      normal  ...   
4                     2.288   33        1    abnormal  ...   

   BMI_CAT_overweight  BMI_CAT_thin  AGE_CAT_middle_age  AGE_CAT_old  \
0               False         False                True        False   
1                True         False               

In [17]:
#standardization
num_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 
            'SkinThickness', 'Insulin', 'BMI', 
            'DiabetesPedigreeFunction', 'Age']


In [18]:
# Scaling
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop('Outcome')
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
print(df.head())
print(df.columns)


   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0     0.639947  0.866045      -0.031990       0.670643 -0.181541  0.166619   
1    -0.844885 -1.205066      -0.528319      -0.012301 -0.181541 -0.852200   
2     1.233880  2.016662      -0.693761      -0.012301 -0.181541 -1.332500   
3    -0.844885 -1.073567      -0.528319      -0.695245 -0.540642 -0.633881   
4    -1.141852  0.504422      -2.679076       0.670643  0.316566  1.549303   

   DiabetesPedigreeFunction       Age  Outcome  INSULIN_CAT  ...  \
0                  0.468492  1.425995        1    -0.561310  ...   
1                 -0.365061 -0.190672        0    -0.561310  ...   
2                  0.604397 -0.105584        1    -0.561310  ...   
3                 -0.920763 -1.041549        0     1.781548  ...   
4                  5.484909 -0.020496        1    -0.561310  ...   

   BMI_CAT_overweight  BMI_CAT_thin  AGE_CAT_middle_age  AGE_CAT_old  \
0               False         False               

In [19]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,INSULIN_CAT,...,BMI_CAT_overweight,BMI_CAT_thin,AGE_CAT_middle_age,AGE_CAT_old,AGE_CAT_young,DIASTOLIC_CAT_low,DIASTOLIC_CAT_normal,PREG_CAT_normal,PREG_CAT_unpregnant,PREG_CAT_very high
0,0.639947,0.866045,-0.03199,0.670643,-0.181541,0.166619,0.468492,1.425995,1,-0.56131,...,False,False,True,False,False,False,True,False,False,False
1,-0.844885,-1.205066,-0.528319,-0.012301,-0.181541,-0.8522,-0.365061,-0.190672,0,-0.56131,...,True,False,False,False,False,True,False,True,False,False
2,1.23388,2.016662,-0.693761,-0.012301,-0.181541,-1.3325,0.604397,-0.105584,1,-0.56131,...,False,False,False,False,False,True,False,False,False,False
3,-0.844885,-1.073567,-0.528319,-0.695245,-0.540642,-0.633881,-0.920763,-1.041549,0,1.781548,...,True,False,False,False,True,True,False,True,False,False
4,-1.141852,0.504422,-2.679076,0.670643,0.316566,1.549303,5.484909,-0.020496,1,-0.56131,...,False,False,False,False,False,True,False,False,True,False
