In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw diabetes dataset; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [3]:
# Quick look at the freshly loaded frame: its dimensions and the first rows.
# A single print with a newline separator emits the same text as four prints.
print(
    "Dataset Information:",
    f"Shape: {df.shape}",
    "\nFirst 5 rows:",
    df.head(),
    sep="\n",
)


Dataset Information:
Shape: (768, 9)

First 5 rows:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [4]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
summary_stats = df.describe()
print("\nDataset description:", summary_stats, sep="\n")


Dataset description:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.00

In [5]:
# Count explicit NaN entries per column. Zeros that merely stand in for a
# missing measurement are not detected by this check.
nan_counts = df.isna().sum()
print("\nMissing values:", nan_counts, sep="\n")


Missing values:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [6]:
# Report, for each measurement column, how many entries are literally zero
# and what share of all rows that represents.
print("\nZero values in columns where zero might represent missing data:")
total_rows = len(df)
for feature in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'):
    n_zeros = int(df[feature].eq(0).sum())
    print(f"{feature}: {n_zeros} zeros ({n_zeros/total_rows*100:.2f}%)")


Zero values in columns where zero might represent missing data:
Glucose: 5 zeros (0.65%)
BloodPressure: 35 zeros (4.56%)
SkinThickness: 227 zeros (29.56%)
Insulin: 374 zeros (48.70%)
BMI: 11 zeros (1.43%)


In [8]:
# Treat literal zeros in these measurement columns as missing by turning
# them into NaN; done for all listed columns in one vectorised call.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_as_missing] = df[zero_as_missing].replace(to_replace=0, value=np.nan)

In [9]:
# Re-count NaN entries now that placeholder zeros have been converted.
nan_counts_after = df.isna().sum()
print("\nMissing values after replacing zeros with NaN:", nan_counts_after, sep="\n")


Missing values after replacing zeros with NaN:
Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


In [10]:
# Fill the gaps in each measurement column with that column's own median.
# The medians are computed once, before any column is filled, and fillna
# matches them to columns by name.
imputed_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
column_medians = df[imputed_columns].median()
df[imputed_columns] = df[imputed_columns].fillna(column_medians)

# Sanity check: every per-column NaN count should now be zero.
print("\nChecking if any missing values remain:", df.isna().sum(), sep="\n")


Checking if any missing values remain:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [11]:
# Bucket Age into the labelled groups below.
# pd.cut builds right-closed intervals by default, so the first bin would be
# (20, 30] and an age of exactly 20 would match no bin (NaN) even though the
# label reads '20-30'. include_lowest=True closes the left edge of the first
# bin so 20 is included. Ages below 20 or above 100 still map to NaN.
age_bins = [20, 30, 40, 50, 60, 100]
age_labels = ['20-30', '31-40', '41-50', '51-60', '61+']
df['AgeGroup'] = pd.cut(
    df['Age'],
    bins=age_bins,
    labels=age_labels,
    include_lowest=True,
)

In [12]:
# One-hot encode the age bucket (columns are named 'Age_<label>') and append
# the indicator columns to the right of the existing frame.
age_dummies = df['AgeGroup'].pipe(pd.get_dummies, prefix='Age')
df = pd.concat(objs=[df, age_dummies], axis='columns')

In [13]:
# Bucket BMI into weight-status categories.
# The cut-offs 18.5 / 25 / 30 / 35 are each the START of the next category
# (e.g. BMI 25.0 is 'Overweight', 30.0 is 'Obese_Class_1'). pd.cut's default
# right-closed bins would put those boundary values in the lower category,
# so right=False is used to get left-closed bins:
# [0, 18.5), [18.5, 25), [25, 30), [30, 35), [35, 100).
# Values outside [0, 100) map to NaN.
bmi_bins = [0, 18.5, 25, 30, 35, 100]
bmi_labels = ['Underweight', 'Normal', 'Overweight', 'Obese_Class_1', 'Obese_Class_2+']
df['BMICategory'] = pd.cut(
    df['BMI'],
    bins=bmi_bins,
    labels=bmi_labels,
    right=False,
)

In [14]:
# One-hot encode the BMI category (columns are named 'BMI_<label>') and
# append the indicator columns to the right of the existing frame.
bmi_dummies = df['BMICategory'].pipe(pd.get_dummies, prefix='BMI')
df = pd.concat(objs=[df, bmi_dummies], axis='columns')

In [15]:
# Bucket Glucose into the four labelled ranges.
# The cut-offs 70 / 100 / 126 are each the START of the next category (a
# reading of 126 is 'Diabetes', 100 is 'Prediabetes', 70 is 'Normal').
# pd.cut's default right-closed bins would put those boundary readings in
# the lower category, so right=False is used to get left-closed bins:
# [0, 70), [70, 100), [100, 126), [126, 300).
# NOTE(review): these thresholds look like fasting-glucose cut-offs; confirm
# they suit the measurement stored in this column.
glucose_bins = [0, 70, 100, 126, 300]
glucose_labels = ['Low', 'Normal', 'Prediabetes', 'Diabetes']
df['GlucoseCategory'] = pd.cut(
    df['Glucose'],
    bins=glucose_bins,
    labels=glucose_labels,
    right=False,
)

In [16]:
# One-hot encode the glucose category (columns are named 'Glucose_<label>')
# and append the indicator columns to the right of the existing frame.
glucose_dummies = df['GlucoseCategory'].pipe(pd.get_dummies, prefix='Glucose')
df = pd.concat(objs=[df, glucose_dummies], axis='columns')

In [17]:
# Hand-weighted composite score: each input is divided by a fixed reference
# value and multiplied by its weight (weights sum to 1.0). The terms are
# added left to right in the order Glucose, BMI, Age, Insulin, Pedigree.
glucose_term = df['Glucose'].div(100).mul(0.3)
bmi_term = df['BMI'].div(30).mul(0.25)
age_term = df['Age'].div(50).mul(0.15)
insulin_term = df['Insulin'].div(100).mul(0.15)
pedigree_term = df['DiabetesPedigreeFunction'].mul(0.15)
df['DiabetesRiskScore'] = glucose_term + bmi_term + age_term + insulin_term + pedigree_term

# Product of BMI and Age, divided by 100.
df['BMI_Age_Interaction'] = df['BMI'].mul(df['Age']).div(100)

# Insulin relative to Glucose.
df['Insulin_Glucose_Ratio'] = df['Insulin'].div(df['Glucose'])

# Blood pressure relative to Age.
df['BP_Age_Ratio'] = df['BloodPressure'].div(df['Age'])

In [18]:
# Columns to standardise: the original measurements followed by the
# engineered features. The one-hot indicators and Outcome are not included.
raw_features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
engineered_features = ['DiabetesRiskScore', 'BMI_Age_Interaction',
                       'Insulin_Glucose_Ratio', 'BP_Age_Ratio']
numerical_columns = raw_features + engineered_features

# Fit the scaler on the whole frame, then overwrite the columns in place with
# their standardised values. Because the fit uses every row, the learned
# statistics include any rows that are later held out for evaluation.
scaler = StandardScaler()
scaler.fit(df[numerical_columns])
df[numerical_columns] = scaler.transform(df[numerical_columns])

In [19]:
# The categorical helper columns are already represented by their one-hot
# indicator columns, so remove them from the frame.
df = df.drop(columns=['AgeGroup', 'BMICategory', 'GlucoseCategory'])

In [20]:
# Write the processed frame without the index column. The target directory
# is relative to the notebook's working directory and must already exist.
df.to_csv(path_or_buf='../data/diabetes_preprocessed.csv', index=False)

# Summarise what was written: final dimensions and the full column list.
print(
    "\nPreprocessed dataset information:",
    f"Shape: {df.shape}",
    "\nColumns in the preprocessed dataset:",
    df.columns.tolist(),
    sep="\n",
)


Preprocessed dataset information:
Shape: (768, 27)

Columns in the preprocessed dataset:
['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Age_20-30', 'Age_31-40', 'Age_41-50', 'Age_51-60', 'Age_61+', 'BMI_Underweight', 'BMI_Normal', 'BMI_Overweight', 'BMI_Obese_Class_1', 'BMI_Obese_Class_2+', 'Glucose_Low', 'Glucose_Normal', 'Glucose_Prediabetes', 'Glucose_Diabetes', 'DiabetesRiskScore', 'BMI_Age_Interaction', 'Insulin_Glucose_Ratio', 'BP_Age_Ratio']


In [21]:
# Correlation of every scaled numeric feature with the target, sorted from
# most positive to most negative. Outcome itself appears first with 1.0.
corr_matrix = df[[*numerical_columns, 'Outcome']].corr()
correlations = corr_matrix['Outcome'].sort_values(ascending=False)
print("\nFeature correlations with the target variable (Outcome):", correlations, sep="\n")


Feature correlations with the target variable (Outcome):
Outcome                     1.000000
Glucose                     0.492782
DiabetesRiskScore           0.458755
BMI_Age_Interaction         0.362990
BMI                         0.312038
Age                         0.238356
Pregnancies                 0.221898
SkinThickness               0.214873
Insulin                     0.203790
DiabetesPedigreeFunction    0.173844
BloodPressure               0.165723
Insulin_Glucose_Ratio       0.013639
BP_Age_Ratio               -0.205416
Name: Outcome, dtype: float64
