In [None]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential

In [None]:
# Load the raw diabetes dataset. The path sits under /content/drive, which looks
# like a Colab Google Drive mount -- presumably mounted before this cell runs.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Datasets/diabetes_dataset.csv"
)


In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [None]:
# Print each column's dtype and non-null count (text output; returns None).
df.info()
# Summary statistics for the numeric columns. As the last expression in the
# cell, this is the table the notebook renders.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [None]:
# Swap the gender strings for integer codes. In the preview below, "Female"
# rows become 0 and "Male" rows become 1.
le = LabelEncoder().fit(df["gender"])

df["gender"] = le.transform(df["gender"])
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,never,25.19,6.6,140,0
1,0,54.0,0,0,No Info,27.32,6.6,80,0
2,1,28.0,0,0,never,27.32,5.7,158,0
3,0,36.0,0,0,current,23.45,5.0,155,0
4,1,76.0,1,1,current,20.14,4.8,155,0


In [None]:
# Count rows per smoking_history category to see which levels exist before
# the column is one-hot encoded.
df["smoking_history"].value_counts()

Unnamed: 0_level_0,count
smoking_history,Unnamed: 1_level_1
No Info,35816
never,35095
former,9352
current,9286
not current,6447
ever,4004


In [None]:
# One-hot encode smoking_history. drop='first' leaves out the first category
# ("No Info" in this data, judging by the resulting columns), so a row with all
# indicator columns at 0 represents that baseline level.
ohe = OneHotEncoder(drop='first',sparse_output=False)

smoking_encoded = ohe.fit_transform(df[['smoking_history']])
smoking_cols = ohe.get_feature_names_out(['smoking_history'])

# Build the indicator frame on df's own index. pd.concat(axis=1) aligns on
# index labels, so a default RangeIndex here would silently misalign rows (and
# introduce NaNs) if df had been filtered or re-indexed before this cell.
smoking_df = pd.DataFrame(smoking_encoded, columns=smoking_cols, index=df.index)

# Append the indicator columns and remove the original string column.
df = pd.concat([df, smoking_df], axis=1)
df = df.drop(columns="smoking_history")
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,0,80.0,0,1,25.19,6.6,140,0,0.0,0.0,0.0,1.0,0.0
1,0,54.0,0,0,27.32,6.6,80,0,0.0,0.0,0.0,0.0,0.0
2,1,28.0,0,0,27.32,5.7,158,0,0.0,0.0,0.0,1.0,0.0
3,0,36.0,0,0,23.45,5.0,155,0,1.0,0.0,0.0,0.0,0.0
4,1,76.0,1,1,20.14,4.8,155,0,1.0,0.0,0.0,0.0,0.0


In [None]:
# Target is the 0/1 diabetes flag; features are every remaining column.
y = df["diabetes"]
x = df.drop(columns=["diabetes"])

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both parts, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
# Take explicit copies before writing scaled values back. The frames returned
# by train_test_split are selections of x, and assigning columns into them can
# raise SettingWithCopyWarning depending on the pandas/scikit-learn versions.
x_train = x_train.copy()
x_test = x_test.copy()

scaler = MinMaxScaler()
scaled_columns = ["age","bmi","HbA1c_level","blood_glucose_level"]

# Fit the min/max on the training rows only, then reuse those statistics for
# the test rows so nothing about the test set leaks into preprocessing.
x_train[scaled_columns] = scaler.fit_transform(x_train[scaled_columns])
x_test[scaled_columns] = scaler.transform(x_test[scaled_columns])


In [None]:
# Stop training once val_loss has gone 15 epochs without improving, and
# restore the weights from the best epoch seen.
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=15,
    restore_best_weights=True,
)


In [None]:
# Build the Model

# Fully connected binary classifier: five ReLU hidden layers that narrow from
# 128 to 8 units, with dropout that tapers from 0.4 to 0.1, ending in a single
# sigmoid unit that outputs a probability.
#
# The input width is declared with an explicit Input layer. Passing
# `input_shape` to the first Dense layer of a Sequential model is discouraged
# by Keras and emits a UserWarning when the model is built.
model = Sequential([
    Input(shape=(x_train.shape[1],)),
    Dense(128, activation="relu"),
    Dropout(0.4),
    Dense(64, activation = "relu"),
    Dropout(0.3),
    Dense(32, activation = "relu"),
    Dropout(0.2),
    Dense(16, activation = "relu"),
    Dropout(0.1),
    Dense(8, activation = "relu"),
    Dense(1, activation = "sigmoid")
    ])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Compile the Model

# Binary cross-entropy matches the single sigmoid output unit; accuracy is
# tracked alongside the loss for reporting.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the Model

# Keras holds out 20% of the training rows for validation (validation_split).
# 400 epochs is only an upper bound: the early_stop callback monitors val_loss
# and can end the run sooner.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=400,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1,
)

Epoch 1/400
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8675 - loss: 0.3891 - val_accuracy: 0.9549 - val_loss: 0.1239
Epoch 2/400
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9512 - loss: 0.1423 - val_accuracy: 0.9581 - val_loss: 0.1154
Epoch 3/400
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9557 - loss: 0.1292 - val_accuracy: 0.9597 - val_loss: 0.1139
Epoch 4/400
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9583 - loss: 0.1217 - val_accuracy: 0.9582 - val_loss: 0.1122
Epoch 5/400
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9595 - loss: 0.1178 - val_accuracy: 0.9603 - val_loss: 0.1093
Epoch 6/400
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9605 - loss: 0.1157 - val_accuracy: 0.9610 - val_loss: 0.1073
Epoch 7/400
[1m500/50

In [None]:
# Score the trained model on the held-out test rows. With one compiled metric,
# evaluate() returns the loss followed by the accuracy.
# NOTE(review): only about 8.5% of rows are positive (mean of `diabetes` in the
# describe() output), so always predicting 0 would already score about 0.915.
# Read this accuracy against that baseline.
loss, accuracy = model.evaluate(x=x_test, y=y_test)

print(f"The accuracy is {accuracy:.3f}")

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9727 - loss: 0.0788
The accuracy is 0.972


In [None]:
# Save the network in Keras' native format. This is the supported way to
# persist a Keras model and does not rely on pickling Python objects.
model.save("Diabetes_Predictor.keras")

# Save the fitted preprocessing objects too. Without the gender encoder, the
# smoking_history one-hot encoder and the MinMax scaler, the saved model cannot
# be applied to raw records at inference time.
joblib.dump(
    {
        "gender_encoder": le,
        "smoking_encoder": ohe,
        "scaler": scaler,
        "scaled_columns": scaled_columns,
        "feature_columns": list(x_train.columns),
    },
    "Diabetes_Preprocessors.pkl",
)

# Keep the original pickle artifact so existing consumers of
# Diabetes_Predictor.pkl continue to work. Only load pickle files from trusted
# sources, since unpickling can execute arbitrary code.
joblib.dump(model, "Diabetes_Predictor.pkl")

['Diabetes_Predictor.pkl']