In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from imblearn.over_sampling import RandomOverSampler


# Load the dataset
data = pd.read_csv("dataset/diabetes_012_health_indicators_BRFSS2015.csv")

# Separate features and target variable
X = data.drop(columns=["Diabetes_012"])
y = data["Diabetes_012"]

Preprocessing

In [None]:
# Handle Missing Values
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(X)
X = pd.DataFrame(X_imputed, columns=X.columns)

# Scale Numerical Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [21]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression model
logreg = LogisticRegression()

# Train the model
logreg.fit(X_train, y_train)

# Predictions on the testing set
y_pred = logreg.predict(X_test)

# Model evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8482537054556922
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.98      0.92     42795
         1.0       0.00      0.00      0.00       944
         2.0       0.54      0.18      0.27      6997

    accuracy                           0.85     50736
   macro avg       0.47      0.39      0.40     50736
weighted avg       0.80      0.85      0.81     50736



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average='weighted')

0.8041560118322353

Preprocessing

In [20]:
# delete income too due to inapplicability for future predictions
X = data.drop(['Diabetes_012', 'Income'], axis=1)
y = data['Diabetes_012']

In [3]:
# Drop columns 'Diabetes_012', 'Education', 'Income', and 'DiffWalk' from features
X = data.drop(['Diabetes_012', 'Education', 'Income', 'DiffWalk'], axis=1)

# Target variable
y = data['Diabetes_012']

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=16)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Drop columns 'Diabetes_012', 'Education', 'Income', and 'DiffWalk' from features
X = data.drop(['Diabetes_012', 'Education', 'Income', 'DiffWalk'], axis=1)

# Target variable
y = data['Diabetes_012']

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

# Handling Missing Values
imputer = SimpleImputer(strategy="mean")
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Encoding Categorical Variables (if any)
# Assuming 'Sex' is a categorical variable
encoder = OneHotEncoder(drop='first')
X_train_encoded = encoder.fit_transform(X_train_scaled[:, [16]]).toarray()
X_test_encoded = encoder.transform(X_test_scaled[:, [16]]).toarray()

# Concatenating encoded features with numerical features
X_train_final = pd.concat([pd.DataFrame(X_train_scaled[:, :16]), pd.DataFrame(X_train_encoded)], axis=1)
X_test_final = pd.concat([pd.DataFrame(X_test_scaled[:, :16]), pd.DataFrame(X_test_encoded)], axis=1)

# Handling Imbalanced Data (if needed)
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train_final, y_train)


In [35]:
X.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,Sex,Age
0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,18.0,15.0,0.0,9.0
1,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,7.0
2,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,30.0,30.0,0.0,9.0
3,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,11.0
4,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,11.0


In [13]:
# Initialize the ANN
model = Sequential()

# Add the input layer and first hidden layer
model.add(Dense(units=64, activation='relu', input_dim=X_train_scaled.shape[1]))
model.add(Dropout(0.5))  # Dropout regularization to prevent overfitting

# Add the second hidden layer
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.5))

# Add the output layer
model.add(Dense(units=1, activation='sigmoid'))  # Sigmoid activation for binary classification

# Compile the ANN
model.compile(optimizer=Adam(learning_rate=0.1), loss='binary_crossentropy', metrics=['accuracy'])

# Train the ANN
model.fit(X_train_scaled, y_train, batch_size=32, epochs=50, validation_split=0.1)

# Evaluate the ANN on the test set
loss, accuracy = model.evaluate(X_test_scaled, y_test)
print('Test Accuracy:', accuracy)

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6778/6778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - accuracy: 0.8275 - loss: 0.8435 - val_accuracy: 0.8419 - val_loss: 0.6128
Epoch 2/50
[1m6778/6778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - accuracy: 0.8428 - loss: 0.6316 - val_accuracy: 0.8419 - val_loss: 0.6136
Epoch 3/50
[1m6778/6778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - accuracy: 0.8413 - loss: 1.8969 - val_accuracy: 0.8419 - val_loss: 0.6091
Epoch 4/50
[1m6778/6778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - accuracy: 0.8430 - loss: 0.6988 - val_accuracy: 0.8419 - val_loss: 0.6108
Epoch 5/50
[1m6778/6778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - accuracy: 0.8427 - loss: 0.6125 - val_accuracy: 0.8419 - val_loss: 0.6202
Epoch 6/50
[1m6778/6778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - accuracy: 0.8423 - loss: 0.6243 - val_accuracy: 0.8419 - val_loss: 0.6117
Epoch 7/50
[1m6778/6778[0