In [1]:
# Import our dependencies
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import tensorflow as tf


# Load in CSV as a Dataframe
dm_pd = pd.read_csv("https://raw.githubusercontent.com/ComfyKoala/diabetes-classification/main/JV/Resources/diabetes_dataset.csv")

In [2]:
# Preview Dataframe
dm_pd.head()

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,never,27.32,5.0,100,0
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,never,19.95,5.0,90,0
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,never,23.76,4.8,160,0
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,never,27.32,4.0,159,0
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,never,23.75,6.5,90,0


In [3]:
# Dropping smoking_history due to vagueness
dm_pd = dm_pd.drop(['smoking_history', 'hbA1c_level', 'blood_glucose_level'], axis=1)

dm_pd.head()

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,bmi,diabetes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,27.32,0
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,19.95,0
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,23.76,0
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,27.32,0
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,23.75,0


In [4]:
# Replacing gender with binary values
dm_pd =  dm_pd.replace({"Male": 1, "Female":0, "Other":2})
dm_pd.head()

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,bmi,diabetes
0,2020,0,32.0,Alabama,0,0,0,0,1,0,0,27.32,0
1,2015,0,29.0,Alabama,0,1,0,0,0,0,0,19.95,0
2,2015,1,18.0,Alabama,0,0,0,0,1,0,0,23.76,0
3,2015,1,41.0,Alabama,0,0,1,0,0,0,0,27.32,0
4,2016,0,52.0,Alabama,1,0,0,0,0,0,0,23.75,0


In [5]:
# Create a copy of the original DataFrame to avoid modifying the original data
df = dm_pd.copy()


# Perform one-hot encoding on the 'location' column
one_hot_encoded = pd.get_dummies(df['location'], prefix='state')

# Concatenate the one-hot encoded columns with the original DataFrame
df = pd.concat([df, one_hot_encoded], axis=1)

# Drop the original 'location' column
df.drop(['location'], axis=1, inplace=True)

df = df.replace({False: 0, True:1, "Male":1, "Female":0, "Other":2})

df.head()

Unnamed: 0,year,gender,age,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,...,state_Texas,state_United States,state_Utah,state_Vermont,state_Virgin Islands,state_Virginia,state_Washington,state_West Virginia,state_Wisconsin,state_Wyoming
0,2020,0,32.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2015,0,29.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2015,1,18.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2015,1,41.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2016,0,52.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df.columns

Index(['year', 'gender', 'age', 'race:AfricanAmerican', 'race:Asian',
       'race:Caucasian', 'race:Hispanic', 'race:Other', 'hypertension',
       'heart_disease', 'bmi', 'diabetes', 'state_Alabama', 'state_Alaska',
       'state_Arizona', 'state_Arkansas', 'state_California', 'state_Colorado',
       'state_Connecticut', 'state_Delaware', 'state_District of Columbia',
       'state_Florida', 'state_Georgia', 'state_Guam', 'state_Hawaii',
       'state_Idaho', 'state_Illinois', 'state_Indiana', 'state_Iowa',
       'state_Kansas', 'state_Kentucky', 'state_Louisiana', 'state_Maine',
       'state_Maryland', 'state_Massachusetts', 'state_Michigan',
       'state_Minnesota', 'state_Mississippi', 'state_Missouri',
       'state_Montana', 'state_Nebraska', 'state_Nevada',
       'state_New Hampshire', 'state_New Jersey', 'state_New Mexico',
       'state_New York', 'state_North Carolina', 'state_North Dakota',
       'state_Ohio', 'state_Oklahoma', 'state_Oregon', 'state_Pennsylvania',
  

In [7]:
# Get the target variables
target = df['diabetes']
target_names = ["0","1"]

In [8]:
#Get the features
data = df.drop("diabetes", axis=1)

feature_names = data.columns

data.head()

Unnamed: 0,year,gender,age,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,...,state_Texas,state_United States,state_Utah,state_Vermont,state_Virgin Islands,state_Virginia,state_Washington,state_West Virginia,state_Wisconsin,state_Wyoming
0,2020,0,32.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2015,0,29.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2015,1,18.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2015,1,41.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2016,0,52.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

# SVM

In [10]:
# Support vector machine linear classifier
model = SVC(kernel='linear')
model.fit(X_train, y_train)

In [11]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.916


In [12]:
# Calculate the classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     22895
           1       0.50      0.00      0.01      2105

    accuracy                           0.92     25000
   macro avg       0.71      0.50      0.48     25000
weighted avg       0.88      0.92      0.88     25000



# Identifying Strong/Weak Features

In [13]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = 66
layer1 = 9
layer2 = 3
layer3 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=layer1, input_dim=number_input_features, activation='relu')
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=layer2, activation='sigmoid'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=layer2, activation='sigmoid'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [19]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [16]:
!pip install scikeras scikit-learn
!pip install --upgrade scipy scikit-learn scikeras



In [20]:

from scikeras.wrappers import KerasClassifier
from sklearn.inspection import permutation_importance

# Ensure that nn is defined as your Keras model
# wrapped_nn = KerasClassifier(model=nn, epochs=20, batch_size=32, verbose=0)

# Define the wrapped model (assuming nn is your Keras Sequential model)
wrapped_nn = KerasClassifier(model=nn, epochs=20, batch_size=32, verbose=0)

# Train the wrapped model
wrapped_nn.fit(X_train_scaled, y_train)

# Compute permutation importance
result = permutation_importance(wrapped_nn, X_test_scaled, y_test, n_repeats=10, random_state=10)

# Get feature importances
importances = result.importances_mean

# Ensure X_test_scaled is a DataFrame or convert it
if not isinstance(X_test_scaled, pd.DataFrame):
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Sort the features by importance in ascending order (weakest to strongest)
sorted_indices = np.argsort(importances)

# Get the top 20 weakest features
top_20_weakest_indices = sorted_indices[:20]
top_20_weakest_features = X_test_scaled.columns[top_20_weakest_indices]
top_20_weakest_importances = importances[top_20_weakest_indices]

# Display the weakest features and their importances
for feature, importance in zip(top_20_weakest_features, top_20_weakest_importances):
    print(f"Weakest feature: {feature} with importance {importance}")

Weakest feature: state_New Mexico with importance -0.00020800000000004149
Weakest feature: state_North Carolina with importance -0.0001560000000000561
Weakest feature: state_Kentucky with importance -0.0001240000000000463
Weakest feature: race:AfricanAmerican with importance -0.00012000000000003119
Weakest feature: state_Maryland with importance -0.0001160000000000605
Weakest feature: state_Arkansas with importance -9.60000000000516e-05
Weakest feature: state_Washington with importance -8.40000000000285e-05
Weakest feature: state_West Virginia with importance -8.000000000005781e-05
Weakest feature: race:Asian with importance -8.00000000000356e-05
Weakest feature: state_California with importance -7.200000000006091e-05
Weakest feature: state_Indiana with importance -7.200000000004981e-05
Weakest feature: state_Hawaii with importance -7.20000000000276e-05
Weakest feature: state_Pennsylvania with importance -6.40000000000529e-05
Weakest feature: state_Oregon with importance -6.00000000000

In [21]:
# Get feature importances
importances = result.importances_mean

# Ensure X_test_scaled is a DataFrame or convert it
if not isinstance(X_test_scaled, pd.DataFrame):
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Sort the features by importance in descending order (strongest to weakest)
sorted_indices = np.argsort(importances)[::-1]

# Get the top 20 strongest features
top_20_strongest_indices = sorted_indices[:20]
top_20_strongest_features = X_test_scaled.columns[top_20_strongest_indices]
top_20_strongest_importances = importances[top_20_strongest_indices]

# Display the strongest features and their importances
for feature, importance in zip(top_20_strongest_features, top_20_strongest_importances):
    print(f"Strongest feature: {feature} with importance {importance}")

Strongest feature: bmi with importance 0.0017159999999999732
Strongest feature: age with importance 0.0007879999999999665
Strongest feature: state_New Hampshire with importance 0.00018799999999995486
Strongest feature: hypertension with importance 0.00017199999999994997
Strongest feature: state_Oklahoma with importance 0.00013999999999997348
Strongest feature: state_Iowa with importance 0.00011599999999997168
Strongest feature: state_Montana with importance 0.00010399999999997079
Strongest feature: state_Illinois with importance 9.999999999996679e-05
Strongest feature: state_Arizona with importance 8.399999999997299e-05
Strongest feature: state_Louisiana with importance 8.399999999996189e-05
Strongest feature: state_Virgin Islands with importance 7.599999999995388e-05
Strongest feature: state_Virginia with importance 6.799999999995698e-05
Strongest feature: state_Nebraska with importance 6.399999999996408e-05
Strongest feature: race:Other with importance 6.399999999996408e-05
Strongest