In [None]:
# Import our dependencies
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Load diabetes_data.csv straight from the project's GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/ComfyKoala/diabetes-classification/main/Stephen/diabetes_data.csv'
diabetes_data_df = pd.read_csv(DATA_URL)

# Preview the first five rows.
diabetes_data_df.head()


In [None]:
# Print the Index of column names in the raw dataset, to help decide
# which columns to drop, bin, or one-hot encode in the cells below.
print(diabetes_data_df.columns)

## Drop Non-Beneficial Columns

In [None]:
# 'PatientID' and 'DoctorInCharge' are ID columns; remove them and keep
# the result under a new name so the raw frame stays untouched.
diabetes_data_clean_df = diabetes_data_df.drop(['PatientID', 'DoctorInCharge'], axis='columns')

In [None]:
# Display the cleaned DataFrame (the raw data without the two ID columns).
diabetes_data_clean_df

In [None]:
# Print the number of distinct values in each column of the cleaned frame.
print(diabetes_data_clean_df.nunique())

## Binning w/ Logarithmic Transformation

Columns: 'Age', 'BMI', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality', 'SystolicBP', 'DiastolicBP', 'FastingBloodSugar', 'HbA1c', 'SerumCreatinine', 'BUNLevels', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides', 'FatigueLevels', 'QualityOfLifeScore', 'MedicalCheckupsFrequency', 'MedicationAdherence', 'HealthLiteracy'

In [None]:
def _bin_labels(edges, min_decimals=2, max_decimals=12):
    """
    Build one 'low-high' label per bin from consecutive bin edges.

    Edges are printed with the fewest decimals (starting at ``min_decimals``)
    that keep every distinct edge distinct, so two different bins never
    receive the same label.
    """
    n_distinct = len(set(edges))
    for decimals in range(min_decimals, max_decimals + 1):
        shown = [f'{edge:.{decimals}f}' for edge in edges]
        if len(set(shown)) == n_distinct:
            break
    return [f'{low}-{high}' for low, high in zip(shown[:-1], shown[1:])]


def log_transform_and_bin(diabetes_data_clean_df, columns, num_bins=10, drop_original=True):
    """
    Bin the specified columns into bins that are equally wide on a log1p scale.

    For each column, ``num_bins + 1`` edges are spaced evenly between
    ``log1p(min)`` and ``log1p(max)`` and mapped back to the original scale
    with ``expm1``. Every non-missing value is assigned to a bin: bins are
    left-closed, and the last bin also includes the column maximum.

    Parameters:
        diabetes_data_clean_df (pd.DataFrame): The DataFrame to be transformed.
            It is copied, never modified.
        columns (list): List of numeric columns to bin.
        num_bins (int): The number of bins to create. Default is 10. Fewer
            bins are produced if edges coincide (e.g. a constant column).
        drop_original (bool): If True, drop the original columns. Default is True.

    Returns:
        pd.DataFrame: A copy of the input with one unordered categorical
        ``'<col>_Bins'`` column per requested column. Categories are
        ``'low-high'`` strings built from the bin edges on the original scale.

    Raises:
        ValueError: If ``num_bins`` is smaller than 1, or a column has no
            finite values or contains values <= -1 (log1p is undefined there).
    """
    if num_bins < 1:
        raise ValueError("num_bins must be a positive integer")

    transformed_df = diabetes_data_clean_df.copy()

    for col in columns:
        values = transformed_df[col]
        lowest, highest = float(values.min()), float(values.max())
        if not (np.isfinite(lowest) and np.isfinite(highest)) or lowest <= -1:
            raise ValueError(
                f"Column '{col}' must contain finite values greater than -1 to be log-transformed"
            )

        # Evenly spaced edges in log space, mapped back to the original scale.
        # The outer edges are taken from the data itself (not from the
        # round-trip through log1p/expm1) so min and max are never lost to
        # floating-point error.
        log_edges = np.linspace(np.log1p(lowest), np.log1p(highest), num_bins + 1)
        inner_edges = np.unique(np.expm1(log_edges[1:-1]))
        inner_edges = inner_edges[(inner_edges > lowest) & (inner_edges < highest)]
        edges = [lowest, *inner_edges.tolist(), highest]

        # Labels describe the real edges on the original scale.
        bin_labels = _bin_labels(edges)

        # Bins are left-closed ([low, high)); nudge the top edge up by one
        # float step so the column maximum lands in the last bin instead of
        # becoming NaN.
        cut_edges = edges[:-1] + [np.nextafter(highest, np.inf)]
        transformed_df[f'{col}_Bins'] = pd.cut(
            values, bins=cut_edges, labels=bin_labels, right=False, ordered=False
        )

        # Optionally drop the original column
        if drop_original:
            transformed_df = transformed_df.drop(columns=[col])

    return transformed_df

# Continuous columns of diabetes_data_clean_df to replace with log-spaced bins.
columns_to_transform = [
    'Age',
    'BMI',
    'AlcoholConsumption',
    'PhysicalActivity',
    'DietQuality',
    'SleepQuality',
    'SystolicBP',
    'DiastolicBP',
    'FastingBloodSugar',
    'HbA1c',
    'SerumCreatinine',
    'BUNLevels',
    'CholesterolTotal',
    'CholesterolLDL',
    'CholesterolHDL',
    'CholesterolTriglycerides',
    'FatigueLevels',
    'QualityOfLifeScore',
    'MedicalCheckupsFrequency',
    'MedicationAdherence',
    'HealthLiteracy',
]

# Bin every listed column with the function's defaults
# (num_bins=10, original columns dropped).
transformed_df = log_transform_and_bin(diabetes_data_clean_df, columns=columns_to_transform)

# Print a plain-text preview of the transformed DataFrame.
print(transformed_df.head())

In [None]:
# Rich (HTML) preview of the first five rows of the binned DataFrame.
transformed_df.head(5)

In [None]:
# Print the number of distinct values per column after binning.
print(transformed_df.nunique())

## Dummies

In [None]:
# One-hot encode the categorical columns with `pd.get_dummies`:
# the three demographic category columns plus every '<col>_Bins' column
# produced by the binning step.
demographic_columns = ['Ethnicity', 'SocioeconomicStatus', 'EducationLevel']
binned_columns = [
    'Age_Bins',
    'BMI_Bins',
    'AlcoholConsumption_Bins',
    'PhysicalActivity_Bins',
    'DietQuality_Bins',
    'SleepQuality_Bins',
    'SystolicBP_Bins',
    'DiastolicBP_Bins',
    'FastingBloodSugar_Bins',
    'HbA1c_Bins',
    'SerumCreatinine_Bins',
    'BUNLevels_Bins',
    'CholesterolTotal_Bins',
    'CholesterolLDL_Bins',
    'CholesterolHDL_Bins',
    'CholesterolTriglycerides_Bins',
    'FatigueLevels_Bins',
    'QualityOfLifeScore_Bins',
    'MedicalCheckupsFrequency_Bins',
    'MedicationAdherence_Bins',
    'HealthLiteracy_Bins',
]

transformed_dummies_df = pd.get_dummies(
    transformed_df,
    columns=demographic_columns + binned_columns,
)
transformed_dummies_df.head()

## Identify Target (y) and Features (X)

In [None]:
# Separate the target ("Diagnosis") from the feature columns
target_column = "Diagnosis"
X = transformed_dummies_df.drop(target_column, axis="columns")
y = transformed_dummies_df[target_column]

# Hold out a test set; stratify on y so both splits keep the same class
# balance, and fix random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=10)
X_train.shape

In [None]:
# Print the dtype of every training feature column.
print(X_train.dtypes)

## StandardScaler

In [None]:
# Create a StandardScaler instance and fit it on the training split only,
# so no statistics from the test split leak into the scaling.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted scaling to both splits
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

## Neural Network #1

In [None]:
# Define the deep neural net: the input width equals the number of feature
# columns, followed by two hidden layers and a single-unit output layer.
n_features = X_train.shape[1]

nn = tf.keras.models.Sequential([
    # First hidden layer (also declares the input dimension)
    tf.keras.layers.Dense(units=6, activation="relu", input_dim=n_features),
    # Second hidden layer
    tf.keras.layers.Dense(units=3, activation="sigmoid"),
    # Output layer: one sigmoid unit for the binary "Diagnosis" target
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn.summary()

In [None]:
# Compile the model: Adam optimizer, binary cross-entropy loss,
# and accuracy reported during training and evaluation.
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Create a callback that saves the model's weights every five epochs.
# An integer `save_freq` is counted in *batches*, not samples, so "every
# five epochs" is 5 x (batches per epoch). Using 5 x len(X_train_scaled)
# would postpone the first checkpoint far beyond the 50 training epochs.
batch_size = 32  # Keras' default batch size, made explicit so it matches the arithmetic below
steps_per_epoch = -(-len(X_train_scaled) // batch_size)  # ceiling division

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='model_weights_epoch_{epoch:02d}.weights.h5',
    save_weights_only=True,
    save_freq=5 * steps_per_epoch
)

# Train the model and pass the callback
fit_model = nn.fit(
    X_train_scaled,
    y_train,
    epochs=50,
    batch_size=batch_size,
    callbacks=[checkpoint_callback],
)

In [None]:
# Score the trained network on the held-out test split; with
# metrics=["accuracy"] compiled in, evaluate() yields (loss, accuracy).
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

## Plot the loss and accuracy

In [None]:
# Collect the per-epoch training metrics recorded by fit() into a DataFrame
history_df = pd.DataFrame(fit_model.history)

# Shift the zero-based index so row labels read as epoch numbers (1..N)
history_df.index = history_df.index + 1

# Plot the training loss per epoch
history_df.plot(y="loss")
plt.show()

In [None]:
# Plot the training accuracy per epoch from the history DataFrame
history_df.plot(y="accuracy")
plt.show()

# Weakest and Strongest Features

In [None]:
# !pip install scikeras scikit-learn

In [None]:
# !pip install --upgrade scipy scikit-learn scikeras

In [None]:
from scikeras.wrappers import KerasClassifier
from sklearn.inspection import permutation_importance

# Wrap the Keras model `nn` in a scikit-learn compatible estimator so it can
# be passed to `permutation_importance`.
wrapped_nn = KerasClassifier(model=nn, epochs=20, batch_size=32, verbose=0)

# Train the wrapped model
wrapped_nn.fit(X_train_scaled, y_train)

# Compute permutation importance on the held-out test split
result = permutation_importance(wrapped_nn, X_test_scaled, y_test, n_repeats=10, random_state=10)

# Mean importance of each feature over the repeats
importances = result.importances_mean

# Take the feature names from the unscaled test frame. X_test_scaled is
# deliberately left as-is: rebinding it to a DataFrame here would change
# what every later cell (and any re-run of this one) feeds to the model.
feature_names = X_test.columns

# Sort the features by importance in ascending order (weakest to strongest)
sorted_indices = np.argsort(importances)

# Get the top 20 weakest features
top_20_weakest_indices = sorted_indices[:20]
top_20_weakest_features = feature_names[top_20_weakest_indices]
top_20_weakest_importances = importances[top_20_weakest_indices]

# Display the weakest features and their importances
for feature, importance in zip(top_20_weakest_features, top_20_weakest_importances):
    print(f"Weakest feature: {feature} with importance {importance}")

In [None]:
# Reuse `importances` from the permutation-importance run in the previous
# cell. Retraining a second wrapped model here would rank the "strongest"
# features from a different (stochastic) fit than the "weakest" ones and
# double the runtime.
feature_names = X_test.columns

# Sort the features by importance in descending order (strongest to weakest)
sorted_indices = np.argsort(importances)[::-1]

# Get the top 20 strongest features
top_20_strongest_indices = sorted_indices[:20]
top_20_strongest_features = feature_names[top_20_strongest_indices]
top_20_strongest_importances = importances[top_20_strongest_indices]

# Display the strongest features and their importances
for feature, importance in zip(top_20_strongest_features, top_20_strongest_importances):
    print(f"Strongest feature: {feature} with importance {importance}")