## Support Vector Machine

In [1]:
!pip install scikeras



In [2]:
# Import our dependencies
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import tensorflow as tf

#  Import and read the diabetes_data.csv.
import pandas as pd
diabetes_df = pd.read_csv('https://raw.githubusercontent.com/ComfyKoala/diabetes-classification/main/Maribel/diabetes_data_upload.csv')
# Display the first few rows
diabetes_df.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [3]:
# List of all the columns in the data set
print(diabetes_df.columns)

Index(['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss',
       'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring',
       'Itching', 'Irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'Alopecia', 'Obesity', 'class'],
      dtype='object')


In [4]:
# Determine the number of unique values in each column.
print(diabetes_df.nunique())

Age                   51
Gender                 2
Polyuria               2
Polydipsia             2
sudden weight loss     2
weakness               2
Polyphagia             2
Genital thrush         2
visual blurring        2
Itching                2
Irritability           2
delayed healing        2
partial paresis        2
muscle stiffness       2
Alopecia               2
Obesity                2
class                  2
dtype: int64


In [5]:
#Checking Dtypes
diabetes_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Age                 520 non-null    int64 
 1   Gender              520 non-null    object
 2   Polyuria            520 non-null    object
 3   Polydipsia          520 non-null    object
 4   sudden weight loss  520 non-null    object
 5   weakness            520 non-null    object
 6   Polyphagia          520 non-null    object
 7   Genital thrush      520 non-null    object
 8   visual blurring     520 non-null    object
 9   Itching             520 non-null    object
 10  Irritability        520 non-null    object
 11  delayed healing     520 non-null    object
 12  partial paresis     520 non-null    object
 13  muscle stiffness    520 non-null    object
 14  Alopecia            520 non-null    object
 15  Obesity             520 non-null    object
 16  class               520 no

In [6]:
#change all "yes" to 1 and all "no" to 0
diabetes_df = diabetes_df.replace({"Yes": 1, "No": 0})
diabetes_df.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,0,1,0,1,0,0,0,1,0,1,0,1,1,1,Positive
1,58,Male,0,0,0,1,0,0,1,0,0,0,1,0,1,0,Positive
2,41,Male,1,0,0,1,1,0,0,1,0,1,0,1,1,0,Positive
3,45,Male,0,0,1,1,1,1,0,1,0,1,0,0,0,0,Positive
4,60,Male,1,1,1,1,1,0,1,1,1,1,1,1,1,1,Positive


In [7]:
#change "female" to 1 and "male" to 0
diabetes_df = diabetes_df.replace({"Female": 1, "Male": 0})
diabetes_df.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,0,0,1,0,1,0,0,0,1,0,1,0,1,1,1,Positive
1,58,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,Positive
2,41,0,1,0,0,1,1,0,0,1,0,1,0,1,1,0,Positive
3,45,0,0,0,1,1,1,1,0,1,0,1,0,0,0,0,Positive
4,60,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,Positive


In [8]:
# Get the target variables
target=diabetes_df["class"]
target_names = ["negative", "positive"]

In [9]:
#Get the features
data = diabetes_df.drop("class", axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity
0,40,0,0,1,0,1,0,0,0,1,0,1,0,1,1,1
1,58,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0
2,41,0,1,0,0,1,1,0,0,1,0,1,0,1,1,0
3,45,0,0,0,1,1,1,1,0,1,0,1,0,0,0,0
4,60,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1


In [10]:
#export table to excel
diabetes_df.to_excel('diabetes_data_upload.xlsx', index=False)

In [11]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [12]:
# Support vector machine linear classifier
model = SVC(kernel='linear')
model.fit(X_train, y_train)

In [13]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.923


In [14]:
# Calculate the classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

    negative       0.89      0.89      0.89        46
    positive       0.94      0.94      0.94        84

    accuracy                           0.92       130
   macro avg       0.92      0.92      0.92       130
weighted avg       0.92      0.92      0.92       130



In [15]:
#save model to reusable variable
svm_model = model

# **Identify Weak and Strong Features**

In [16]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [17]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
n_features = X_train.shape[1]

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=100, activation="relu", input_dim=n_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=75, activation="sigmoid"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=50, activation="sigmoid"))

# Fourth hidden layer
nn.add(tf.keras.layers.Dense(units=25, activation="sigmoid"))

# Fifth hidden layer
nn.add(tf.keras.layers.Dense(units=10, activation="sigmoid"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [19]:
from scikeras.wrappers import KerasClassifier
from sklearn.inspection import permutation_importance

# Ensure that nn is defined as your Keras model
# wrapped_nn = KerasClassifier(model=nn, epochs=20, batch_size=32, verbose=0)

# Define the wrapped model (assuming nn is your Keras Sequential model)
wrapped_nn = KerasClassifier(model=nn, epochs=20, batch_size=32, verbose=0)

# Train the wrapped model
wrapped_nn.fit(X_train_scaled, y_train)

# Compute permutation importance
result = permutation_importance(wrapped_nn, X_test_scaled, y_test, n_repeats=10, random_state=10)

# Get feature importances
importances = result.importances_mean

# Ensure X_test_scaled is a DataFrame or convert it
if not isinstance(X_test_scaled, pd.DataFrame):
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Sort the features by importance in ascending order (weakest to strongest)
sorted_indices = np.argsort(importances)

# Get the top 20 weakest features
top_20_weakest_indices = sorted_indices[:20]
top_20_weakest_features = X_test_scaled.columns[top_20_weakest_indices]
top_20_weakest_importances = importances[top_20_weakest_indices]

# Display the weakest features and their importances
for feature, importance in zip(top_20_weakest_features, top_20_weakest_importances):
    print(f"Weakest feature: {feature} with importance {importance}")

Weakest feature: weakness with importance 0.003076923076923066
Weakest feature: muscle stiffness with importance 0.003076923076923066
Weakest feature: Alopecia with importance 0.0038461538461538325
Weakest feature: Obesity with importance 0.0038461538461538438
Weakest feature: Irritability with importance 0.009230769230769242
Weakest feature: Age with importance 0.013076923076923087
Weakest feature: Itching with importance 0.013846153846153864
Weakest feature: Polyphagia with importance 0.01769230769230771
Weakest feature: sudden weight loss with importance 0.020000000000000028
Weakest feature: delayed healing with importance 0.020000000000000028
Weakest feature: Genital thrush with importance 0.020769230769230772
Weakest feature: partial paresis with importance 0.02230769230769234
Weakest feature: visual blurring with importance 0.030000000000000016
Weakest feature: Gender with importance 0.055384615384615386
Weakest feature: Polydipsia with importance 0.05615384615384618
Weakest feat

In [20]:
# Wrap your Keras model
wrapped_nn = KerasClassifier(model=nn, epochs=20, batch_size=32, verbose=0)

# Train the wrapped model
wrapped_nn.fit(X_train_scaled, y_train)

# Compute permutation importance
result = permutation_importance(wrapped_nn, X_test_scaled, y_test, n_repeats=10, random_state=10)

# Get feature importances
importances = result.importances_mean

# Sort the features by importance in descending order (strongest to weakest)
sorted_indices = np.argsort(importances)[::-1]

# Get the top 20 strongest features
top_20_strongest_indices = sorted_indices[:20]
top_20_strongest_features = X_test.columns[top_20_strongest_indices]
top_20_strongest_importances = importances[top_20_strongest_indices]

# Display the strongest features and their importances
for feature, importance in zip(top_20_strongest_features, top_20_strongest_importances):
    print(f"Strongest feature: {feature} with importance {importance}")

Strongest feature: Polyuria with importance 0.05076923076923082
Strongest feature: Polydipsia with importance 0.05076923076923081
Strongest feature: Gender with importance 0.03615384615384618
Strongest feature: visual blurring with importance 0.027692307692307738
Strongest feature: partial paresis with importance 0.016923076923076985
Strongest feature: delayed healing with importance 0.015384615384615441
Strongest feature: Itching with importance 0.00846153846153851
Strongest feature: Genital thrush with importance 0.00846153846153851
Strongest feature: Irritability with importance 0.006153846153846199
Strongest feature: sudden weight loss with importance 0.0030769230769231216
Strongest feature: muscle stiffness with importance 0.0007692307692308109
Strongest feature: Polyphagia with importance 4.4408920985006264e-17
Strongest feature: Obesity with importance 2.2204460492503132e-17
Strongest feature: Age with importance -0.0007692307692307665
Strongest feature: weakness with importance