## Diagnostic features dataset with no Feature Selection and TabNet

<h4>Importing Libraries</h4>

In [None]:
import sys #This module provides access to some variables used or maintained by the interpreter and to functions that interact strongly with the interpreter. 
import pandas as pd
import numpy as np
import sklearn
import matplotlib
import keras
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer 
from pytorch_tabnet.tab_model import TabNetClassifier

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import pandas as pd
import numpy as np
np.random.seed(0)


import os
import wget
from pathlib import Path

from matplotlib import pyplot as plt
%matplotlib inline

**Loading the Dataset**

In [None]:
import time
# Wall-clock timestamp (seconds since the epoch) taken before data preparation starts.
# NOTE(review): begin_dataprep is not read again in the visible cells.
begin_dataprep = time.time()

In [None]:
# Read the diabetes data from a CSV in the working directory and preview the first rows.
# The file name suggests 16 features and 3 target classes — confirm against the data.
df = pd.read_csv('diabetes_16fdata_3targetclasses.csv')
df.head()

In [None]:
# Column dtypes and non-null counts of the raw frame, to spot missing values early.
df.info()

In [None]:
# `train` is an alias of `df` (same object, not a copy); `target` names the label column.
train = df
target = 'Outcome'

# Draw a random 80/10/10 train/valid/test label per row, unless the file already
# ships with a "Set" column.
if "Set" not in train.columns:
    train["Set"] = np.random.choice(
        ["train", "valid", "test"],
        p=[.8, .1, .1],
        size=len(train),
    )

# Index labels of the rows belonging to each split.
train_indices = train.index[(train.Set == "train").to_numpy()]
valid_indices = train.index[(train.Set == "valid").to_numpy()]
test_indices = train.index[(train.Set == "test").to_numpy()]

In [None]:
# dropping ALL duplicate values: keep=False removes every copy of a duplicated row,
# not just the repeats.
# NOTE(review): the randomly assigned "Set" column takes part in the comparison, so
# identical feature rows that landed in different splits are NOT removed — confirm
# whether duplicates should be detected on the feature/target columns only.
train.drop_duplicates(keep = False, inplace = True)

# Dropping rows leaves gaps in the index. Renumber it so that index labels equal row
# positions again; later cells index NumPy arrays positionally with the split indices.
train.reset_index(drop=True, inplace=True)

# The split indices computed before the drop may point at rows that no longer exist,
# so rebuild them from the surviving rows.
train_indices = train[train.Set=="train"].index
valid_indices = train[train.Set=="valid"].index
test_indices = train[train.Set=="test"].index

In [None]:
#Simple preprocessing
# Columns that are of object dtype or have fewer than 10 distinct values are treated
# as categorical and label-encoded; every other column is treated as numeric and has
# its missing values replaced by the training-split mean of that column.
nunique = train.nunique()
types = train.dtypes

# Training rows that are still present in the frame (rows may have been dropped
# after the split indices were computed).
train_rows = train.index.intersection(train_indices)

categorical_columns = []   # names of the label-encoded columns
categorical_dims =  {}     # column name -> number of encoded classes
for col in train.columns:
    if types[col] == 'object' or nunique[col] < 10:
        print(col, train[col].nunique())
        l_enc = LabelEncoder()
        if types[col] != 'object' and train[col].isna().any():
            # A numeric column with gaps: stringify before inserting the placeholder,
            # because LabelEncoder rejects input that mixes strings and numbers.
            col_values = train[col].astype(str).where(train[col].notna(), "VV_likely")
        else:
            col_values = train[col].fillna("VV_likely")
        train[col] = l_enc.fit_transform(col_values.values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
    else:
        # Fill only THIS column. Calling fillna on the whole frame would write this
        # column's mean into the missing cells of every other column as well.
        train[col] = train[col].fillna(train.loc[train_rows, col].mean())

In [None]:
# Re-inspect dtypes and non-null counts after encoding and imputation.
train.info()

In [None]:
# check that pipeline accepts strings
# Translate the numeric outcome codes into class names in a single pass. `replace`
# builds a new object-dtype column, which avoids writing strings into an integer
# column through `.loc` (an implicit upcast that pandas has deprecated). Values not
# listed in the mapping are left unchanged.
train[target] = train[target].replace({0: "Normal", 2: "Prediabetic", 1: "Diabetic"})

In [None]:
# Describe the categorical features for TabNet's categorical embeddings.
# Columns listed here are never fed to the model (the target is excluded as well).
unused_feat = ['Set']

# Model input columns, in frame order.
features = [col for col in train.columns if col != target and col not in unused_feat]

# Positions, within `features`, of the label-encoded columns ...
cat_idxs = [pos for pos, name in enumerate(features) if name in categorical_columns]

# ... and, in the same order, the number of classes of each of them.
cat_dims = [categorical_dims[name] for name in features if name in categorical_columns]

In [None]:
# Turn +inf / -inf into NaN across the whole frame (in place).
# NOTE(review): this runs after the missing-value fill in the preprocessing cell, so
# any NaN created here is not imputed before the model arrays are built — confirm the
# data holds no infinities, or move this step ahead of the fill.
train.replace([np.inf, -np.inf], np.nan, inplace=True)

### Training

In [None]:
# Timestamp taken just before the model is configured; used later to report elapsed time.
start_clf= time.time()

In [None]:
# Network parameters
tabnet_params = dict(
    # which input columns are categorical and how many classes each one has
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=1,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 2e-2},
    # learning-rate schedule: StepLR configured with step_size=50 and gamma=0.9
    scheduler_params={"step_size": 50, "gamma": 0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='entmax',  # "sparsemax"
)

clf = TabNetClassifier(**tabnet_params)

In [None]:
# Build the train/valid/test arrays by index LABEL. The split indices hold labels, and
# rows may have been dropped after they were computed, so using them as positions into
# `.values` can select the wrong rows or run past the end of the array. Intersecting
# with the current index also discards labels of rows that no longer exist.
train_rows = train.index.intersection(train_indices)
valid_rows = train.index.intersection(valid_indices)
test_rows = train.index.intersection(test_indices)

X_train = train.loc[train_rows, features].values
y_train = train.loc[train_rows, target].values

X_valid = train.loc[valid_rows, features].values
y_valid = train.loc[valid_rows, target].values

X_test = train.loc[test_rows, features].values
y_test = train.loc[test_rows, target].values

In [None]:
# Size of the training feature matrix (rows, feature columns).
X_train.shape

In [None]:
# Number of training labels; should match the row count of X_train.
y_train.shape

In [None]:
# Size of the held-out test feature matrix (rows, feature columns).
X_test.shape

In [None]:
# Number of test labels; should match the row count of X_test.
y_test.shape

In [None]:
# Train for 100 epochs normally; when the CI environment variable is set to a
# non-empty value, cut the run down to 2 epochs.
max_epochs = 2 if os.getenv("CI", False) else 100

In [None]:
from pytorch_tabnet.augmentations import ClassificationSMOTE
# Augmentation object handed to clf.fit through its `augmentations` argument.
# NOTE(review): p=0.2 is presumably the probability of applying the augmentation —
# confirm against the pytorch_tabnet documentation.
aug = ClassificationSMOTE(p=0.2)

In [None]:
# This illustrates the warm_start=False behaviour: the same classifier is fitted twice
# and the two runs are expected to produce the same validation-accuracy history.
# NOTE(review): with batch_size=8 and drop_last=False a trailing batch holding a single
# row is possible; confirm the training-set size cannot trigger a batch-norm error.
save_history = []
for _ in range(2):
    clf.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_name=['train', 'valid'],
        eval_metric=['accuracy'],
        max_epochs=max_epochs , patience=20,
        batch_size=8, 
        num_workers=0,
        weights=1,
        drop_last=False,
        augmentations=aug, #aug, None
    )
    save_history.append(clf.history["valid_accuracy"])

# np.array_equal compares both histories as arrays and returns False (rather than
# raising or broadcasting) when the runs stopped after a different number of epochs.
assert np.array_equal(save_history[0], save_history[1])

In [None]:
# Plot the training-loss history recorded by the classifier during the last fit.
plt.plot(clf.history['loss'])

In [None]:
# Plot accuracy (not AUC): the training curve first, then the validation curve,
# matching the 'accuracy' eval_metric requested in clf.fit.
plt.plot(clf.history['train_accuracy'])
plt.plot(clf.history['valid_accuracy'])

In [None]:
# Plot the learning-rate history recorded by the classifier during the last fit.
plt.plot(clf.history['lr'])

### Prediction

In [None]:
# Predicted class label for every row of the held-out test set.
y_pred = clf.predict(X_test)

In [None]:
# Number of predictions; should match the number of test rows.
y_pred.shape

In [None]:
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [None]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

In [None]:
def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    """Compute a ROC AUC for multiclass labels by binarizing both label vectors.

    A LabelBinarizer is fitted on ``y_test`` and then applied to both ``y_test``
    and ``y_pred``; the two indicator arrays are passed to ``roc_auc_score``.
    Note that ``y_pred`` holds hard class labels here, not probabilities.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels.
        average: Averaging mode forwarded to ``roc_auc_score``.

    Returns:
        The value returned by ``roc_auc_score``.
    """
    binarizer = LabelBinarizer().fit(y_test)
    true_indicator = binarizer.transform(y_test)
    pred_indicator = binarizer.transform(y_pred)
    return roc_auc_score(true_indicator, pred_indicator, average=average)

In [None]:
#print(multiclass_roc_auc_score(y_test, y_pred))

In [None]:
# Re-display the test feature matrix size before computing metrics.
X_test.shape

In [None]:
# Re-display the number of test labels before computing metrics.
y_test.shape

In [None]:
# Report accuracy, the confusion matrix and the per-class classification report on
# the test set, then draw the confusion matrix as a heatmap.

# Fix the class order once (sorted union of true and predicted labels) so that the
# printed matrix and the heatmap rows/columns refer to the same classes.
class_labels = np.unique(np.concatenate([y_test, y_pred]))
cnf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

print('Accuracy = ', accuracy_score(y_test, y_pred))
print('-')
print(cnf_matrix)
print('-')
print(classification_report(y_test,y_pred))
print('-')

# Label the axes with the class names instead of bare positions 0..n-1.
p = sns.heatmap(
    pd.DataFrame(cnf_matrix, index=class_labels, columns=class_labels),
    annot=True, cmap="YlGnBu", fmt='g',
)
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
# Disabled ROC-curve plotting, kept as a bare string literal so that none of it runs.
# NOTE(review): as the cell's only expression the string itself is echoed as the cell
# output; it also depends on scikitplot, which is not imported anywhere else in the
# visible cells.
'''y_pred_keras_pr = clf.predict_proba(X_test)
from sklearn.metrics import roc_curve
import scikitplot as skplt
plot = skplt.metrics.plot_roc(y_test, y_pred_keras_pr)'''

In [None]:
# Elapsed wall-clock seconds since start_clf. That timestamp was taken before the model
# was built, so the figure covers both fits, prediction, metric reporting and plotting,
# not training alone.
end_clf= time.time()
time_clf = end_clf-start_clf
print('Time taken:',time_clf)

In [None]:
# Same elapsed time, converted from seconds to minutes.
print('Time taken in minutes:',time_clf/60)

#### Local Explainability and Masks 

In [None]:
# Ask the fitted classifier to explain its test-set predictions. The call returns two
# values; `masks` is indexed by integer and plotted in the next cell.
# NOTE(review): explain_matrix presumably holds per-row feature importances — confirm
# against the pytorch_tabnet documentation.
explain_matrix, masks = clf.explain(X_test)

In [None]:
# Show the first 50 rows of every mask side by side, one panel per mask.
# The panel count is taken from `masks` instead of being fixed at 3, so the cell keeps
# working when the model produces a different number of masks.
n_masks = len(masks)

# squeeze=False keeps `axs` two-dimensional even when there is only one panel.
fig, axs = plt.subplots(1, n_masks, figsize=(20,20), squeeze=False)

for i in range(n_masks):
    axs[0, i].imshow(masks[i][:50])
    axs[0, i].set_title(f"mask {i}")