In [None]:
import numpy as np 
import pandas as pd 
import os
import datetime
import seaborn as sns
from tqdm.notebook import tqdm
    

import torch
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.training import Trainer
from pytorch_widedeep.models import Wide, TabMlp, WideDeep,FTTransformer,TabTransformer,TabResnet, FTTransformer
from pytorch_widedeep.metrics import Accuracy, Precision,F1Score,Recall
from pytorch_widedeep.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler


import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')
import itertools

In [None]:
# Load the raw table and derive the binary target from the continuous recovery ratio.
# pd.cut uses right-closed bins here: (-1, 0.5] -> class 0, (0.5, 1.1] -> class 1.
original = pd.read_csv('./dataset.csv')
original['recovery_class'] = pd.cut(original['recovery ratio'], bins=[-1, 0.5, 1.1], labels=[0, 1])
# Print the class balance explicitly: a bare expression that is not the last line of a
# cell is evaluated and silently discarded. dropna=False also reveals rows whose ratio
# fell outside the bins (pd.cut leaves those as NaN).
print(original['recovery_class'].value_counts(dropna=False))
# Check the shape of data
print(f'The Whole dataset has {original.shape[0]} rows and {original.shape[1]} columns')

In [None]:
def get_variable_types(dataframe):
    """Split the column names of ``dataframe`` into continuous and categorical.

    A column is reported as categorical only when its dtype compares equal to
    ``'object'``; every other dtype (including pandas ``category``) is
    reported as continuous.

    Args:
        dataframe: pandas DataFrame whose columns are inspected.

    Returns:
        A tuple ``(continuous_vars, categorical_vars)`` of column-name lists,
        each preserving the original column order.
    """
    def _is_object(name):
        # Same dtype test as before, evaluated per column.
        return dataframe[name].dtype == 'object'

    categorical_vars = [name for name in dataframe.columns if _is_object(name)]
    continuous_vars = [name for name in dataframe.columns if not _is_object(name)]
    return continuous_vars, categorical_vars

# Classify the raw columns, then take out the ones that must not be used as features.
# list.remove raises ValueError if a name is missing, so a renamed column fails loudly.
continuous_vars, categorical_vars = get_variable_types(original)
for excluded_name in ('JOA', 'post JOA', 'recovery ratio'):
    continuous_vars.remove(excluded_name)
for excluded_name in ('Case', 'whole cord at the max com level'):
    categorical_vars.remove(excluded_name)

In [None]:
# A column is dropped when its name CONTAINS any of these fragments (substring match,
# not equality). Fragments that were tried and are currently switched off:
# 'JOA', 'MD', 'AD', 'RD'.
drop_markers = (
    'Case',
    'QCL_',
    'post JOA',
    'recovery ratio',
    'whole cord at the max com level',
)
del_columns = [
    column_name
    for column_name in original.columns
    if any(marker in column_name for marker in drop_markers)
]
print(del_columns)
train = original.drop(columns=del_columns)

In [None]:
# One-hot encode the object-typed columns collected in `categorical_vars`;
# drop_first=True omits the first level of each encoded column.
# NOTE: this rebinds `train`, so the cell is meant to run once per kernel session.
train = pd.get_dummies(
    data=train,
    columns=categorical_vars,
    drop_first=True,
)

In [None]:
# Report the dimensions of the frame after one-hot encoding.
encoded_shape_message = f'The encoded Train dataset has {train.shape[0]} rows and {train.shape[1]} columns'
print(encoded_shape_message)

In [None]:
# Separate the binary target from the feature matrix.
y = train['recovery_class']
X = train.drop(columns=['recovery_class'])

In [None]:
# Splitting the dataset into the Training set and Test set.
# `train_test_split` is already imported in the top import cell, so it is not re-imported here.
# random_state pins the shuffle so the held-out set (and every metric computed on it)
# is the same after Restart & Run All; 42 matches the seed used for the model/trainer.
X_train_original, X_test, y_train_original, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,  # keep the class ratio identical in both partitions
    random_state=42,
)

In [None]:
# Categorical columns (fed to the embedding layers)
cat_embed_cols = [
    'Gender_M'
]
# Continuous columns: every feature that is not embedded as a categorical.
# Selecting by name instead of `columns[:-1]` removes the hidden assumption that
# 'Gender_M' is the last column; with a different column order the positional slice
# would drop the wrong feature and list 'Gender_M' as both categorical and continuous.
continuous_cols = [col for col in X_train_original.columns if col not in cat_embed_cols]

In [None]:
# deeptabular input: fit the preprocessor on the training features only, so the
# held-out set is later passed through `transform` with the statistics learned here.
tab_preprocessor = TabPreprocessor(
    embed_cols=cat_embed_cols,
    continuous_cols=continuous_cols,
    cols_to_scale='all',
    random_state=42,
)
X_tab = tab_preprocessor.fit_transform(X_train_original)
# Displayed as the cell output.
tab_preprocessor.cat_embed_input

In [None]:
# Model
# Alternative deeptabular components kept for reference (currently disabled):
# tab_mlp = TabMlp(
#     column_idx=tab_preprocessor.column_idx,
#     cat_embed_input=tab_preprocessor.cat_embed_input,
#     cat_embed_dropout=0.1,
#     continuous_cols=continuous_cols,
#     mlp_hidden_dims=[200,100],
#     mlp_dropout=0.5,
#     mlp_activation="leaky_relu",
# )
# tabresnet = TabResnet(
#     column_idx=tab_preprocessor.column_idx,
#     cat_embed_input=tab_preprocessor.cat_embed_input, 
#     continuous_cols=continuous_cols,
#     # cat_embed_activation = "relu",
#     # cont_embed_activation = "relu",
#     # mlp_activation = "relu",
#     blocks_dims=[200, 100, 100, 100],
#     mlp_hidden_dims=[100, 100, 50],
# )

# Seed torch BEFORE the layers are constructed so the initial weights are reproducible.
# The previous `WideDeep(..., random=42)` did not do this: `random` is not part of
# WideDeep's documented signature, so depending on the installed version it is either
# swallowed as an unused extra keyword or rejected with a TypeError.
torch.manual_seed(42)

fttransformer = FTTransformer(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input, 
    continuous_cols=continuous_cols,
    n_blocks = 4
)

# pred_dim=1: a single output unit, matching the "binary" objective used by the trainer.
tab_model = WideDeep(deeptabular=fttransformer, pred_dim=1)

In [None]:
# Build the optimiser and the early-stopping callback first, then hand them to the Trainer.
adam_optimizer = torch.optim.Adam(tab_model.parameters(), lr=0.001)
early_stopping = EarlyStopping(patience=5)

tab_trainer = Trainer(
    model=tab_model,
    objective="binary",
    optimizers=adam_optimizer,
    metrics=[Accuracy, Precision, F1Score, Recall],
    seed=42,
    callbacks=[early_stopping],
)

# The target is passed as a plain NumPy array; 20% of the training rows are held
# out by the trainer for validation (val_split).
train_target = np.array(y_train_original.values)
tab_trainer.fit(
    X_tab=X_tab,
    target=train_target,
    n_epochs=20,
    batch_size=16,
    val_split=0.2,
)

In [None]:
# validation performance
# The first entry of the metric list is Accuracy (see the `metrics=[Accuracy, ...]`
# argument given to the Trainer); its running counters are read directly.
# NOTE(review): `_metrics`, `correct_count` and `total_count` are private attributes —
# presumably they hold the counts of the last evaluated epoch; confirm against the
# installed pytorch-widedeep version.
accuracy_metric = tab_trainer.metric._metrics[0]
accuracy_metric.correct_count / accuracy_metric.total_count

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score,recall_score,precision_score,f1_score, roc_curve, auc

# Evaluate on the held-out set, reusing the preprocessor fitted on the training data.
X_tab_te = tab_preprocessor.transform(X_test)
preds_proba = tab_trainer.predict_proba(X_tab=X_tab_te)
preds = tab_trainer.predict(X_tab=X_tab_te)

# A ROC curve must be built from a continuous score. Feeding the hard 0/1 labels
# (as before) collapses the curve to a single operating point and understates the AUC.
# For the binary objective predict_proba returns one column per class; column 1 is
# the probability of the positive class.
positive_scores = preds_proba[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

# Resample the curve on a fixed FPR grid. np.interp copes with the repeated FPR
# values that roc_curve emits on vertical segments, where a division-based linear
# interpolator can produce NaN.
fpr_new = np.linspace(0, 1, 100)
tpr_new = np.interp(fpr_new, fpr, tpr)
roc_auc = auc(fpr_new, tpr_new)

# Markdown table: no blank lines between rows, otherwise the table does not render.
print("| accuracy | recall   | precision | f1_score | auc      |")
print("| -------- | -------- | --------- | -------- | -------- |")
print("| {:f} | {:f} | {:f} | {:f} | {:f}|".format(accuracy_score(y_test, preds), 
                                                    recall_score(y_test, preds), 
                                                    precision_score(y_test, preds), 
                                                    f1_score(y_test, preds), 
                                                    roc_auc))

# Persist the resampled TPR values (one row per point of the 100-point FPR grid).
pd.DataFrame(tpr_new).to_csv('./tpr.csv')

In [None]:
# Importing the subpackage binds `captum` AND guarantees `captum.attr` is loaded;
# a bare `import captum` relies on the package's __init__ importing it.
import captum.attr

# explain all the predictions in the test set
# Run the attribution on whatever device the trained model lives on, instead of
# hard-coding "cuda" (which raises on CPU-only machines).
device = next(tab_model.parameters()).device
X_tab_test = torch.from_numpy(X_tab_te).to(device)
# Baseline distribution for DeepLiftShap: the first five preprocessed training rows.
background = X_tab[0:5]
explainer = captum.attr.DeepLiftShap(tab_model.deeptabular.eval())
# target=0 selects the single output unit of the model (pred_dim=1).
shap_values = explainer.attribute(X_tab_test, baselines=torch.Tensor(background).to(device), target=0)

# Move the attributions to host memory once and reuse the array below.
shap_array = shap_values.detach().cpu().numpy()
pd_shap_values = pd.DataFrame(shap_array).abs().mean(axis=0)

# Mean absolute attribution per input column, averaged over the test rows.
shap_sum = np.abs(shap_array).mean(axis=0)

# One-row table whose columns follow the preprocessor's column order.
importance_df = pd.DataFrame(np.expand_dims(shap_sum,axis=0),columns=tab_preprocessor.column_idx.keys())
importance_df.to_csv("fttransformer_feature_importance.csv")