In [1]:
# %% Imports
import pandas as pd
# Makes sure we see all columns
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Install interpret module
!pip install interpret

Collecting interpret
  Downloading interpret-0.6.10-py3-none-any.whl.metadata (1.2 kB)
Collecting interpret-core==0.6.10 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.10->interpret)
  Downloading interpret_core-0.6.10-py3-none-any.whl.metadata (2.9 kB)
Collecting SALib>=1.3.3 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.10->interpret)
  Downloading salib-1.5.1-py3-none-any.whl.metadata (11 kB)
Collecting dill>=0.2.5 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.10->interpret)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting aplr>=10.6.1 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.10->interpret)
  Downloading aplr-10.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting dash<3.0.0,>=1.0.0 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.10->interp

In [4]:
from interpret.glassbox import (LogisticRegression, ClassificationTree)
from interpret import show
from sklearn.metrics import f1_score, accuracy_score

In [24]:
# Create DataLoader class and pre-process data
class DataLoader():
    def __init__(self):
        self.data = None

    def load_data(self, path='/content/drive/MyDrive/healthcare-dataset-stroke-data.csv'):
        self.data = pd.read_csv(path)

    def preprocess_data(self):
        categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
        encode = pd.get_dummies(self.data[categorical_cols], prefix=categorical_cols)

        self.data = pd.concat([encode, self.data], axis=1)

        self.data = self.data.drop(categorical_cols, axis=1)

        self.data.bmi = self.data.bmi.fillna(0)

        self.data.drop(['id'], axis=1, inplace=True)


        for col in self.data.select_dtypes(include=['bool']).columns:
            self.data[col] = self.data[col].astype(int)

    def get_data_split(self):
        X = self.data.iloc[:, :-1]
        y = self.data.iloc[:, -1]
        return train_test_split(X, y, test_size=0.2, random_state=2021)

    def Oversample(self, X_train, y_train):

        over_sampling = RandomOverSampler(sampling_strategy='not majority')
        x_np = X_train.to_numpy()
        y_np = y_train.to_numpy()
        x_np, y_np = over_sampling.fit_resample(x_np, y_np)


        x_np = pd.DataFrame(x_np, columns=X_train.columns)
        y_np = pd.Series(y_np, name=y_train.name)

        return x_np, y_np

In [25]:
# %% Load and preprocess data
data_loader = DataLoader()
data_loader.load_data()
data_loader.preprocess_data()

In [26]:
# Split the data for evaluation
X_train, X_test, y_train, y_test = data_loader.get_data_split()
print(X_train.shape, X_test.shape)
# Oversample the train data
X_train, y_train = data_loader.Oversample(X_train, y_train)
print("After oversampling: ", X_train.shape)

(4088, 21) (1022, 21)
After oversampling:  (7778, 21)


In [27]:
# %% Fit logistic regression model
lr = LogisticRegression(random_state=20, feature_names=X_train.columns, penalty='l1', solver='liblinear')
lr.fit(X_train, y_train)
print("Training finished")

Training finished


In [28]:
# %% Evaluate logistic regression model
y_pred = lr.predict(X_test)
print("F1 score: ", {f1_score(y_test, y_pred, average='macro')})
print("Accuracy: ", {accuracy_score(y_test, y_pred)})

F1 score:  {0.5151383678745968}
Accuracy:  {0.7367906066536204}


In [29]:
# %% Explain local prediction
lr_local = lr.explain_local(X_test[:100], y_test[:100], name='Logistic Regression')
show(lr_local)

In [30]:
# %% Fit decision tree model
tree = ClassificationTree()
tree.fit(X_train, y_train)
print("Training finished")
y_pred = tree.predict(X_test)
print("F1 score: ", {f1_score(y_test, y_pred, average='macro')})
print("Accuracy: ", {accuracy_score(y_test, y_pred)})


Training finished
F1 score:  {0.5259920468974536}
Accuracy:  {0.761252446183953}


In [32]:
# %% Explain local prediction
tree_local = tree.explain_local(X_test[:100], y_test[:100], name='Tree')
show(tree_local)