In [34]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from interpret.blackbox import LimeTabular
from interpret import show
import numpy as np

In [35]:
dataset=pd.read_csv("healthcare-dataset-stroke-data.csv")

In [36]:

categorical_cols = ["gender","ever_married","work_type","Residence_type","smoking_status"]
encoded = pd.get_dummies(dataset[categorical_cols])


In [37]:
dataset = pd.concat([encoded, dataset], axis=1)
dataset.drop(categorical_cols, axis=1, inplace=True)
dataset.bmi = dataset.bmi.fillna(dataset['bmi'].mean())
dataset.drop(["id"], axis=1, inplace=True)

In [38]:
x= dataset.iloc[:,:-1]
y= dataset.iloc[:,-1]

In [39]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [40]:
from imblearn.over_sampling import RandomOverSampler
oversample = RandomOverSampler(sampling_strategy='minority')
x_np = x_train.to_numpy()
y_np = y_train.to_numpy()
x_np, y_np = oversample.fit_resample(x_np, y_np)
x_over = pd.DataFrame(x_np, columns=x_train.columns)
y_over = pd.Series(y_np, name=y_train.name)
        

In [41]:
x_train=x_over.values
y_train=y_over.values

Random Forest Classifier

In [42]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")

F1 Score 0.5160984848484849
Accuracy 0.9412915851272016



X has feature names, but RandomForestClassifier was fitted without feature names



In [43]:
lime = LimeTabular(predict_fn=rf.predict_proba, 
                   data=x_train, 
                   random_state=1)

In [44]:
lime_local = lime.explain_local(x_test[-10:], 
                                y_test[-10:], 
                                name='LIME')

show(lime_local)