In [None]:
import numpy as np
import pandas as pd
import warnings
import pandas as pd
from time import  time
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline
from sklearn.model_selection  import KFold
from interpret import show
from interpret.blackbox import LimeTabular
import dalex as dx

# Load the dataset from the working directory.
df = pd.read_csv('data.csv')
# Drop the columns that are not used as features.
# NOTE(review): 'Unnamed: 32' is presumably an empty column produced by
# trailing commas in the CSV -- confirm against the data file.
df = df.drop(['id', 'Unnamed: 32'], axis=1)

# Encode the target in one explicit step: 'M' -> 1, 'B' -> 0.
# A single map yields an integer column directly instead of relying on the
# silent object->int downcasting of chained Series.replace calls, which is
# deprecated in recent pandas releases.
# NOTE: any label other than 'M'/'B' becomes NaN here rather than being
# passed through unchanged.
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

# Feature matrix (every column except the target) and target vector.
X = df.drop(['diagnosis'], axis=1)
Y = df['diagnosis']



# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

# Baseline: a random forest with default hyperparameters, timed end to end
# (construction + fit).
print()
print("Training the Random Forest Classifier")
start = time()
random_forest = RandomForestClassifier(random_state=1).fit(X_train, y_train)
end = time()
tr_time = round(end - start, 2)
print("The training completed in : {} seconds.".format(tr_time))

# Predictions of the baseline model on the held-out rows.
y_predict = random_forest.predict(X_test)

def metrics_eval_clas(y_test, y_pred):
    """Print classification metrics as percentages with three decimals.

    Prints accuracy plus macro-averaged F1, precision and recall, one per
    line, in that order.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. The metrics are written to standard output only.
    """
    # The original passed a stray second argument (2) to str.format, which
    # was silently ignored; the precision is fixed by the '.3f' spec.
    print('accuracy: {:.3f}'.format(accuracy_score(y_test, y_pred) * 100))
    print('f1-score: {:.3f}'.format(f1_score(y_test, y_pred, average='macro') * 100))
    print('precision: {:.3f}'.format(precision_score(y_test, y_pred, average='macro') * 100))
    print('recall: {:.3f}'.format(recall_score(y_test, y_pred, average='macro') * 100))
    
# Report the baseline forest's metrics on the held-out set. y_predict was
# already computed from random_forest right after training, so the
# identical prediction call is not repeated here.
metrics_eval_clas(y_test, y_predict)

# Find the best hyperparameters of the random forest.
# Define the random forest pipeline:
# standardize -> drop zero-variance features -> PCA -> classifier.
scaler = StandardScaler()
selector = VarianceThreshold()
pca = PCA()
# Seed the forest so the search is reproducible, matching the seeded baseline.
pipeline = Pipeline(steps=[('scaler', scaler),
                           ('selector', selector),
                           ('pca', pca),
                           ("clf", RandomForestClassifier(random_state=1))])

# Search space; the 'clf__' prefix routes each parameter to the classifier step.
param_grid = {'clf__n_estimators': [500, 200, 100],
              'clf__max_features': ['sqrt'],
              'clf__max_depth': [40, 60, 90],
              'clf__min_samples_split': [3, 5],
              'clf__min_samples_leaf': [1, 3],
              'clf__bootstrap': [True, False]}

# Exhaustive search with 3-fold cross-validation on all available cores.
grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
grid_pipeline.fit(X_train, y_train)

# GridSearchCV refits the complete winning pipeline on X_train by default.
# Keep that whole pipeline as the model: pulling out only the 'clf' step and
# refitting it on the raw features would throw away the scaling/PCA the
# hyperparameters were tuned with, and best_score_ would no longer describe
# the model being used.
best_model = grid_pipeline.best_estimator_
print('score of the model =', round(grid_pipeline.best_score_*100,2),'%')
# Predictions of the tuned pipeline on the held-out rows.
y_predict = best_model.predict(X_test)



In [None]:
# Break-down explanation with dalex.
# Wrap the tuned model together with the full feature table and its labels.
explainer = dx.Explainer(best_model, X, Y)
# Decompose the prediction for the row at position 2 into per-feature
# contributions, then plot at most 15 variables.
breakdown = explainer.predict_parts(X.iloc[2], type="break_down")
breakdown.plot(max_vars=15)

In [None]:
# LIME explanation (InterpretML wrapper).
# For a classifier, hand LIME the class probabilities rather than hard 0/1
# labels: the local surrogate is fitted to the model output, and a step
# function of labels gives it almost no signal around confident predictions.
lime = LimeTabular(predict_fn=best_model.predict_proba, data=X_train)
# Explain the first five held-out rows (positional slice) alongside their
# true labels.
lime_local = lime.explain_local(X_test.iloc[:5], y_test.iloc[:5])
show(lime_local)