In [None]:
import pandas as pd
from data_pipeline.dataset import Dataset

In [None]:
# Read the processed training table from its Feather file and preview the first rows.
df = pd.read_feather(path="../../data/parquet_files/train/processed_train.feather")
df.head()

In [None]:
from hydra.utils import instantiate
from hydra import compose, initialize
from hydra.core.global_hydra import GlobalHydra
from data_pipeline.pipelinesteps import data_splitter


# Clear the global Hydra instance before initialising it again below.
GlobalHydra.instance().clear()
# Config search path is relative ("../../"); compose the config named "config".
initialize(config_path="../../", version_base=None)
cfg = compose(config_name="config")

# Build the pipeline object described under `data_pipeline` in the config,
# then re-read the Feather file so the pipeline is always applied to a fresh
# copy of the data, independent of what earlier cells did to `df`.
data_pipeline = instantiate(cfg.data_pipeline)
df = pd.read_feather("../../data/parquet_files/train/processed_train.feather")
df = data_pipeline.apply(df)

# NOTE(review): `data_splitter` is the name imported from
# data_pipeline.pipelinesteps above — confirm that Dataset expects this object.
dataset = Dataset(data=df, data_splitter=data_splitter, target_column='target')



In [None]:
# Preview the first rows of `dataset.X` (presumably the feature columns — confirm against Dataset).
dataset.X.head()

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Use the train/test partitions exposed by the Dataset wrapper; no additional
# splitting happens in this cell.
X_train, y_train = dataset.X_train, dataset.y_train
X_test, y_test = dataset.X_test, dataset.y_test

# Train a HistGradientBoostingClassifier. A fixed random_state keeps the run
# reproducible: with the default early_stopping='auto' the estimator holds out
# a random validation subset once the training set exceeds 10,000 samples.
clf = HistGradientBoostingClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predicted probability of the positive class (second column of predict_proba).
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Area under the ROC curve on the test partition.
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC score: {auc_score}")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
# ROC curve points for the test-set probabilities; the thresholds are discarded.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

# Dashed black diagonal as the chance-level reference, then the model's curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label=f"AUC = {auc_score:.2f}")

# Title, axis labels and legend so the figure can be read on its own.
plt.title('Receiver Operating Characteristic')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc='lower right')
plt.show()


In [None]:
from datetime import datetime
# Preview of a timestamped file name for saved SHAP figures. strftime avoids
# the spaces and colons produced by str(datetime.now()), which are not valid
# in Windows file names, and matches the format used by the plotting helpers.
print(f"shap_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.png")

In [None]:
import shap
import matplotlib.pyplot as plt
from datetime import datetime
import os

def create_shap_beeswarm(model, X_data, num_of_features):
    """Draw a SHAP beeswarm plot for ``model`` and save it as a timestamped PNG.

    The image is written to the ``model_evaluation`` directory two levels above
    the current working directory; the directory is created if it is missing.
    The full path of the saved file is printed.

    Parameters
    ----------
    model
        Fitted model handed to ``shap.TreeExplainer``.
    X_data
        Rows for which SHAP values are computed and plotted.
    num_of_features
        Maximum number of features shown (forwarded as ``max_display``).

    Returns
    -------
    Whatever ``shap.plots.beeswarm(..., show=False)`` returns.
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer(X_data)
    # show=False so the figure is not displayed yet and can still be saved below.
    fig = shap.plots.beeswarm(shap_values, max_display=num_of_features, show=False)

    # Join path components with os.path.join rather than a literal "\\" so the
    # file ends up inside model_evaluation on every OS, not only on Windows.
    out_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..', 'model_evaluation'))
    os.makedirs(out_dir, exist_ok=True)
    # Take the timestamp once so the printed path is exactly the saved path.
    stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    out_path = os.path.join(out_dir, f"beeswarm_{stamp}.png")
    plt.savefig(out_path)
    print(out_path)
    return fig

In [None]:
import shap
import matplotlib.pyplot as plt
from datetime import datetime
import os

def create_heatmap(model, X_data, max_rows=1000):
    """Draw a SHAP heatmap for ``model`` and save it as a timestamped PNG.

    The image is written to the ``model_evaluation`` directory two levels above
    the current working directory; the directory is created if it is missing.

    Parameters
    ----------
    model
        Fitted model handed to ``shap.TreeExplainer``.
    X_data
        Rows for which SHAP values are computed.
    max_rows
        Only the first ``max_rows`` SHAP rows are passed to the heatmap.
        Defaults to 1000, the value that was previously hard-coded.

    Returns
    -------
    Whatever ``shap.plots.heatmap(..., show=False)`` returns.
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer(X_data)
    # show=False so the figure is not displayed yet and can still be saved below.
    fig = shap.plots.heatmap(shap_values[:max_rows], show=False)

    # Join path components with os.path.join rather than a literal "\\" so the
    # file ends up inside model_evaluation on every OS, not only on Windows.
    out_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..', 'model_evaluation'))
    os.makedirs(out_dir, exist_ok=True)
    stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    plt.savefig(os.path.join(out_dir, f"heatmap_{stamp}.png"))
    return fig


In [None]:
import shap
import matplotlib.pyplot as plt
from datetime import datetime
import os

def create_waterfall(model, X_data, row):
    """Draw a SHAP waterfall plot for one row and save it as a timestamped PNG.

    The image is written to the ``model_evaluation`` directory two levels above
    the current working directory; the directory is created if it is missing.

    Parameters
    ----------
    model
        Fitted model handed to ``shap.TreeExplainer``.
    X_data
        Rows for which SHAP values are computed.
    row
        Index into the computed SHAP values selecting the row to explain.

    Returns
    -------
    Whatever ``shap.plots.waterfall(..., show=False)`` returns.
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer(X_data)
    # show=False so the figure is not displayed yet and can still be saved below.
    fig = shap.plots.waterfall(shap_values[row], show=False)

    # Join path components with os.path.join rather than a literal "\\" so the
    # file ends up inside model_evaluation on every OS, not only on Windows.
    out_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..', 'model_evaluation'))
    os.makedirs(out_dir, exist_ok=True)
    stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    plt.savefig(os.path.join(out_dir, f"waterfall_{stamp}.png"))
    return fig


In [None]:
# Beeswarm plot of the training data, limited to 10 displayed features.
create_shap_beeswarm(model=clf, X_data=X_train, num_of_features=10)

In [None]:
# SHAP heatmap for the fitted classifier on the training data.
create_heatmap(model=clf, X_data=X_train)

In [None]:
# Waterfall plot for the SHAP values at index 1 of the training data.
create_waterfall(model=clf, X_data=X_train, row=1)

In [None]:
import shap

# Build a tree explainer from the fitted classifier and compute SHAP values
# for the training features; `shap_values` is reused by the next cell.
explainer = shap.TreeExplainer(clf)
shap_values = explainer(X_train)
# Beeswarm plot using shap's default number of displayed features.
shap.plots.beeswarm(shap_values)

In [None]:
# Beeswarm of the SHAP values computed earlier, allowing up to 100 features to be shown.
shap.plots.beeswarm(shap_values, max_display=100)