In [None]:
from pathlib import Path
import numpy as np
from pdpilot import PDPilotWidget
from pmlb import fetch_data
import lightgbm as lgb
import shap

In [None]:
# Notebook configuration: the results sub-directory to read from and the
# name of the dataset to inspect. Both are consumed by the loader cell below.
dataset_group, dataset = "debug", "irish"

In [None]:
def get_dataset_and_model(dataset_group, dataset, max_rows=2000, random_state=1):
    """Load a dataset, its saved LightGBM model and its PDPilot data path.

    Parameters
    ----------
    dataset_group : str
        Sub-directory of ``./results`` that holds the ``datasets``, ``models``
        and ``pdpilot`` folders to read from.
    dataset : str
        Dataset name passed to ``fetch_data``; also the stem of the model
        (``.txt``) and PDPilot (``.json``) file names.
    max_rows : int, optional
        Frames with more rows than this are randomly down-sampled to exactly
        this many rows. Defaults to 2000.
    random_state : int, optional
        Seed given to ``DataFrame.sample`` when down-sampling. Defaults to 1.

    Returns
    -------
    tuple
        ``(df_X, y, features, booster, pd_data)`` where ``df_X`` is the frame
        without the ``"target"`` column, ``y`` is the ``"target"`` column as a
        NumPy array, ``features`` is the list of ``df_X`` column names,
        ``booster`` is the ``lgb.Booster`` loaded from the model file and
        ``pd_data`` is a ``Path`` to the PDPilot JSON file (the path is only
        built here, not opened).
    """
    df_all = fetch_data(dataset, local_cache_dir=f"./results/{dataset_group}/datasets")

    # Cap the number of rows; a fixed seed keeps the sample reproducible.
    df_reduced = (
        df_all
        if df_all.shape[0] <= max_rows
        else df_all.sample(max_rows, random_state=random_state)
    )
    df_X = df_reduced.drop(columns=["target"])
    features = list(df_X.columns)

    # Store a column as integers whenever the cast to int loses nothing
    # (e.g. a float column that only contains whole numbers).
    for feature in features:
        column = df_X[feature]
        try:
            as_int = column.astype(int)
        except (ValueError, TypeError, OverflowError):
            # Values that cannot be cast to int (NaN/inf, non-numeric entries)
            # mean the column is not integral; keep it unchanged rather than
            # aborting the whole load.
            continue
        if np.array_equal(column, as_int):
            df_X[feature] = as_int

    y = df_reduced["target"].to_numpy()

    booster = lgb.Booster(model_file=f"./results/{dataset_group}/models/{dataset}.txt")
    pd_data = Path(f"./results/{dataset_group}/pdpilot/{dataset}.json")

    return df_X, y, features, booster, pd_data

In [None]:
# Load the feature frame, labels, feature names, trained booster and the
# PDPilot data path for the dataset selected in the configuration cell.
df, y, features, booster, pd_data = get_dataset_and_model(
    dataset_group=dataset_group,
    dataset=dataset,
)

In [None]:
# Build the PDPilot widget from the booster's predict function, the feature
# frame, the labels and the PDPilot data path.
w = PDPilotWidget(
    predict=booster.predict,
    df=df,
    labels=y,
    pd_data=pd_data,
    seed=56,
    height=650,
)

# Bare last expression so the notebook renders the widget.
w

In [None]:
# Wrap the loaded booster in a SHAP tree explainer, then evaluate it on the
# feature frame; the result is reused by the cells below.
explainer = shap.TreeExplainer(booster)
shap_values = explainer(df)

In [None]:
# Bar plot of the SHAP values computed above
# (presumably one bar per feature, sized by mean |SHAP| -- confirm in the shap docs).
shap.plots.bar(shap_values)

In [None]:
# Display the explainer's expected_value
# (presumably the baseline model output that SHAP values are measured against -- confirm in the shap docs).
explainer.expected_value

In [None]:
# Mean of the absolute SHAP values along axis 0 (i.e. averaged over rows).
np.mean(np.abs(shap_values.values), axis=0)

In [None]:
# Pair each column name of the feature frame with the corresponding mean
# absolute SHAP value (absolute values averaged along axis 0).
{
    column: importance
    for column, importance in zip(
        df.columns, np.mean(np.abs(shap_values.values), axis=0)
    )
}