In [28]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.datasets import fetch_openml
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

In [3]:
# Load the Titanic passenger table from OpenML as a pandas frame plus target.
X, y = fetch_openml(
    name="titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
    parser="pandas",
)

rng = np.random.RandomState(seed=42)

categorical_columns = ["pclass", "sex", "embarked", "random_cat"]
numerical_columns = ["age", "sibsp", "parch", "fare", "random_num"]

# Append two pure-noise columns (integers in {0, 1, 2} and standard-normal
# floats) and keep only the modelled columns. The keyword arguments are
# evaluated left to right, so `rng` is consumed in the order
# random_cat -> random_num.
X = X.assign(
    random_cat=rng.randint(3, size=len(X)),
    random_num=rng.randn(len(X)),
)[categorical_columns + numerical_columns]

# Class-stratified split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [94]:
# Scratch check of what the shared generator would produce next. The draw is
# taken from a throwaway copy of `rng`'s current state, so re-running this
# cell always shows the same values and never shifts the random stream that
# other cells consume from `rng`.
rng_preview = np.random.RandomState()
rng_preview.set_state(rng.get_state())
rng_preview.randint(3, size=10)

array([1, 2, 1, 0, 1, 1, 0, 0, 2, 2])

In [4]:
# Categorical features -> ordinal codes. Categories unseen during fit and
# missing values are both encoded as -1.
categorical_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
    encoded_missing_value=-1,
)

# Numerical features -> missing entries filled with the "mean" strategy.
numerical_pipe = SimpleImputer(strategy="mean")

# Route each column group through its own transformer.
preprocessing = ColumnTransformer(
    transformers=[
        ("cat", categorical_encoder, categorical_columns),
        ("num", numerical_pipe, numerical_columns),
    ],
    verbose_feature_names_out=False,
)

# Full model: preprocessing followed by a seeded random forest.
rf = Pipeline(
    steps=[
        ("preprocess", preprocessing),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

In [6]:
# Fit the whole pipeline (preprocessing + random forest) on the training split.
rf.fit(X_train, y_train)

In [7]:
# Report the pipeline's score on both splits; a large train/test gap means
# the forest is fitting the training data much more closely than it generalises.
print(f"RF train accuracy: {rf.score(X_train, y_train):.3f}")
print(f"RF test accuracy: {rf.score(X_test, y_test):.3f}")

RF train accuracy: 1.000
RF test accuracy: 0.814


In [9]:
# Output column names of the pipeline with its last (classifier) step sliced
# off. `plot_rf_importance` recomputes this from the pipeline it is given.
feature_names = rf[:-1].get_feature_names_out()

In [87]:
def plot_rf_importance(clf):
    """Draw a horizontal bar chart of the final estimator's feature importances.

    Parameters
    ----------
    clf
        Fitted, sliceable pipeline-like object. ``clf[:-1]`` must provide
        ``get_feature_names_out()`` and ``clf[-1]`` must expose
        ``feature_importances_``.

    Returns
    -------
    The figure returned by ``px.bar``, with bars sorted by ascending
    importance, the legend hidden and both axes titled.
    """
    column_labels = clf[:-1].get_feature_names_out()
    final_estimator = clf[-1]

    # sort_values() is ascending by default.
    ranked = pd.Series(
        final_estimator.feature_importances_, index=column_labels
    ).sort_values()

    chart = px.bar(
        ranked,
        orientation="h",
        title="Random Forest Feature Importances (MDI)",
    )
    chart.update_layout(
        showlegend=False,
        xaxis_title="Importance",
        yaxis_title="Feature",
    )
    return chart

In [29]:
# Permutation importance of the pipeline on the test split:
# 10 repeats per feature, fixed seed, 2 parallel jobs.
result = permutation_importance(
    estimator=rf,
    X=X_test,
    y=y_test,
    n_repeats=10,
    random_state=42,
    n_jobs=2,
)

In [31]:
# Feature positions ordered by ascending mean importance.
sorted_importances_idx = result.importances_mean.argsort()

# One column per feature (in that order), holding its individual scores.
importances = pd.DataFrame(
    data=result.importances.T[:, sorted_importances_idx],
    columns=X.columns[sorted_importances_idx],
)

In [76]:
def plot_permutation_boxplot(
    clf,
    X: "pd.DataFrame | np.ndarray",
    y: "pd.Series | np.ndarray",
    set_: "str | None" = None,
    *,
    n_repeats: int = 10,
    random_state: "int | None" = 42,
    n_jobs: "int | None" = 2,
):
    """Box plot of per-feature permutation importances, least important first.

    Parameters
    ----------
    clf
        Fitted estimator handed to ``permutation_importance``.
    X
        Feature data to permute. When it has a ``columns`` attribute (a
        DataFrame) those labels name the boxes; otherwise features are named
        positionally as ``"x0"``, ``"x1"``, ...
    y
        Targets matching ``X``.
    set_
        Optional label appended to the plot title (e.g. ``"test set"``).
    n_repeats, random_state, n_jobs
        Forwarded unchanged to ``permutation_importance``. The defaults
        reproduce the values that were previously hard-coded here.

    Returns
    -------
    The figure returned by ``px.box``, with a dashed reference line at
    x = 0, a padded x-range, axis titles and the legend hidden.
    """
    result = permutation_importance(
        clf, X, y, n_repeats=n_repeats, random_state=random_state, n_jobs=n_jobs
    )

    # Take feature labels from the frame when available; fall back to
    # positional names so plain arrays do not fail on `X.columns`.
    if hasattr(X, "columns"):
        feature_names = pd.Index(X.columns)
    else:
        feature_names = pd.Index(
            [f"x{i}" for i in range(result.importances.shape[0])]
        )

    sorted_importances_idx = result.importances_mean.argsort()
    importances = pd.DataFrame(
        result.importances[sorted_importances_idx].T,
        columns=feature_names[sorted_importances_idx],
    )

    # Name the melted columns explicitly: melt() would otherwise reuse the
    # columns index's own name, and the x/y lookups below would break.
    fig = px.box(
        importances.melt(var_name="variable", value_name="value"),
        y="variable",
        x="value",
    )

    # Dashed zero line spanning the full plot height, whatever the number of
    # features.
    fig.add_vline(x=0, line_dash="dash", opacity=0.5)

    # Pad the x-range slightly and always include zero on the left side.
    pad = 0.005
    x_min = importances.min().min()
    x_min = x_min - pad if x_min < 0 else -pad
    x_max = importances.max().max() + pad
    fig.update_xaxes(range=[x_min, x_max])

    # Only append the label when one was given (no trailing space otherwise).
    title = "Permutation Importances"
    if set_:
        title = f"{title} {set_}"
    fig.update_layout(
        title=title,
        xaxis_title="Importance",
        yaxis_title="Feature",
        showlegend=False,
    )

    return fig

In [77]:
# Permutation importances measured on the held-out test split.
fig = plot_permutation_boxplot(rf, X_test, y_test, "test set")
fig.show()

In [78]:
# Same plot on the training split, for comparison with the test-set version.
fig = plot_permutation_boxplot(rf, X_train, y_train, "train set")
fig.show()