# Create a semi-synthetic dataset
From an existing dataset, create the target by using a fixed size random sub sample of covariates and taking the predictions of a decision tree on the real output. Sample with the predicted class probabilities in classification or add gaussian noise with variance equal to the variance of the samples in each leaf in regression. We can permute the unimportant features to avoid conditional dependancies.

In [14]:
import numpy as np

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import check_random_state


def generate_semi_synth_dataset(
    X, y, is_classification, n_important, random_seed, permute=False
):
    rng = check_random_state(random_seed)
    n_samples, n_feature = X.shape
    assert n_feature >= n_important
    important_feature_indices = rng.choice(n_feature, size=n_important, replace=False)
    unimportant_feature_indices = np.setdiff1d(
        np.arange(n_feature), important_feature_indices
    )
    if is_classification:
        tree_model = DecisionTreeClassifier(min_samples_leaf=10, random_state=rng)
        tree_model.fit(X[:, important_feature_indices], y)
        output_leaves = tree_model.apply(X[:, important_feature_indices])
        y_new = np.array(
            [
                rng.multinomial(1, p).argmax()
                for p in tree_model.tree_.value[output_leaves].squeeze()
            ]
        )
        X_new = X.copy()
        if permute:
            X_new[unimportant_feature_indices] = rng.permutation(
                X_new[unimportant_feature_indices]
            )
    else:
        tree_model = DecisionTreeRegressor(min_samples_leaf=10, random_state=rng)
        tree_model.fit(X[:, important_feature_indices], y)
        output_leaves = tree_model.apply(X[:, important_feature_indices])
        y_new = np.array(
            [
                rng.normal(
                    mu,
                    np.sqrt(
                        np.var(y[tree_model.tree_.value[output_leaves].squeeze() == mu])
                    ),
                )
                for mu in tree_model.tree_.value[output_leaves].squeeze()
            ]
        )
        X_new = X.copy()
        if permute:
            X_new[unimportant_feature_indices] = rng.permutation(
                X_new[unimportant_feature_indices]
            )
    return X_new, y_new, important_feature_indices


In [15]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
X, y = make_classification(random_state=0, n_samples=100, n_features=5)
X, y_new, indices = generate_semi_synth_dataset(X, y, is_classification=True, n_important=2, random_seed=0, permute=True)

rf = RandomForestClassifier(n_estimators=500, oob_score=True, random_state=0)
rf.fit(X, y_new)
rf.unbiased_feature_importances_

array([0.63885286, 0.09746316, 0.00085007, 0.24304517, 0.01978874])

In [16]:
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
X, y = make_regression(random_state=0, n_samples=100, n_features=5)
X, y_new, indices = generate_semi_synth_dataset(X, y, is_classification=False, n_important=2, random_seed=0, permute=True)

rf = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=0)
rf.fit(X, y_new)
rf.unbiased_feature_importances_

array([ 4.02288589e+01, -1.73235297e+00, -9.88675112e+00,  9.31744905e-03,
       -2.76957554e+01])

In [7]:
rf.feature_importances_

[array([0.71633888, 0.07848916, 0.03729091, 0.08483667, 0.08304438]), array([0.68079759, 0.0191533 , 0.19518296, 0.06997218, 0.03489397]), array([0.68538438, 0.04208481, 0.07272958, 0.13228851, 0.06751273]), array([0.67031733, 0.0775241 , 0.03684996, 0.1664023 , 0.04890631]), array([0.77451017, 0.10400672, 0.01412961, 0.08069932, 0.02665417]), array([0.63962618, 0.04795704, 0.21615336, 0.06243922, 0.0338242 ]), array([0.78155324, 0.00637545, 0.05870887, 0.08121053, 0.0721519 ]), array([0.73565787, 0.01489017, 0.06257156, 0.16380918, 0.02307122]), array([0.72057638, 0.07109918, 0.05538417, 0.05113792, 0.10180234]), array([0.70672173, 0.07706056, 0.02481872, 0.1481023 , 0.04329669])]


array([0.71114838, 0.05386405, 0.07738197, 0.10408981, 0.05351579])