## Imports

In [None]:
import numpy as np
import pandas as pd
import time
import math
from tqdm.contrib import itertools

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

from powershap import Powershap
from itershap import IterSHAP

## Global parameters

In [1]:
# Shared experiment settings.
# NOTE(review): none of the visible cells read TRAIN_SAMPLES, MAX_ITER or
# STEP_SIZE -- presumably they are consumed elsewhere or are leftovers; confirm.
STEP_SIZE = 0.5
MAX_ITER = 3
TRAIN_SAMPLES = 100
# NOTE(review): the name below is misspelled ("EXPEIMERNT"). It is kept as-is
# because renaming it requires updating every reference to it.
NR_RUNS_PER_EXPEIMERNT = 5

## Get data

In [None]:
def get_data(TOTAL_SAMPLES, TOTAL_FEATURES, NR_INFORMATIVE, RANDOM_SEED):
    """Build a synthetic classification dataset as a labelled DataFrame.

    Args:
        TOTAL_SAMPLES: number of rows to generate.
        TOTAL_FEATURES: number of feature columns to generate.
        NR_INFORMATIVE: how many of the features are informative.
        RANDOM_SEED: seed forwarded to ``make_classification``.

    Returns:
        ``(X, y)`` where ``X`` is a DataFrame whose columns are named
        ``feature_0`` ... ``feature_<TOTAL_FEATURES - 1>`` and ``y`` is the
        label array produced by ``make_classification``.
    """
    # No redundant features are requested and shuffling is disabled, so the
    # generator's own column order is preserved in the returned frame.
    features, labels = make_classification(
        n_samples=TOTAL_SAMPLES,
        n_features=TOTAL_FEATURES,
        n_informative=NR_INFORMATIVE,
        n_redundant=0,
        shuffle=False,
        random_state=RANDOM_SEED,
    )
    headers = np.array([f"feature_{idx}" for idx in range(TOTAL_FEATURES)])
    return pd.DataFrame(features, columns=headers), labels

## PowerSHAP
PowerSHAP is the baseline feature selector that IterSHAP is compared against.

In [None]:
def power_shap(X, y):
    """Select features with PowerShap and return ``X`` restricted to them.

    A pipeline made of a PowerShap selector (wrapping a CatBoost classifier)
    followed by a CatBoost classifier is fitted on a 75 % training split of
    ``(X, y)``. The held-out rows are only passed through the fitted selector
    to read off the labels of the columns that survived.

    Args:
        X: feature DataFrame; its column labels identify the features.
        y: targets aligned with the rows of ``X``.

    Returns:
        A DataFrame with every row of ``X`` but only the selected columns.
    """
    # The selector class is exported as ``PowerShap``. Importing it here keeps
    # this function independent of how the module-level import spells the name.
    from powershap import PowerShap

    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.25)
    pipe = Pipeline(
        [
            (
                "selector",
                PowerShap(
                    CatBoostClassifier(verbose=0, n_estimators=250, allow_writing_files=False),
                ),
            ),
            ("CatBoostClassifier", CatBoostClassifier(verbose=0, n_estimators=250, allow_writing_files=False)),
        ]
    )
    pipe.fit(X_train, y_train)

    # Assumes the selector returns a DataFrame when it is given one, so the
    # surviving column labels can be read from the result -- TODO confirm.
    selected_columns = list(pipe[0].transform(X_test).columns)

    # The selection is a list of column *labels*, so it must go through the
    # label-based indexer; ``.iloc`` only accepts integer positions.
    return X.loc[:, selected_columns]

# Simulations

In [None]:
# Simulation grid: run_experiments evaluates every combination of the four
# option lists below, NR_RUNS_PER_EXPERIMENT times each.
NR_RUNS_PER_EXPERIMENT = 5
# Dataset sizes handed to get_data.
TOTAL_SAMPLES_OPTIONS = [5000]
# Number of feature columns to generate.
TOTAL_FEATURES_OPTIONS = [20, 100, 250, 500]
# Fraction of the features that are informative (floored to an integer count).
PERC_INFORMATIVE_OPTIONS = [0.1, 0.33, 0.5, 0.9]
# False -> run_experiment first subsamples the data to a small training set.
LARGE_DATASET = [False, True]

In [None]:
def _holdout_accuracy(X_selected, y, random_state=None):
    """Fit a fresh CatBoost model on 75 % of the rows and score the other 25 %."""
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=0.25, random_state=random_state
    )
    clf = CatBoostClassifier(verbose=0, n_estimators=250, allow_writing_files=False)
    clf.fit(X_train, y_train)
    return accuracy_score(y_test, clf.predict(X_test))


def run_experiment(S, F, I, LARGE, RS, SMALL_TRAIN_SIZE=160):
    """Compare PowerShap and IterSHAP feature selection on one synthetic dataset.

    Generates a dataset, optionally subsamples it, runs both selectors while
    timing them, then trains a CatBoost classifier on each selected feature
    set and prints the resulting hold-out accuracy and selection runtime.

    Args:
        S: total number of samples to generate.
        F: total number of features to generate.
        I: fraction of the features that are informative; ``F * I`` is floored.
        LARGE: when falsy, only ``SMALL_TRAIN_SIZE`` rows of the generated
            data are used for selection and evaluation.
        RS: random seed for data generation and for the splits made here.
        SMALL_TRAIN_SIZE: number of rows kept when ``LARGE`` is falsy.
            NOTE(review): the module-level ``TRAIN_SAMPLES`` (100) is not used
            here; confirm which value is intended.

    Returns:
        dict with keys ``powershap_accuracy``, ``powershap_runtime``,
        ``itershap_accuracy`` and ``itershap_runtime`` (runtimes in seconds).
    """
    nr_informative = math.floor(F * I)

    # Retrieve the data for this experiment
    X, y = get_data(S, F, nr_informative, RS)

    if not LARGE:
        # train_test_split returns (X_train, X_test, y_train, y_test); only the
        # training part is kept as the small dataset.
        X, _, y, _ = train_test_split(X, y, train_size=SMALL_TRAIN_SIZE, random_state=RS)

    # Time the PowerShap feature selection
    powershap_start_time = time.time()
    X_after_powershap = power_shap(X, y)
    TOTAL_POWERSHAP_RUNTIME = time.time() - powershap_start_time

    # Time the IterSHAP feature selection (construction included).
    # NOTE(review): IterSHAP receives the CatBoostClassifier class itself and
    # transform() is called without arguments -- confirm against the IterSHAP API.
    itershap_start_time = time.time()
    itershap_fs = IterSHAP(CatBoostClassifier)
    itershap_fs.fit(X, y)
    X_after_itershap = itershap_fs.transform()
    TOTAL_ITERSHAP_RUNTIME = time.time() - itershap_start_time

    # Both feature sets are scored on the same seeded row split so the
    # accuracies are directly comparable and reproducible.
    POWERSHAP_ACCURACY = _holdout_accuracy(X_after_powershap, y, RS)
    print(f"Accuracy after applying Powershap: \t{POWERSHAP_ACCURACY}, in runtime \t{TOTAL_POWERSHAP_RUNTIME}")

    ITERSHAP_ACCURACY = _holdout_accuracy(X_after_itershap, y, RS)
    print(f"Accuracy after applying IterSHAP: \t{ITERSHAP_ACCURACY}, in runtime \t{TOTAL_ITERSHAP_RUNTIME}")

    return {
        "powershap_accuracy": POWERSHAP_ACCURACY,
        "powershap_runtime": TOTAL_POWERSHAP_RUNTIME,
        "itershap_accuracy": ITERSHAP_ACCURACY,
        "itershap_runtime": TOTAL_ITERSHAP_RUNTIME,
    }

In [None]:
def run_experiments():
    """Run every parameter combination ``NR_RUNS_PER_EXPERIMENT`` times.

    The grid is the cross product of ``TOTAL_SAMPLES_OPTIONS``,
    ``TOTAL_FEATURES_OPTIONS``, ``PERC_INFORMATIVE_OPTIONS`` and
    ``LARGE_DATASET``. Each (run, combination) pair is passed to
    ``run_experiment`` with its own random seed, which is also printed as the
    experiment id.
    """
    combinations = [
        [S, F, I, LARGE]
        for S in TOTAL_SAMPLES_OPTIONS
        for F in TOTAL_FEATURES_OPTIONS
        for I in PERC_INFORMATIVE_OPTIONS
        for LARGE in LARGE_DATASET
    ]
    nr_combinations = len(combinations)

    # Loop over all repetitions of all combinations; ``itertools`` is whatever
    # this file imported under that name (its ``product`` drives the loop).
    for x, i in itertools.product(range(NR_RUNS_PER_EXPERIMENT), range(nr_combinations)):
        S, F, I, LARGE = combinations[i]
        # Stride by the number of combinations so that no two
        # (run, combination) pairs share a seed / experiment id.
        RANDOM_SEED = x * nr_combinations + i
        print(f"Running experiment: {RANDOM_SEED}")
        run_experiment(S, F, I, LARGE, RANDOM_SEED)
        print("\n")