## Example case: classification on a synthetic dataset

### Imports

In [None]:
import sys
sys.path.append("../itershap")

import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from itershap import IterSHAP

### Load data and apply IterSHAP feature selection

In [None]:
# Generate a synthetic classification dataset with scikit-learn's
# make_classification: 300 samples, 250 features, 20 of them informative
X, y = make_classification(n_samples=300, n_features=250, n_informative=20, shuffle=False)

# Keep an independent copy of the full feature matrix to test model performance
# without feature selection. copy=True is explicit because pd.DataFrame may
# otherwise wrap the ndarray without copying, leaving both names backed by the
# same memory.
X_without_fs = pd.DataFrame(X, copy=True)

# Check the current shape of the dataset
print(X.shape)

# Create and fit IterSHAP with its default settings
# (per the original author's note this uses a RandomForestClassifier --
# confirm against the IterSHAP documentation)
itershap_fs = IterSHAP()
itershap_fs.fit(X, y)

print(f"Column names of features in the best subset: {itershap_fs.best_subset}")

# Transform the input data to only include selected features and print its shape
X = itershap_fs.transform(X)
print(X.shape)

### Perform classification

#### Without feature selection

In [None]:
# Baseline: average the test accuracy of a RandomForestClassifier trained on
# the full, unselected feature set over several random train/test splits.
ITERATIONS = 10
total_accuracy = 0
for run in range(ITERATIONS):
    # Fresh 75/25 split on every run; no random_state is set, so the splits
    # (and therefore the reported average) differ between executions.
    X_train, X_test, y_train, y_test = train_test_split(X_without_fs, y, test_size=0.25)

    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # accuracy_score's documented argument order is (y_true, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    total_accuracy += accuracy

avg_acc = total_accuracy / ITERATIONS
print(f"Average accuracy over {ITERATIONS} runs without feature selection: \t{round(avg_acc,3)}")

#### With feature selection

In [None]:
# Average the test accuracy of a RandomForestClassifier trained only on the
# features kept by IterSHAP, over several random train/test splits.
#
# NOTE(review): the selector was fitted on the complete dataset before these
# splits are drawn, so the held-out rows already influenced which features
# were kept. The accuracy reported here is therefore likely optimistic;
# fitting the selector on the training portion only would avoid that.
ITERATIONS = 10
total_accuracy = 0
for run in range(ITERATIONS):
    # Fresh 75/25 split on every run; no random_state is set, so the splits
    # (and therefore the reported average) differ between executions.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # accuracy_score's documented argument order is (y_true, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    total_accuracy += accuracy

avg_acc = total_accuracy / ITERATIONS
print(f"Average accuracy over {ITERATIONS} runs with feature selection: \t{round(avg_acc,3)}")