## Example case: breast-cancer classification

### Imports

In [2]:
import sys
# Put the relative directory "../itershap" on the module search path before the
# `from itershap import IterSHAP` import further down in this cell.
# NOTE(review): presumably this is the uninstalled, in-repo copy of the package;
# the path is relative to the kernel's working directory — confirm the notebook
# is always launched from its own folder.
sys.path.append("../itershap")

import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from itershap import IterSHAP

### Load data and apply IterSHAP feature selection

In [4]:
# Load the breast-cancer dataset bundled with scikit-learn
X, y = load_breast_cancer(return_X_y=True)

# Keep only 100 samples for this example. The remaining samples are set aside
# (X_not_used / y_not_used) and are not referenced by the cells shown here.
# A fixed random_state makes the subsample -- and therefore every downstream
# result -- reproducible under Restart Kernel -> Run All.
X, X_not_used, y, y_not_used = train_test_split(
    X, y, train_size=100, random_state=42
)

# Create a data copy to test model performance without feature selection
X_without_fs = pd.DataFrame(X)

# Check the current shape of the dataset
print(X.shape)

# Create and fit IterSHAP; note that the RandomForestClassifier class itself
# (not an instance) is handed to IterSHAP.
# NOTE(review): the recorded output of this cell ends in
# "AttributeError: module 'numpy' has no attribute 'bool'". The `np.bool` alias
# was removed in NumPy 1.24, so presumably itershap or one of its dependencies
# still uses it -- confirm, then upgrade that dependency or pin numpy<1.24.
itershap_fs = IterSHAP(RandomForestClassifier)
itershap_fs.fit(X, y)

# Transform the input data to only include selected features and print its shape
X = itershap_fs.transform()
print(X.shape)

(100, 30)


In the future `np.bool` will be defined as the corresponding NumPy scalar.


AttributeError: module 'numpy' has no attribute 'bool'.
`np.bool` was a deprecated alias for the builtin `bool`. To avoid this error in existing code, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

### Perform classification

#### Without feature selection

In [None]:
# Baseline: train and evaluate on the full feature set (no feature selection).
# A fixed random_state makes the hold-out split reproducible; any other split
# of an equally sized dataset made with the same seed selects the same rows,
# which keeps accuracy comparisons between feature sets like-for-like.
X_train, X_test, y_train, y_test = train_test_split(
    X_without_fs, y, test_size=0.25, random_state=42
)

# Seed the forest as well, so the reported accuracy does not change between runs
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# accuracy_score expects (y_true, y_pred); the value is the same either way for
# accuracy, but the documented order avoids surprises if the metric is swapped
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

#### With feature selection

In [None]:
# Train and evaluate on X, which at this point holds only the features kept by
# IterSHAP. A fixed random_state makes the hold-out split reproducible; any
# other split of an equally sized dataset made with the same seed selects the
# same rows, which keeps accuracy comparisons between feature sets like-for-like.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Seed the forest as well, so the reported accuracy does not change between runs
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# accuracy_score expects (y_true, y_pred); the value is the same either way for
# accuracy, but the documented order avoids surprises if the metric is swapped
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
