In [1]:


from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the breast cancer dataset
data = load_breast_cancer()
x_data = pd.DataFrame(data.data, columns=data.feature_names)
y_data = pd.Series(data.target)

# Add two columns with random values to the DataFrame
x_data['random_1'] = np.random.rand(x_data.shape[0])
x_data['random_2'] = np.random.rand(x_data.shape[0])

# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=0)

# Create an Random Forest classifier
model_rf = RandomForestClassifier()
boruta = BorutaPy(estimator = model_rf, n_estimators = 'auto', max_iter = 100)

#workaround
np.int = np.int32
np.float = np.float64
np.bool = np.bool_

# Boruta Fit
boruta.fit(np.array(x_train), np.array(y_train))

############### Calculate the important Columns
boruta_support = x_data.columns[boruta.support_]

# Perform cross-validation with accuracy as the scoring metric
scores = cross_val_score(model_rf, x_train, y_train, cv=5, scoring='accuracy')
print(f'Cross-validation accuracy: {scores.mean():.2f} (+/- {scores.std() * 2:.2f})')

# Select the important features
x_data_top = x_data[boruta_support]

# Recalculate the cross-validation using only the important Features
scores_top = cross_val_score(model_rf, x_data_top, y_data, cv=5, scoring='accuracy')
print(f'Cross-validation accuracy (top 20 features): {scores_top.mean():.2f} (+/- {scores_top.std() * 2:.2f})')


Cross-validation accuracy: 0.95 (+/- 0.03)
Cross-validation accuracy (top 20 features): 0.96 (+/- 0.04)
