In [26]:
import pymc3
import imodels
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from bartpy2.sklearnmodel import SklearnModel 

In [28]:
X, y, feature_names = imodels.get_clean_dataset('breast_cancer', data_source='imodels')

In [29]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [30]:
# Assuming SklearnModel exists in bartpy2
model = SklearnModel(n_trees=50, n_burn=200, n_samples=200)

# Fit the model to the data
model.fit(X_train, y_train)

predictions = model.predict(X_test)



Mean Squared Error: 0.16113493324115327


In [31]:
predictions

array([0.29221325, 0.33582939, 0.29955711, 0.23180746, 0.48532823,
       0.27527773, 0.14107054, 0.33500485, 0.13282632, 0.10602598,
       0.17717608, 0.18127687, 0.74704655, 0.20634628, 0.45976231,
       0.45581675, 0.1758603 , 0.13514411, 0.34870905, 0.23225181,
       0.55053908, 0.36229912, 0.10424293, 0.20770407, 0.45387185,
       0.21898169, 0.1797071 , 0.23075426, 0.50723522, 0.1758603 ,
       0.20162393, 0.19431384, 0.35082985, 0.67603768, 0.11638495,
       0.12364467, 0.14204919, 0.56268163, 0.29351534, 0.56372784,
       0.37553377, 0.46563249, 0.7332341 , 0.11494959, 0.20380027,
       0.17288797, 0.26622594, 0.52839261, 0.39716886, 0.58196454,
       0.27319129, 0.28128109, 0.15860567, 0.34300403, 0.2800706 ,
       0.22307055, 0.19237895, 0.21984915, 0.23075426, 0.5783731 ,
       0.4958261 , 0.2208643 , 0.15862839, 0.43022042, 0.30676655,
       0.27118428, 0.68228759, 0.1350158 , 0.28737866, 0.39385217])

In [35]:
y_test

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0], dtype=int64)

In [34]:
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split



y_pred_proba = model.predict(X_test)  
y_pred = (y_pred_proba > 0.5).astype(int) 


auc = roc_auc_score(y_test, y_pred_proba)


f1 = f1_score(y_test, y_pred)

print("AUC:", auc)
print("F1 Score:", f1)

AUC: 0.7781217750257997
F1 Score: 0.39999999999999997


In [36]:
# Search for the best parameters using GridSearch
from sklearn.model_selection import GridSearchCV
parameters = {'n_trees': (10, 20, 50),
              'n_burn':(50,100,200),
              'n_samples':(100,200,500)}
grid_search = GridSearchCV(model, parameters)
grid_search.fit(X, y)
grid_search.best_params_

{'n_burn': 100, 'n_samples': 200, 'n_trees': 50}

In [38]:
model_choose = SklearnModel(n_trees=50, n_burn=100, n_samples=200)
model_choose.fit(X_train, y_train)

predictions_choose = model_choose.predict(X_test)
y_pred_proba_choose = model_choose.predict(X_test)  
y_pred_choose = (y_pred_proba_choose > 0.5).astype(int) 


auc = roc_auc_score(y_test, y_pred_proba_choose)


f1 = f1_score(y_test, y_pred_choose)

print("AUC:", auc)
print("F1 Score:", f1)

AUC: 0.7801857585139318
F1 Score: 0.39999999999999997
