In [33]:

from time import perf_counter, time

import imodels
import numpy as np
import pandas as pd
from bartpy2.sklearnmodel import SklearnModel
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm



In [34]:
# Load the 'breast_cancer' dataset through imodels, then hold out 25% of the
# rows as a test split (fixed seed so the split is the same on every run).
X_bc, y_bc, feature_names = imodels.get_clean_dataset(
    'breast_cancer',
    data_source='imodels',
)
X_bc_train, X_bc_test, y_bc_train, y_bc_test = train_test_split(
    X_bc,
    y_bc,
    test_size=0.25,
    random_state=42,
)

In [35]:
# from sklearn.model_selection import GridSearchCV

# # Define the parameter grid
# param_grid = {
#     'n_trees': [10, 20, 50],
#     'n_burn': [50, 100, 200],
#     'n_samples': [100, 200, 500]
# }

# # Initialize the BART model
# bart_model = SklearnModel()

# # Create the GridSearchCV object
# grid_search = GridSearchCV(bart_model, param_grid, cv=3, scoring='roc_auc')

# # Fit GridSearchCV
# grid_search.fit(X_bc_train, y_bc_train)

# # Print the best parameters and the best score
# print("Best parameters:", grid_search.best_params_)
# print("Best score:", grid_search.best_score_) 


In [36]:

#scaler = MinMaxScaler(feature_range=(0, 1))

# Define the parameter grid
# param_grid = {
#     'n_trees': [10, 20, 50],
#     'n_burn': [50, 100, 200],
#     'n_samples': [100, 200, 500]
# }

# # Initialize a list to store the results
# results = []

# # Iterate over each combination of parameters
# for params in ParameterGrid(param_grid):
#     # Initialize the BART model with current parameters
#     bart_model = SklearnModel(n_trees=params['n_trees'], n_burn=params['n_burn'], n_samples=params['n_samples'])
    
#     # Start timing
#     start_time = time()
    
#     # Fit the model
#     bart_model.fit(X_bc_train, y_bc_train)
    
#     # Predict
#     y_pred_proba = bart_model.predict(X_bc_test)
#     #normalized_proba = scaler.fit_transform(y_pred_proba)
#     y_pred = (y_pred_proba > 0.5).astype(int) 

    
#     # Calculate the MSE of the thresholded predictions (not an F1 score)
#     mse = mean_squared_error(y_bc_test, y_pred)
    
    
#     # Stop timing
#     time_elapsed = time() - start_time
    
#     # Append the results to the list
#     results.append({
#         'n_trees': params['n_trees'],
#         'n_burn': params['n_burn'],
#         'n_samples': params['n_samples'],
#         'time_elapsed': time_elapsed,
#         'mse': mse
#     })
# results_bc = pd.DataFrame(results)

In [37]:
# results_bc

In [38]:

def get_model_score(X_train, y_train, X_test, y_test, model, metric):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, handed to ``model.fit``.
    X_test, y_test
        Held-out features and targets.
    model
        Any object exposing ``fit(X, y)`` and ``predict(X)``.
    metric
        Callable invoked as ``metric(y_test, predictions)``.

    Returns
    -------
    Whatever ``metric`` returns for the test-set predictions.
    """
    model.fit(X_train, y_train)
    # The raw output of predict() is scored as-is; no 0.5 thresholding into
    # class labels is applied before calling the metric.
    return metric(y_test, model.predict(X_test))

def make_model(model_params_dict):
    """Build a ``SklearnModel`` from a dict of constructor keyword arguments.

    ``model_params_dict`` is unpacked directly into ``SklearnModel(...)``, so
    its keys must be keyword names that constructor accepts.
    """
    return SklearnModel(**model_params_dict)

def make_dgp(dataset_name, test_size=0.25, random_state=42):
    """Load a dataset through imodels and split it into train/test parts.

    Parameters
    ----------
    dataset_name
        Name passed to ``imodels.get_clean_dataset`` (with
        ``data_source='imodels'``).
    test_size
        Forwarded to ``train_test_split``. Defaults to 0.25, the value that
        was previously hard-coded.
    random_state
        Forwarded to ``train_test_split``. Defaults to 42, the value that was
        previously hard-coded, so existing callers get the same split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # The third element (feature names) is not needed by any caller here.
    X, y, _ = imodels.get_clean_dataset(dataset_name, data_source='imodels')
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [39]:
def run_one_dgp_iter(dataset_name, model_params_dict, metric=mean_squared_error
                     ):
    """Fit and score one model on one dataset, timing the fit+predict+score step.

    Parameters
    ----------
    dataset_name
        Dataset identifier handed to ``make_dgp``.
    model_params_dict
        Keyword arguments for the model, handed to ``make_model``. The values
        under ``'n_trees'``, ``'sigma_a'`` and ``'sigma_b'`` are copied into
        the result record; a key that is absent is recorded as ``None``.
    metric
        Callable ``metric(y_true, y_pred)``; defaults to
        ``mean_squared_error``.

    Returns
    -------
    list of dict
        A single-element list holding one record with the keys ``dataset``,
        ``n_trees``, ``sigma_a``, ``sigma_b``, ``time_elapsed`` (seconds) and
        ``mse``. The ``mse`` key holds the value of ``metric`` whatever metric
        was passed; the key name is kept for downstream compatibility.
    """
    X_train, X_test, y_train, y_test = make_dgp(dataset_name)
    model = make_model(model_params_dict)

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time() differences can.
    started = perf_counter()
    score = get_model_score(X_train, y_train, X_test, y_test, model, metric)
    time_elapsed = perf_counter() - started

    # .get() instead of [...]: a params dict that omits one of these keys
    # should not raise KeyError *after* the expensive fit has already finished.
    return [{
        'dataset': dataset_name,
        'n_trees': model_params_dict.get('n_trees'),
        'sigma_a': model_params_dict.get('sigma_a'),
        'sigma_b': model_params_dict.get('sigma_b'),
        'time_elapsed': time_elapsed,
        'mse': score,
    }]

In [40]:
# Model keyword arguments used for the benchmark runs in the cells that follow.
params = dict(n_trees=50, sigma_a=0.5, sigma_b=0.2)

In [41]:
# Single fit/score round as a quick check before the repeated benchmark.
run_one_dgp_iter(
    dataset_name='breast_cancer',
    model_params_dict=params,
)

[{'dataset': 'breast_cancer',
  'n_trees': 50,
  'sigma_a': 0.5,
  'sigma_b': 0.2,
  'time_elapsed': 6.179091215133667,
  'mse': 0.15749812177607034}]

In [43]:
# tqdm is imported in the notebook's top import cell.
n_iter = 5  # default number of repeated fits per configuration


def bartpy2(dataset_name, dgp_params_dict_list_bartpy2, metric=mean_squared_error,
            n_iter=n_iter):
    """Repeat ``run_one_dgp_iter`` ``n_iter`` times and collect the records.

    Parameters
    ----------
    dataset_name
        Dataset identifier forwarded to ``run_one_dgp_iter``.
    dgp_params_dict_list_bartpy2
        Model parameter dict forwarded to ``run_one_dgp_iter`` (despite the
        name, a single dict is what gets passed through).
    metric
        Scoring callable forwarded to ``run_one_dgp_iter``. Previously this
        argument was accepted but silently ignored.
    n_iter
        Number of repetitions. The default is the value the module-level
        ``n_iter`` had when this function was defined.

    Returns
    -------
    pandas.DataFrame
        One row per repetition, built from the records that
        ``run_one_dgp_iter`` returns.
    """
    results = []
    for _ in tqdm(range(n_iter)):
        results.extend(
            run_one_dgp_iter(dataset_name, dgp_params_dict_list_bartpy2, metric=metric)
        )
    return pd.DataFrame(results)


# n_iter must be passed by keyword: the third positional parameter is `metric`.
result = bartpy2('breast_cancer', params, n_iter=n_iter)

100%|██████████| 5/5 [00:29<00:00,  5.99s/it]


In [47]:
# Display the per-iteration records returned by bartpy2() (one row per repeated fit).
result

Unnamed: 0,dataset,n_trees,sigma_a,sigma_b,time_elapsed,mse
0,breast_cancer,50,0.5,0.2,6.06398,0.157723
1,breast_cancer,50,0.5,0.2,6.003208,0.159391
2,breast_cancer,50,0.5,0.2,5.873144,0.158899
3,breast_cancer,50,0.5,0.2,5.841604,0.16171
4,breast_cancer,50,0.5,0.2,5.881692,0.157872


In [66]:
# Mean and sample standard deviation across the repeated runs in `result`:
# first for the elapsed time, then for the score stored under 'mse'.
mean_time, sd_time = result['time_elapsed'].mean(), result['time_elapsed'].std()
mean_mse, sd_mse = result['mse'].mean(), result['mse'].std()




In [72]:
# Summarise per dataset rather than broadcasting pooled scalars over the
# de-duplicated dataset column: with a single dataset the table is the same,
# and with several datasets each row gets its own mean / standard deviation
# instead of every row repeating the statistics of all runs pooled together.
stats_df = (
    result
    .groupby('dataset', sort=False)  # keep datasets in order of first appearance
    .agg(
        Mean_time=('time_elapsed', 'mean'),
        SD_time=('time_elapsed', 'std'),
        Mean_MSE=('mse', 'mean'),
        SD_MSE=('mse', 'std'),
    )
    .reset_index()
)
stats_df

Unnamed: 0,dataset,Mean_time,SD_time,Mean_MSE,SD_MSE
0,breast_cancer,5.932726,0.095725,0.159119,0.001608
