In [5]:

import imodels
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from bartpy2.sklearnmodel import SklearnModel 
from sklearn.metrics import mean_squared_error
from time import time
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor


In [2]:
X_bc, y_bc, feature_names = imodels.get_clean_dataset('breast_cancer', data_source='imodels')
X_bc_train, X_bc_test, y_bc_train, y_bc_test = train_test_split(X_bc, y_bc, test_size=0.25, random_state=42)

In [35]:
# from sklearn.model_selection import GridSearchCV

# # Define the parameter grid
# param_grid = {
#     'n_trees': [10, 20, 50],
#     'n_burn': [50, 100, 200],
#     'n_samples': [100, 200, 500]
# }

# # Initialize the BART model
# bart_model = SklearnModel()

# # Create the GridSearchCV object
# grid_search = GridSearchCV(bart_model, param_grid, cv=3, scoring='roc_auc')

# # Fit GridSearchCV
# grid_search.fit(X_bc_train, y_bc_train)

# # Print the best parameters and the best score
# print("Best parameters:", grid_search.best_params_)
# print("Best score:", grid_search.best_score_) 


In [36]:

#scaler = MinMaxScaler(feature_range=(0, 1))

# Define the parameter grid
# param_grid = {
#     'n_trees': [10, 20, 50],
#     'n_burn': [50, 100, 200],
#     'n_samples': [100, 200, 500]
# }

# # Initialize a list to store the results
# results = []

# # Iterate over each combination of parameters
# for params in ParameterGrid(param_grid):
#     # Initialize the BART model with current parameters
#     bart_model = SklearnModel(n_trees=params['n_trees'], n_burn=params['n_burn'], n_samples=params['n_samples'])
    
#     # Start timing
#     start_time = time()
    
#     # Fit the model
#     bart_model.fit(X_bc_train, y_bc_train)
    
#     # Predict
#     y_pred_proba = bart_model.predict(X_bc_test)
#     #normalized_proba = scaler.fit_transform(y_pred_proba)
#     y_pred = (y_pred_proba > 0.5).astype(int) 

    
#     # Calculate F1 score
#     mse = mean_squared_error(y_bc_test, y_pred)
    
    
#     # Stop timing
#     time_elapsed = time() - start_time
    
#     # Append the results to the list
#     results.append({
#         'n_trees': params['n_trees'],
#         'n_burn': params['n_burn'],
#         'n_samples': params['n_samples'],
#         'time_elapsed': time_elapsed,
#         'mse': mse
#     })
# results_bc = pd.DataFrame(results)

In [37]:
# results_bc

In [3]:

def get_model_score(X_train, y_train, X_test, y_test, model, metric):
    model.fit(X_train, y_train)
    preds_test_prob = model.predict(X_test)
    #y_pred = (preds_test_prob > 0.5).astype(int) 
    score = metric(y_test, preds_test_prob)
    return score

def make_model(model_params_dict):
    model =  SklearnModel(**model_params_dict)
    return model

def make_dgp(dataset_name):
    X, y, feature_names = imodels.get_clean_dataset(dataset_name, data_source='imodels')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    return X_train, X_test, y_train,y_test

In [35]:
## running one step of  simulation 

def run_one_dgp_iter(dataset_name, model_params_dict, metric=mean_squared_error
                     ):
    X_train, X_test, y_train,y_test = make_dgp(dataset_name)
    results = []
    
    model = make_model(model_params_dict)
    start_time = time()
    score = get_model_score(X_train, y_train, X_test, y_test, model, metric)
    time_elapsed = time() - start_time
    results.append({
        'dataset': dataset_name,
        'n_trees': model_params_dict['n_trees'],
        'sigma_a': model_params_dict['sigma_a'],
        'sigma_b': model_params_dict['sigma_b'],
        'time_elapsed': time_elapsed,
        'mse': score
            })
    return results

def one_comparsion(dataset_names, model_params_dict, metric=mean_squared_error
                     ):
    results = []
    
    for dataset_name in dataset_names:
        X_train, X_test, y_train,y_test = make_dgp(dataset_name)
    
    # bart model
        model = make_model(model_params_dict)
        start_time = time()
        score = get_model_score(X_train, y_train, X_test, y_test, model, metric)
        time_elapsed = time() - start_time
        results.append({
            'dataset': dataset_name,
            'method':'BART',
            'n_trees': model_params_dict['n_trees'],
            'sigma_a': model_params_dict['sigma_a'],
            'sigma_b': model_params_dict['sigma_b'],
            'time_elapsed': time_elapsed,
            'mse': score
        })
    
    #random forest part
        modelr = RandomForestRegressor()
        start_time_r = time()
        score_r = get_model_score(X_train, y_train, X_test, y_test, modelr, metric)
        time_elapsed_r = time() - start_time_r

        results.append({
            'dataset': dataset_name,
            'method':'RandomForest',
            'n_trees': model_params_dict['n_trees'],
            'sigma_a': model_params_dict['sigma_a'],
            'sigma_b': model_params_dict['sigma_b'],
            'time_elapsed': time_elapsed_r,
            'mse': score_r
            })
    #results = pd.DataFrame(results)
    return results

In [37]:
params = {'n_trees': 100,'sigma_a' : 0.5,"sigma_b" : 0.25}
dataset_names=['breast_cancer','heart']

In [38]:
one_comparsion(dataset_names=dataset_names,model_params_dict=params)

fetching heart from imodels


[{'dataset': 'breast_cancer',
  'method': 'BART',
  'n_trees': 100,
  'sigma_a': 0.5,
  'sigma_b': 0.25,
  'time_elapsed': 11.772703647613525,
  'mse': 0.15981156363278937},
 {'dataset': 'breast_cancer',
  'method': 'RandomForest',
  'n_trees': 100,
  'sigma_a': 0.5,
  'sigma_b': 0.25,
  'time_elapsed': 0.10523867607116699,
  'mse': 0.16712843017492715},
 {'dataset': 'heart',
  'method': 'BART',
  'n_trees': 100,
  'sigma_a': 0.5,
  'sigma_b': 0.25,
  'time_elapsed': 11.691163301467896,
  'mse': 0.11624428309627328},
 {'dataset': 'heart',
  'method': 'RandomForest',
  'n_trees': 100,
  'sigma_a': 0.5,
  'sigma_b': 0.25,
  'time_elapsed': 0.12054252624511719,
  'mse': 0.14587205882352944}]

In [39]:
from tqdm import tqdm
n_iter = 5
def bartpy2(dataset_names,dgp_params_dict_list_bartpy2, metric=mean_squared_error,
         n_iter=n_iter):
    results = []
    for iter_num in tqdm(range(n_iter)):
        results+= one_comparsion(dataset_names, dgp_params_dict_list_bartpy2)
    result= pd.DataFrame(results)
    return result

result = bartpy2(dataset_names,params,n_iter)

  0%|          | 0/5 [00:00<?, ?it/s]

fetching heart from imodels


 20%|██        | 1/5 [00:23<01:34, 23.75s/it]

fetching heart from imodels


 40%|████      | 2/5 [00:47<01:10, 23.63s/it]

fetching heart from imodels


 60%|██████    | 3/5 [01:10<00:47, 23.62s/it]

fetching heart from imodels


 80%|████████  | 4/5 [01:34<00:23, 23.59s/it]

fetching heart from imodels


100%|██████████| 5/5 [01:57<00:00, 23.51s/it]


In [40]:
result

Unnamed: 0,dataset,method,n_trees,sigma_a,sigma_b,time_elapsed,mse
0,breast_cancer,BART,100,0.5,0.25,11.65749,0.161257
1,breast_cancer,RandomForest,100,0.5,0.25,0.095423,0.165366
2,heart,BART,100,0.5,0.25,11.725797,0.116105
3,heart,RandomForest,100,0.5,0.25,0.118258,0.142491
4,breast_cancer,BART,100,0.5,0.25,11.747947,0.158791
5,breast_cancer,RandomForest,100,0.5,0.25,0.097731,0.160765
6,heart,BART,100,0.5,0.25,11.430515,0.116355
7,heart,RandomForest,100,0.5,0.25,0.119701,0.141106
8,breast_cancer,BART,100,0.5,0.25,11.692869,0.159674
9,breast_cancer,RandomForest,100,0.5,0.25,0.096566,0.160881


In [41]:

grouped = result.groupby('method')
grouped['mse'].mean()
#grouped['time_elapsed'].std()
print(grouped['dataset'])


<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000011FD35B88B0>


In [44]:
result['Combined'] = result['method'] + result['dataset']
grouped = result.groupby('Combined')
grouped['mse'].mean()
#grouped['time_elapsed'].std()
print(grouped['dataset'])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000011FD6ACF040>


In [46]:
stats_df = pd.DataFrame({
    'Mean_MSE':grouped['mse'].mean(),
    'Mean_time':grouped['time_elapsed'].mean(),
    'SD_MSE':grouped['mse'].std(),
    'SD_time':grouped['time_elapsed'].std()
})
stats_df

Unnamed: 0_level_0,Mean_MSE,Mean_time,SD_MSE,SD_time
Combined,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BARTbreast_cancer,0.160446,11.689117,0.001258,0.035345
BARTheart,0.115391,11.454373,0.001079,0.24763
RandomForestbreast_cancer,0.163731,0.100084,0.003929,0.005107
RandomForestheart,0.143586,0.120736,0.002504,0.002801
