In [6]:
import pymc3 as pm
import imodels
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from bartpy2.sklearnmodel import SklearnModel 
from sklearn.metrics import f1_score, roc_auc_score

In [3]:
# Load the cleaned breast-cancer dataset through imodels, then hold out 25%
# of the rows for evaluation (random_state is fixed so the split is repeatable).
X_bc, y_bc, feature_names = imodels.get_clean_dataset(
    'breast_cancer',
    data_source='imodels',
)
X_bc_train, X_bc_test, y_bc_train, y_bc_test = train_test_split(
    X_bc,
    y_bc,
    test_size=0.25,
    random_state=42,
)

In [69]:
# from sklearn.model_selection import GridSearchCV

# # Define the parameter grid
# param_grid = {
#     'n_trees': [10, 20, 50],
#     'n_burn': [50, 100, 200],
#     'n_samples': [100, 200, 500]
# }

# # Initialize the BART model
# bart_model = SklearnModel()

# # Create the GridSearchCV object
# grid_search = GridSearchCV(bart_model, param_grid, cv=3, scoring='roc_auc')

# # Fit GridSearchCV
# grid_search.fit(X_bc_train, y_bc_train)

# # Print the best parameters and the best score
# print("Best parameters:", grid_search.best_params_)
# print("Best score:", grid_search.best_score_) 


In [7]:
from time import time
from sklearn.model_selection import ParameterGrid

# Hyperparameter values to sweep; every combination is fitted exactly once.
param_grid = {
    'n_trees': [10, 20, 50],
    'n_burn': [50, 100, 200],
    'n_samples': [100, 200, 500],
}

# One record (dict) per parameter combination is collected here.
results = []

for params in ParameterGrid(param_grid):
    # The grid keys are exactly the keyword names passed to SklearnModel,
    # so the combination can be unpacked directly.
    bart_model = SklearnModel(**params)

    # The timed section spans fitting, prediction and metric computation.
    start_time = time()

    bart_model.fit(X_bc_train, y_bc_train)

    # Model output on the held-out rows; values above 0.5 become class 1.
    y_pred_proba = bart_model.predict(X_bc_test)
    y_pred = (y_pred_proba > 0.5).astype(int)

    # F1 uses the thresholded labels, AUC the raw model output.
    f1 = f1_score(y_bc_test, y_pred)
    auc = roc_auc_score(y_bc_test, y_pred_proba)

    time_elapsed = time() - start_time

    # Keys are listed explicitly to fix the column order of the final table.
    results.append(dict(
        n_trees=params['n_trees'],
        n_burn=params['n_burn'],
        n_samples=params['n_samples'],
        time_elapsed=time_elapsed,
        f1_score=f1,
        AUC=auc,
    ))

# Build the summary table once, after the sweep has finished.
results_bc = pd.DataFrame(results)

In [8]:
# Display the grid-search summary table built from the breast-cancer split.
results_bc

Unnamed: 0,n_trees,n_burn,n_samples,time_elapsed,f1_score,AUC
0,10,50,100,0.778175,0.5,0.789474
1,20,50,100,1.287857,0.451613,0.789474
2,50,50,100,2.541757,0.451613,0.781218
3,10,50,200,1.048141,0.428571,0.78741
4,20,50,200,1.850687,0.387097,0.776058
5,50,50,200,4.558413,0.4375,0.775026
6,10,50,500,2.16867,0.428571,0.801858
7,20,50,500,4.174431,0.413793,0.779154
8,50,50,500,10.571699,0.413793,0.781218
9,10,100,100,0.884038,0.444444,0.785346


In [72]:

# Refit a single BART model with one hand-picked configuration and score it
# on the held-out breast-cancer rows.
model_choose = SklearnModel(n_trees=20, n_burn=50, n_samples=100)
model_choose.fit(X_bc_train, y_bc_train)

# Predict once and reuse the result. The original called predict() twice on
# the same input; `predictions_choose` is kept as an alias so that both names
# remain available to any later cell.
y_pred_proba_choose = model_choose.predict(X_bc_test)
predictions_choose = y_pred_proba_choose

# Values above 0.5 are labelled as class 1.
y_pred_choose = (y_pred_proba_choose > 0.5).astype(int)

# AUC is computed from the raw model output, F1 from the thresholded labels.
auc = roc_auc_score(y_bc_test, y_pred_proba_choose)
f1 = f1_score(y_bc_test, y_pred_choose)

print("AUC:", auc)
print("F1 Score:", f1)

AUC: 0.7760577915376677
F1 Score: 0.4375


In [73]:
# Load a dataset into the `_h` variables and hold out 25% of the rows.
# NOTE(review): this requests 'breast_cancer' again, i.e. the same dataset and
# the same seeded split as the `_bc` variables, although the names are
# suffixed `_h`. Presumably a different dataset was intended -- confirm the
# dataset name before relying on results derived from this split.
X_h, y_h, feature_names = imodels.get_clean_dataset(
    'breast_cancer',
    data_source='imodels',
)
X_h_train, X_h_test, y_h_train, y_h_test = train_test_split(
    X_h,
    y_h,
    test_size=0.25,
    random_state=42,
)

In [74]:

# Same sweep as for the breast-cancer split, run on the `_h` split.
# `time` and `ParameterGrid` are imported in an earlier cell, which must have
# been executed before this one.
param_grid = {
    'n_trees': [10, 20, 50],
    'n_burn': [50, 100, 200],
    'n_samples': [100, 200, 500],
}

# Accumulates one record (dict) per parameter combination.
results = []

for params in ParameterGrid(param_grid):
    # Grid keys coincide with the SklearnModel keyword names.
    bart_model = SklearnModel(**params)

    # Timing covers fit, predict and the metric calls below.
    start_time = time()

    bart_model.fit(X_h_train, y_h_train)

    # Raw model output on the held-out rows, thresholded at 0.5 for labels.
    y_pred_proba = bart_model.predict(X_h_test)
    y_pred = (y_pred_proba > 0.5).astype(int)

    # F1 from the labels, AUC from the raw output.
    f1 = f1_score(y_h_test, y_pred)
    auc = roc_auc_score(y_h_test, y_pred_proba)

    time_elapsed = time() - start_time

    # Explicit key order determines the column order of the summary table.
    results.append(dict(
        n_trees=params['n_trees'],
        n_burn=params['n_burn'],
        n_samples=params['n_samples'],
        time_elapsed=time_elapsed,
        f1_score=f1,
        AUC=auc,
    ))

# Assemble the summary table in one step after the loop.
results_h = pd.DataFrame(results)

In [75]:
# Display the grid-search summary table built from the `_h` split.
results_h

Unnamed: 0,n_trees,n_burn,n_samples,time_elapsed,f1_score,AUC
0,10,50,100,0.75248,0.428571,0.803922
1,20,50,100,1.173944,0.4375,0.775026
2,50,50,100,2.505236,0.387097,0.781218
3,10,50,200,1.208127,0.387097,0.789474
4,20,50,200,1.789594,0.482759,0.78741
5,50,50,200,4.598457,0.451613,0.779154
6,10,50,500,1.878454,0.451613,0.769866
7,20,50,500,3.662333,0.4,0.770898
8,50,50,500,9.508078,0.4,0.786378
9,10,100,100,0.842675,0.4,0.737874


In [76]:
# Load the cleaned FICO dataset through imodels and hold out 25% of the rows
# for evaluation (random_state is fixed so the split is repeatable).
X_f, y_f, feature_names = imodels.get_clean_dataset(
    'fico',
    data_source='imodels',
)
X_f_train, X_f_test, y_f_train, y_f_test = train_test_split(
    X_f,
    y_f,
    test_size=0.25,
    random_state=42,
)

fetching fico from imodels


In [77]:
# from time import time
# from sklearn.model_selection import ParameterGrid

# # Define the parameter grid
# param_grid = {
#     'n_trees': [20, 50,100],
#     'n_burn': [100, 200, 500],
#     'n_samples': [100, 200, 500]
# }

# # Initialize a list to store the results
# results = []

# # Iterate over each combination of parameters
# for params in ParameterGrid(param_grid):
#     # Initialize the BART model with current parameters
#     bart_model = SklearnModel(n_trees=params['n_trees'], n_burn=params['n_burn'], n_samples=params['n_samples'])
    
#     # Start timing
#     start_time = time()
    
#     # Fit the model
#     bart_model.fit(X_f_train, y_f_train)
    
#     # Predict
#     y_pred_proba = bart_model.predict(X_f_test)
    
#     y_pred = (y_pred_proba > 0.5).astype(int) 

    
#     # Calculate F1 score
#     f1 = f1_score(y_f_test, y_pred)
#     auc = roc_auc_score(y_f_test, y_pred_proba)
    
#     # Stop timing
#     time_elapsed = time() - start_time
    
#     # Append the results to the list
#     results.append({
#         'n_trees': params['n_trees'],
#         'n_burn': params['n_burn'],
#         'n_samples': params['n_samples'],
#         'time_elapsed': time_elapsed,
#         'f1_score': f1,
#         'AUC': auc
#     })
# results_f = pd.DataFrame(results)

OSError: [WinError 1450] 系统资源不足，无法完成请求的服务。

In [None]:

# Hyperparameter sweep for the PyMC3 BART model. `n_trees` is forwarded to
# `pm.BART` as `m`, and `alpha` as `alpha`.
param_grid = {
    'n_trees': [10, 50, 100],
    'alpha': [0.8, 0.95, 0.98]
    #'k': [2, 5, 10]
}

# Create the parameter grid
grid = ParameterGrid(param_grid)

# One record (dict) per parameter combination
results = []

# Loop over each combination of parameters
for params in grid:
    # Set up the PyMC3 model with BART using the current set of parameters
    with pm.Model() as model:
        σ = pm.HalfCauchy("σ", beta=1)
        μ = pm.BART("μ", X_bc_train, y_bc_train, m=params['n_trees'], alpha=params['alpha'])
        y = pm.Normal("y", mu=μ, sigma=σ, observed=y_bc_train)

        # Time only the sampling step.
        # NOTE(review): 10 draws after 10 tuning steps is a very short run;
        # presumably a quick smoke-test setting -- confirm before trusting scores.
        start_time = time()
        trace = pm.sample(10, tune=10, cores=1)
        elapsed_time = time() - start_time

        # Draw μ for the training inputs. The model context opened above is
        # still active here, so no nested `with model:` is needed.
        posterior_pred = pm.sample_posterior_predictive(trace, var_names=["μ"])

    # Average over the sample axis to get one score per training row.
    predictions = posterior_pred['μ'].mean(axis=0)

    # Convert the averaged scores into binary class labels.
    y_pred = (predictions > 0.5).astype(int)

    # Both metrics are in-sample: μ is defined on X_bc_train, so the scores are
    # compared with y_bc_train. The AUC uses the scores computed in this cell;
    # the original referenced `y_pred_proba`, which this cell never defines.
    auc = roc_auc_score(y_bc_train, predictions)
    f1 = f1_score(y_bc_train, y_pred)

    # Store results (AUC is recorded alongside F1, as in the earlier sweeps).
    results.append({
        'params': params,
        'time': elapsed_time,
        'f1_score': f1,
        'AUC': auc
    })


In [21]:
# Turn the records currently held in `results` into a DataFrame and display it.
# `results` is a name reused by several earlier cells, so this table reflects
# whichever sweep populated it most recently.
results_bc_pymc3 = pd.DataFrame(results)
results_bc_pymc3

Unnamed: 0,params,time,f1_score
0,"{'alpha': 0.8, 'k': 2, 'n_trees': 10}",8.310664,0.626263
1,"{'alpha': 0.8, 'k': 2, 'n_trees': 50}",36.656572,0.784314
2,"{'alpha': 0.8, 'k': 2, 'n_trees': 100}",70.655161,0.833333
3,"{'alpha': 0.8, 'k': 5, 'n_trees': 10}",7.117329,0.589474
4,"{'alpha': 0.8, 'k': 5, 'n_trees': 50}",34.811579,0.764706
5,"{'alpha': 0.8, 'k': 5, 'n_trees': 100}",74.267206,0.844037
6,"{'alpha': 0.8, 'k': 10, 'n_trees': 10}",7.389547,0.646465
7,"{'alpha': 0.8, 'k': 10, 'n_trees': 50}",35.586567,0.807692
8,"{'alpha': 0.8, 'k': 10, 'n_trees': 100}",71.3566,0.851852
9,"{'alpha': 0.95, 'k': 2, 'n_trees': 10}",9.395776,0.732673
