In [None]:
import os
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['TF_NUM_INTRAOP_THREADS'] = '1'
os.environ['TF_NUM_INTEROP_THREADS'] = '1'

import cv2
import time
import random
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from scipy.stats import entropy

In [None]:
img_size = 28
img_flat_size = img_size * img_size

num_label = 10

# Parameters of training
Learning_rate = 0.0005
epsilon = 1e-2

num_iter = 5100
batch_size = 128

validation_ratio = 0.1

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [None]:
(X, Y), (X_test, Y_test) = tf.keras.datasets.mnist.load_data()
X = X.astype("float32") / 255.0
X_test = X_test.astype("float32") / 255.0
Y = to_categorical(Y, num_classes=num_label)
Y_test = to_categorical(Y_test, num_classes=num_label)

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)

X_train = X_train.reshape(-1, img_flat_size)
X_val = X_val.reshape(-1, img_flat_size)
X_test = X_test.reshape(-1, img_flat_size)

print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)
print("X_val shape:", X_val.shape)
print("Y_val shape:", Y_val.shape)
print("X_test shape:", X_test.shape)
print("Y_test shape:", Y_test.shape)


X_train shape: (48000, 784)
Y_train shape: (48000, 10)
X_val shape: (12000, 784)
Y_val shape: (12000, 10)
X_test shape: (10000, 784)
Y_test shape: (10000, 10)


In [40]:
folders_notMNIST = os.listdir('./notMNIST_small')

NotMNIST_x_list = []
NotMNIST_y_list = []

for idx, folder in enumerate(folders_notMNIST):
    files_notMNIST = os.listdir('./notMNIST_small/' + folder)
    
    for file in files_notMNIST:
        img_NotMNIST = cv2.imread('./notMNIST_small/' + folder + '/' + file, 0)
        if img_NotMNIST is None:
            continue
        NotMNIST_flat = np.reshape(img_NotMNIST, (img_flat_size))
        NotMNIST_x_list.append(NotMNIST_flat)
        
        label_temp = np.zeros([num_label])
        label_temp[idx] = 1
        
        NotMNIST_y_list.append(label_temp)
        
NotMNIST_x = np.stack(NotMNIST_x_list, axis = 0)
NotMNIST_y = np.stack(NotMNIST_y_list, axis = 0)

print("NotMNIST X shape: " + str(NotMNIST_x.shape))
print("NotMNIST Y shape: " + str(NotMNIST_y.shape))

NotMNIST X shape: (18724, 784)
NotMNIST Y shape: (18724, 10)


In [None]:
def create_ensemble_model():
    model = models.Sequential([
        layers.Input(shape=(img_flat_size,)),
        layers.Dense(200, activation='relu'),
        layers.Dense(200, activation='relu'),
        layers.Dense(200, activation='relu'),
        layers.Dense(num_label, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

ensemble_size = 5
models_ensemble = []

for i in range(ensemble_size):
    print(f"Training model {i + 1}/{ensemble_size}")
    model = create_ensemble_model()
    model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=10, batch_size=128, verbose=2)
    models_ensemble.append(model)

def ensemble_predictions(models, X):
    predictions = np.array([model.predict(X) for model in models])
    mean_prediction = predictions.mean(axis=0)
    variance_prediction = predictions.var(axis=0)
    return mean_prediction, variance_prediction

mean_pred, var_pred = ensemble_predictions(models_ensemble, X_test)

predicted_classes = np.argmax(mean_pred, axis=1)
true_classes = np.argmax(Y_test, axis=1)
accuracy = np.mean(predicted_classes == true_classes)
print(f"Ensemble accuracy on test set: {accuracy:.4f}")
print(f"Variance for the first test sample: {var_pred[0]}")


Training model 1/5
Epoch 1/10
375/375 - 3s - 8ms/step - accuracy: 0.9136 - loss: 0.2983 - val_accuracy: 0.9588 - val_loss: 0.1353
Epoch 2/10
375/375 - 2s - 5ms/step - accuracy: 0.9680 - loss: 0.1084 - val_accuracy: 0.9679 - val_loss: 0.1024
Epoch 3/10
375/375 - 2s - 5ms/step - accuracy: 0.9779 - loss: 0.0719 - val_accuracy: 0.9696 - val_loss: 0.1017
Epoch 4/10
375/375 - 2s - 5ms/step - accuracy: 0.9844 - loss: 0.0507 - val_accuracy: 0.9692 - val_loss: 0.1068
Epoch 5/10
375/375 - 2s - 5ms/step - accuracy: 0.9877 - loss: 0.0386 - val_accuracy: 0.9730 - val_loss: 0.1007
Epoch 6/10
375/375 - 2s - 5ms/step - accuracy: 0.9890 - loss: 0.0326 - val_accuracy: 0.9728 - val_loss: 0.1084
Epoch 7/10
375/375 - 2s - 5ms/step - accuracy: 0.9918 - loss: 0.0254 - val_accuracy: 0.9695 - val_loss: 0.1293
Epoch 8/10
375/375 - 2s - 5ms/step - accuracy: 0.9921 - loss: 0.0239 - val_accuracy: 0.9750 - val_loss: 0.1091
Epoch 9/10
375/375 - 2s - 5ms/step - accuracy: 0.9937 - loss: 0.0190 - val_accuracy: 0.9731 -

In [None]:

def create_MC_Dropout_model():
    model = models.Sequential([
        layers.Input(shape=(28 * 28,)),
        layers.Dense(200, activation='relu'),
        layers.Dropout(0.2),  
        layers.Dense(200, activation='relu'),
        layers.Dropout(0.2), 
        layers.Dense(200, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(10, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

model_mc_dropout = create_MC_Dropout_model()
model_mc_dropout.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=10, batch_size=128, verbose=2)

def mc_dropout_inference(model, X, n_iter=5):
    @tf.function
    def predict_with_dropout(X):
        return model(X, training=True)  # Ensure dropout is active during inference

    predictions = np.array([predict_with_dropout(X).numpy() for _ in range(n_iter)])
    
    mean_prediction = predictions.mean(axis=0)
    variance_prediction = predictions.var(axis=0)
    
    return mean_prediction, variance_prediction

mean_pred_mc, var_pred_mc = mc_dropout_inference(model_mc_dropout, X_test, n_iter=5)

predicted_classes = np.argmax(mean_pred_mc, axis=1)
true_classes = np.argmax(Y_test, axis=1)
accuracy = np.mean(predicted_classes == true_classes)
print(f"Ensemble accuracy with MC Dropout on test set: {accuracy:.4f}")
print(f"Variance for the first test sample with MC Dropout: {var_pred_mc[0]}")


Epoch 1/10
375/375 - 3s - 8ms/step - accuracy: 0.8769 - loss: 0.4000 - val_accuracy: 0.9562 - val_loss: 0.1447
Epoch 2/10
375/375 - 2s - 5ms/step - accuracy: 0.9509 - loss: 0.1634 - val_accuracy: 0.9680 - val_loss: 0.1083
Epoch 3/10
375/375 - 2s - 5ms/step - accuracy: 0.9635 - loss: 0.1200 - val_accuracy: 0.9700 - val_loss: 0.0960
Epoch 4/10
375/375 - 2s - 5ms/step - accuracy: 0.9698 - loss: 0.0963 - val_accuracy: 0.9722 - val_loss: 0.0939
Epoch 5/10
375/375 - 2s - 5ms/step - accuracy: 0.9747 - loss: 0.0818 - val_accuracy: 0.9741 - val_loss: 0.0870
Epoch 6/10
375/375 - 2s - 5ms/step - accuracy: 0.9780 - loss: 0.0696 - val_accuracy: 0.9762 - val_loss: 0.0842
Epoch 7/10
375/375 - 2s - 4ms/step - accuracy: 0.9801 - loss: 0.0614 - val_accuracy: 0.9759 - val_loss: 0.0849
Epoch 8/10
375/375 - 2s - 5ms/step - accuracy: 0.9819 - loss: 0.0558 - val_accuracy: 0.9787 - val_loss: 0.0797
Epoch 9/10
375/375 - 2s - 5ms/step - accuracy: 0.9841 - loss: 0.0524 - val_accuracy: 0.9774 - val_loss: 0.0839
E

In [None]:

def create_bootstrap_model():
    model = models.Sequential([
        layers.Input(shape=(28 * 28,)),
        layers.Dense(200, activation='relu'),
        layers.Dense(200, activation='relu'),
        layers.Dense(200, activation='relu'),
        layers.Dense(10, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

ensemble_size = 5
models_bootstrap = []

for i in range(ensemble_size):
    print(f"Training model {i + 1}/{ensemble_size}")
    
    indices = np.random.choice(X_train.shape[0], size=X_train.shape[0], replace=True)
    X_train_bootstrap = X_train[indices]
    Y_train_bootstrap = Y_train[indices]
    
    model = create_bootstrap_model()
    model.fit(X_train_bootstrap, Y_train_bootstrap, validation_data=(X_val, Y_val), epochs=10, batch_size=128, verbose=2)
    models_bootstrap.append(model)

def ensemble_predictions(models, X):
    predictions = np.array([model.predict(X) for model in models])
    mean_prediction = predictions.mean(axis=0)
    variance_prediction = predictions.var(axis=0)
    return mean_prediction, variance_prediction

mean_pred, var_pred = ensemble_predictions(models_bootstrap, X_test)

predicted_classes = np.argmax(mean_pred, axis=1)
true_classes = np.argmax(Y_test, axis=1)
accuracy = np.mean(predicted_classes == true_classes)
print(f"Bootstrapped ensemble accuracy on test set: {accuracy:.4f}")
print(f"Variance for the first test sample: {var_pred[0]}")

Training model 1/5
Epoch 1/10
375/375 - 3s - 9ms/step - accuracy: 0.9167 - loss: 0.2889 - val_accuracy: 0.9543 - val_loss: 0.1454
Epoch 2/10
375/375 - 2s - 5ms/step - accuracy: 0.9722 - loss: 0.0926 - val_accuracy: 0.9588 - val_loss: 0.1295
Epoch 3/10
375/375 - 2s - 5ms/step - accuracy: 0.9842 - loss: 0.0531 - val_accuracy: 0.9676 - val_loss: 0.1124
Epoch 4/10
375/375 - 2s - 5ms/step - accuracy: 0.9893 - loss: 0.0339 - val_accuracy: 0.9702 - val_loss: 0.1099
Epoch 5/10
375/375 - 2s - 5ms/step - accuracy: 0.9928 - loss: 0.0232 - val_accuracy: 0.9709 - val_loss: 0.1142
Epoch 6/10
375/375 - 2s - 5ms/step - accuracy: 0.9941 - loss: 0.0184 - val_accuracy: 0.9628 - val_loss: 0.1549
Epoch 7/10
375/375 - 2s - 5ms/step - accuracy: 0.9946 - loss: 0.0159 - val_accuracy: 0.9694 - val_loss: 0.1343
Epoch 8/10
375/375 - 2s - 5ms/step - accuracy: 0.9954 - loss: 0.0131 - val_accuracy: 0.9717 - val_loss: 0.1460
Epoch 9/10
375/375 - 2s - 5ms/step - accuracy: 0.9948 - loss: 0.0149 - val_accuracy: 0.9740 -

In [None]:
from tensorflow.keras import layers, models, Input, Model
import numpy as np
import tensorflow as tf

def create_branch_model():
    input_dim = 784  

    input_layer = Input(shape=(input_dim,))
    shared_output = layers.Dense(200, activation='relu')(input_layer)
    shared_output = layers.Dropout(0.2)(shared_output)

    branch_output = layers.Dense(200, activation='relu')(shared_output)
    branch_output = layers.Dropout(0.2)(branch_output)
    branch_output = layers.Dense(200, activation='relu')(branch_output)
    branch_output = layers.Dropout(0.2)(branch_output)
    branch_output = layers.Dense(10, activation='softmax')(branch_output)

    model = Model(inputs=input_layer, outputs=branch_output)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

num_branches = 5
branch_models = [create_branch_model() for _ in range(num_branches)]

for i, model in enumerate(branch_models):
    print(f"Training branch model {i + 1}/{num_branches}")
    model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=10, batch_size=128, verbose=2)

def combine_branches(branch_models):
    input_dim = 784  # Input dimension
    input_layer = Input(shape=(input_dim,))
    
    branch_outputs = [branch(input_layer) for branch in branch_models]
    combined_output = layers.Concatenate()(branch_outputs)
    
    combined_model = Model(inputs=input_layer, outputs=combined_output)
    return combined_model

inference_model = combine_branches(branch_models)

def ensemble_predictions(model, X, num_branches):
    predictions = model.predict(X)
    branch_outputs = np.split(predictions, num_branches, axis=1)
    mean_prediction = np.mean(branch_outputs, axis=0)
    variance_prediction = np.var(branch_outputs, axis=0)
    return mean_prediction, variance_prediction

mean_pred, var_pred = ensemble_predictions(inference_model, X_test, num_branches)

predicted_classes = np.argmax(mean_pred, axis=1)
true_classes = np.argmax(Y_test, axis=1)
accuracy = np.mean(predicted_classes == true_classes)
print(f"Ensemble accuracy on test set: {accuracy:.4f}")
print(f"Variance for the first test sample: {var_pred[0]}")


Training branch model 1/5
Epoch 1/10
375/375 - 3s - 8ms/step - accuracy: 0.8816 - loss: 0.3908 - val_accuracy: 0.9578 - val_loss: 0.1432
Epoch 2/10
375/375 - 2s - 4ms/step - accuracy: 0.9537 - loss: 0.1577 - val_accuracy: 0.9653 - val_loss: 0.1130
Epoch 3/10
375/375 - 2s - 4ms/step - accuracy: 0.9654 - loss: 0.1158 - val_accuracy: 0.9705 - val_loss: 0.0986
Epoch 4/10
375/375 - 2s - 4ms/step - accuracy: 0.9696 - loss: 0.0988 - val_accuracy: 0.9754 - val_loss: 0.0865
Epoch 5/10
375/375 - 2s - 5ms/step - accuracy: 0.9754 - loss: 0.0788 - val_accuracy: 0.9758 - val_loss: 0.0849
Epoch 6/10
375/375 - 2s - 5ms/step - accuracy: 0.9770 - loss: 0.0704 - val_accuracy: 0.9764 - val_loss: 0.0820
Epoch 7/10
375/375 - 2s - 4ms/step - accuracy: 0.9803 - loss: 0.0640 - val_accuracy: 0.9750 - val_loss: 0.0926
Epoch 8/10
375/375 - 2s - 4ms/step - accuracy: 0.9818 - loss: 0.0553 - val_accuracy: 0.9786 - val_loss: 0.0774
Epoch 9/10
375/375 - 2s - 5ms/step - accuracy: 0.9841 - loss: 0.0487 - val_accuracy: 0

In [None]:
def predict_with_dropout(model, X):
    return model(X, training=True)  # Ensure dropout is active during inference

def evaluate_models(models_list, test_x, test_y, training=False):
    num_samples = 900
    sample_indices = np.random.choice(test_x.shape[0], num_samples, replace=False)

    ensemble_accuracy = 0
    single_accuracy = 0
    inference_times = []

    for i, index in tqdm(enumerate(sample_indices)):
        img_sample = np.reshape(test_x[index], (1, -1))
        true_label = np.reshape(test_y[index], (1, -1))

        ensemble_prob = np.zeros((1, test_y.shape[1]))
        start_time = time.time()

        if training:
            for model in models_list:
                prob = []
                for _ in range(5):
                    single_prediction = predict_with_dropout(model, tf.constant(img_sample, dtype=tf.float32)).numpy()
                    prob.append(single_prediction)
                prob = np.array(prob)

                avg_prob = prob.mean(axis=0)  # Average the probabilities across iterations
                ensemble_prob += avg_prob
        else:
            for model in models_list:
                prob = model.predict(img_sample, verbose=0)
                ensemble_prob += prob
        
        end_time = time.time()
        inference_times.append(end_time - start_time)

        ensemble_prob /= len(models_list)
        single_prob = models_list[0].predict(img_sample, verbose=0)

        ensemble_accuracy += np.argmax(ensemble_prob) == np.argmax(true_label)
        single_accuracy += np.argmax(single_prob) == np.argmax(true_label)

    avg_ensemble_accuracy = ensemble_accuracy / num_samples
    avg_single_accuracy = single_accuracy / num_samples
    avg_inference_time = np.mean(inference_times)

    print("================== Evaluation Results ==================")
    print(f"Ensemble Accuracy: {avg_ensemble_accuracy:.4f}")
    print(f"Single Model Accuracy: {avg_single_accuracy:.4f}")
    print(f"Average Inference Time per Sample: {avg_inference_time:.6f} seconds")
    

def evaluate_combined_model(combined_model, test_x, test_y, num_branches):
    num_samples = 900
    sample_indices = np.random.choice(test_x.shape[0], num_samples, replace=False)

    ensemble_accuracy = 0
    single_accuracy = 0
    inference_times = []

    for i, index in tqdm(enumerate(sample_indices)):
        img_sample = np.reshape(test_x[index], (1, -1))
        true_label = np.reshape(test_y[index], (1, -1))

        start_time = time.time()
        combined_output = combined_model.predict(img_sample, verbose=0)
        end_time = time.time()
        
        branch_outputs = np.split(combined_output, num_branches, axis=1)
        ensemble_prob = np.mean(branch_outputs, axis=0)

        single_prob = branch_outputs[0]
        
        inference_times.append(end_time - start_time)

        ensemble_accuracy += np.argmax(ensemble_prob) == np.argmax(true_label)
        single_accuracy += np.argmax(single_prob) == np.argmax(true_label)

    avg_ensemble_accuracy = ensemble_accuracy / num_samples
    avg_single_accuracy = single_accuracy / num_samples
    avg_inference_time = np.mean(inference_times)

    print("================== Evaluation Results ==================")
    print(f"Ensemble Accuracy: {avg_ensemble_accuracy:.4f}")
    print(f"Single Model Accuracy (First Branch): {avg_single_accuracy:.4f}")
    print(f"Average Inference Time per Sample: {avg_inference_time:.6f} seconds")


evaluate_models(models_ensemble, X_test, Y_test)
evaluate_models([model_mc_dropout]*5, X_test, Y_test, training=True)
evaluate_models(models_bootstrap, X_test, Y_test)
evaluate_combined_model(inference_model, X_test, Y_test, 5)

900it [03:06,  4.81it/s]

Ensemble Accuracy: 0.9733
Single Model Accuracy: 0.9767
Average Inference Time per Sample: 0.158070 seconds





In [None]:
def predict_with_dropout(model, X):
    return model(X, training=True)  # Ensure dropout is active during inference

def evaluate_models_non_mnist(models_list, test_x, test_y, training=False):
    num_samples = 900
    sample_indices = np.random.choice(test_x.shape[0], num_samples, replace=False)

    ensemble_accuracy = 0
    single_accuracy = 0
    inference_times = []

    array_ensemble_NotMNIST = np.zeros([num_samples])
    count_ex_n_en = 0
    tot_prob_en = 0
    conf = 0.2
    
    variance_list = []
    entropy_list = []

    for i, index in tqdm(enumerate(sample_indices)):
        img_sample = np.reshape(test_x[index], (1, -1))
        true_label = np.reshape(test_y[index], (1, -1))

        ensemble_prob = np.zeros((1, test_y.shape[1]))
        individual_probs = []
        start_time = time.time()

        
        if training:
            for model in models_list:
                # Run the model n_iter times with dropout enabled
                prob = np.array([predict_with_dropout(model, tf.constant(img_sample, dtype=tf.float32)).numpy() for _ in range(5)])
                avg_prob = prob.mean(axis=0) 
                ensemble_prob += avg_prob
                individual_probs.append(avg_prob)
        else:
            for model in models_list:
                prob = model.predict(img_sample, verbose=0)
                ensemble_prob += prob
                individual_probs.append(prob)

        end_time = time.time()
        inference_times.append(end_time - start_time)

        ensemble_prob /= len(models_list)
        single_prob = models_list[0].predict(img_sample, verbose=0)
        ensemble_accuracy += np.argmax(ensemble_prob) == np.argmax(true_label)
        single_accuracy += np.argmax(single_prob) == np.argmax(true_label)
        
        individual_probs = np.array(individual_probs).squeeze()
        variance = np.var(individual_probs, axis=0).mean()
        variance_list.append(variance)
        
        pred_entropy = entropy(ensemble_prob.squeeze())
        entropy_list.append(pred_entropy)

        idx_sample = np.argmax(ensemble_prob)
        max_prob = ensemble_prob[0, idx_sample]
        array_ensemble_NotMNIST[i] = max_prob
        tot_prob_en += max_prob

        if max_prob >= conf:
            count_ex_n_en += 1
        # print(f"{i + 1}th sample: label = {idx_sample}, Confidence = {max_prob}")

    avg_ensemble_accuracy = ensemble_accuracy / num_samples
    avg_single_accuracy = single_accuracy / num_samples
    avg_inference_time = np.mean(inference_times)
    avg_variance = np.mean(variance_list)
    avg_entropy = np.mean(entropy_list)

    print("====================== Ensemble Result ======================")
    print(f"Accuracy: {avg_ensemble_accuracy:.4f}")
    print(f"Avg confidence: {tot_prob_en / num_samples:.4f}")
    print(f"Satisfying Ensemble NotMNIST: {count_ex_n_en}")
    print(f"Average Variance: {avg_variance:.4f}")
    print(f"Average Entropy: {avg_entropy:.4f}")
    print(f"Average Inference Time per Sample: {avg_inference_time:.6f} seconds")


def evaluate_combined_model_non_mnist(combined_model, test_x, test_y, num_branches):
    import time
    from scipy.stats import entropy

    num_samples = 900
    sample_indices = np.random.choice(test_x.shape[0], num_samples, replace=False)

    ensemble_accuracy = 0
    single_accuracy = 0
    inference_times = []

    array_ensemble_NotMNIST = np.zeros([num_samples])
    count_ex_n_en = 0
    tot_prob_en = 0
    conf = 0.2

    variance_list = []
    entropy_list = []

    for i, index in tqdm(enumerate(sample_indices)):
        img_sample = np.reshape(test_x[index], (1, -1))
        true_label = np.reshape(test_y[index], (1, -1))

        start_time = time.time()
        combined_output = combined_model.predict(img_sample, verbose=0)
        
        branch_outputs = np.split(combined_output, num_branches, axis=1)
        ensemble_prob = np.mean(branch_outputs, axis=0)

        single_prob = branch_outputs[0]

        end_time = time.time()
        inference_times.append(end_time - start_time)

        ensemble_accuracy += np.argmax(ensemble_prob) == np.argmax(true_label)
        single_accuracy += np.argmax(single_prob) == np.argmax(true_label)

        branch_predictions = np.array(branch_outputs).squeeze()
        variance = np.var(branch_predictions, axis=0).mean()
        variance_list.append(variance)

        pred_entropy = entropy(ensemble_prob.squeeze())
        entropy_list.append(pred_entropy)

        idx_sample = np.argmax(ensemble_prob)
        max_prob = ensemble_prob[0, idx_sample]
        array_ensemble_NotMNIST[i] = max_prob
        tot_prob_en += max_prob

        if max_prob >= conf:
            count_ex_n_en += 1

    avg_ensemble_accuracy = ensemble_accuracy / num_samples
    avg_single_accuracy = single_accuracy / num_samples
    avg_inference_time = np.mean(inference_times)
    avg_variance = np.mean(variance_list)
    avg_entropy = np.mean(entropy_list)

    print("====================== Ensemble Result ======================")
    print(f"Accuracy: {avg_ensemble_accuracy:.4f}")
    print(f"Avg confidence: {tot_prob_en / num_samples:.4f}")
    print(f"Satisfying Ensemble NotMNIST: {count_ex_n_en}")
    print(f"Average Variance: {avg_variance:.4f}")
    print(f"Average Entropy: {avg_entropy:.4f}")
    print(f"Average Inference Time per Sample: {avg_inference_time:.6f} seconds")
    
    
evaluate_models_non_mnist(models_ensemble, NotMNIST_x, NotMNIST_y)
evaluate_models_non_mnist([model_mc_dropout]*5, NotMNIST_x, NotMNIST_y, training=True)
evaluate_models_non_mnist(models_bootstrap, NotMNIST_x, NotMNIST_y)
evaluate_combined_model_non_mnist(inference_model, NotMNIST_x, NotMNIST_y, 5)


900it [03:12,  4.68it/s]

Accuracy: 0.0978
Avg confidence: 0.7545
Average Variance: 0.0053
Average Entropy: 0.5987
Average Inference Time per Sample: 0.164834 seconds



