In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import save_model, load_model
#from tensorflow.keras.callbacks import Callback
#%load_ext tensorboard
%matplotlib widget
import lottery_ticket_pruner
from lottery_ticket_pruner import LotteryTicketPruner, PrunerCallback
from mine import MINE
%run functions.ipynb

In [2]:
# Load the MNIST dataset using TensorFlow
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
# Display the shapes of the training and test datasets
print("Training data shape:", x_train.shape, y_train.shape)
print("Test data shape:", x_test.shape, y_test.shape)

# reshape data as 2D numpy arrays
# convert to float32 and normalize grayscale for better num. representation
x_train = x_train.reshape(60000, 784).astype("float32") / 255
x_test = x_test.reshape(10000, 784).astype("float32") / 255

y_train = y_train.astype("float32")
y_test = y_test.astype("float32")

# The tutorial reserved 10.000 training samples for validation, we change to 5.000 
# as that is what Frankle and Carbin did in their paper
x_val = x_train[-5000:]
y_val = y_train[-5000:]
x_train = x_train[:-5000]
y_train = y_train[:-5000]

Training data shape: (60000, 28, 28) (60000,)
Test data shape: (10000, 28, 28) (10000,)


In [3]:
# Hyperparams
batch_size = 60 # batchsize, 60 images per weight update
epochs = 10 # nr. of epochs we train our models
validation_split = 1/11 # 5000 val 55000 train data
input_dim = 784 # input_distribution size for MINE
d1_dim = 100 # first hidden layer distribution size for MINE
d2_dim = 30  # second hidden layer distribution size for MINE
output_dim = 10 # output_distribution dim for MINE
pruning_rate = 0.5 # pruning rate for LTH iterative Pruning -> removes pruning_rate% of lowest magnitude weights in an iteration
pruning_iterations = 5 # number of iterations for applying the pruning rate iteratively -> 1 time : 20% sparse, 13 times : ~95% sparse
averaging_iterations = 2 # number of total experimental runs to average for graph representations

In [4]:
tf.keras.backend.clear_session() # clearing backend right at start, just in case

inputs = keras.Input(shape=(input_dim,), name="digits") # Functional build of a 2-hidden layer fully connected MLP
x = layers.Dense(d1_dim, activation="relu", name="dense_1")(inputs) # methods made no mention of the activaton function specifically
x = layers.Dense(d2_dim, activation="relu", name="dense_2")(x) # ReLU is standard, as all available implementations seem to use it too
outputs = layers.Dense(output_dim, activation="softmax", name="predictions")(x)  # softmax activation for multi-class classification

base_model = keras.Model(inputs=inputs, outputs=outputs)
base_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 digits (InputLayer)         [(None, 784)]             0         
                                                                 
 dense_1 (Dense)             (None, 100)               78500     
                                                                 
 dense_2 (Dense)             (None, 30)                3030      
                                                                 
 predictions (Dense)         (None, 10)                310       
                                                                 
Total params: 81,840
Trainable params: 81,840
Non-trainable params: 0
_________________________________________________________________


In [5]:
# loading the saved initialization
base_model.load_weights("init_weights.h5")
init_weights = base_model.get_weights() # init weights for Lotter Ticket reset to initial weights

In [None]:
# Append the data lists to the dictionary for each iteration
data_dict = {"accuracies": [],
             "losses": [],
             "MI_estimate_x_d1": [],
             "MI_estimate_x_d2": [],
             "MI_estimate_x_o": [],
             "MI_estimate_d1_d2": [],
             "MI_estimate_d1_o": [],
             "MI_estimate_d2_o": [],
             "MI_hist_x_d1": [],
             "MI_hist_x_d2": [],
             "MI_hist_x_o": [],
             "MI_hist_d1_d2": [],
             "MI_hist_d1_o": [],
             "MI_hist_d2_o": [],
             "accuracies_init": [],
             "losses_init": [],
             "MI_estimate_x_d1_init": [],
             "MI_estimate_x_d2_init": [],
             "MI_estimate_x_o_init": [],
             "MI_estimate_d1_d2_init": [],
             "MI_estimate_d1_o_init": [],
             "MI_estimate_d2_o_init": [],
             "MI_hist_x_d1_init": [],
             "MI_hist_x_d2_init": [],
             "MI_hist_x_o_init": [],
             "MI_hist_d1_d2_init": [],
             "MI_hist_d1_o_init": [],
             "MI_hist_d2_o_init": [],
             "accuracies_rand": [],
             "losses_rand": [],
             "MI_estimate_x_d1_rand": [],
             "MI_estimate_x_d2_rand": [],
             "MI_estimate_x_o_rand": [],
             "MI_estimate_d1_d2_rand": [],
             "MI_estimate_d1_o_rand": [],
             "MI_estimate_d2_o_rand": [],
             "MI_hist_x_d1_rand": [],
             "MI_hist_x_d2_rand": [],
             "MI_hist_x_o_rand": [],
             "MI_hist_d1_d2_rand": [],
             "MI_hist_d1_o_rand": [],
             "MI_hist_d2_o_rand": [],
             }

# We do 10 runs to get an average

for j in range(averaging_iterations):
    print("------------------------")
    print("------------------------")
    print("------------------------")
    print("Experimental run number: " + str(j+1))
    print("------------------------")
    print("------------------------")
    print("------------------------")
    
    init_model = keras.models.clone_model(base_model)
    pruner = LotteryTicketPruner(init_model) # pruner set-up
    pruner2 = LotteryTicketPruner(init_model)
    
    # collected outputs for evaluation
    accuracies = []
    losses = []
    MI_estimate_x_d1 = []
    MI_estimate_x_d2 = []
    MI_estimate_x_o = []
    MI_estimate_d1_d2 = []
    MI_estimate_d1_o = []
    MI_estimate_d2_o = []
    MI_hist_x_d1 = []
    MI_hist_x_d2 = []
    MI_hist_x_o = []
    MI_hist_d1_d2 = []
    MI_hist_d1_o = []
    MI_hist_d2_o = []
    
    accuracies_init = []
    losses_init = []
    MI_estimate_x_d1_init = []
    MI_estimate_x_d2_init = []
    MI_estimate_x_o_init = []
    MI_estimate_d1_d2_init = []
    MI_estimate_d1_o_init = []
    MI_estimate_d2_o_init = []
    MI_hist_x_d1_init = []
    MI_hist_x_d2_init = []
    MI_hist_x_o_init = []
    MI_hist_d1_d2_init = []
    MI_hist_d1_o_init = []
    MI_hist_d2_o_init = []
    
    accuracies_rand = []
    losses_rand = []
    MI_estimate_x_d1_rand = []
    MI_estimate_x_d2_rand = []
    MI_estimate_x_o_rand = []
    MI_estimate_d1_d2_rand = []
    MI_estimate_d1_o_rand = []
    MI_estimate_d2_o_rand = []
    MI_hist_x_d1_rand = []
    MI_hist_x_d2_rand = []
    MI_hist_x_o_rand = []
    MI_hist_d1_d2_rand = []
    MI_hist_d1_o_rand = []
    MI_hist_d2_o_rand = []

    # compiling model with training params
    model = keras.models.clone_model(base_model)
    model2 = keras.models.clone_model(base_model)
    model.load_weights("init_weights.h5")
    model2.load_weights("init_weights.h5")
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=1.2e-3), # Adam optimizer, lr=0.0012
                  # Loss function to minimize
                  loss=keras.losses.SparseCategoricalCrossentropy(), # multi-class classification loss function
                  # List of metrics to monitor
                  metrics=[keras.metrics.SparseCategoricalAccuracy()],
                 )
    model2.compile(optimizer=keras.optimizers.Adam(learning_rate=1.2e-3), # Adam optimizer, lr=0.0012
                  # Loss function to minimize
                  loss=keras.losses.SparseCategoricalCrossentropy(), # multi-class classification loss function
                  # List of metrics to monitor
                  metrics=[keras.metrics.SparseCategoricalAccuracy()],
                 )

    print("")
    print("")
    pre_train_loss, pre_train_accuracy = model.evaluate(x_test, y_test)
    accuracies.append(pre_train_accuracy)
    accuracies_init.append(pre_train_accuracy)
    accuracies_rand.append(pre_train_accuracy) # we'll save time by appending to rand outputs because same before prune
    print("acc appended")
    losses.append(pre_train_loss)
    losses_init.append(pre_train_loss)
    losses_rand.append(pre_train_loss) # same for losses, before pruning there's no structural difference yet
    print("loss appended")
    print("")
    print("")
    print("attempting mutual information neural estimation")
    fit_loss_history_x_d1, mutual_info_x_d1 = get_mine_x_d1(model)
    fit_loss_history_x_d2, mutual_info_x_d2 = get_mine_x_d2(model)
    fit_loss_history_x_o, mutual_info_x_o = get_mine_x_o(model)
    fit_loss_history_d1_d2, mutual_info_d1_d2 = get_mine_d1_d2(model)
    fit_loss_history_d1_o, mutual_info_d1_o = get_mine_d1_o(model)
    fit_loss_history_d2_o, mutual_info_d2_o = get_mine_d2_o(model)
    print("estimated mutual information x_d1: " + str(mutual_info_x_d1))
    print("estimated mutual information x_d2: " + str(mutual_info_x_d2))
    print("estimated mutual information x_o: " + str(mutual_info_x_o))
    print("estimated mutual information d1_d2: " + str(mutual_info_d1_d2))
    print("estimated mutual information d1_o: " + str(mutual_info_d1_o))
    print("estimated mutual information d2_o: " + str(mutual_info_d2_o))
    MI_estimate_x_d1.append(mutual_info_x_d1)
    MI_estimate_x_d2.append(mutual_info_x_d2)
    MI_estimate_x_o.append(mutual_info_x_o)
    MI_estimate_d1_d2.append(mutual_info_d1_d2)
    MI_estimate_d1_o.append(mutual_info_d1_o)
    MI_estimate_d2_o.append(mutual_info_d2_o)
    MI_estimate_x_d1_init.append(mutual_info_x_d1) # those are already init_weights
    MI_estimate_x_d2_init.append(mutual_info_x_d2)
    MI_estimate_x_o_init.append(mutual_info_x_o)
    MI_estimate_d1_d2_init.append(mutual_info_d1_d2)
    MI_estimate_d1_o_init.append(mutual_info_d1_o)
    MI_estimate_d2_o_init.append(mutual_info_d2_o)
    MI_estimate_x_d1_rand.append(mutual_info_x_d1) # append same as model because no pruning has been done yet
    MI_estimate_x_d2_rand.append(mutual_info_x_d2)
    MI_estimate_x_o_rand.append(mutual_info_x_o)
    MI_estimate_d1_d2_rand.append(mutual_info_d1_d2)
    MI_estimate_d1_o_rand.append(mutual_info_d1_o)
    MI_estimate_d2_o_rand.append(mutual_info_d2_o)
    print("mutual info appended")
    MI_hist_x_d1.append(fit_loss_history_x_d1)
    MI_hist_x_d2.append(fit_loss_history_x_d2)
    MI_hist_x_o.append(fit_loss_history_x_o)
    MI_hist_d1_d2.append(fit_loss_history_d1_d2)
    MI_hist_d1_o.append(fit_loss_history_d1_o)
    MI_hist_d2_o.append(fit_loss_history_d2_o)
    MI_hist_x_d1_init.append(fit_loss_history_x_d1) # same for losses, it's init_weights
    MI_hist_x_d2_init.append(fit_loss_history_x_d2)
    MI_hist_x_o_init.append(fit_loss_history_x_o)
    MI_hist_d1_d2_init.append(fit_loss_history_d1_d2)
    MI_hist_d1_o_init.append(fit_loss_history_d1_o)
    MI_hist_d2_o_init.append(fit_loss_history_d2_o)
    MI_hist_x_d1_rand.append(fit_loss_history_x_d1)
    MI_hist_x_d2_rand.append(fit_loss_history_x_d2)
    MI_hist_x_o_rand.append(fit_loss_history_x_o)
    MI_hist_d1_d2_rand.append(fit_loss_history_d1_d2)
    MI_hist_d1_o_rand.append(fit_loss_history_d1_o)
    MI_hist_d2_o_rand.append(fit_loss_history_d2_o)
    print("fit_loss_history appended")
    print("")
    print("")
    print("fully connected model, pre-train: " + "loss: " + str(pre_train_loss) + " acc: " + str(pre_train_accuracy))
    print("")
    print("")
    print("sanity check")
    print("output layer weight mask, pre-train, first 10 arr: ")
    print(model.layers[3].get_weights()[0][:10])

    # fully-connected trained
    model.load_weights("trained_weights.h5") # LOADING pre-trained weights for reproducibility
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=1.2e-3), # Adam optimizer, lr=0.0012
                  # Loss function to minimize
                  loss=keras.losses.SparseCategoricalCrossentropy(), # multi-class classification loss function
                  # List of metrics to monitor
                  metrics=[keras.metrics.SparseCategoricalAccuracy()],
                 )
    model2.load_weights("trained_weights.h5") # LOADING pre-trained weights for reproducibility
    model2.compile(optimizer=keras.optimizers.Adam(learning_rate=1.2e-3), # Adam optimizer, lr=0.0012
                  # Loss function to minimize
                  loss=keras.losses.SparseCategoricalCrossentropy(), # multi-class classification loss function
                  # List of metrics to monitor
                  metrics=[keras.metrics.SparseCategoricalAccuracy()],
                  )
    print("")
    print("")
    trained_loss, trained_accuracy = model.evaluate(x_test, y_test)
    print("fully connected model, trained: " + "loss: " + str(trained_loss) + " acc: " + str(trained_accuracy))
    print("")
    print("")
    accuracies.append(trained_accuracy) 
    accuracies_init.append(pre_train_accuracy) # if we take trained model and put init_weights, we get the init_model
    accuracies_rand.append(trained_accuracy) # no prune yet, so same
    print("acc appended")
    losses.append(trained_loss)
    losses_init.append(pre_train_loss) # s.a.
    losses_rand.append(trained_loss) # s.a.
    print("loss appended")
    print("")
    print("")
    MI_estimate_x_d1_init.append(mutual_info_x_d1) # appending early, because I reuse variable names
    MI_estimate_x_d2_init.append(mutual_info_x_d2) # the logic is if we have the fully trained model and reset
    MI_estimate_x_o_init.append(mutual_info_x_o)   # the weights to init_weights, we just get the init_model from above again
    MI_estimate_d1_d2_init.append(mutual_info_d1_d2) 
    MI_estimate_d1_o_init.append(mutual_info_d1_o)
    MI_estimate_d2_o_init.append(mutual_info_d2_o)
    MI_hist_x_d1_init.append(fit_loss_history_x_d1)
    MI_hist_x_d2_init.append(fit_loss_history_x_d2)
    MI_hist_x_o_init.append(fit_loss_history_x_o)
    MI_hist_d1_d2_init.append(fit_loss_history_d1_d2)
    MI_hist_d1_o_init.append(fit_loss_history_d1_o)
    MI_hist_d2_o_init.append(fit_loss_history_d2_o)
    print("early append of _init done")
    print("")
    print("")
    print("attempting mutual information neural estimation")
    fit_loss_history_x_d1, mutual_info_x_d1 = get_mine_x_d1(model)
    fit_loss_history_x_d2, mutual_info_x_d2 = get_mine_x_d2(model)
    fit_loss_history_x_o, mutual_info_x_o = get_mine_x_o(model)
    fit_loss_history_d1_d2, mutual_info_d1_d2 = get_mine_d1_d2(model)
    fit_loss_history_d1_o, mutual_info_d1_o = get_mine_d1_o(model)
    fit_loss_history_d2_o, mutual_info_d2_o = get_mine_d2_o(model)
    print("estimated mutual information x_d1: " + str(mutual_info_x_d1))
    print("estimated mutual information x_d2: " + str(mutual_info_x_d2))
    print("estimated mutual information x_o: " + str(mutual_info_x_o))
    print("estimated mutual information d1_d2: " + str(mutual_info_d1_d2))
    print("estimated mutual information d1_o: " + str(mutual_info_d1_o))
    print("estimated mutual information d2_o: " + str(mutual_info_d2_o))
    MI_estimate_x_d1.append(mutual_info_x_d1)
    MI_estimate_x_d2.append(mutual_info_x_d2)
    MI_estimate_x_o.append(mutual_info_x_o)
    MI_estimate_d1_d2.append(mutual_info_d1_d2) # same as above, same model pre-pruning
    MI_estimate_d1_o.append(mutual_info_d1_o)
    MI_estimate_d2_o.append(mutual_info_d2_o)
    MI_estimate_x_d1_rand.append(mutual_info_x_d1)
    MI_estimate_x_d2_rand.append(mutual_info_x_d2)
    MI_estimate_x_o_rand.append(mutual_info_x_o)
    MI_estimate_d1_d2_rand.append(mutual_info_d1_d2) 
    MI_estimate_d1_o_rand.append(mutual_info_d1_o)
    MI_estimate_d2_o_rand.append(mutual_info_d2_o)
    print("mutual info appended")
    MI_hist_x_d1.append(fit_loss_history_x_d1)
    MI_hist_x_d2.append(fit_loss_history_x_d2)
    MI_hist_x_o.append(fit_loss_history_x_o)
    MI_hist_d1_d2.append(fit_loss_history_d1_d2)
    MI_hist_d1_o.append(fit_loss_history_d1_o)
    MI_hist_d2_o.append(fit_loss_history_d2_o)
    MI_hist_x_d1_rand.append(fit_loss_history_x_d1) # model pre-pruning, same weights, same histories
    MI_hist_x_d2_rand.append(fit_loss_history_x_d2)
    MI_hist_x_o_rand.append(fit_loss_history_x_o)
    MI_hist_d1_d2_rand.append(fit_loss_history_d1_d2)
    MI_hist_d1_o_rand.append(fit_loss_history_d1_o)
    MI_hist_d2_o_rand.append(fit_loss_history_d2_o)
    print("fit_loss_history appended")
    print("")
    print("")
    print("sanity check")
    print("output layer weight mask, trained, fully connected model, first 10 arr: ")
    print(model.layers[3].get_weights()[0][:10] != 0)

    for i in range(pruning_iterations):
        pruner.set_pretrained_weights(model) # pruner for large_final pruning schedule
        pruner2.set_pretrained_weights(model2) # pruner for random pruning schedule
        model.set_weights(init_weights)
        model2.set_weights(init_weights)
        pruner.calc_prune_mask(model, pruning_rate,'large_final')
        pruner2.calc_prune_mask(model2, pruning_rate, 'random')
        print("prune_mask calculated")
        print("")
        sparsity = calc_sparsity(i,pruning_rate)
        print(f"Iteration {i+1}: making {sparsity:.2f}% sparse large_final")
        history = model.fit(x_train,
                            y_train,
                            batch_size=batch_size,
                            epochs=epochs,
                            verbose=0,
                            # monitoring validation loss and metrics
                            # at the end of each epoch
                            validation_data=(x_val, y_val),
                            callbacks=[PrunerCallback(pruner)])
                   
        print(f"Iteration {i+1}: making {sparsity:.2f}% sparse random")
        history = model2.fit(x_train,
                             y_train,
                             batch_size=batch_size,
                             epochs=epochs,
                             verbose=0,
                             # monitoring validation loss and metrics
                             # at the end of each epoch
                             validation_data=(x_val, y_val),
                             callbacks=[PrunerCallback(pruner2)])

        print("")
        print("")
        ticket_loss, ticket_accuracy = model.evaluate(x_test, y_test)
        print(f"{sparsity:.2f}% sparse large_final: " + "loss: " + str(ticket_loss) + " acc: " + str(ticket_accuracy))
        model_init = keras.models.clone_model(model) # cloning the ticket so my set_function doesn't interfere with the iteration
        model_init = set_model(init_model, model_init)
        model_init.compile(optimizer=keras.optimizers.Adam(learning_rate=1.2e-3), # Adam optimizer, lr=0.0012
                  # Loss function to minimize
                  loss=keras.losses.SparseCategoricalCrossentropy(), # multi-class classification loss function
                  # List of metrics to monitor
                  metrics=[keras.metrics.SparseCategoricalAccuracy()],
                  )# using my set_function to take the ticket and manually set non-zero weights to init_weights
        ticket_loss_init, ticket_accuracy_init = model_init.evaluate(x_test, y_test)
        print(f"{sparsity:.2f}% sparse large_final, init_weights: " + "loss: " + str(ticket_loss_init) + " acc: " + str(ticket_accuracy_init))
        random_loss, random_accuracy = model2.evaluate(x_test, y_test)
        print(f"{sparsity:.2f}% sparse random: " + "loss: " + str(random_loss) + " acc: " + str(random_accuracy))
        print("")
        print("")
        accuracies.append(ticket_accuracy)
        accuracies_init.append(ticket_accuracy_init)
        accuracies_rand.append(random_accuracy)
        print("acc appended")
        losses.append(ticket_loss)
        losses_init.append(ticket_loss_init)
        losses_rand.append(random_loss)
        print("loss appended")
        print("")
        print("")
        print("attempting mutual information neural estimation")
        fit_loss_history_x_d1, mutual_info_x_d1 = get_mine_x_d1(model)
        fit_loss_history_x_d2, mutual_info_x_d2 = get_mine_x_d2(model)
        fit_loss_history_x_o, mutual_info_x_o = get_mine_x_o(model)
        fit_loss_history_d1_d2, mutual_info_d1_d2 = get_mine_d1_d2(model)
        fit_loss_history_d1_o, mutual_info_d1_o = get_mine_d1_o(model)
        fit_loss_history_d2_o, mutual_info_d2_o = get_mine_d2_o(model)
        fit_loss_history_x_d1_init, mutual_info_x_d1_init = get_mine_x_d1(model_init)
        fit_loss_history_x_d2_init, mutual_info_x_d2_init = get_mine_x_d2(model_init)
        fit_loss_history_x_o_init, mutual_info_x_o_init = get_mine_x_o(model_init)
        fit_loss_history_d1_d2_init, mutual_info_d1_d2_init = get_mine_d1_d2(model_init)
        fit_loss_history_d1_o_init, mutual_info_d1_o_init = get_mine_d1_o(model_init)
        fit_loss_history_d2_o_init, mutual_info_d2_o_init = get_mine_d2_o(model_init)
        fit_loss_history_x_d1_rand, mutual_info_x_d1_rand = get_mine_x_d1(model2)
        fit_loss_history_x_d2_rand, mutual_info_x_d2_rand = get_mine_x_d2(model2)
        fit_loss_history_x_o_rand, mutual_info_x_o_rand = get_mine_x_o(model2)
        fit_loss_history_d1_d2_rand, mutual_info_d1_d2_rand = get_mine_d1_d2(model2)
        fit_loss_history_d1_o_rand, mutual_info_d1_o_rand = get_mine_d1_o(model2)
        fit_loss_history_d2_o_rand, mutual_info_d2_o_rand = get_mine_d2_o(model2)
        print("estimated mutual information x_d1, large_final: " + str(mutual_info_x_d1))
        print("estimated mutual information x_d2, large_final: " + str(mutual_info_x_d2))
        print("estimated mutual information x_o, large_final: " + str(mutual_info_x_o))
        print("estimated mutual information d1_d2, large_final: " + str(mutual_info_d1_d2))
        print("estimated mutual information d1_o, large_final: " + str(mutual_info_d1_o))
        print("estimated mutual information d2_o, large_final: " + str(mutual_info_d2_o))
        print("estimated mutual information x_d1, large_final, init_weights: " + str(mutual_info_x_d1_init))
        print("estimated mutual information x_d2, large_final, init_weights: " + str(mutual_info_x_d2_init))
        print("estimated mutual information x_o, large_final, init_weights: " + str(mutual_info_x_o_init))
        print("estimated mutual information d1_d2, large_final, init_weights: " + str(mutual_info_d1_d2_init))
        print("estimated mutual information d1_o, large_final, init_weights: " + str(mutual_info_d1_o_init))
        print("estimated mutual information d2_o, large_final, init_weights: " + str(mutual_info_d2_o_init))
        print("estimated mutual information x_d1, random: " + str(mutual_info_x_d1_rand))
        print("estimated mutual information x_d2, random: " + str(mutual_info_x_d2_rand))
        print("estimated mutual information x_o, random: " + str(mutual_info_x_o_rand))
        print("estimated mutual information d1_d2, random: " + str(mutual_info_d1_d2_rand))
        print("estimated mutual information d1_o, random: " + str(mutual_info_d1_o_rand))
        print("estimated mutual information d2_o, random: " + str(mutual_info_d2_o_rand))
        MI_estimate_x_d1.append(mutual_info_x_d1)
        MI_estimate_x_d2.append(mutual_info_x_d2)
        MI_estimate_x_o.append(mutual_info_x_o)
        MI_estimate_d1_d2.append(mutual_info_d1_d2)
        MI_estimate_d1_o.append(mutual_info_d1_o)
        MI_estimate_d2_o.append(mutual_info_d2_o)
        MI_estimate_x_d1_init.append(mutual_info_x_d1_init)
        MI_estimate_x_d2_init.append(mutual_info_x_d2_init)
        MI_estimate_x_o_init.append(mutual_info_x_o_init)
        MI_estimate_d1_d2_init.append(mutual_info_d1_d2_init)
        MI_estimate_d1_o_init.append(mutual_info_d1_o_init)
        MI_estimate_d2_o_init.append(mutual_info_d2_o_init)
        MI_estimate_x_d1_rand.append(mutual_info_x_d1_rand)
        MI_estimate_x_d2_rand.append(mutual_info_x_d2_rand)
        MI_estimate_x_o_rand.append(mutual_info_x_o_rand)
        MI_estimate_d1_d2_rand.append(mutual_info_d1_d2_rand)
        MI_estimate_d1_o_rand.append(mutual_info_d1_o_rand)
        MI_estimate_d2_o_rand.append(mutual_info_d2_o_rand)
        print("mutual info appended")
        MI_hist_x_d1.append(fit_loss_history_x_d1)
        MI_hist_x_d2.append(fit_loss_history_x_d2)
        MI_hist_x_o.append(fit_loss_history_x_o)
        MI_hist_d1_d2.append(fit_loss_history_d1_d2)
        MI_hist_d1_o.append(fit_loss_history_d1_o)
        MI_hist_d2_o.append(fit_loss_history_d2_o)
        MI_hist_x_d1_init.append(fit_loss_history_x_d1_init)
        MI_hist_x_d2_init.append(fit_loss_history_x_d2_init)
        MI_hist_x_o_init.append(fit_loss_history_x_o_init)
        MI_hist_d1_d2_init.append(fit_loss_history_d1_d2_init)
        MI_hist_d1_o_init.append(fit_loss_history_d1_o_init)
        MI_hist_d2_o_init.append(fit_loss_history_d2_o_init)
        MI_hist_x_d1_rand.append(fit_loss_history_x_d1_rand)
        MI_hist_x_d2_rand.append(fit_loss_history_x_d2_rand)
        MI_hist_x_o_rand.append(fit_loss_history_x_o_rand)
        MI_hist_d1_d2_rand.append(fit_loss_history_d1_d2_rand)
        MI_hist_d1_o_rand.append(fit_loss_history_d1_o_rand)
        MI_hist_d2_o_rand.append(fit_loss_history_d2_o_rand)
        print("fit_loss_history appended")
        print("")
        print("")
        print("sanity check")
        print(f"output layer weight mask, {sparsity:.2f}% sparse large_final, first 10 arr: ")
        print(model.layers[3].get_weights()[0][:10])
        print(f"output layer weight mask, {sparsity:.2f}% sparse large_final, init_weights, first 10 arr: ")
        print(model_init.layers[3].get_weights()[0][:10])
        print(f"output layer weight mask, {sparsity:.2f}% sparse random, first 10 arr: ")
        print(model2.layers[3].get_weights()[0][:10])
        
    # saving data for averaging     
    data_dict["accuracies"].append(accuracies)
    data_dict["losses"].append(losses)
    data_dict["MI_estimate_x_d1"].append(MI_estimate_x_d1)
    data_dict["MI_estimate_x_d2"].append(MI_estimate_x_d2)
    data_dict["MI_estimate_x_o"].append(MI_estimate_x_o)
    data_dict["MI_estimate_d1_d2"].append(MI_estimate_d1_d2)
    data_dict["MI_estimate_d1_o"].append(MI_estimate_d1_o)
    data_dict["MI_estimate_d2_o"].append(MI_estimate_d2_o)
    data_dict["MI_hist_x_d1"].append(MI_hist_x_d1)
    data_dict["MI_hist_x_d2"].append(MI_hist_x_d2)
    data_dict["MI_hist_x_o"].append(MI_hist_x_o)
    data_dict["MI_hist_d1_d2"].append(MI_hist_d1_d2)
    data_dict["MI_hist_d1_o"].append(MI_hist_d1_o)
    data_dict["MI_hist_d2_o"].append(MI_hist_d2_o)
    data_dict["accuracies_init"].append(accuracies_init)
    data_dict["losses_init"].append(losses_init)
    data_dict["MI_estimate_x_d1_init"].append(MI_estimate_x_d1_init)
    data_dict["MI_estimate_x_d2_init"].append(MI_estimate_x_d2_init)
    data_dict["MI_estimate_x_o_init"].append(MI_estimate_x_o_init)
    data_dict["MI_estimate_d1_d2_init"].append(MI_estimate_d1_d2_init)
    data_dict["MI_estimate_d1_o_init"].append(MI_estimate_d1_o_init)
    data_dict["MI_estimate_d2_o_init"].append(MI_estimate_d2_o_init)
    data_dict["MI_hist_x_d1_init"].append(MI_hist_x_d1_init)
    data_dict["MI_hist_x_d2_init"].append(MI_hist_x_d2_init)
    data_dict["MI_hist_x_o_init"].append(MI_hist_x_o_init)
    data_dict["MI_hist_d1_d2_init"].append(MI_hist_d1_d2_init)
    data_dict["MI_hist_d1_o_init"].append(MI_hist_d1_o_init)
    data_dict["MI_hist_d2_o_init"].append(MI_hist_d2_o_init)
    data_dict["accuracies_rand"].append(accuracies_rand)
    data_dict["losses_rand"].append(losses_rand)
    data_dict["MI_estimate_x_d1_rand"].append(MI_estimate_x_d1_rand)
    data_dict["MI_estimate_x_d2_rand"].append(MI_estimate_x_d2_rand)
    data_dict["MI_estimate_x_o_rand"].append(MI_estimate_x_o_rand)
    data_dict["MI_estimate_d1_d2_rand"].append(MI_estimate_d1_d2_rand)
    data_dict["MI_estimate_d1_o_rand"].append(MI_estimate_d1_o_rand)
    data_dict["MI_estimate_d2_o_rand"].append(MI_estimate_d2_o_rand)
    data_dict["MI_hist_x_d1_rand"].append(MI_hist_x_d1_rand)
    data_dict["MI_hist_x_d2_rand"].append(MI_hist_x_d2_rand)
    data_dict["MI_hist_x_o_rand"].append(MI_hist_x_o_rand)
    data_dict["MI_hist_d1_d2_rand"].append(MI_hist_d1_d2_rand)
    data_dict["MI_hist_d1_o_rand"].append(MI_hist_d1_o_rand)
    data_dict["MI_hist_d2_o_rand"].append(MI_hist_d2_o_rand)
    print(f"Experiment {j+1} saved in Dictionary")

------------------------
------------------------
------------------------
Experimental run number: 1
------------------------
------------------------
------------------------


acc appended
loss appended


attempting mutual information neural estimation
Epoch 1/10
430/430 - 2s - loss: -2.5569e+00 - 2s/epoch - 5ms/step
Epoch 2/10
430/430 - 2s - loss: -4.0039e+00 - 2s/epoch - 5ms/step
Epoch 3/10
430/430 - 2s - loss: -4.3256e+00 - 2s/epoch - 5ms/step
Epoch 4/10
430/430 - 3s - loss: -4.5562e+00 - 3s/epoch - 6ms/step
Epoch 5/10
430/430 - 3s - loss: -4.7707e+00 - 3s/epoch - 7ms/step
Epoch 6/10
430/430 - 2s - loss: -5.1962e+00 - 2s/epoch - 5ms/step
Epoch 7/10
430/430 - 3s - loss: -5.8073e+00 - 3s/epoch - 6ms/step
Epoch 8/10
430/430 - 3s - loss: -5.5608e+00 - 3s/epoch - 6ms/step
Epoch 9/10
430/430 - 2s - loss: -7.1638e+00 - 2s/epoch - 5ms/step
Epoch 10/10
430/430 - 3s - loss: -6.7619e+00 - 3s/epoch - 6ms/step
Epoch 1/10
430/430 - 3s - loss: -1.1464e+00 - 3s/epoch - 6ms/step
Epoch 2/10
430/43

Epoch 1/10
430/430 - 2s - loss: -1.3442e-01 - 2s/epoch - 4ms/step
Epoch 2/10
430/430 - 2s - loss: -4.5679e-01 - 2s/epoch - 4ms/step
Epoch 3/10
430/430 - 2s - loss: -7.5774e-01 - 2s/epoch - 4ms/step
Epoch 4/10
430/430 - 1s - loss: -9.2585e-01 - 1s/epoch - 3ms/step
Epoch 5/10
430/430 - 1s - loss: -1.0911e+00 - 1s/epoch - 3ms/step
Epoch 6/10
430/430 - 1s - loss: -1.2155e+00 - 1s/epoch - 3ms/step
Epoch 7/10
430/430 - 1s - loss: -1.2802e+00 - 941ms/epoch - 2ms/step
Epoch 8/10
430/430 - 1s - loss: -1.3665e+00 - 1s/epoch - 3ms/step
Epoch 9/10
430/430 - 1s - loss: -1.4419e+00 - 1s/epoch - 3ms/step
Epoch 10/10
430/430 - 1s - loss: -1.5016e+00 - 1s/epoch - 2ms/step
Epoch 1/10
430/430 - 1s - loss: -1.2889e+00 - 904ms/epoch - 2ms/step
Epoch 2/10
430/430 - 0s - loss: -2.1092e+00 - 445ms/epoch - 1ms/step
Epoch 3/10
430/430 - 0s - loss: -2.4058e+00 - 445ms/epoch - 1ms/step
Epoch 4/10
430/430 - 0s - loss: -2.5336e+00 - 445ms/epoch - 1ms/step
Epoch 5/10
430/430 - 0s - loss: -2.7274e+00 - 445ms/epoch - 

Epoch 7/10
430/430 - 0s - loss: -1.2125e+00 - 471ms/epoch - 1ms/step
Epoch 8/10
430/430 - 0s - loss: -1.2605e+00 - 479ms/epoch - 1ms/step
Epoch 9/10
430/430 - 0s - loss: -1.3104e+00 - 474ms/epoch - 1ms/step
Epoch 10/10
430/430 - 0s - loss: -1.3712e+00 - 476ms/epoch - 1ms/step
Epoch 1/10
430/430 - 1s - loss: -1.5069e-01 - 723ms/epoch - 2ms/step
Epoch 2/10
430/430 - 0s - loss: -3.7218e-01 - 447ms/epoch - 1ms/step
Epoch 3/10
430/430 - 0s - loss: -4.6838e-01 - 435ms/epoch - 1ms/step
Epoch 4/10
430/430 - 0s - loss: -4.9935e-01 - 437ms/epoch - 1ms/step
Epoch 5/10
430/430 - 0s - loss: -5.2962e-01 - 446ms/epoch - 1ms/step
Epoch 6/10
430/430 - 0s - loss: -5.4076e-01 - 425ms/epoch - 989us/step
Epoch 7/10
430/430 - 0s - loss: -5.5160e-01 - 456ms/epoch - 1ms/step
Epoch 8/10
430/430 - 0s - loss: -5.8926e-01 - 427ms/epoch - 993us/step
Epoch 9/10
430/430 - 0s - loss: -6.2077e-01 - 458ms/epoch - 1ms/step
Epoch 10/10
430/430 - 0s - loss: -6.2555e-01 - 455ms/epoch - 1ms/step
Epoch 1/10
430/430 - 4s - lo

Epoch 9/10
430/430 - 0s - loss: -2.7939e+00 - 444ms/epoch - 1ms/step
Epoch 10/10
430/430 - 0s - loss: -2.8314e+00 - 428ms/epoch - 996us/step
Epoch 1/10
430/430 - 1s - loss: -1.9662e-01 - 748ms/epoch - 2ms/step
Epoch 2/10
430/430 - 0s - loss: -4.9463e-01 - 433ms/epoch - 1ms/step
Epoch 3/10
430/430 - 0s - loss: -6.4742e-01 - 426ms/epoch - 991us/step
Epoch 4/10
430/430 - 0s - loss: -7.6914e-01 - 430ms/epoch - 1ms/step
Epoch 5/10
430/430 - 0s - loss: -8.7525e-01 - 443ms/epoch - 1ms/step
Epoch 6/10
430/430 - 0s - loss: -1.0197e+00 - 453ms/epoch - 1ms/step
Epoch 7/10
430/430 - 0s - loss: -1.1137e+00 - 405ms/epoch - 942us/step
Epoch 8/10
430/430 - 0s - loss: -1.1839e+00 - 418ms/epoch - 972us/step
Epoch 9/10
430/430 - 0s - loss: -1.2458e+00 - 408ms/epoch - 949us/step
Epoch 10/10
430/430 - 0s - loss: -1.3078e+00 - 429ms/epoch - 998us/step
Epoch 1/10
430/430 - 1s - loss: -1.2601e-01 - 713ms/epoch - 2ms/step
Epoch 2/10
430/430 - 0s - loss: -2.6168e-01 - 408ms/epoch - 949us/step
Epoch 3/10
430/430

Epoch 1/10
430/430 - 2s - loss: -1.1634e+00 - 2s/epoch - 4ms/step
Epoch 2/10
430/430 - 2s - loss: -2.3329e+00 - 2s/epoch - 5ms/step
Epoch 3/10
430/430 - 2s - loss: -2.6844e+00 - 2s/epoch - 5ms/step
Epoch 4/10
430/430 - 1s - loss: -2.9502e+00 - 1s/epoch - 3ms/step
Epoch 5/10
430/430 - 2s - loss: -3.0650e+00 - 2s/epoch - 5ms/step
Epoch 6/10
430/430 - 2s - loss: -3.1530e+00 - 2s/epoch - 4ms/step
Epoch 7/10
430/430 - 2s - loss: -3.3203e+00 - 2s/epoch - 6ms/step
Epoch 8/10
430/430 - 2s - loss: -3.3409e+00 - 2s/epoch - 4ms/step
Epoch 9/10
430/430 - 2s - loss: -3.3525e+00 - 2s/epoch - 5ms/step
Epoch 10/10
430/430 - 3s - loss: -3.4548e+00 - 3s/epoch - 6ms/step
Epoch 1/10
430/430 - 2s - loss: -1.1605e-01 - 2s/epoch - 5ms/step
Epoch 2/10
430/430 - 2s - loss: -5.6268e-01 - 2s/epoch - 4ms/step
Epoch 3/10
430/430 - 2s - loss: -9.9905e-01 - 2s/epoch - 4ms/step
Epoch 4/10
430/430 - 2s - loss: -1.2434e+00 - 2s/epoch - 4ms/step
Epoch 5/10
430/430 - 2s - loss: -1.4511e+00 - 2s/epoch - 5ms/step
Epoch 6/1

Epoch 1/10
430/430 - 2s - loss: -2.6848e+00 - 2s/epoch - 4ms/step
Epoch 2/10
430/430 - 1s - loss: -4.1417e+00 - 1s/epoch - 3ms/step
Epoch 3/10
430/430 - 1s - loss: -4.4639e+00 - 989ms/epoch - 2ms/step
Epoch 4/10
430/430 - 1s - loss: -4.6629e+00 - 1s/epoch - 2ms/step
Epoch 5/10
430/430 - 1s - loss: -5.3599e+00 - 997ms/epoch - 2ms/step
Epoch 6/10
430/430 - 1s - loss: -5.3640e+00 - 977ms/epoch - 2ms/step
Epoch 7/10
430/430 - 1s - loss: -5.9802e+00 - 844ms/epoch - 2ms/step
Epoch 8/10
430/430 - 1s - loss: -5.7971e+00 - 903ms/epoch - 2ms/step
Epoch 9/10
430/430 - 1s - loss: -6.6379e+00 - 1s/epoch - 3ms/step
Epoch 10/10
430/430 - 1s - loss: -6.9421e+00 - 1s/epoch - 2ms/step
Epoch 1/10
430/430 - 3s - loss: -1.2658e+00 - 3s/epoch - 7ms/step
Epoch 2/10
430/430 - 2s - loss: -2.4082e+00 - 2s/epoch - 5ms/step
Epoch 3/10
430/430 - 2s - loss: -2.8446e+00 - 2s/epoch - 5ms/step
Epoch 4/10
430/430 - 1s - loss: -2.9744e+00 - 934ms/epoch - 2ms/step
Epoch 5/10
430/430 - 2s - loss: -3.1847e+00 - 2s/epoch - 

Iteration 3: making 87.50% sparse random


87.50% sparse large_final: loss: 0.34478759765625 acc: 0.8823999762535095
87.50% sparse large_final, init_weights: loss: 2.3685452938079834 acc: 0.09920000284910202
87.50% sparse random: loss: 0.442656934261322 acc: 0.8615000247955322


acc appended
loss appended


attempting mutual information neural estimation
Epoch 1/10
430/430 - 2s - loss: -2.4711e+00 - 2s/epoch - 4ms/step
Epoch 2/10
430/430 - 2s - loss: -3.8029e+00 - 2s/epoch - 5ms/step
Epoch 3/10
430/430 - 1s - loss: -4.3209e+00 - 1s/epoch - 3ms/step
Epoch 4/10
430/430 - 1s - loss: -4.2790e+00 - 1s/epoch - 3ms/step
Epoch 5/10
430/430 - 1s - loss: -4.5333e+00 - 1s/epoch - 3ms/step
Epoch 6/10
430/430 - 1s - loss: -5.2181e+00 - 1s/epoch - 3ms/step
Epoch 7/10
430/430 - 1s - loss: -5.0528e+00 - 1s/epoch - 2ms/step
Epoch 8/10
430/430 - 1s - loss: -5.4509e+00 - 1s/epoch - 3ms/step
Epoch 9/10
430/430 - 1s - loss: -5.9327e+00 - 1s/epoch - 3ms/step
Epoch 10/10
430/430 - 1s - loss: -6.6045e+00 - 1s

Epoch 3/10
430/430 - 0s - loss: -6.6825e-01 - 408ms/epoch - 949us/step
Epoch 4/10
430/430 - 0s - loss: -7.8423e-01 - 408ms/epoch - 949us/step
Epoch 5/10
430/430 - 0s - loss: -8.5322e-01 - 405ms/epoch - 942us/step
Epoch 6/10
430/430 - 0s - loss: -9.2367e-01 - 406ms/epoch - 944us/step
Epoch 7/10
430/430 - 0s - loss: -9.7574e-01 - 408ms/epoch - 949us/step
Epoch 8/10
430/430 - 0s - loss: -1.0374e+00 - 408ms/epoch - 949us/step
Epoch 9/10
430/430 - 0s - loss: -1.0793e+00 - 408ms/epoch - 949us/step
Epoch 10/10
430/430 - 0s - loss: -1.1085e+00 - 405ms/epoch - 942us/step
Epoch 1/10
430/430 - 1s - loss: -1.5831e-01 - 688ms/epoch - 2ms/step
Epoch 2/10
430/430 - 0s - loss: -3.6916e-01 - 393ms/epoch - 914us/step
Epoch 3/10
430/430 - 0s - loss: -4.2043e-01 - 393ms/epoch - 914us/step
Epoch 4/10
430/430 - 0s - loss: -4.3169e-01 - 395ms/epoch - 919us/step
Epoch 5/10
430/430 - 0s - loss: -4.5369e-01 - 393ms/epoch - 914us/step
Epoch 6/10
430/430 - 0s - loss: -4.6794e-01 - 396ms/epoch - 921us/step
Epoch 7

Iteration 4: making 93.75% sparse random


93.75% sparse large_final: loss: 0.9894002079963684 acc: 0.6309999823570251
93.75% sparse large_final, init_weights: loss: 2.3685452938079834 acc: 0.09920000284910202
93.75% sparse random: loss: 1.4676746129989624 acc: 0.4830999970436096


acc appended
loss appended


attempting mutual information neural estimation
Epoch 1/10
430/430 - 2s - loss: -2.5755e+00 - 2s/epoch - 4ms/step
Epoch 2/10
430/430 - 1s - loss: -4.0574e+00 - 1s/epoch - 3ms/step
Epoch 3/10
430/430 - 2s - loss: -4.3954e+00 - 2s/epoch - 5ms/step
Epoch 4/10
430/430 - 2s - loss: -4.8320e+00 - 2s/epoch - 4ms/step
Epoch 5/10
430/430 - 2s - loss: -4.9885e+00 - 2s/epoch - 4ms/step
Epoch 6/10
430/430 - 2s - loss: -4.9404e+00 - 2s/epoch - 5ms/step
Epoch 7/10
430/430 - 2s - loss: -5.5002e+00 - 2s/epoch - 4ms/step
Epoch 8/10
430/430 - 2s - loss: -5.8170e+00 - 2s/epoch - 4ms/step
Epoch 9/10
430/430 - 2s - loss: -5.9567e+00 - 2s/epoch - 4ms/step
Epoch 10/10
430/430 - 1s - loss: -7.2510e+00 -

430/430 - 0s - loss: -5.7944e-01 - 462ms/epoch - 1ms/step
Epoch 4/10
430/430 - 0s - loss: -7.3944e-01 - 464ms/epoch - 1ms/step
Epoch 5/10
430/430 - 0s - loss: -8.4715e-01 - 462ms/epoch - 1ms/step
Epoch 6/10
430/430 - 0s - loss: -9.3349e-01 - 457ms/epoch - 1ms/step
Epoch 7/10
430/430 - 0s - loss: -9.8177e-01 - 474ms/epoch - 1ms/step
Epoch 8/10
430/430 - 0s - loss: -9.9972e-01 - 448ms/epoch - 1ms/step
Epoch 9/10
430/430 - 0s - loss: -1.0645e+00 - 453ms/epoch - 1ms/step
Epoch 10/10
430/430 - 0s - loss: -1.0865e+00 - 432ms/epoch - 1ms/step
Epoch 1/10
430/430 - 1s - loss: -1.2850e-01 - 728ms/epoch - 2ms/step
Epoch 2/10
430/430 - 0s - loss: -2.5422e-01 - 419ms/epoch - 975us/step
Epoch 3/10
430/430 - 0s - loss: -3.0175e-01 - 413ms/epoch - 961us/step
Epoch 4/10
430/430 - 0s - loss: -3.5092e-01 - 415ms/epoch - 965us/step
Epoch 5/10
430/430 - 0s - loss: -3.7266e-01 - 414ms/epoch - 963us/step
Epoch 6/10
430/430 - 0s - loss: -3.9576e-01 - 412ms/epoch - 959us/step
Epoch 7/10
430/430 - 0s - loss: -4

Iteration 5: making 96.88% sparse random


96.88% sparse large_final: loss: 1.9035412073135376 acc: 0.2574999928474426
96.88% sparse large_final, init_weights: loss: 2.3685452938079834 acc: 0.09920000284910202
96.88% sparse random: loss: 2.1925604343414307 acc: 0.2167000025510788


acc appended
loss appended


attempting mutual information neural estimation
Epoch 1/10
430/430 - 2s - loss: -2.5076e+00 - 2s/epoch - 5ms/step
Epoch 2/10
430/430 - 1s - loss: -4.0523e+00 - 1s/epoch - 3ms/step
Epoch 3/10
430/430 - 2s - loss: -4.3576e+00 - 2s/epoch - 4ms/step
Epoch 4/10
430/430 - 2s - loss: -4.8773e+00 - 2s/epoch - 4ms/step
Epoch 5/10
430/430 - 2s - loss: -4.9944e+00 - 2s/epoch - 4ms/step
Epoch 6/10
430/430 - 2s - loss: -5.3780e+00 - 2s/epoch - 4ms/step
Epoch 7/10
430/430 - 1s - loss: -5.8352e+00 - 1s/epoch - 3ms/step
Epoch 8/10
430/430 - 2s - loss: -6.3212e+00 - 2s/epoch - 4ms/step
Epoch 9/10
430/430 - 2s - loss: -7.3528e+00 - 2s/epoch - 4ms/step
Epoch 10/10
430/430 - 1s - loss: -6.7531e+00 -

Epoch 4/10
430/430 - 0s - loss: -7.6821e-01 - 468ms/epoch - 1ms/step
Epoch 5/10
430/430 - 0s - loss: -8.3410e-01 - 492ms/epoch - 1ms/step
Epoch 6/10
430/430 - 0s - loss: -9.2386e-01 - 479ms/epoch - 1ms/step
Epoch 7/10
430/430 - 0s - loss: -9.8729e-01 - 462ms/epoch - 1ms/step
Epoch 8/10
430/430 - 0s - loss: -1.0588e+00 - 449ms/epoch - 1ms/step
Epoch 9/10
430/430 - 0s - loss: -1.1306e+00 - 463ms/epoch - 1ms/step
Epoch 10/10
430/430 - 0s - loss: -1.2040e+00 - 482ms/epoch - 1ms/step
Epoch 1/10
430/430 - 1s - loss: -1.1797e-01 - 727ms/epoch - 2ms/step
Epoch 2/10
430/430 - 0s - loss: -3.0883e-01 - 416ms/epoch - 968us/step
Epoch 3/10
430/430 - 0s - loss: -4.0192e-01 - 428ms/epoch - 996us/step
Epoch 4/10
430/430 - 0s - loss: -4.3636e-01 - 461ms/epoch - 1ms/step
Epoch 5/10
430/430 - 0s - loss: -4.6555e-01 - 439ms/epoch - 1ms/step
Epoch 6/10
430/430 - 0s - loss: -4.9016e-01 - 427ms/epoch - 993us/step
Epoch 7/10
430/430 - 0s - loss: -5.2078e-01 - 414ms/epoch - 963us/step
Epoch 8/10
430/430 - 0s -

acc appended
loss appended


attempting mutual information neural estimation
Epoch 1/10
430/430 - 3s - loss: -2.6617e+00 - 3s/epoch - 6ms/step
Epoch 2/10
430/430 - 2s - loss: -3.9322e+00 - 2s/epoch - 4ms/step
Epoch 3/10
430/430 - 3s - loss: -4.2229e+00 - 3s/epoch - 6ms/step
Epoch 4/10
430/430 - 2s - loss: -4.7931e+00 - 2s/epoch - 4ms/step
Epoch 5/10
430/430 - 2s - loss: -4.9719e+00 - 2s/epoch - 5ms/step
Epoch 6/10
430/430 - 2s - loss: -5.3939e+00 - 2s/epoch - 5ms/step
Epoch 7/10
430/430 - 2s - loss: -5.7843e+00 - 2s/epoch - 5ms/step
Epoch 8/10
430/430 - 2s - loss: -6.2692e+00 - 2s/epoch - 6ms/step
Epoch 9/10
430/430 - 2s - loss: -5.8176e+00 - 2s/epoch - 5ms/step
Epoch 10/10
430/430 - 2s - loss: -6.7403e+00 - 2s/epoch - 6ms/step
Epoch 1/10
430/430 - 1s - loss: -1.2779e+00 - 1s/epoch - 3ms/step
Epoch 2/10
430/430 - 1s - loss: -2.4803e+00 - 1s/epoch - 3ms/step
Epoch 3/10
430/430 - 1s - loss: -2.8801e+00 - 1s/epoch - 3ms/step
Epoch 4/10
430/430 - 1s - loss: -3.0731e+00 - 963ms/epoch - 2ms/

Epoch 4/10
430/430 - 2s - loss: -7.0243e-01 - 2s/epoch - 5ms/step
Epoch 5/10
430/430 - 2s - loss: -8.6765e-01 - 2s/epoch - 4ms/step
Epoch 6/10
430/430 - 2s - loss: -9.7313e-01 - 2s/epoch - 4ms/step
Epoch 7/10
430/430 - 2s - loss: -1.0699e+00 - 2s/epoch - 4ms/step
Epoch 8/10
430/430 - 2s - loss: -1.1576e+00 - 2s/epoch - 4ms/step
Epoch 9/10
430/430 - 2s - loss: -1.2776e+00 - 2s/epoch - 5ms/step
Epoch 10/10
430/430 - 1s - loss: -1.3286e+00 - 1s/epoch - 3ms/step
Epoch 1/10
430/430 - 1s - loss: -1.1902e+00 - 795ms/epoch - 2ms/step
Epoch 2/10
430/430 - 0s - loss: -2.0206e+00 - 487ms/epoch - 1ms/step
Epoch 3/10
430/430 - 0s - loss: -2.2985e+00 - 465ms/epoch - 1ms/step
Epoch 4/10
430/430 - 0s - loss: -2.4320e+00 - 484ms/epoch - 1ms/step
Epoch 5/10
430/430 - 1s - loss: -2.6228e+00 - 503ms/epoch - 1ms/step
Epoch 6/10
430/430 - 0s - loss: -2.7466e+00 - 490ms/epoch - 1ms/step
Epoch 7/10
430/430 - 0s - loss: -2.7380e+00 - 499ms/epoch - 1ms/step
Epoch 8/10
430/430 - 0s - loss: -2.8750e+00 - 483ms/ep

Epoch 1/10
430/430 - 1s - loss: -1.3107e-01 - 772ms/epoch - 2ms/step
Epoch 2/10
430/430 - 0s - loss: -2.9004e-01 - 454ms/epoch - 1ms/step
Epoch 3/10
430/430 - 0s - loss: -3.9457e-01 - 450ms/epoch - 1ms/step
Epoch 4/10
430/430 - 0s - loss: -4.5096e-01 - 447ms/epoch - 1ms/step
Epoch 5/10
430/430 - 0s - loss: -4.9497e-01 - 447ms/epoch - 1ms/step
Epoch 6/10
430/430 - 0s - loss: -5.2452e-01 - 447ms/epoch - 1ms/step
Epoch 7/10
430/430 - 0s - loss: -5.5290e-01 - 448ms/epoch - 1ms/step
Epoch 8/10
430/430 - 0s - loss: -5.9178e-01 - 447ms/epoch - 1ms/step
Epoch 9/10
430/430 - 0s - loss: -6.1906e-01 - 446ms/epoch - 1ms/step
Epoch 10/10
430/430 - 0s - loss: -6.5394e-01 - 447ms/epoch - 1ms/step
Epoch 1/10
430/430 - 3s - loss: -2.5348e+00 - 3s/epoch - 6ms/step
Epoch 2/10
430/430 - 2s - loss: -4.1076e+00 - 2s/epoch - 5ms/step
Epoch 3/10
430/430 - 2s - loss: -4.2617e+00 - 2s/epoch - 4ms/step
Epoch 4/10
430/430 - 2s - loss: -4.7372e+00 - 2s/epoch - 4ms/step
Epoch 5/10
430/430 - 2s - loss: -5.1375e+00 -

In [None]:
custom_ticks = [i for i in range(pruning_iterations+2)] # custom ticks for averaged graphs
custom_labels = ['NT', 'FT'] + [f"{round(calc_sparsity(i, pruning_rate), 2)}%" for i in range(pruning_iterations)] # see above

In [None]:
print(custom_ticks)
print(custom_labels)

In [None]:
plt.figure(1)
plt.plot(np.average(data_dict["accuracies"], axis=0), 'o-', label="large_final")
plt.plot(np.average(data_dict["accuracies_init"], axis=0), 'o-', label="large_final, init")
plt.plot(np.average(data_dict["accuracies_rand"], axis=0), 'o-', label="random")

plt.xticks(custom_ticks,custom_labels)
plt.ylim(0,1)
plt.xlabel('sparisty')
plt.ylabel('accuracy')
plt.title(f'Accuracy as models get more sparse, average of {averaging_iterations} runs')
plt.legend()

# Display the plot
plt.show()

In [None]:
plt.figure(2)
plt.plot(np.average(data_dict["losses"], axis=0), 'o-', label="large_final")
plt.plot(np.average(data_dict["losses_init"], axis=0), 'o-', label="large_final, init")
plt.plot(np.average(data_dict["losses_rand"], axis=0), 'o-', label="random")
plt.legend()
plt.xticks(custom_ticks,custom_labels)
plt.xlabel('sparisty')
plt.ylabel('loss')
plt.title(f'Loss as models get more sparse, average of {averaging_iterations} runs')

# Display the plot
plt.show()

In [None]:
plt.figure()
plt.plot(np.average(data_dict["MI_estimate_x_d1"], axis=0), 'o-', label="x_d1_LF")
plt.plot(np.average(data_dict["MI_estimate_x_d2"], axis=0), 'o-', label="x_d2_LF")
plt.plot(np.average(data_dict["MI_estimate_x_o"], axis=0), 'o-', label="x_o_LF")
plt.plot(np.average(data_dict["MI_estimate_d1_d2"], axis=0), 'o-', label="d1_d2_LF")
plt.plot(np.average(data_dict["MI_estimate_d1_o"], axis=0), 'o-', label="d1_o_LF")
plt.plot(np.average(data_dict["MI_estimate_d2_o"], axis=0), 'o-', label="d2_o_LF")
plt.plot(np.average(data_dict["MI_estimate_x_d1_init"], axis=0), 'o-', label="x_d1_LF_init")
plt.plot(np.average(data_dict["MI_estimate_x_d2_init"], axis=0), 'o-', label="x_d2_LF_init")
plt.plot(np.average(data_dict["MI_estimate_x_o_init"], axis=0), 'o-', label="x_o_LF_init")
plt.plot(np.average(data_dict["MI_estimate_d1_d2_init"], axis=0), 'o-', label="d1_d2_LF_init")
plt.plot(np.average(data_dict["MI_estimate_d1_o_init"], axis=0), 'o-', label="d1_o_LF_init")
plt.plot(np.average(data_dict["MI_estimate_d2_o_init"], axis=0), 'o-', label="d2_o_LF_init")
plt.plot(np.average(data_dict["MI_estimate_x_d1_rand"], axis=0), 'o-', label="x_d1_random")
plt.plot(np.average(data_dict["MI_estimate_x_d2_rand"], axis=0), 'o-', label="x_d2_random")
plt.plot(np.average(data_dict["MI_estimate_x_o_rand"], axis=0), 'o-', label="x_o_random")
plt.plot(np.average(data_dict["MI_estimate_d1_d2_rand"], axis=0), 'o-', label="d1_d2_random")
plt.plot(np.average(data_dict["MI_estimate_d1_o_rand"], axis=0), 'o-', label="d1_o_random")
plt.plot(np.average(data_dict["MI_estimate_d2_o_rand"], axis=0), 'o-', label="d2_o_random")
plt.legend()
plt.xticks(custom_ticks,custom_labels)
plt.xlabel('sparisty')
plt.ylabel('MI estimate')
plt.title(f'MI estimates of layers in models of varying sparsity, average of {averaging_iterations} runs')

# Display the plot
plt.show()

In [None]:
plt.figure()
plt.plot(np.average(data_dict["MI_estimate_x_d1"], axis=0), "o-", color="b", linewidth=4, label="x_d1_LF")
for graph in data_dict["MI_estimate_x_d1"]:
    plt.plot(graph, "o", color="b", markersize=5, alpha=0.5)
plt.plot(np.average(data_dict["MI_estimate_x_d1_init"], axis=0), "o-", color="g", linewidth=4, label="x_d1_LF_init")
for graph in data_dict["MI_estimate_x_d1_init"]:
    plt.plot(graph, "o", color="g", markersize=5, alpha=0.5)
plt.plot(np.average(data_dict["MI_estimate_x_d1_rand"], axis=0), "o-", color="r", linewidth=4, label="x_d1_random")
for graph in data_dict["MI_estimate_x_d1_rand"]:
    plt.plot(graph, "o", color="r", markersize=5, alpha=0.5)
plt.legend()
plt.xticks(custom_ticks,custom_labels)
plt.xlabel('sparisty')
plt.ylabel('MI estimate')
plt.title(f'MI estimation of Input with first hidden layer, average of {averaging_iterations} runs')

# Display the plot
plt.show()

In [None]:
plt.figure()
plt.plot(np.average(data_dict["MI_estimate_x_d2"], axis=0), "o-", color="b", linewidth=4, label="x_d2_LF")
for graph in data_dict["MI_estimate_x_d2"]:
    plt.plot(graph, "o", color="b", markersize=5, alpha=0.5)
plt.plot(np.average(data_dict["MI_estimate_x_d2_init"], axis=0), "o-", color="g", linewidth=4, label="x_d2_LF_init")
for graph in data_dict["MI_estimate_x_d2_init"]:
    plt.plot(graph, "o", color="g", markersize=5, alpha=0.5)
plt.plot(np.average(data_dict["MI_estimate_x_d2_rand"], axis=0), "o-", color="r", linewidth=4, label="x_d2_random")
for graph in data_dict["MI_estimate_x_d2_rand"]:
    plt.plot(graph, "o", color="r", markersize=5, alpha=0.5)
plt.legend()
plt.xticks(custom_ticks,custom_labels)
plt.xlabel('sparisty')
plt.ylabel('MI estimate')
plt.title(f'MI estimation of Input with second hidden layer, average of {averaging_iterations} runs')

# Display the plot
plt.show()

In [None]:
plt.figure()
plt.plot(np.average(data_dict["MI_estimate_x_o"], axis=0), "o-", color="b", linewidth=4, label="x_o_LF")
for graph in data_dict["MI_estimate_x_o"]:
    plt.plot(graph, "o", color="b", markersize=5, alpha=0.5)
plt.plot(np.average(data_dict["MI_estimate_x_o_init"], axis=0), "o-", color="g", linewidth=4, label="x_o_LF_init")
for graph in data_dict["MI_estimate_x_o_init"]:
    plt.plot(graph, "o", color="g", markersize=5, alpha=0.5)
plt.plot(np.average(data_dict["MI_estimate_x_o_rand"], axis=0), "o-", color="r", linewidth=4, label="x_o_random")
for graph in data_dict["MI_estimate_x_o_rand"]:
    plt.plot(graph, "o", color="r", markersize=5, alpha=0.5)
plt.legend()
plt.xticks(custom_ticks,custom_labels)
plt.xlabel('sparisty')
plt.ylabel('MI estimate')
plt.title(f'MI estimation of Input with output layer, average of {averaging_iterations} runs')

# Display the plot
plt.show()

In [None]:
plt.figure()
plt.plot(np.average(data_dict["MI_estimate_d1_d2"], axis=0), "o-", color="b", linewidth=4, label="d1_d2_LF")
for graph in data_dict["MI_estimate_d1_d2"]:
    plt.plot(graph, "o", color="b", markersize=5, alpha=0.5)
plt.plot(np.average(data_dict["MI_estimate_d1_d2_init"], axis=0), "o-", color="g", linewidth=4, label="d1_d2_LF_init")
for graph in data_dict["MI_estimate_d1_d2_init"]:
    plt.plot(graph, "o", color="g", markersize=5, alpha=0.5)
plt.plot(np.average(data_dict["MI_estimate_d1_d2_rand"], axis=0), "o-", color="r", linewidth=4, label="d1_d2_random")
for graph in data_dict["MI_estimate_d1_d2_rand"]:
    plt.plot(graph, "o", color="r", markersize=5, alpha=0.5)
plt.legend()
plt.xticks(custom_ticks,custom_labels)
plt.xlabel('sparisty')
plt.ylabel('MI estimate')
plt.title(f'MI estimation of first with second hidden layer, average of {averaging_iterations} runs')

# Display the plot
plt.show()

In [None]:
plt.figure()
plt.plot(np.average(data_dict["MI_estimate_d1_o"], axis=0), "o-", color="b", linewidth=4, label="d1_o_LF")
for graph in data_dict["MI_estimate_d1_o"]:
    plt.plot(graph, "o", color="b", markersize=5, alpha=0.5)
plt.plot(np.average(data_dict["MI_estimate_d1_o_init"], axis=0), "o-", color="g", linewidth=4, label="d1_o_LF_init")
for graph in data_dict["MI_estimate_d1_o_init"]:
    plt.plot(graph, "o", color="g", markersize=5, alpha=0.5)
plt.plot(np.average(data_dict["MI_estimate_d1_o_rand"], axis=0), "o-", color="r", linewidth=4, label="d1_o_random")
for graph in data_dict["MI_estimate_d1_o_rand"]:
    plt.plot(graph, "o", color="r", markersize=5, alpha=0.5)
plt.legend()
plt.xticks(custom_ticks,custom_labels)
plt.xlabel('sparisty')
plt.ylabel('MI estimate')
plt.title(f'MI estimation of first hidden with output layer, average of {averaging_iterations} runs')

# Display the plot
plt.show()

In [None]:
plt.figure()
plt.plot(np.average(data_dict["MI_estimate_d2_o"], axis=0), "o-", color="b", linewidth=4, label="d2_o_LF")
for graph in data_dict["MI_estimate_d2_o"]:
    plt.plot(graph, "o", color="b", markersize=5, alpha=0.5)
plt.plot(np.average(data_dict["MI_estimate_d2_o_init"], axis=0), "o-", color="g", linewidth=4, label="d2_o_LF_init")
for graph in data_dict["MI_estimate_d2_o_init"]:
    plt.plot(graph, "o", color="g", markersize=5, alpha=0.5)
plt.plot(np.average(data_dict["MI_estimate_d2_o_rand"], axis=0), "o-", color="r", linewidth=4, label="d2_o_random")
for graph in data_dict["MI_estimate_d2_o_rand"]:
    plt.plot(graph, "o", color="r", markersize=5, alpha=0.5)
plt.legend()
plt.xticks(custom_ticks,custom_labels)
plt.xlabel('sparisty')
plt.ylabel('MI estimate')
plt.title(f'MI estimation of second hidden with output layer, average of {averaging_iterations} runs')

# Display the plot
plt.show()

In [None]:
plt.figure()
# Plot each loss history with a different label
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_x_d1"])):
    plt.plot(-loss_history,label=custom_labels[i]+"-LF")
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_x_d1_init"])):
    plt.plot(-loss_history,label=custom_labels[i]+"-LF_init")
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_x_d1_rand"])):
    plt.plot(-loss_history,label=custom_labels[i]+"R")
plt.legend()
# Add labels, title, and legend
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title(f'Loss Histories of sparsities through training for x_d1, average of {averaging_iterations} runs')


# Show the plot
plt.show()

In [None]:
plt.figure()
# Plot each loss history with a different label
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_x_d2"])):
    plt.plot(-loss_history,label=custom_labels[i]+"LF")
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_x_d2_init"])):
    plt.plot(-loss_history,label=custom_labels[i]+"LF_init")
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_x_d2_rand"])):
    plt.plot(-loss_history,label=custom_labels[i]+"R")
plt.legend()
# Add labels, title, and legend
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title(f'Loss Histories of sparsities through training for x_d2, average of {averaging_iterations} runs')


# Show the plot
plt.show()

In [None]:
plt.figure()
# Plot each loss history with a different label
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_x_o"])):
    plt.plot(-loss_history,label=custom_labels[i]+"LF")
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_x_o_init"])):
    plt.plot(-loss_history,label=custom_labels[i]+"LF_init")
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_x_o_rand"])):
    plt.plot(-loss_history,label=custom_labels[i]+"R")
plt.legend()
# Add labels, title, and legend
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title(f'Loss Histories of sparsities through training for x_o, average of {averaging_iterations} runs')


# Show the plot
plt.show()

In [None]:
plt.figure()
# Plot each loss history with a different label
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_d1_d2"])):
    plt.plot(-loss_history, 'o-', label=custom_labels[i]+"LF")
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_d1_d2_init"])):
    plt.plot(-loss_history, 'o-', label=custom_labels[i]+"LF_init")
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_d1_d2_rand"])):
    plt.plot(-loss_history, 'o-', label=custom_labels[i]+"R")
plt.legend()
# Add labels, title, and legend
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title(f'Loss Histories of sparsities through training for d1_d2, average of {averaging_iterations} runs')


# Show the plot
plt.show()

In [None]:
plt.figure()
# Plot each loss history with a different label
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_d1_o"])):
    plt.plot(-loss_history, 'o-', label=custom_labels[i]+"LF")
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_d1_o_init"])):
    plt.plot(-loss_history, 'o-', label=custom_labels[i]+"LF_init")
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_d1_o_rand"])):
    plt.plot(-loss_history, 'o-', label=custom_labels[i]+"R")
plt.legend()
# Add labels, title, and legend
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title(f'Loss Histories of sparsities through training for d1_o, average of {averaging_iterations} runs')


# Show the plot
plt.show()

In [None]:
plt.figure()
# Plot each loss history with a different label
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_d2_o"])):
    plt.plot(-loss_history, 'o-', label=custom_labels[i]+"LF")
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_d2_o_init"])):
    plt.plot(-loss_history, 'o-', label=custom_labels[i]+"LF_init")
for i, loss_history in enumerate(calculate_average_loss(data_dict["MI_hist_d2_o_rand"])):
    plt.plot(-loss_history, 'o-', label=custom_labels[i]+"R")
plt.legend()
# Add labels, title, and legend
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title(f'Loss Histories of sparsities through training for d2_o, average of {averaging_iterations} runs')


# Show the plot
plt.show()