In [1]:
!git clone https://github.com/AntreasAntoniou/HowToTrainYourMAMLPytorch.git

Cloning into 'HowToTrainYourMAMLPytorch'...
remote: Enumerating objects: 36634, done.[K
remote: Counting objects: 100% (261/261), done.[K
remote: Compressing objects: 100% (125/125), done.[K
remote: Total 36634 (delta 159), reused 210 (delta 129), pack-reused 36373 (from 1)[K
Receiving objects: 100% (36634/36634), 18.95 MiB | 9.11 MiB/s, done.
Resolving deltas: 100% (2204/2204), done.
Updating files: 100% (32570/32570), done.


In [2]:
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# from meta_neural_network_architectures import VGGReLUNormNetwork

In [3]:
def set_torch_seed(seed):
    """
    Sets the pytorch seeds for current experiment run
    :param seed: The seed (int)
    :return: A random number generator to use
    """
    rng = np.random.RandomState(seed=seed)
    torch_seed = rng.randint(0, 999999)
    torch.manual_seed(seed=torch_seed)

    return rng

# MAML Classifier Class

In [4]:
import logging
import os
from collections import OrderedDict

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class GradientDescentLearningRule(nn.Module):
    """Simple (stochastic) gradient descent learning rule.
    For a scalar error function `E(p[0], p_[1] ... )` of some set of
    potentially multidimensional parameters this attempts to find a local
    minimum of the loss function by applying updates to each parameter of the
    form
        p[i] := p[i] - learning_rate * dE/dp[i]
    With `learning_rate` a positive scaling parameter.
    The error function used in successive applications of these updates may be
    a stochastic estimator of the true error function (e.g. when the error with
    respect to only a subset of data-points is calculated) in which case this
    will correspond to a stochastic gradient descent learning rule.
    """

    def __init__(self, device, learning_rate=1e-3):
        """Creates a new learning rule object.
        Args:
            learning_rate: A postive scalar to scale gradient updates to the
                parameters by. This needs to be carefully set - if too large
                the learning dynamic will be unstable and may diverge, while
                if set too small learning will proceed very slowly.
        """
        super(GradientDescentLearningRule, self).__init__()
        assert learning_rate > 0., 'learning_rate should be positive.'
        self.learning_rate = torch.ones(1) * learning_rate
        self.learning_rate.to(device)

    def update_params(self, names_weights_dict, names_grads_wrt_params_dict, num_step, tau=0.9):
        """Applies a single gradient descent update to all parameters.
        All parameter updates are performed using in-place operations and so
        nothing is returned.
        Args:
            grads_wrt_params: A list of gradients of the scalar loss function
                with respect to each of the parameters passed to `initialise`
                previously, with this list expected to be in the same order.
        """
        return {
            key: names_weights_dict[key]
            - self.learning_rate * names_grads_wrt_params_dict[key]
            for key in names_weights_dict.keys()
        }


class LSLRGradientDescentLearningRule(nn.Module):
    """Simple (stochastic) gradient descent learning rule.
    For a scalar error function `E(p[0], p_[1] ... )` of some set of
    potentially multidimensional parameters this attempts to find a local
    minimum of the loss function by applying updates to each parameter of the
    form
        p[i] := p[i] - learning_rate * dE/dp[i]
    With `learning_rate` a positive scaling parameter.
    The error function used in successive applications of these updates may be
    a stochastic estimator of the true error function (e.g. when the error with
    respect to only a subset of data-points is calculated) in which case this
    will correspond to a stochastic gradient descent learning rule.
    """

    def __init__(self, device, total_num_inner_loop_steps, use_learnable_learning_rates, init_learning_rate=1e-3):
        """Creates a new learning rule object.
        Args:
            init_learning_rate: A postive scalar to scale gradient updates to the
                parameters by. This needs to be carefully set - if too large
                the learning dynamic will be unstable and may diverge, while
                if set too small learning will proceed very slowly.
        """
        super(LSLRGradientDescentLearningRule, self).__init__()
        print(init_learning_rate)
        assert init_learning_rate > 0., 'learning_rate should be positive.'

        self.init_learning_rate = torch.ones(1) * init_learning_rate
        self.init_learning_rate.to(device)
        self.total_num_inner_loop_steps = total_num_inner_loop_steps
        self.use_learnable_learning_rates = use_learnable_learning_rates

    def initialise(self, names_weights_dict):
        self.names_learning_rates_dict = nn.ParameterDict()
        for idx, (key, param) in enumerate(names_weights_dict.items()):
            self.names_learning_rates_dict[key.replace(".", "-")] = nn.Parameter(
                data=torch.ones(self.total_num_inner_loop_steps + 1) * self.init_learning_rate,
                requires_grad=self.use_learnable_learning_rates)

    def reset(self):

        # for key, param in self.names_learning_rates_dict.items():
        #     param.fill_(self.init_learning_rate)
        pass

    def update_params(self, names_weights_dict, names_grads_wrt_params_dict, num_step, tau=0.1):
        """Applies a single gradient descent update to all parameters.
        All parameter updates are performed using in-place operations and so
        nothing is returned.
        Args:
            grads_wrt_params: A list of gradients of the scalar loss function
                with respect to each of the parameters passed to `initialise`
                previously, with this list expected to be in the same order.
        """
        return {
            key: names_weights_dict[key]
            - self.names_learning_rates_dict[key.replace(".", "-")][num_step]
            * names_grads_wrt_params_dict[key]
            for key in names_grads_wrt_params_dict.keys()
        }

In [5]:
class MAMLFewShotClassifier(nn.Module):
    def __init__(self, im_shape, device, args):
        """
        Initializes a MAML few shot learning system
        :param im_shape: The images input size, in batch, c, h, w shape
        :param device: The device to use to use the model on.
        :param args: A namedtuple of arguments specifying various hyperparameters.
        """
        super(MAMLFewShotClassifier, self).__init__()
        self.args = args
        self.device = device
        self.batch_size = args.batch_size
        self.use_cuda = args.use_cuda
        self.im_shape = im_shape
        self.current_epoch = 0

        self.rng = set_torch_seed(seed=args.seed)
        self.classifier = VGGReLUNormNetwork(im_shape=self.im_shape, num_output_classes=self.args.
                                             num_classes_per_set,
                                             args=args, device=device, meta_classifier=True).to(device=self.device)
        self.task_learning_rate = args.task_learning_rate

        self.inner_loop_optimizer = LSLRGradientDescentLearningRule(device=device,
                                                                    init_learning_rate=self.task_learning_rate,
                                                                    total_num_inner_loop_steps=self.args.number_of_training_steps_per_iter,
                                                                    use_learnable_learning_rates=self.args.learnable_per_layer_per_step_inner_loop_learning_rate)
        self.inner_loop_optimizer.initialise(
            names_weights_dict=self.get_inner_loop_parameter_dict(params=self.classifier.named_parameters()))

        print("Inner Loop parameters")
        for key, value in self.inner_loop_optimizer.named_parameters():
            print(key, value.shape)

        self._noise_size = 0.1

        self.use_cuda = args.use_cuda
        self.device = device
        self.args = args
        self.to(device)
        print("Outer Loop parameters")
        for name, param in self.named_parameters():
            if param.requires_grad:
                print(name, param.shape, param.device, param.requires_grad)


        self.optimizer = optim.Adam(self.trainable_parameters(), lr=args.meta_learning_rate, amsgrad=False)
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer=self.optimizer, T_max=self.args.total_epochs,
                                                              eta_min=self.args.min_learning_rate)

        self.device = torch.device('cpu')
        if torch.cuda.is_available():
            if torch.cuda.device_count() > 1:
                self.to(torch.cuda.current_device())
                self.classifier = nn.DataParallel(module=self.classifier)
            else:
                self.to(torch.cuda.current_device())

            self.device = torch.cuda.current_device()

    def get_per_step_loss_importance_vector(self):
        """
        Generates a tensor of dimensionality (num_inner_loop_steps) indicating the importance of each step's target
        loss towards the optimization loss.
        :return: A tensor to be used to compute the weighted average of the loss, useful for
        the MSL (Multi Step Loss) mechanism.
        """
        loss_weights = np.ones(shape=(self.args.number_of_training_steps_per_iter)) * (
                1.0 / self.args.number_of_training_steps_per_iter)
        decay_rate = 1.0 / self.args.number_of_training_steps_per_iter / self.args.multi_step_loss_num_epochs
        min_value_for_non_final_losses = 0.03 / self.args.number_of_training_steps_per_iter
        for i in range(len(loss_weights) - 1):
            curr_value = np.maximum(loss_weights[i] - (self.current_epoch * decay_rate), min_value_for_non_final_losses)
            loss_weights[i] = curr_value

        curr_value = np.minimum(
            loss_weights[-1] + (self.current_epoch * (self.args.number_of_training_steps_per_iter - 1) * decay_rate),
            1.0 - ((self.args.number_of_training_steps_per_iter - 1) * min_value_for_non_final_losses))
        loss_weights[-1] = curr_value
        loss_weights = torch.Tensor(loss_weights).to(device=self.device)
        return loss_weights

    def get_inner_loop_parameter_dict(self, params):
        """
        Returns a dictionary with the parameters to use for inner loop updates.
        :param params: A dictionary of the network's parameters.
        :return: A dictionary of the parameters to use for the inner loop optimization process.
        """
        return {
            name: param.to(device=self.device)
            for name, param in params
            if param.requires_grad
            and (
                not self.args.enable_inner_loop_optimizable_bn_params
                and "norm_layer" not in name
                or self.args.enable_inner_loop_optimizable_bn_params
            )
        }

    def apply_inner_loop_update(self, loss, names_weights_copy, use_second_order, current_step_idx):
        """
        Applies an inner loop update given current step's loss, the weights to update, a flag indicating whether to use
        second order derivatives and the current step's index.
        :param loss: Current step's loss with respect to the support set.
        :param names_weights_copy: A dictionary with names to parameters to update.
        :param use_second_order: A boolean flag of whether to use second order derivatives.
        :param current_step_idx: Current step's index.
        :return: A dictionary with the updated weights (name, param)
        """
        num_gpus = torch.cuda.device_count()
        if num_gpus > 1:
            self.classifier.module.zero_grad(params=names_weights_copy)
        else:
            self.classifier.zero_grad(params=names_weights_copy)

        grads = torch.autograd.grad(loss, names_weights_copy.values(),
                                    create_graph=use_second_order, allow_unused=True)
        names_grads_copy = dict(zip(names_weights_copy.keys(), grads))

        names_weights_copy = {key: value[0] for key, value in names_weights_copy.items()}

        for key, grad in names_grads_copy.items():
            if grad is None:
                print('Grads not found for inner loop parameter', key)
            names_grads_copy[key] = names_grads_copy[key].sum(dim=0)


        names_weights_copy = self.inner_loop_optimizer.update_params(names_weights_dict=names_weights_copy,
                                                                     names_grads_wrt_params_dict=names_grads_copy,
                                                                     num_step=current_step_idx)

        num_devices = torch.cuda.device_count() if torch.cuda.is_available() else 1
        names_weights_copy = {
            name.replace('module.', ''): value.unsqueeze(0).repeat(
                [num_devices] + [1 for i in range(len(value.shape))]) for
            name, value in names_weights_copy.items()}


        return names_weights_copy

    def get_across_task_loss_metrics(self, total_losses, total_accuracies):
        losses = {'loss': torch.mean(torch.stack(total_losses))}

        losses['accuracy'] = np.mean(total_accuracies)

        return losses

    def forward(self, data_batch, epoch, use_second_order, use_multi_step_loss_optimization, num_steps, training_phase):
        """
        Runs a forward outer loop pass on the batch of tasks using the MAML/++ framework.
        :param data_batch: A data batch containing the support and target sets.
        :param epoch: Current epoch's index
        :param use_second_order: A boolean saying whether to use second order derivatives.
        :param use_multi_step_loss_optimization: Whether to optimize on the outer loop using just the last step's
        target loss (True) or whether to use multi step loss which improves the stability of the system (False)
        :param num_steps: Number of inner loop steps.
        :param training_phase: Whether this is a training phase (True) or an evaluation phase (False)
        :return: A dictionary with the collected losses of the current outer forward propagation.
        """
        x_support_set, x_target_set, y_support_set, y_target_set = data_batch

        [b, ncs, spc] = y_support_set.shape

        self.num_classes_per_set = ncs

        total_losses = []
        total_accuracies = []
        per_task_target_preds = [[] for i in range(len(x_target_set))]
        self.classifier.zero_grad()
        task_accuracies = []
        for task_id, (x_support_set_task, y_support_set_task, x_target_set_task, y_target_set_task) in enumerate(zip(x_support_set,
                              y_support_set,
                              x_target_set,
                              y_target_set)):
            task_losses = []
            per_step_loss_importance_vectors = self.get_per_step_loss_importance_vector()
            names_weights_copy = self.get_inner_loop_parameter_dict(self.classifier.named_parameters())

            num_devices = torch.cuda.device_count() if torch.cuda.is_available() else 1

            names_weights_copy = {
                name.replace('module.', ''): value.unsqueeze(0).repeat(
                    [num_devices] + [1 for i in range(len(value.shape))]) for
                name, value in names_weights_copy.items()}

            n, s, c, h, w = x_target_set_task.shape

            x_support_set_task = x_support_set_task.view(-1, c, h, w)
            y_support_set_task = y_support_set_task.view(-1)
            x_target_set_task = x_target_set_task.view(-1, c, h, w)
            y_target_set_task = y_target_set_task.view(-1)

            for num_step in range(num_steps):

                support_loss, support_preds = self.net_forward(
                    x=x_support_set_task,
                    y=y_support_set_task,
                    weights=names_weights_copy,
                    backup_running_statistics=num_step == 0,
                    training=True,
                    num_step=num_step,
                )


                names_weights_copy = self.apply_inner_loop_update(loss=support_loss,
                                                                  names_weights_copy=names_weights_copy,
                                                                  use_second_order=use_second_order,
                                                                  current_step_idx=num_step)

                if use_multi_step_loss_optimization and training_phase and epoch < self.args.multi_step_loss_num_epochs:
                    target_loss, target_preds = self.net_forward(x=x_target_set_task,
                                                                 y=y_target_set_task, weights=names_weights_copy,
                                                                 backup_running_statistics=False, training=True,
                                                                 num_step=num_step)

                    task_losses.append(per_step_loss_importance_vectors[num_step] * target_loss)
                elif num_step == (self.args.number_of_training_steps_per_iter - 1):
                    target_loss, target_preds = self.net_forward(x=x_target_set_task,
                                                                 y=y_target_set_task, weights=names_weights_copy,
                                                                 backup_running_statistics=False, training=True,
                                                                 num_step=num_step)
                    task_losses.append(target_loss)

            per_task_target_preds[task_id] = target_preds.detach().cpu().numpy()
            _, predicted = torch.max(target_preds.data, 1)

            accuracy = predicted.float().eq(y_target_set_task.data.float()).cpu().float()
            task_losses = torch.sum(torch.stack(task_losses))
            total_losses.append(task_losses)
            total_accuracies.extend(accuracy)

            if not training_phase:
                self.classifier.restore_backup_stats()

        losses = self.get_across_task_loss_metrics(total_losses=total_losses,
                                                   total_accuracies=total_accuracies)

        for idx, item in enumerate(per_step_loss_importance_vectors):
            losses['loss_importance_vector_{}'.format(idx)] = item.detach().cpu().numpy()

        return losses, per_task_target_preds

    def net_forward(self, x, y, weights, backup_running_statistics, training, num_step):
        """
        A base model forward pass on some data points x. Using the parameters in the weights dictionary. Also requires
        boolean flags indicating whether to reset the running statistics at the end of the run (if at evaluation phase).
        A flag indicating whether this is the training session and an int indicating the current step's number in the
        inner loop.
        :param x: A data batch of shape b, c, h, w
        :param y: A data targets batch of shape b, n_classes
        :param weights: A dictionary containing the weights to pass to the network.
        :param backup_running_statistics: A flag indicating whether to reset the batch norm running statistics to their
         previous values after the run (only for evaluation)
        :param training: A flag indicating whether the current process phase is a training or evaluation.
        :param num_step: An integer indicating the number of the step in the inner loop.
        :return: the crossentropy losses with respect to the given y, the predictions of the base model.
        """
        preds = self.classifier.forward(x=x, params=weights,
                                        training=training,
                                        backup_running_statistics=backup_running_statistics, num_step=num_step)

        loss = F.cross_entropy(input=preds, target=y)

        return loss, preds

    def trainable_parameters(self):
        """
        Returns an iterator over the trainable parameters of the model.
        """
        for param in self.parameters():
            if param.requires_grad:
                noise = torch.randn_like(param) * self._noise_size  # TODO EXPERIMENT WITH THIS
                print(f"the grinch added {noise} amount of noise :)")
                param.data.add_(noise)
                yield param

    def train_forward_prop(self, data_batch, epoch):
        """
        Runs an outer loop forward prop using the meta-model and base-model.
        :param data_batch: A data batch containing the support set and the target set input, output pairs.
        :param epoch: The index of the currrent epoch.
        :return: A dictionary of losses for the current step.
        """
        losses, per_task_target_preds = self.forward(data_batch=data_batch, epoch=epoch,
                                                     use_second_order=self.args.second_order and
                                                                      epoch > self.args.first_order_to_second_order_epoch,
                                                     use_multi_step_loss_optimization=self.args.use_multi_step_loss_optimization,
                                                     num_steps=self.args.number_of_training_steps_per_iter,
                                                     training_phase=True)
        return losses, per_task_target_preds

    def evaluation_forward_prop(self, data_batch, epoch):
        """
        Runs an outer loop evaluation forward prop using the meta-model and base-model.
        :param data_batch: A data batch containing the support set and the target set input, output pairs.
        :param epoch: The index of the currrent epoch.
        :return: A dictionary of losses for the current step.
        """
        losses, per_task_target_preds = self.forward(data_batch=data_batch, epoch=epoch, use_second_order=False,
                                                     use_multi_step_loss_optimization=True,
                                                     num_steps=self.args.number_of_evaluation_steps_per_iter,
                                                     training_phase=False)

        return losses, per_task_target_preds

    def meta_update(self, loss):
        """
        Applies an outer loop update on the meta-parameters of the model.
        :param loss: The current crossentropy loss.
        """
        self.optimizer.zero_grad()
        loss.backward()
        if 'imagenet' in self.args.dataset_name:
            for name, param in self.classifier.named_parameters():
                if param.requires_grad:
                    param.grad.data.clamp_(-10, 10)  # not sure if this is necessary, more experiments are needed
        self._noise_size *= 0.95
        print(f"noise size: {self._noise_size}")
        self.optimizer.step()

    def run_train_iter(self, data_batch, epoch):
        """
        Runs an outer loop update step on the meta-model's parameters.
        :param data_batch: input data batch containing the support set and target set input, output pairs
        :param epoch: the index of the current epoch
        :return: The losses of the ran iteration.
        """
        epoch = int(epoch)
        if epoch > 1:
          self.scheduler.step(epoch=epoch)
        if self.current_epoch != epoch:
            self.current_epoch = epoch

        if not self.training:
            self.train()

        x_support_set, x_target_set, y_support_set, y_target_set = data_batch

        x_support_set = torch.Tensor(x_support_set).float().to(device=self.device)
        x_target_set = torch.Tensor(x_target_set).float().to(device=self.device)
        y_support_set = torch.Tensor(y_support_set).long().to(device=self.device)
        y_target_set = torch.Tensor(y_target_set).long().to(device=self.device)

        data_batch = (x_support_set, x_target_set, y_support_set, y_target_set)

        losses, per_task_target_preds = self.train_forward_prop(data_batch=data_batch, epoch=epoch)

        self.meta_update(loss=losses['loss'])
        losses['learning_rate'] = self.scheduler.get_last_lr()[0]
        self.optimizer.zero_grad()
        self.zero_grad()

        return losses, per_task_target_preds

    def run_validation_iter(self, data_batch):
        """
        Runs an outer loop evaluation step on the meta-model's parameters.
        :param data_batch: input data batch containing the support set and target set input, output pairs
        :param epoch: the index of the current epoch
        :return: The losses of the ran iteration.
        """

        if self.training:
            self.eval()

        x_support_set, x_target_set, y_support_set, y_target_set = data_batch

        x_support_set = torch.Tensor(x_support_set).float().to(device=self.device)
        x_target_set = torch.Tensor(x_target_set).float().to(device=self.device)
        y_support_set = torch.Tensor(y_support_set).long().to(device=self.device)
        y_target_set = torch.Tensor(y_target_set).long().to(device=self.device)

        data_batch = (x_support_set, x_target_set, y_support_set, y_target_set)

        losses, per_task_target_preds = self.evaluation_forward_prop(data_batch=data_batch, epoch=self.current_epoch)

        # losses['loss'].backward() # uncomment if you get the weird memory error
        # self.zero_grad()
        # self.optimizer.zero_grad()

        return losses, per_task_target_preds

    def save_model(self, model_save_dir, state):
        """
        Save the network parameter state and experiment state dictionary.
        :param model_save_dir: The directory to store the state at.
        :param state: The state containing the experiment state and the network. It's in the form of a dictionary
        object.
        """
        state['network'] = self.state_dict()
        state['optimizer'] = self.optimizer.state_dict()
        torch.save(state, f=model_save_dir)

    def load_model(self, model_save_dir, model_name, model_idx):
        """
        Load checkpoint and return the state dictionary containing the network state params and experiment state.
        :param model_save_dir: The directory from which to load the files.
        :param model_name: The model_name to be loaded from the direcotry.
        :param model_idx: The index of the model (i.e. epoch number or 'latest' for the latest saved model of the current
        experiment)
        :return: A dictionary containing the experiment state and the saved model parameters.
        """
        filepath = os.path.join(model_save_dir, "{}_{}".format(model_name, model_idx))
        state = torch.load(filepath)
        state_dict_loaded = state['network']
        self.optimizer.load_state_dict(state['optimizer'])
        self.load_state_dict(state_dict=state_dict_loaded)
        return state

# Layer Classes

In [6]:
import numbers
from copy import copy

import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np

In [7]:
def extract_top_level_dict(current_dict):
    """
    Builds a graph dictionary from the passed depth_keys, value pair. Useful for dynamically passing external params
    :param depth_keys: A list of strings making up the name of a variable. Used to make a graph for that params tree.
    :param value: Param value
    :param key_exists: If none then assume new dict, else load existing dict and add new key->value pairs to it.
    :return: A dictionary graph of the params already added to the graph.
    """
    output_dict = {}
    for key in current_dict.keys():
        name = key.replace("layer_dict.", "")
        name = name.replace("layer_dict.", "")
        name = name.replace("block_dict.", "")
        name = name.replace("module-", "")
        top_level = name.split(".")[0]
        sub_level = ".".join(name.split(".")[1:])

        if top_level in output_dict:
            new_item = {key: value for key, value in output_dict[top_level].items()}
            new_item[sub_level] = current_dict[key]
            output_dict[top_level] = new_item

        elif sub_level == "":
            output_dict[top_level] = current_dict[key]
        else:
            output_dict[top_level] = {sub_level: current_dict[key]}
    #print(current_dict.keys(), output_dict.keys())
    return output_dict

In [8]:
class MetaConv2dLayer(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, use_bias, groups=1, dilation_rate=1):
        """
        A MetaConv2D layer. Applies the same functionality of a standard Conv2D layer with the added functionality of
        being able to receive a parameter dictionary at the forward pass which allows the convolution to use external
        weights instead of the internal ones stored in the conv layer. Useful for inner loop optimization in the meta
        learning setting.
        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param kernel_size: Convolutional kernel size
        :param stride: Convolutional stride
        :param padding: Convolution padding
        :param use_bias: Boolean indicating whether to use a bias or not.
        """
        super(MetaConv2dLayer, self).__init__()
        num_filters = out_channels
        self.stride = int(stride)
        self.padding = int(padding)
        self.dilation_rate = int(dilation_rate)
        self.use_bias = use_bias
        self.groups = int(groups)
        self.weight = nn.Parameter(torch.empty(num_filters, in_channels, kernel_size, kernel_size))
        nn.init.xavier_uniform_(self.weight)

        if self.use_bias:
            self.bias = nn.Parameter(torch.zeros(num_filters))

    def forward(self, x, params=None):
        """
        Applies a conv2D forward pass. If params are not None will use the passed params as the conv weights and biases
        :param x: Input image batch.
        :param params: If none, then conv layer will use the stored self.weights and self.bias, if they are not none
        then the conv layer will use the passed params as its parameters.
        :return: The output of a convolutional function.
        """
        if params is not None:
            params = extract_top_level_dict(current_dict=params)
            if self.use_bias:
                (weight, bias) = params["weight"], params["bias"]
            else:
                (weight) = params["weight"]
                bias = None
        elif self.use_bias:
            weight, bias = self.weight, self.bias
        else:
            weight = self.weight
            bias = None

        return F.conv2d(
            input=x,
            weight=weight,
            bias=bias,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation_rate,
            groups=self.groups,
        )

In [9]:
class MetaLinearLayer(nn.Module):
    def __init__(self, input_shape, num_filters, use_bias):
        """
        A MetaLinear layer. Applies the same functionality of a standard linearlayer with the added functionality of
        being able to receive a parameter dictionary at the forward pass which allows the convolution to use external
        weights instead of the internal ones stored in the linear layer. Useful for inner loop optimization in the meta
        learning setting.
        :param input_shape: The shape of the input data, in the form (b, f)
        :param num_filters: Number of output filters
        :param use_bias: Whether to use biases or not.
        """
        super(MetaLinearLayer, self).__init__()
        b, c = input_shape

        self.use_bias = use_bias
        self.weights = nn.Parameter(torch.ones(num_filters, c))
        # nn.init.xavier_uniform_(self.weights) TODO CHANGE BACK TEST GENE (it works kinda)
        if self.use_bias:
            self.bias = nn.Parameter(torch.zeros(num_filters))

    def forward(self, x, params=None):
        """
        Forward propagates by applying a linear function (Wx + b). If params are none then internal params are used.
        Otherwise passed params will be used to execute the function.
        :param x: Input data batch, in the form (b, f)
        :param params: A dictionary containing 'weights' and 'bias'. If params are none then internal params are used.
        Otherwise the external are used.
        :return: The result of the linear function.
        """
        if params is not None:
            params = extract_top_level_dict(current_dict=params)
            if self.use_bias:
                (weight, bias) = params["weights"], params["bias"]
            else:
                (weight) = params["weights"]
                bias = None
        elif self.use_bias:
            weight, bias = self.weights, self.bias
        else:
            weight = self.weights
            bias = None
        return F.linear(input=x, weight=weight, bias=bias)

In [10]:
class MetaBatchNormLayer(nn.Module):
    def __init__(self, num_features, device, args, eps=1e-5, momentum=0.1, affine=True,
                 track_running_stats=True, meta_batch_norm=True, no_learnable_params=False,
                 use_per_step_bn_statistics=False):
        """
        A MetaBatchNorm layer. Applies the same functionality of a standard BatchNorm layer with the added functionality of
        being able to receive a parameter dictionary at the forward pass which allows the convolution to use external
        weights instead of the internal ones stored in the conv layer. Useful for inner loop optimization in the meta
        learning setting. Also has the additional functionality of being able to store per step running stats and per step beta and gamma.
        :param num_features:
        :param device:
        :param args:
        :param eps:
        :param momentum:
        :param affine:
        :param track_running_stats:
        :param meta_batch_norm:
        :param no_learnable_params:
        :param use_per_step_bn_statistics:
        """
        super(MetaBatchNormLayer, self).__init__()
        self.num_features = num_features
        self.eps = eps

        self.affine = affine
        self.track_running_stats = track_running_stats
        self.meta_batch_norm = meta_batch_norm
        self.num_features = num_features
        self.device = device
        self.use_per_step_bn_statistics = use_per_step_bn_statistics
        self.args = args
        self.learnable_gamma = self.args.learnable_bn_gamma
        self.learnable_beta = self.args.learnable_bn_beta

        if use_per_step_bn_statistics:
            self.running_mean = nn.Parameter(torch.zeros(args.number_of_training_steps_per_iter, num_features),
                                             requires_grad=False)
            self.running_var = nn.Parameter(torch.ones(args.number_of_training_steps_per_iter, num_features),
                                            requires_grad=False)
            self.bias = nn.Parameter(torch.zeros(args.number_of_training_steps_per_iter, num_features),
                                     requires_grad=self.learnable_beta)
            self.weight = nn.Parameter(torch.ones(args.number_of_training_steps_per_iter, num_features),
                                       requires_grad=self.learnable_gamma)
        else:
            self.running_mean = nn.Parameter(torch.zeros(num_features), requires_grad=False)
            self.running_var = nn.Parameter(torch.zeros(num_features), requires_grad=False)
            self.bias = nn.Parameter(torch.zeros(num_features),
                                     requires_grad=self.learnable_beta)
            self.weight = nn.Parameter(torch.ones(num_features),
                                       requires_grad=self.learnable_gamma)

        if self.args.enable_inner_loop_optimizable_bn_params:
            self.bias = nn.Parameter(torch.zeros(num_features),
                                     requires_grad=self.learnable_beta)
            self.weight = nn.Parameter(torch.ones(num_features),
                                       requires_grad=self.learnable_gamma)

        self.backup_running_mean = torch.zeros(self.running_mean.shape)
        self.backup_running_var = torch.ones(self.running_var.shape)

        self.momentum = momentum

    def forward(self, input, num_step, params=None, training=False, backup_running_statistics=False):
        """
        Forward propagates by applying a bach norm function. If params are none then internal params are used.
        Otherwise passed params will be used to execute the function.
        :param input: input data batch, size either can be any.
        :param num_step: The current inner loop step being taken. This is used when we are learning per step params and
         collecting per step batch statistics. It indexes the correct object to use for the current time-step
        :param params: A dictionary containing 'weight' and 'bias'.
        :param training: Whether this is currently the training or evaluation phase.
        :param backup_running_statistics: Whether to backup the running statistics. This is used
        at evaluation time, when after the pass is complete we want to throw away the collected validation stats.
        :return: The result of the batch norm operation.
        """
        if params is not None:
            params = extract_top_level_dict(current_dict=params)
            (weight, bias) = params["weight"], params["bias"]
            #print(num_step, params['weight'])
        else:
            #print(num_step, "no params")
            weight, bias = self.weight, self.bias

        if self.use_per_step_bn_statistics:
            running_mean = self.running_mean[num_step]
            running_var = self.running_var[num_step]
            if (
                params is None
                and not self.args.enable_inner_loop_optimizable_bn_params
            ):
                bias = self.bias[num_step]
                weight = self.weight[num_step]
        else:
            running_mean = None
            running_var = None


        if backup_running_statistics and self.use_per_step_bn_statistics:
            self.backup_running_mean.data = copy(self.running_mean.data)
            self.backup_running_var.data = copy(self.running_var.data)

        momentum = self.momentum

        return F.batch_norm(input, running_mean, running_var, weight, bias,
                              training=True, momentum=momentum, eps=self.eps)

    def restore_backup_stats(self):
        """
        Resets batch statistics to their backup values which are collected after each forward pass.
        """
        if self.use_per_step_bn_statistics:
            self.running_mean = nn.Parameter(self.backup_running_mean.to(device=self.device), requires_grad=False)
            self.running_var = nn.Parameter(self.backup_running_var.to(device=self.device), requires_grad=False)

    def extra_repr(self):
        return '{num_features}, eps={eps}, momentum={momentum}, affine={affine}, ' \
               'track_running_stats={track_running_stats}'.format(**self.__dict__)

In [11]:
class MetaLayerNormLayer(nn.Module):
    def __init__(self, input_feature_shape, eps=1e-5, elementwise_affine=True):
        """
        A MetaLayerNorm layer. A layer that applies the same functionality as a layer norm layer with the added
        capability of being able to receive params at inference time to use instead of the internal ones. As well as
        being able to use its own internal weights.
        :param input_feature_shape: The input shape without the batch dimension, e.g. c, h, w
        :param eps: Epsilon to use for protection against overflows
        :param elementwise_affine: Whether to learn a multiplicative interaction parameter 'w' in addition to
        the biases.
        """
        super(MetaLayerNormLayer, self).__init__()
        if isinstance(input_feature_shape, numbers.Integral):
            input_feature_shape = (input_feature_shape,)
        self.normalized_shape = torch.Size(input_feature_shape)
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if self.elementwise_affine:
            self.weight = nn.Parameter(torch.Tensor(*input_feature_shape), requires_grad=False)
            self.bias = nn.Parameter(torch.Tensor(*input_feature_shape))
        else:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """
        Reset parameters to their initialization values.
        """
        if self.elementwise_affine:
            self.weight.data.fill_(1)
            self.bias.data.zero_()

    def forward(self, input, num_step, params=None, training=False, backup_running_statistics=False):
        """
            Forward propagates by applying a layer norm function. If params are none then internal params are used.
            Otherwise passed params will be used to execute the function.
            :param input: input data batch, size either can be any.
            :param num_step: The current inner loop step being taken. This is used when we are learning per step params and
             collecting per step batch statistics. It indexes the correct object to use for the current time-step
            :param params: A dictionary containing 'weight' and 'bias'.
            :param training: Whether this is currently the training or evaluation phase.
            :param backup_running_statistics: Whether to backup the running statistics. This is used
            at evaluation time, when after the pass is complete we want to throw away the collected validation stats.
            :return: The result of the batch norm operation.
        """
        if params is not None:
            params = extract_top_level_dict(current_dict=params)
            bias = params["bias"]
        else:
            bias = self.bias
            #print('no inner loop params', self)

        return F.layer_norm(
            input, self.normalized_shape, self.weight, bias, self.eps)

    def restore_backup_stats(self):
        pass

    def extra_repr(self):
        return '{normalized_shape}, eps={eps}, ' \
               'elementwise_affine={elementwise_affine}'.format(**self.__dict__)

In [12]:
class MetaConvNormLayerReLU(nn.Module):
    def __init__(self, input_shape, num_filters, kernel_size, stride, padding, use_bias, args, normalization=True,
                 meta_layer=True, no_bn_learnable_params=False, device=None):
        """
           Initializes a BatchNorm->Conv->ReLU layer which applies those operation in that order.
           :param args: A named tuple containing the system's hyperparameters.
           :param device: The device to run the layer on.
           :param normalization: The type of normalization to use 'batch_norm' or 'layer_norm'
           :param meta_layer: Whether this layer will require meta-layer capabilities such as meta-batch norm,
           meta-conv etc.
           :param input_shape: The image input shape in the form (b, c, h, w)
           :param num_filters: number of filters for convolutional layer
           :param kernel_size: the kernel size of the convolutional layer
           :param stride: the stride of the convolutional layer
           :param padding: the bias of the convolutional layer
           :param use_bias: whether the convolutional layer utilizes a bias
        """
        super(MetaConvNormLayerReLU, self).__init__()
        self.normalization = normalization
        self.use_per_step_bn_statistics = args.per_step_bn_statistics
        self.input_shape = input_shape
        self.args = args
        self.num_filters = num_filters
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.use_bias = use_bias
        self.meta_layer = meta_layer
        self.no_bn_learnable_params = no_bn_learnable_params
        self.device = device
        self.layer_dict = nn.ModuleDict()
        self.build_block()

    def build_block(self):

        x = torch.zeros(self.input_shape)

        out = x

        self.conv = MetaConv2dLayer(in_channels=out.shape[1], out_channels=self.num_filters,
                                    kernel_size=self.kernel_size,
                                    stride=self.stride, padding=self.padding, use_bias=self.use_bias)



        out = self.conv(out)

        if self.normalization:
            if self.args.norm_layer == "batch_norm":
                self.norm_layer = MetaBatchNormLayer(out.shape[1], track_running_stats=True,
                                                     meta_batch_norm=self.meta_layer,
                                                     no_learnable_params=self.no_bn_learnable_params,
                                                     device=self.device,
                                                     use_per_step_bn_statistics=self.use_per_step_bn_statistics,
                                                     args=self.args)
            elif self.args.norm_layer == "layer_norm":
                self.norm_layer = MetaLayerNormLayer(input_feature_shape=out.shape[1:])

            out = self.norm_layer(out, num_step=0)

        out = F.leaky_relu(out)

        print(out.shape)

    def forward(self, x, num_step, params=None, training=False, backup_running_statistics=False):
        """
            Forward propagates by applying the function. If params are none then internal params are used.
            Otherwise passed params will be used to execute the function.
            :param input: input data batch, size either can be any.
            :param num_step: The current inner loop step being taken. This is used when we are learning per step params and
             collecting per step batch statistics. It indexes the correct object to use for the current time-step
            :param params: A dictionary containing 'weight' and 'bias'.
            :param training: Whether this is currently the training or evaluation phase.
            :param backup_running_statistics: Whether to backup the running statistics. This is used
            at evaluation time, when after the pass is complete we want to throw away the collected validation stats.
            :return: The result of the batch norm operation.
        """
        batch_norm_params = None
        conv_params = None
        activation_function_pre_params = None

        if params is not None:
            params = extract_top_level_dict(current_dict=params)

            if self.normalization:
                if 'norm_layer' in params:
                    batch_norm_params = params['norm_layer']

                if 'activation_function_pre' in params:
                    activation_function_pre_params = params['activation_function_pre']

            conv_params = params['conv']

        out = x


        out = self.conv(out, params=conv_params)

        if self.normalization:
            out = self.norm_layer.forward(out, num_step=num_step,
                                          params=batch_norm_params, training=training,
                                          backup_running_statistics=backup_running_statistics)

        out = F.leaky_relu(out)

        return out

    def restore_backup_stats(self):
        """
        Restore stored statistics from the backup, replacing the current ones.
        """
        if self.normalization:
            self.norm_layer.restore_backup_stats()

In [13]:
class MetaNormLayerConvReLU(nn.Module):
    def __init__(self, input_shape, num_filters, kernel_size, stride, padding, use_bias, args, normalization=True,
                 meta_layer=True, no_bn_learnable_params=False, device=None):
        """
           Initializes a BatchNorm->Conv->ReLU layer which applies those operation in that order.
           :param args: A named tuple containing the system's hyperparameters.
           :param device: The device to run the layer on.
           :param normalization: The type of normalization to use 'batch_norm' or 'layer_norm'
           :param meta_layer: Whether this layer will require meta-layer capabilities such as meta-batch norm,
           meta-conv etc.
           :param input_shape: The image input shape in the form (b, c, h, w)
           :param num_filters: number of filters for convolutional layer
           :param kernel_size: the kernel size of the convolutional layer
           :param stride: the stride of the convolutional layer
           :param padding: the bias of the convolutional layer
           :param use_bias: whether the convolutional layer utilizes a bias
        """
        super(MetaNormLayerConvReLU, self).__init__()
        self.normalization = normalization
        self.use_per_step_bn_statistics = args.per_step_bn_statistics
        self.input_shape = input_shape
        self.args = args
        self.num_filters = num_filters
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.use_bias = use_bias
        self.meta_layer = meta_layer
        self.no_bn_learnable_params = no_bn_learnable_params
        self.device = device
        self.layer_dict = nn.ModuleDict()
        self.build_block()

    def build_block(self):

        x = torch.zeros(self.input_shape)

        out = x
        if self.normalization:
            if self.args.norm_layer == "batch_norm":
                self.norm_layer = MetaBatchNormLayer(self.input_shape[1], track_running_stats=True,
                                                     meta_batch_norm=self.meta_layer,
                                                     no_learnable_params=self.no_bn_learnable_params,
                                                     device=self.device,
                                                     use_per_step_bn_statistics=self.use_per_step_bn_statistics,
                                                     args=self.args)
            elif self.args.norm_layer == "layer_norm":
                self.norm_layer = MetaLayerNormLayer(input_feature_shape=out.shape[1:])

            out = self.norm_layer.forward(out, num_step=0)
        self.conv = MetaConv2dLayer(in_channels=out.shape[1], out_channels=self.num_filters,
                                    kernel_size=self.kernel_size,
                                    stride=self.stride, padding=self.padding, use_bias=self.use_bias)


        self.layer_dict['activation_function_pre'] = nn.LeakyReLU()


        out = self.layer_dict['activation_function_pre'].forward(self.conv.forward(out))
        print(out.shape)

    def forward(self, x, num_step, params=None, training=False, backup_running_statistics=False):
        """
            Forward propagates by applying the function. If params are none then internal params are used.
            Otherwise passed params will be used to execute the function.
            :param input: input data batch, size either can be any.
            :param num_step: The current inner loop step being taken. This is used when we are learning per step params and
             collecting per step batch statistics. It indexes the correct object to use for the current time-step
            :param params: A dictionary containing 'weight' and 'bias'.
            :param training: Whether this is currently the training or evaluation phase.
            :param backup_running_statistics: Whether to backup the running statistics. This is used
            at evaluation time, when after the pass is complete we want to throw away the collected validation stats.
            :return: The result of the batch norm operation.
        """
        batch_norm_params = None

        if params is not None:
            params = extract_top_level_dict(current_dict=params)

            if self.normalization and 'norm_layer' in params:
                batch_norm_params = params['norm_layer']

            conv_params = params['conv']
        else:
            conv_params = None
            #print('no inner loop params', self)

        out = x

        if self.normalization:
            out = self.norm_layer.forward(out, num_step=num_step,
                                          params=batch_norm_params, training=training,
                                          backup_running_statistics=backup_running_statistics)

        out = self.conv.forward(out, params=conv_params)
        out = self.layer_dict['activation_function_pre'].forward(out)

        return out

    def restore_backup_stats(self):
        """
        Restore stored statistics from the backup, replacing the current ones.
        """
        if self.normalization:
            self.norm_layer.restore_backup_stats()

In [14]:
class VGGReLUNormNetwork(nn.Module):
    def __init__(self, im_shape, num_output_classes, args, device, meta_classifier=True):
        """
        Builds a multilayer convolutional network. It also provides functionality for passing external parameters to be
        used at inference time. Enables inner loop optimization readily.
        :param im_shape: The input image batch shape.
        :param num_output_classes: The number of output classes of the network.
        :param args: A named tuple containing the system's hyperparameters.
        :param device: The device to run this on.
        :param meta_classifier: A flag indicating whether the system's meta-learning (inner-loop) functionalities should
        be enabled.
        """
        super(VGGReLUNormNetwork, self).__init__()
        b, c, self.h, self.w = im_shape
        self.device = device
        self.total_layers = 0
        self.args = args
        self.upscale_shapes = []
        self.cnn_filters = args.cnn_num_filters
        self.input_shape = list(im_shape)
        self.num_stages = args.num_stages
        self.num_output_classes = num_output_classes

        if args.max_pooling:
            print("Using max pooling")
            self.conv_stride = 1
        else:
            print("Using strided convolutions")
            self.conv_stride = 2
        self.meta_classifier = meta_classifier

        self.build_network()
        print("meta network params")
        for name, param in self.named_parameters():
            print(name, param.shape)

    def build_network(self):
        """
        Builds the network before inference is required by creating some dummy inputs with the same input as the
        self.im_shape tuple. Then passes that through the network and dynamically computes input shapes and
        sets output shapes for each layer.
        """
        x = torch.zeros(self.input_shape)
        out = x
        self.layer_dict = nn.ModuleDict()
        self.upscale_shapes.append(x.shape)

        for i in range(self.num_stages):
            self.layer_dict['conv{}'.format(i)] = MetaConvNormLayerReLU(input_shape=out.shape,
                                                                        num_filters=self.cnn_filters,
                                                                        kernel_size=3, stride=self.conv_stride,
                                                                        padding=self.args.conv_padding,
                                                                        use_bias=True, args=self.args,
                                                                        normalization=True,
                                                                        meta_layer=self.meta_classifier,
                                                                        no_bn_learnable_params=False,
                                                                        device=self.device)
            out = self.layer_dict['conv{}'.format(i)](out, training=True, num_step=0)

            if self.args.max_pooling:
                out = F.max_pool2d(input=out, kernel_size=(2, 2), stride=2, padding=0)


        if not self.args.max_pooling:
            out = F.avg_pool2d(out, out.shape[2])

        self.encoder_features_shape = list(out.shape)
        out = out.view(out.shape[0], -1)

        self.layer_dict['linear'] = MetaLinearLayer(input_shape=(out.shape[0], np.prod(out.shape[1:])),
                                                    num_filters=self.num_output_classes, use_bias=True)

        out = self.layer_dict['linear'](out)
        print("VGGNetwork build", out.shape)

    def forward(self, x, num_step, params=None, training=False, backup_running_statistics=False):
        """
        Forward propages through the network. If any params are passed then they are used instead of stored params.
        :param x: Input image batch.
        :param num_step: The current inner loop step number
        :param params: If params are None then internal parameters are used. If params are a dictionary with keys the
         same as the layer names then they will be used instead.
        :param training: Whether this is training (True) or eval time.
        :param backup_running_statistics: Whether to backup the running statistics in their backup store. Which is
        then used to reset the stats back to a previous state (usually after an eval loop, when we want to throw away stored statistics)
        :return: Logits of shape b, num_output_classes.
        """
        param_dict = {}

        if params is not None:
            params = {key: value[0] for key, value in params.items()}
            param_dict = extract_top_level_dict(current_dict=params)

        # print('top network', param_dict.keys())
        for name, param in self.layer_dict.named_parameters():
            path_bits = name.split(".")
            layer_name = path_bits[0]
            if layer_name not in param_dict:
                param_dict[layer_name] = None

        out = x

        for i in range(self.num_stages):
            out = self.layer_dict['conv{}'.format(i)](out, params=param_dict['conv{}'.format(i)], training=training,
                                                      backup_running_statistics=backup_running_statistics,
                                                      num_step=num_step)
            if self.args.max_pooling:
                out = F.max_pool2d(input=out, kernel_size=(2, 2), stride=2, padding=0)

        if not self.args.max_pooling:
            out = F.avg_pool2d(out, out.shape[2])

        out = out.view(out.size(0), -1)
        out = self.layer_dict['linear'](out, param_dict['linear'])

        return out

    def zero_grad(self, params=None):
        if params is None:
            for param in self.parameters():
                if (
                    param.requires_grad == True
                    and param.grad is not None
                    and torch.sum(param.grad) > 0
                ):
                    print(param.grad)
                    param.grad.zero_()
        else:
            for name, param in params.items():
                if (
                    param.requires_grad == True
                    and param.grad is not None
                    and torch.sum(param.grad) > 0
                ):
                    print(param.grad)
                    param.grad.zero_()
                    params[name].grad = None

    def restore_backup_stats(self):
        """
        Reset stored batch statistics from the stored backup.
        """
        for i in range(self.num_stages):
            self.layer_dict['conv{}'.format(i)].restore_backup_stats()



TODO:

add more functionalities specific to MAML

# Experiment Builder

In [15]:
import csv
import datetime
import os
import numpy as np
import json

In [16]:
def build_experiment_folder(experiment_name):
    experiment_path = os.path.abspath(experiment_name)
    saved_models_filepath = "{}/{}".format(experiment_path, "saved_models")
    logs_filepath = "{}/{}".format(experiment_path, "logs")
    samples_filepath = "{}/{}".format(experiment_path, "visual_outputs")

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    if not os.path.exists(logs_filepath):
        os.makedirs(logs_filepath)
    if not os.path.exists(samples_filepath):
        os.makedirs(samples_filepath)
    if not os.path.exists(saved_models_filepath):
        os.makedirs(saved_models_filepath)

    outputs = (saved_models_filepath, logs_filepath, samples_filepath)
    outputs = (os.path.abspath(item) for item in outputs)
    return outputs

In [17]:
def save_statistics(experiment_name, line_to_add, filename="summary_statistics.csv", create=False):
    summary_filename = "{}/{}".format(experiment_name, filename)
    if create:
        with open(summary_filename, 'w') as f:
            writer = csv.writer(f)
            writer.writerow(line_to_add)
    else:
        with open(summary_filename, 'a') as f:
            writer = csv.writer(f)
            writer.writerow(line_to_add)

    return summary_filename

In [18]:
def save_to_json(filename, dict_to_store):
    with open(os.path.abspath(filename), 'w') as f:
        json.dump(dict_to_store, fp=f)

In [19]:
import tqdm
import os
import numpy as np
import sys
# from utils.storage import build_experiment_folder, save_statistics, save_to_json
import time
import torch


class ExperimentBuilder(object):
    def __init__(self, args, data, model, device):
        """
        Initializes an experiment builder using a named tuple (args), a data provider (data), a meta learning system
        (model) and a device (e.g. gpu/cpu/n)
        :param args: A namedtuple containing all experiment hyperparameters
        :param data: A data provider of instance MetaLearningSystemDataLoader
        :param model: A meta learning system instance
        :param device: Device/s to use for the experiment
        """
        self.args, self.device = args, device

        self.model = model
        self.saved_models_filepath, self.logs_filepath, self.samples_filepath = build_experiment_folder(
            experiment_name=self.args.experiment_name)

        self.total_losses = {}
        self.state = {'best_val_acc': 0.0, 'best_val_iter': 0, 'current_iter': 0}
        self.start_epoch = 0
        self.max_models_to_save = self.args.max_models_to_save
        self.create_summary_csv = False

        if self.args.continue_from_epoch == 'from_scratch':
            self.create_summary_csv = True

        elif self.args.continue_from_epoch == 'latest':
            checkpoint = os.path.join(self.saved_models_filepath, "train_model_latest")
            print("attempting to find existing checkpoint", )
            if os.path.exists(checkpoint):
                self.state = \
                    self.model.load_model(model_save_dir=self.saved_models_filepath, model_name="train_model",
                                          model_idx='latest')
                self.start_epoch = int(self.state['current_iter'] / self.args.total_iter_per_epoch)

            else:
                self.args.continue_from_epoch = 'from_scratch'
                self.create_summary_csv = True
        elif int(self.args.continue_from_epoch) >= 0:
            self.state = \
                self.model.load_model(model_save_dir=self.saved_models_filepath, model_name="train_model",
                                      model_idx=self.args.continue_from_epoch)
            self.start_epoch = int(self.state['current_iter'] / self.args.total_iter_per_epoch)

        self.data = data(args=args, current_iter=self.state['current_iter'])

        print("train_seed {}, val_seed: {}, at start time".format(self.data.dataset.seed["train"],
                                                                  self.data.dataset.seed["val"]))
        self.total_epochs_before_pause = self.args.total_epochs_before_pause
        self.state['best_epoch'] = int(self.state['best_val_iter'] / self.args.total_iter_per_epoch)
        self.epoch = int(self.state['current_iter'] / self.args.total_iter_per_epoch)
        self.augment_flag = 'omniglot' in self.args.dataset_name.lower()
        self.start_time = time.time()
        self.epochs_done_in_this_run = 0
        print(self.state['current_iter'], int(self.args.total_iter_per_epoch * self.args.total_epochs))

    def build_summary_dict(self, total_losses, phase, summary_losses=None):
        """
        Builds/Updates a summary dict directly from the metric dict of the current iteration.
        :param total_losses: Current dict with total losses (not aggregations) from experiment
        :param phase: Current training phase
        :param summary_losses: Current summarised (aggregated/summarised) losses stats means, stdv etc.
        :return: A new summary dict with the updated summary statistics information.
        """
        if summary_losses is None:
            summary_losses = {}

        for key in total_losses:
            summary_losses["{}_{}_mean".format(phase, key)] = np.mean(total_losses[key])
            summary_losses["{}_{}_std".format(phase, key)] = np.std(total_losses[key])

        return summary_losses

    def build_loss_summary_string(self, summary_losses):
        """
        Builds a progress bar summary string given current summary losses dictionary
        :param summary_losses: Current summary statistics
        :return: A summary string ready to be shown to humans.
        """
        output_update = ""
        for key, value in zip(list(summary_losses.keys()), list(summary_losses.values())):
            if "loss" in key or "accuracy" in key:
                value = float(value)
                output_update += "{}: {:.4f}, ".format(key, value)

        return output_update

    def merge_two_dicts(self, first_dict, second_dict):
        """Given two dicts, merge them into a new dict as a shallow copy."""
        z = first_dict.copy()
        z.update(second_dict)
        return z

    def train_iteration(self, train_sample, sample_idx, epoch_idx, total_losses, current_iter, pbar_train):
        """
        Runs a training iteration, updates the progress bar and returns the total and current epoch train losses.
        :param train_sample: A sample from the data provider
        :param sample_idx: The index of the incoming sample, in relation to the current training run.
        :param epoch_idx: The epoch index.
        :param total_losses: The current total losses dictionary to be updated.
        :param current_iter: The current training iteration in relation to the whole experiment.
        :param pbar_train: The progress bar of the training.
        :return: Updates total_losses, train_losses, current_iter
        """
        x_support_set, x_target_set, y_support_set, y_target_set, seed = train_sample
        data_batch = (x_support_set, x_target_set, y_support_set, y_target_set)

        if sample_idx == 0:
            print("shape of data", x_support_set.shape, x_target_set.shape, y_support_set.shape,
                  y_target_set.shape)

        losses, _ = self.model.run_train_iter(data_batch=data_batch, epoch=epoch_idx)

        for key, value in zip(list(losses.keys()), list(losses.values())):
            if key not in total_losses:
                total_losses[key] = [float(value)]
            else:
                total_losses[key].append(float(value))

        train_losses = self.build_summary_dict(total_losses=total_losses, phase="train")
        train_output_update = self.build_loss_summary_string(losses)

        pbar_train.update(1)
        pbar_train.set_description("training phase {} -> {}".format(self.epoch, train_output_update))

        current_iter += 1

        return train_losses, total_losses, current_iter

    def evaluation_iteration(self, val_sample, total_losses, pbar_val, phase):
        """
        Runs a validation iteration, updates the progress bar and returns the total and current epoch val losses.
        :param val_sample: A sample from the data provider
        :param total_losses: The current total losses dictionary to be updated.
        :param pbar_val: The progress bar of the val stage.
        :return: The updated val_losses, total_losses
        """
        x_support_set, x_target_set, y_support_set, y_target_set, seed = val_sample
        data_batch = (
            x_support_set, x_target_set, y_support_set, y_target_set)

        losses, _ = self.model.run_validation_iter(data_batch=data_batch)
        for key, value in zip(list(losses.keys()), list(losses.values())):
            if key not in total_losses:
                total_losses[key] = [float(value)]
            else:
                total_losses[key].append(float(value))

        val_losses = self.build_summary_dict(total_losses=total_losses, phase=phase)
        val_output_update = self.build_loss_summary_string(losses)

        pbar_val.update(1)
        pbar_val.set_description(
            "val_phase {} -> {}".format(self.epoch, val_output_update))

        return val_losses, total_losses

    def test_evaluation_iteration(self, val_sample, model_idx, sample_idx, per_model_per_batch_preds, pbar_test):
        """
        Runs a validation iteration, updates the progress bar and returns the total and current epoch val losses.
        :param val_sample: A sample from the data provider
        :param total_losses: The current total losses dictionary to be updated.
        :param pbar_test: The progress bar of the val stage.
        :return: The updated val_losses, total_losses
        """
        x_support_set, x_target_set, y_support_set, y_target_set, seed = val_sample
        data_batch = (
            x_support_set, x_target_set, y_support_set, y_target_set)

        losses, per_task_preds = self.model.run_validation_iter(data_batch=data_batch)

        per_model_per_batch_preds[model_idx].extend(list(per_task_preds))

        test_output_update = self.build_loss_summary_string(losses)

        pbar_test.update(1)
        pbar_test.set_description(
            "test_phase {} -> {}".format(self.epoch, test_output_update))

        return per_model_per_batch_preds

    def save_models(self, model, epoch, state):
        """
        Saves two separate instances of the current model. One to be kept for history and reloading later and another
        one marked as "latest" to be used by the system for the next epoch training. Useful when the training/val
        process is interrupted or stopped. Leads to fault tolerant training and validation systems that can continue
        from where they left off before.
        :param model: Current meta learning model of any instance within the few_shot_learning_system.py
        :param epoch: Current epoch
        :param state: Current model and experiment state dict.
        """
        model.save_model(model_save_dir=os.path.join(self.saved_models_filepath, "train_model_{}".format(int(epoch))),
                         state=state)

        model.save_model(model_save_dir=os.path.join(self.saved_models_filepath, "train_model_latest"),
                         state=state)

        print("saved models to", self.saved_models_filepath)

    def pack_and_save_metrics(self, start_time, create_summary_csv, train_losses, val_losses, state):
        """
        Given current epochs start_time, train losses, val losses and whether to create a new stats csv file, pack stats
        and save into a statistics csv file. Return a new start time for the new epoch.
        :param start_time: The start time of the current epoch
        :param create_summary_csv: A boolean variable indicating whether to create a new statistics file or
        append results to existing one
        :param train_losses: A dictionary with the current train losses
        :param val_losses: A dictionary with the currrent val loss
        :return: The current time, to be used for the next epoch.
        """
        epoch_summary_losses = self.merge_two_dicts(first_dict=train_losses, second_dict=val_losses)

        if 'per_epoch_statistics' not in state:
            state['per_epoch_statistics'] = {}

        for key, value in epoch_summary_losses.items():

            if key not in state['per_epoch_statistics']:
                state['per_epoch_statistics'][key] = [value]
            else:
                state['per_epoch_statistics'][key].append(value)

        epoch_summary_string = self.build_loss_summary_string(epoch_summary_losses)
        epoch_summary_losses["epoch"] = self.epoch
        epoch_summary_losses['epoch_run_time'] = time.time() - start_time

        if create_summary_csv:
            self.summary_statistics_filepath = save_statistics(self.logs_filepath, list(epoch_summary_losses.keys()),
                                                               create=True)
            self.create_summary_csv = False

        start_time = time.time()
        print("epoch {} -> {}".format(epoch_summary_losses["epoch"], epoch_summary_string))

        self.summary_statistics_filepath = save_statistics(self.logs_filepath,
                                                           list(epoch_summary_losses.values()))
        return start_time, state

    def evaluated_test_set_using_the_best_models(self, top_n_models):
        per_epoch_statistics = self.state['per_epoch_statistics']
        val_acc = np.copy(per_epoch_statistics['val_accuracy_mean'])
        val_idx = np.array([i for i in range(len(val_acc))])
        sorted_idx = np.argsort(val_acc, axis=0).astype(dtype=np.int32)[::-1][:top_n_models]

        sorted_val_acc = val_acc[sorted_idx]
        val_idx = val_idx[sorted_idx]
        print(sorted_idx)
        print(sorted_val_acc)

        top_n_idx = val_idx[:top_n_models]
        per_model_per_batch_preds = [[] for i in range(top_n_models)]
        per_model_per_batch_targets = [[] for i in range(top_n_models)]
        test_losses = [dict() for i in range(top_n_models)]
        for idx, model_idx in enumerate(top_n_idx):
            self.state = \
                self.model.load_model(model_save_dir=self.saved_models_filepath, model_name="train_model",
                                      model_idx=model_idx + 1)
            with tqdm.tqdm(total=int(self.args.num_evaluation_tasks / self.args.batch_size)) as pbar_test:
                for sample_idx, test_sample in enumerate(
                        self.data.get_test_batches(total_batches=int(self.args.num_evaluation_tasks / self.args.batch_size),
                                                   augment_images=False)):
                    #print(test_sample[4])
                    per_model_per_batch_targets[idx].extend(np.array(test_sample[3]))
                    per_model_per_batch_preds = self.test_evaluation_iteration(val_sample=test_sample,
                                                                               sample_idx=sample_idx,
                                                                               model_idx=idx,
                                                                               per_model_per_batch_preds=per_model_per_batch_preds,
                                                                               pbar_test=pbar_test)
        # for i in range(top_n_models):
        #     print("test assertion", 0)
        #     print(per_model_per_batch_targets[0], per_model_per_batch_targets[i])
        #     assert np.equal(np.array(per_model_per_batch_targets[0]), np.array(per_model_per_batch_targets[i]))

        per_batch_preds = np.mean(per_model_per_batch_preds, axis=0)
        #print(per_batch_preds.shape)
        per_batch_max = np.argmax(per_batch_preds, axis=2)
        per_batch_targets = np.array(per_model_per_batch_targets[0]).reshape(per_batch_max.shape)
        #print(per_batch_max)
        accuracy = np.mean(np.equal(per_batch_targets, per_batch_max))
        accuracy_std = np.std(np.equal(per_batch_targets, per_batch_max))

        test_losses = {"test_accuracy_mean": accuracy, "test_accuracy_std": accuracy_std}

        _ = save_statistics(self.logs_filepath,
                            list(test_losses.keys()),
                            create=True, filename="test_summary.csv")

        summary_statistics_filepath = save_statistics(self.logs_filepath,
                                                      list(test_losses.values()),
                                                      create=False, filename="test_summary.csv")
        print(test_losses)
        print("saved test performance at", summary_statistics_filepath)

    def run_experiment(self):
        """
        Runs a full training experiment with evaluations of the model on the val set at every epoch. Furthermore,
        will return the test set evaluation results on the best performing validation model.
        """
        with tqdm.tqdm(initial=self.state['current_iter'],
                           total=int(self.args.total_iter_per_epoch * self.args.total_epochs)) as pbar_train:

            while (self.state['current_iter'] < (self.args.total_epochs * self.args.total_iter_per_epoch)) and (self.args.evaluate_on_test_set_only == False):

                for train_sample_idx, train_sample in enumerate(
                        self.data.get_train_batches(total_batches=int(self.args.total_iter_per_epoch *
                                                                      self.args.total_epochs) - self.state[
                                                                      'current_iter'],
                                                    augment_images=self.augment_flag)):
                    # print(self.state['current_iter'], (self.args.total_epochs * self.args.total_iter_per_epoch))
                    train_losses, total_losses, self.state['current_iter'] = self.train_iteration(
                        train_sample=train_sample,
                        total_losses=self.total_losses,
                        epoch_idx=(self.state['current_iter'] /
                                   self.args.total_iter_per_epoch),
                        pbar_train=pbar_train,
                        current_iter=self.state['current_iter'],
                        sample_idx=self.state['current_iter'])

                    if self.state['current_iter'] % self.args.total_iter_per_epoch == 0:

                        total_losses = {}
                        val_losses = {}
                        with tqdm.tqdm(total=int(self.args.num_evaluation_tasks / self.args.batch_size)) as pbar_val:
                            for _, val_sample in enumerate(
                                    self.data.get_val_batches(total_batches=int(self.args.num_evaluation_tasks / self.args.batch_size),
                                                              augment_images=False)):
                                val_losses, total_losses = self.evaluation_iteration(val_sample=val_sample,
                                                                                     total_losses=total_losses,
                                                                                     pbar_val=pbar_val, phase='val')

                            if val_losses["val_accuracy_mean"] > self.state['best_val_acc']:
                                print("Best validation accuracy", val_losses["val_accuracy_mean"])
                                self.state['best_val_acc'] = val_losses["val_accuracy_mean"]
                                self.state['best_val_iter'] = self.state['current_iter']
                                self.state['best_epoch'] = int(
                                    self.state['best_val_iter'] / self.args.total_iter_per_epoch)


                        self.epoch += 1
                        self.state = self.merge_two_dicts(first_dict=self.merge_two_dicts(first_dict=self.state,
                                                                                          second_dict=train_losses),
                                                          second_dict=val_losses)

                        self.save_models(model=self.model, epoch=self.epoch, state=self.state)

                        self.start_time, self.state = self.pack_and_save_metrics(start_time=self.start_time,
                                                                                 create_summary_csv=self.create_summary_csv,
                                                                                 train_losses=train_losses,
                                                                                 val_losses=val_losses,
                                                                                 state=self.state)

                        self.total_losses = {}

                        self.epochs_done_in_this_run += 1

                        save_to_json(filename=os.path.join(self.logs_filepath, "summary_statistics.json"),
                                     dict_to_store=self.state['per_epoch_statistics'])

                        if self.epochs_done_in_this_run >= self.total_epochs_before_pause:
                            print("train_seed {}, val_seed: {}, at pause time".format(self.data.dataset.seed["train"],
                                                                                      self.data.dataset.seed["val"]))
                            sys.exit()
            self.evaluated_test_set_using_the_best_models(top_n_models=5)

# Data Loader

In [20]:
import json
import os
import numpy as np
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import tqdm
import concurrent.futures
import pickle
import torch
from torchvision import transforms
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# from utils.parser_utils import get_args


class rotate_image(object):

    def __init__(self, k, channels):
        self.k = k
        self.channels = channels

    def __call__(self, image):
        if self.channels == 1:
            if len(image.shape) == 3:
                image = image[:, :, 0]
                image = np.expand_dims(image, axis=2)

            elif len(image.shape) == 4:
                image = image[:, :, :, 0]
                image = np.expand_dims(image, axis=3)

        image = np.rot90(image, k=self.k).copy()
        return image


class torch_rotate_image(object):

    def __init__(self, k, channels):
        self.k = k
        self.channels = channels

    def __call__(self, image):
        rotate = transforms.RandomRotation(degrees=self.k * 90)
        if image.shape[-1] == 1:
            image = image[:, :, 0]
        image = Image.fromarray(image)
        image = rotate(image)
        image = np.array(image)
        if len(image.shape) == 2:
            image = np.expand_dims(image, axis=2)
        return image


def augment_image(image, k, channels, augment_bool, args, dataset_name):
    transform_train, transform_evaluation = get_transforms_for_dataset(dataset_name=dataset_name,
                                                                       args=args, k=k)
    if len(image.shape) > 3:
        images = [item for item in image]
        output_images = []
        for image in images:
            if augment_bool is True:
                for transform_current in transform_train:
                    image = transform_current(image)
            else:
                for transform_current in transform_evaluation:
                    image = transform_current(image)
            output_images.append(image)
        image = torch.stack(output_images)
    elif augment_bool is True:
        # meanstd transformation
        for transform_current in transform_train:
            image = transform_current(image)
    else:
        for transform_current in transform_evaluation:
            image = transform_current(image)
    return image


def get_transforms_for_dataset(dataset_name, args, k):
    if "cifar10" in dataset_name or "cifar100" in dataset_name:
        transform_train = [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(args.classification_mean, args.classification_std)]

        transform_evaluate = [
            transforms.ToTensor(),
            transforms.Normalize(args.classification_mean, args.classification_std)]

    elif 'omniglot' in dataset_name:

        transform_train = [rotate_image(k=k, channels=args.image_channels), transforms.ToTensor()]
        transform_evaluate = [transforms.ToTensor()]


    elif 'imagenet' in dataset_name:

        transform_train = [transforms.Compose([

            transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])]

        transform_evaluate = [transforms.Compose([

            transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])]

    return transform_train, transform_evaluate


class FewShotLearningDatasetParallel(Dataset):
    def __init__(self, args):
        """
        A data provider class inheriting from Pytorch's Dataset class. It takes care of creating task sets for
        our few-shot learning model training and evaluation
        :param args: Arguments in the form of a Bunch object. Includes all hyperparameters necessary for the
        data-provider. For transparency and readability reasons to explicitly set as self.object_name all arguments
        required for the data provider, such that the reader knows exactly what is necessary for the data provider/
        """
        self.data_path = args.dataset_path
        self.dataset_name = args.dataset_name
        self.data_loaded_in_memory = False
        self.image_height, self.image_width, self.image_channel = args.image_height, args.image_width, args.image_channels
        self.args = args
        self.indexes_of_folders_indicating_class = args.indexes_of_folders_indicating_class
        self.reverse_channels = args.reverse_channels
        self.labels_as_int = args.labels_as_int
        self.train_val_test_split = args.train_val_test_split
        self.current_set_name = "train"
        self.num_target_samples = args.num_target_samples
        self.reset_stored_filepaths = args.reset_stored_filepaths
        val_rng = np.random.RandomState(seed=args.val_seed)
        val_seed = val_rng.randint(1, 999999)
        train_rng = np.random.RandomState(seed=args.train_seed)
        train_seed = train_rng.randint(1, 999999)
        test_rng = np.random.RandomState(seed=args.val_seed)
        test_seed = test_rng.randint(1, 999999)
        args.val_seed = val_seed
        args.train_seed = train_seed
        args.test_seed = test_seed
        self.init_seed = {"train": args.train_seed, "val": args.val_seed, 'test': args.val_seed}
        self.seed = {"train": args.train_seed, "val": args.val_seed, 'test': args.val_seed}
        self.num_of_gpus = args.num_of_gpus
        self.batch_size = args.batch_size

        self.train_index = 0
        self.val_index = 0
        self.test_index = 0

        self.augment_images = False
        self.num_samples_per_class = args.num_samples_per_class
        self.num_classes_per_set = args.num_classes_per_set

        self.rng = np.random.RandomState(seed=self.seed['val'])
        self.datasets = self.load_dataset()

        self.indexes = {"train": 0, "val": 0, 'test': 0}
        self.dataset_size_dict = {
            "train": {key: len(self.datasets['train'][key]) for key in list(self.datasets['train'].keys())},
            "val": {key: len(self.datasets['val'][key]) for key in list(self.datasets['val'].keys())},
            'test': {key: len(self.datasets['test'][key]) for key in list(self.datasets['test'].keys())}}
        self.label_set = self.get_label_set()
        self.data_length = {name: np.sum([len(self.datasets[name][key])
                                          for key in self.datasets[name]]) for name in self.datasets.keys()}

        print("data", self.data_length)
        self.observed_seed_set = None

    def load_dataset(self):
        """
        Loads a dataset's dictionary files and splits the data according to the train_val_test_split variable stored
        in the args object.
        :return: Three sets, the training set, validation set and test sets (referred to as the meta-train,
        meta-val and meta-test in the paper)
        """
        rng = np.random.RandomState(seed=self.seed['val'])

        if self.args.sets_are_pre_split == True:
            print("Loading pre-split data")
            data_image_paths, index_to_label_name_dict_file, label_to_index = self.load_datapaths()
            dataset_splits = {}
            for key, value in data_image_paths.items():
                key = self.get_label_from_index(index=key)
                bits = key.split("/")
                set_name = bits[0]
                class_label = bits[1]
                if set_name not in dataset_splits:
                    dataset_splits[set_name] = {class_label: value}
                else:
                    dataset_splits[set_name][class_label] = value
        else:
            data_image_paths, index_to_label_name_dict_file, label_to_index = self.load_datapaths()
            total_label_types = len(data_image_paths)
            num_classes_idx = np.arange(len(data_image_paths.keys()), dtype=np.int32)
            rng.shuffle(num_classes_idx)
            keys = list(data_image_paths.keys())
            values = list(data_image_paths.values())
            new_keys = [keys[idx] for idx in num_classes_idx]
            new_values = [values[idx] for idx in num_classes_idx]
            data_image_paths = dict(zip(new_keys, new_values))
            # data_image_paths = self.shuffle(data_image_paths)
            x_train_id, x_val_id, x_test_id = int(self.train_val_test_split[0] * total_label_types), \
                                              int(np.sum(self.train_val_test_split[:2]) * total_label_types), \
                                              int(total_label_types)
            # print(x_train_id, x_val_id, x_test_id)
            # print("DATA IMAGE PATH FIRST KEY")
            test_first_class_key = list(data_image_paths.keys())[0]
            # print(test_first_class_key)
            # print(data_image_paths[test_first_class_key])
            x_train_classes = (class_key for class_key in list(data_image_paths.keys())[:x_train_id])
            x_val_classes = (class_key for class_key in list(data_image_paths.keys())[x_train_id:x_val_id])
            x_test_classes = (class_key for class_key in list(data_image_paths.keys())[x_val_id:x_test_id])
            x_train, x_val, x_test = {class_key: data_image_paths[class_key] for class_key in x_train_classes}, \
                                     {class_key: data_image_paths[class_key] for class_key in x_val_classes}, \
                                     {class_key: data_image_paths[class_key] for class_key in x_test_classes},
            dataset_splits = {"train": x_train, "val":x_val , "test": x_test}

        if self.args.load_into_memory is True:

            print("Loading data into RAM")
            x_loaded = {"train": [], "val": [], "test": []}

            for set_key, set_value in dataset_splits.items():
                print("Currently loading into memory the {} set".format(set_key))
                # print("Set value is {}".format(set_value))
                x_loaded[set_key] = {key: np.zeros(len(value), ) for key, value in set_value.items()}
                # for class_key, class_value in set_value.items():
                with tqdm.tqdm(total=len(set_value)) as pbar_memory_load:
                    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
                        # Process the list of files, but split the work across the process pool to use all CPUs!
                        for (class_label, class_images_loaded) in executor.map(self.load_parallel_batch, (set_value.items())):
                            x_loaded[set_key][class_label] = class_images_loaded
                            pbar_memory_load.update(1)

            dataset_splits = x_loaded
            self.data_loaded_in_memory = True

        return dataset_splits

    def load_datapaths(self):
        """
        If saved json dictionaries of the data are available, then this method loads the dictionaries such that the
        data is ready to be read. If the json dictionaries do not exist, then this method calls get_data_paths()
        which will build the json dictionary containing the class to filepath samples, and then store them.
        :return: data_image_paths: dict containing class to filepath list pairs.
                 index_to_label_name_dict_file: dict containing numerical indexes mapped to the human understandable
                 string-names of the class
                 label_to_index: dictionary containing human understandable string mapped to numerical indexes
        """
        dataset_dir = config["dataset_path"]
        data_path_file = "{}/{}.json".format(dataset_dir, self.dataset_name)
        self.index_to_label_name_dict_file = "{}/map_to_label_name_{}.json".format(dataset_dir, self.dataset_name)
        # print(self.index_to_label_name_dict_file)
        self.label_name_to_map_dict_file = "{}/label_name_to_map_{}.json".format(dataset_dir, self.dataset_name)
        # print(self.label_name_to_map_dict_file)

        if not os.path.exists(data_path_file):
            self.reset_stored_filepaths = True

        if self.reset_stored_filepaths == True:
            if os.path.exists(data_path_file):
                os.remove(data_path_file)
            self.reset_stored_filepaths = False

        try:
            data_image_paths = self.load_from_json(filename=data_path_file)
            #json name difference; takes in /content/datasets...
            #changed to datasets/... which is appended to new path
            label_to_index = self.load_from_json(filename=self.label_name_to_map_dict_file)
            index_to_label_name_dict_file = self.load_from_json(filename=self.index_to_label_name_dict_file)


            # print(data_image_paths)
            # print(index_to_label_name_dict_file)
            # print(label_to_index)
            return data_image_paths, index_to_label_name_dict_file, label_to_index
        except:
            print("Mapped data paths can't be found, remapping paths..")
            data_image_paths, code_to_label_name, label_name_to_code = self.get_data_paths()
            self.save_to_json(dict_to_store=data_image_paths, filename=data_path_file)
            self.save_to_json(dict_to_store=code_to_label_name, filename=self.index_to_label_name_dict_file)
            self.save_to_json(dict_to_store=label_name_to_code, filename=self.label_name_to_map_dict_file)
            return self.load_datapaths()

    def save_to_json(self, filename, dict_to_store):
        with open(os.path.abspath(filename), 'w') as f:
            json.dump(dict_to_store, fp=f)

    def load_from_json(self, filename):
        with open(filename, mode="r") as f:
            load_dict = json.load(fp=f)

        return load_dict

    def load_test_image(self, filepath):
        """
        Tests whether a target filepath contains an uncorrupted image. If image is corrupted, attempt to fix.
        :param filepath: Filepath of image to be tested
        :return: Return filepath of image if image exists and is uncorrupted (or attempt to fix has succeeded),
        else return None
        """
        image = None
        try:
            image = Image.open(filepath)
        except RuntimeWarning:
            os.system("convert {} -strip {}".format(filepath, filepath))
            print("converting")
            image = Image.open(filepath)
        except:
            print("Broken image")

        if image is not None:
            return filepath
        else:
            return None

    def get_data_paths(self):
        """
        Method that scans the dataset directory and generates class to image-filepath list dictionaries.
        :return: data_image_paths: dict containing class to filepath list pairs.
                 index_to_label_name_dict_file: dict containing numerical indexes mapped to the human understandable
                 string-names of the class
                 label_to_index: dictionary containing human understandable string mapped to numerical indexes
        """
        print("Get images from", self.data_path)
        data_image_path_list_raw = []
        labels = set()
        for subdir, dir, files in os.walk(self.data_path):
            for file in files:
                if (".jpeg") in file.lower() or (".png") in file.lower() or (".jpg") in file.lower():
                    filepath = os.path.abspath(os.path.join(subdir, file))
                    label = self.get_label_from_path(filepath)
                    data_image_path_list_raw.append(filepath)
                    labels.add(label)

        labels = sorted(labels)
        idx_to_label_name = {idx: label for idx, label in enumerate(labels)}
        label_name_to_idx = {label: idx for idx, label in enumerate(labels)}
        data_image_path_dict = {idx: [] for idx in list(idx_to_label_name.keys())}
        with tqdm.tqdm(total=len(data_image_path_list_raw)) as pbar_error:
            with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
                # Process the list of files, but split the work across the process pool to use all CPUs!
                for image_file in executor.map(self.load_test_image, (data_image_path_list_raw)):
                    pbar_error.update(1)
                    if image_file is not None:
                        label = self.get_label_from_path(image_file)
                        data_image_path_dict[label_name_to_idx[label]].append(image_file)

        return data_image_path_dict, idx_to_label_name, label_name_to_idx

    def get_label_set(self):
        """
        Generates a set containing all class numerical indexes
        :return: A set containing all class numerical indexes
        """
        index_to_label_name_dict_file = self.load_from_json(filename=self.index_to_label_name_dict_file)
        return set(list(index_to_label_name_dict_file.keys()))

    def get_index_from_label(self, label):
        """
        Given a class's (human understandable) string, returns the numerical index of that class
        :param label: A string of a human understandable class contained in the dataset
        :return: An int containing the numerical index of the given class-string
        """
        label_to_index = self.load_from_json(filename=self.label_name_to_map_dict_file)
        return label_to_index[label]

    def get_label_from_index(self, index):
        """
        Given an index return the human understandable label mapping to it.
        :param index: A numerical index (int)
        :return: A human understandable label (str)
        """
        index_to_label_name = self.load_from_json(filename=self.index_to_label_name_dict_file)
        return index_to_label_name[index]

    def get_label_from_path(self, filepath):
        """
        Given a path of an image generate the human understandable label for that image.
        :param filepath: The image's filepath
        :return: A human understandable label.
        """
        label_bits = filepath.split("/")
        label = "/".join([label_bits[idx] for idx in self.indexes_of_folders_indicating_class])
        if self.labels_as_int:
            label = int(label)
        return label

    def load_image(self, image_path, channels):
        """
        Given an image filepath and the number of channels to keep, load an image and keep the specified channels
        :param image_path: The image's filepath
        :param channels: The number of channels to keep
        :return: An image array of shape (h, w, channels), whose values range between 0.0 and 1.0.
        """
        if not self.data_loaded_in_memory:
            image = Image.open(image_path)
            if 'omniglot' in self.dataset_name:
                image = image.resize((self.image_height, self.image_width), resample=Image.LANCZOS)
                image = np.array(image, np.float32)
                if channels == 1:
                    image = np.expand_dims(image, axis=2)
            else:
                image = image.resize((self.image_height, self.image_width)).convert('RGB')
                image = np.array(image, np.float32)
                image = image / 255.0
        else:
            image = image_path

        return image

    def load_batch(self, batch_image_paths):
        """
        Load a batch of images, given a list of filepaths
        :param batch_image_paths: A list of filepaths
        :return: A numpy array of images of shape batch, height, width, channels
        """
        image_batch = []

        if self.data_loaded_in_memory:
            for image_path in batch_image_paths:
                image_batch.append(image_path)
            image_batch = np.array(image_batch, dtype=np.float32)
            #print(image_batch.shape)
        else:
            print("BATCH IMAGE PATH (no content?):")
            print(image_path)
            image_batch = [self.load_image(image_path=image_path, channels=self.image_channel)
                           for image_path in batch_image_paths]
            image_batch = np.array(image_batch, dtype=np.float32)
            image_batch = self.preprocess_data(image_batch)

        return image_batch

    def load_parallel_batch(self, inputs):
        """
        Load a batch of images, given a list of filepaths
        :param batch_image_paths: A list of filepaths
        :return: A numpy array of images of shape batch, height, width, channels
        """
        class_label, batch_image_paths = inputs
        image_batch = []

        if self.data_loaded_in_memory:
            for image_path in batch_image_paths:
                image_batch.append(np.copy(image_path))
            image_batch = np.array(image_batch, dtype=np.float32)
        else:
            #with tqdm.tqdm(total=1) as load_pbar:
            image_batch = [self.load_image(image_path=image_path, channels=self.image_channel)
                           for image_path in batch_image_paths]
                #load_pbar.update(1)

            image_batch = np.array(image_batch, dtype=np.float32)
            image_batch = self.preprocess_data(image_batch)

        return class_label, image_batch

    def preprocess_data(self, x):
        """
        Preprocesses data such that their shapes match the specified structures
        :param x: A data batch to preprocess
        :return: A preprocessed data batch
        """
        x_shape = x.shape
        x = np.reshape(x, (-1, x_shape[-3], x_shape[-2], x_shape[-1]))
        if self.reverse_channels is True:
            reverse_photos = np.ones(shape=x.shape)
            for channel in range(x.shape[-1]):
                reverse_photos[:, :, :, x.shape[-1] - 1 - channel] = x[:, :, :, channel]
            x = reverse_photos
        x = x.reshape(x_shape)
        return x

    def reconstruct_original(self, x):
        """
        Applies the reverse operations that preprocess_data() applies such that the data returns to their original form
        :param x: A batch of data to reconstruct
        :return: A reconstructed batch of data
        """
        x = x * 255.0
        return x

    def shuffle(self, x, rng):
        """
        Shuffles the data batch along it's first axis
        :param x: A data batch
        :return: A shuffled data batch
        """
        indices = np.arange(len(x))
        rng.shuffle(indices)
        x = x[indices]
        return x

    def get_set(self, dataset_name, seed, augment_images=False):
        """
        Generates a task-set to be used for training or evaluation
        :param set_name: The name of the set to use, e.g. "train", "val" etc.
        :return: A task-set containing an image and label support set, and an image and label target set.
        """
        #seed = seed % self.args.total_unique_tasks
        rng = np.random.RandomState(seed)

        # print(self.dataset_size_dict)
        selected_classes = rng.choice(list(self.dataset_size_dict[dataset_name].keys()),
                                      size=self.num_classes_per_set, replace=False)
        rng.shuffle(selected_classes)
        k_list = rng.randint(0, 4, size=self.num_classes_per_set)
        k_dict = {selected_class: k_item for (selected_class, k_item) in zip(selected_classes, k_list)}
        episode_labels = [i for i in range(self.num_classes_per_set)]
        class_to_episode_label = {selected_class: episode_label for (selected_class, episode_label) in
                                  zip(selected_classes, episode_labels)}

        x_images = []
        y_labels = []

        for class_entry in selected_classes:
            choose_samples_list = rng.choice(self.dataset_size_dict[dataset_name][class_entry],
                                             size=self.num_samples_per_class + self.num_target_samples, replace=False)
            class_image_samples = []
            class_labels = []
            for sample in choose_samples_list:
                choose_samples = self.datasets[dataset_name][class_entry][sample]
                x_class_data = self.load_batch([choose_samples])[0]
                k = k_dict[class_entry]
                x_class_data = augment_image(image=x_class_data, k=k,
                                             channels=self.image_channel, augment_bool=augment_images,
                                             dataset_name=self.dataset_name, args=self.args)
                class_image_samples.append(x_class_data)
                class_labels.append(int(class_to_episode_label[class_entry]))
            class_image_samples = torch.stack(class_image_samples)
            x_images.append(class_image_samples)
            y_labels.append(class_labels)

        x_images = torch.stack(x_images)
        y_labels = np.array(y_labels, dtype=np.float32)

        support_set_images = x_images[:, :self.num_samples_per_class]
        support_set_labels = y_labels[:, :self.num_samples_per_class]
        target_set_images = x_images[:, self.num_samples_per_class:]
        target_set_labels = y_labels[:, self.num_samples_per_class:]

        return support_set_images, target_set_images, support_set_labels, target_set_labels, seed

    def __len__(self):
        return self.data_length[self.current_set_name]

    def length(self, set_name):
        self.switch_set(set_name=set_name)
        return len(self)

    def set_augmentation(self, augment_images):
        self.augment_images = augment_images

    def switch_set(self, set_name, current_iter=None):
        self.current_set_name = set_name
        if set_name == "train":
            self.update_seed(dataset_name=set_name, seed=self.init_seed[set_name] + current_iter)

    def update_seed(self, dataset_name, seed=100):
        self.seed[dataset_name] = seed

    def __getitem__(self, idx):
        support_set_images, target_set_image, support_set_labels, target_set_label, seed = \
            self.get_set(self.current_set_name, seed=self.seed[self.current_set_name] + idx,
                         augment_images=self.augment_images)

        return support_set_images, target_set_image, support_set_labels, target_set_label, seed

    def reset_seed(self):
        self.seed = self.init_seed


class MetaLearningSystemDataLoader(object):
    def __init__(self, args, current_iter=0):
        """
        Initializes a meta learning system dataloader. The data loader uses the Pytorch DataLoader class to parallelize
        batch sampling and preprocessing.
        :param args: An arguments NamedTuple containing all the required arguments.
        :param current_iter: Current iter of experiment. Is used to make sure the data loader continues where it left
        of previously.
        """
        self.num_of_gpus = args.num_of_gpus
        self.batch_size = args.batch_size
        self.samples_per_iter = args.samples_per_iter
        self.num_workers = args.num_dataprovider_workers
        self.total_train_iters_produced = 0
        self.dataset = FewShotLearningDatasetParallel(args=args)
        self.batches_per_iter = args.samples_per_iter
        self.full_data_length = self.dataset.data_length
        self.continue_from_iter(current_iter=current_iter)
        self.args = args

    def get_dataloader(self):
        """
        Returns a data loader with the correct set (train, val or test), continuing from the current iter.
        :return:
        """
        return DataLoader(self.dataset, batch_size=(self.num_of_gpus * self.batch_size * self.samples_per_iter),
                          shuffle=False, num_workers=self.num_workers, drop_last=True)

    def continue_from_iter(self, current_iter):
        """
        Makes sure the data provider is aware of where we are in terms of training iterations in the experiment.
        :param current_iter:
        """
        self.total_train_iters_produced += (current_iter * (self.num_of_gpus * self.batch_size * self.samples_per_iter))

    def get_train_batches(self, total_batches=-1, augment_images=False):
        """
        Returns a training batches data_loader
        :param total_batches: The number of batches we want the data loader to sample
        :param augment_images: Whether we want the images to be augmented.
        """
        if total_batches == -1:
            self.dataset.data_length = self.full_data_length
        else:
            self.dataset.data_length["train"] = total_batches * self.dataset.batch_size
        self.dataset.switch_set(set_name="train", current_iter=self.total_train_iters_produced)
        self.dataset.set_augmentation(augment_images=augment_images)
        self.total_train_iters_produced += (self.num_of_gpus * self.batch_size * self.samples_per_iter)
        for sample_id, sample_batched in enumerate(self.get_dataloader()):
            yield sample_batched


    def get_val_batches(self, total_batches=-1, augment_images=False):
        """
        Returns a validation batches data_loader
        :param total_batches: The number of batches we want the data loader to sample
        :param augment_images: Whether we want the images to be augmented.
        """
        if total_batches == -1:
            self.dataset.data_length = self.full_data_length
        else:
            self.dataset.data_length['val'] = total_batches * self.dataset.batch_size
        self.dataset.switch_set(set_name="val")
        self.dataset.set_augmentation(augment_images=augment_images)
        for sample_id, sample_batched in enumerate(self.get_dataloader()):
            yield sample_batched


    def get_test_batches(self, total_batches=-1, augment_images=False):
        """
        Returns a testing batches data_loader
        :param total_batches: The number of batches we want the data loader to sample
        :param augment_images: Whether we want the images to be augmented.
        """
        if total_batches == -1:
            self.dataset.data_length = self.full_data_length
        else:
            self.dataset.data_length['test'] = total_batches * self.dataset.batch_size
        self.dataset.switch_set(set_name='test')
        self.dataset.set_augmentation(augment_images=augment_images)
        for sample_id, sample_batched in enumerate(self.get_dataloader()):
            yield sample_batched

# Train MAML

In [21]:
import json

config = {
  "batch_size":8,
  "image_height":28,
  "image_width":28,
  "image_channels":1,
  "gpu_to_use":0,
  "num_dataprovider_workers":4,
  "max_models_to_save":5,
  "dataset_name":"omniglot_dataset",
  "dataset_path":"/content/HowToTrainYourMAMLPytorch/datasets",
  "reset_stored_paths":False,
  "experiment_name":"omniglot_5_8_0.1_64_20_2",
  "train_seed": 2, "val_seed": 0,
  "train_val_test_split": [0.70918052988, 0.03080714725, 0.2606284658],
  "indexes_of_folders_indicating_class": [-3, -2],
  "load_from_npz_files": False,
  "sets_are_pre_split": False,
  "load_into_memory": True,
  "init_inner_loop_learning_rate": 0.1,
  "train_in_stages": False,
  "multi_step_loss_num_epochs": 10,
  "minimum_per_task_contribution": 0.01,
  "num_evaluation_tasks":600,
  "learnable_per_layer_per_step_inner_loop_learning_rate": True,
  "enable_inner_loop_optimizable_bn_params": False,

  "total_epochs": 150,
  "total_iter_per_epoch":500, "continue_from_epoch": -2,
  "evaluate_on_test_set_only": False,
  "max_pooling": True,
  "per_step_bn_statistics": True,
  "learnable_batch_norm_momentum": False,
  "evalute_on_test_set_only": False,
  "learnable_bn_gamma": True,
  "learnable_bn_beta": True,

  "weight_decay": 0.0,
  "dropout_rate_value":0.0,
  "min_learning_rate":0.00001,
  "meta_learning_rate":0.001,   "total_epochs_before_pause": 150,
  "first_order_to_second_order_epoch":-1,

  "norm_layer":"batch_norm",
  "cnn_num_filters":64,
  "num_stages":4,
  "conv_padding": True,
  "number_of_training_steps_per_iter":5,
  "number_of_evaluation_steps_per_iter":5,
  "cnn_blocks_per_stage":1,
  "num_classes_per_set":5,
  "num_samples_per_class":5,
  "num_target_samples": 1,

  "second_order": True,
  "use_multi_step_loss_optimization":True,


  # "seed": 2,

}


with open("omniglot_maml++-omniglot_5_8_0.1_64_20_2.json", "w") as outfile:
    json.dump(config, outfile)

In [22]:
# from torch import cuda


# def get_args():
#     import argparse
#     import os
#     import torch
#     import json
#     parser = argparse.ArgumentParser(description='Welcome to the MAML++ training and inference system')

#     parser.add_argument('--batch_size', nargs="?", type=int, default=32, help='Batch_size for experiment')
#     parser.add_argument('--image_height', nargs="?", type=int, default=28)
#     parser.add_argument('--image_width', nargs="?", type=int, default=28)
#     parser.add_argument('--image_channels', nargs="?", type=int, default=1)
#     parser.add_argument('--reset_stored_filepaths', type=str, default="False")
#     parser.add_argument('--reverse_channels', type=str, default="False")
#     parser.add_argument('--num_of_gpus', type=int, default=1)
#     parser.add_argument('--indexes_of_folders_indicating_class', nargs='+', default=[-2, -3])
#     parser.add_argument('--train_val_test_split', nargs='+', default=[0.73982737361, 0.26, 0.13008631319])
#     parser.add_argument('--samples_per_iter', nargs="?", type=int, default=1)
#     parser.add_argument('--labels_as_int', type=str, default="False")
#     parser.add_argument('--seed', type=int, default=104)

#     parser.add_argument('--gpu_to_use', type=int)
#     parser.add_argument('--num_dataprovider_workers', nargs="?", type=int, default=4)
#     parser.add_argument('--max_models_to_save', nargs="?", type=int, default=5)
#     parser.add_argument('--dataset_name', type=str, default="omniglot_dataset")
#     parser.add_argument('--dataset_path', type=str, default="datasets/omniglot_dataset")
#     parser.add_argument('--reset_stored_paths', type=str, default="False")
#     parser.add_argument('--experiment_name', nargs="?", type=str, )
#     parser.add_argument('--architecture_name', nargs="?", type=str)
#     parser.add_argument('--continue_from_epoch', nargs="?", type=str, default='latest', help='Continue from checkpoint of epoch')
#     parser.add_argument('--dropout_rate_value', type=float, default=0.3, help='Dropout_rate_value')
#     parser.add_argument('--num_target_samples', type=int, default=15, help='Dropout_rate_value')
#     parser.add_argument('--second_order', type=str, default="False", help='Dropout_rate_value')
#     parser.add_argument('--total_epochs', type=int, default=200, help='Number of epochs per experiment')
#     parser.add_argument('--total_iter_per_epoch', type=int, default=500, help='Number of iters per epoch')
#     parser.add_argument('--min_learning_rate', type=float, default=0.00001, help='Min learning rate')
#     parser.add_argument('--meta_learning_rate', type=float, default=0.001, help='Learning rate of overall MAML system')
#     parser.add_argument('--meta_opt_bn', type=str, default="False")
#     parser.add_argument('--task_learning_rate', type=float, default=0.1, help='Learning rate per task gradient step')

#     parser.add_argument('--norm_layer', type=str, default="batch_norm")
#     parser.add_argument('--max_pooling', type=str, default="False")
#     parser.add_argument('--per_step_bn_statistics', type=str, default="False")
#     parser.add_argument('--num_classes_per_set', type=int, default=20, help='Number of classes to sample per set')
#     parser.add_argument('--cnn_num_blocks', type=int, default=4, help='Number of classes to sample per set')
#     parser.add_argument('--number_of_training_steps_per_iter', type=int, default=1, help='Number of classes to sample per set')
#     parser.add_argument('--number_of_evaluation_steps_per_iter', type=int, default=1, help='Number of classes to sample per set')
#     parser.add_argument('--cnn_num_filters', type=int, default=64, help='Number of classes to sample per set')
#     parser.add_argument('--cnn_blocks_per_stage', type=int, default=1,
#                         help='Number of classes to sample per set')
#     parser.add_argument('--num_samples_per_class', type=int, default=1, help='Number of samples per set to sample')
#     parser.add_argument('--name_of_args_json_file', type=str, default="None")

#     args = parser.parse_args()
#     args_dict = vars(args)
#     if args.name_of_args_json_file is not "None":
#         args_dict = extract_args_from_json(args.name_of_args_json_file, args_dict)

#     for key in list(args_dict.keys()):

#         if str(args_dict[key]).lower() == "true":
#             args_dict[key] = True
#         elif str(args_dict[key]).lower() == "false":
#             args_dict[key] = False
#         if key == "dataset_path":
#             args_dict[key] = os.path.join(os.environ['DATASET_DIR'], args_dict[key])
#             print(key, os.path.join(os.environ['DATASET_DIR'], args_dict[key]))

#         print(key, args_dict[key], type(args_dict[key]))

#     args = Bunch(args_dict)


#     args.use_cuda = torch.cuda.is_available()
#     if torch.cuda.is_available():  # checks whether a cuda gpu is available and whether the gpu flag is True
#         device = torch.cuda.current_device()

#         print("use GPU", device)
#         print("GPU ID {}".format(torch.cuda.current_device()))

#     else:
#         print("use CPU")
#         device = torch.device('cpu')  # sets the device to be CPU


#     return args, device



# class Bunch(object):
#   def __init__(self, adict):
#     self.__dict__.update(adict)

# def extract_args_from_json(json_file_path, args_dict):
#     import json
#     summary_filename = json_file_path
#     with open(summary_filename) as f:
#         summary_dict = json.load(fp=f)

#     for key in summary_dict.keys():
#         if "continue_from" not in key and "gpu_to_use" not in key:
#             args_dict[key] = summary_dict[key]

#     return args_dict

In [23]:
import argparse
import os
import torch
import json

class Bunch(object):
    def __init__(self, adict):
        self.__dict__.update(adict)

def load_args_from_json(json_file_path):
    def extract_args_from_json(json_file_path, args_dict):
        with open(json_file_path) as f:
            summary_dict = json.load(fp=f)
        for key, value in summary_dict.items():
            if "continue_from" not in key and "gpu_to_use" not in key:
                args_dict[key] = value
        return args_dict

    parser = argparse.ArgumentParser(description='Welcome to the MAML++ training and inference system')

    parser.add_argument('--batch_size', type=int, default=32, help='Batch_size for experiment')
    parser.add_argument('--image_height', type=int, default=28)
    parser.add_argument('--image_width', type=int, default=28)
    parser.add_argument('--image_channels', type=int, default=1)
    parser.add_argument('--reset_stored_filepaths', type=str, default="False")
    parser.add_argument('--reverse_channels', type=str, default="False")
    parser.add_argument('--num_of_gpus', type=int, default=1)
    parser.add_argument('--indexes_of_folders_indicating_class', nargs='+', default=[-2, -3])
    parser.add_argument('--train_val_test_split', nargs='+', default=[0.73982737361, 0.26, 0.13008631319])
    parser.add_argument('--samples_per_iter', type=int, default=1)
    parser.add_argument('--labels_as_int', type=str, default="False")
    parser.add_argument('--seed', type=int, default=104)

    parser.add_argument('--gpu_to_use', type=int)
    parser.add_argument('--num_dataprovider_workers', type=int, default=4)
    parser.add_argument('--max_models_to_save', type=int, default=5)
    parser.add_argument('--dataset_name', type=str, default="omniglot_dataset")
    parser.add_argument('--dataset_path', type=str, default="datasets/omniglot_dataset")
    parser.add_argument('--reset_stored_paths', type=str, default="False")
    parser.add_argument('--experiment_name', type=str)
    parser.add_argument('--architecture_name', type=str)
    parser.add_argument('--continue_from_epoch', type=str, default='latest', help='Continue from checkpoint of epoch')
    parser.add_argument('--dropout_rate_value', type=float, default=0.3, help='Dropout_rate_value')
    parser.add_argument('--num_target_samples', type=int, default=15, help='Dropout_rate_value')
    parser.add_argument('--second_order', type=str, default="False", help='Dropout_rate_value')
    parser.add_argument('--total_epochs', type=int, default=200, help='Number of epochs per experiment')
    parser.add_argument('--total_iter_per_epoch', type=int, default=500, help='Number of iters per epoch')
    parser.add_argument('--min_learning_rate', type=float, default=0.00001, help='Min learning rate')
    parser.add_argument('--meta_learning_rate', type=float, default=0.001, help='Learning rate of overall MAML system')
    parser.add_argument('--meta_opt_bn', type=str, default="False")
    parser.add_argument('--task_learning_rate', type=float, default=0.1, help='Learning rate per task gradient step')

    parser.add_argument('--norm_layer', type=str, default="batch_norm")
    parser.add_argument('--max_pooling', type=str, default="False")
    parser.add_argument('--per_step_bn_statistics', type=str, default="False")
    parser.add_argument('--num_classes_per_set', type=int, default=20, help='Number of classes to sample per set')
    parser.add_argument('--cnn_num_blocks', type=int, default=4, help='Number of classes to sample per set')
    parser.add_argument('--number_of_training_steps_per_iter', type=int, default=1, help='Number of classes to sample per set')
    parser.add_argument('--number_of_evaluation_steps_per_iter', type=int, default=1, help='Number of classes to sample per set')
    parser.add_argument('--cnn_num_filters', type=int, default=64, help='Number of classes to sample per set')
    parser.add_argument('--cnn_blocks_per_stage', type=int, default=1, help='Number of classes to sample per set')
    parser.add_argument('--num_samples_per_class', type=int, default=1, help='Number of samples per set to sample')
    parser.add_argument('--name_of_args_json_file', type=str, default="None")

    args = parser.parse_args([])
    args_dict = vars(args)

    # Override args with JSON file values
    if json_file_path:
        args_dict = extract_args_from_json(json_file_path, args_dict)

    # Convert string-based booleans to actual booleans
    for key in args_dict:
        if isinstance(args_dict[key], str) and args_dict[key].lower() == "true":
            args_dict[key] = True
        elif isinstance(args_dict[key], str) and args_dict[key].lower() == "false":
            args_dict[key] = False

    # Resolve dataset path if environment variable is set
    if "dataset_path" in args_dict and config["dataset_path"]:
        args_dict["dataset_path"] = os.path.join(config["dataset_path"], args_dict["dataset_path"])

    args = Bunch(args_dict)

    # Check if CUDA is available
    args.use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if args.use_cuda else 'cpu')

    return args, device


In [24]:
import shutil

def maybe_unzip_dataset(args):

    datasets = [args.dataset_name]
    dataset_paths = [args.dataset_path]
    done = False

    for dataset_idx, dataset_path in enumerate(dataset_paths):
        if dataset_path.endswith('/'):
            dataset_path = dataset_path[:-1]
        # print(dataset_path)
        if not os.path.exists(dataset_path):
            print("Not found dataset folder structure.. searching for .tar.bz2 file")
            zip_directory = "{}.tar.bz2".format(os.path.join(config["dataset_path"], datasets[dataset_idx]))

            assert os.path.exists(os.path.abspath(zip_directory)), "{} dataset zip file not found" \
                                                  "place dataset in datasets folder as explained in README".format(os.path.abspath(zip_directory))
            print("Found zip file, unpacking")

            unzip_file(filepath_pack=os.path.join(config["dataset_path"], "{}.tar.bz2".format(datasets[dataset_idx])),
                       filepath_to_store=config["dataset_path"])



            args.reset_stored_filepaths = True

        total_files = 0
        for subdir, dir, files in os.walk(dataset_path):
            for file in files:
                if file.lower().endswith(".jpeg") or file.lower().endswith(".jpg") or file.lower().endswith(
                        ".png") or file.lower().endswith(".pkl"):
                    total_files += 1
        print("count stuff________________________________________", total_files)
        if (total_files == 1623 * 20 and datasets[dataset_idx] == 'omniglot_dataset') or (
                total_files == 100 * 600 and 'mini_imagenet' in datasets[dataset_idx]) or (
                total_files == 3 and 'mini_imagenet_pkl' in datasets[dataset_idx]):
            print("file count is correct")
            done = True
        elif datasets[dataset_idx] not in [
            'omniglot_dataset',
            'mini_imagenet',
            'mini_imagenet_pkl',
        ]:
            done = True
            print("using new dataset")

        if not done:
            shutil.rmtree(dataset_path, ignore_errors=True)
            maybe_unzip_dataset(args)


In [25]:
os.chdir('/content/HowToTrainYourMAMLPytorch')

In [None]:
# Combines the arguments, model, data and experiment builders to run an experiment

# python train_maml_system.py --name_of_args_json_file experiment_config/ --gpu_to_use 1
# import sys
# # Simulate command line arguments
# sys.argv = [  # Replace with the current file name
#            '--name_of_args_json_file', 'content/omniglot_maml++-omniglot_5_8_0.1_64_5_2.json',
#            '--gpu_to_use', '1',  # Dataset directory
#           #  '--experiment_name', 'omniglot_experiment',  # Experiment name
#           #  '--architecture_name', 'maml', # You'll likely need to provide an appropriate architecture name
#            # ... add other necessary arguments
#            ]

# args, device = get_args()
args, device = load_args_from_json("../omniglot_maml++-omniglot_5_8_0.1_64_20_2.json")

model = MAMLFewShotClassifier(args=args, device=device,
                              im_shape=(2, args.image_channels,
                                        args.image_height, args.image_width))
# maybe_unzip_dataset(args=args)
data = MetaLearningSystemDataLoader
maml_system = ExperimentBuilder(model=model, data=data, args=args, device=device)
maml_system.run_experiment()

Using max pooling
torch.Size([2, 64, 28, 28])
torch.Size([2, 64, 14, 14])
torch.Size([2, 64, 7, 7])
torch.Size([2, 64, 3, 3])
VGGNetwork build torch.Size([2, 5])
meta network params
layer_dict.conv0.conv.weight torch.Size([64, 1, 3, 3])
layer_dict.conv0.conv.bias torch.Size([64])
layer_dict.conv0.norm_layer.running_mean torch.Size([5, 64])
layer_dict.conv0.norm_layer.running_var torch.Size([5, 64])
layer_dict.conv0.norm_layer.bias torch.Size([5, 64])
layer_dict.conv0.norm_layer.weight torch.Size([5, 64])
layer_dict.conv1.conv.weight torch.Size([64, 64, 3, 3])
layer_dict.conv1.conv.bias torch.Size([64])
layer_dict.conv1.norm_layer.running_mean torch.Size([5, 64])
layer_dict.conv1.norm_layer.running_var torch.Size([5, 64])
layer_dict.conv1.norm_layer.bias torch.Size([5, 64])
layer_dict.conv1.norm_layer.weight torch.Size([5, 64])
layer_dict.conv2.conv.weight torch.Size([64, 64, 3, 3])
layer_dict.conv2.conv.bias torch.Size([64])
layer_dict.conv2.norm_layer.running_mean torch.Size([5, 64])


100%|██████████| 1150/1150 [00:06<00:00, 182.45it/s]


Currently loading into memory the val set


100%|██████████| 50/50 [00:00<00:00, 129.99it/s]


Currently loading into memory the test set


100%|██████████| 423/423 [00:02<00:00, 189.19it/s]


data {'train': 23000, 'val': 1000, 'test': 8460}
train_seed 875689, val_seed: 985773, at start time
0 75000




shape of data torch.Size([8, 5, 5, 1, 28, 28]) torch.Size([8, 5, 1, 1, 28, 28]) torch.Size([8, 5, 5]) torch.Size([8, 5, 1])


  and param.grad is not None
training phase 0 -> loss: 1.2018, accuracy: 0.8500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 1/75000 [00:02<47:01:54,  2.26s/it]

noise size: 0.095


training phase 0 -> loss: 1.0820, accuracy: 0.8000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 2/75000 [00:03<30:43:10,  1.47s/it]

noise size: 0.09025


training phase 0 -> loss: 1.0389, accuracy: 0.7250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 3/75000 [00:04<25:20:37,  1.22s/it]

noise size: 0.0857375


training phase 0 -> loss: 1.0370, accuracy: 0.7500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 4/75000 [00:05<25:48:56,  1.24s/it]

noise size: 0.08145062499999998


training phase 0 -> loss: 1.0575, accuracy: 0.7500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 5/75000 [00:06<21:59:31,  1.06s/it]

noise size: 0.07737809374999999


training phase 0 -> loss: 1.0060, accuracy: 0.8250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 6/75000 [00:06<19:19:03,  1.08it/s]

noise size: 0.07350918906249998


training phase 0 -> loss: 1.0187, accuracy: 0.7750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 7/75000 [00:07<17:34:00,  1.19it/s]

noise size: 0.06983372960937498


training phase 0 -> loss: 1.0384, accuracy: 0.7750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 8/75000 [00:08<16:38:44,  1.25it/s]

noise size: 0.06634204312890622


training phase 0 -> loss: 1.0273, accuracy: 0.8500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 9/75000 [00:08<15:49:16,  1.32it/s]

noise size: 0.0630249409724609


training phase 0 -> loss: 1.0261, accuracy: 0.6750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 10/75000 [00:09<15:27:53,  1.35it/s]

noise size: 0.05987369392383786


training phase 0 -> loss: 0.9263, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 11/75000 [00:10<15:00:31,  1.39it/s]

noise size: 0.05688000922764597


training phase 0 -> loss: 0.9762, accuracy: 0.7750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 12/75000 [00:10<14:48:12,  1.41it/s]

noise size: 0.05403600876626367


training phase 0 -> loss: 0.9035, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 13/75000 [00:11<14:33:27,  1.43it/s]

noise size: 0.05133420832795048


training phase 0 -> loss: 0.9390, accuracy: 0.8250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 14/75000 [00:12<14:21:34,  1.45it/s]

noise size: 0.04876749791155295


training phase 0 -> loss: 0.8891, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 15/75000 [00:12<14:18:10,  1.46it/s]

noise size: 0.046329123015975304


training phase 0 -> loss: 1.1188, accuracy: 0.7000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 16/75000 [00:13<14:11:13,  1.47it/s]

noise size: 0.04401266686517654


training phase 0 -> loss: 0.8773, accuracy: 0.8250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 17/75000 [00:14<14:06:51,  1.48it/s]

noise size: 0.04181203352191771


training phase 0 -> loss: 0.9264, accuracy: 0.8500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 18/75000 [00:14<14:07:26,  1.47it/s]

noise size: 0.039721431845821824


training phase 0 -> loss: 0.7931, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 19/75000 [00:15<14:06:45,  1.48it/s]

noise size: 0.037735360253530734


training phase 0 -> loss: 0.9390, accuracy: 0.8000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 20/75000 [00:16<15:34:55,  1.34it/s]

noise size: 0.035848592240854196


training phase 0 -> loss: 0.8463, accuracy: 0.8000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 21/75000 [00:17<16:59:51,  1.23it/s]

noise size: 0.03405616262881148


training phase 0 -> loss: 0.7072, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 22/75000 [00:18<18:18:20,  1.14it/s]

noise size: 0.03235335449737091


training phase 0 -> loss: 0.8284, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 23/75000 [00:19<17:00:29,  1.22it/s]

noise size: 0.030735686772502362


training phase 0 -> loss: 0.9804, accuracy: 0.6750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 24/75000 [00:19<16:05:36,  1.29it/s]

noise size: 0.029198902433877242


training phase 0 -> loss: 0.8981, accuracy: 0.7500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 25/75000 [00:20<15:30:48,  1.34it/s]

noise size: 0.027738957312183378


training phase 0 -> loss: 0.8728, accuracy: 0.8500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 26/75000 [00:21<15:08:54,  1.37it/s]

noise size: 0.026352009446574207


training phase 0 -> loss: 0.6619, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 27/75000 [00:21<14:45:31,  1.41it/s]

noise size: 0.025034408974245494


training phase 0 -> loss: 0.8089, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 28/75000 [00:22<14:36:16,  1.43it/s]

noise size: 0.023782688525533217


training phase 0 -> loss: 0.7965, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 29/75000 [00:23<14:33:41,  1.43it/s]

noise size: 0.022593554099256556


training phase 0 -> loss: 0.7285, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 30/75000 [00:23<14:24:00,  1.45it/s]

noise size: 0.021463876394293726


training phase 0 -> loss: 0.7427, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 31/75000 [00:24<14:25:27,  1.44it/s]

noise size: 0.020390682574579037


training phase 0 -> loss: 0.7781, accuracy: 0.8500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 32/75000 [00:25<14:19:30,  1.45it/s]

noise size: 0.019371148445850084


training phase 0 -> loss: 0.7290, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 33/75000 [00:25<14:12:40,  1.47it/s]

noise size: 0.01840259102355758


training phase 0 -> loss: 0.6483, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 34/75000 [00:26<14:17:16,  1.46it/s]

noise size: 0.0174824614723797


training phase 0 -> loss: 0.8343, accuracy: 0.8000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 35/75000 [00:27<14:13:43,  1.46it/s]

noise size: 0.016608338398760712


training phase 0 -> loss: 0.7364, accuracy: 0.8250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 36/75000 [00:28<14:17:44,  1.46it/s]

noise size: 0.015777921478822676


training phase 0 -> loss: 0.7644, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 37/75000 [00:28<14:47:56,  1.41it/s]

noise size: 0.014989025404881541


training phase 0 -> loss: 0.7641, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 38/75000 [00:29<15:53:11,  1.31it/s]

noise size: 0.014239574134637464


training phase 0 -> loss: 0.7700, accuracy: 0.8500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 39/75000 [00:30<17:42:20,  1.18it/s]

noise size: 0.01352759542790559


training phase 0 -> loss: 0.6762, accuracy: 0.8250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 40/75000 [00:31<18:22:24,  1.13it/s]

noise size: 0.012851215656510309


training phase 0 -> loss: 0.8472, accuracy: 0.8000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 41/75000 [00:32<17:05:06,  1.22it/s]

noise size: 0.012208654873684792


training phase 0 -> loss: 0.7740, accuracy: 0.8250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 42/75000 [00:33<16:09:34,  1.29it/s]

noise size: 0.011598222130000552


training phase 0 -> loss: 0.6998, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 43/75000 [00:33<15:37:52,  1.33it/s]

noise size: 0.011018311023500524


training phase 0 -> loss: 0.7009, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 44/75000 [00:34<15:08:03,  1.38it/s]

noise size: 0.010467395472325497


training phase 0 -> loss: 0.6138, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 45/75000 [00:35<14:54:55,  1.40it/s]

noise size: 0.009944025698709221


training phase 0 -> loss: 0.4739, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 46/75000 [00:35<14:45:04,  1.41it/s]

noise size: 0.00944682441377376


training phase 0 -> loss: 0.7921, accuracy: 0.8500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 47/75000 [00:36<14:28:12,  1.44it/s]

noise size: 0.00897448319308507


training phase 0 -> loss: 0.6066, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 48/75000 [00:37<14:24:00,  1.45it/s]

noise size: 0.008525759033430816


training phase 0 -> loss: 0.6527, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 49/75000 [00:37<14:19:10,  1.45it/s]

noise size: 0.008099471081759275


training phase 0 -> loss: 0.6332, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 50/75000 [00:38<14:11:28,  1.47it/s]

noise size: 0.007694497527671311


training phase 0 -> loss: 0.5497, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 51/75000 [00:39<14:10:19,  1.47it/s]

noise size: 0.007309772651287745


training phase 0 -> loss: 0.6840, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 52/75000 [00:39<14:19:25,  1.45it/s]

noise size: 0.006944284018723357


training phase 0 -> loss: 0.6755, accuracy: 0.8500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 53/75000 [00:40<14:13:57,  1.46it/s]

noise size: 0.006597069817787189


training phase 0 -> loss: 0.5701, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 54/75000 [00:41<14:14:10,  1.46it/s]

noise size: 0.006267216326897829


training phase 0 -> loss: 0.6133, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 55/75000 [00:42<15:05:46,  1.38it/s]

noise size: 0.005953855510552938


training phase 0 -> loss: 0.6873, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 56/75000 [00:42<16:16:44,  1.28it/s]

noise size: 0.005656162735025291


training phase 0 -> loss: 0.6154, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 57/75000 [00:44<18:04:04,  1.15it/s]

noise size: 0.005373354598274026


training phase 0 -> loss: 0.5742, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 58/75000 [00:44<18:35:39,  1.12it/s]

noise size: 0.005104686868360324


training phase 0 -> loss: 0.5643, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 59/75000 [00:45<17:11:56,  1.21it/s]

noise size: 0.004849452524942308


training phase 0 -> loss: 0.5741, accuracy: 0.8500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 60/75000 [00:46<16:13:49,  1.28it/s]

noise size: 0.004606979898695193


training phase 0 -> loss: 0.5404, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 61/75000 [00:47<15:39:14,  1.33it/s]

noise size: 0.004376630903760433


training phase 0 -> loss: 0.6192, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 62/75000 [00:47<15:08:27,  1.37it/s]

noise size: 0.0041577993585724116


training phase 0 -> loss: 0.5674, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 63/75000 [00:48<14:48:24,  1.41it/s]

noise size: 0.0039499093906437905


training phase 0 -> loss: 0.6133, accuracy: 0.8500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 64/75000 [00:49<14:40:03,  1.42it/s]

noise size: 0.0037524139211116006


training phase 0 -> loss: 0.6217, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 65/75000 [00:49<14:42:45,  1.41it/s]

noise size: 0.0035647932250560204


training phase 0 -> loss: 0.4553, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 66/75000 [00:50<14:35:51,  1.43it/s]

noise size: 0.003386553563803219


training phase 0 -> loss: 0.6256, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 67/75000 [00:51<14:26:53,  1.44it/s]

noise size: 0.003217225885613058


training phase 0 -> loss: 0.6917, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 68/75000 [00:51<14:45:11,  1.41it/s]

noise size: 0.0030563645913324047


training phase 0 -> loss: 0.5490, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 69/75000 [00:52<14:33:05,  1.43it/s]

noise size: 0.0029035463617657843


training phase 0 -> loss: 0.5716, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 70/75000 [00:53<14:27:38,  1.44it/s]

noise size: 0.002758369043677495


training phase 0 -> loss: 0.4928, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 71/75000 [00:53<14:29:59,  1.44it/s]

noise size: 0.00262045059149362


training phase 0 -> loss: 0.5511, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 72/75000 [00:54<14:25:31,  1.44it/s]

noise size: 0.002489428061918939


training phase 0 -> loss: 0.4206, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 73/75000 [00:55<15:42:53,  1.32it/s]

noise size: 0.002364956658822992


training phase 0 -> loss: 0.7021, accuracy: 0.8250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 74/75000 [00:56<16:48:56,  1.24it/s]

noise size: 0.002246708825881842


training phase 0 -> loss: 0.4535, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 75/75000 [00:57<18:02:50,  1.15it/s]

noise size: 0.00213437338458775


training phase 0 -> loss: 0.5204, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 76/75000 [00:58<17:40:06,  1.18it/s]

noise size: 0.0020276547153583622


training phase 0 -> loss: 0.5560, accuracy: 0.8250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 77/75000 [00:58<16:41:45,  1.25it/s]

noise size: 0.001926271979590444


training phase 0 -> loss: 0.4807, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 78/75000 [00:59<15:51:52,  1.31it/s]

noise size: 0.0018299583806109217


training phase 0 -> loss: 0.4846, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 79/75000 [01:00<15:11:39,  1.37it/s]

noise size: 0.0017384604615803755


training phase 0 -> loss: 0.5371, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 80/75000 [01:00<14:53:59,  1.40it/s]

noise size: 0.0016515374385013568


training phase 0 -> loss: 0.5810, accuracy: 0.8250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 81/75000 [01:01<14:51:45,  1.40it/s]

noise size: 0.0015689605665762888


training phase 0 -> loss: 0.5272, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 82/75000 [01:02<14:40:34,  1.42it/s]

noise size: 0.0014905125382474742


training phase 0 -> loss: 0.4117, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 83/75000 [01:03<14:31:28,  1.43it/s]

noise size: 0.0014159869113351004


training phase 0 -> loss: 0.5638, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 84/75000 [01:03<14:30:08,  1.43it/s]

noise size: 0.0013451875657683454


training phase 0 -> loss: 0.4221, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 85/75000 [01:04<14:23:49,  1.45it/s]

noise size: 0.001277928187479928


training phase 0 -> loss: 0.5225, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 86/75000 [01:05<14:25:17,  1.44it/s]

noise size: 0.0012140317781059316


training phase 0 -> loss: 0.3780, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 87/75000 [01:05<14:24:57,  1.44it/s]

noise size: 0.001153330189200635


training phase 0 -> loss: 0.5718, accuracy: 0.8500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 88/75000 [01:06<14:14:07,  1.46it/s]

noise size: 0.0010956636797406032


training phase 0 -> loss: 0.6206, accuracy: 0.8250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 89/75000 [01:07<14:18:27,  1.45it/s]

noise size: 0.001040880495753573


training phase 0 -> loss: 0.4455, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 90/75000 [01:07<14:33:53,  1.43it/s]

noise size: 0.0009888364709658944


training phase 0 -> loss: 0.3767, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 91/75000 [01:08<16:01:25,  1.30it/s]

noise size: 0.0009393946474175996


training phase 0 -> loss: 0.5519, accuracy: 0.8500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 92/75000 [01:09<17:13:03,  1.21it/s]

noise size: 0.0008924249150467197


training phase 0 -> loss: 0.6120, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 93/75000 [01:10<18:17:36,  1.14it/s]

noise size: 0.0008478036692943836


training phase 0 -> loss: 0.3604, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 94/75000 [01:11<17:02:40,  1.22it/s]

noise size: 0.0008054134858296644


training phase 0 -> loss: 0.4979, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 95/75000 [01:12<17:02:37,  1.22it/s]

noise size: 0.0007651428115381812


training phase 0 -> loss: 0.3991, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 96/75000 [01:12<16:14:03,  1.28it/s]

noise size: 0.000726885670961272


training phase 0 -> loss: 0.5382, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 97/75000 [01:13<15:38:48,  1.33it/s]

noise size: 0.0006905413874132084


training phase 0 -> loss: 0.4347, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 98/75000 [01:14<15:13:31,  1.37it/s]

noise size: 0.0006560143180425479


training phase 0 -> loss: 0.5276, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 99/75000 [01:15<15:04:30,  1.38it/s]

noise size: 0.0006232136021404205


training phase 0 -> loss: 0.4935, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 100/75000 [01:15<14:43:28,  1.41it/s]

noise size: 0.0005920529220333994


training phase 0 -> loss: 0.5283, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 101/75000 [01:16<16:18:38,  1.28it/s]

noise size: 0.0005624502759317294


training phase 0 -> loss: 0.5270, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 102/75000 [01:17<17:31:41,  1.19it/s]

noise size: 0.0005343277621351429


training phase 0 -> loss: 0.3750, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 103/75000 [01:18<18:12:24,  1.14it/s]

noise size: 0.0005076113740283857


training phase 0 -> loss: 0.4272, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 104/75000 [01:19<16:59:07,  1.22it/s]

noise size: 0.00048223080532696635


training phase 0 -> loss: 0.4102, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 105/75000 [01:19<16:04:13,  1.29it/s]

noise size: 0.00045811926506061804


training phase 0 -> loss: 0.5110, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 106/75000 [01:20<15:45:22,  1.32it/s]

noise size: 0.0004352133018075871


training phase 0 -> loss: 0.4484, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 107/75000 [01:21<16:47:55,  1.24it/s]

noise size: 0.00041345263671720774


training phase 0 -> loss: 0.4924, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 108/75000 [01:22<17:36:17,  1.18it/s]

noise size: 0.00039278000488134735


training phase 0 -> loss: 0.4890, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 109/75000 [01:23<18:38:08,  1.12it/s]

noise size: 0.00037314100463728


training phase 0 -> loss: 0.5292, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 110/75000 [01:24<17:44:03,  1.17it/s]

noise size: 0.000354483954405416


training phase 0 -> loss: 0.4274, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 111/75000 [01:25<16:50:04,  1.24it/s]

noise size: 0.00033675975668514516


training phase 0 -> loss: 0.4399, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 112/75000 [01:25<16:07:05,  1.29it/s]

noise size: 0.0003199217688508879


training phase 0 -> loss: 0.3759, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 113/75000 [01:26<15:28:16,  1.34it/s]

noise size: 0.00030392568040834347


training phase 0 -> loss: 0.3975, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 114/75000 [01:27<14:57:35,  1.39it/s]

noise size: 0.0002887293963879263


training phase 0 -> loss: 0.3955, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 115/75000 [01:27<14:55:17,  1.39it/s]

noise size: 0.00027429292656852995


training phase 0 -> loss: 0.3970, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 116/75000 [01:28<14:52:20,  1.40it/s]

noise size: 0.00026057828024010345


training phase 0 -> loss: 0.3478, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 117/75000 [01:29<14:31:56,  1.43it/s]

noise size: 0.00024754936622809826


training phase 0 -> loss: 0.4380, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 118/75000 [01:29<14:34:26,  1.43it/s]

noise size: 0.00023517189791669334


training phase 0 -> loss: 0.3705, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 119/75000 [01:30<14:32:05,  1.43it/s]

noise size: 0.00022341330302085867


training phase 0 -> loss: 0.4213, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 120/75000 [01:31<14:22:06,  1.45it/s]

noise size: 0.00021224263786981574


training phase 0 -> loss: 0.4077, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 121/75000 [01:31<14:19:00,  1.45it/s]

noise size: 0.00020163050597632494


training phase 0 -> loss: 0.5312, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 122/75000 [01:32<14:16:30,  1.46it/s]

noise size: 0.0001915489806775087


training phase 0 -> loss: 0.4307, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 123/75000 [01:33<14:19:41,  1.45it/s]

noise size: 0.00018197153164363326


training phase 0 -> loss: 0.2962, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 124/75000 [01:34<14:56:13,  1.39it/s]

noise size: 0.0001728729550614516


training phase 0 -> loss: 0.4339, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 125/75000 [01:34<16:05:31,  1.29it/s]

noise size: 0.00016422930730837902


training phase 0 -> loss: 0.3535, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 126/75000 [01:35<17:32:32,  1.19it/s]

noise size: 0.00015601784194296006


training phase 0 -> loss: 0.3909, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 127/75000 [01:36<18:09:46,  1.15it/s]

noise size: 0.00014821694984581206


training phase 0 -> loss: 0.4592, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 128/75000 [01:37<16:59:39,  1.22it/s]

noise size: 0.00014080610235352146


training phase 0 -> loss: 0.4873, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 129/75000 [01:38<16:17:49,  1.28it/s]

noise size: 0.0001337657972358454


training phase 0 -> loss: 0.3969, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 130/75000 [01:38<15:45:43,  1.32it/s]

noise size: 0.0001270775073740531


training phase 0 -> loss: 0.3447, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 131/75000 [01:39<15:16:53,  1.36it/s]

noise size: 0.00012072363200535044


training phase 0 -> loss: 0.3599, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 132/75000 [01:40<14:59:37,  1.39it/s]

noise size: 0.00011468745040508291


training phase 0 -> loss: 0.3535, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 133/75000 [01:41<14:56:05,  1.39it/s]

noise size: 0.00010895307788482875


training phase 0 -> loss: 0.3639, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 134/75000 [01:41<14:51:35,  1.40it/s]

noise size: 0.00010350542399058731


training phase 0 -> loss: 0.3235, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 135/75000 [01:42<14:46:37,  1.41it/s]

noise size: 9.833015279105794e-05


training phase 0 -> loss: 0.3780, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 136/75000 [01:43<14:48:11,  1.40it/s]

noise size: 9.341364515150504e-05


training phase 0 -> loss: 0.3844, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 137/75000 [01:43<14:35:06,  1.43it/s]

noise size: 8.874296289392978e-05


training phase 0 -> loss: 0.4254, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 138/75000 [01:44<14:30:58,  1.43it/s]

noise size: 8.430581474923329e-05


training phase 0 -> loss: 0.4126, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 139/75000 [01:45<14:25:52,  1.44it/s]

noise size: 8.009052401177162e-05


training phase 0 -> loss: 0.2664, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 140/75000 [01:45<14:16:58,  1.46it/s]

noise size: 7.608599781118304e-05


training phase 0 -> loss: 0.3573, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 141/75000 [01:46<14:29:52,  1.43it/s]

noise size: 7.228169792062389e-05


training phase 0 -> loss: 0.4079, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 142/75000 [01:47<15:42:15,  1.32it/s]

noise size: 6.866761302459269e-05


training phase 0 -> loss: 0.3636, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 143/75000 [01:48<16:50:26,  1.23it/s]

noise size: 6.523423237336306e-05


training phase 0 -> loss: 0.3012, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 144/75000 [01:49<18:09:16,  1.15it/s]

noise size: 6.19725207546949e-05


training phase 0 -> loss: 0.2950, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 145/75000 [01:50<17:37:37,  1.18it/s]

noise size: 5.8873894716960144e-05


training phase 0 -> loss: 0.4033, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 146/75000 [01:50<16:41:57,  1.25it/s]

noise size: 5.5930199981112136e-05


training phase 0 -> loss: 0.2653, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 147/75000 [01:51<16:15:44,  1.28it/s]

noise size: 5.3133689982056524e-05


training phase 0 -> loss: 0.3187, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 148/75000 [01:52<15:40:02,  1.33it/s]

noise size: 5.0477005482953695e-05


training phase 0 -> loss: 0.3454, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 149/75000 [01:53<15:07:48,  1.37it/s]

noise size: 4.7953155208806006e-05


training phase 0 -> loss: 0.3086, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 150/75000 [01:53<14:55:32,  1.39it/s]

noise size: 4.55554974483657e-05


training phase 0 -> loss: 0.3100, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 151/75000 [01:54<14:43:19,  1.41it/s]

noise size: 4.327772257594741e-05


training phase 0 -> loss: 0.3699, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 152/75000 [01:55<14:37:22,  1.42it/s]

noise size: 4.111383644715004e-05


training phase 0 -> loss: 0.4301, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 153/75000 [01:55<14:32:25,  1.43it/s]

noise size: 3.9058144624792534e-05


training phase 0 -> loss: 0.2714, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 154/75000 [01:56<14:46:35,  1.41it/s]

noise size: 3.7105237393552906e-05


training phase 0 -> loss: 0.2529, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 155/75000 [01:57<14:34:05,  1.43it/s]

noise size: 3.524997552387526e-05


training phase 0 -> loss: 0.3209, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 156/75000 [01:57<14:27:16,  1.44it/s]

noise size: 3.34874767476815e-05


training phase 0 -> loss: 0.2320, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 157/75000 [01:58<14:23:14,  1.44it/s]

noise size: 3.1813102910297426e-05


training phase 0 -> loss: 0.3157, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 158/75000 [01:59<14:17:19,  1.45it/s]

noise size: 3.0222447764782554e-05


training phase 0 -> loss: 0.3305, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 159/75000 [02:00<14:43:30,  1.41it/s]

noise size: 2.8711325376543424e-05


training phase 0 -> loss: 0.2624, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 160/75000 [02:00<16:17:46,  1.28it/s]

noise size: 2.727575910771625e-05


training phase 0 -> loss: 0.2900, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 161/75000 [02:02<18:05:55,  1.15it/s]

noise size: 2.5911971152330435e-05


training phase 0 -> loss: 0.3747, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 162/75000 [02:03<19:01:58,  1.09it/s]

noise size: 2.461637259471391e-05


training phase 0 -> loss: 0.3661, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 163/75000 [02:03<17:38:10,  1.18it/s]

noise size: 2.3385553964978216e-05


training phase 0 -> loss: 0.4157, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 164/75000 [02:04<16:34:25,  1.25it/s]

noise size: 2.2216276266729303e-05


training phase 0 -> loss: 0.2121, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 165/75000 [02:05<16:00:58,  1.30it/s]

noise size: 2.1105462453392836e-05


training phase 0 -> loss: 0.3461, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 166/75000 [02:05<15:38:53,  1.33it/s]

noise size: 2.0050189330723194e-05


training phase 0 -> loss: 0.4099, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 167/75000 [02:06<15:07:46,  1.37it/s]

noise size: 1.9047679864187035e-05


training phase 0 -> loss: 0.3175, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 168/75000 [02:07<14:49:04,  1.40it/s]

noise size: 1.8095295870977683e-05


training phase 0 -> loss: 0.2629, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 169/75000 [02:07<14:49:30,  1.40it/s]

noise size: 1.71905310774288e-05


training phase 0 -> loss: 0.3570, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 170/75000 [02:08<14:36:34,  1.42it/s]

noise size: 1.6331004523557357e-05


training phase 0 -> loss: 0.2599, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 171/75000 [02:09<14:32:42,  1.43it/s]

noise size: 1.5514454297379488e-05


training phase 0 -> loss: 0.3198, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 172/75000 [02:09<14:24:29,  1.44it/s]

noise size: 1.4738731582510512e-05


training phase 0 -> loss: 0.2120, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 173/75000 [02:10<14:15:43,  1.46it/s]

noise size: 1.4001795003384986e-05


training phase 0 -> loss: 0.3047, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 174/75000 [02:11<14:13:53,  1.46it/s]

noise size: 1.3301705253215736e-05


training phase 0 -> loss: 0.3231, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 175/75000 [02:12<14:10:45,  1.47it/s]

noise size: 1.2636619990554949e-05


training phase 0 -> loss: 0.2087, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 176/75000 [02:12<14:04:38,  1.48it/s]

noise size: 1.2004788991027201e-05


training phase 0 -> loss: 0.2837, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 177/75000 [02:13<15:16:38,  1.36it/s]

noise size: 1.140454954147584e-05


training phase 0 -> loss: 0.1866, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 178/75000 [02:14<16:24:16,  1.27it/s]

noise size: 1.0834322064402047e-05


training phase 0 -> loss: 0.2396, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 179/75000 [02:15<17:47:30,  1.17it/s]

noise size: 1.0292605961181944e-05


training phase 0 -> loss: 0.2693, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 180/75000 [02:16<17:45:27,  1.17it/s]

noise size: 9.777975663122847e-06


training phase 0 -> loss: 0.3872, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 181/75000 [02:17<16:37:47,  1.25it/s]

noise size: 9.289076879966705e-06


training phase 0 -> loss: 0.3354, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 182/75000 [02:17<15:54:35,  1.31it/s]

noise size: 8.82462303596837e-06


training phase 0 -> loss: 0.4779, accuracy: 0.7750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 183/75000 [02:18<15:26:25,  1.35it/s]

noise size: 8.38339188416995e-06


training phase 0 -> loss: 0.2373, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 184/75000 [02:19<14:58:55,  1.39it/s]

noise size: 7.964222289961452e-06


training phase 0 -> loss: 0.4307, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 185/75000 [02:19<14:49:00,  1.40it/s]

noise size: 7.566011175463379e-06


training phase 0 -> loss: 0.2580, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 186/75000 [02:20<15:25:42,  1.35it/s]

noise size: 7.18771061669021e-06


training phase 0 -> loss: 0.2616, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 187/75000 [02:21<15:04:38,  1.38it/s]

noise size: 6.8283250858556995e-06


training phase 0 -> loss: 0.2853, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 188/75000 [02:21<14:54:44,  1.39it/s]

noise size: 6.4869088315629144e-06


training phase 0 -> loss: 0.2625, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 189/75000 [02:22<14:40:12,  1.42it/s]

noise size: 6.162563389984768e-06


training phase 0 -> loss: 0.3182, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 190/75000 [02:23<14:36:11,  1.42it/s]

noise size: 5.85443522048553e-06


training phase 0 -> loss: 0.1866, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 191/75000 [02:24<14:36:19,  1.42it/s]

noise size: 5.561713459461253e-06


training phase 0 -> loss: 0.3359, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 192/75000 [02:24<14:29:34,  1.43it/s]

noise size: 5.28362778648819e-06


training phase 0 -> loss: 0.2771, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 193/75000 [02:25<14:24:40,  1.44it/s]

noise size: 5.019446397163781e-06


training phase 0 -> loss: 0.2553, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 194/75000 [02:26<14:40:11,  1.42it/s]

noise size: 4.768474077305592e-06


training phase 0 -> loss: 0.3023, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 195/75000 [02:27<16:21:03,  1.27it/s]

noise size: 4.530050373440312e-06


training phase 0 -> loss: 0.1952, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 196/75000 [02:28<17:49:35,  1.17it/s]

noise size: 4.303547854768296e-06


training phase 0 -> loss: 0.2137, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 197/75000 [02:29<18:35:00,  1.12it/s]

noise size: 4.088370462029881e-06


training phase 0 -> loss: 0.2248, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 198/75000 [02:29<17:14:14,  1.21it/s]

noise size: 3.883951938928387e-06


training phase 0 -> loss: 0.2673, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 199/75000 [02:30<16:16:58,  1.28it/s]

noise size: 3.689754341981967e-06


training phase 0 -> loss: 0.3642, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 200/75000 [02:31<15:35:57,  1.33it/s]

noise size: 3.5052666248828686e-06


training phase 0 -> loss: 0.2448, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 201/75000 [02:31<15:19:45,  1.36it/s]

noise size: 3.330003293638725e-06


training phase 0 -> loss: 0.1776, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 202/75000 [02:32<15:00:16,  1.38it/s]

noise size: 3.1635031289567887e-06


training phase 0 -> loss: 0.2006, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 203/75000 [02:33<14:52:59,  1.40it/s]

noise size: 3.005327972508949e-06


training phase 0 -> loss: 0.3136, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 204/75000 [02:33<14:40:59,  1.41it/s]

noise size: 2.8550615738835014e-06


training phase 0 -> loss: 0.2280, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 205/75000 [02:34<14:33:13,  1.43it/s]

noise size: 2.712308495189326e-06


training phase 0 -> loss: 0.1386, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 206/75000 [02:35<14:25:34,  1.44it/s]

noise size: 2.57669307042986e-06


training phase 0 -> loss: 0.2463, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 207/75000 [02:35<14:33:55,  1.43it/s]

noise size: 2.447858416908367e-06


training phase 0 -> loss: 0.3035, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 208/75000 [02:36<14:20:14,  1.45it/s]

noise size: 2.3254654960629483e-06


training phase 0 -> loss: 0.2443, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 209/75000 [02:37<14:15:26,  1.46it/s]

noise size: 2.2091922212598007e-06


training phase 0 -> loss: 0.1795, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 210/75000 [02:38<14:35:28,  1.42it/s]

noise size: 2.0987326101968105e-06


training phase 0 -> loss: 0.2341, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 211/75000 [02:38<14:22:47,  1.44it/s]

noise size: 1.9937959796869698e-06


training phase 0 -> loss: 0.1859, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 212/75000 [02:39<15:22:24,  1.35it/s]

noise size: 1.8941061807026212e-06


training phase 0 -> loss: 0.1481, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 213/75000 [02:40<16:25:15,  1.27it/s]

noise size: 1.79940087166749e-06


training phase 0 -> loss: 0.1790, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 214/75000 [02:41<17:32:07,  1.18it/s]

noise size: 1.7094308280841156e-06


training phase 0 -> loss: 0.2739, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 215/75000 [02:42<17:39:27,  1.18it/s]

noise size: 1.6239592866799097e-06


training phase 0 -> loss: 0.1895, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 216/75000 [02:43<16:47:36,  1.24it/s]

noise size: 1.5427613223459142e-06


training phase 0 -> loss: 0.2268, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 217/75000 [02:43<16:03:47,  1.29it/s]

noise size: 1.4656232562286185e-06


training phase 0 -> loss: 0.1716, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 218/75000 [02:44<15:36:13,  1.33it/s]

noise size: 1.3923420934171876e-06


training phase 0 -> loss: 0.2231, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 219/75000 [02:45<15:00:55,  1.38it/s]

noise size: 1.3227249887463282e-06


training phase 0 -> loss: 0.1797, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 220/75000 [02:45<14:54:56,  1.39it/s]

noise size: 1.2565887393090117e-06


training phase 0 -> loss: 0.2139, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 221/75000 [02:46<14:43:35,  1.41it/s]

noise size: 1.193759302343561e-06


training phase 0 -> loss: 0.2438, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 222/75000 [02:47<14:29:40,  1.43it/s]

noise size: 1.134071337226383e-06


training phase 0 -> loss: 0.3400, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 223/75000 [02:47<14:26:22,  1.44it/s]

noise size: 1.0773677703650638e-06


training phase 0 -> loss: 0.2063, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 224/75000 [02:48<14:24:18,  1.44it/s]

noise size: 1.0234993818468106e-06


training phase 0 -> loss: 0.3399, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 225/75000 [02:49<14:26:26,  1.44it/s]

noise size: 9.7232441275447e-07


training phase 0 -> loss: 0.2114, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 226/75000 [02:49<14:20:26,  1.45it/s]

noise size: 9.237081921167466e-07


training phase 0 -> loss: 0.2370, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 227/75000 [02:50<14:22:51,  1.44it/s]

noise size: 8.775227825109092e-07


training phase 0 -> loss: 0.2269, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 228/75000 [02:51<14:18:08,  1.45it/s]

noise size: 8.336466433853637e-07


training phase 0 -> loss: 0.2595, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 229/75000 [02:51<14:13:40,  1.46it/s]

noise size: 7.919643112160955e-07


training phase 0 -> loss: 0.1237, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 230/75000 [02:52<16:02:52,  1.29it/s]

noise size: 7.523660956552907e-07


training phase 0 -> loss: 0.1638, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 231/75000 [02:53<17:08:28,  1.21it/s]

noise size: 7.14747790872526e-07


training phase 0 -> loss: 0.3117, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 232/75000 [02:54<18:33:57,  1.12it/s]

noise size: 6.790104013288997e-07


training phase 0 -> loss: 0.2731, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 233/75000 [02:55<17:30:00,  1.19it/s]

noise size: 6.450598812624547e-07


training phase 0 -> loss: 0.4010, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 234/75000 [02:56<16:22:42,  1.27it/s]

noise size: 6.128068871993319e-07


training phase 0 -> loss: 0.3992, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 235/75000 [02:57<15:55:15,  1.30it/s]

noise size: 5.821665428393653e-07


training phase 0 -> loss: 0.1930, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 236/75000 [02:57<15:23:56,  1.35it/s]

noise size: 5.530582156973969e-07


training phase 0 -> loss: 0.1893, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 237/75000 [02:58<15:03:56,  1.38it/s]

noise size: 5.254053049125271e-07


training phase 0 -> loss: 0.1690, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 238/75000 [02:59<14:43:53,  1.41it/s]

noise size: 4.991350396669007e-07


training phase 0 -> loss: 0.2216, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 239/75000 [02:59<14:23:59,  1.44it/s]

noise size: 4.741782876835556e-07


training phase 0 -> loss: 0.2922, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 240/75000 [03:00<14:28:41,  1.43it/s]

noise size: 4.504693732993778e-07


training phase 0 -> loss: 0.2228, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 241/75000 [03:01<14:18:07,  1.45it/s]

noise size: 4.2794590463440887e-07


training phase 0 -> loss: 0.1726, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 242/75000 [03:01<14:07:26,  1.47it/s]

noise size: 4.065486094026884e-07


training phase 0 -> loss: 0.2926, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 243/75000 [03:02<14:10:02,  1.47it/s]

noise size: 3.8622117893255396e-07


training phase 0 -> loss: 0.2460, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 244/75000 [03:03<14:18:37,  1.45it/s]

noise size: 3.6691011998592624e-07


training phase 0 -> loss: 0.3406, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 245/75000 [03:03<14:41:06,  1.41it/s]

noise size: 3.485646139866299e-07


training phase 0 -> loss: 0.2372, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 246/75000 [03:04<14:28:48,  1.43it/s]

noise size: 3.311363832872984e-07


training phase 0 -> loss: 0.2730, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 247/75000 [03:05<15:01:20,  1.38it/s]

noise size: 3.1457956412293345e-07


training phase 0 -> loss: 0.1893, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 248/75000 [03:06<16:13:23,  1.28it/s]

noise size: 2.9885058591678676e-07


training phase 0 -> loss: 0.1493, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 249/75000 [03:07<17:57:15,  1.16it/s]

noise size: 2.839080566209474e-07


training phase 0 -> loss: 0.2775, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 250/75000 [03:08<18:57:50,  1.09it/s]

noise size: 2.6971265378990003e-07


training phase 0 -> loss: 0.2437, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 251/75000 [03:09<17:40:48,  1.17it/s]

noise size: 2.56227021100405e-07


training phase 0 -> loss: 0.2166, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 252/75000 [03:09<16:40:34,  1.25it/s]

noise size: 2.434156700453848e-07


training phase 0 -> loss: 0.2714, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 253/75000 [03:10<16:01:49,  1.30it/s]

noise size: 2.3124488654311553e-07


training phase 0 -> loss: 0.2119, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 254/75000 [03:11<15:35:03,  1.33it/s]

noise size: 2.1968264221595975e-07


training phase 0 -> loss: 0.1357, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 255/75000 [03:11<15:31:53,  1.34it/s]

noise size: 2.0869851010516177e-07


training phase 0 -> loss: 0.1563, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 256/75000 [03:12<15:14:59,  1.36it/s]

noise size: 1.9826358459990365e-07


training phase 0 -> loss: 0.1146, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 257/75000 [03:13<15:09:01,  1.37it/s]

noise size: 1.8835040536990848e-07


training phase 0 -> loss: 0.1963, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 258/75000 [03:14<15:00:03,  1.38it/s]

noise size: 1.7893288510141304e-07


training phase 0 -> loss: 0.2096, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 259/75000 [03:14<14:53:23,  1.39it/s]

noise size: 1.6998624084634237e-07


training phase 0 -> loss: 0.1568, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 260/75000 [03:15<14:47:57,  1.40it/s]

noise size: 1.6148692880402525e-07


training phase 0 -> loss: 0.2134, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 261/75000 [03:16<14:30:36,  1.43it/s]

noise size: 1.53412582363824e-07


training phase 0 -> loss: 0.2582, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 262/75000 [03:16<14:48:40,  1.40it/s]

noise size: 1.4574195324563278e-07


training phase 0 -> loss: 0.1791, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 263/75000 [03:17<14:37:35,  1.42it/s]

noise size: 1.3845485558335112e-07


training phase 0 -> loss: 0.3361, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 264/75000 [03:18<14:27:14,  1.44it/s]

noise size: 1.3153211280418356e-07


training phase 0 -> loss: 0.2146, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 265/75000 [03:19<15:48:38,  1.31it/s]

noise size: 1.2495550716397436e-07


training phase 0 -> loss: 0.1840, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 266/75000 [03:20<16:50:53,  1.23it/s]

noise size: 1.1870773180577564e-07


training phase 0 -> loss: 0.1934, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 267/75000 [03:21<18:38:15,  1.11it/s]

noise size: 1.1277234521548686e-07


training phase 0 -> loss: 0.2526, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 268/75000 [03:21<17:45:38,  1.17it/s]

noise size: 1.0713372795471251e-07


training phase 0 -> loss: 0.1601, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 269/75000 [03:22<16:38:38,  1.25it/s]

noise size: 1.0177704155697688e-07


training phase 0 -> loss: 0.2457, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 270/75000 [03:23<16:10:02,  1.28it/s]

noise size: 9.668818947912803e-08


training phase 0 -> loss: 0.1579, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 271/75000 [03:24<15:38:27,  1.33it/s]

noise size: 9.185378000517163e-08


training phase 0 -> loss: 0.1726, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 272/75000 [03:24<15:14:59,  1.36it/s]

noise size: 8.726109100491304e-08


training phase 0 -> loss: 0.2558, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 273/75000 [03:25<15:01:30,  1.38it/s]

noise size: 8.289803645466738e-08


training phase 0 -> loss: 0.2332, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 274/75000 [03:26<14:47:11,  1.40it/s]

noise size: 7.875313463193401e-08


training phase 0 -> loss: 0.2811, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 275/75000 [03:26<14:30:16,  1.43it/s]

noise size: 7.481547790033731e-08


training phase 0 -> loss: 0.2880, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 276/75000 [03:27<14:36:15,  1.42it/s]

noise size: 7.107470400532044e-08


training phase 0 -> loss: 0.1952, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 277/75000 [03:28<14:32:05,  1.43it/s]

noise size: 6.752096880505442e-08


training phase 0 -> loss: 0.2021, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 278/75000 [03:28<14:26:05,  1.44it/s]

noise size: 6.41449203648017e-08


training phase 0 -> loss: 0.1810, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 279/75000 [03:29<14:24:24,  1.44it/s]

noise size: 6.09376743465616e-08


training phase 0 -> loss: 0.1640, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 280/75000 [03:30<15:04:41,  1.38it/s]

noise size: 5.7890790629233524e-08


training phase 0 -> loss: 0.2580, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 281/75000 [03:31<15:07:15,  1.37it/s]

noise size: 5.4996251097771844e-08


training phase 0 -> loss: 0.1921, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 282/75000 [03:31<15:58:01,  1.30it/s]

noise size: 5.224643854288325e-08


training phase 0 -> loss: 0.2512, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 283/75000 [03:32<17:21:13,  1.20it/s]

noise size: 4.963411661573909e-08


training phase 0 -> loss: 0.1494, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 284/75000 [03:34<18:56:38,  1.10it/s]

noise size: 4.715241078495213e-08


training phase 0 -> loss: 0.1756, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 285/75000 [03:34<18:58:21,  1.09it/s]

noise size: 4.479479024570452e-08


training phase 0 -> loss: 0.2006, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 286/75000 [03:35<17:33:42,  1.18it/s]

noise size: 4.2555050733419295e-08


training phase 0 -> loss: 0.1775, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 287/75000 [03:36<16:34:56,  1.25it/s]

noise size: 4.042729819674833e-08


training phase 0 -> loss: 0.2419, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 288/75000 [03:37<16:02:02,  1.29it/s]

noise size: 3.840593328691091e-08


training phase 0 -> loss: 0.1958, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 289/75000 [03:37<15:35:22,  1.33it/s]

noise size: 3.648563662256537e-08


training phase 0 -> loss: 0.1701, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 290/75000 [03:38<15:20:04,  1.35it/s]

noise size: 3.4661354791437095e-08


training phase 0 -> loss: 0.1477, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 291/75000 [03:39<15:02:42,  1.38it/s]

noise size: 3.292828705186524e-08


training phase 0 -> loss: 0.2068, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 292/75000 [03:39<14:48:20,  1.40it/s]

noise size: 3.1281872699271974e-08


training phase 0 -> loss: 0.2322, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 293/75000 [03:40<14:39:04,  1.42it/s]

noise size: 2.9717779064308373e-08


training phase 0 -> loss: 0.2671, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 294/75000 [03:41<14:29:51,  1.43it/s]

noise size: 2.8231890111092953e-08


training phase 0 -> loss: 0.1619, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 295/75000 [03:41<14:24:31,  1.44it/s]

noise size: 2.6820295605538305e-08


training phase 0 -> loss: 0.1534, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 296/75000 [03:42<14:20:21,  1.45it/s]

noise size: 2.5479280825261387e-08


training phase 0 -> loss: 0.2468, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 297/75000 [03:43<14:19:22,  1.45it/s]

noise size: 2.4205316783998316e-08


training phase 0 -> loss: 0.1348, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 298/75000 [03:43<14:27:37,  1.43it/s]

noise size: 2.2995050944798398e-08


training phase 0 -> loss: 0.2062, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 299/75000 [03:44<14:33:36,  1.43it/s]

noise size: 2.1845298397558478e-08


training phase 0 -> loss: 0.2285, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 300/75000 [03:45<15:53:31,  1.31it/s]

noise size: 2.0753033477680553e-08


training phase 0 -> loss: 0.3143, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 301/75000 [03:46<16:49:16,  1.23it/s]

noise size: 1.9715381803796525e-08


training phase 0 -> loss: 0.1526, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 302/75000 [03:47<18:15:48,  1.14it/s]

noise size: 1.8729612713606698e-08


training phase 0 -> loss: 0.2040, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 303/75000 [03:48<17:17:57,  1.20it/s]

noise size: 1.779313207792636e-08


training phase 0 -> loss: 0.1239, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 304/75000 [03:49<16:36:18,  1.25it/s]

noise size: 1.6903475474030043e-08


training phase 0 -> loss: 0.1457, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 305/75000 [03:49<16:15:45,  1.28it/s]

noise size: 1.605830170032854e-08


training phase 0 -> loss: 0.3000, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 306/75000 [03:50<15:38:35,  1.33it/s]

noise size: 1.525538661531211e-08


training phase 0 -> loss: 0.2225, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 307/75000 [03:51<15:19:44,  1.35it/s]

noise size: 1.4492617284546504e-08


training phase 0 -> loss: 0.3475, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 308/75000 [03:51<15:04:23,  1.38it/s]

noise size: 1.3767986420319179e-08


training phase 0 -> loss: 0.2375, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 309/75000 [03:52<14:48:47,  1.40it/s]

noise size: 1.3079587099303219e-08


training phase 0 -> loss: 0.2068, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 310/75000 [03:53<14:44:30,  1.41it/s]

noise size: 1.2425607744338058e-08


training phase 0 -> loss: 0.2211, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 311/75000 [03:53<14:55:01,  1.39it/s]

noise size: 1.1804327357121154e-08


training phase 0 -> loss: 0.1397, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 312/75000 [03:54<14:42:43,  1.41it/s]

noise size: 1.1214110989265096e-08


training phase 0 -> loss: 0.1946, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 313/75000 [03:55<14:38:39,  1.42it/s]

noise size: 1.0653405439801841e-08


training phase 0 -> loss: 0.2526, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 314/75000 [03:56<14:35:29,  1.42it/s]

noise size: 1.0120735167811749e-08


training phase 0 -> loss: 0.2212, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 315/75000 [03:56<14:28:08,  1.43it/s]

noise size: 9.61469840942116e-09


training phase 0 -> loss: 0.1640, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 316/75000 [03:57<14:21:25,  1.44it/s]

noise size: 9.133963488950102e-09


training phase 0 -> loss: 0.1566, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 317/75000 [03:58<15:21:14,  1.35it/s]

noise size: 8.677265314502596e-09


training phase 0 -> loss: 0.1699, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 318/75000 [03:59<16:46:20,  1.24it/s]

noise size: 8.243402048777466e-09


training phase 0 -> loss: 0.2001, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 319/75000 [04:00<18:06:54,  1.15it/s]

noise size: 7.831231946338592e-09


training phase 0 -> loss: 0.1756, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 320/75000 [04:01<18:06:56,  1.15it/s]

noise size: 7.439670349021662e-09


training phase 0 -> loss: 0.1292, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 321/75000 [04:01<17:04:21,  1.22it/s]

noise size: 7.067686831570578e-09


training phase 0 -> loss: 0.1691, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 322/75000 [04:02<16:13:44,  1.28it/s]

noise size: 6.714302489992049e-09


training phase 0 -> loss: 0.1257, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 323/75000 [04:03<15:44:17,  1.32it/s]

noise size: 6.3785873654924456e-09


training phase 0 -> loss: 0.2482, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 324/75000 [04:03<15:14:57,  1.36it/s]

noise size: 6.059657997217823e-09


training phase 0 -> loss: 0.1737, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 325/75000 [04:04<15:05:15,  1.37it/s]

noise size: 5.756675097356932e-09


training phase 0 -> loss: 0.1923, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 326/75000 [04:05<14:48:27,  1.40it/s]

noise size: 5.468841342489085e-09


training phase 0 -> loss: 0.0730, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 327/75000 [04:05<14:40:00,  1.41it/s]

noise size: 5.19539927536463e-09


training phase 0 -> loss: 0.1377, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 328/75000 [04:06<14:40:26,  1.41it/s]

noise size: 4.9356293115963985e-09


training phase 0 -> loss: 0.1183, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 329/75000 [04:07<14:40:51,  1.41it/s]

noise size: 4.688847846016579e-09


training phase 0 -> loss: 0.1609, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 330/75000 [04:08<14:32:42,  1.43it/s]

noise size: 4.45440545371575e-09


training phase 0 -> loss: 0.1236, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 331/75000 [04:08<14:33:00,  1.43it/s]

noise size: 4.231685181029962e-09


training phase 0 -> loss: 0.1644, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 332/75000 [04:09<14:24:53,  1.44it/s]

noise size: 4.020100921978463e-09


training phase 0 -> loss: 0.1275, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 333/75000 [04:10<14:13:47,  1.46it/s]

noise size: 3.81909587587954e-09


training phase 0 -> loss: 0.1969, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 334/75000 [04:10<14:16:37,  1.45it/s]

noise size: 3.6281410820855627e-09


training phase 0 -> loss: 0.1473, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 335/75000 [04:11<15:47:57,  1.31it/s]

noise size: 3.4467340279812844e-09


training phase 0 -> loss: 0.1708, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 336/75000 [04:12<16:56:59,  1.22it/s]

noise size: 3.27439732658222e-09


training phase 0 -> loss: 0.1735, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 337/75000 [04:13<18:00:06,  1.15it/s]

noise size: 3.110677460253109e-09


training phase 0 -> loss: 0.1602, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 338/75000 [04:14<17:19:39,  1.20it/s]

noise size: 2.9551435872404534e-09


training phase 0 -> loss: 0.1277, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 339/75000 [04:15<16:32:42,  1.25it/s]

noise size: 2.8073864078784307e-09


training phase 0 -> loss: 0.3412, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 340/75000 [04:15<15:56:10,  1.30it/s]

noise size: 2.6670170874845092e-09


training phase 0 -> loss: 0.1220, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 341/75000 [04:16<15:23:48,  1.35it/s]

noise size: 2.533666233110284e-09


training phase 0 -> loss: 0.1194, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 342/75000 [04:17<15:03:16,  1.38it/s]

noise size: 2.4069829214547697e-09


training phase 0 -> loss: 0.2058, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 343/75000 [04:17<14:53:16,  1.39it/s]

noise size: 2.2866337753820313e-09


training phase 0 -> loss: 0.1824, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 344/75000 [04:18<14:38:03,  1.42it/s]

noise size: 2.1723020866129295e-09


training phase 0 -> loss: 0.1597, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 345/75000 [04:19<14:33:00,  1.43it/s]

noise size: 2.063686982282283e-09


training phase 0 -> loss: 0.2056, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 346/75000 [04:19<14:28:11,  1.43it/s]

noise size: 1.9605026331681687e-09


training phase 0 -> loss: 0.1818, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 347/75000 [04:20<14:12:39,  1.46it/s]

noise size: 1.8624775015097601e-09


training phase 0 -> loss: 0.1913, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 348/75000 [04:21<14:05:10,  1.47it/s]

noise size: 1.769353626434272e-09


training phase 0 -> loss: 0.2734, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 349/75000 [04:21<14:05:58,  1.47it/s]

noise size: 1.6808859451125584e-09


training phase 0 -> loss: 0.2114, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 350/75000 [04:22<14:20:10,  1.45it/s]

noise size: 1.5968416478569305e-09


training phase 0 -> loss: 0.0956, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 351/75000 [04:23<14:17:13,  1.45it/s]

noise size: 1.516999565464084e-09


training phase 0 -> loss: 0.1588, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 352/75000 [04:24<14:35:21,  1.42it/s]

noise size: 1.4411495871908797e-09


training phase 0 -> loss: 0.1375, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 353/75000 [04:25<15:58:39,  1.30it/s]

noise size: 1.3690921078313357e-09


training phase 0 -> loss: 0.3096, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 354/75000 [04:26<17:29:29,  1.19it/s]

noise size: 1.3006375024397688e-09


training phase 0 -> loss: 0.1871, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 355/75000 [04:27<18:33:28,  1.12it/s]

noise size: 1.2356056273177802e-09


training phase 0 -> loss: 0.1303, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 356/75000 [04:27<17:13:26,  1.20it/s]

noise size: 1.1738253459518912e-09


training phase 0 -> loss: 0.1497, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 357/75000 [04:28<16:22:48,  1.27it/s]

noise size: 1.1151340786542967e-09


training phase 0 -> loss: 0.1332, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 358/75000 [04:29<15:45:51,  1.32it/s]

noise size: 1.0593773747215819e-09


training phase 0 -> loss: 0.2773, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 359/75000 [04:29<15:12:32,  1.36it/s]

noise size: 1.0064085059855027e-09


training phase 0 -> loss: 0.2255, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 360/75000 [04:30<14:51:34,  1.40it/s]

noise size: 9.560880806862275e-10


training phase 0 -> loss: 0.1430, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 361/75000 [04:31<14:37:13,  1.42it/s]

noise size: 9.08283676651916e-10


training phase 0 -> loss: 0.1687, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 362/75000 [04:31<14:33:45,  1.42it/s]

noise size: 8.628694928193202e-10


training phase 0 -> loss: 0.1945, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 363/75000 [04:32<14:28:37,  1.43it/s]

noise size: 8.197260181783541e-10


training phase 0 -> loss: 0.2221, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 364/75000 [04:33<14:29:34,  1.43it/s]

noise size: 7.787397172694363e-10


training phase 0 -> loss: 0.1501, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 365/75000 [04:33<14:21:55,  1.44it/s]

noise size: 7.398027314059645e-10


training phase 0 -> loss: 0.2355, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 366/75000 [04:34<14:25:40,  1.44it/s]

noise size: 7.028125948356663e-10


training phase 0 -> loss: 0.2219, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 367/75000 [04:35<14:26:22,  1.44it/s]

noise size: 6.676719650938829e-10


training phase 0 -> loss: 0.2777, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 368/75000 [04:36<14:22:25,  1.44it/s]

noise size: 6.342883668391888e-10


training phase 0 -> loss: 0.2381, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 369/75000 [04:36<14:27:37,  1.43it/s]

noise size: 6.025739484972293e-10


training phase 0 -> loss: 0.3436, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 370/75000 [04:37<15:51:23,  1.31it/s]

noise size: 5.724452510723678e-10


training phase 0 -> loss: 0.2789, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 371/75000 [04:38<16:57:59,  1.22it/s]

noise size: 5.438229885187493e-10


training phase 0 -> loss: 0.2741, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 372/75000 [04:39<18:17:17,  1.13it/s]

noise size: 5.166318390928118e-10


training phase 0 -> loss: 0.4223, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 373/75000 [04:40<18:09:14,  1.14it/s]

noise size: 4.908002471381712e-10


training phase 0 -> loss: 0.2377, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 374/75000 [04:41<17:47:32,  1.17it/s]

noise size: 4.662602347812625e-10


training phase 0 -> loss: 0.3034, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   0%|          | 375/75000 [04:42<16:43:06,  1.24it/s]

noise size: 4.429472230421994e-10


training phase 0 -> loss: 0.2412, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 376/75000 [04:42<16:05:34,  1.29it/s]

noise size: 4.2079986189008943e-10


training phase 0 -> loss: 0.2031, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 377/75000 [04:43<15:38:01,  1.33it/s]

noise size: 3.997598687955849e-10


training phase 0 -> loss: 0.3348, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 378/75000 [04:44<15:08:49,  1.37it/s]

noise size: 3.7977187535580564e-10


training phase 0 -> loss: 0.2952, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 379/75000 [04:44<15:12:38,  1.36it/s]

noise size: 3.6078328158801534e-10


training phase 0 -> loss: 0.2230, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 380/75000 [04:45<14:54:28,  1.39it/s]

noise size: 3.4274411750861456e-10


training phase 0 -> loss: 0.3373, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 381/75000 [04:46<14:42:19,  1.41it/s]

noise size: 3.256069116331838e-10


training phase 0 -> loss: 0.2719, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 382/75000 [04:46<14:36:39,  1.42it/s]

noise size: 3.093265660515246e-10


training phase 0 -> loss: 0.1965, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 383/75000 [04:47<14:25:08,  1.44it/s]

noise size: 2.9386023774894836e-10


training phase 0 -> loss: 0.3561, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 384/75000 [04:48<14:28:02,  1.43it/s]

noise size: 2.7916722586150093e-10


training phase 0 -> loss: 0.1718, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 385/75000 [04:48<14:29:35,  1.43it/s]

noise size: 2.652088645684259e-10


training phase 0 -> loss: 0.3228, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 386/75000 [04:49<14:26:57,  1.43it/s]

noise size: 2.5194842134000456e-10


training phase 0 -> loss: 0.3202, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 387/75000 [04:50<14:55:48,  1.39it/s]

noise size: 2.393510002730043e-10


training phase 0 -> loss: 0.1484, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 388/75000 [04:51<16:11:17,  1.28it/s]

noise size: 2.273834502593541e-10


training phase 0 -> loss: 0.2526, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 389/75000 [04:52<17:36:09,  1.18it/s]

noise size: 2.1601427774638639e-10


training phase 0 -> loss: 0.4069, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 390/75000 [04:53<18:19:13,  1.13it/s]

noise size: 2.0521356385906706e-10


training phase 0 -> loss: 0.2631, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 391/75000 [04:54<17:16:02,  1.20it/s]

noise size: 1.949528856661137e-10


training phase 0 -> loss: 0.1942, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 392/75000 [04:54<16:26:41,  1.26it/s]

noise size: 1.85205241382808e-10


training phase 0 -> loss: 0.1919, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 393/75000 [04:55<15:52:19,  1.31it/s]

noise size: 1.7594497931366759e-10


training phase 0 -> loss: 0.3134, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 394/75000 [04:56<15:29:24,  1.34it/s]

noise size: 1.671477303479842e-10


training phase 0 -> loss: 0.1537, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 395/75000 [04:56<15:02:12,  1.38it/s]

noise size: 1.5879034383058498e-10


training phase 0 -> loss: 0.2012, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 396/75000 [04:57<14:46:05,  1.40it/s]

noise size: 1.5085082663905573e-10


training phase 0 -> loss: 0.1343, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 397/75000 [04:58<14:48:34,  1.40it/s]

noise size: 1.4330828530710293e-10


training phase 0 -> loss: 0.1774, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 398/75000 [04:58<14:49:34,  1.40it/s]

noise size: 1.3614287104174777e-10


training phase 0 -> loss: 0.2619, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 399/75000 [04:59<14:49:18,  1.40it/s]

noise size: 1.2933572748966038e-10


training phase 0 -> loss: 0.2950, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 400/75000 [05:00<14:43:42,  1.41it/s]

noise size: 1.2286894111517737e-10


training phase 0 -> loss: 0.1822, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 401/75000 [05:01<14:27:06,  1.43it/s]

noise size: 1.167254940594185e-10


training phase 0 -> loss: 0.0825, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 402/75000 [05:01<14:22:44,  1.44it/s]

noise size: 1.1088921935644756e-10


training phase 0 -> loss: 0.2851, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 403/75000 [05:02<14:24:05,  1.44it/s]

noise size: 1.0534475838862517e-10


training phase 0 -> loss: 0.3945, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 404/75000 [05:03<14:18:03,  1.45it/s]

noise size: 1.0007752046919391e-10


training phase 0 -> loss: 0.1508, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 405/75000 [05:03<15:34:54,  1.33it/s]

noise size: 9.507364444573421e-11


training phase 0 -> loss: 0.2247, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 406/75000 [05:04<16:49:53,  1.23it/s]

noise size: 9.03199622234475e-11


training phase 0 -> loss: 0.1605, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 407/75000 [05:05<18:10:59,  1.14it/s]

noise size: 8.580396411227512e-11


training phase 0 -> loss: 0.2066, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 408/75000 [05:06<17:53:18,  1.16it/s]

noise size: 8.151376590666135e-11


training phase 0 -> loss: 0.3119, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 409/75000 [05:07<16:55:15,  1.22it/s]

noise size: 7.743807761132829e-11


training phase 0 -> loss: 0.2153, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 410/75000 [05:08<16:20:29,  1.27it/s]

noise size: 7.356617373076187e-11


training phase 0 -> loss: 0.2189, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 411/75000 [05:08<15:44:39,  1.32it/s]

noise size: 6.988786504422378e-11


training phase 0 -> loss: 0.0912, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 412/75000 [05:09<15:18:53,  1.35it/s]

noise size: 6.639347179201259e-11


training phase 0 -> loss: 0.2462, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 413/75000 [05:10<15:03:51,  1.38it/s]

noise size: 6.307379820241195e-11


training phase 0 -> loss: 0.2825, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 414/75000 [05:11<15:13:03,  1.36it/s]

noise size: 5.992010829229135e-11


training phase 0 -> loss: 0.1797, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 415/75000 [05:11<14:55:33,  1.39it/s]

noise size: 5.6924102877676776e-11


training phase 0 -> loss: 0.3120, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 416/75000 [05:12<14:39:15,  1.41it/s]

noise size: 5.407789773379293e-11


training phase 0 -> loss: 0.1082, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 417/75000 [05:13<14:30:38,  1.43it/s]

noise size: 5.1374002847103286e-11


training phase 0 -> loss: 0.2022, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 418/75000 [05:13<14:26:50,  1.43it/s]

noise size: 4.880530270474812e-11


training phase 0 -> loss: 0.2481, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 419/75000 [05:14<14:20:14,  1.44it/s]

noise size: 4.6365037569510706e-11


training phase 0 -> loss: 0.2712, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 420/75000 [05:15<14:18:54,  1.45it/s]

noise size: 4.4046785691035167e-11


training phase 0 -> loss: 0.2787, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 421/75000 [05:15<14:18:15,  1.45it/s]

noise size: 4.1844446406483405e-11


training phase 0 -> loss: 0.1813, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 422/75000 [05:16<14:49:42,  1.40it/s]

noise size: 3.975222408615923e-11


training phase 0 -> loss: 0.1751, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 423/75000 [05:17<15:58:36,  1.30it/s]

noise size: 3.776461288185127e-11


training phase 0 -> loss: 0.2336, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 424/75000 [05:18<17:15:58,  1.20it/s]

noise size: 3.58763822377587e-11


training phase 0 -> loss: 0.1969, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 425/75000 [05:19<18:21:06,  1.13it/s]

noise size: 3.408256312587077e-11


training phase 0 -> loss: 0.1083, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 426/75000 [05:20<17:04:03,  1.21it/s]

noise size: 3.237843496957723e-11


training phase 0 -> loss: 0.2788, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 427/75000 [05:20<16:12:48,  1.28it/s]

noise size: 3.075951322109837e-11


training phase 0 -> loss: 0.0943, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 428/75000 [05:21<15:37:15,  1.33it/s]

noise size: 2.922153756004345e-11


training phase 0 -> loss: 0.1297, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 429/75000 [05:22<15:11:40,  1.36it/s]

noise size: 2.7760460682041273e-11


training phase 0 -> loss: 0.1493, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 430/75000 [05:22<14:59:08,  1.38it/s]

noise size: 2.6372437647939207e-11


training phase 0 -> loss: 0.2474, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 431/75000 [05:23<14:43:35,  1.41it/s]

noise size: 2.5053815765542246e-11


training phase 0 -> loss: 0.1146, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 432/75000 [05:24<14:38:29,  1.41it/s]

noise size: 2.3801124977265133e-11


training phase 0 -> loss: 0.3960, accuracy: 0.9000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 433/75000 [05:25<14:40:39,  1.41it/s]

noise size: 2.2611068728401876e-11


training phase 0 -> loss: 0.1525, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 434/75000 [05:25<14:41:09,  1.41it/s]

noise size: 2.1480515291981783e-11


training phase 0 -> loss: 0.2287, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 435/75000 [05:26<14:35:54,  1.42it/s]

noise size: 2.0406489527382693e-11


training phase 0 -> loss: 0.1429, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 436/75000 [05:27<14:34:55,  1.42it/s]

noise size: 1.9386165051013556e-11


training phase 0 -> loss: 0.1331, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 437/75000 [05:27<14:31:29,  1.43it/s]

noise size: 1.8416856798462876e-11


training phase 0 -> loss: 0.1829, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 438/75000 [05:28<14:30:51,  1.43it/s]

noise size: 1.749601395853973e-11


training phase 0 -> loss: 0.1918, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 439/75000 [05:29<14:27:12,  1.43it/s]

noise size: 1.6621213260612744e-11


training phase 0 -> loss: 0.2139, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 440/75000 [05:30<15:29:04,  1.34it/s]

noise size: 1.5790152597582105e-11


training phase 0 -> loss: 0.1089, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 441/75000 [05:31<16:59:20,  1.22it/s]

noise size: 1.5000644967703e-11


training phase 0 -> loss: 0.1884, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 442/75000 [05:32<18:11:28,  1.14it/s]

noise size: 1.4250612719317848e-11


training phase 0 -> loss: 0.1532, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 443/75000 [05:32<18:01:35,  1.15it/s]

noise size: 1.3538082083351955e-11


training phase 0 -> loss: 0.1534, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 444/75000 [05:33<16:55:14,  1.22it/s]

noise size: 1.2861177979184357e-11


training phase 0 -> loss: 0.1519, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 445/75000 [05:34<16:05:57,  1.29it/s]

noise size: 1.2218119080225138e-11


training phase 0 -> loss: 0.0865, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 446/75000 [05:35<15:37:03,  1.33it/s]

noise size: 1.1607213126213881e-11


training phase 0 -> loss: 0.2005, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 447/75000 [05:35<15:21:58,  1.35it/s]

noise size: 1.1026852469903187e-11


training phase 0 -> loss: 0.1589, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 448/75000 [05:36<15:00:17,  1.38it/s]

noise size: 1.0475509846408028e-11


training phase 0 -> loss: 0.2122, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 449/75000 [05:37<14:57:02,  1.39it/s]

noise size: 9.951734354087626e-12


training phase 0 -> loss: 0.2676, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 450/75000 [05:37<14:56:40,  1.39it/s]

noise size: 9.454147636383244e-12


training phase 0 -> loss: 0.1235, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 451/75000 [05:38<14:57:48,  1.38it/s]

noise size: 8.98144025456408e-12


training phase 0 -> loss: 0.1883, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 452/75000 [05:39<14:45:37,  1.40it/s]

noise size: 8.532368241835877e-12


training phase 0 -> loss: 0.2985, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 453/75000 [05:40<14:49:37,  1.40it/s]

noise size: 8.105749829744083e-12


training phase 0 -> loss: 0.0992, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 454/75000 [05:40<14:42:38,  1.41it/s]

noise size: 7.700462338256877e-12


training phase 0 -> loss: 0.3811, accuracy: 0.8750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 455/75000 [05:41<14:34:40,  1.42it/s]

noise size: 7.315439221344033e-12


training phase 0 -> loss: 0.1319, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 456/75000 [05:42<14:28:18,  1.43it/s]

noise size: 6.949667260276831e-12


training phase 0 -> loss: 0.1239, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 457/75000 [05:42<14:51:15,  1.39it/s]

noise size: 6.602183897262989e-12


training phase 0 -> loss: 0.0912, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 458/75000 [05:43<16:52:40,  1.23it/s]

noise size: 6.2720747023998394e-12


training phase 0 -> loss: 0.1555, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 459/75000 [05:45<18:40:09,  1.11it/s]

noise size: 5.958470967279847e-12


training phase 0 -> loss: 0.1179, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 460/75000 [05:45<19:04:08,  1.09it/s]

noise size: 5.660547418915855e-12


training phase 0 -> loss: 0.1053, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 461/75000 [05:46<17:28:08,  1.19it/s]

noise size: 5.3775200479700615e-12


training phase 0 -> loss: 0.1889, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 462/75000 [05:47<16:38:03,  1.24it/s]

noise size: 5.108644045571558e-12


training phase 0 -> loss: 0.1552, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 463/75000 [05:48<15:58:28,  1.30it/s]

noise size: 4.8532118432929804e-12


training phase 0 -> loss: 0.1586, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 464/75000 [05:48<15:33:39,  1.33it/s]

noise size: 4.610551251128331e-12


training phase 0 -> loss: 0.1881, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 465/75000 [05:49<16:04:10,  1.29it/s]

noise size: 4.380023688571914e-12


training phase 0 -> loss: 0.1429, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 466/75000 [05:50<15:40:16,  1.32it/s]

noise size: 4.161022504143318e-12


training phase 0 -> loss: 0.0950, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 467/75000 [05:51<15:26:51,  1.34it/s]

noise size: 3.952971378936151e-12


training phase 0 -> loss: 0.1413, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 468/75000 [05:51<15:04:36,  1.37it/s]

noise size: 3.755322809989344e-12


training phase 0 -> loss: 0.1298, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 469/75000 [05:52<14:50:46,  1.39it/s]

noise size: 3.5675566694898764e-12


training phase 0 -> loss: 0.1882, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 470/75000 [05:53<14:39:27,  1.41it/s]

noise size: 3.3891788360153825e-12


training phase 0 -> loss: 0.1822, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 471/75000 [05:53<14:27:26,  1.43it/s]

noise size: 3.219719894214613e-12


training phase 0 -> loss: 0.1775, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 472/75000 [05:54<14:24:28,  1.44it/s]

noise size: 3.0587338995038823e-12


training phase 0 -> loss: 0.1353, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 473/75000 [05:55<14:19:46,  1.44it/s]

noise size: 2.9057972045286882e-12


training phase 0 -> loss: 0.0998, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 474/75000 [05:55<14:12:46,  1.46it/s]

noise size: 2.7605073443022535e-12


training phase 0 -> loss: 0.1334, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 475/75000 [05:56<15:41:58,  1.32it/s]

noise size: 2.6224819770871405e-12


training phase 0 -> loss: 0.1731, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 476/75000 [05:57<17:08:38,  1.21it/s]

noise size: 2.4913578782327834e-12


training phase 0 -> loss: 0.1215, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 477/75000 [05:58<18:13:24,  1.14it/s]

noise size: 2.366789984321144e-12


training phase 0 -> loss: 0.1945, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 478/75000 [05:59<17:23:07,  1.19it/s]

noise size: 2.248450485105087e-12


training phase 0 -> loss: 0.1483, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 479/75000 [06:00<16:31:02,  1.25it/s]

noise size: 2.1360279608498322e-12


training phase 0 -> loss: 0.1605, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 480/75000 [06:00<15:53:21,  1.30it/s]

noise size: 2.0292265628073407e-12


training phase 0 -> loss: 0.1853, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 481/75000 [06:01<15:26:57,  1.34it/s]

noise size: 1.9277652346669735e-12


training phase 0 -> loss: 0.1045, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 482/75000 [06:02<15:00:45,  1.38it/s]

noise size: 1.8313769729336246e-12


training phase 0 -> loss: 0.1215, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 483/75000 [06:02<14:52:58,  1.39it/s]

noise size: 1.7398081242869432e-12


training phase 0 -> loss: 0.0545, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 484/75000 [06:03<14:46:28,  1.40it/s]

noise size: 1.652817718072596e-12


training phase 0 -> loss: 0.1660, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 485/75000 [06:04<14:35:21,  1.42it/s]

noise size: 1.570176832168966e-12


training phase 0 -> loss: 0.1543, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 486/75000 [06:05<14:32:24,  1.42it/s]

noise size: 1.4916679905605177e-12


training phase 0 -> loss: 0.2737, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 487/75000 [06:05<14:48:29,  1.40it/s]

noise size: 1.4170845910324916e-12


training phase 0 -> loss: 0.2195, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 488/75000 [06:06<14:34:07,  1.42it/s]

noise size: 1.346230361480867e-12


training phase 0 -> loss: 0.1133, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 489/75000 [06:07<14:27:40,  1.43it/s]

noise size: 1.2789188434068236e-12


training phase 0 -> loss: 0.2795, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 490/75000 [06:07<14:20:19,  1.44it/s]

noise size: 1.2149729012364822e-12


training phase 0 -> loss: 0.1059, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 491/75000 [06:08<14:44:57,  1.40it/s]

noise size: 1.154224256174658e-12


training phase 0 -> loss: 0.1927, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 492/75000 [06:09<15:34:34,  1.33it/s]

noise size: 1.0965130433659251e-12


training phase 0 -> loss: 0.2156, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 493/75000 [06:10<16:35:08,  1.25it/s]

noise size: 1.0416873911976289e-12


training phase 0 -> loss: 0.1504, accuracy: 0.9750, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 494/75000 [06:11<18:05:44,  1.14it/s]

noise size: 9.896030216377474e-13


training phase 0 -> loss: 0.0821, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 495/75000 [06:12<17:58:57,  1.15it/s]

noise size: 9.4012287055586e-13


training phase 0 -> loss: 0.2404, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 496/75000 [06:12<16:54:35,  1.22it/s]

noise size: 8.931167270280669e-13


training phase 0 -> loss: 0.1805, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 497/75000 [06:13<16:06:45,  1.28it/s]

noise size: 8.484608906766635e-13


training phase 0 -> loss: 0.0863, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 498/75000 [06:14<15:53:17,  1.30it/s]

noise size: 8.060378461428303e-13


training phase 0 -> loss: 0.2273, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 499/75000 [06:15<15:33:06,  1.33it/s]

noise size: 7.657359538356887e-13


training phase 0 -> loss: 0.1101, accuracy: 1.0000, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|          | 500/75000 [06:15<15:21:11,  1.35it/s]

noise size: 7.274491561439042e-13



  0%|          | 0/75 [00:00<?, ?it/s][A
  1%|▏         | 1/75 [00:00<00:35,  2.10it/s][A
val_phase 0 -> loss: 0.1689, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   1%|▏         | 1/75 [00:00<00:35,  2.10it/s][A
val_phase 0 -> loss: 0.1689, accuracy: 0.9500, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   3%|▎         | 2/75 [00:00<00:25,  2.81it/s][A
val_phase 0 -> loss: 0.2598, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vector_1: 0.2000, loss_importance_vector_2: 0.2000, loss_importance_vector_3: 0.2000, loss_importance_vector_4: 0.2000, :   3%|▎         | 2/75 [00:00<00:25,  2.81it/s][A
val_phase 0 -> loss: 0.2598, accuracy: 0.9250, loss_importance_vector_0: 0.2000, loss_importance_vec

Best validation accuracy 0.9570000036557516
saved models to /content/HowToTrainYourMAMLPytorch/omniglot_5_8_0.1_64_20_2/saved_models
epoch 1 -> train_loss_mean: 0.3299, train_loss_std: 0.2189, train_accuracy_mean: 0.9451, train_accuracy_std: 0.0540, train_loss_importance_vector_0_mean: 0.2000, train_loss_importance_vector_0_std: 0.0000, train_loss_importance_vector_1_mean: 0.2000, train_loss_importance_vector_1_std: 0.0000, train_loss_importance_vector_2_mean: 0.2000, train_loss_importance_vector_2_std: 0.0000, train_loss_importance_vector_3_mean: 0.2000, train_loss_importance_vector_3_std: 0.0000, train_loss_importance_vector_4_mean: 0.2000, train_loss_importance_vector_4_std: 0.0000, val_loss_mean: 0.1528, val_loss_std: 0.0667, val_accuracy_mean: 0.9570, val_accuracy_std: 0.0302, val_loss_importance_vector_0_mean: 0.2000, val_loss_importance_vector_0_std: 0.0000, val_loss_importance_vector_1_mean: 0.2000, val_loss_importance_vector_1_std: 0.0000, val_loss_importance_vector_2_mean: 0.

training phase 1 -> loss: 0.1050, accuracy: 1.0000, loss_importance_vector_0: 0.1800, loss_importance_vector_1: 0.1800, loss_importance_vector_2: 0.1800, loss_importance_vector_3: 0.1800, loss_importance_vector_4: 0.2800, :   1%|          | 501/75000 [06:37<142:46:38,  6.90s/it]

noise size: 6.91076698336709e-13


training phase 1 -> loss: 0.1511, accuracy: 0.9750, loss_importance_vector_0: 0.1800, loss_importance_vector_1: 0.1800, loss_importance_vector_2: 0.1800, loss_importance_vector_3: 0.1800, loss_importance_vector_4: 0.2800, :   1%|          | 502/75000 [06:37<105:45:01,  5.11s/it]

noise size: 6.565228634198735e-13


training phase 1 -> loss: 0.2488, accuracy: 0.9250, loss_importance_vector_0: 0.1800, loss_importance_vector_1: 0.1800, loss_importance_vector_2: 0.1800, loss_importance_vector_3: 0.1800, loss_importance_vector_4: 0.2800, :   1%|          | 503/75000 [06:39<80:31:58,  3.89s/it]

noise size: 6.236967202488797e-13


training phase 1 -> loss: 0.0638, accuracy: 1.0000, loss_importance_vector_0: 0.1800, loss_importance_vector_1: 0.1800, loss_importance_vector_2: 0.1800, loss_importance_vector_3: 0.1800, loss_importance_vector_4: 0.2800, :   1%|          | 504/75000 [06:40<62:30:24,  3.02s/it]

noise size: 5.925118842364357e-13


training phase 1 -> loss: 0.1201, accuracy: 1.0000, loss_importance_vector_0: 0.1800, loss_importance_vector_1: 0.1800, loss_importance_vector_2: 0.1800, loss_importance_vector_3: 0.1800, loss_importance_vector_4: 0.2800, :   1%|          | 505/75000 [06:40<48:09:17,  2.33s/it]