# load data

# Load model

In [1]:
import torchvision
import torch
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Select last layer out

In [2]:
## MY

"""
Train a Sparse AutoEncoder model

Run on a macbook on a Shakespeare dataset as
python train.py --dataset=shakespeare_char --gpt_ckpt_dir=out_sc_1_2_32 --eval_iters=1 --eval_batch_size=16 --batch_size=128 --device=cpu --eval_interval=100 --n_features=1024 --resampling_interval=150 --wandb_log=True
"""
import os
import torch
import numpy as np
import time


## hyperparameters

# training
n_features = 8096
batch_size = 128 # batch size for autoencoder training
l1_coeff = 3e-3
learning_rate = 3e-4
resampling_interval = 25000 # number of training steps after which neuron resampling will be performed
num_resamples = 4 # number of times resampling is to be performed; it is done 4 times in Anthropic's paper
resampling_data_size = 819200
# evaluation
eval_batch_size = 16 # batch size (number of GPT contexts) for evaluation
eval_iters = 200 # number of iterations in the evaluation loop
eval_interval = 30 # number of training steps after which the autoencoder is evaluated
# I/O
save_checkpoint = True # whether to save model, optimizer, etc or not
save_interval = 10000 # number of training steps after which a checkpoint will be saved
out_dir = 'out' # directory containing trained autoencoder model weights
# wandb logging
wandb_log = True
# system
device = 'cuda'
# reproducibility
seed = 1442

# -----------------------------------------------------------------------------
config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
#exec(open('configurator.py').read()) # overrides from command line or config file
config = {k: globals()[k] for k in config_keys} # will be useful for logging
# -----------------------------------------------------------------------------

In [41]:
dataset = 'waterbird'
# if wandb_log:
#     import wandb
#     wandb.init(project=f'sparse-autoencoder-{dataset}')#, name=run_name, config=config)
if save_checkpoint:
    ckpt_path = os.path.join('/content/drive/MyDrive/last_layer')
    os.makedirs(ckpt_path, exist_ok=True)

In [36]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [18]:
import os
list_ = os.listdir('/content/drive/MyDrive/last_layer')

In [21]:
import pickle as pkl
with open('/content/drive/MyDrive/last_layer/' + list_[0], 'rb') as f:
  x = pkl.load(f)

In [26]:
train_list = [elem for elem in list_ if 'train' in elem]

In [27]:
valid_list = [elem for elem in list_ if 'val' in elem]

In [28]:
test_list = set(list_) - set(valid_list) - set(train_list)

In [30]:
len(train_list)

1272

In [25]:
"""
This file defines an AutoEncoder class, which also contains an implementation of neuron resampling.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F

class AutoEncoder(nn.Module):
    def __init__(self, n_inputs: int, n_latents: int, lam: float = 0.003, resampling_interval: int = 25000):
        """
        n_input: Number of inputs
        n_latents: Number of neurons in the hidden layer
        lam: L1-coefficient for Sparse Autoencoder
        resampling_interval: Number of training steps after which dead neurons will be resampled
        """
        super().__init__()
        self.n_inputs, self.n_latents = n_inputs, n_latents
        self.encoder = nn.Linear(n_inputs, n_latents)
        self.relu = nn.ReLU()
        self.decoder = nn.Linear(n_latents, n_inputs)
        self.lam = lam
        self.resampling_interval = resampling_interval
        self.dead_neurons = None
        self.normalize_decoder_columns()

    def forward(self, x):
        latents = self.encode(x)
        reconstructed = self.decode(latents)
        loss = self.calculate_loss(x, latents, reconstructed)

        if self.training:
            return {'loss': loss, 'latents': latents}
        else:
            return {
                'loss': loss,
                'latents': latents,
                'reconst_acts': reconstructed,
                'mse_loss': self.mse_loss(reconstructed, x),
                'l1_loss': self.l1_loss(latents)
            }

    def encode(self, x):
        bias_corrected_input = x - self.decoder.bias
        return self.relu(self.encoder(bias_corrected_input))

    def decode(self, encoded):
        return self.decoder(encoded)

    def calculate_loss(self, x, encoded, reconstructed):
        mse_loss = self.mse_loss(reconstructed, x)
        l1_loss = self.l1_loss(encoded)
        return mse_loss + self.lam * l1_loss

    def mse_loss(self, reconstructed, original):
        return F.mse_loss(reconstructed, original)

    def l1_loss(self, encoded):
        return F.l1_loss(encoded, torch.zeros_like(encoded), reduction='sum') / encoded.shape[0]

    @torch.no_grad()
    def get_feature_activations(self, inputs, start_idx, end_idx):
        """
        Computes the activations of a subset of features in the hidden layer.

        :param inputs: Input tensor of shape (..., n) where n = d_MLP. It includes batch dimensions.
        :param start_idx: Starting index (inclusive) of the feature subset.
        :param end_idx: Ending index (exclusive) of the feature subset.

        Returns the activations for the specified feature range, reducing computation by
        only processing the necessary part of the network's weights and biases.
        """
        adjusted_inputs = inputs - self.decoder.bias  # Adjust input to account for decoder bias
        weight_subset = self.encoder.weight[start_idx:end_idx, :].t()  # Transpose the subset of weights
        bias_subset = self.encoder.bias[start_idx:end_idx]

        activations = self.relu(adjusted_inputs @ weight_subset + bias_subset)

        return activations

    @torch.no_grad()
    def normalize_decoder_columns(self):
        """
        Normalize the decoder's weight vectors to have unit norm along the feature dimension.
        This normalization can help in maintaining the stability of the network's weights.
        """
        self.decoder.weight.data = F.normalize(self.decoder.weight.data, dim=0)

    def remove_parallel_component_of_decoder_grad(self):
        """
        Remove the component of the gradient parallel to the decoder's weight vectors.
        """
        unit_weights = F.normalize(self.decoder.weight, dim=0) # \hat{b}
        proj = (self.decoder.weight.grad * unit_weights).sum(dim=0) * unit_weights
        self.decoder.weight.grad = self.decoder.weight.grad - proj

    @staticmethod
    def is_dead_neuron_investigation_step(step, resampling_interval, num_resamples):
        """
        Determine if the current step is the start of a phase for investigating dead neurons.
        According to Anthropic's specified policy, it occurs at odd multiples of half the resampling interval.
        """
        return (step > 0) and step % (resampling_interval // 2) == 0 and (step // (resampling_interval // 2)) % 2 != 0 and step < resampling_interval * num_resamples

    @staticmethod
    def is_within_neuron_investigation_phase(step, resampling_interval, num_resamples):
        """
        Check if the current step is within a phase where active neurons are investigated.
        This phase occurs in intervals defined in the specified range, starting at odd multiples of half the resampling interval.
        """
        return any(milestone - resampling_interval // 2 <= step < milestone
                   for milestone in range(resampling_interval, resampling_interval * (num_resamples + 1), resampling_interval))

    @torch.no_grad()
    def initiate_dead_neurons(self):
        self.dead_neurons = set(range(self.n_latents))

    @torch.no_grad()
    def update_dead_neurons(self, latents):
        """
        Update the set of dead neurons based on the current feature activations.
        If a neuron is active (has non-zero activation), it is removed from the dead neuron set.
        """
        active_neurons = torch.nonzero(torch.count_nonzero(latents, dim=0), as_tuple=False).view(-1)
        self.dead_neurons.difference_update(active_neurons.tolist())

    @torch.no_grad()
    def resample_dead_neurons(self, data, optimizer, batch_size=8192):
        """
        Resample the dead neurons by resetting their weights and biases based on the characteristics
        of active neurons. Proceeds only if there are dead neurons to resample.
        """
        if not self.dead_neurons:
            return

        device = self._get_device()
        dead_neurons_t, alive_neurons = self._get_neuron_indices()
        average_enc_norm = self._compute_average_norm_of_alive_neurons(alive_neurons)
        probs = self._compute_loss_probabilities(data, batch_size, device)
        selected_examples = self._select_examples_based_on_probabilities(data, probs)

        self._resample_neurons(selected_examples, dead_neurons_t, average_enc_norm, device)
        self._update_optimizer_parameters(optimizer, dead_neurons_t)

        print('Dead neurons resampled successfully!')
        self.dead_neurons = None

    def _get_device(self):
        return next(self.parameters()).device

    def _get_neuron_indices(self):
        dead_neurons_t = torch.tensor(list(self.dead_neurons), device=self._get_device())
        alive_neurons = torch.tensor([i for i in range(self.n_latents) if i not in self.dead_neurons], device=self._get_device())
        return dead_neurons_t, alive_neurons

    def _compute_average_norm_of_alive_neurons(self, alive_neurons):
        return torch.linalg.vector_norm(self.encoder.weight[alive_neurons], dim=1).mean()

    def _compute_loss_probabilities(self, data, batch_size, device):
        num_batches = (len(data) + batch_size - 1) // batch_size
        probs = torch.zeros(len(data), device=device)
        for i in range(num_batches):
            batch_slice = slice(i * batch_size, (i + 1) * batch_size)
            x_batch = data[batch_slice].to(device)
            probs[batch_slice] = self._compute_batch_loss_squared(x_batch)
        return probs.cpu()

    def _compute_batch_loss_squared(self, x_batch):
        latents = self.encode(x_batch)
        reconst_acts = self.decode(latents)
        mselosses = F.mse_loss(reconst_acts, x_batch, reduction='none').sum(dim=1)
        l1losses = F.l1_loss(latents, torch.zeros_like(latents), reduction='none').sum(dim=1)
        return (mselosses + self.lam * l1losses).square()

    def _select_examples_based_on_probabilities(self, data, probs):
        selection_indices = torch.multinomial(probs, num_samples=len(self.dead_neurons))
        return data[selection_indices].to(dtype=torch.float32)

    def _resample_neurons(self, examples, dead_neurons_t, average_enc_norm, device):
        examples_unit_norm = F.normalize(examples, dim=1).to(device)
        self.decoder.weight[:, dead_neurons_t] = examples_unit_norm.T

        # Renormalize examples to have a certain norm and reset encoder weights and biases
        adjusted_examples = examples_unit_norm * average_enc_norm * 0.2
        self.encoder.weight[dead_neurons_t] = adjusted_examples
        self.encoder.bias[dead_neurons_t] = 0

    def _update_optimizer_parameters(self, optimizer, dead_neurons_t):
        for i, param in enumerate(optimizer.param_groups[0]['params']):
            param_state = optimizer.state[param]
            if i in [0, 1]:  # Encoder weights and biases
                param_state['exp_avg'][dead_neurons_t] = 0
                param_state['exp_avg_sq'][dead_neurons_t] = 0
            elif i == 2:  # Decoder weights
                param_state['exp_avg'][:, dead_neurons_t] = 0
                param_state['exp_avg_sq'][:, dead_neurons_t] = 0

In [32]:
x.shape

torch.Size([128, 2048])

In [33]:
torch.manual_seed(seed)
# initiating ResourceLoader in training mode loads Transformer checkpoint, text data, and autoencoder data
# resourceloader = ResourceLoader(
#                             dataset=dataset,
#                             gpt_ckpt_dir=gpt_ckpt_dir,
#                             device=device,
#                             mode="train",
#                             )

# gpt = resourceloader.transformer # TODO: either it should be called transformer or gpt
autoencoder = AutoEncoder(n_inputs = 2048,
                            n_latents = n_features,
                            lam = l1_coeff,
                            resampling_interval = resampling_interval).to(device)
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=learning_rate)

In [35]:
"""
Three different histogram functions. The difference lies in whether to save the histogram image on disk or not,
color scheme and axes labels.
These can perhaps be combined into one function, but leaving it as it is for now.
"""
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
import torch
import os

def make_density_histogram(data, bins='auto'):
    """Makes a histogram image from the provided data and returns it.
    We use it in train.py to plot feature density histograms and log them with W&B."""
    fig, ax = plt.subplots()
    ax.hist(data, bins=bins)
    ax.set_title('Histogram')
    plt.tight_layout()

    buf = BytesIO()  # create a BytesIO buffer
    fig.savefig(buf, format='png')  # save the plot to the buffer in PNG format
    buf.seek(0)  # rewind the buffer to the beginning
    image = Image.open(buf)  # open the image from the buffer

    plt.close(fig)  # close the figure to free memory
    return image

def make_activations_histogram(activations, density, feature_id, dirpath=None):
    """makes a histogram of activations and saves it on the disk
    we later include the histogram in the feature browser"""
    if isinstance(activations, torch.Tensor):
        activations = activations.cpu().numpy()
    plt.hist(activations, bins='auto')  # You can adjust the number of bins as needed
    plt.title(f'Activations (Density = {density:.4f}%)')
    plt.xlabel('Activation')
    plt.ylabel('Frequency')

    # Save the histogram as an image
    image_path = os.path.join(dirpath, 'activations_histograms', f'{feature_id}.png')
    plt.savefig(image_path)
    plt.close()

def make_logits_histogram(logits, feature_id, dirpath=None):
    """
    Makes a histogram of logits for a given feature and saves it as a PNG file
    Input:
        logits: a torch tensor of shape (vocab_size,)
        feature_id: int
        dirpath: histogram is saved as dirpath/logits_histograms/feature_id.png
    """
    plt.hist(logits.cpu().numpy(), bins='auto')  # You can adjust the number of bins as needed

    image_path = os.path.join(dirpath, 'logits_histograms', f'{feature_id}.png')
    plt.savefig(image_path)
    plt.close()

In [52]:

import tqdm as tqdm

############## TRAINING LOOP ###############
start_time = time.time()
num_steps = len(train_list)
step = 0
#num_steps = len(train_loader)
for step, batch in tqdm.tqdm(enumerate(train_list)):
    with open('/content/drive/MyDrive/last_layer/' + batch, 'rb') as f:
      x = pkl.load(f)

    #batch = resourceloader.get_autoencoder_data_batch(step, batch_size=batch_size)
    optimizer.zero_grad(set_to_none=True)
    autoencoder_output = autoencoder(x) # f has shape (batch_size, n_features)
    autoencoder_output['loss'].backward()

    # remove component of gradient parallel to weight
    autoencoder.remove_parallel_component_of_decoder_grad()
    optimizer.step()

    # periodically update the norm of dictionary vectors to ensure they stay close to 1.
    if step % 1000 == 0:
        autoencoder.normalize_decoder_columns()

    ## ------------ perform neuron resampling ----------- ######
    # check if we should start investigating dead/alive neurons at this step
    # This is done at an odd multiple of resampling_interval // 2 in Anthropic's paper.
    if autoencoder.is_dead_neuron_investigation_step(step, resampling_interval, num_resamples):
        print(f'initiating investigation of dead neurons at step = {step}')
        autoencoder.initiate_dead_neurons()

    # check if we should look for dead neurons at this step
    # This is done between an odd and an even multiple of resampling_interval // 2.
    if autoencoder.is_within_neuron_investigation_phase(step, resampling_interval, num_resamples):
        autoencoder.update_dead_neurons(autoencoder_output['latents'])

    # perform neuron resampling if step is a multiple of resampling interval
    if (step+1) % resampling_interval == 0 and step < num_resamples * resampling_interval:
        num_dead_neurons = len(autoencoder.dead_neurons)
        print(f'{num_dead_neurons} neurons to be resampled at step = {step}')
        # if num_dead_neurons > 0:
        #     autoencoder.resample_dead_neurons(data=resourceloader.select_resampling_data(size=resampling_data_size),
        #                                       optimizer=optimizer,
        #                                       batch_size=batch_size)

    ### ------------ log info ----------- ######
    if (step % eval_interval == 0) or step == num_steps - 1:
        print(f'Entering evaluation mode at step = {step}')
        autoencoder.eval()

        log_dict = {'losses/reconstructed_nll': 0, # log-likelihood loss using reconstructed MLP activations
                    'losses/l0_norm': 0, # L0-norm; average number of non-zero components of a feature activation vector
                    'losses/reconstruction_loss': 0, # |xhat - x|^2 <-- L2-norm between MLP activations & their reconstruction
                    'losses/l1_norm': 0, # L1-norm of feature activations
                    'losses/autoencoder_loss': 0, # reconstruction_loss + L1-coeff * l1_loss
                    'losses/nll_score': 0, # ratio of (nll_loss - ablated_loss) to (nll_loss - reconstructed_nll)
                    }

        # initiate a tensor containing the number of tokens on which each feature activates
        feat_acts_count = torch.zeros(n_features, dtype=torch.float32)

        # # get batches of text data and evaluate the autoencoder on MLP activations
        eval_iters = len(valid_list)
        for iter in range(eval_iters):
            if iter % 20 == 0:
                print(f'Performing evaluation at iterations # ({iter} - {min(iter+19, eval_iters)})/{eval_iters}')
           # x, y = resourceloader.get_text_batch(num_contexts=eval_batch_size)

        #     _, nll_loss = gpt(x, y)
        #     mlp_acts = gpt.mlp_activation_hooks[0]
        #     gpt.clear_mlp_activation_hooks() # free up memory
        #     _, ablated_loss = gpt(x, y, mode="replace")

            with torch.no_grad():
                with open('/content/drive/MyDrive/last_layer/' + valid_list[iter], 'rb') as f:
                  x = pkl.load(f)

                autoencoder_output = autoencoder(x)


        #     # for each feature, calculate the TOTAL number of tokens on which it is active; shape:
        feat_acts = autoencoder_output['latents'].to('cpu') # (eval_batch_size, block_size, n_features)
  #     torch.add(feat_acts_count, feat_acts.count_nonzero(dim=[0, 1]), out=feat_acts_count) # (n_features, )

  #     # calculat the AVERAGE number of non-zero entries in each feature vector and log all losses
        log_dict['losses/l0_norm'] += feat_acts.count_nonzero(dim=-1).float().mean().item()
  #     log_dict['losses/reconstructed_nll'] += reconstructed_nll.item()
        log_dict['losses/autoencoder_loss'] += autoencoder_output['loss'].item()
        log_dict['losses/reconstruction_loss'] += autoencoder_output['mse_loss'].item()
        log_dict['losses/l1_norm'] += autoencoder_output['l1_loss'].item()
        #     log_dict['losses/nll_score'] += (nll_loss - reconstructed_nll).item()/(nll_loss - ablated_loss).item()

        # # compute feature densities and plot feature density histogram
        log_feat_acts_density = np.log10(feat_acts_count[feat_acts_count != 0]/(eval_iters * eval_batch_size * n_features)) # (n_features,)
        feat_density_historgram = make_density_histogram(log_feat_acts_density)

        # # take mean of all loss values by dividing by the number of evaluation batches; also log more metrics
        log_dict = {key: val/eval_iters for key, val in log_dict.items()}
        log_dict.update(
                {'training_step': step,
                'training_examples': step * batch_size,
                'debug/mean_dictionary_vector_length': torch.linalg.vector_norm(autoencoder.decoder.weight, dim=0).mean(),
                'feature_density/min_log_feat_density': log_feat_acts_density.min().item() if len(log_feat_acts_density) > 0 else -100,
                'feature_density/num_neurons_with_feature_density_above_1e-3': (log_feat_acts_density > -3).sum(),
                'feature_density/num_neurons_with_feature_density_below_1e-3': (log_feat_acts_density < -3).sum(),
                'feature_density/num_neurons_with_feature_density_below_1e-4': (log_feat_acts_density < -4).sum(),
                'feature_density/num_neurons_with_feature_density_below_1e-5': (log_feat_acts_density < -5).sum(),
                'feature_density/num_alive_neurons': len(log_feat_acts_density),
                })
        # if wandb_log:
        #     import wandb
        #     log_dict.update({'feature_density/feature_density_histograms': wandb.Image(feat_density_historgram)})
        #     wandb.log(log_dict)
        print(log_dict)
        autoencoder.train()
        print(f'Exiting evaluation mode at step = {step}')

    ### ------------ save a checkpoint ----------- ######
    if (step % eval_interval == 0) or step == num_steps - 1:
        checkpoint = {
                'autoencoder': autoencoder.state_dict(),
                'optimizer': optimizer.state_dict(),
                'log_dict': log_dict,
                'config': config,
                'feature_activation_counts': feat_acts_count, # may be used later to identify alive vs dead neurons
                }
        print(f"saving checkpoint to {'/content/drive/MyDrive/'} at training step = {step}")
        torch.save(checkpoint, os.path.join('/content/drive/MyDrive/', 'ckpt.pt'))

# if wandb_log:
#     wandb.finish()

0it [00:00, ?it/s]

Entering evaluation mode at step = 0
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.01068376042903998, 'losses/reconstruction_loss': 0.0005777909969672179, 'losses/l1_norm': 0.019362192887526292, 'losses/autoencoder_loss': 0.0006358775859459853, 'losses/nll_score': 0.0, 'training_step': 0, 'training_examples': 0, 'debug/mean_dictionary_vector_length': tensor(1., device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_with_feature_de

27it [00:03, 10.83it/s]

Entering evaluation mode at step = 30
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.011158594718346229, 'losses/reconstruction_loss': 0.0005728685511992528, 'losses/l1_norm': 0.020335889779604398, 'losses/autoencoder_loss': 0.0006338761976132026, 'losses/nll_score': 0.0, 'training_step': 30, 'training_examples': 3840, 'debug/mean_dictionary_vector_length': tensor(1.0002, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_with_

49it [00:05, 11.85it/s]

Entering evaluation mode at step = 60
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.011158594718346229, 'losses/reconstruction_loss': 0.000572180423216942, 'losses/l1_norm': 0.019757937162350386, 'losses/autoencoder_loss': 0.00063145423355775, 'losses/nll_score': 0.0, 'training_step': 60, 'training_examples': 7680, 'debug/mean_dictionary_vector_length': tensor(1.0004, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_with_fea

84it [00:09, 13.85it/s]

Entering evaluation mode at step = 90
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.01092117795577416, 'losses/reconstruction_loss': 0.0005699678395803159, 'losses/l1_norm': 0.019446813143216647, 'losses/autoencoder_loss': 0.0006283082736608309, 'losses/nll_score': 0.0, 'training_step': 90, 'training_examples': 11520, 'debug/mean_dictionary_vector_length': tensor(1.0006, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_with_

117it [00:13, 10.78it/s]

Entering evaluation mode at step = 120
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.009971510141323775, 'losses/reconstruction_loss': 0.0005690176039934158, 'losses/l1_norm': 0.018802086512247723, 'losses/autoencoder_loss': 0.0006254238482469168, 'losses/nll_score': 0.0, 'training_step': 120, 'training_examples': 15360, 'debug/mean_dictionary_vector_length': tensor(1.0008, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wi

141it [00:15, 11.88it/s]

Entering evaluation mode at step = 150
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.011396011480918298, 'losses/reconstruction_loss': 0.0005634194478774682, 'losses/l1_norm': 0.019295316476088304, 'losses/autoencoder_loss': 0.000621305396541571, 'losses/nll_score': 0.0, 'training_step': 150, 'training_examples': 19200, 'debug/mean_dictionary_vector_length': tensor(1.0010, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wit

174it [00:18, 13.65it/s]

Entering evaluation mode at step = 180
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.011158594718346229, 'losses/reconstruction_loss': 0.0005602750640649062, 'losses/l1_norm': 0.01926861970852583, 'losses/autoencoder_loss': 0.0006180809189875921, 'losses/nll_score': 0.0, 'training_step': 180, 'training_examples': 23040, 'debug/mean_dictionary_vector_length': tensor(1.0012, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wit

210it [00:23, 10.69it/s]

Entering evaluation mode at step = 210
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.01044634366646791, 'losses/reconstruction_loss': 0.0005570435180113866, 'losses/l1_norm': 0.01960153151781131, 'losses/autoencoder_loss': 0.0006158481328151164, 'losses/nll_score': 0.0, 'training_step': 210, 'training_examples': 26880, 'debug/mean_dictionary_vector_length': tensor(1.0014, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_with

234it [00:25, 11.77it/s]

Entering evaluation mode at step = 240
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.011396011480918298, 'losses/reconstruction_loss': 0.000556051587829223, 'losses/l1_norm': 0.01946101127526699, 'losses/autoencoder_loss': 0.0006144346239475104, 'losses/nll_score': 0.0, 'training_step': 240, 'training_examples': 30720, 'debug/mean_dictionary_vector_length': tensor(1.0015, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_with

260it [00:29, 10.48it/s]

Entering evaluation mode at step = 270
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.011633428243490366, 'losses/reconstruction_loss': 0.0005562352255368844, 'losses/l1_norm': 0.01914825500586094, 'losses/autoencoder_loss': 0.0006136799661012796, 'losses/nll_score': 0.0, 'training_step': 270, 'training_examples': 34560, 'debug/mean_dictionary_vector_length': tensor(1.0017, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wit

291it [00:33,  9.09it/s]

Entering evaluation mode at step = 300
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.011633428243490366, 'losses/reconstruction_loss': 0.0005545093176456598, 'losses/l1_norm': 0.01939185918905796, 'losses/autoencoder_loss': 0.000612684883750402, 'losses/nll_score': 0.0, 'training_step': 300, 'training_examples': 38400, 'debug/mean_dictionary_vector_length': tensor(1.0019, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_with

322it [00:35, 14.04it/s]

Entering evaluation mode at step = 330
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.012345679295368684, 'losses/reconstruction_loss': 0.0005551973978678385, 'losses/l1_norm': 0.01895703260715191, 'losses/autoencoder_loss': 0.0006120684914864027, 'losses/nll_score': 0.0, 'training_step': 330, 'training_examples': 42240, 'debug/mean_dictionary_vector_length': tensor(1.0021, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wit

353it [00:39, 14.19it/s]

Entering evaluation mode at step = 360
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.013770179870801095, 'losses/reconstruction_loss': 0.0005546151540982417, 'losses/l1_norm': 0.01875361570945153, 'losses/autoencoder_loss': 0.0006108760165098386, 'losses/nll_score': 0.0, 'training_step': 360, 'training_examples': 46080, 'debug/mean_dictionary_vector_length': tensor(1.0023, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wit

379it [00:43,  9.03it/s]

Entering evaluation mode at step = 390
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.012108261768634502, 'losses/reconstruction_loss': 0.0005552687037449616, 'losses/l1_norm': 0.018498628567426633, 'losses/autoencoder_loss': 0.0006107645921218089, 'losses/nll_score': 0.0, 'training_step': 390, 'training_examples': 49920, 'debug/mean_dictionary_vector_length': tensor(1.0024, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wi

411it [00:45, 14.42it/s]

Entering evaluation mode at step = 420
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.011633428243490366, 'losses/reconstruction_loss': 0.0005544043886355865, 'losses/l1_norm': 0.01859137033804869, 'losses/autoencoder_loss': 0.0006101784797815176, 'losses/nll_score': 0.0, 'training_step': 420, 'training_examples': 53760, 'debug/mean_dictionary_vector_length': tensor(1.0026, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wit

443it [00:49, 14.38it/s]

Entering evaluation mode at step = 450
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.01068376042903998, 'losses/reconstruction_loss': 0.0005544921717582605, 'losses/l1_norm': 0.01822838416466346, 'losses/autoencoder_loss': 0.000609177331893872, 'losses/nll_score': 0.0, 'training_step': 450, 'training_examples': 57600, 'debug/mean_dictionary_vector_length': tensor(1.0028, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_with_

474it [00:53, 11.18it/s]

Entering evaluation mode at step = 480
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.009496675852017526, 'losses/reconstruction_loss': 0.000553952721066964, 'losses/l1_norm': 0.018229800921220046, 'losses/autoencoder_loss': 0.0006086421318543263, 'losses/nll_score': 0.0, 'training_step': 480, 'training_examples': 61440, 'debug/mean_dictionary_vector_length': tensor(1.0030, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wit

510it [00:57, 11.65it/s]

Entering evaluation mode at step = 510
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.00878442556430132, 'losses/reconstruction_loss': 0.0005538837554363104, 'losses/l1_norm': 0.018171856036553018, 'losses/autoencoder_loss': 0.0006083993193430778, 'losses/nll_score': 0.0, 'training_step': 510, 'training_examples': 65280, 'debug/mean_dictionary_vector_length': tensor(1.0033, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wit

534it [00:59, 12.55it/s]

Entering evaluation mode at step = 540
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.009496675852017526, 'losses/reconstruction_loss': 0.0005539649476607641, 'losses/l1_norm': 0.018089109506362524, 'losses/autoencoder_loss': 0.0006082322544012314, 'losses/nll_score': 0.0, 'training_step': 540, 'training_examples': 69120, 'debug/mean_dictionary_vector_length': tensor(1.0035, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wi

567it [01:03, 11.31it/s]

Entering evaluation mode at step = 570
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.00854700880172925, 'losses/reconstruction_loss': 0.0005542420519468111, 'losses/l1_norm': 0.017721967819409493, 'losses/autoencoder_loss': 0.0006074079622824987, 'losses/nll_score': 0.0, 'training_step': 570, 'training_examples': 72960, 'debug/mean_dictionary_vector_length': tensor(1.0037, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wit

600it [01:33,  1.15it/s]

Entering evaluation mode at step = 600
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.007834757749850933, 'losses/reconstruction_loss': 0.0005543020386726428, 'losses/l1_norm': 0.017625955434945915, 'losses/autoencoder_loss': 0.0006071799076520479, 'losses/nll_score': 0.0, 'training_step': 600, 'training_examples': 76800, 'debug/mean_dictionary_vector_length': tensor(1.0039, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wi

630it [02:04,  1.04it/s]

Entering evaluation mode at step = 630
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.009259259089445457, 'losses/reconstruction_loss': 0.0005526359742268538, 'losses/l1_norm': 0.017881538623418562, 'losses/autoencoder_loss': 0.0006062805843658936, 'losses/nll_score': 0.0, 'training_step': 630, 'training_examples': 80640, 'debug/mean_dictionary_vector_length': tensor(1.0042, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wi

660it [02:34,  1.13it/s]

Entering evaluation mode at step = 660
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.007597340987278865, 'losses/reconstruction_loss': 0.0005531406555420313, 'losses/l1_norm': 0.01746054490407308, 'losses/autoencoder_loss': 0.0006055222967496285, 'losses/nll_score': 0.0, 'training_step': 660, 'training_examples': 84480, 'debug/mean_dictionary_vector_length': tensor(1.0044, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wit

690it [03:04,  1.12it/s]

Entering evaluation mode at step = 690
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.00830959127499507, 'losses/reconstruction_loss': 0.000552268555531135, 'losses/l1_norm': 0.017720855199373685, 'losses/autoencoder_loss': 0.0006054311226575803, 'losses/nll_score': 0.0, 'training_step': 690, 'training_examples': 88320, 'debug/mean_dictionary_vector_length': tensor(1.0046, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_with

720it [03:34,  1.13it/s]

Entering evaluation mode at step = 720
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.00878442556430132, 'losses/reconstruction_loss': 0.0005529356690553519, 'losses/l1_norm': 0.017490898951506004, 'losses/autoencoder_loss': 0.0006054083888347333, 'losses/nll_score': 0.0, 'training_step': 720, 'training_examples': 92160, 'debug/mean_dictionary_vector_length': tensor(1.0049, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wit

750it [04:05,  1.10it/s]

Entering evaluation mode at step = 750
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.008072174512423001, 'losses/reconstruction_loss': 0.0005529987601897655, 'losses/l1_norm': 0.017248882697178766, 'losses/autoencoder_loss': 0.0006047453826818711, 'losses/nll_score': 0.0, 'training_step': 750, 'training_examples': 96000, 'debug/mean_dictionary_vector_length': tensor(1.0051, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wi

780it [04:35,  1.08it/s]

Entering evaluation mode at step = 780
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.00854700880172925, 'losses/reconstruction_loss': 0.0005526876506897119, 'losses/l1_norm': 0.01741392337358915, 'losses/autoencoder_loss': 0.0006049294024705887, 'losses/nll_score': 0.0, 'training_step': 780, 'training_examples': 99840, 'debug/mean_dictionary_vector_length': tensor(1.0053, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_with

810it [05:05,  1.07it/s]

Entering evaluation mode at step = 810
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.009021842326873388, 'losses/reconstruction_loss': 0.0005535982453670257, 'losses/l1_norm': 0.016922293565212153, 'losses/autoencoder_loss': 0.0006043651165106358, 'losses/nll_score': 0.0, 'training_step': 810, 'training_examples': 103680, 'debug/mean_dictionary_vector_length': tensor(1.0055, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_w

840it [05:36,  1.03it/s]

Entering evaluation mode at step = 840
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.009734093378751706, 'losses/reconstruction_loss': 0.0005528294027615816, 'losses/l1_norm': 0.016915263273777105, 'losses/autoencoder_loss': 0.0006035752116869658, 'losses/nll_score': 0.0, 'training_step': 840, 'training_examples': 107520, 'debug/mean_dictionary_vector_length': tensor(1.0057, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_w

870it [06:06,  1.09it/s]

Entering evaluation mode at step = 870
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.009021842326873388, 'losses/reconstruction_loss': 0.0005525299467337438, 'losses/l1_norm': 0.016793958651713837, 'losses/autoencoder_loss': 0.0006029118234530473, 'losses/nll_score': 0.0, 'training_step': 870, 'training_examples': 111360, 'debug/mean_dictionary_vector_length': tensor(1.0059, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_w

900it [06:36,  1.04it/s]

Entering evaluation mode at step = 900
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.009496675852017526, 'losses/reconstruction_loss': 0.000552419859629411, 'losses/l1_norm': 0.01684604699795063, 'losses/autoencoder_loss': 0.0006029580075007218, 'losses/nll_score': 0.0, 'training_step': 900, 'training_examples': 115200, 'debug/mean_dictionary_vector_length': tensor(1.0062, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wit

930it [07:07,  1.16it/s]

Entering evaluation mode at step = 930
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.00830959127499507, 'losses/reconstruction_loss': 0.0005525564536070212, 'losses/l1_norm': 0.01659213885282859, 'losses/autoencoder_loss': 0.0006023328751325607, 'losses/nll_score': 0.0, 'training_step': 930, 'training_examples': 119040, 'debug/mean_dictionary_vector_length': tensor(1.0066, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wit

960it [07:38,  1.13it/s]

Entering evaluation mode at step = 960
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.00830959127499507, 'losses/reconstruction_loss': 0.0005522319235098668, 'losses/l1_norm': 0.016548344722160928, 'losses/autoencoder_loss': 0.0006018769569121874, 'losses/nll_score': 0.0, 'training_step': 960, 'training_examples': 122880, 'debug/mean_dictionary_vector_length': tensor(1.0069, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_wi

990it [08:20,  1.02it/s]

Entering evaluation mode at step = 990
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.009259259089445457, 'losses/reconstruction_loss': 0.0005522422396983856, 'losses/l1_norm': 0.016437337948725775, 'losses/autoencoder_loss': 0.0006015542417000502, 'losses/nll_score': 0.0, 'training_step': 990, 'training_examples': 126720, 'debug/mean_dictionary_vector_length': tensor(1.0071, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_w

1020it [08:50,  1.06s/it]

Entering evaluation mode at step = 1020
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.009496675852017526, 'losses/reconstruction_loss': 0.0005595010633652026, 'losses/l1_norm': 0.01775368054707845, 'losses/autoencoder_loss': 0.0006127621118838971, 'losses/nll_score': 0.0, 'training_step': 1020, 'training_examples': 130560, 'debug/mean_dictionary_vector_length': tensor(1.0001, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_

1050it [09:21,  1.09it/s]

Entering evaluation mode at step = 1050
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.008072174512423001, 'losses/reconstruction_loss': 0.0005608148490771269, 'losses/l1_norm': 0.01695233430617895, 'losses/autoencoder_loss': 0.0006116718435898805, 'losses/nll_score': 0.0, 'training_step': 1050, 'training_examples': 134400, 'debug/mean_dictionary_vector_length': tensor(1.0004, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_

1080it [09:51,  1.04it/s]

Entering evaluation mode at step = 1080
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.009021842326873388, 'losses/reconstruction_loss': 0.0005602848071318406, 'losses/l1_norm': 0.01695432724096836, 'losses/autoencoder_loss': 0.0006111477716610982, 'losses/nll_score': 0.0, 'training_step': 1080, 'training_examples': 138240, 'debug/mean_dictionary_vector_length': tensor(1.0010, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_

1110it [10:22,  1.09it/s]

Entering evaluation mode at step = 1110
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.010208926903895842, 'losses/reconstruction_loss': 0.0005606185549344772, 'losses/l1_norm': 0.016718283677712466, 'losses/autoencoder_loss': 0.0006107733799861028, 'losses/nll_score': 0.0, 'training_step': 1110, 'training_examples': 142080, 'debug/mean_dictionary_vector_length': tensor(1.0014, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons

1140it [10:54,  1.12it/s]

Entering evaluation mode at step = 1140
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.01068376042903998, 'losses/reconstruction_loss': 0.0005598129370273688, 'losses/l1_norm': 0.01670379516405937, 'losses/autoencoder_loss': 0.0006099243003588456, 'losses/nll_score': 0.0, 'training_step': 1140, 'training_examples': 145920, 'debug/mean_dictionary_vector_length': tensor(1.0017, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_w

1170it [11:23,  1.14it/s]

Entering evaluation mode at step = 1170
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.011158594718346229, 'losses/reconstruction_loss': 0.0005599768020403691, 'losses/l1_norm': 0.016552844108679354, 'losses/autoencoder_loss': 0.0006096353515600547, 'losses/nll_score': 0.0, 'training_step': 1170, 'training_examples': 149760, 'debug/mean_dictionary_vector_length': tensor(1.0020, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons

1200it [11:54,  1.08it/s]

Entering evaluation mode at step = 1200
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.013532763872391138, 'losses/reconstruction_loss': 0.000555747355788182, 'losses/l1_norm': 0.017204000399662897, 'losses/autoencoder_loss': 0.0006073593424680906, 'losses/nll_score': 0.0, 'training_step': 1200, 'training_examples': 153600, 'debug/mean_dictionary_vector_length': tensor(1.0023, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_

1230it [12:24,  1.10it/s]

Entering evaluation mode at step = 1230
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.012345679295368684, 'losses/reconstruction_loss': 0.0005468029337815749, 'losses/l1_norm': 0.01831692762863942, 'losses/autoencoder_loss': 0.000601753735771546, 'losses/nll_score': 0.0, 'training_step': 1230, 'training_examples': 157440, 'debug/mean_dictionary_vector_length': tensor(1.0025, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_w

1260it [12:55,  1.02it/s]

Entering evaluation mode at step = 1260
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.01044634366646791, 'losses/reconstruction_loss': 0.0005471232132269786, 'losses/l1_norm': 0.017751366664201785, 'losses/autoencoder_loss': 0.0006003773365265284, 'losses/nll_score': 0.0, 'training_step': 1260, 'training_examples': 161280, 'debug/mean_dictionary_vector_length': tensor(1.0028, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons_

1271it [13:07,  1.01it/s]

Entering evaluation mode at step = 1271
Performing evaluation at iterations # (0 - 19)/156
Performing evaluation at iterations # (20 - 39)/156
Performing evaluation at iterations # (40 - 59)/156
Performing evaluation at iterations # (60 - 79)/156
Performing evaluation at iterations # (80 - 99)/156
Performing evaluation at iterations # (100 - 119)/156
Performing evaluation at iterations # (120 - 139)/156
Performing evaluation at iterations # (140 - 156)/156
{'losses/reconstructed_nll': 0.0, 'losses/l0_norm': 0.011158594718346229, 'losses/reconstruction_loss': 0.0005463149685126084, 'losses/l1_norm': 0.017904124198815763, 'losses/autoencoder_loss': 0.0006000273502790011, 'losses/nll_score': 0.0, 'training_step': 1271, 'training_examples': 162688, 'debug/mean_dictionary_vector_length': tensor(1.0029, device='cuda:0', grad_fn=<MeanBackward0>), 'feature_density/min_log_feat_density': -100, 'feature_density/num_neurons_with_feature_density_above_1e-3': tensor(0), 'feature_density/num_neurons

1272it [13:10,  1.61it/s]


In [53]:
torch.save(checkpoint, os.path.join('/content/drive/MyDrive/', 'ckpt_final.pt'))