In [164]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from PIL import Image


def denormalize(T, coords):
    return (0.5 * ((coords + 1.0) * T))


def bounding_box(x, y, size, color='w'):
    x = int(x - (size / 2))
    y = int(y - (size / 2))
    rect = patches.Rectangle(
        (x, y), size, size, linewidth=1, edgecolor=color, fill=False
    )
    return rect


class AverageMeter(object):
    """
    Computes and stores the average and
    current value.
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def resize_array(x, size):
    # 3D and 4D tensors allowed only
    assert x.ndim in [3, 4], "Only 3D and 4D Tensors allowed!"

    # 4D Tensor
    if x.ndim == 4:
        res = []
        for i in range(x.shape[0]):
            img = array2img(x[i])
            img = img.resize((size, size))
            img = np.asarray(img, dtype='float32')
            img = np.expand_dims(img, axis=0)
            img /= 255.0
            res.append(img)
        res = np.concatenate(res)
        res = np.expand_dims(res, axis=1)
        return res

    # 3D Tensor
    img = array2img(x)
    img = img.resize((size, size))
    res = np.asarray(img, dtype='float32')
    res = np.expand_dims(res, axis=0)
    res /= 255.0
    return res


def img2array(data_path, desired_size=None, expand=False, view=False):
    """
    Util function for loading RGB image into a numpy array.

    Returns array of shape (1, H, W, C).
    """
    img = Image.open(data_path)
    img = img.convert('RGB')
    if desired_size:
        img = img.resize((desired_size[1], desired_size[0]))
    if view:
        img.show()
    x = np.asarray(img, dtype='float32')
    if expand:
        x = np.expand_dims(x, axis=0)
    x /= 255.0
    return x


def array2img(x):
    """
    Util function for converting anumpy array to a PIL img.

    Returns PIL RGB img.
    """
    x = np.asarray(x)
    x = x + max(-np.min(x), 0)
    x_max = np.max(x)
    if x_max != 0:
        x /= x_max
    x *= 255
    return Image.fromarray(x.astype('uint8'), 'RGB')


def plot_images(images, gd_truth):

    images = images.squeeze()
    assert len(images) == len(gd_truth) == 9

    # Create figure with sub-plots.
    fig, axes = plt.subplots(3, 3)

    for i, ax in enumerate(axes.flat):
        # plot the image
        ax.imshow(images[i], cmap="Greys_r")

        xlabel = "{}".format(gd_truth[i])
        ax.set_xlabel(xlabel)
        ax.set_xticks([])
        ax.set_yticks([])

    plt.show()


def prepare_dirs(config):
    for path in [config.data_dir, config.ckpt_dir, config.logs_dir]:
        if not os.path.exists(path):
            os.makedirs(path)


def save_config(config):
    model_name = 'ram_{}_{}x{}_{}'.format(
        config.num_glimpses, config.patch_size,
        config.patch_size, config.glimpse_scale
    )
    filename = model_name + '_params.json'
    param_path = os.path.join(config.ckpt_dir, filename)

    print("[*] Model Checkpoint Dir: {}".format(config.ckpt_dir))
    print("[*] Param Path: {}".format(param_path))

    with open(param_path, 'w') as fp:
        json.dump(config.__dict__, fp, indent=4, sort_keys=True)

In [165]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from skimage import io, transform
import numpy as np
#from utils import plot_images
#from rdkit import Chem
#from rdkit.Chem import Draw
#from PIL import Image
import os
import torch
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler
import os

class ToxicDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        print (csv_file)
        self.data = pd.read_csv(csv_file)
        self.max_tox=self.data.loc[:, (self.data.columns != 'SMILES')& (self.data.columns !='Unnamed: 0')].as_matrix()
        self.root_dir = root_dir
        self.transform = transform
    def __len__(self):
        return len(self.max_tox)
    def __getitem__(self, idx):
        img_name = os.path.join(self.root_dir,str(idx)+'.png')
        image = io.imread(img_name)
        image = image[np.newaxis, :, :]
        image.astype(float)
        y = self.max_tox[idx]
        sample = {'image': image, 'y': y}
        if self.transform:
            sample = self.transform
        return sample['image'], sample['y']
    
def get_train_valid_loader(data_dir,
                           batch_size,
                           random_seed,
                           valid_size=0.1,
                           shuffle=True,
                           show_sample=False,
                           num_workers=4,
                           pin_memory=False):
    """
    Utility function for loading and returning train and valid
    multi-process iterators over the MNIST dataset. A sample
    9x9 grid of the images can be optionally displayed.

    If using CUDA, num_workers should be set to 1 and pin_memory to True.

    Args
    ----
    - data_dir: path directory to the dataset.
    - batch_size: how many samples per batch to load.
    - random_seed: fix seed for reproducibility.
    - valid_size: percentage split of the training set used for
      the validation set. Should be a float in the range [0, 1].
      In the paper, this number is set to 0.1.
    - shuffle: whether to shuffle the train/validation indices.
    - show_sample: plot 9x9 sample grid of the dataset.
    - num_workers: number of subprocesses to use when loading the dataset.
    - pin_memory: whether to copy tensors into CUDA pinned memory. Set it to
      True if using GPU.

    Returns
    -------
    - train_loader: training set iterator.
    - valid_loader: validation set iterator.
    """
    error_msg = "[!] valid_size should be in the range [0, 1]."
    assert ((valid_size >= 0) and (valid_size <= 1)), error_msg

    # define transforms
    #normalize = transforms.Normalize((0.1307,), (0.3081,))
    trans = transforms.Compose([
        transforms.ToTensor()#, normalize,
    ])

    # load dataset

    #dataset1 = datasets.MNIST(
    #    data_dir, train=True, download=True, transform=trans
    #)
    dataset = ToxicDataset(csv_file="aggregate_tox.csv", root_dir="../Data/")

    num_train = len(dataset)
    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    train_idx, valid_idx = indices[split:], indices[:split]

    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=train_sampler,
        num_workers=num_workers, pin_memory=pin_memory,
    )

    valid_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=valid_sampler,
        num_workers=num_workers, pin_memory=pin_memory,
    )

    # visualize some images
    if show_sample:
        sample_loader = torch.utils.data.DataLoader(
            dataset, batch_size=9, shuffle=shuffle,
            num_workers=num_workers, pin_memory=pin_memory
        )
        data_iter = iter(sample_loader)
        images, labels = data_iter.next()
        X = images.numpy()
        X = np.transpose(X, [0, 2, 3, 1])
        plot_images(X, labels)

    return (train_loader, valid_loader)


def get_test_loader(data_dir,
                    batch_size,
                    num_workers=4,
                    pin_memory=False):
    """
    Utility function for loading and returning a multi-process
    test iterator over the MNIST dataset.

    If using CUDA, num_workers should be set to 1 and pin_memory to True.

    Args
    ----
    - data_dir: path directory to the dataset.
    - batch_size: how many samples per batch to load.
    - num_workers: number of subprocesses to use when loading the dataset.
    - pin_memory: whether to copy tensors into CUDA pinned memory. Set it to
      True if using GPU.

    Returns
    -------
    - data_loader: test set iterator.
    """
    # define transforms
    #normalize = transforms.Normalize((0.1307,), (0.3081,))
    trans = transforms.Compose([
        transforms.ToTensor()#, normalize,
    ])

    # load dataset
    dataset = ToxicDataset(csv_file="aggregate_tox.csv", root_dir="../Data/")
    #dataset = datasets.MNIST(
    #    data_dir, train=False, download=True, transform=trans
    #)

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=False,
        num_workers=num_workers, pin_memory=pin_memory,
    )

    return data_loader



'''for i in range(len(ds)):
    sample = ds[i]
    #print (type(sample['image']))
    print (i, sample['image'].shape, sample['y'])
    if i==3:
        break

data_loader = DataLoader(ds,batch_size = 4, shuffle = True)'''

"for i in range(len(ds)):\n    sample = ds[i]\n    #print (type(sample['image']))\n    print (i, sample['image'].shape, sample['y'])\n    if i==3:\n        break\n\ndata_loader = DataLoader(ds,batch_size = 4, shuffle = True)"

In [166]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.autograd import Variable

import numpy as np


class retina(object):
    """
    A retina that extracts a foveated glimpse `phi`
    around location `l` from an image `x`. It encodes
    the region around `l` at a high-resolution but uses
    a progressively lower resolution for pixels further
    from `l`, resulting in a compressed representation
    of the original image `x`.

    Args
    ----
    - x: a 4D Tensor of shape (B, H, W, C). The minibatch
      of images.
    - l: a 2D Tensor of shape (B, 2). Contains normalized
      coordinates in the range [-1, 1].
    - g: size of the first square patch.
    - k: number of patches to extract in the glimpse.
    - s: scaling factor that controls the size of
      successive patches.

    Returns
    -------
    - phi: a 5D tensor of shape (B, k, g, g, C). The
      foveated glimpse of the image.
    """
    def __init__(self, g, k, s):
        self.g = g
        self.k = k
        self.s = s

    def foveate(self, x, l):
        """
        Extract `k` square patches of size `g`, centered
        at location `l`. The initial patch is a square of
        size `g`, and each subsequent patch is a square
        whose side is `s` times the size of the previous
        patch.

        The `k` patches are finally resized to (g, g) and
        concatenated into a tensor of shape (B, k, g, g, C).
        """
        phi = []
        size = self.g

        # extract k patches of increasing size
        for i in range(self.k):
            phi.append(self.extract_patch(x, l, size))
            size = int(self.s * size)

        # resize the patches to squares of size g
        for i in range(1, len(phi)):
            k = phi[i].shape[-1] // self.g
            phi[i] = F.avg_pool2d(phi[i], k)

        # concatenate into a single tensor and flatten
        phi = torch.cat(phi, 1)
        phi = phi.view(phi.shape[0], -1)

        return phi

    def extract_patch(self, x, l, size):
        """
        Extract a single patch for each image in the
        minibatch `x`.

        Args
        ----
        - x: a 4D Tensor of shape (B, H, W, C). The minibatch
          of images.
        - l: a 2D Tensor of shape (B, 2).
        - size: a scalar defining the size of the extracted patch.

        Returns
        -------
        - patch: a 4D Tensor of shape (B, size, size, C)
        """
        B, C, H, W = x.shape

        # denormalize coords of patch center
        coords = self.denormalize(H, l)

        # compute top left corner of patch
        patch_x = coords[:, 0] - (size // 2)
        patch_y = coords[:, 1] - (size // 2)

        # loop through mini-batch and extract
        patch = []
        for i in range(B):
            im = x[i].unsqueeze(dim=0)
            T = im.shape[-1]

            # compute slice indices
            from_x, to_x = patch_x[i], patch_x[i] + size
            from_y, to_y = patch_y[i], patch_y[i] + size

            # cast to ints
            from_x, to_x = from_x.item(), to_x.item()
            from_y, to_y = from_y.item(), to_y.item()

            # pad tensor in case exceeds
            if self.exceeds(from_x, to_x, from_y, to_y, T):
                pad_dims = (
                    size//2+1, size//2+1,
                    size//2+1, size//2+1,
                    0, 0,
                    0, 0,
                )
                im = F.pad(im, pad_dims, "constant", 0)

                # add correction factor
                from_x += (size//2+1)
                to_x += (size//2+1)
                from_y += (size//2+1)
                to_y += (size//2+1)

            # and finally extract
            patch.append(im[:, :, from_y:to_y, from_x:to_x])

        # concatenate into a single tensor
        patch = torch.cat(patch)

        return patch

    def denormalize(self, T, coords):
        """
        Convert coordinates in the range [-1, 1] to
        coordinates in the range [0, T] where `T` is
        the size of the image.
        """
        return (0.5 * ((coords + 1.0) * T)).long()

    def exceeds(self, from_x, to_x, from_y, to_y, T):
        """
        Check whether the extracted patch will exceed
        the boundaries of the image of size `T`.
        """
        if (
            (from_x < 0) or (from_y < 0) or (to_x > T) or (to_y > T)
        ):
            return True
        return False


class glimpse_network(nn.Module):
    """
    A network that combines the "what" and the "where"
    into a glimpse feature vector `g_t`.

    - "what": glimpse extracted from the retina.
    - "where": location tuple where glimpse was extracted.

    Concretely, feeds the output of the retina `phi` to
    a fc layer and the glimpse location vector `l_t_prev`
    to a fc layer. Finally, these outputs are fed each
    through a fc layer and their sum is rectified.

    In other words:

        `g_t = relu( fc( fc(l) ) + fc( fc(phi) ) )`

    Args
    ----
    - h_g: hidden layer size of the fc layer for `phi`.
    - h_l: hidden layer size of the fc layer for `l`.
    - g: size of the square patches in the glimpses extracted
      by the retina.
    - k: number of patches to extract per glimpse.
    - s: scaling factor that controls the size of successive patches.
    - c: number of channels in each image.
    - x: a 4D Tensor of shape (B, H, W, C). The minibatch
      of images.
    - l_t_prev: a 2D tensor of shape (B, 2). Contains the glimpse
      coordinates [x, y] for the previous timestep `t-1`.

    Returns
    -------
    - g_t: a 2D tensor of shape (B, hidden_size). The glimpse
      representation returned by the glimpse network for the
      current timestep `t`.
    """
    def __init__(self, h_g, h_l, g, k, s, c):
        super(glimpse_network, self).__init__()
        self.retina = retina(g, k, s)

        # glimpse layer
        D_in = k*g*g*c
        self.fc1 = nn.Linear(D_in, h_g)

        # location layer
        D_in = 2
        self.fc2 = nn.Linear(D_in, h_l)

        self.fc3 = nn.Linear(h_g, h_g+h_l)
        self.fc4 = nn.Linear(h_l, h_g+h_l)

    def forward(self, x, l_t_prev):
        # generate glimpse phi from image x
        phi = self.retina.foveate(x, l_t_prev)

        # flatten location vector
        l_t_prev = l_t_prev.view(l_t_prev.size(0), -1)

        # feed phi and l to respective fc layers
        #print(phi.type())
        phi_out = F.relu(self.fc1(phi))
        l_out = F.relu(self.fc2(l_t_prev))

        what = self.fc3(phi_out)
        where = self.fc4(l_out)

        # feed to fc layer
        g_t = F.relu(what + where)

        return g_t


class core_network(nn.Module):
    """
    An RNN that maintains an internal state that integrates
    information extracted from the history of past observations.
    It encodes the agent's knowledge of the environment through
    a state vector `h_t` that gets updated at every time step `t`.

    Concretely, it takes the glimpse representation `g_t` as input,
    and combines it with its internal state `h_t_prev` at the previous
    time step, to produce the new internal state `h_t` at the current
    time step.

    In other words:

        `h_t = relu( fc(h_t_prev) + fc(g_t) )`

    Args
    ----
    - input_size: input size of the rnn.
    - hidden_size: hidden size of the rnn.
    - g_t: a 2D tensor of shape (B, hidden_size). The glimpse
      representation returned by the glimpse network for the
      current timestep `t`.
    - h_t_prev: a 2D tensor of shape (B, hidden_size). The
      hidden state vector for the previous timestep `t-1`.

    Returns
    -------
    - h_t: a 2D tensor of shape (B, hidden_size). The hidden
      state vector for the current timestep `t`.
    """
    def __init__(self, input_size, hidden_size):
        super(core_network, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2h = nn.Linear(hidden_size, hidden_size)

    def forward(self, g_t, h_t_prev):
        h1 = self.i2h(g_t)
        h2 = self.h2h(h_t_prev)
        h_t = F.relu(h1 + h2)
        return h_t


class action_network(nn.Module):
    """
    Uses the internal state `h_t` of the core network to
    produce the final output classification.

    Concretely, feeds the hidden state `h_t` through a fc
    layer followed by a softmax to create a vector of
    output probabilities over the possible classes.

    Hence, the environment action `a_t` is drawn from a
    distribution conditioned on an affine transformation
    of the hidden state vector `h_t`, or in other words,
    the action network is simply a linear softmax classifier.

    Args
    ----
    - input_size: input size of the fc layer.
    - output_size: output size of the fc layer.
    - h_t: the hidden state vector of the core network for
      the current time step `t`.

    Returns
    -------
    - a_t: output probability vector over the classes.
    """
    def __init__(self, input_size, output_size):
        super(action_network, self).__init__()
        print (input_size)
        self.model = nn.Sequential(
        nn.Linear(input_size, 512),
        nn.ReLU(),
        nn.Dropout(0.2),
        nn.Linear(input_size, 512),
        nn.ReLU(),
        nn.Dropout(0.2),
        nn.Linear(512, 29))
    def forward(self, h_t):
        #a_t = F.log_softmax(self.fc(h_t), dim=1)
        a_t = self.model.forward(h_t)
        #print (a_t.shape)
        return a_t


class location_network(nn.Module):
    """
    Uses the internal state `h_t` of the core network to
    produce the location coordinates `l_t` for the next
    time step.

    Concretely, feeds the hidden state `h_t` through a fc
    layer followed by a tanh to clamp the output beween
    [-1, 1]. This produces a 2D vector of means used to
    parametrize a two-component Gaussian with a fixed
    variance from which the location coordinates `l_t`
    for the next time step are sampled.

    Hence, the location `l_t` is chosen stochastically
    from a distribution conditioned on an affine
    transformation of the hidden state vector `h_t`.

    Args
    ----
    - input_size: input size of the fc layer.
    - output_size: output size of the fc layer.
    - std: standard deviation of the normal distribution.
    - h_t: the hidden state vector of the core network for
      the current time step `t`.

    Returns
    -------
    - mu: a 2D vector of shape (B, 2).
    - l_t: a 2D vector of shape (B, 2).
    """
    def __init__(self, input_size, output_size, std):
        super(location_network, self).__init__()
        self.std = std
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, h_t):
        # compute mean
        mu = F.tanh(self.fc(h_t.detach()))

        # reparametrization trick
        noise = torch.zeros_like(mu)
        noise.data.normal_(std=self.std)
        l_t = mu + noise

        # bound between [-1, 1]
        l_t = F.tanh(l_t)

        return mu, l_t


class baseline_network(nn.Module):
    """
    Regresses the baseline in the reward function
    to reduce the variance of the gradient update.

    Args
    ----
    - input_size: input size of the fc layer.
    - output_size: output size of the fc layer.
    - h_t: the hidden state vector of the core network
      for the current time step `t`.

    Returns
    -------
    - b_t: a 2D vector of shape (B, 1). The baseline
      for the current time step `t`.
    """
    def __init__(self, input_size, output_size):
        super(baseline_network, self).__init__()
        self.fc1 = nn.Linear(input_size, 29)
        #self.fc2 = nn.Linear(1000,1)

    def forward(self, h_t):
        #b_t = self.fc2(F.relu(self.fc1(h_t.detach())))
        b_t = self.fc1(h_t.detach())
        return b_t


In [167]:
import math

import torch
import torch.nn as nn

from torch.distributions import Normal


class RecurrentAttention(nn.Module):
    """
    A Recurrent Model of Visual Attention (RAM) [1].

    RAM is a recurrent neural network that processes
    inputs sequentially, attending to different locations
    within the image one at a time, and incrementally
    combining information from these fixations to build
    up a dynamic internal representation of the image.

    References
    ----------
    - Minh et. al., https://arxiv.org/abs/1406.6247
    """
    def __init__(self,
                 g,
                 k,
                 s,
                 c,
                 h_g,
                 h_l,
                 std,
                 hidden_size,
                 num_classes):
        """
        Initialize the recurrent attention model and its
        different components.

        Args
        ----
        - g: size of the square patches in the glimpses extracted
          by the retina.
        - k: number of patches to extract per glimpse.
        - s: scaling factor that controls the size of successive patches.
        - c: number of channels in each image.
        - h_g: hidden layer size of the fc layer for `phi`.
        - h_l: hidden layer size of the fc layer for `l`.
        - std: standard deviation of the Gaussian policy.
        - hidden_size: hidden size of the rnn.
        - num_classes: number of classes in the dataset.
        - num_glimpses: number of glimpses to take per image,
          i.e. number of BPTT steps.
        """
        super(RecurrentAttention, self).__init__()
        self.std = std

        self.sensor = glimpse_network(h_g, h_l, g, k, s, c)
        self.rnn = core_network(hidden_size, hidden_size)
        self.locator = location_network(hidden_size, 2, std)
        self.classifier = action_network(hidden_size, num_classes)
        self.baseliner = baseline_network(hidden_size, 1)

    def forward(self, x, l_t_prev, h_t_prev, last=False):
        """
        Run the recurrent attention model for 1 timestep
        on the minibatch of images `x`.

        Args
        ----
        - x: a 4D Tensor of shape (B, H, W, C). The minibatch
          of images.
        - l_t_prev: a 2D tensor of shape (B, 2). The location vector
          containing the glimpse coordinates [x, y] for the previous
          timestep `t-1`.
        - h_t_prev: a 2D tensor of shape (B, hidden_size). The hidden
          state vector for the previous timestep `t-1`.
        - last: a bool indicating whether this is the last timestep.
          If True, the action network returns an output probability
          vector over the classes and the baseline `b_t` for the
          current timestep `t`. Else, the core network returns the
          hidden state vector for the next timestep `t+1` and the
          location vector for the next timestep `t+1`.

        Returns
        -------
        - h_t: a 2D tensor of shape (B, hidden_size). The hidden
          state vector for the current timestep `t`.
        - mu: a 2D tensor of shape (B, 2). The mean that parametrizes
          the Gaussian policy.
        - l_t: a 2D tensor of shape (B, 2). The location vector
          containing the glimpse coordinates [x, y] for the
          current timestep `t`.
        - b_t: a vector of length (B,). The baseline for the
          current time step `t`.
        - log_probas: a 2D tensor of shape (B, num_classes). The
          output log probability vector over the classes.
        - log_pi: a vector of length (B,).
        """
        g_t = self.sensor(x, l_t_prev)
        h_t = self.rnn(g_t, h_t_prev)
        mu, l_t = self.locator(h_t)
        b_t = self.baseliner(h_t).squeeze()

        # we assume both dimensions are independent
        # 1. pdf of the joint is the product of the pdfs
        # 2. log of the product is the sum of the logs
        log_pi = Normal(mu, self.std).log_prob(l_t)
        log_pi = torch.sum(log_pi, dim=1)

        if last:
            log_probas = self.classifier(h_t)
            #return h_t, l_t, log_probas, log_pi
            return h_t, l_t, b_t, log_probas, log_pi

        return h_t, l_t,b_t, log_pi


In [168]:
import torch
import torch.nn.functional as F

from torch.autograd import Variable
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

import os
import time
import shutil
import pickle

from tqdm import tqdm
#from utils import AverageMeter
#from model import RecurrentAttention
from tensorboard_logger import configure, log_value


class Trainer(object):
    """
    Trainer encapsulates all the logic necessary for
    training the Recurrent Attention Model.

    All hyperparameters are provided by the user in the
    config file.
    """
    def __init__(self, config, data_loader):
        """- h_t: a 2D tensor of shape (B, hidden_size). The hidden
          state vector for the current timestep `t`.
        - mu: a 2D tensor of shape (B, 2). The mean that parametrizes
          the Gaussian policy.
        - l_t: a 2D tensor of shape (B, 2). The location vector
          containing the glimpse coordinates [x, y] for the
          current timestep `t`.
        - b_t: a vector of length (B,). The baseline for the
          current time step `t`.
        - log_probas: a 2D tensor of shape (B, num_classes). The
          output log probability vector over the classes.
        - log_pi: a vector of length (B,).
        Construct a new Trainer instance.

        Args
        ----
        - config: object containing command line arguments.
        - data_loader: data iterator
        """
        self.config = config

        # glimpse network params
        self.patch_size = config.patch_size
        self.glimpse_scale = config.glimpse_scale
        self.num_patches = config.num_patches
        self.loc_hidden = config.loc_hidden
        self.glimpse_hidden = config.glimpse_hidden

        # core network params
        self.num_glimpses = config.num_glimpses
        self.hidden_size = config.hidden_size

        # reinforce params
        self.std = config.std
        self.M = config.M

        # data params
        if config.is_train:
            self.train_loader = data_loader[0]
            self.valid_loader = data_loader[1]
            self.num_train = len(self.train_loader.sampler.indices)
            self.num_valid = len(self.valid_loader.sampler.indices)
        else:
            self.test_loader = data_loader
            self.num_test = len(self.test_loader.dataset)
        self.num_classes = 29
        self.num_channels = 1

        # training params
        self.epochs = config.epochs
        self.start_epoch = 0
        self.momentum = config.momentum
        self.lr = config.init_lr

        # misc params
        self.use_gpu = config.use_gpu
        self.best = config.best
        self.ckpt_dir = config.ckpt_dir
        self.logs_dir = config.logs_dir
        self.best_valid_acc = 0.
        self.counter = 0
        self.lr_patience = config.lr_patience
        self.train_patience = config.train_patience
        self.use_tensorboard = config.use_tensorboard
        self.resume = config.resume
        self.print_freq = config.print_freq
        self.plot_freq = config.plot_freq
        self.model_name = 'ram_{}_{}x{}_{}'.format(
            config.num_glimpses, config.patch_size,
            config.patch_size, config.glimpse_scale
        )

        self.plot_dir = './plots/' + self.model_name + '/'
        if not os.path.exists(self.plot_dir):
            os.makedirs(self.plot_dir)

        # configure tensorboard logging
        if self.use_tensorboard:
            tensorboard_dir = self.logs_dir + self.model_name
            print('[*] Saving tensorboard logs to {}'.format(tensorboard_dir))
            if not os.path.exists(tensorboard_dir):
                os.makedirs(tensorboard_dir)
            configure(tensorboard_dir)

        # build RAM model
        self.model = RecurrentAttention(
            self.patch_size, self.num_patches, self.glimpse_scale,
            self.num_channels, self.loc_hidden, self.glimpse_hidden,
            self.std, self.hidden_size, self.num_classes,
        )
        if self.use_gpu:
            self.model.cuda()

        print('[*] Number of model parameters: {:,}'.format(
            sum([p.data.nelement() for p in self.model.parameters()])))

        # # initialize optimizer and scheduler
        # self.optimizer = optim.SGD(
        #     self.model.parameters(), lr=self.lr, momentum=self.momentum,
        # )
        # self.scheduler = ReduceLROnPlateau(
        #     self.optimizer, 'min', patience=self.lr_patience
        # )
        self.optimizer = optim.Adam(
            self.model.parameters(), lr=3e-4,
        )

    def reset(self):
        """
        Initialize the hidden state of the core network
        and the location vector.

        This is called once every time a new minibatch
        `x` is introduced.
        """
        dtype = (
            torch.cuda.FloatTensor if self.use_gpu else torch.FloatTensor
        )

        h_t = torch.zeros(self.batch_size, self.hidden_size)
        h_t = Variable(h_t).type(dtype)

        l_t = torch.Tensor(self.batch_size, 2).uniform_(-1, 1)
        l_t = Variable(l_t).type(dtype)

        return h_t, l_t

    def train(self):
        """
        Train the model on the training set.

        A checkpoint of the model is saved after each epoch
        and if the validation accuracy is improved upon,
        a separate ckpt is created for use on the test set.
        """
        # load the most recent checkpoint
        if self.resume:
            self.load_checkpoint(self.best)
            
        if self.use_gpu:
            self.model.cuda()
            
        print("\n[*] Train on {} samples, validate on {} samples".format(
            self.num_train, self.num_valid)
        )

        for epoch in range(self.start_epoch, self.epochs):

            print(
                '\nEpoch: {}/{} - LR: {:.6f}'.format(
                    epoch+1, self.epochs, self.lr)
            )

            # train for 1 epoch
            train_loss, train_acc = self.train_one_epoch(epoch)

            # evaluate on validation set
            valid_loss, valid_acc = self.validate(epoch)

            # # reduce lr if validation loss plateaus
            # self.scheduler.step(valid_loss)

            is_best = valid_acc < self.best_valid_acc
            msg1 = "train loss: {:.3f} - train acc: {:.3f} "
            msg2 = "- val loss: {:.3f} - val acc: {:.3f}"
            if is_best:
                self.counter = 0
                msg2 += " [*]"
            msg = msg1 + msg2
            log_file = open(str(self.num_patches)+"_"+str(self.num_glimpses)+"_"+str(self.glimpse_scale)+".txt", "a+")
            log_file.write(msg.format(train_loss, train_acc, valid_loss, valid_acc))
            log_file.write("\n")
            log_file.close()
            print(msg.format(train_loss, train_acc, valid_loss, valid_acc))

            # check for improvement
            if not is_best:
                self.counter += 1
            if self.counter > self.train_patience:
                print("[!] No improvement in a while, stopping training.")
                return
            self.best_valid_acc = min(valid_acc, self.best_valid_acc)
            self.save_checkpoint(
                {'epoch': epoch + 1,
                 'model_state': self.model.state_dict(),
                 'optim_state': self.optimizer.state_dict(),
                 'best_valid_acc': self.best_valid_acc,
                 }, is_best
            )

    def train_one_epoch(self, epoch):
        """
        Train the model for 1 epoch of the training set.

        An epoch corresponds to one full pass through the entire
        training set in successive mini-batches.

        This is used by train() and should not be called manually.
        """
        batch_time = AverageMeter()
        losses = AverageMeter()
        accs = AverageMeter()

        tic = time.time()
        with tqdm(total=self.num_train) as pbar:
            for i, (x, y) in enumerate(self.train_loader):
                x = x.type(torch.FloatTensor)
                if self.use_gpu:
                    x, y = x.cuda(), y.cuda()

                x, y = Variable(x), Variable(y)

                plot = False
                if (epoch % self.plot_freq == 0) and (i == 0):
                    plot = True

                # initialize location vector and hidden state
                self.batch_size = x.shape[0]
                h_t, l_t = self.reset()

                # save images
                imgs = []
                imgs.append(x[0:9])

                # extract the glimpses
                locs = []
                log_pi = []
                baselines = []
                for t in range(self.num_glimpses - 1):
                    # forward pass through model
                    h_t, l_t,b_t, p = self.model(x, l_t, h_t)

                    # store
                    locs.append(l_t[0:9])
                    baselines.append(b_t)
                    log_pi.append(p)

                # last iteration
                h_t, l_t,b_t, log_probas, p = self.model(
                    x, l_t, h_t, last=True
                )
                
                log_pi.append(p)
                baselines.append(b_t)
                locs.append(l_t[0:9])

                # convert list to tensors and reshape
                
                
                baselines = torch.stack(baselines).transpose(1, 0)
                y = y.type(torch.FloatTensor)
                if self.use_gpu:
                    y =y.cuda()
                log_pi = torch.stack(log_pi).transpose(1, 0)
                y_copy = y.clone()
                log_prob_det = log_probas.detach()
                log_prob_det[y!=y]=0
                y[y!=y]=0
                
                R = (log_prob_det.detach()-y).abs().float()
                R = R.unsqueeze(1)
                R = R.repeat(1, self.num_glimpses,1)
            
                
                adjusted_reward = R - baselines.detach()#32*7*29
                log_pi = log_pi.unsqueeze(2).repeat(1,1,29)#32*7*29
                loss_reinforce = torch.sum(-log_pi * adjusted_reward, dim=[1,2])
                loss_reinforce = torch.mean(loss_reinforce, dim=0)
                loss_action = F.mse_loss(log_probas[~torch.isnan(y_copy)],y_copy[~torch.isnan(y_copy)])
                loss_baseline = F.mse_loss(baselines, R)
                
                loss = loss_reinforce+loss_action+loss_baseline


                acc = F.mse_loss(log_probas[~torch.isnan(y_copy)],y_copy[~torch.isnan(y_copy)])
                
                # store
                losses.update(loss.item(), x.size()[0])
                accs.update(acc.item(), x.size()[0])

                # compute gradients and update SGD
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                # measure elapsed time
                toc = time.time()
                batch_time.update(toc-tic)

                pbar.set_description(
                    (
                        "{:.1f}s - loss: {:.3f} - acc: {:.3f}".format(
                            (toc-tic), loss.item(), acc.item()
                        )
                    )
                )
                pbar.update(self.batch_size)

                # dump the glimpses and locs
                if plot:
                    if self.use_gpu:
                        imgs = [g.cpu().data.numpy().squeeze() for g in imgs]
                        locs = [l.cpu().data.numpy() for l in locs]
                    else:
                        imgs = [g.data.numpy().squeeze() for g in imgs]
                        locs = [l.data.numpy() for l in locs]
                    pickle.dump(
                        imgs, open(
                            self.plot_dir + "g_{}.p".format(epoch+1),
                            "wb"
                        )
                    )
                    pickle.dump(
                        locs, open(
                            self.plot_dir + "l_{}.p".format(epoch+1),
                            "wb"
                        )
                    )

                # log to tensorboard
                if self.use_tensorboard:
                    iteration = epoch*len(self.train_loader) + i
                    log_value('train_loss', losses.avg, iteration)
                    log_value('train_acc', accs.avg, iteration)

            return losses.avg, accs.avg

    def validate(self, epoch):
        """
        Evaluate the model on the validation set.
        """
        losses = AverageMeter()
        accs = AverageMeter()

        for i, (x, y) in enumerate(self.valid_loader):
            x = x.type(torch.FloatTensor)
            y = y.type(torch.FloatTensor)
            if self.use_gpu:
                x, y = x.cuda(), y.cuda()
            x, y = Variable(x), Variable(y)

            # duplicate 10 times
            x = x.repeat(self.M, 1, 1, 1)

            # initialize location vector and hidden state
            self.batch_size = x.shape[0]
            h_t, l_t = self.reset()

            # extract the glimpses
            log_pi = []
            baselines = []
            for t in range(self.num_glimpses - 1):
                # forward pass through model
                h_t, l_t,b_t, p = self.model(x, l_t, h_t)

                # store
                baselines.append(b_t)
                log_pi.append(p)

            # last iteration
            h_t, l_t,b_t, log_probas, p = self.model(
                x, l_t, h_t, last=True
            )
            log_pi.append(p)
            baselines.append(b_t)
            
            # convert list to tensors and reshape
            baselines = torch.stack(baselines).transpose(1, 0)
            log_pi = torch.stack(log_pi).transpose(1, 0)
            log_probas = log_probas.view(
                self.M, -1, log_probas.shape[-1]
            )
            #print(log_probas.shape)
            
            log_probas = torch.mean(log_probas, dim=0)
            #print(baselines.shape)
            baselines = baselines.contiguous().view(
                self.M, -1, baselines.shape[-2], baselines.shape[-1]
            )
            #print(baselines.shape)
            baselines = torch.mean(baselines, dim=0)
            #print(baselines.shape)
            
            log_pi = log_pi.contiguous().view(
                self.M, -1, log_pi.shape[-1]
            )
            log_pi = torch.mean(log_pi, dim=0)
            
            y = y.type(torch.FloatTensor)
            if self.use_gpu:
                y =y.cuda()
            
            y_copy = y.clone()
            log_prob_det = log_probas.detach()
            log_prob_det[y!=y]=0
            y[y!=y]=0

            R = (log_prob_det.detach()-y).abs().float()
            R = R.unsqueeze(1)
            R = R.repeat(1, self.num_glimpses,1)
            
            adjusted_reward = R - baselines.detach()#32*7*29
            
            log_pi = log_pi.unsqueeze(2).repeat(1,1,29)#32*7*29
            loss_reinforce = torch.sum(-log_pi * adjusted_reward, dim=[1,2])#
            
            loss_reinforce = torch.mean(loss_reinforce, dim=0)
            
            loss_action = F.mse_loss(log_probas[~torch.isnan(y_copy)],y_copy[~torch.isnan(y_copy)])
            
            loss_baseline = F.mse_loss(baselines, R)
            
            loss = loss_reinforce+loss_action+loss_baseline
            
            
            acc = F.mse_loss(log_probas[~torch.isnan(y_copy)],y_copy[~torch.isnan(y_copy)])

            # store
            losses.update(loss.item(), x.size()[0])
            accs.update(acc.item(), x.size()[0])

            # log to tensorboard
            if self.use_tensorboard:
                iteration = epoch*len(self.valid_loader) + i
                log_value('valid_loss', losses.avg, iteration)
                log_value('valid_acc', accs.avg, iteration)

        return losses.avg, accs.avg

    def test(self):
        """
        Test the model on the held-out test data.
        This function should only be called at the very
        end once the model has finished training.
        """
        correct = 0

        # load the best checkpoint
        self.load_checkpoint(best=self.best)

        for i, (x, y) in enumerate(self.test_loader):
            if self.use_gpu:
                x, y = x.cuda(), y.cuda()
            x, y = Variable(x, volatile=True), Variable(y)

            # duplicate 10 times
            x = x.repeat(self.M, 1, 1, 1)

            # initialize location vector and hidden state
            self.batch_size = x.shape[0]
            h_t, l_t = self.reset()

            # extract the glimpses
            for t in range(self.num_glimpses - 1):
                # forward pass through model
                h_t, l_t, b_t, p = self.model(x, l_t, h_t)

            # last iteration
            h_t, l_t, b_t, log_probas, p = self.model(
                x, l_t, h_t, last=True
            )

            log_probas = log_probas.view(
                self.M, -1, log_probas.shape[-1]
            )
            log_probas = torch.mean(log_probas, dim=0)

            pred = log_probas.data.max(1, keepdim=True)[1]
            correct += pred.eq(y.data.view_as(pred)).cpu().sum()

        perc = (100. * correct) / (self.num_test)
        error = 100 - perc
        print(
            '[*] Test Acc: {}/{} ({:.2f}% - {:.2f}%)'.format(
                correct, self.num_test, perc, error)
        )

    def save_checkpoint(self, state, is_best):
        """
        Save a copy of the model so that it can be loaded at a future
        date. This function is used when the model is being evaluated
        on the test data.

        If this model has reached the best validation accuracy thus
        far, a seperate file with the suffix `best` is created.
        """
        # print("[*] Saving model to {}".format(self.ckpt_dir))

        filename = self.model_name + '_ckpt.pth.tar'
        ckpt_path = os.path.join(self.ckpt_dir, filename)
        torch.save(state, ckpt_path)

        if is_best:
            filename = self.model_name + '_model_best.pth.tar'
            shutil.copyfile(
                ckpt_path, os.path.join(self.ckpt_dir, filename)
            )

    def load_checkpoint(self, best=False):
        """
        Load the best copy of a model. This is useful for 2 cases:

        - Resuming training with the most recent model checkpoint.
        - Loading the best validation model to evaluate on the test data.

        Params
        ------
        - best: if set to True, loads the best model. Use this if you want
          to evaluate your model on the test data. Else, set to False in
          which case the most recent version of the checkpoint is used.
        """
        print("[*] Loading model from {}".format(self.ckpt_dir))

        filename = self.model_name + '_ckpt.pth.tar'
        if best:
            filename = self.model_name + '_model_best.pth.tar'
        ckpt_path = os.path.join(self.ckpt_dir, filename)
        ckpt = torch.load(ckpt_path)

        # load variables from checkpoint
        self.start_epoch = ckpt['epoch']
        self.best_valid_acc = ckpt['best_valid_acc']
        self.model.load_state_dict(ckpt['model_state'])
        self.optimizer.load_state_dict(ckpt['optim_state'])

        if best:
            print(
                "[*] Loaded {} checkpoint @ epoch {} "
                "with best valid acc of {:.3f}".format(
                    filename, ckpt['epoch'], ckpt['best_valid_acc'])
            )
        else:
            print(
                "[*] Loaded {} checkpoint @ epoch {}".format(
                    filename, ckpt['epoch'])
            )


In [163]:
#Params of model
class Config(object):
      def __init__(self):
        #size of extracted patch at highest res
        self.patch_size =64

        #Scale of successive patches
        self.glimpse_scale = 2

        # # of downscaled patches per glimpse
        self.num_patches = 2

        #hidden size of loc fc
        self.loc_hidden = 256

        #hidden size of glimpse fc
        self.glimpse_hidden = 256

        # core network params
        ## of glimpses, i.e. BPTT iterations
        self.num_glimpses = 7

        #hidden size of rnn  
        self.hidden_size = 512

        # reinforce params

        #gaussian policy standard deviation
        self.std = 0.17 #0.17
        #Monte Carlo sampling for valid and test sets
        self.M = 10

        # data params
        #Proportion of training set used for validation
        self.valid_size=0.1

        ## of images in each batch of data
        self.batch_size = 32

        ## of subprocesses to use for data loading
        self.num_workers = 4

        #Whether to shuffle the train and valid indices
        self.shuffle = True

        #Whether to visualize a sample grid of the data
        self.show_sample=False

        # training params

        #Whether to train or test the model
        self.is_train = True

        #Whether to train or test the model
        self.momentum=0.5

        ## of epochs to train for
        self.epochs = 500

        #Initial learning rate value
        self.init_lr = 3e-4

        #Number of epochs to wait before reducing lr
        self.lr_patience = 100

        #Number of epochs to wait before stopping train
        self.train_patience = 100

        # other params
        #Whether to run on the GPU
        self.use_gpu = True

        #Load best model or most recent for testing
        self.best = False
        #Seed to ensure reproducibility
        self.random_seed = 1

        #Directory in which data is stored
        self.data_dir = './data'

        #Directory in which to save model checkpoints
        self.ckpt_dir = './ckpt'
        #Directory in which Tensorboard logs wil be stored
        self.logs_dir='./logs/'
        #Whether to use tensorboard for visualization
        self.use_tensorboard= True
        #Whether to resume training from checkpoint
        self.resume = False

        #How frequently to print training details
        self.print_freq = 10
        #How frequently to plot glimpses
        self.plot_freq = 1

config = Config()

In [169]:
#Main cell
import torch

#from trainer import Trainer
#from config import get_config
#from utils import prepare_dirs, save_config
#from my_data_loader import get_test_loader, get_train_valid_loader


# ensure directories are setup
prepare_dirs(config)

# ensure reproducibility
torch.manual_seed(config.random_seed)
kwargs = {}
if config.use_gpu:
    torch.cuda.manual_seed(config.random_seed)
    kwargs = {'num_workers': 1, 'pin_memory': True}

# instantiate data loaders
if config.is_train:
    data_loader = get_train_valid_loader(
        config.data_dir, config.batch_size,
        config.random_seed, config.valid_size,
        config.shuffle, config.show_sample, **kwargs
    )
else:
    data_loader = get_test_loader(
        config.data_dir, config.batch_size, **kwargs
    )

# instantiate trainer
print ("Cuda testing")
if (torch.cuda.is_available()):
    print("Ura")
trainer = Trainer(config, data_loader)

# either train
if config.is_train:
    save_config(config)
    trainer.train()

# or load a pretrained model and test
else:
    trainer.test()


'''if __name__ == '__main__':
    config, unparsed = get_config()
    main(config)'''


  0%|          | 0/78357 [00:00<?, ?it/s]

aggregate_tox.csv
Cuda testing
Ura
[*] Saving tensorboard logs to ./logs/ram_7_64x64_2
512
[*] Number of model parameters: 3,442,748
[*] Model Checkpoint Dir: ./ckpt
[*] Param Path: ./ckpt/ram_7_64x64_2_params.json

[*] Train on 78357 samples, validate on 8706 samples

Epoch: 1/500 - LR: 0.000300


188.9s - loss: 27.471 - acc: 0.564: 100%|██████████| 78357/78357 [03:08<00:00, 477.40it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 7.574 - train acc: 1.123 - val loss: 37.973 - val acc: 0.659

Epoch: 2/500 - LR: 0.000300


187.9s - loss: 34.995 - acc: 0.597: 100%|██████████| 78357/78357 [03:07<00:00, 394.26it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.883 - train acc: 0.771 - val loss: 38.589 - val acc: 0.639

Epoch: 3/500 - LR: 0.000300


188.5s - loss: -5.247 - acc: 0.355: 100%|██████████| 78357/78357 [03:08<00:00, 475.06it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 1.011 - train acc: 0.735 - val loss: -5.886 - val acc: 0.617

Epoch: 4/500 - LR: 0.000300


183.7s - loss: -5.154 - acc: 0.611: 100%|██████████| 78357/78357 [03:03<00:00, 400.88it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.703 - train acc: 0.714 - val loss: -3.578 - val acc: 0.950

Epoch: 5/500 - LR: 0.000300


186.2s - loss: 0.388 - acc: 0.436: 100%|██████████| 78357/78357 [03:06<00:00, 509.51it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.708 - train acc: 0.693 - val loss: -0.108 - val acc: 0.614

Epoch: 6/500 - LR: 0.000300


176.7s - loss: 1.464 - acc: 0.439: 100%|██████████| 78357/78357 [02:56<00:00, 443.33it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.667 - train acc: 0.674 - val loss: 1.013 - val acc: 0.624

Epoch: 7/500 - LR: 0.000300


175.2s - loss: 1.712 - acc: 0.503: 100%|██████████| 78357/78357 [02:55<00:00, 447.00it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.363 - train acc: 0.650 - val loss: 0.253 - val acc: 0.618

Epoch: 8/500 - LR: 0.000300


166.9s - loss: 2.197 - acc: 0.670: 100%|██████████| 78357/78357 [02:46<00:00, 448.98it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.406 - train acc: 0.645 - val loss: 0.657 - val acc: 0.615

Epoch: 9/500 - LR: 0.000300


169.2s - loss: -3.102 - acc: 0.828: 100%|██████████| 78357/78357 [02:49<00:00, 450.61it/s]
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.685 - train acc: 0.639 - val loss: 0.562 - val acc: 0.611

Epoch: 10/500 - LR: 0.000300


168.0s - loss: 0.018 - acc: 1.184: 100%|██████████| 78357/78357 [02:48<00:00, 446.04it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.662 - train acc: 0.639 - val loss: 0.628 - val acc: 0.610

Epoch: 11/500 - LR: 0.000300


161.9s - loss: 2.121 - acc: 0.353: 100%|██████████| 78357/78357 [02:41<00:00, 483.93it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.672 - train acc: 0.637 - val loss: 0.532 - val acc: 0.615

Epoch: 12/500 - LR: 0.000300


170.0s - loss: 1.680 - acc: 0.635: 100%|██████████| 78357/78357 [02:50<00:00, 448.12it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.649 - train acc: 0.638 - val loss: 0.797 - val acc: 0.602

Epoch: 13/500 - LR: 0.000300


172.0s - loss: 1.135 - acc: 0.313: 100%|██████████| 78357/78357 [02:51<00:00, 427.21it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.701 - train acc: 0.638 - val loss: 0.667 - val acc: 0.612

Epoch: 14/500 - LR: 0.000300


167.9s - loss: 1.598 - acc: 0.595: 100%|██████████| 78357/78357 [02:47<00:00, 466.46it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.659 - train acc: 0.635 - val loss: 0.781 - val acc: 0.610

Epoch: 15/500 - LR: 0.000300


159.0s - loss: 1.610 - acc: 0.648: 100%|██████████| 78357/78357 [02:38<00:00, 450.81it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.699 - train acc: 0.635 - val loss: 0.432 - val acc: 0.611

Epoch: 16/500 - LR: 0.000300


169.0s - loss: 2.002 - acc: 0.442: 100%|██████████| 78357/78357 [02:48<00:00, 472.93it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.658 - train acc: 0.637 - val loss: 0.706 - val acc: 0.604

Epoch: 17/500 - LR: 0.000300


168.4s - loss: 1.869 - acc: 0.431: 100%|██████████| 78357/78357 [02:48<00:00, 417.73it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.674 - train acc: 0.633 - val loss: 0.717 - val acc: 0.600

Epoch: 18/500 - LR: 0.000300


173.0s - loss: 1.580 - acc: 0.736: 100%|██████████| 78357/78357 [02:53<00:00, 447.66it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.678 - train acc: 0.634 - val loss: 0.567 - val acc: 0.604

Epoch: 19/500 - LR: 0.000300


166.0s - loss: 1.595 - acc: 0.613: 100%|██████████| 78357/78357 [02:45<00:00, 441.74it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.637 - train acc: 0.632 - val loss: 0.787 - val acc: 0.609

Epoch: 20/500 - LR: 0.000300


167.7s - loss: -1.795 - acc: 0.422: 100%|██████████| 78357/78357 [02:47<00:00, 442.38it/s]
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.686 - train acc: 0.634 - val loss: 0.561 - val acc: 0.605

Epoch: 21/500 - LR: 0.000300


168.2s - loss: 0.643 - acc: 0.577: 100%|██████████| 78357/78357 [02:48<00:00, 451.82it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.667 - train acc: 0.633 - val loss: 0.552 - val acc: 0.610

Epoch: 22/500 - LR: 0.000300


162.7s - loss: 0.049 - acc: 0.503: 100%|██████████| 78357/78357 [02:42<00:00, 481.42it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.622 - train acc: 0.631 - val loss: 0.743 - val acc: 0.609

Epoch: 23/500 - LR: 0.000300


169.9s - loss: -0.584 - acc: 0.914: 100%|██████████| 78357/78357 [02:49<00:00, 461.11it/s]
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.646 - train acc: 0.634 - val loss: 0.751 - val acc: 0.611

Epoch: 24/500 - LR: 0.000300


165.1s - loss: 0.467 - acc: 0.645: 100%|██████████| 78357/78357 [02:45<00:00, 474.35it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.683 - train acc: 0.631 - val loss: 0.619 - val acc: 0.608

Epoch: 25/500 - LR: 0.000300


166.5s - loss: 1.026 - acc: 0.437: 100%|██████████| 78357/78357 [02:46<00:00, 450.64it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.673 - train acc: 0.633 - val loss: 0.618 - val acc: 0.614

Epoch: 26/500 - LR: 0.000300


166.2s - loss: 2.742 - acc: 0.226: 100%|██████████| 78357/78357 [02:46<00:00, 443.09it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.652 - train acc: 0.633 - val loss: 0.618 - val acc: 0.611

Epoch: 27/500 - LR: 0.000300


169.0s - loss: 2.421 - acc: 0.401: 100%|██████████| 78357/78357 [02:49<00:00, 452.02it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.640 - train acc: 0.632 - val loss: 0.718 - val acc: 0.606

Epoch: 28/500 - LR: 0.000300


169.9s - loss: 2.164 - acc: 0.304: 100%|██████████| 78357/78357 [02:49<00:00, 461.01it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.653 - train acc: 0.631 - val loss: 0.620 - val acc: 0.607

Epoch: 29/500 - LR: 0.000300


170.4s - loss: 0.951 - acc: 0.524: 100%|██████████| 78357/78357 [02:50<00:00, 448.18it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.659 - train acc: 0.630 - val loss: 0.570 - val acc: 0.606

Epoch: 30/500 - LR: 0.000300


162.5s - loss: 2.298 - acc: 0.327: 100%|██████████| 78357/78357 [02:42<00:00, 448.37it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.668 - train acc: 0.632 - val loss: 0.514 - val acc: 0.613

Epoch: 31/500 - LR: 0.000300


169.5s - loss: -3.465 - acc: 1.238: 100%|██████████| 78357/78357 [02:49<00:00, 441.44it/s]
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.648 - train acc: 0.633 - val loss: 0.693 - val acc: 0.607

Epoch: 32/500 - LR: 0.000300


167.9s - loss: 1.698 - acc: 0.463: 100%|██████████| 78357/78357 [02:47<00:00, 466.46it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.656 - train acc: 0.632 - val loss: 0.737 - val acc: 0.604

Epoch: 33/500 - LR: 0.000300


172.4s - loss: 0.980 - acc: 0.486: 100%|██████████| 78357/78357 [02:52<00:00, 461.74it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.670 - train acc: 0.630 - val loss: 0.588 - val acc: 0.607

Epoch: 34/500 - LR: 0.000300


170.5s - loss: 1.962 - acc: 0.761: 100%|██████████| 78357/78357 [02:50<00:00, 450.11it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.663 - train acc: 0.632 - val loss: 0.606 - val acc: 0.613

Epoch: 35/500 - LR: 0.000300


171.4s - loss: 0.379 - acc: 0.483: 100%|██████████| 78357/78357 [02:51<00:00, 447.37it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.653 - train acc: 0.631 - val loss: 0.691 - val acc: 0.608

Epoch: 36/500 - LR: 0.000300


169.7s - loss: -0.779 - acc: 0.486: 100%|██████████| 78357/78357 [02:49<00:00, 461.68it/s]
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.676 - train acc: 0.632 - val loss: 0.569 - val acc: 0.608

Epoch: 37/500 - LR: 0.000300


170.7s - loss: 0.558 - acc: 0.418: 100%|██████████| 78357/78357 [02:50<00:00, 434.76it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.657 - train acc: 0.633 - val loss: 0.696 - val acc: 0.607

Epoch: 38/500 - LR: 0.000300


171.0s - loss: 0.952 - acc: 0.565: 100%|██████████| 78357/78357 [02:51<00:00, 443.25it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.687 - train acc: 0.633 - val loss: 0.705 - val acc: 0.600

Epoch: 39/500 - LR: 0.000300


169.0s - loss: 0.965 - acc: 0.305: 100%|██████████| 78357/78357 [02:49<00:00, 450.59it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.645 - train acc: 0.632 - val loss: 0.506 - val acc: 0.602

Epoch: 40/500 - LR: 0.000300


165.5s - loss: 1.401 - acc: 0.338: 100%|██████████| 78357/78357 [02:45<00:00, 447.10it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.663 - train acc: 0.630 - val loss: 0.730 - val acc: 0.611

Epoch: 41/500 - LR: 0.000300


167.7s - loss: 2.368 - acc: 0.379: 100%|██████████| 78357/78357 [02:47<00:00, 439.99it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.668 - train acc: 0.633 - val loss: 0.759 - val acc: 0.613

Epoch: 42/500 - LR: 0.000300


168.1s - loss: 2.148 - acc: 0.316: 100%|██████████| 78357/78357 [02:48<00:00, 446.11it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.677 - train acc: 0.632 - val loss: 0.456 - val acc: 0.605

Epoch: 43/500 - LR: 0.000300


165.2s - loss: 1.565 - acc: 0.357: 100%|██████████| 78357/78357 [02:45<00:00, 436.95it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.659 - train acc: 0.629 - val loss: 0.585 - val acc: 0.610

Epoch: 44/500 - LR: 0.000300


173.6s - loss: 1.513 - acc: 0.337: 100%|██████████| 78357/78357 [02:53<00:00, 450.25it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.660 - train acc: 0.633 - val loss: 0.683 - val acc: 0.607

Epoch: 45/500 - LR: 0.000300


169.5s - loss: 1.177 - acc: 0.821: 100%|██████████| 78357/78357 [02:49<00:00, 431.30it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.652 - train acc: 0.629 - val loss: 0.566 - val acc: 0.609

Epoch: 46/500 - LR: 0.000300


169.5s - loss: 0.524 - acc: 0.905: 100%|██████████| 78357/78357 [02:49<00:00, 445.78it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.666 - train acc: 0.630 - val loss: 0.783 - val acc: 0.615

Epoch: 47/500 - LR: 0.000300


170.6s - loss: 1.177 - acc: 0.225: 100%|██████████| 78357/78357 [02:50<00:00, 460.75it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.673 - train acc: 0.631 - val loss: 0.565 - val acc: 0.606

Epoch: 48/500 - LR: 0.000300


168.5s - loss: 0.446 - acc: 0.434: 100%|██████████| 78357/78357 [02:48<00:00, 448.40it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.662 - train acc: 0.630 - val loss: 0.660 - val acc: 0.608

Epoch: 49/500 - LR: 0.000300


167.3s - loss: 0.965 - acc: 0.332: 100%|██████████| 78357/78357 [02:47<00:00, 449.31it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.668 - train acc: 0.633 - val loss: 0.733 - val acc: 0.609

Epoch: 50/500 - LR: 0.000300


168.3s - loss: 1.532 - acc: 0.803: 100%|██████████| 78357/78357 [02:48<00:00, 437.91it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.654 - train acc: 0.628 - val loss: 0.577 - val acc: 0.611

Epoch: 51/500 - LR: 0.000300


168.4s - loss: 2.147 - acc: 0.343: 100%|██████████| 78357/78357 [02:48<00:00, 465.21it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.661 - train acc: 0.630 - val loss: 0.619 - val acc: 0.606

Epoch: 52/500 - LR: 0.000300


166.0s - loss: 2.266 - acc: 0.343: 100%|██████████| 78357/78357 [02:46<00:00, 448.78it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.648 - train acc: 0.628 - val loss: 0.792 - val acc: 0.604

Epoch: 53/500 - LR: 0.000300


163.9s - loss: 2.110 - acc: 0.375: 100%|██████████| 78357/78357 [02:43<00:00, 449.91it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.657 - train acc: 0.630 - val loss: 0.627 - val acc: 0.610

Epoch: 54/500 - LR: 0.000300


168.9s - loss: 2.397 - acc: 0.316: 100%|██████████| 78357/78357 [02:48<00:00, 446.25it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.638 - train acc: 0.632 - val loss: 0.764 - val acc: 0.603

Epoch: 55/500 - LR: 0.000300


169.4s - loss: 1.572 - acc: 0.328: 100%|██████████| 78357/78357 [02:49<00:00, 440.94it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.687 - train acc: 0.630 - val loss: 0.553 - val acc: 0.609

Epoch: 56/500 - LR: 0.000300


168.3s - loss: 1.214 - acc: 0.313: 100%|██████████| 78357/78357 [02:48<00:00, 449.67it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.651 - train acc: 0.630 - val loss: 0.609 - val acc: 0.609

Epoch: 57/500 - LR: 0.000300


167.4s - loss: 0.737 - acc: 0.618: 100%|██████████| 78357/78357 [02:47<00:00, 468.07it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.665 - train acc: 0.631 - val loss: 0.690 - val acc: 0.617

Epoch: 58/500 - LR: 0.000300


168.1s - loss: 2.473 - acc: 0.284: 100%|██████████| 78357/78357 [02:48<00:00, 445.93it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.659 - train acc: 0.629 - val loss: 0.605 - val acc: 0.606

Epoch: 59/500 - LR: 0.000300


170.4s - loss: -0.539 - acc: 0.549: 100%|██████████| 78357/78357 [02:50<00:00, 451.94it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.656 - train acc: 0.630 - val loss: 0.594 - val acc: 0.602

Epoch: 60/500 - LR: 0.000300


169.6s - loss: 1.724 - acc: 0.588: 100%|██████████| 78357/78357 [02:49<00:00, 461.75it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.672 - train acc: 0.632 - val loss: 0.626 - val acc: 0.609

Epoch: 61/500 - LR: 0.000300


166.6s - loss: -5.460 - acc: 1.738: 100%|██████████| 78357/78357 [02:46<00:00, 447.52it/s]
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.660 - train acc: 0.629 - val loss: 0.661 - val acc: 0.610

Epoch: 62/500 - LR: 0.000300


170.4s - loss: -0.696 - acc: 0.534: 100%|██████████| 78357/78357 [02:50<00:00, 442.07it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.665 - train acc: 0.631 - val loss: 0.521 - val acc: 0.606

Epoch: 63/500 - LR: 0.000300


166.3s - loss: 2.836 - acc: 0.846: 100%|██████████| 78357/78357 [02:46<00:00, 446.40it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.641 - train acc: 0.629 - val loss: 0.906 - val acc: 0.609

Epoch: 64/500 - LR: 0.000300


166.4s - loss: -1.128 - acc: 0.291: 100%|██████████| 78357/78357 [02:46<00:00, 422.71it/s]
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.674 - train acc: 0.629 - val loss: 0.406 - val acc: 0.606

Epoch: 65/500 - LR: 0.000300


165.6s - loss: -0.834 - acc: 0.682: 100%|██████████| 78357/78357 [02:45<00:00, 446.82it/s]
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.655 - train acc: 0.630 - val loss: 0.706 - val acc: 0.605

Epoch: 66/500 - LR: 0.000300


168.0s - loss: 1.502 - acc: 0.287: 100%|██████████| 78357/78357 [02:48<00:00, 424.55it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.660 - train acc: 0.630 - val loss: 0.642 - val acc: 0.607

Epoch: 67/500 - LR: 0.000300


162.8s - loss: 0.741 - acc: 0.561: 100%|██████████| 78357/78357 [02:42<00:00, 481.19it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.635 - train acc: 0.632 - val loss: 0.718 - val acc: 0.605

Epoch: 68/500 - LR: 0.000300


170.3s - loss: -0.102 - acc: 1.085: 100%|██████████| 78357/78357 [02:50<00:00, 450.79it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.650 - train acc: 0.630 - val loss: 0.699 - val acc: 0.605

Epoch: 69/500 - LR: 0.000300


168.1s - loss: 1.590 - acc: 0.275: 100%|██████████| 78357/78357 [02:48<00:00, 447.01it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.670 - train acc: 0.631 - val loss: 0.576 - val acc: 0.602

Epoch: 70/500 - LR: 0.000300


171.1s - loss: 1.733 - acc: 0.245: 100%|██████████| 78357/78357 [02:51<00:00, 424.69it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.659 - train acc: 0.630 - val loss: 0.726 - val acc: 0.612

Epoch: 71/500 - LR: 0.000300


169.8s - loss: -1.576 - acc: 1.365: 100%|██████████| 78357/78357 [02:49<00:00, 453.99it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.653 - train acc: 0.629 - val loss: 0.682 - val acc: 0.612

Epoch: 72/500 - LR: 0.000300


170.9s - loss: 0.889 - acc: 0.544: 100%|██████████| 78357/78357 [02:50<00:00, 398.36it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.669 - train acc: 0.628 - val loss: 0.630 - val acc: 0.606

Epoch: 73/500 - LR: 0.000300


161.4s - loss: 1.043 - acc: 0.588: 100%|██████████| 78357/78357 [02:41<00:00, 485.40it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.685 - train acc: 0.631 - val loss: 0.706 - val acc: 0.611

Epoch: 74/500 - LR: 0.000300


169.1s - loss: 0.823 - acc: 0.921: 100%|██████████| 78357/78357 [02:49<00:00, 463.11it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.678 - train acc: 0.630 - val loss: 0.685 - val acc: 0.603

Epoch: 75/500 - LR: 0.000300


169.5s - loss: -0.211 - acc: 0.909: 100%|██████████| 78357/78357 [02:49<00:00, 456.93it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.653 - train acc: 0.629 - val loss: 0.617 - val acc: 0.598

Epoch: 76/500 - LR: 0.000300


169.1s - loss: 0.885 - acc: 0.414: 100%|██████████| 78357/78357 [02:49<00:00, 449.87it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.649 - train acc: 0.629 - val loss: 0.780 - val acc: 0.609

Epoch: 77/500 - LR: 0.000300


170.3s - loss: 0.948 - acc: 0.675: 100%|██████████| 78357/78357 [02:50<00:00, 463.32it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.658 - train acc: 0.629 - val loss: 0.571 - val acc: 0.606

Epoch: 78/500 - LR: 0.000300


169.4s - loss: 0.797 - acc: 0.507: 100%|██████████| 78357/78357 [02:49<00:00, 446.18it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.643 - train acc: 0.628 - val loss: 0.653 - val acc: 0.605

Epoch: 79/500 - LR: 0.000300


172.5s - loss: -1.411 - acc: 0.651: 100%|██████████| 78357/78357 [02:52<00:00, 446.75it/s]
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.660 - train acc: 0.628 - val loss: 0.575 - val acc: 0.603

Epoch: 80/500 - LR: 0.000300


170.3s - loss: -0.058 - acc: 0.384: 100%|██████████| 78357/78357 [02:50<00:00, 435.34it/s]
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.665 - train acc: 0.630 - val loss: 0.577 - val acc: 0.605

Epoch: 81/500 - LR: 0.000300


167.6s - loss: 1.839 - acc: 0.240: 100%|██████████| 78357/78357 [02:47<00:00, 454.53it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.639 - train acc: 0.629 - val loss: 0.877 - val acc: 0.606

Epoch: 82/500 - LR: 0.000300


167.7s - loss: 1.084 - acc: 0.426: 100%|██████████| 78357/78357 [02:47<00:00, 467.19it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.683 - train acc: 0.631 - val loss: 0.740 - val acc: 0.603

Epoch: 83/500 - LR: 0.000300


165.0s - loss: 2.823 - acc: 0.470: 100%|██████████| 78357/78357 [02:44<00:00, 437.07it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.634 - train acc: 0.628 - val loss: 0.712 - val acc: 0.607

Epoch: 84/500 - LR: 0.000300


165.3s - loss: 1.638 - acc: 0.347: 100%|██████████| 78357/78357 [02:45<00:00, 446.22it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.647 - train acc: 0.629 - val loss: 0.526 - val acc: 0.603

Epoch: 85/500 - LR: 0.000300


169.9s - loss: 1.815 - acc: 0.503: 100%|██████████| 78357/78357 [02:49<00:00, 461.62it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.632 - train acc: 0.626 - val loss: 0.634 - val acc: 0.606

Epoch: 86/500 - LR: 0.000300


164.4s - loss: -7.107 - acc: 2.015: 100%|██████████| 78357/78357 [02:44<00:00, 448.78it/s]
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.658 - train acc: 0.626 - val loss: 0.623 - val acc: 0.607

Epoch: 87/500 - LR: 0.000300


164.9s - loss: 0.655 - acc: 0.368: 100%|██████████| 78357/78357 [02:44<00:00, 448.36it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.673 - train acc: 0.627 - val loss: 0.612 - val acc: 0.611

Epoch: 88/500 - LR: 0.000300


159.8s - loss: 1.286 - acc: 0.244: 100%|██████████| 78357/78357 [02:39<00:00, 490.24it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.659 - train acc: 0.628 - val loss: 0.643 - val acc: 0.608

Epoch: 89/500 - LR: 0.000300


170.7s - loss: 1.290 - acc: 1.686: 100%|██████████| 78357/78357 [02:50<00:00, 448.61it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.652 - train acc: 0.628 - val loss: 0.689 - val acc: 0.616

Epoch: 90/500 - LR: 0.000300


171.1s - loss: 0.513 - acc: 0.513: 100%|██████████| 78357/78357 [02:51<00:00, 447.53it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.666 - train acc: 0.629 - val loss: 0.622 - val acc: 0.607

Epoch: 91/500 - LR: 0.000300


167.8s - loss: -0.429 - acc: 0.770: 100%|██████████| 78357/78357 [02:47<00:00, 437.70it/s]
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.661 - train acc: 0.627 - val loss: 0.569 - val acc: 0.607

Epoch: 92/500 - LR: 0.000300


168.2s - loss: 1.814 - acc: 0.742: 100%|██████████| 78357/78357 [02:48<00:00, 434.24it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.653 - train acc: 0.628 - val loss: 0.685 - val acc: 0.610

Epoch: 93/500 - LR: 0.000300


170.8s - loss: 1.654 - acc: 0.206: 100%|██████████| 78357/78357 [02:50<00:00, 446.03it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.655 - train acc: 0.631 - val loss: 0.794 - val acc: 0.606

Epoch: 94/500 - LR: 0.000300


169.9s - loss: 1.779 - acc: 0.578: 100%|██████████| 78357/78357 [02:49<00:00, 445.11it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.676 - train acc: 0.629 - val loss: 0.641 - val acc: 0.603

Epoch: 95/500 - LR: 0.000300


171.4s - loss: -0.642 - acc: 0.530: 100%|██████████| 78357/78357 [02:51<00:00, 447.94it/s]
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.635 - train acc: 0.629 - val loss: 0.842 - val acc: 0.612

Epoch: 96/500 - LR: 0.000300


169.3s - loss: 0.687 - acc: 0.265: 100%|██████████| 78357/78357 [02:49<00:00, 447.47it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.657 - train acc: 0.629 - val loss: 0.634 - val acc: 0.603

Epoch: 97/500 - LR: 0.000300


173.2s - loss: 0.268 - acc: 0.465: 100%|██████████| 78357/78357 [02:53<00:00, 440.47it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.633 - train acc: 0.629 - val loss: 0.870 - val acc: 0.610

Epoch: 98/500 - LR: 0.000300


164.0s - loss: -1.432 - acc: 0.755: 100%|██████████| 78357/78357 [02:44<00:00, 447.14it/s] 
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.669 - train acc: 0.627 - val loss: 0.729 - val acc: 0.614

Epoch: 99/500 - LR: 0.000300


171.3s - loss: -0.423 - acc: 0.504: 100%|██████████| 78357/78357 [02:51<00:00, 457.35it/s]
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.646 - train acc: 0.630 - val loss: 0.651 - val acc: 0.614

Epoch: 100/500 - LR: 0.000300


168.4s - loss: 1.264 - acc: 0.587: 100%|██████████| 78357/78357 [02:48<00:00, 465.02it/s]  
  0%|          | 0/78357 [00:00<?, ?it/s]

train loss: 0.690 - train acc: 0.627 - val loss: 0.628 - val acc: 0.604

Epoch: 101/500 - LR: 0.000300


168.8s - loss: 2.077 - acc: 0.327: 100%|██████████| 78357/78357 [02:48<00:00, 447.55it/s] 


train loss: 0.678 - train acc: 0.626 - val loss: 0.570 - val acc: 0.604
[!] No improvement in a while, stopping training.


"if __name__ == '__main__':\n    config, unparsed = get_config()\n    main(config)"