# YoloV3 Implementation 

### Imports

In [1]:
import torch
import torch.nn as nn
import numpy as np
import random 
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import torch.optim as opt
from torch.utils.data import DataLoader
from collections import Counter
from tqdm import tqdm 

## Configuration:

In [2]:
config = [
    (32, 3, 1),
    (64, 3, 2),
    ["B", 1],
    (128, 3, 2),
    ["B", 2],
    (256, 3, 2),
    ["B", 8],
    (512, 3, 2),
    ["B", 8],
    (1024, 3, 2),
    ["B", 4],  # To this point is Darknet-53
    (512, 1, 1),
    (1024, 3, 1),
    "S",
    (256, 1, 1),
    "U",
    (256, 1, 1),
    (512, 3, 1),
    "S",
    (128, 1, 1),
    "U",
    (128, 1, 1),
    (256, 3, 1),
    "S",
]

In [3]:
PASCAL_CLASSES = [
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "pottedplant",
    "sheep",
    "sofa",
    "train",
    "tvmonitor"
]


In [4]:
import albumentations as A
import cv2
import torch

from albumentations.pytorch import ToTensorV2

DATASET = 'PASCAL_VOC'
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# seed_everything()  # If you want deterministic behavior
NUM_WORKERS = 2
BATCH_SIZE = 32
IMAGE_SIZE = 416
NUM_CLASSES = 20
LEARNING_RATE = 1e-5
WEIGHT_DECAY = 1e-4
NUM_EPOCHS = 3
CONF_THRESHOLD = 0.05
MAP_IOU_THRESH = 0.5
NMS_IOU_THRESH = 0.45
S = [IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8]
PIN_MEMORY = True
LOAD_MODEL = False
SAVE_MODEL = True
CHECKPOINT_FILE = "checkpoint.pth.tar"
IMG_DIR = "../input/pascalvoc-yolo/images"
LABEL_DIR = "../input/pascalvoc-yolo/labels"

ANCHORS = [
    [(0.28, 0.22), (0.38, 0.48), (0.9, 0.78)],
    [(0.07, 0.15), (0.15, 0.11), (0.14, 0.29)],
    [(0.02, 0.03), (0.04, 0.07), (0.08, 0.06)],
]  # Note these have been rescaled to be between [0, 1]


scale = 1.1
train_transforms = A.Compose(
    [
        A.LongestMaxSize(max_size=int(IMAGE_SIZE * scale)),
        A.PadIfNeeded(
            min_height=int(IMAGE_SIZE * scale),
            min_width=int(IMAGE_SIZE * scale),
            border_mode=cv2.BORDER_CONSTANT,
        ),
        A.RandomCrop(width=IMAGE_SIZE, height=IMAGE_SIZE),
        A.ColorJitter(brightness=0.6, contrast=0.6, saturation=0.6, hue=0.6, p=0.4),
        A.OneOf(
            [
                A.ShiftScaleRotate(
                    rotate_limit=20, p=0.5, border_mode=cv2.BORDER_CONSTANT
                ),
                A.IAAAffine(shear=15, p=0.5, mode="constant"),
            ],
            p=1.0,
        ),
        A.HorizontalFlip(p=0.5),
        A.Blur(p=0.1),
        A.CLAHE(p=0.1),
        A.Posterize(p=0.1),
        A.ToGray(p=0.1),
        A.ChannelShuffle(p=0.05),
        A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255,),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format="yolo", min_visibility=0.4, label_fields=[],),
)
test_transforms = A.Compose(
    [
        A.LongestMaxSize(max_size=IMAGE_SIZE),
        A.PadIfNeeded(
            min_height=IMAGE_SIZE, min_width=IMAGE_SIZE, border_mode=cv2.BORDER_CONSTANT
        ),
        A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255,),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format="yolo", min_visibility=0.4, label_fields=[]),
)



## Models
YoloV3 is developed on the darknet-59 a 59 layer convolutional network. 
<br>Here is the archetecture of the used network:
![Network Image](https://www.researchgate.net/publication/343692784/figure/fig1/AS:932813947809793@1599411584103/Simplified-layer-architecture-of-You-Only-Look-Once-YOLOv3-network.png)

In [5]:
class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, bn_act=True, **kwargs):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=not bn_act, **kwargs)
        self.bn_use = bn_act
        self.bn = nn.BatchNorm2d(out_channels)
        self.leaky = nn.LeakyReLU(0.1)
        
    def forward(self, x):
        if self.bn_use:
            return self.leaky(self.bn(self.conv(x)))
        
        else:
            return self.conv(x)

In [6]:
class ResidualBlock(nn.Module):
    def __init__(self, channels, use_residual=True, num_repeats=1):
        super().__init__()
        self.layers = nn.ModuleList()
        for repeat in range(num_repeats):
            self.layers += [
                nn.Sequential(
                    CNNBlock(channels, channels // 2, kernel_size=1),
                    CNNBlock(channels // 2, channels, kernel_size=3, padding=1),
                )
            ]

        self.use_residual = use_residual
        self.num_repeats = num_repeats

    def forward(self, x):
        for layer in self.layers:
            if self.use_residual:
                x = x + layer(x)
            else:
                x = layer(x)

        return x


In [7]:
class ScalePrediction(nn.Module):
    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.pred = nn.Sequential(
            CNNBlock(in_channels, 2 * in_channels, kernel_size=3, padding=1),
            CNNBlock(
                2 * in_channels, (num_classes + 5) * 3, bn_act=False, kernel_size=1
            ),
        )
        self.num_classes = num_classes

    def forward(self, x):
        return (
            self.pred(x)
            .reshape(x.shape[0], 3, self.num_classes + 5, x.shape[2], x.shape[3])
            .permute(0, 1, 3, 4, 2)
        )

    

In [8]:
class YOLOv3(nn.Module):
    def __init__(self, in_channels=3, num_classes=80):
        super().__init__()
        self.num_classes = num_classes
        self.in_channels = in_channels
        self.layers = self._create_conv_layers()

    def forward(self, x):
        outputs = []  # for each scale
        route_connections = []
        for layer in self.layers:
            if isinstance(layer, ScalePrediction):
                outputs.append(layer(x))
                continue

            x = layer(x)

            if isinstance(layer, ResidualBlock) and layer.num_repeats == 8:
                route_connections.append(x)

            elif isinstance(layer, nn.Upsample):
                x = torch.cat([x, route_connections[-1]], dim=1)
                route_connections.pop()

        return outputs

    def _create_conv_layers(self):
        layers = nn.ModuleList()
        in_channels = self.in_channels

        for module in config:
            if isinstance(module, tuple):
                out_channels, kernel_size, stride = module
                layers.append(
                    CNNBlock(
                        in_channels,
                        out_channels,
                        kernel_size=kernel_size,
                        stride=stride,
                        padding=1 if kernel_size == 3 else 0,
                    )
                )
                in_channels = out_channels

            elif isinstance(module, list):
                num_repeats = module[1]
                layers.append(ResidualBlock(in_channels, num_repeats=num_repeats,))

            elif isinstance(module, str):
                if module == "S":
                    layers += [
                        ResidualBlock(in_channels, use_residual=False, num_repeats=1),
                        CNNBlock(in_channels, in_channels // 2, kernel_size=1),
                        ScalePrediction(in_channels // 2, num_classes=self.num_classes),
                    ]
                    in_channels = in_channels // 2

                elif module == "U":
                    layers.append(nn.Upsample(scale_factor=2),)
                    in_channels = in_channels * 3

        return layers


In [9]:
if __name__ == "__main__":
    num_classes = 20
    IMAGE_SIZE = 416
    model = YOLOv3(num_classes=num_classes)
    x = torch.randn((2, 3, IMAGE_SIZE, IMAGE_SIZE))
    out = model(x)
    assert model(x)[0].shape == (2, 3, IMAGE_SIZE//32, IMAGE_SIZE//32, num_classes + 5)
    assert model(x)[1].shape == (2, 3, IMAGE_SIZE//16, IMAGE_SIZE//16, num_classes + 5)
    assert model(x)[2].shape == (2, 3, IMAGE_SIZE//8, IMAGE_SIZE//8, num_classes + 5)
    print("Success!")

Success!


## Utils
This part contains the utility functions like intersection over union iou_widht_height, Mean_average_precision etc.

In [10]:
def iou_width_height(boxes1, boxes2):
    intersection = torch.min(boxes1[..., 0], boxes2[..., 0]) * torch.min(boxes1[..., 1], boxes2[..., 1])
    
    union = torch.max(boxes1[..., 0], boxes2[..., 0]) * torch.max(boxes1[..., 1], boxes2[..., 1])
    
    iou = intersection / union
    
    return iou

In [11]:
def intersection_over_union(boxes, labels, box_format='midpoint'):
    if box_format == 'midpoint':
        b1x1 = boxes[..., 0:1] - boxes[..., 2:3]/2
        b1y1 = boxes[..., 1:2] - boxes[..., 3:4]/2
        b1x2 = boxes[..., 0:1] + boxes[..., 2:3]/2
        b1y2 = boxes[..., 1:2] + boxes[..., 3:4]/2
        b2x1 = labels[..., 0:1] - labels[..., 2:3]/2
        b2y1 = labels[..., 1:2] - labels[..., 3:4]/2
        b2x2 = labels[..., 0:1] + labels[..., 2:3]/2
        b2y2 = labels[..., 1:2] + labels[..., 3:4]/2
        
    elif box_format == 'corners':
        b1x1 = boxes[..., 0:1]
        b1y1 = boxes[..., 1:2]
        b1x2 = boxes[..., 2:3]
        b1y2 = boxes[..., 3:4]
        b2x1 = labels[..., 0:1]
        b2y1 = labels[..., 1:2]
        b2x2 = labels[..., 2:3]
        b2y2 = labels[..., 3:4]
        
    x1 = torch.max(b1x1, b2x1)
    y1 = torch.max(b1y1, b2y1)
    x2 = torch.min(b1x2, b2x2)
    y2 = torch.min(b1y2, b2y2)
    
    intersection = (x2-x1).clamp(0) * (y2 - y1).clamp(0)
    
    box1_area = abs((b1x2 - b1x1) * (b1y2 - b1y1))
    box2_area = abs((b2x2 - b2x1) * (b2y2 - b2y1))
    
    return intersection / (box1_area + box2_area - intersection + 1e-5)


In [12]:
def non_max_suppression(bboxes, iou_threshold, threshold, box_format='midpoint'):
    assert type(bboxes) == list
    bboxes = [box for box in bboxes if box[1] > threshold]
    bboxes = sorted(bboxes, key=lambda x: x[1], reverse=True)
    bboxes_nms = []
    
    while bboxes:
        chosen_box = bboxes.pop(0)
        bboxes = [
            box for box in bboxes
            if box[0] != chosen_box[0]
            or intersection_over_union(torch.tensor(chosen_box[2:]), torch.tensor(box[2:]), box_format=box_format)
            < iou_threshold
        ]
        
        bboxes_nms.append(chosen_box)

    return bboxes_nms

In [13]:
def mean_average_precision(predictions, true_images, iou_threshold, box_format='midpoint', num_classes=20):
    average_precision = []
    epsilon = 1e-6
    for c in range(num_classes):
        detections = []
        ground_truth = []
        
        for detection in predictions:
            if detection[1] == c:
                detections.append(detection)
                
        for label in true_images: 
            if label[1] == c:
                ground_truth.append(label)
                
        amount_boxes = Counter(gt[0] for gt in ground_truth)
        for key, value in amount_boxes.item():
            amount_boxes[key] = torch.zeros(value)
            
        detections = sorted(detection, key=lambda x: x[0], reverse=True)
        
        TP = torch.zeros(len(detections))
        FP = torch.zeros(len(detections))        
        
        total_ground_truth = len(ground_truth)
        
        if total_ground_truth == 0:
            continue
            
        for detection_idx, detection in enumerate(detections):
            ground_truth_img = [bbox for bbox in ground_truth if bbox[0] == detection[0]]
            num_gts = len(ground_truth_img)
            best_iou = 0
            
            for idx, box in enumerate(ground_truth_img):
                iou = intersection_over_union(torch.tensor(box[3:]),torch.tensor(detection[3:]), box_format=box_format)
                
                if iou > best_iou:
                    best_iou = iou
                    index_best = iou
                    
            if best_iou > iou_threshold:
                if amount_boxes[detection[0]][index_best] ==  0:
                    TP[detection_idx] = 1
                    amount_boxes[detection[0]][index_best] = 1
                    
                else: 
                    FP[detection_idx] = 1
                    
            else: 
                FP[detection_idx] = 1
                
        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = TP_cumsum / (TP_cumsum + FP_cumsum + epsilon)
        precisions = torch.cat((torch.tensor([1]), precisions))
        recalls = torch.cat((torch.tensor([0]), recalls))
        # torch.trapz for numerical integration
        average_precisions.append(torch.trapz(precisions, recalls))

    return sum(average_precisions) / len(average_precisions)
            
            
            
            

In [14]:
def plot_image(image, boxes):
    """Plots predicted bounding boxes on the image"""
    cmap = plt.get_cmap("tab20b")
    class_labels = COCO_LABELS if DATASET=='COCO' else PASCAL_CLASSES
    colors = [cmap(i) for i in np.linspace(0, 1, len(class_labels))]
    im = np.array(image)
    height, width, _ = im.shape

    # Create figure and axes
    fig, ax = plt.subplots(1)
    # Display the image
    ax.imshow(im)

    # box[0] is x midpoint, box[2] is width
    # box[1] is y midpoint, box[3] is height

    # Create a Rectangle patch
    for box in boxes:
        assert len(box) == 6, "box should contain class pred, confidence, x, y, width, height"
        class_pred = box[0]
        box = box[2:]
        upper_left_x = box[0] - box[2] / 2
        upper_left_y = box[1] - box[3] / 2
        rect = patches.Rectangle(
            (upper_left_x * width, upper_left_y * height),
            box[2] * width,
            box[3] * height,
            linewidth=2,
            edgecolor=colors[int(class_pred)],
            facecolor="none",
        )
        # Add the patch to the Axes
        ax.add_patch(rect)
        plt.text(
            upper_left_x * width,
            upper_left_y * height,
            s=class_labels[int(class_pred)],
            color="white",
            verticalalignment="top",
            bbox={"color": colors[int(class_pred)], "pad": 0},
        )

    plt.show()

In [15]:
def get_evaluation_bboxes(
    loader,
    model,
    iou_threshold,
    anchors,
    threshold,
    box_format="midpoint",
    device="cuda",
):
    # make sure model is in eval before get bboxes
    model.eval()
    train_idx = 0
    all_pred_boxes = []
    all_true_boxes = []
    for batch_idx, (x, labels) in enumerate(tqdm(loader)):
        x = x.to(device)

        with torch.no_grad():
            predictions = model(x)

        batch_size = x.shape[0]
        bboxes = [[] for _ in range(batch_size)]
        for i in range(3):
            S = predictions[i].shape[2]
            anchor = torch.tensor([*anchors[i]]).to(device) * S
            boxes_scale_i = cells_to_bboxes(
                predictions[i], anchor, S=S, is_preds=True
            )
            for idx, (box) in enumerate(boxes_scale_i):
                bboxes[idx] += box

        # we just want one bbox for each label, not one for each scale
        true_bboxes = cells_to_bboxes(
            labels[2], anchor, S=S, is_preds=False
        )

        for idx in range(batch_size):
            nms_boxes = non_max_suppression(
                bboxes[idx],
                iou_threshold=iou_threshold,
                threshold=threshold,
                box_format=box_format,
            )

            for nms_box in nms_boxes:
                all_pred_boxes.append([train_idx] + nms_box)

            for box in true_bboxes[idx]:
                if box[1] > threshold:
                    all_true_boxes.append([train_idx] + box)

            train_idx += 1

    model.train()
    return all_pred_boxes, all_true_boxes

In [16]:
def cells_to_bboxes(predictions, anchors, S, is_preds=True):
    """
    Scales the predictions coming from the model to
    be relative to the entire image such that they for example later
    can be plotted or.
    INPUT:
    predictions: tensor of size (N, 3, S, S, num_classes+5)
    anchors: the anchors used for the predictions
    S: the number of cells the image is divided in on the width (and height)
    is_preds: whether the input is predictions or the true bounding boxes
    OUTPUT:
    converted_bboxes: the converted boxes of sizes (N, num_anchors, S, S, 1+5) with class index,
                      object score, bounding box coordinates
    """
    BATCH_SIZE = predictions.shape[0]
    num_anchors = len(anchors)
    box_predictions = predictions[..., 1:5]
    if is_preds:
        anchors = anchors.reshape(1, len(anchors), 1, 1, 2)
        box_predictions[..., 0:2] = torch.sigmoid(box_predictions[..., 0:2])
        box_predictions[..., 2:] = torch.exp(box_predictions[..., 2:]) * anchors
        scores = torch.sigmoid(predictions[..., 0:1])
        best_class = torch.argmax(predictions[..., 5:], dim=-1).unsqueeze(-1)
    else:
        scores = predictions[..., 0:1]
        best_class = predictions[..., 5:6]

    cell_indices = (
        torch.arange(S)
        .repeat(predictions.shape[0], 3, S, 1)
        .unsqueeze(-1)
        .to(predictions.device)
    )
    x = 1 / S * (box_predictions[..., 0:1] + cell_indices)
    y = 1 / S * (box_predictions[..., 1:2] + cell_indices.permute(0, 1, 3, 2, 4))
    w_h = 1 / S * box_predictions[..., 2:4]
    converted_bboxes = torch.cat((best_class, scores, x, y, w_h), dim=-1).reshape(BATCH_SIZE, num_anchors * S * S, 6)
    return converted_bboxes.tolist()

In [17]:
def check_class_accuracy(model, loader, threshold):
    model.eval()
    tot_class_preds, correct_class = 0, 0
    tot_noobj, correct_noobj = 0, 0
    tot_obj, correct_obj = 0, 0

    for idx, (x, y) in enumerate(tqdm(loader)):
        x = x.to(DEVICE)
        with torch.no_grad():
            out = model(x)

        for i in range(3):
            y[i] = y[i].to(DEVICE)
            obj = y[i][..., 0] == 1 # in paper this is Iobj_i
            noobj = y[i][..., 0] == 0  # in paper this is Iobj_i

            correct_class += torch.sum(
                torch.argmax(out[i][..., 5:][obj], dim=-1) == y[i][..., 5][obj]
            )
            tot_class_preds += torch.sum(obj)

            obj_preds = torch.sigmoid(out[i][..., 0]) > threshold
            correct_obj += torch.sum(obj_preds[obj] == y[i][..., 0][obj])
            tot_obj += torch.sum(obj)
            correct_noobj += torch.sum(obj_preds[noobj] == y[i][..., 0][noobj])
            tot_noobj += torch.sum(noobj)

    print(f"Class accuracy is: {(correct_class/(tot_class_preds+1e-16))*100:2f}%")
    print(f"No obj accuracy is: {(correct_noobj/(tot_noobj+1e-16))*100:2f}%")
    print(f"Obj accuracy is: {(correct_obj/(tot_obj+1e-16))*100:2f}%")
    model.train()


In [18]:

def get_mean_std(loader):
    # var[X] = E[X**2] - E[X]**2
    channels_sum, channels_sqrd_sum, num_batches = 0, 0, 0

    for data, _ in tqdm(loader):
        channels_sum += torch.mean(data, dim=[0, 2, 3])
        channels_sqrd_sum += torch.mean(data ** 2, dim=[0, 2, 3])
        num_batches += 1

    mean = channels_sum / num_batches
    std = (channels_sqrd_sum / num_batches - mean ** 2) ** 0.5

    return mean, std


In [19]:
def save_checkpoint(model, optimizer, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    checkpoint = {
        "state_dict": model.state_dict(),
        "optimizer": optimizer.state_dict(),
    }
    torch.save(checkpoint, filename)

In [20]:
def load_checkpoint(checkpoint_file, model, optimizer, lr):
    print("=> Loading checkpoint")
    checkpoint = torch.load(checkpoint_file, map_location=DEVICE)
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

    # If we don't do this then it will just have learning rate of old checkpoint
    # and it will lead to many hours of debugging \:
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr

In [21]:
def get_loaders(train_csv_path, test_csv_path):

    IMAGE_SIZE = 416
    train_dataset = YOLODataset(
        train_csv_path,
        transform=train_transforms,
        S=[IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8],
        img_dir=IMG_DIR,
        label_dir=LABEL_DIR,
        anchors=ANCHORS,
    )
    test_dataset = YOLODataset(
        test_csv_path,
        transform=test_transforms,
        S=[IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8],
        img_dir=IMG_DIR,
        label_dir=LABEL_DIR,
        anchors=ANCHORS,
    )
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        shuffle=True,
        drop_last=False,
    )
    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        shuffle=False,
        drop_last=False,
    )

    train_eval_dataset = YOLODataset(
        train_csv_path,
        transform=test_transforms,
        S=[IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8],
        img_dir=IMG_DIR,
        label_dir=LABEL_DIR,
        anchors=ANCHORS,
    )
    train_eval_loader = DataLoader(
        dataset=train_eval_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        shuffle=False,
        drop_last=False,
    )

    return train_loader, test_loader, train_eval_loader


In [22]:
def plot_couple_examples(model, loader, thresh, iou_thresh, anchors):
    model.eval()
    x, y = next(iter(loader))
    x = x.to("cuda")
    print(x.device)
    with torch.no_grad():
        out = model(x)
        bboxes = [[] for _ in range(x.shape[0])]
        for i in range(3):
            batch_size, A, S, _, _ = out[i].shape
            anchor = anchors[i]
            boxes_scale_i = cells_to_bboxes(
                out[i], anchor, S=S, is_preds=True
            )
            for idx, (box) in enumerate(boxes_scale_i):
                bboxes[idx] += box

        model.train()

    for i in range(batch_size):
        nms_boxes = non_max_suppression(
            bboxes[i], iou_threshold=iou_thresh, threshold=thresh, box_format="midpoint",
        )
        plot_image(x[i].permute(1,2,0).detach().cpu(), nms_boxes)

In [23]:
def seed_everything(seed=42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

## Loss Function
We will use a custom loss fuction that used different loss and wight for different components like, class, no_obj, box, detection. 

In [24]:
class YoloLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        self.bce = nn.BCEWithLogitsLoss()
        self.entropy = nn.CrossEntropyLoss()
        self.sigmoid = nn.Sigmoid()
        
        self.lambda_class = 1
        self.lambda_obj = 1
        self.lambda_noobj = 10
        self.lambda_box = 10
        
    def forward(self, pred, target, anchors):
        obj = target[..., 0] == 1
        noobj = target[..., 0] == 0
        
        # No Object #
        no_obj_loss = self.bce(pred[..., 0:1][noobj], target[..., 0:1][noobj])
        
        # Object Loss #
        anchors = anchors.reshape(1, 3, 1, 1, 2)
        box_pred = torch.cat([self.sigmoid(pred[..., 1:3]), torch.exp(pred[..., 3:5]) * anchors], dim=-1)
        ious = intersection_over_union(box_pred[obj], target[..., 1:5][obj]).detach()
        object_loss = self.mse(self.sigmoid(pred[..., 0:1][obj]), ious * target[..., 0:1][obj])
        
        pred[..., 1:3] = self.sigmoid(pred[...,1:3])
        target[..., 3:5] = torch.log((1e-16 + target[..., 3:5]/ anchors))
        
        box_loss = self.mse(pred[..., 1:5][obj], target[..., 1:5][obj])
        
        
        class_loss = self.entropy(
            (pred[..., 5:][obj]), (target[..., 5][obj].long())
        )
        
        return (
            self.lambda_box * box_loss
            + self.lambda_obj * object_loss
            + self.lambda_noobj * no_obj_loss
            + self.lambda_class * class_loss
        )

In [25]:
data = pd.read_csv('../input/pascalvoc-yolo/100examples.csv')

In [26]:
data

Unnamed: 0,image,text
0,000007.jpg,000007.txt
1,000026.jpg,000026.txt
2,000032.jpg,000032.txt
3,000033.jpg,000033.txt
4,000034.jpg,000034.txt
...,...,...
98,000609.jpg,000609.txt
99,000612.jpg,000612.txt
100,000620.jpg,000620.txt
101,000622.jpg,000622.txt


In [27]:
from PIL import Image, ImageFile
from torch.utils.data import Dataset, DataLoader
import os

In [28]:
ImageFile.LOAD_TRUNCATED_IMAGES = True

class YOLODataset(Dataset):
    def __init__(
        self,
        csv_file,
        img_dir,
        label_dir,
        anchors,
        image_size=416,
        S=[13, 26, 52],
        C=20,
        transform=None,
    ):
        self.annotations = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.image_size = image_size
        self.transform = transform
        self.S = S
        self.anchors = torch.tensor(anchors[0] + anchors[1] + anchors[2])  # for all 3 scales
        self.num_anchors = self.anchors.shape[0]
        self.num_anchors_per_scale = self.num_anchors // 3
        self.C = C
        self.ignore_iou_thresh = 0.5

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        label_path = os.path.join(self.label_dir, self.annotations.iloc[index, 1])
        bboxes = np.roll(np.loadtxt(fname=label_path, delimiter=" ", ndmin=2), 4, axis=1).tolist()
        img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
        image = np.array(Image.open(img_path).convert("RGB"))

        if self.transform:
            augmentations = self.transform(image=image, bboxes=bboxes)
            image = augmentations["image"]
            bboxes = augmentations["bboxes"]

        # Below assumes 3 scale predictions (as paper) and same num of anchors per scale
        targets = [torch.zeros((self.num_anchors // 3, S, S, 6)) for S in self.S]
        for box in bboxes:
            iou_anchors = iou_width_height(torch.tensor(box[2:4]), self.anchors)
            anchor_indices = iou_anchors.argsort(descending=True, dim=0)
            x, y, width, height, class_label = box
            has_anchor = [False] * 3  # each scale should have one anchor
            for anchor_idx in anchor_indices:
                scale_idx = anchor_idx // self.num_anchors_per_scale
                anchor_on_scale = anchor_idx % self.num_anchors_per_scale
                S = self.S[scale_idx]
                i, j = int(S * y), int(S * x)  # which cell
                anchor_taken = targets[scale_idx][anchor_on_scale, i, j, 0]
                if not anchor_taken and not has_anchor[scale_idx]:
                    targets[scale_idx][anchor_on_scale, i, j, 0] = 1
                    x_cell, y_cell = S * x - j, S * y - i  # both between [0,1]
                    width_cell, height_cell = (
                        width * S,
                        height * S,
                    )  # can be greater than 1 since it's relative to cell
                    box_coordinates = torch.tensor(
                        [x_cell, y_cell, width_cell, height_cell]
                    )
                    targets[scale_idx][anchor_on_scale, i, j, 1:5] = box_coordinates
                    targets[scale_idx][anchor_on_scale, i, j, 5] = int(class_label)
                    has_anchor[scale_idx] = True

                elif not anchor_taken and iou_anchors[anchor_idx] > self.ignore_iou_thresh:
                    targets[scale_idx][anchor_on_scale, i, j, 0] = -1  # ignore prediction

        return image, tuple(targets)



In [29]:
def test():
    anchors = ANCHORS

    transform = test_transforms

    dataset = YOLODataset(
        "../input/pascalvoc-yolo/100examples.csv",
        IMG_DIR,
        LABEL_DIR,
        S=[13, 26, 52],
        anchors=anchors,
        transform=transform,
    )
    S = [13, 26, 52]
    scaled_anchors = torch.tensor(anchors) / (
        1 / torch.tensor(S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)
    )
    loader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)
    for x, y in loader:
        boxes = []

        for i in range(y[0].shape[1]):
            anchor = scaled_anchors[i]
            print(anchor.shape)
            print(y[i].shape)
            boxes += cells_to_bboxes(
                y[i], is_preds=False, S=y[i].shape[2], anchors=anchor
            )[0]
        boxes = non_max_suppression(boxes, iou_threshold=1, threshold=0.7, box_format="midpoint")
        print(boxes)
        plot_image(x[0].permute(1, 2, 0).to("cpu"), boxes)


# if __name__ == "__main__":
#     test()

In [30]:
import torch.optim as optim

In [31]:
dev = torch.device('cuda')

In [32]:
def train_fn(train_loader, model, optimizer, loss_fn, scaler, scaled_anchors):
    loop = tqdm(train_loader, leave=True)
    losses = []
    for batch_idx, (x, y) in enumerate(loop):
        x = x.to(DEVICE)
        y0, y1, y2 = (
            y[0].to(DEVICE),
            y[1].to(DEVICE),
            y[2].to(DEVICE),
        )
        print(x.device)
        with torch.cuda.amp.autocast():
            out = model(x)
            loss = (
                loss_fn(out[0], y0, scaled_anchors[0]) +
                loss_fn(out[1], y1, scaled_anchors[1]) + 
                loss_fn(out[2], y2, scaled_anchors[2])
            )
        losses.append(loss.item())
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # update progress bar
        mean_loss = sum(losses) / len(losses)
        loop.set_postfix(loss=mean_loss)

In [33]:
def main():
    model = YOLOv3(num_classes=NUM_CLASSES).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    loss_fn = YoloLoss()
    scaler = torch.cuda.amp.GradScaler()
    
    train_loader, test_loader, train_eval_loader = get_loaders(train_csv_path='../input/pascalvoc-yolo/train.csv', test_csv_path='../input/pascalvoc-yolo/test.csv')
    
    scaled_anchors = (
        torch.tensor(ANCHORS) * torch.tensor(S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)
    ).to(DEVICE)
    
    for epoch in range(NUM_EPOCHS):

        train_fn(train_loader, model, optimizer, loss_fn, scaler, scaled_anchors)

        if epoch > 0 and epoch % 3 == 0:
            check_class_accuracy(model, test_loader, threshold=CONF_THRESHOLD)
            pred_boxes, true_boxes = get_evaluation_bboxes(
                test_loader,
                model,
                iou_threshold=NMS_IOU_THRESH,
                anchors=ANCHORS,
                threshold=CONF_THRESHOLD,
            )
            mapval = mean_average_precision(
                pred_boxes,
                true_boxes,
                iou_threshold=MAP_IOU_THRESH,
                box_format="midpoint",
                num_classes=NUM_CLASSES,
            )
            print(f"MAP: {mapval.item()}")
            model.train()

In [34]:
if __name__ == "__main__":
    main()

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /usr/local/src/pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /usr/local/src/pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


cuda:0


  0%|          | 1/518 [00:08<1:17:10,  8.96s/it, loss=59.9]

cuda:0


  0%|          | 2/518 [00:09<35:20,  4.11s/it, loss=59.1]  

cuda:0


  1%|          | 3/518 [00:10<22:02,  2.57s/it, loss=60]

cuda:0


  1%|          | 4/518 [00:11<15:47,  1.84s/it, loss=61.6]

cuda:0


  1%|          | 5/518 [00:11<12:20,  1.44s/it, loss=60.9]

cuda:0


  1%|          | 6/518 [00:12<10:16,  1.20s/it, loss=61]

cuda:0


  1%|▏         | 7/518 [00:13<09:08,  1.07s/it, loss=61.3]

cuda:0


  2%|▏         | 8/518 [00:14<08:24,  1.01it/s, loss=61.4]

cuda:0


  2%|▏         | 9/518 [00:15<07:49,  1.08it/s, loss=61.2]

cuda:0


  2%|▏         | 10/518 [00:15<07:30,  1.13it/s, loss=60.8]

cuda:0


  2%|▏         | 11/518 [00:16<07:18,  1.16it/s, loss=60.7]

cuda:0


  2%|▏         | 12/518 [00:17<07:06,  1.19it/s, loss=60.3]

cuda:0


  3%|▎         | 13/518 [00:18<07:41,  1.10it/s, loss=60.4]

cuda:0


  3%|▎         | 14/518 [00:19<07:35,  1.11it/s, loss=60]

cuda:0


  3%|▎         | 15/518 [00:20<08:41,  1.04s/it, loss=59.7]

cuda:0


  3%|▎         | 16/518 [00:21<08:02,  1.04it/s, loss=59.7]

cuda:0


  3%|▎         | 17/518 [00:22<07:58,  1.05it/s, loss=59.6]

cuda:0


  3%|▎         | 18/518 [00:23<07:35,  1.10it/s, loss=59.4]

cuda:0


  4%|▎         | 19/518 [00:24<07:18,  1.14it/s, loss=59.4]

cuda:0


  4%|▍         | 20/518 [00:24<07:15,  1.14it/s, loss=59.2]

cuda:0


  4%|▍         | 21/518 [00:25<07:34,  1.09it/s, loss=59.3]

cuda:0


  4%|▍         | 22/518 [00:26<07:23,  1.12it/s, loss=59.4]

cuda:0


  4%|▍         | 23/518 [00:27<07:56,  1.04it/s, loss=59.3]

cuda:0


  5%|▍         | 24/518 [00:28<07:34,  1.09it/s, loss=59.2]

cuda:0


  5%|▍         | 25/518 [00:29<08:09,  1.01it/s, loss=58.8]

cuda:0


  5%|▌         | 26/518 [00:30<07:42,  1.06it/s, loss=58.7]

cuda:0


  5%|▌         | 27/518 [00:31<07:37,  1.07it/s, loss=58.5]

cuda:0


  5%|▌         | 28/518 [00:32<07:16,  1.12it/s, loss=58.5]

cuda:0


  6%|▌         | 29/518 [00:33<07:36,  1.07it/s, loss=58.2]

cuda:0


  6%|▌         | 30/518 [00:34<07:16,  1.12it/s, loss=58.1]

cuda:0


  6%|▌         | 31/518 [00:35<07:35,  1.07it/s, loss=57.9]

cuda:0


  6%|▌         | 32/518 [00:36<07:10,  1.13it/s, loss=57.9]

cuda:0


  6%|▋         | 33/518 [00:37<07:23,  1.09it/s, loss=57.8]

cuda:0


  7%|▋         | 34/518 [00:37<07:02,  1.14it/s, loss=57.6]

cuda:0


  7%|▋         | 35/518 [00:38<07:10,  1.12it/s, loss=57.6]

cuda:0


  7%|▋         | 36/518 [00:39<07:05,  1.13it/s, loss=57.4]

cuda:0


  7%|▋         | 37/518 [00:40<07:34,  1.06it/s, loss=57.3]

cuda:0


  7%|▋         | 38/518 [00:41<07:12,  1.11it/s, loss=57.2]

cuda:0


  8%|▊         | 39/518 [00:42<07:21,  1.09it/s, loss=57.1]

cuda:0


  8%|▊         | 40/518 [00:43<06:59,  1.14it/s, loss=57]

cuda:0


  8%|▊         | 41/518 [00:44<07:25,  1.07it/s, loss=56.9]

cuda:0


  8%|▊         | 42/518 [00:45<07:07,  1.11it/s, loss=56.9]

cuda:0


  8%|▊         | 43/518 [00:46<07:05,  1.12it/s, loss=56.7]

cuda:0


  8%|▊         | 44/518 [00:46<06:46,  1.17it/s, loss=56.8]

cuda:0


  9%|▊         | 45/518 [00:47<06:57,  1.13it/s, loss=56.7]

cuda:0


  9%|▉         | 46/518 [00:48<06:39,  1.18it/s, loss=56.6]

cuda:0


  9%|▉         | 47/518 [00:49<07:14,  1.08it/s, loss=56.4]

cuda:0


  9%|▉         | 48/518 [00:50<07:06,  1.10it/s, loss=56.3]

cuda:0


  9%|▉         | 49/518 [00:51<07:46,  1.01it/s, loss=56.2]

cuda:0


 10%|▉         | 50/518 [00:52<07:23,  1.05it/s, loss=56.2]

cuda:0


 10%|▉         | 51/518 [00:53<08:03,  1.04s/it, loss=56.1]

cuda:0


 10%|█         | 52/518 [00:54<07:29,  1.04it/s, loss=55.9]

cuda:0


 10%|█         | 53/518 [00:55<07:14,  1.07it/s, loss=55.9]

cuda:0


 10%|█         | 54/518 [00:56<06:53,  1.12it/s, loss=55.8]

cuda:0


 11%|█         | 55/518 [00:57<07:00,  1.10it/s, loss=55.7]

cuda:0


 11%|█         | 56/518 [00:57<06:51,  1.12it/s, loss=55.5]

cuda:0


 11%|█         | 57/518 [00:58<07:00,  1.10it/s, loss=55.4]

cuda:0


 11%|█         | 58/518 [00:59<06:46,  1.13it/s, loss=55.3]

cuda:0


 11%|█▏        | 59/518 [01:00<06:57,  1.10it/s, loss=55.3]

cuda:0


 12%|█▏        | 60/518 [01:01<06:42,  1.14it/s, loss=55.2]

cuda:0


 12%|█▏        | 61/518 [01:03<08:03,  1.06s/it, loss=55.1]

cuda:0


 12%|█▏        | 62/518 [01:03<07:29,  1.02it/s, loss=55]

cuda:0


 12%|█▏        | 63/518 [01:04<07:06,  1.07it/s, loss=54.9]

cuda:0


 12%|█▏        | 64/518 [01:05<06:40,  1.13it/s, loss=54.9]

cuda:0


 13%|█▎        | 65/518 [01:06<07:00,  1.08it/s, loss=54.8]

cuda:0


 13%|█▎        | 66/518 [01:07<06:44,  1.12it/s, loss=54.7]

cuda:0


 13%|█▎        | 67/518 [01:08<06:45,  1.11it/s, loss=54.6]

cuda:0


 13%|█▎        | 68/518 [01:08<06:25,  1.17it/s, loss=54.6]

cuda:0


 13%|█▎        | 69/518 [01:09<06:44,  1.11it/s, loss=54.5]

cuda:0


 14%|█▎        | 70/518 [01:10<06:25,  1.16it/s, loss=54.3]

cuda:0


 14%|█▎        | 71/518 [01:11<06:45,  1.10it/s, loss=54.2]

cuda:0


 14%|█▍        | 72/518 [01:12<06:27,  1.15it/s, loss=54.1]

cuda:0


 14%|█▍        | 73/518 [01:13<07:16,  1.02it/s, loss=54.1]

cuda:0


 14%|█▍        | 74/518 [01:14<06:49,  1.08it/s, loss=54]

cuda:0


 14%|█▍        | 75/518 [01:15<06:54,  1.07it/s, loss=53.9]

cuda:0


 15%|█▍        | 76/518 [01:16<06:42,  1.10it/s, loss=53.8]

cuda:0


 15%|█▍        | 77/518 [01:17<06:33,  1.12it/s, loss=53.7]

cuda:0


 15%|█▌        | 78/518 [01:18<06:24,  1.14it/s, loss=53.6]

cuda:0


 15%|█▌        | 79/518 [01:18<06:37,  1.10it/s, loss=53.6]

cuda:0


 15%|█▌        | 80/518 [01:19<06:24,  1.14it/s, loss=53.5]

cuda:0


 16%|█▌        | 81/518 [01:20<06:25,  1.13it/s, loss=53.4]

cuda:0


 16%|█▌        | 82/518 [01:21<06:12,  1.17it/s, loss=53.3]

cuda:0


 16%|█▌        | 83/518 [01:22<06:30,  1.11it/s, loss=53.2]

cuda:0


 16%|█▌        | 84/518 [01:23<06:22,  1.14it/s, loss=53.1]

cuda:0


 16%|█▋        | 85/518 [01:24<06:53,  1.05it/s, loss=53]

cuda:0


 17%|█▋        | 86/518 [01:25<07:02,  1.02it/s, loss=53]

cuda:0


 17%|█▋        | 87/518 [01:26<07:24,  1.03s/it, loss=52.9]

cuda:0


 17%|█▋        | 88/518 [01:27<06:56,  1.03it/s, loss=52.8]

cuda:0


 17%|█▋        | 89/518 [01:28<06:45,  1.06it/s, loss=52.7]

cuda:0


 17%|█▋        | 90/518 [01:29<06:25,  1.11it/s, loss=52.7]

cuda:0


 18%|█▊        | 91/518 [01:30<06:59,  1.02it/s, loss=52.6]

cuda:0


 18%|█▊        | 92/518 [01:31<06:31,  1.09it/s, loss=52.5]

cuda:0


 18%|█▊        | 93/518 [01:32<06:47,  1.04it/s, loss=52.4]

cuda:0


 18%|█▊        | 94/518 [01:32<06:23,  1.11it/s, loss=52.4]

cuda:0


 18%|█▊        | 95/518 [01:34<06:53,  1.02it/s, loss=52.2]

cuda:0


 19%|█▊        | 96/518 [01:35<06:48,  1.03it/s, loss=52.2]

cuda:0


 19%|█▊        | 97/518 [01:35<06:44,  1.04it/s, loss=52.1]

cuda:0


 19%|█▉        | 98/518 [01:36<06:21,  1.10it/s, loss=52]

cuda:0


 19%|█▉        | 99/518 [01:37<06:31,  1.07it/s, loss=51.9]

cuda:0


 19%|█▉        | 100/518 [01:38<06:08,  1.13it/s, loss=51.8]

cuda:0


 19%|█▉        | 101/518 [01:39<06:25,  1.08it/s, loss=51.7]

cuda:0


 20%|█▉        | 102/518 [01:40<06:04,  1.14it/s, loss=51.7]

cuda:0


 20%|█▉        | 103/518 [01:41<06:24,  1.08it/s, loss=51.6]

cuda:0


 20%|██        | 104/518 [01:42<06:04,  1.14it/s, loss=51.5]

cuda:0


 20%|██        | 105/518 [01:43<06:08,  1.12it/s, loss=51.4]

cuda:0


 20%|██        | 106/518 [01:43<05:54,  1.16it/s, loss=51.3]

cuda:0


 21%|██        | 107/518 [01:44<06:04,  1.13it/s, loss=51.3]

cuda:0


 21%|██        | 108/518 [01:45<06:01,  1.13it/s, loss=51.2]

cuda:0


 21%|██        | 109/518 [01:46<06:58,  1.02s/it, loss=51.1]

cuda:0


 21%|██        | 110/518 [01:47<06:24,  1.06it/s, loss=51]

cuda:0


 21%|██▏       | 111/518 [01:48<06:22,  1.06it/s, loss=50.9]

cuda:0


 22%|██▏       | 112/518 [01:49<05:59,  1.13it/s, loss=50.8]

cuda:0


 22%|██▏       | 113/518 [01:50<06:09,  1.10it/s, loss=50.8]

cuda:0


 22%|██▏       | 114/518 [01:51<05:54,  1.14it/s, loss=50.7]

cuda:0


 22%|██▏       | 115/518 [01:52<05:54,  1.14it/s, loss=50.6]

cuda:0


 22%|██▏       | 116/518 [01:52<05:43,  1.17it/s, loss=50.5]

cuda:0


 23%|██▎       | 117/518 [01:53<06:03,  1.10it/s, loss=50.4]

cuda:0


 23%|██▎       | 118/518 [01:54<05:47,  1.15it/s, loss=50.3]

cuda:0


 23%|██▎       | 119/518 [01:55<06:11,  1.07it/s, loss=50.2]

cuda:0


 23%|██▎       | 120/518 [01:56<05:58,  1.11it/s, loss=50.2]

cuda:0


 23%|██▎       | 121/518 [01:57<06:45,  1.02s/it, loss=50.1]

cuda:0


 24%|██▎       | 122/518 [01:58<06:29,  1.02it/s, loss=50]

cuda:0


 24%|██▎       | 123/518 [02:00<07:00,  1.07s/it, loss=49.9]

cuda:0


 24%|██▍       | 124/518 [02:00<06:26,  1.02it/s, loss=49.8]

cuda:0


 24%|██▍       | 125/518 [02:01<06:29,  1.01it/s, loss=49.7]

cuda:0


 24%|██▍       | 126/518 [02:02<06:04,  1.08it/s, loss=49.7]

cuda:0


 25%|██▍       | 127/518 [02:03<06:06,  1.07it/s, loss=49.6]

cuda:0


 25%|██▍       | 128/518 [02:04<05:48,  1.12it/s, loss=49.5]

cuda:0


 25%|██▍       | 129/518 [02:05<06:03,  1.07it/s, loss=49.4]

cuda:0


 25%|██▌       | 130/518 [02:06<05:45,  1.12it/s, loss=49.4]

cuda:0


 25%|██▌       | 131/518 [02:07<06:00,  1.07it/s, loss=49.3]

cuda:0


 25%|██▌       | 132/518 [02:08<05:44,  1.12it/s, loss=49.2]

cuda:0


 26%|██▌       | 133/518 [02:09<06:22,  1.01it/s, loss=49.2]

cuda:0


 26%|██▌       | 134/518 [02:10<05:54,  1.08it/s, loss=49.1]

cuda:0


 26%|██▌       | 135/518 [02:10<05:56,  1.07it/s, loss=49]

cuda:0


 26%|██▋       | 136/518 [02:11<05:37,  1.13it/s, loss=48.9]

cuda:0


 26%|██▋       | 137/518 [02:12<05:53,  1.08it/s, loss=48.8]

cuda:0


 27%|██▋       | 138/518 [02:13<05:39,  1.12it/s, loss=48.8]

cuda:0


 27%|██▋       | 139/518 [02:14<05:58,  1.06it/s, loss=48.7]

cuda:0


 27%|██▋       | 140/518 [02:15<05:40,  1.11it/s, loss=48.6]

cuda:0


 27%|██▋       | 141/518 [02:16<05:55,  1.06it/s, loss=48.5]

cuda:0


 27%|██▋       | 142/518 [02:17<05:35,  1.12it/s, loss=48.5]

cuda:0


 28%|██▊       | 143/518 [02:18<05:47,  1.08it/s, loss=48.4]

cuda:0


 28%|██▊       | 144/518 [02:19<05:39,  1.10it/s, loss=48.3]

cuda:0


 28%|██▊       | 145/518 [02:20<06:04,  1.02it/s, loss=48.3]

cuda:0


 28%|██▊       | 146/518 [02:21<05:44,  1.08it/s, loss=48.2]

cuda:0


 28%|██▊       | 147/518 [02:22<06:04,  1.02it/s, loss=48.1]

cuda:0


 29%|██▊       | 148/518 [02:22<05:40,  1.09it/s, loss=48.1]

cuda:0


 29%|██▉       | 149/518 [02:23<05:41,  1.08it/s, loss=48]

cuda:0


 29%|██▉       | 150/518 [02:24<05:23,  1.14it/s, loss=47.9]

cuda:0


 29%|██▉       | 151/518 [02:25<05:48,  1.05it/s, loss=47.8]

cuda:0


 29%|██▉       | 152/518 [02:26<05:26,  1.12it/s, loss=47.8]

cuda:0


 30%|██▉       | 153/518 [02:27<05:44,  1.06it/s, loss=47.7]

cuda:0


 30%|██▉       | 154/518 [02:28<05:25,  1.12it/s, loss=47.6]

cuda:0


 30%|██▉       | 155/518 [02:29<05:46,  1.05it/s, loss=47.6]

cuda:0


 30%|███       | 156/518 [02:30<05:41,  1.06it/s, loss=47.5]

cuda:0


 30%|███       | 157/518 [02:31<06:33,  1.09s/it, loss=47.4]

cuda:0


 31%|███       | 158/518 [02:32<06:01,  1.00s/it, loss=47.4]

cuda:0


 31%|███       | 159/518 [02:33<06:10,  1.03s/it, loss=47.3]

cuda:0


 31%|███       | 160/518 [02:34<05:51,  1.02it/s, loss=47.2]

cuda:0


 31%|███       | 161/518 [02:35<05:46,  1.03it/s, loss=47.2]

cuda:0


 31%|███▏      | 162/518 [02:36<05:25,  1.09it/s, loss=47.1]

cuda:0


 31%|███▏      | 163/518 [02:37<05:22,  1.10it/s, loss=47]

cuda:0


 32%|███▏      | 164/518 [02:38<05:11,  1.14it/s, loss=47]

cuda:0


 32%|███▏      | 165/518 [02:38<05:11,  1.13it/s, loss=46.9]

cuda:0


 32%|███▏      | 166/518 [02:39<05:01,  1.17it/s, loss=46.8]

cuda:0


 32%|███▏      | 167/518 [02:40<05:38,  1.04it/s, loss=46.8]

cuda:0


 32%|███▏      | 168/518 [02:41<05:29,  1.06it/s, loss=46.7]

cuda:0


 33%|███▎      | 169/518 [02:42<05:13,  1.11it/s, loss=46.6]

cuda:0


 33%|███▎      | 170/518 [02:43<04:59,  1.16it/s, loss=46.6]

cuda:0


 33%|███▎      | 171/518 [02:44<05:20,  1.08it/s, loss=46.5]

cuda:0


 33%|███▎      | 172/518 [02:45<05:03,  1.14it/s, loss=46.4]

cuda:0


 33%|███▎      | 173/518 [02:46<05:16,  1.09it/s, loss=46.4]

cuda:0


 34%|███▎      | 174/518 [02:47<05:06,  1.12it/s, loss=46.3]

cuda:0


 34%|███▍      | 175/518 [02:47<05:04,  1.13it/s, loss=46.2]

cuda:0


 34%|███▍      | 176/518 [02:48<04:51,  1.17it/s, loss=46.2]

cuda:0


 34%|███▍      | 177/518 [02:49<05:08,  1.11it/s, loss=46.1]

cuda:0


 34%|███▍      | 178/518 [02:50<04:58,  1.14it/s, loss=46]

cuda:0


 35%|███▍      | 179/518 [02:51<05:06,  1.10it/s, loss=46]

cuda:0


 35%|███▍      | 180/518 [02:52<04:59,  1.13it/s, loss=45.9]

cuda:0


 35%|███▍      | 181/518 [02:53<05:29,  1.02it/s, loss=45.9]

cuda:0


 35%|███▌      | 182/518 [02:54<05:07,  1.09it/s, loss=45.8]

cuda:0


 35%|███▌      | 183/518 [02:55<05:27,  1.02it/s, loss=45.7]

cuda:0


 36%|███▌      | 184/518 [02:56<05:06,  1.09it/s, loss=45.6]

cuda:0


 36%|███▌      | 185/518 [02:57<05:04,  1.09it/s, loss=45.6]

cuda:0


 36%|███▌      | 186/518 [02:57<04:49,  1.15it/s, loss=45.5]

cuda:0


 36%|███▌      | 187/518 [02:58<04:58,  1.11it/s, loss=45.4]

cuda:0


 36%|███▋      | 188/518 [02:59<04:48,  1.15it/s, loss=45.4]

cuda:0


 36%|███▋      | 189/518 [03:00<04:53,  1.12it/s, loss=45.3]

cuda:0


 37%|███▋      | 190/518 [03:01<04:42,  1.16it/s, loss=45.2]

cuda:0


 37%|███▋      | 191/518 [03:02<05:34,  1.02s/it, loss=45.2]

cuda:0


 37%|███▋      | 192/518 [03:03<05:10,  1.05it/s, loss=45.1]

cuda:0


 37%|███▋      | 193/518 [03:04<05:44,  1.06s/it, loss=45.1]

cuda:0


 37%|███▋      | 194/518 [03:05<05:17,  1.02it/s, loss=45]

cuda:0


 38%|███▊      | 195/518 [03:06<05:23,  1.00s/it, loss=44.9]

cuda:0


 38%|███▊      | 196/518 [03:07<04:59,  1.07it/s, loss=44.9]

cuda:0


 38%|███▊      | 197/518 [03:08<05:04,  1.05it/s, loss=44.8]

cuda:0


 38%|███▊      | 198/518 [03:09<04:47,  1.11it/s, loss=44.7]

cuda:0


 38%|███▊      | 199/518 [03:10<04:57,  1.07it/s, loss=44.7]

cuda:0


 39%|███▊      | 200/518 [03:11<04:41,  1.13it/s, loss=44.6]

cuda:0


 39%|███▉      | 201/518 [03:12<04:58,  1.06it/s, loss=44.6]

cuda:0


 39%|███▉      | 202/518 [03:12<04:46,  1.10it/s, loss=44.5]

cuda:0


 39%|███▉      | 203/518 [03:14<05:05,  1.03it/s, loss=44.5]

cuda:0


 39%|███▉      | 204/518 [03:14<04:48,  1.09it/s, loss=44.4]

cuda:0


 40%|███▉      | 205/518 [03:16<05:11,  1.01it/s, loss=44.3]

cuda:0


 40%|███▉      | 206/518 [03:16<04:52,  1.07it/s, loss=44.3]

cuda:0


 40%|███▉      | 207/518 [03:17<05:01,  1.03it/s, loss=44.2]

cuda:0


 40%|████      | 208/518 [03:18<04:44,  1.09it/s, loss=44.2]

cuda:0


 40%|████      | 209/518 [03:19<04:49,  1.07it/s, loss=44.1]

cuda:0


 41%|████      | 210/518 [03:20<04:33,  1.13it/s, loss=44]

cuda:0


 41%|████      | 211/518 [03:21<04:39,  1.10it/s, loss=44]

cuda:0


 41%|████      | 212/518 [03:22<04:26,  1.15it/s, loss=43.9]

cuda:0


 41%|████      | 213/518 [03:23<04:53,  1.04it/s, loss=43.9]

cuda:0


 41%|████▏     | 214/518 [03:24<04:39,  1.09it/s, loss=43.8]

cuda:0


 42%|████▏     | 215/518 [03:25<04:57,  1.02it/s, loss=43.8]

cuda:0


 42%|████▏     | 216/518 [03:26<04:38,  1.08it/s, loss=43.7]

cuda:0


 42%|████▏     | 217/518 [03:27<04:43,  1.06it/s, loss=43.7]

cuda:0


 42%|████▏     | 218/518 [03:27<04:30,  1.11it/s, loss=43.6]

cuda:0


 42%|████▏     | 219/518 [03:28<04:39,  1.07it/s, loss=43.5]

cuda:0


 42%|████▏     | 220/518 [03:29<04:24,  1.12it/s, loss=43.5]

cuda:0


 43%|████▎     | 221/518 [03:30<04:30,  1.10it/s, loss=43.4]

cuda:0


 43%|████▎     | 222/518 [03:31<04:18,  1.15it/s, loss=43.4]

cuda:0


 43%|████▎     | 223/518 [03:32<04:38,  1.06it/s, loss=43.3]

cuda:0


 43%|████▎     | 224/518 [03:33<04:22,  1.12it/s, loss=43.3]

cuda:0


 43%|████▎     | 225/518 [03:34<04:34,  1.07it/s, loss=43.2]

cuda:0


 44%|████▎     | 226/518 [03:35<04:21,  1.12it/s, loss=43.2]

cuda:0


 44%|████▍     | 227/518 [03:36<04:52,  1.00s/it, loss=43.1]

cuda:0


 44%|████▍     | 228/518 [03:37<04:43,  1.02it/s, loss=43.1]

cuda:0


 44%|████▍     | 229/518 [03:38<05:10,  1.08s/it, loss=43]

cuda:0


 44%|████▍     | 230/518 [03:39<04:42,  1.02it/s, loss=43]

cuda:0


 45%|████▍     | 231/518 [03:40<04:40,  1.02it/s, loss=42.9]

cuda:0


 45%|████▍     | 232/518 [03:41<04:20,  1.10it/s, loss=42.9]

cuda:0


 45%|████▍     | 233/518 [03:42<04:33,  1.04it/s, loss=42.8]

cuda:0


 45%|████▌     | 234/518 [03:43<04:17,  1.10it/s, loss=42.8]

cuda:0


 45%|████▌     | 235/518 [03:44<04:40,  1.01it/s, loss=42.7]

cuda:0


 46%|████▌     | 236/518 [03:45<04:26,  1.06it/s, loss=42.7]

cuda:0


 46%|████▌     | 237/518 [03:46<04:35,  1.02it/s, loss=42.6]

cuda:0


 46%|████▌     | 238/518 [03:46<04:19,  1.08it/s, loss=42.5]

cuda:0


 46%|████▌     | 239/518 [03:48<04:36,  1.01it/s, loss=42.5]

cuda:0


 46%|████▋     | 240/518 [03:48<04:19,  1.07it/s, loss=42.4]

cuda:0


 47%|████▋     | 241/518 [03:49<04:14,  1.09it/s, loss=42.4]

cuda:0


 47%|████▋     | 242/518 [03:50<04:02,  1.14it/s, loss=42.3]

cuda:0


 47%|████▋     | 243/518 [03:51<04:05,  1.12it/s, loss=42.3]

cuda:0


 47%|████▋     | 244/518 [03:52<03:54,  1.17it/s, loss=42.2]

cuda:0


 47%|████▋     | 245/518 [03:53<04:06,  1.11it/s, loss=42.2]

cuda:0


 47%|████▋     | 246/518 [03:54<03:56,  1.15it/s, loss=42.1]

cuda:0


 48%|████▊     | 247/518 [03:54<04:04,  1.11it/s, loss=42.1]

cuda:0


 48%|████▊     | 248/518 [03:55<03:54,  1.15it/s, loss=42]

cuda:0


 48%|████▊     | 249/518 [03:56<04:02,  1.11it/s, loss=42]

cuda:0


 48%|████▊     | 250/518 [03:57<03:59,  1.12it/s, loss=42]

cuda:0


 48%|████▊     | 251/518 [03:58<04:12,  1.06it/s, loss=41.9]

cuda:0


 49%|████▊     | 252/518 [03:59<04:00,  1.11it/s, loss=41.9]

cuda:0


 49%|████▉     | 253/518 [04:00<04:10,  1.06it/s, loss=41.8]

cuda:0


 49%|████▉     | 254/518 [04:01<03:56,  1.12it/s, loss=41.8]

cuda:0


 49%|████▉     | 255/518 [04:02<03:54,  1.12it/s, loss=41.7]

cuda:0


 49%|████▉     | 256/518 [04:03<03:45,  1.16it/s, loss=41.7]

cuda:0


 50%|████▉     | 257/518 [04:04<04:19,  1.01it/s, loss=41.6]

cuda:0


 50%|████▉     | 258/518 [04:05<04:01,  1.08it/s, loss=41.6]

cuda:0


 50%|█████     | 259/518 [04:06<04:02,  1.07it/s, loss=41.5]

cuda:0


 50%|█████     | 260/518 [04:06<03:49,  1.13it/s, loss=41.5]

cuda:0


 50%|█████     | 261/518 [04:07<04:09,  1.03it/s, loss=41.4]

cuda:0


 51%|█████     | 262/518 [04:08<04:00,  1.06it/s, loss=41.4]

cuda:0


 51%|█████     | 263/518 [04:09<04:11,  1.01it/s, loss=41.3]

cuda:0


 51%|█████     | 264/518 [04:10<03:58,  1.06it/s, loss=41.3]

cuda:0


 51%|█████     | 265/518 [04:12<04:32,  1.08s/it, loss=41.2]

cuda:0


 51%|█████▏    | 266/518 [04:12<04:11,  1.00it/s, loss=41.2]

cuda:0


 52%|█████▏    | 267/518 [04:13<04:06,  1.02it/s, loss=41.1]

cuda:0


 52%|█████▏    | 268/518 [04:14<03:54,  1.07it/s, loss=41.1]

cuda:0


 52%|█████▏    | 269/518 [04:15<03:48,  1.09it/s, loss=41]

cuda:0


 52%|█████▏    | 270/518 [04:16<03:37,  1.14it/s, loss=41]

cuda:0


 52%|█████▏    | 271/518 [04:17<03:42,  1.11it/s, loss=41]

cuda:0


 53%|█████▎    | 272/518 [04:18<03:36,  1.14it/s, loss=40.9]

cuda:0


 53%|█████▎    | 273/518 [04:19<03:47,  1.08it/s, loss=40.9]

cuda:0


 53%|█████▎    | 274/518 [04:20<03:40,  1.11it/s, loss=40.8]

cuda:0


 53%|█████▎    | 275/518 [04:21<04:03,  1.00s/it, loss=40.8]

cuda:0


 53%|█████▎    | 276/518 [04:22<03:46,  1.07it/s, loss=40.8]

cuda:0


 53%|█████▎    | 277/518 [04:23<03:48,  1.06it/s, loss=40.7]

cuda:0


 54%|█████▎    | 278/518 [04:23<03:33,  1.12it/s, loss=40.7]

cuda:0


 54%|█████▍    | 279/518 [04:24<03:42,  1.08it/s, loss=40.6]

cuda:0


 54%|█████▍    | 280/518 [04:25<03:34,  1.11it/s, loss=40.6]

cuda:0


 54%|█████▍    | 281/518 [04:26<03:34,  1.11it/s, loss=40.6]

cuda:0


 54%|█████▍    | 282/518 [04:27<03:23,  1.16it/s, loss=40.5]

cuda:0


 55%|█████▍    | 283/518 [04:28<03:35,  1.09it/s, loss=40.5]

cuda:0


 55%|█████▍    | 284/518 [04:29<03:25,  1.14it/s, loss=40.4]

cuda:0


 55%|█████▌    | 285/518 [04:30<03:35,  1.08it/s, loss=40.4]

cuda:0


 55%|█████▌    | 286/518 [04:31<03:28,  1.11it/s, loss=40.3]

cuda:0


 55%|█████▌    | 287/518 [04:32<03:43,  1.03it/s, loss=40.3]

cuda:0


 56%|█████▌    | 288/518 [04:32<03:29,  1.10it/s, loss=40.3]

cuda:0


 56%|█████▌    | 289/518 [04:33<03:33,  1.07it/s, loss=40.2]

cuda:0


 56%|█████▌    | 290/518 [04:34<03:27,  1.10it/s, loss=40.2]

cuda:0


 56%|█████▌    | 291/518 [04:35<03:33,  1.06it/s, loss=40.1]

cuda:0


 56%|█████▋    | 292/518 [04:36<03:21,  1.12it/s, loss=40.1]

cuda:0


 57%|█████▋    | 293/518 [04:37<03:32,  1.06it/s, loss=40.1]

cuda:0


 57%|█████▋    | 294/518 [04:38<03:20,  1.12it/s, loss=40]

cuda:0


 57%|█████▋    | 295/518 [04:39<03:25,  1.09it/s, loss=40]

cuda:0


 57%|█████▋    | 296/518 [04:40<03:14,  1.14it/s, loss=39.9]

cuda:0


 57%|█████▋    | 297/518 [04:41<03:24,  1.08it/s, loss=39.9]

cuda:0


 58%|█████▊    | 298/518 [04:42<03:13,  1.14it/s, loss=39.9]

cuda:0


 58%|█████▊    | 299/518 [04:43<04:10,  1.14s/it, loss=39.8]

cuda:0


 58%|█████▊    | 300/518 [04:44<03:46,  1.04s/it, loss=39.8]

cuda:0


 58%|█████▊    | 301/518 [04:45<03:54,  1.08s/it, loss=39.7]

cuda:0


 58%|█████▊    | 302/518 [04:46<03:34,  1.01it/s, loss=39.7]

cuda:0


 58%|█████▊    | 303/518 [04:47<03:36,  1.01s/it, loss=39.7]

cuda:0


 59%|█████▊    | 304/518 [04:48<03:21,  1.06it/s, loss=39.6]

cuda:0


 59%|█████▉    | 305/518 [04:49<03:24,  1.04it/s, loss=39.6]

cuda:0


 59%|█████▉    | 306/518 [04:50<03:12,  1.10it/s, loss=39.6]

cuda:0


 59%|█████▉    | 307/518 [04:51<03:20,  1.05it/s, loss=39.5]

cuda:0


 59%|█████▉    | 308/518 [04:52<03:12,  1.09it/s, loss=39.5]

cuda:0


 60%|█████▉    | 309/518 [04:53<03:14,  1.07it/s, loss=39.4]

cuda:0


 60%|█████▉    | 310/518 [04:53<03:05,  1.12it/s, loss=39.4]

cuda:0


 60%|██████    | 311/518 [04:54<03:21,  1.03it/s, loss=39.4]

cuda:0


 60%|██████    | 312/518 [04:55<03:09,  1.09it/s, loss=39.3]

cuda:0


 60%|██████    | 313/518 [04:56<03:07,  1.09it/s, loss=39.3]

cuda:0


 61%|██████    | 314/518 [04:57<02:57,  1.15it/s, loss=39.3]

cuda:0


 61%|██████    | 315/518 [04:58<03:09,  1.07it/s, loss=39.2]

cuda:0


 61%|██████    | 316/518 [04:59<02:59,  1.13it/s, loss=39.2]

cuda:0


 61%|██████    | 317/518 [05:00<03:03,  1.10it/s, loss=39.2]

cuda:0


 61%|██████▏   | 318/518 [05:01<02:56,  1.13it/s, loss=39.1]

cuda:0


 62%|██████▏   | 319/518 [05:01<02:57,  1.12it/s, loss=39.1]

cuda:0


 62%|██████▏   | 320/518 [05:02<02:53,  1.14it/s, loss=39]

cuda:0


 62%|██████▏   | 321/518 [05:03<02:59,  1.10it/s, loss=39]

cuda:0


 62%|██████▏   | 322/518 [05:04<02:56,  1.11it/s, loss=39]

cuda:0


 62%|██████▏   | 323/518 [05:05<03:15,  1.00s/it, loss=38.9]

cuda:0


 63%|██████▎   | 324/518 [05:06<03:02,  1.06it/s, loss=38.9]

cuda:0


 63%|██████▎   | 325/518 [05:07<03:07,  1.03it/s, loss=38.9]

cuda:0


 63%|██████▎   | 326/518 [05:08<02:55,  1.10it/s, loss=38.8]

cuda:0


 63%|██████▎   | 327/518 [05:09<03:07,  1.02it/s, loss=38.8]

cuda:0


 63%|██████▎   | 328/518 [05:10<02:55,  1.09it/s, loss=38.8]

cuda:0


 64%|██████▎   | 329/518 [05:11<02:59,  1.05it/s, loss=38.7]

cuda:0


 64%|██████▎   | 330/518 [05:12<02:48,  1.11it/s, loss=38.7]

cuda:0


 64%|██████▍   | 331/518 [05:13<02:57,  1.06it/s, loss=38.7]

cuda:0


 64%|██████▍   | 332/518 [05:14<02:47,  1.11it/s, loss=38.6]

cuda:0


 64%|██████▍   | 333/518 [05:15<03:13,  1.05s/it, loss=38.6]

cuda:0


 64%|██████▍   | 334/518 [05:16<03:05,  1.01s/it, loss=38.6]

cuda:0


 65%|██████▍   | 335/518 [05:17<03:10,  1.04s/it, loss=38.5]

cuda:0


 65%|██████▍   | 336/518 [05:18<02:55,  1.04it/s, loss=38.5]

cuda:0


 65%|██████▌   | 337/518 [05:19<02:50,  1.06it/s, loss=38.5]

cuda:0


 65%|██████▌   | 338/518 [05:20<02:40,  1.12it/s, loss=38.4]

cuda:0


 65%|██████▌   | 339/518 [05:21<02:48,  1.06it/s, loss=38.4]

cuda:0


 66%|██████▌   | 340/518 [05:21<02:38,  1.12it/s, loss=38.4]

cuda:0


 66%|██████▌   | 341/518 [05:22<02:44,  1.08it/s, loss=38.3]

cuda:0


 66%|██████▌   | 342/518 [05:23<02:36,  1.13it/s, loss=38.3]

cuda:0


 66%|██████▌   | 343/518 [05:24<02:42,  1.07it/s, loss=38.3]

cuda:0


 66%|██████▋   | 344/518 [05:25<02:38,  1.10it/s, loss=38.2]

cuda:0


 67%|██████▋   | 345/518 [05:26<02:56,  1.02s/it, loss=38.2]

cuda:0


 67%|██████▋   | 346/518 [05:27<02:42,  1.06it/s, loss=38.2]

cuda:0


 67%|██████▋   | 347/518 [05:28<02:42,  1.05it/s, loss=38.1]

cuda:0


 67%|██████▋   | 348/518 [05:29<02:35,  1.09it/s, loss=38.1]

cuda:0


 67%|██████▋   | 349/518 [05:30<02:34,  1.10it/s, loss=38.1]

cuda:0


 68%|██████▊   | 350/518 [05:31<02:26,  1.14it/s, loss=38.1]

cuda:0


 68%|██████▊   | 351/518 [05:31<02:26,  1.14it/s, loss=38]

cuda:0


 68%|██████▊   | 352/518 [05:32<02:20,  1.18it/s, loss=38]

cuda:0


 68%|██████▊   | 353/518 [05:33<02:34,  1.07it/s, loss=38]

cuda:0


 68%|██████▊   | 354/518 [05:34<02:26,  1.12it/s, loss=37.9]

cuda:0


 69%|██████▊   | 355/518 [05:35<02:32,  1.07it/s, loss=37.9]

cuda:0


 69%|██████▊   | 356/518 [05:36<02:26,  1.11it/s, loss=37.9]

cuda:0


 69%|██████▉   | 357/518 [05:37<02:46,  1.03s/it, loss=37.8]

cuda:0


 69%|██████▉   | 358/518 [05:38<02:33,  1.04it/s, loss=37.8]

cuda:0


 69%|██████▉   | 359/518 [05:39<02:42,  1.02s/it, loss=37.8]

cuda:0


 69%|██████▉   | 360/518 [05:40<02:33,  1.03it/s, loss=37.7]

cuda:0


 70%|██████▉   | 361/518 [05:41<02:28,  1.05it/s, loss=37.7]

cuda:0


 70%|██████▉   | 362/518 [05:42<02:24,  1.08it/s, loss=37.7]

cuda:0


 70%|███████   | 363/518 [05:43<02:22,  1.09it/s, loss=37.6]

cuda:0


 70%|███████   | 364/518 [05:44<02:13,  1.15it/s, loss=37.6]

cuda:0


 70%|███████   | 365/518 [05:45<02:21,  1.08it/s, loss=37.6]

cuda:0


 71%|███████   | 366/518 [05:45<02:17,  1.11it/s, loss=37.6]

cuda:0


 71%|███████   | 367/518 [05:47<02:24,  1.05it/s, loss=37.5]

cuda:0


 71%|███████   | 368/518 [05:47<02:15,  1.10it/s, loss=37.5]

cuda:0


 71%|███████   | 369/518 [05:48<02:20,  1.06it/s, loss=37.5]

cuda:0


 71%|███████▏  | 370/518 [05:49<02:12,  1.12it/s, loss=37.4]

cuda:0


 72%|███████▏  | 371/518 [05:50<02:19,  1.06it/s, loss=37.4]

cuda:0


 72%|███████▏  | 372/518 [05:51<02:11,  1.11it/s, loss=37.4]

cuda:0


 72%|███████▏  | 373/518 [05:52<02:19,  1.04it/s, loss=37.3]

cuda:0


 72%|███████▏  | 374/518 [05:53<02:11,  1.09it/s, loss=37.3]

cuda:0


 72%|███████▏  | 375/518 [05:54<02:16,  1.04it/s, loss=37.3]

cuda:0


 73%|███████▎  | 376/518 [05:55<02:11,  1.08it/s, loss=37.3]

cuda:0


 73%|███████▎  | 377/518 [05:56<02:23,  1.02s/it, loss=37.2]

cuda:0


 73%|███████▎  | 378/518 [05:57<02:12,  1.06it/s, loss=37.2]

cuda:0


 73%|███████▎  | 379/518 [05:58<02:17,  1.01it/s, loss=37.2]

cuda:0


 73%|███████▎  | 380/518 [05:59<02:09,  1.06it/s, loss=37.2]

cuda:0


 74%|███████▎  | 381/518 [06:00<02:14,  1.02it/s, loss=37.1]

cuda:0


 74%|███████▎  | 382/518 [06:01<02:05,  1.09it/s, loss=37.1]

cuda:0


 74%|███████▍  | 383/518 [06:02<02:05,  1.07it/s, loss=37.1]

cuda:0


 74%|███████▍  | 384/518 [06:02<01:58,  1.13it/s, loss=37]

cuda:0


 74%|███████▍  | 385/518 [06:03<02:04,  1.07it/s, loss=37]

cuda:0


 75%|███████▍  | 386/518 [06:04<01:57,  1.12it/s, loss=37]

cuda:0


 75%|███████▍  | 387/518 [06:05<02:02,  1.07it/s, loss=37]

cuda:0


 75%|███████▍  | 388/518 [06:06<01:56,  1.12it/s, loss=36.9]

cuda:0


 75%|███████▌  | 389/518 [06:07<01:53,  1.14it/s, loss=36.9]

cuda:0


 75%|███████▌  | 390/518 [06:08<01:48,  1.18it/s, loss=36.9]

cuda:0


 75%|███████▌  | 391/518 [06:09<01:55,  1.10it/s, loss=36.9]

cuda:0


 76%|███████▌  | 392/518 [06:10<01:52,  1.12it/s, loss=36.8]

cuda:0


 76%|███████▌  | 393/518 [06:11<02:09,  1.03s/it, loss=36.8]

cuda:0


 76%|███████▌  | 394/518 [06:12<01:58,  1.04it/s, loss=36.8]

cuda:0


 76%|███████▋  | 395/518 [06:13<01:57,  1.05it/s, loss=36.7]

cuda:0


 76%|███████▋  | 396/518 [06:13<01:51,  1.10it/s, loss=36.7]

cuda:0


 77%|███████▋  | 397/518 [06:14<01:48,  1.11it/s, loss=36.7]

cuda:0


 77%|███████▋  | 398/518 [06:15<01:43,  1.16it/s, loss=36.6]

cuda:0


 77%|███████▋  | 399/518 [06:16<01:45,  1.13it/s, loss=36.6]

cuda:0


 77%|███████▋  | 400/518 [06:17<01:40,  1.17it/s, loss=36.6]

cuda:0


 77%|███████▋  | 401/518 [06:18<01:46,  1.10it/s, loss=36.6]

cuda:0


 78%|███████▊  | 402/518 [06:19<01:42,  1.13it/s, loss=36.5]

cuda:0


 78%|███████▊  | 403/518 [06:20<01:41,  1.13it/s, loss=36.5]

cuda:0


 78%|███████▊  | 404/518 [06:20<01:38,  1.16it/s, loss=36.5]

cuda:0


 78%|███████▊  | 405/518 [06:22<01:47,  1.05it/s, loss=36.5]

cuda:0


 78%|███████▊  | 406/518 [06:22<01:41,  1.11it/s, loss=36.4]

cuda:0


 79%|███████▊  | 407/518 [06:23<01:39,  1.12it/s, loss=36.4]

cuda:0


 79%|███████▉  | 408/518 [06:24<01:34,  1.16it/s, loss=36.4]

cuda:0


 79%|███████▉  | 409/518 [06:25<01:35,  1.15it/s, loss=36.4]

cuda:0


 79%|███████▉  | 410/518 [06:26<01:30,  1.19it/s, loss=36.3]

cuda:0


 79%|███████▉  | 411/518 [06:27<01:46,  1.00it/s, loss=36.3]

cuda:0


 80%|███████▉  | 412/518 [06:28<01:39,  1.07it/s, loss=36.3]

cuda:0


 80%|███████▉  | 413/518 [06:29<01:45,  1.01s/it, loss=36.2]

cuda:0


 80%|███████▉  | 414/518 [06:30<01:38,  1.05it/s, loss=36.2]

cuda:0


 80%|████████  | 415/518 [06:31<01:43,  1.00s/it, loss=36.2]

cuda:0


 80%|████████  | 416/518 [06:32<01:36,  1.05it/s, loss=36.2]

cuda:0


 81%|████████  | 417/518 [06:33<01:43,  1.03s/it, loss=36.1]

cuda:0


 81%|████████  | 418/518 [06:34<01:36,  1.04it/s, loss=36.1]

cuda:0


 81%|████████  | 419/518 [06:35<01:30,  1.09it/s, loss=36.1]

cuda:0


 81%|████████  | 420/518 [06:35<01:25,  1.14it/s, loss=36.1]

cuda:0


 81%|████████▏ | 421/518 [06:36<01:27,  1.11it/s, loss=36]

cuda:0


 81%|████████▏ | 422/518 [06:37<01:24,  1.14it/s, loss=36]

cuda:0


 82%|████████▏ | 423/518 [06:38<01:23,  1.14it/s, loss=36]

cuda:0


 82%|████████▏ | 424/518 [06:39<01:19,  1.18it/s, loss=36]

cuda:0


 82%|████████▏ | 425/518 [06:40<01:25,  1.09it/s, loss=35.9]

cuda:0


 82%|████████▏ | 426/518 [06:41<01:20,  1.14it/s, loss=35.9]

cuda:0


 82%|████████▏ | 427/518 [06:42<01:33,  1.02s/it, loss=35.9]

cuda:0


 83%|████████▎ | 428/518 [06:43<01:26,  1.04it/s, loss=35.9]

cuda:0


 83%|████████▎ | 429/518 [06:44<01:32,  1.04s/it, loss=35.8]

cuda:0


 83%|████████▎ | 430/518 [06:45<01:26,  1.02it/s, loss=35.8]

cuda:0


 83%|████████▎ | 431/518 [06:46<01:24,  1.03it/s, loss=35.8]

cuda:0


 83%|████████▎ | 432/518 [06:47<01:17,  1.10it/s, loss=35.8]

cuda:0


 84%|████████▎ | 433/518 [06:48<01:21,  1.04it/s, loss=35.7]

cuda:0


 84%|████████▍ | 434/518 [06:49<01:17,  1.09it/s, loss=35.7]

cuda:0


 84%|████████▍ | 435/518 [06:49<01:16,  1.09it/s, loss=35.7]

cuda:0


 84%|████████▍ | 436/518 [06:50<01:11,  1.15it/s, loss=35.7]

cuda:0


 84%|████████▍ | 437/518 [06:51<01:16,  1.06it/s, loss=35.6]

cuda:0


 85%|████████▍ | 438/518 [06:52<01:11,  1.11it/s, loss=35.6]

cuda:0


 85%|████████▍ | 439/518 [06:53<01:12,  1.09it/s, loss=35.6]

cuda:0


 85%|████████▍ | 440/518 [06:54<01:08,  1.14it/s, loss=35.6]

cuda:0


 85%|████████▌ | 441/518 [06:55<01:17,  1.00s/it, loss=35.5]

cuda:0


 85%|████████▌ | 442/518 [06:56<01:11,  1.07it/s, loss=35.5]

cuda:0


 86%|████████▌ | 443/518 [06:57<01:11,  1.05it/s, loss=35.5]

cuda:0


 86%|████████▌ | 444/518 [06:58<01:06,  1.11it/s, loss=35.5]

cuda:0


 86%|████████▌ | 445/518 [06:59<01:06,  1.09it/s, loss=35.4]

cuda:0


 86%|████████▌ | 446/518 [06:59<01:03,  1.13it/s, loss=35.4]

cuda:0


 86%|████████▋ | 447/518 [07:01<01:11,  1.01s/it, loss=35.4]

cuda:0


 86%|████████▋ | 448/518 [07:02<01:06,  1.05it/s, loss=35.4]

cuda:0


 87%|████████▋ | 449/518 [07:03<01:07,  1.02it/s, loss=35.3]

cuda:0


 87%|████████▋ | 450/518 [07:04<01:04,  1.05it/s, loss=35.3]

cuda:0


 87%|████████▋ | 451/518 [07:05<01:04,  1.03it/s, loss=35.3]

cuda:0


 87%|████████▋ | 452/518 [07:05<01:00,  1.10it/s, loss=35.3]

cuda:0


 87%|████████▋ | 453/518 [07:06<01:02,  1.04it/s, loss=35.2]

cuda:0


 88%|████████▊ | 454/518 [07:07<00:57,  1.10it/s, loss=35.2]

cuda:0


 88%|████████▊ | 455/518 [07:08<01:01,  1.03it/s, loss=35.2]

cuda:0


 88%|████████▊ | 456/518 [07:09<00:56,  1.09it/s, loss=35.2]

cuda:0


 88%|████████▊ | 457/518 [07:10<00:56,  1.07it/s, loss=35.2]

cuda:0


 88%|████████▊ | 458/518 [07:11<00:53,  1.12it/s, loss=35.1]

cuda:0


 89%|████████▊ | 459/518 [07:12<00:53,  1.10it/s, loss=35.1]

cuda:0


 89%|████████▉ | 460/518 [07:13<00:51,  1.13it/s, loss=35.1]

cuda:0


 89%|████████▉ | 461/518 [07:14<00:56,  1.00it/s, loss=35.1]

cuda:0


 89%|████████▉ | 462/518 [07:15<00:52,  1.06it/s, loss=35]

cuda:0


 89%|████████▉ | 463/518 [07:16<00:54,  1.02it/s, loss=35]

cuda:0


 90%|████████▉ | 464/518 [07:17<00:50,  1.07it/s, loss=35]

cuda:0


 90%|████████▉ | 465/518 [07:18<00:49,  1.06it/s, loss=35]

cuda:0


 90%|████████▉ | 466/518 [07:18<00:47,  1.11it/s, loss=35]

cuda:0


 90%|█████████ | 467/518 [07:19<00:45,  1.12it/s, loss=34.9]

cuda:0


 90%|█████████ | 468/518 [07:20<00:42,  1.17it/s, loss=34.9]

cuda:0


 91%|█████████ | 469/518 [07:21<00:43,  1.12it/s, loss=34.9]

cuda:0


 91%|█████████ | 470/518 [07:22<00:41,  1.16it/s, loss=34.9]

cuda:0


 91%|█████████ | 471/518 [07:23<00:42,  1.11it/s, loss=34.8]

cuda:0


 91%|█████████ | 472/518 [07:24<00:40,  1.15it/s, loss=34.8]

cuda:0


 91%|█████████▏| 473/518 [07:25<00:40,  1.10it/s, loss=34.8]

cuda:0


 92%|█████████▏| 474/518 [07:25<00:39,  1.12it/s, loss=34.8]

cuda:0


 92%|█████████▏| 475/518 [07:27<00:41,  1.04it/s, loss=34.8]

cuda:0


 92%|█████████▏| 476/518 [07:27<00:37,  1.11it/s, loss=34.7]

cuda:0


 92%|█████████▏| 477/518 [07:28<00:37,  1.10it/s, loss=34.7]

cuda:0


 92%|█████████▏| 478/518 [07:29<00:34,  1.15it/s, loss=34.7]

cuda:0


 92%|█████████▏| 479/518 [07:30<00:35,  1.09it/s, loss=34.7]

cuda:0


 93%|█████████▎| 480/518 [07:31<00:33,  1.14it/s, loss=34.7]

cuda:0


 93%|█████████▎| 481/518 [07:32<00:33,  1.11it/s, loss=34.6]

cuda:0


 93%|█████████▎| 482/518 [07:33<00:31,  1.14it/s, loss=34.6]

cuda:0


 93%|█████████▎| 483/518 [07:34<00:31,  1.10it/s, loss=34.6]

cuda:0


 93%|█████████▎| 484/518 [07:34<00:29,  1.14it/s, loss=34.6]

cuda:0


 94%|█████████▎| 485/518 [07:36<00:32,  1.03it/s, loss=34.6]

cuda:0


 94%|█████████▍| 486/518 [07:36<00:30,  1.05it/s, loss=34.5]

cuda:0


 94%|█████████▍| 487/518 [07:37<00:29,  1.06it/s, loss=34.5]

cuda:0


 94%|█████████▍| 488/518 [07:38<00:26,  1.11it/s, loss=34.5]

cuda:0


 94%|█████████▍| 489/518 [07:39<00:27,  1.06it/s, loss=34.5]

cuda:0


 95%|█████████▍| 490/518 [07:40<00:25,  1.11it/s, loss=34.5]

cuda:0


 95%|█████████▍| 491/518 [07:41<00:25,  1.04it/s, loss=34.4]

cuda:0


 95%|█████████▍| 492/518 [07:42<00:23,  1.10it/s, loss=34.4]

cuda:0


 95%|█████████▌| 493/518 [07:43<00:23,  1.08it/s, loss=34.4]

cuda:0


 95%|█████████▌| 494/518 [07:44<00:21,  1.14it/s, loss=34.4]

cuda:0


 96%|█████████▌| 495/518 [07:45<00:23,  1.01s/it, loss=34.3]

cuda:0


 96%|█████████▌| 496/518 [07:46<00:20,  1.06it/s, loss=34.3]

cuda:0


 96%|█████████▌| 497/518 [07:47<00:20,  1.03it/s, loss=34.3]

cuda:0


 96%|█████████▌| 498/518 [07:48<00:18,  1.10it/s, loss=34.3]

cuda:0


 96%|█████████▋| 499/518 [07:49<00:18,  1.04it/s, loss=34.3]

cuda:0


 97%|█████████▋| 500/518 [07:49<00:16,  1.10it/s, loss=34.2]

cuda:0


 97%|█████████▋| 501/518 [07:50<00:16,  1.05it/s, loss=34.2]

cuda:0


 97%|█████████▋| 502/518 [07:51<00:14,  1.07it/s, loss=34.2]

cuda:0


 97%|█████████▋| 503/518 [07:52<00:13,  1.10it/s, loss=34.2]

cuda:0


 97%|█████████▋| 504/518 [07:53<00:12,  1.14it/s, loss=34.2]

cuda:0


 97%|█████████▋| 505/518 [07:54<00:11,  1.18it/s, loss=34.1]

cuda:0


 98%|█████████▊| 506/518 [07:55<00:09,  1.21it/s, loss=34.1]

cuda:0


 98%|█████████▊| 507/518 [07:56<00:09,  1.16it/s, loss=34.1]

cuda:0


 98%|█████████▊| 508/518 [07:56<00:08,  1.20it/s, loss=34.1]

cuda:0


 98%|█████████▊| 509/518 [07:57<00:07,  1.17it/s, loss=34.1]

cuda:0


 98%|█████████▊| 510/518 [07:58<00:06,  1.20it/s, loss=34]

cuda:0


 99%|█████████▊| 511/518 [07:59<00:06,  1.15it/s, loss=34]

cuda:0


 99%|█████████▉| 512/518 [08:00<00:05,  1.19it/s, loss=34]

cuda:0


 99%|█████████▉| 513/518 [08:01<00:04,  1.02it/s, loss=34]

cuda:0


 99%|█████████▉| 514/518 [08:02<00:03,  1.09it/s, loss=34]

cuda:0


 99%|█████████▉| 515/518 [08:03<00:02,  1.08it/s, loss=33.9]

cuda:0


100%|█████████▉| 516/518 [08:03<00:01,  1.15it/s, loss=33.9]

cuda:0


100%|█████████▉| 517/518 [08:04<00:00,  1.21it/s, loss=33.9]

cuda:0


100%|██████████| 518/518 [08:05<00:00,  1.07it/s, loss=33.9]
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /usr/local/src/pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /usr/local/src/pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


cuda:0


  0%|          | 1/518 [00:03<26:10,  3.04s/it, loss=22.7]

cuda:0


  0%|          | 2/518 [00:03<15:00,  1.75s/it, loss=22.8]

cuda:0


  1%|          | 3/518 [00:04<11:17,  1.32s/it, loss=23.1]

cuda:0


  1%|          | 4/518 [00:05<09:42,  1.13s/it, loss=23.1]

cuda:0


  1%|          | 5/518 [00:06<09:08,  1.07s/it, loss=22.8]

cuda:0


  1%|          | 6/518 [00:07<08:21,  1.02it/s, loss=22.8]

cuda:0


  1%|▏         | 7/518 [00:08<08:12,  1.04it/s, loss=22.8]

cuda:0


  2%|▏         | 8/518 [00:09<07:49,  1.09it/s, loss=23.1]

cuda:0


  2%|▏         | 9/518 [00:10<07:53,  1.08it/s, loss=23]

cuda:0


  2%|▏         | 10/518 [00:10<07:46,  1.09it/s, loss=23.4]

cuda:0


  2%|▏         | 11/518 [00:11<07:40,  1.10it/s, loss=23.5]

cuda:0


  2%|▏         | 12/518 [00:12<07:27,  1.13it/s, loss=23.5]

cuda:0


  3%|▎         | 13/518 [00:13<07:40,  1.10it/s, loss=23.6]

cuda:0


  3%|▎         | 14/518 [00:14<07:22,  1.14it/s, loss=23.5]

cuda:0


  3%|▎         | 15/518 [00:15<07:27,  1.12it/s, loss=23.5]

cuda:0


  3%|▎         | 16/518 [00:16<07:16,  1.15it/s, loss=23.5]

cuda:0


  3%|▎         | 17/518 [00:17<08:00,  1.04it/s, loss=23.5]

cuda:0


  3%|▎         | 18/518 [00:18<07:38,  1.09it/s, loss=23.5]

cuda:0


  4%|▎         | 19/518 [00:19<07:45,  1.07it/s, loss=23.5]

cuda:0


  4%|▍         | 20/518 [00:19<07:20,  1.13it/s, loss=23.5]

cuda:0


  4%|▍         | 21/518 [00:20<07:21,  1.13it/s, loss=23.5]

cuda:0


  4%|▍         | 22/518 [00:21<07:13,  1.14it/s, loss=23.5]

cuda:0


  4%|▍         | 23/518 [00:22<07:07,  1.16it/s, loss=23.4]

cuda:0


  5%|▍         | 24/518 [00:23<06:52,  1.20it/s, loss=23.4]

cuda:0


  5%|▍         | 25/518 [00:24<07:06,  1.16it/s, loss=23.5]

cuda:0


  5%|▌         | 26/518 [00:24<06:56,  1.18it/s, loss=23.5]

cuda:0


  5%|▌         | 27/518 [00:25<07:17,  1.12it/s, loss=23.4]

cuda:0


  5%|▌         | 28/518 [00:26<07:00,  1.16it/s, loss=23.4]

cuda:0


  6%|▌         | 29/518 [00:27<07:09,  1.14it/s, loss=23.4]

cuda:0


  6%|▌         | 30/518 [00:28<06:58,  1.17it/s, loss=23.4]

cuda:0


  6%|▌         | 31/518 [00:29<07:35,  1.07it/s, loss=23.4]

cuda:0


  6%|▌         | 32/518 [00:30<07:21,  1.10it/s, loss=23.5]

cuda:0


  6%|▋         | 33/518 [00:31<07:04,  1.14it/s, loss=23.5]

cuda:0


  7%|▋         | 34/518 [00:32<06:52,  1.17it/s, loss=23.5]

cuda:0


  7%|▋         | 35/518 [00:32<06:41,  1.20it/s, loss=23.5]

cuda:0


  7%|▋         | 36/518 [00:33<06:32,  1.23it/s, loss=23.4]

cuda:0


  7%|▋         | 37/518 [00:34<07:36,  1.05it/s, loss=23.5]

cuda:0


  7%|▋         | 38/518 [00:35<07:14,  1.10it/s, loss=23.5]

cuda:0


  8%|▊         | 39/518 [00:36<07:18,  1.09it/s, loss=23.4]

cuda:0


  8%|▊         | 40/518 [00:37<07:03,  1.13it/s, loss=23.5]

cuda:0


  8%|▊         | 41/518 [00:38<07:14,  1.10it/s, loss=23.5]

cuda:0


  8%|▊         | 42/518 [00:39<07:04,  1.12it/s, loss=23.4]

cuda:0


  8%|▊         | 43/518 [00:40<07:51,  1.01it/s, loss=23.4]

cuda:0


  8%|▊         | 44/518 [00:41<07:22,  1.07it/s, loss=23.4]

cuda:0


  9%|▊         | 45/518 [00:42<07:23,  1.07it/s, loss=23.4]

cuda:0


  9%|▉         | 46/518 [00:42<07:02,  1.12it/s, loss=23.4]

cuda:0


  9%|▉         | 47/518 [00:43<07:19,  1.07it/s, loss=23.4]

cuda:0


  9%|▉         | 48/518 [00:44<07:01,  1.12it/s, loss=23.4]

cuda:0


  9%|▉         | 49/518 [00:45<07:06,  1.10it/s, loss=23.4]

cuda:0


 10%|▉         | 50/518 [00:46<06:43,  1.16it/s, loss=23.4]

cuda:0


 10%|▉         | 51/518 [00:47<07:19,  1.06it/s, loss=23.4]

cuda:0


 10%|█         | 52/518 [00:48<07:10,  1.08it/s, loss=23.3]

cuda:0


 10%|█         | 53/518 [00:49<07:00,  1.11it/s, loss=23.3]

cuda:0


 10%|█         | 54/518 [00:50<06:52,  1.12it/s, loss=23.3]

cuda:0


 11%|█         | 55/518 [00:51<07:03,  1.09it/s, loss=23.3]

cuda:0


 11%|█         | 56/518 [00:52<06:50,  1.13it/s, loss=23.3]

cuda:0


 11%|█         | 57/518 [00:52<06:37,  1.16it/s, loss=23.3]

cuda:0


 11%|█         | 58/518 [00:53<06:30,  1.18it/s, loss=23.3]

cuda:0


 11%|█▏        | 59/518 [00:54<06:22,  1.20it/s, loss=23.3]

cuda:0


 12%|█▏        | 60/518 [00:55<06:15,  1.22it/s, loss=23.2]

cuda:0


 12%|█▏        | 61/518 [00:56<06:09,  1.24it/s, loss=23.2]

cuda:0


 12%|█▏        | 62/518 [00:56<06:09,  1.23it/s, loss=23.2]

cuda:0


 12%|█▏        | 63/518 [00:57<06:03,  1.25it/s, loss=23.2]

cuda:0


 12%|█▏        | 64/518 [00:58<06:00,  1.26it/s, loss=23.2]

cuda:0


 13%|█▎        | 65/518 [00:59<05:58,  1.26it/s, loss=23.2]

cuda:0


 13%|█▎        | 66/518 [01:00<06:14,  1.21it/s, loss=23.2]

cuda:0


 13%|█▎        | 67/518 [01:00<06:17,  1.19it/s, loss=23.2]

cuda:0


 13%|█▎        | 68/518 [01:01<06:14,  1.20it/s, loss=23.2]

cuda:0


 13%|█▎        | 69/518 [01:02<06:27,  1.16it/s, loss=23.2]

cuda:0


 14%|█▎        | 70/518 [01:03<06:21,  1.17it/s, loss=23.2]

cuda:0


 14%|█▎        | 71/518 [01:04<06:13,  1.20it/s, loss=23.2]

cuda:0


 14%|█▍        | 72/518 [01:05<06:03,  1.23it/s, loss=23.2]

cuda:0


 14%|█▍        | 73/518 [01:05<06:07,  1.21it/s, loss=23.2]

cuda:0


 14%|█▍        | 74/518 [01:06<06:18,  1.17it/s, loss=23.2]

cuda:0


 14%|█▍        | 75/518 [01:08<06:59,  1.06it/s, loss=23.2]

cuda:0


 15%|█▍        | 76/518 [01:08<06:35,  1.12it/s, loss=23.1]

cuda:0


 15%|█▍        | 77/518 [01:09<06:44,  1.09it/s, loss=23.1]

cuda:0


 15%|█▌        | 78/518 [01:10<06:32,  1.12it/s, loss=23.1]

cuda:0


 15%|█▌        | 79/518 [01:11<06:24,  1.14it/s, loss=23.1]

cuda:0


 15%|█▌        | 80/518 [01:12<06:16,  1.16it/s, loss=23.1]

cuda:0


 16%|█▌        | 81/518 [01:13<06:22,  1.14it/s, loss=23.1]

cuda:0


 16%|█▌        | 82/518 [01:13<06:11,  1.18it/s, loss=23.1]

cuda:0


 16%|█▌        | 83/518 [01:14<06:19,  1.15it/s, loss=23.1]

cuda:0


 16%|█▌        | 84/518 [01:15<06:06,  1.18it/s, loss=23.1]

cuda:0


 16%|█▋        | 85/518 [01:16<06:22,  1.13it/s, loss=23.1]

cuda:0


 17%|█▋        | 86/518 [01:17<06:16,  1.15it/s, loss=23.1]

cuda:0


 17%|█▋        | 87/518 [01:18<06:17,  1.14it/s, loss=23.1]

cuda:0


 17%|█▋        | 88/518 [01:19<06:02,  1.19it/s, loss=23]

cuda:0


 17%|█▋        | 89/518 [01:20<06:26,  1.11it/s, loss=23.1]

cuda:0


 17%|█▋        | 90/518 [01:20<06:12,  1.15it/s, loss=23.1]

cuda:0


 18%|█▊        | 91/518 [01:21<06:14,  1.14it/s, loss=23.1]

cuda:0


 18%|█▊        | 92/518 [01:22<06:08,  1.16it/s, loss=23.1]

cuda:0


 18%|█▊        | 93/518 [01:23<06:26,  1.10it/s, loss=23.1]

cuda:0


 18%|█▊        | 94/518 [01:24<06:23,  1.11it/s, loss=23.1]

cuda:0


 18%|█▊        | 95/518 [01:25<06:10,  1.14it/s, loss=23.1]

cuda:0


 19%|█▊        | 96/518 [01:26<06:02,  1.16it/s, loss=23.1]

cuda:0


 19%|█▊        | 97/518 [01:26<05:49,  1.20it/s, loss=23]

cuda:0


 19%|█▉        | 98/518 [01:27<05:49,  1.20it/s, loss=23]

cuda:0


 19%|█▉        | 99/518 [01:28<05:46,  1.21it/s, loss=23]

cuda:0


 19%|█▉        | 100/518 [01:29<05:39,  1.23it/s, loss=23]

cuda:0


 19%|█▉        | 101/518 [01:30<05:48,  1.20it/s, loss=23]

cuda:0


 20%|█▉        | 102/518 [01:31<05:42,  1.21it/s, loss=23]

cuda:0


 20%|█▉        | 103/518 [01:31<05:42,  1.21it/s, loss=23]

cuda:0


 20%|██        | 104/518 [01:32<05:36,  1.23it/s, loss=23]

cuda:0


 20%|██        | 105/518 [01:33<05:41,  1.21it/s, loss=23]

cuda:0


 20%|██        | 106/518 [01:34<05:41,  1.21it/s, loss=23]

cuda:0


 21%|██        | 107/518 [01:35<05:35,  1.23it/s, loss=23]

cuda:0


 21%|██        | 108/518 [01:35<05:31,  1.24it/s, loss=23]

cuda:0


 21%|██        | 109/518 [01:36<05:50,  1.17it/s, loss=23]

cuda:0


 21%|██        | 110/518 [01:37<05:37,  1.21it/s, loss=22.9]

cuda:0


 21%|██▏       | 111/518 [01:38<05:50,  1.16it/s, loss=22.9]

cuda:0


 22%|██▏       | 112/518 [01:39<05:42,  1.19it/s, loss=22.9]

cuda:0


 22%|██▏       | 113/518 [01:40<06:27,  1.05it/s, loss=22.9]

cuda:0


 22%|██▏       | 114/518 [01:41<06:08,  1.10it/s, loss=22.9]

cuda:0


 22%|██▏       | 115/518 [01:42<06:29,  1.03it/s, loss=22.9]

cuda:0


 22%|██▏       | 116/518 [01:43<06:06,  1.10it/s, loss=22.9]

cuda:0


 23%|██▎       | 117/518 [01:44<06:16,  1.06it/s, loss=22.9]

cuda:0


 23%|██▎       | 118/518 [01:45<06:02,  1.10it/s, loss=22.9]

cuda:0


 23%|██▎       | 119/518 [01:46<06:10,  1.08it/s, loss=22.9]

cuda:0


 23%|██▎       | 120/518 [01:47<05:56,  1.12it/s, loss=22.9]

cuda:0


 23%|██▎       | 121/518 [01:48<06:11,  1.07it/s, loss=22.9]

cuda:0


 24%|██▎       | 122/518 [01:48<05:55,  1.11it/s, loss=22.9]

cuda:0


 24%|██▎       | 123/518 [01:50<06:35,  1.00s/it, loss=22.9]

cuda:0


 24%|██▍       | 124/518 [01:50<06:11,  1.06it/s, loss=22.9]

cuda:0


 24%|██▍       | 125/518 [01:51<06:09,  1.06it/s, loss=22.9]

cuda:0


 24%|██▍       | 126/518 [01:52<05:57,  1.10it/s, loss=22.9]

cuda:0


 25%|██▍       | 127/518 [01:53<05:51,  1.11it/s, loss=22.8]

cuda:0


 25%|██▍       | 128/518 [01:54<05:35,  1.16it/s, loss=22.8]

cuda:0


 25%|██▍       | 129/518 [01:55<05:42,  1.14it/s, loss=22.8]

cuda:0


 25%|██▌       | 130/518 [01:56<05:36,  1.15it/s, loss=22.8]

cuda:0


 25%|██▌       | 131/518 [01:57<05:55,  1.09it/s, loss=22.8]

cuda:0


 25%|██▌       | 132/518 [01:57<05:37,  1.14it/s, loss=22.8]

cuda:0


 26%|██▌       | 133/518 [01:58<05:43,  1.12it/s, loss=22.8]

cuda:0


 26%|██▌       | 134/518 [01:59<05:27,  1.17it/s, loss=22.8]

cuda:0


 26%|██▌       | 135/518 [02:00<05:33,  1.15it/s, loss=22.8]

cuda:0


 26%|██▋       | 136/518 [02:01<05:20,  1.19it/s, loss=22.8]

cuda:0


 26%|██▋       | 137/518 [02:02<05:27,  1.16it/s, loss=22.8]

cuda:0


 27%|██▋       | 138/518 [02:02<05:20,  1.18it/s, loss=22.8]

cuda:0


 27%|██▋       | 139/518 [02:03<05:15,  1.20it/s, loss=22.8]

cuda:0


 27%|██▋       | 140/518 [02:04<05:11,  1.21it/s, loss=22.8]

cuda:0


 27%|██▋       | 141/518 [02:05<05:13,  1.20it/s, loss=22.7]

cuda:0


 27%|██▋       | 142/518 [02:06<05:12,  1.20it/s, loss=22.7]

cuda:0


 28%|██▊       | 143/518 [02:07<05:42,  1.10it/s, loss=22.7]

cuda:0


 28%|██▊       | 144/518 [02:08<05:31,  1.13it/s, loss=22.7]

cuda:0


 28%|██▊       | 145/518 [02:09<05:26,  1.14it/s, loss=22.7]

cuda:0


 28%|██▊       | 146/518 [02:09<05:24,  1.15it/s, loss=22.7]

cuda:0


 28%|██▊       | 147/518 [02:10<05:24,  1.14it/s, loss=22.7]

cuda:0


 29%|██▊       | 148/518 [02:11<05:14,  1.18it/s, loss=22.7]

cuda:0


 29%|██▉       | 149/518 [02:12<05:15,  1.17it/s, loss=22.7]

cuda:0


 29%|██▉       | 150/518 [02:13<05:12,  1.18it/s, loss=22.7]

cuda:0


 29%|██▉       | 151/518 [02:14<05:51,  1.04it/s, loss=22.7]

cuda:0


 29%|██▉       | 152/518 [02:15<05:36,  1.09it/s, loss=22.7]

cuda:0


 30%|██▉       | 153/518 [02:16<05:26,  1.12it/s, loss=22.7]

cuda:0


 30%|██▉       | 154/518 [02:16<05:20,  1.14it/s, loss=22.7]

cuda:0


 30%|██▉       | 155/518 [02:17<05:13,  1.16it/s, loss=22.6]

cuda:0


 30%|███       | 156/518 [02:18<05:07,  1.18it/s, loss=22.6]

cuda:0


 30%|███       | 157/518 [02:19<05:27,  1.10it/s, loss=22.6]

cuda:0


 31%|███       | 158/518 [02:20<05:12,  1.15it/s, loss=22.6]

cuda:0


 31%|███       | 159/518 [02:21<05:29,  1.09it/s, loss=22.6]

cuda:0


 31%|███       | 160/518 [02:22<05:21,  1.11it/s, loss=22.6]

cuda:0


 31%|███       | 161/518 [02:23<05:40,  1.05it/s, loss=22.6]

cuda:0


 31%|███▏      | 162/518 [02:24<05:19,  1.11it/s, loss=22.6]

cuda:0


 31%|███▏      | 163/518 [02:25<05:26,  1.09it/s, loss=22.6]

cuda:0


 32%|███▏      | 164/518 [02:25<05:12,  1.13it/s, loss=22.6]

cuda:0


 32%|███▏      | 165/518 [02:26<05:07,  1.15it/s, loss=22.6]

cuda:0


 32%|███▏      | 166/518 [02:27<05:02,  1.16it/s, loss=22.6]

cuda:0


 32%|███▏      | 167/518 [02:28<05:04,  1.15it/s, loss=22.6]

cuda:0


 32%|███▏      | 168/518 [02:29<05:06,  1.14it/s, loss=22.6]

cuda:0


 33%|███▎      | 169/518 [02:30<04:56,  1.18it/s, loss=22.6]

cuda:0


 33%|███▎      | 170/518 [02:31<04:52,  1.19it/s, loss=22.6]

cuda:0


 33%|███▎      | 171/518 [02:31<04:49,  1.20it/s, loss=22.6]

cuda:0


 33%|███▎      | 172/518 [02:32<04:55,  1.17it/s, loss=22.6]

cuda:0


 33%|███▎      | 173/518 [02:33<04:45,  1.21it/s, loss=22.6]

cuda:0


 34%|███▎      | 174/518 [02:34<04:42,  1.22it/s, loss=22.6]

cuda:0


 34%|███▍      | 175/518 [02:35<04:37,  1.24it/s, loss=22.6]

cuda:0


 34%|███▍      | 176/518 [02:35<04:38,  1.23it/s, loss=22.6]

cuda:0


 34%|███▍      | 177/518 [02:36<04:34,  1.24it/s, loss=22.6]

cuda:0


 34%|███▍      | 178/518 [02:37<04:33,  1.24it/s, loss=22.6]

cuda:0


 35%|███▍      | 179/518 [02:38<04:33,  1.24it/s, loss=22.6]

cuda:0


 35%|███▍      | 180/518 [02:39<04:33,  1.24it/s, loss=22.5]

cuda:0


 35%|███▍      | 181/518 [02:40<04:58,  1.13it/s, loss=22.5]

cuda:0


 35%|███▌      | 182/518 [02:40<04:48,  1.17it/s, loss=22.5]

cuda:0


 35%|███▌      | 183/518 [02:41<04:39,  1.20it/s, loss=22.5]

cuda:0


 36%|███▌      | 184/518 [02:42<04:39,  1.19it/s, loss=22.5]

cuda:0


 36%|███▌      | 185/518 [02:43<04:33,  1.22it/s, loss=22.5]

cuda:0


 36%|███▌      | 186/518 [02:44<04:26,  1.24it/s, loss=22.5]

cuda:0


 36%|███▌      | 187/518 [02:45<04:34,  1.20it/s, loss=22.5]

cuda:0


 36%|███▋      | 188/518 [02:45<04:43,  1.16it/s, loss=22.5]

cuda:0


 36%|███▋      | 189/518 [02:47<05:31,  1.01s/it, loss=22.5]

cuda:0


 37%|███▋      | 190/518 [02:48<05:07,  1.07it/s, loss=22.5]

cuda:0


 37%|███▋      | 191/518 [02:49<05:02,  1.08it/s, loss=22.5]

cuda:0


 37%|███▋      | 192/518 [02:49<05:00,  1.08it/s, loss=22.5]

cuda:0


 37%|███▋      | 193/518 [02:50<04:53,  1.11it/s, loss=22.5]

cuda:0


 37%|███▋      | 194/518 [02:51<04:44,  1.14it/s, loss=22.5]

cuda:0


 38%|███▊      | 195/518 [02:52<04:56,  1.09it/s, loss=22.5]

cuda:0


 38%|███▊      | 196/518 [02:53<04:51,  1.10it/s, loss=22.5]

cuda:0


 38%|███▊      | 197/518 [02:54<04:58,  1.08it/s, loss=22.5]

cuda:0


 38%|███▊      | 198/518 [02:55<04:42,  1.13it/s, loss=22.5]

cuda:0


 38%|███▊      | 199/518 [02:56<04:57,  1.07it/s, loss=22.5]

cuda:0


 39%|███▊      | 200/518 [02:57<04:48,  1.10it/s, loss=22.5]

cuda:0


 39%|███▉      | 201/518 [02:57<04:42,  1.12it/s, loss=22.4]

cuda:0


 39%|███▉      | 202/518 [02:58<04:37,  1.14it/s, loss=22.4]

cuda:0


 39%|███▉      | 203/518 [02:59<04:43,  1.11it/s, loss=22.4]

cuda:0


 39%|███▉      | 204/518 [03:00<04:29,  1.17it/s, loss=22.4]

cuda:0


 40%|███▉      | 205/518 [03:01<04:57,  1.05it/s, loss=22.4]

cuda:0


 40%|███▉      | 206/518 [03:02<04:39,  1.12it/s, loss=22.4]

cuda:0


 40%|███▉      | 207/518 [03:03<04:41,  1.11it/s, loss=22.4]

cuda:0


 40%|████      | 208/518 [03:04<04:27,  1.16it/s, loss=22.4]

cuda:0


 40%|████      | 209/518 [03:05<04:31,  1.14it/s, loss=22.4]

cuda:0


 41%|████      | 210/518 [03:05<04:27,  1.15it/s, loss=22.4]

cuda:0


 41%|████      | 211/518 [03:06<04:22,  1.17it/s, loss=22.4]

cuda:0


 41%|████      | 212/518 [03:07<04:23,  1.16it/s, loss=22.4]

cuda:0


 41%|████      | 213/518 [03:08<04:14,  1.20it/s, loss=22.4]

cuda:0


 41%|████▏     | 214/518 [03:09<04:14,  1.19it/s, loss=22.4]

cuda:0


 42%|████▏     | 215/518 [03:10<04:11,  1.21it/s, loss=22.4]

cuda:0


 42%|████▏     | 216/518 [03:10<04:09,  1.21it/s, loss=22.3]

cuda:0


 42%|████▏     | 217/518 [03:11<04:06,  1.22it/s, loss=22.4]

cuda:0


 42%|████▏     | 218/518 [03:12<04:17,  1.17it/s, loss=22.4]

cuda:0


 42%|████▏     | 219/518 [03:13<04:09,  1.20it/s, loss=22.3]

cuda:0


 42%|████▏     | 220/518 [03:14<04:09,  1.19it/s, loss=22.3]

cuda:0


 43%|████▎     | 221/518 [03:15<04:02,  1.22it/s, loss=22.3]

cuda:0


 43%|████▎     | 222/518 [03:15<04:00,  1.23it/s, loss=22.3]

cuda:0


 43%|████▎     | 223/518 [03:16<04:13,  1.17it/s, loss=22.3]

cuda:0


 43%|████▎     | 224/518 [03:17<04:09,  1.18it/s, loss=22.3]

cuda:0


 43%|████▎     | 225/518 [03:18<04:43,  1.03it/s, loss=22.3]

cuda:0


 44%|████▎     | 226/518 [03:19<04:31,  1.07it/s, loss=22.3]

cuda:0


 44%|████▍     | 227/518 [03:20<04:34,  1.06it/s, loss=22.3]

cuda:0


 44%|████▍     | 228/518 [03:21<04:24,  1.10it/s, loss=22.3]

cuda:0


 44%|████▍     | 229/518 [03:22<04:13,  1.14it/s, loss=22.3]

cuda:0


 44%|████▍     | 230/518 [03:23<04:10,  1.15it/s, loss=22.3]

cuda:0


 45%|████▍     | 231/518 [03:24<04:07,  1.16it/s, loss=22.3]

cuda:0


 45%|████▍     | 232/518 [03:24<04:05,  1.16it/s, loss=22.3]

cuda:0


 45%|████▍     | 233/518 [03:25<04:02,  1.18it/s, loss=22.2]

cuda:0


 45%|████▌     | 234/518 [03:26<03:59,  1.19it/s, loss=22.2]

cuda:0


 45%|████▌     | 235/518 [03:27<04:14,  1.11it/s, loss=22.2]

cuda:0


 46%|████▌     | 236/518 [03:28<04:06,  1.15it/s, loss=22.2]

cuda:0


 46%|████▌     | 237/518 [03:29<04:14,  1.10it/s, loss=22.2]

cuda:0


 46%|████▌     | 238/518 [03:30<04:06,  1.14it/s, loss=22.2]

cuda:0


 46%|████▌     | 239/518 [03:30<03:57,  1.17it/s, loss=22.2]

cuda:0


 46%|████▋     | 240/518 [03:31<03:51,  1.20it/s, loss=22.2]

cuda:0


 47%|████▋     | 241/518 [03:32<03:54,  1.18it/s, loss=22.2]

cuda:0


 47%|████▋     | 242/518 [03:33<03:51,  1.19it/s, loss=22.2]

cuda:0


 47%|████▋     | 243/518 [03:34<04:00,  1.14it/s, loss=22.2]

cuda:0


 47%|████▋     | 244/518 [03:35<03:57,  1.15it/s, loss=22.2]

cuda:0


 47%|████▋     | 245/518 [03:36<04:11,  1.09it/s, loss=22.2]

cuda:0


 47%|████▋     | 246/518 [03:37<04:01,  1.12it/s, loss=22.2]

cuda:0


 48%|████▊     | 247/518 [03:37<03:53,  1.16it/s, loss=22.2]

cuda:0


 48%|████▊     | 248/518 [03:38<03:51,  1.16it/s, loss=22.2]

cuda:0


 48%|████▊     | 249/518 [03:39<03:45,  1.19it/s, loss=22.2]

cuda:0


 48%|████▊     | 250/518 [03:40<03:46,  1.18it/s, loss=22.1]

cuda:0


 48%|████▊     | 251/518 [03:41<03:42,  1.20it/s, loss=22.1]

cuda:0


 49%|████▊     | 252/518 [03:42<03:37,  1.23it/s, loss=22.1]

cuda:0


 49%|████▉     | 253/518 [03:42<03:32,  1.25it/s, loss=22.1]

cuda:0


 49%|████▉     | 254/518 [03:43<03:36,  1.22it/s, loss=22.1]

cuda:0


 49%|████▉     | 255/518 [03:44<03:33,  1.23it/s, loss=22.1]

cuda:0


 49%|████▉     | 256/518 [03:45<03:33,  1.23it/s, loss=22.1]

cuda:0


 50%|████▉     | 257/518 [03:46<03:34,  1.22it/s, loss=22.1]

cuda:0


 50%|████▉     | 258/518 [03:46<03:34,  1.21it/s, loss=22.1]

cuda:0


 50%|█████     | 259/518 [03:47<03:50,  1.12it/s, loss=22.1]

cuda:0


 50%|█████     | 260/518 [03:48<03:46,  1.14it/s, loss=22.1]

cuda:0


 50%|█████     | 261/518 [03:49<03:48,  1.12it/s, loss=22.1]

cuda:0


 51%|█████     | 262/518 [03:50<03:48,  1.12it/s, loss=22.1]

cuda:0


 51%|█████     | 263/518 [03:51<04:00,  1.06it/s, loss=22.1]

cuda:0


 51%|█████     | 264/518 [03:52<03:46,  1.12it/s, loss=22.1]

cuda:0


 51%|█████     | 265/518 [03:53<03:40,  1.15it/s, loss=22.1]

cuda:0


 51%|█████▏    | 266/518 [03:54<03:40,  1.14it/s, loss=22.1]

cuda:0


 52%|█████▏    | 267/518 [03:54<03:34,  1.17it/s, loss=22.1]

cuda:0


 52%|█████▏    | 268/518 [03:55<03:30,  1.19it/s, loss=22]

cuda:0


 52%|█████▏    | 269/518 [03:56<03:46,  1.10it/s, loss=22]

cuda:0


 52%|█████▏    | 270/518 [03:57<03:40,  1.12it/s, loss=22]

cuda:0


 52%|█████▏    | 271/518 [03:58<04:01,  1.02it/s, loss=22]

cuda:0


 53%|█████▎    | 272/518 [03:59<03:46,  1.09it/s, loss=22]

cuda:0


 53%|█████▎    | 273/518 [04:00<03:59,  1.02it/s, loss=22]

cuda:0


 53%|█████▎    | 274/518 [04:01<03:50,  1.06it/s, loss=22]

cuda:0


 53%|█████▎    | 275/518 [04:02<03:39,  1.11it/s, loss=22]

cuda:0


 53%|█████▎    | 276/518 [04:03<03:34,  1.13it/s, loss=22]

cuda:0


 53%|█████▎    | 277/518 [04:04<03:24,  1.18it/s, loss=22]

cuda:0


 54%|█████▎    | 278/518 [04:04<03:23,  1.18it/s, loss=22]

cuda:0


 54%|█████▍    | 279/518 [04:05<03:18,  1.20it/s, loss=22]

cuda:0


 54%|█████▍    | 280/518 [04:06<03:14,  1.22it/s, loss=22]

cuda:0


 54%|█████▍    | 281/518 [04:07<03:21,  1.17it/s, loss=22]

cuda:0


 54%|█████▍    | 282/518 [04:08<03:16,  1.20it/s, loss=22]

cuda:0


 55%|█████▍    | 283/518 [04:09<03:34,  1.10it/s, loss=22]

cuda:0


 55%|█████▍    | 284/518 [04:10<03:28,  1.12it/s, loss=22]

cuda:0


 55%|█████▌    | 285/518 [04:10<03:20,  1.16it/s, loss=21.9]

cuda:0


 55%|█████▌    | 286/518 [04:11<03:17,  1.17it/s, loss=21.9]

cuda:0


 55%|█████▌    | 287/518 [04:12<03:13,  1.19it/s, loss=21.9]

cuda:0


 56%|█████▌    | 288/518 [04:13<03:12,  1.19it/s, loss=21.9]

cuda:0


 56%|█████▌    | 289/518 [04:14<03:07,  1.22it/s, loss=21.9]

cuda:0


 56%|█████▌    | 290/518 [04:15<03:08,  1.21it/s, loss=21.9]

cuda:0


 56%|█████▌    | 291/518 [04:15<03:04,  1.23it/s, loss=21.9]

cuda:0


 56%|█████▋    | 292/518 [04:16<03:00,  1.25it/s, loss=21.9]

cuda:0


 57%|█████▋    | 293/518 [04:17<03:13,  1.16it/s, loss=21.9]

cuda:0


 57%|█████▋    | 294/518 [04:18<03:21,  1.11it/s, loss=21.9]

cuda:0


 57%|█████▋    | 295/518 [04:19<03:15,  1.14it/s, loss=21.9]

cuda:0


 57%|█████▋    | 296/518 [04:20<03:08,  1.18it/s, loss=21.9]

cuda:0


 57%|█████▋    | 297/518 [04:20<03:05,  1.19it/s, loss=21.9]

cuda:0


 58%|█████▊    | 298/518 [04:21<03:04,  1.19it/s, loss=21.9]

cuda:0


 58%|█████▊    | 299/518 [04:22<03:15,  1.12it/s, loss=21.9]

cuda:0


 58%|█████▊    | 300/518 [04:23<03:10,  1.15it/s, loss=21.9]

cuda:0


 58%|█████▊    | 301/518 [04:24<03:25,  1.05it/s, loss=21.9]

cuda:0


 58%|█████▊    | 302/518 [04:25<03:18,  1.09it/s, loss=21.9]

cuda:0


 58%|█████▊    | 303/518 [04:26<03:16,  1.10it/s, loss=21.9]

cuda:0


 59%|█████▊    | 304/518 [04:27<03:10,  1.13it/s, loss=21.9]

cuda:0


 59%|█████▉    | 305/518 [04:28<03:15,  1.09it/s, loss=21.9]

cuda:0


 59%|█████▉    | 306/518 [04:29<03:07,  1.13it/s, loss=21.9]

cuda:0


 59%|█████▉    | 307/518 [04:30<03:12,  1.10it/s, loss=21.9]

cuda:0


 59%|█████▉    | 308/518 [04:30<03:04,  1.14it/s, loss=21.9]

cuda:0


 60%|█████▉    | 309/518 [04:32<03:16,  1.07it/s, loss=21.8]

cuda:0


 60%|█████▉    | 310/518 [04:32<03:08,  1.10it/s, loss=21.8]

cuda:0


 60%|██████    | 311/518 [04:33<03:21,  1.03it/s, loss=21.8]

cuda:0


 60%|██████    | 312/518 [04:34<03:11,  1.07it/s, loss=21.8]

cuda:0


 60%|██████    | 313/518 [04:35<03:11,  1.07it/s, loss=21.8]

cuda:0


 61%|██████    | 314/518 [04:36<02:59,  1.13it/s, loss=21.8]

cuda:0


 61%|██████    | 315/518 [04:37<03:00,  1.12it/s, loss=21.8]

cuda:0


 61%|██████    | 316/518 [04:38<03:01,  1.11it/s, loss=21.8]

cuda:0


 61%|██████    | 317/518 [04:39<02:53,  1.16it/s, loss=21.8]

cuda:0


 61%|██████▏   | 318/518 [04:39<02:52,  1.16it/s, loss=21.8]

cuda:0


 62%|██████▏   | 319/518 [04:40<02:57,  1.12it/s, loss=21.8]

cuda:0


 62%|██████▏   | 320/518 [04:41<02:48,  1.17it/s, loss=21.8]

cuda:0


 62%|██████▏   | 321/518 [04:42<02:59,  1.10it/s, loss=21.8]

cuda:0


 62%|██████▏   | 322/518 [04:43<02:52,  1.13it/s, loss=21.8]

cuda:0


 62%|██████▏   | 323/518 [04:44<02:53,  1.12it/s, loss=21.8]

cuda:0


 63%|██████▎   | 324/518 [04:45<02:46,  1.17it/s, loss=21.8]

cuda:0


 63%|██████▎   | 325/518 [04:46<02:42,  1.19it/s, loss=21.8]

cuda:0


 63%|██████▎   | 326/518 [04:46<02:37,  1.22it/s, loss=21.8]

cuda:0


 63%|██████▎   | 327/518 [04:47<02:37,  1.21it/s, loss=21.8]

cuda:0


 63%|██████▎   | 328/518 [04:48<02:35,  1.22it/s, loss=21.8]

cuda:0


 64%|██████▎   | 329/518 [04:49<02:33,  1.23it/s, loss=21.8]

cuda:0


 64%|██████▎   | 330/518 [04:50<02:29,  1.26it/s, loss=21.7]

cuda:0


 64%|██████▍   | 331/518 [04:51<02:40,  1.17it/s, loss=21.7]

cuda:0


 64%|██████▍   | 332/518 [04:51<02:40,  1.16it/s, loss=21.7]

cuda:0


 64%|██████▍   | 333/518 [04:52<02:51,  1.08it/s, loss=21.7]

cuda:0


 64%|██████▍   | 334/518 [04:53<02:48,  1.09it/s, loss=21.7]

cuda:0


 65%|██████▍   | 335/518 [04:55<03:05,  1.01s/it, loss=21.7]

cuda:0


 65%|██████▍   | 336/518 [04:55<02:57,  1.03it/s, loss=21.7]

cuda:0


 65%|██████▌   | 337/518 [04:56<02:46,  1.09it/s, loss=21.7]

cuda:0


 65%|██████▌   | 338/518 [04:57<02:40,  1.12it/s, loss=21.7]

cuda:0


 65%|██████▌   | 339/518 [04:58<02:33,  1.17it/s, loss=21.7]

cuda:0


 66%|██████▌   | 340/518 [04:59<02:27,  1.21it/s, loss=21.7]

cuda:0


 66%|██████▌   | 341/518 [04:59<02:23,  1.23it/s, loss=21.7]

cuda:0


 66%|██████▌   | 342/518 [05:00<02:21,  1.24it/s, loss=21.7]

cuda:0


 66%|██████▌   | 343/518 [05:01<02:29,  1.17it/s, loss=21.7]

cuda:0


 66%|██████▋   | 344/518 [05:02<02:30,  1.16it/s, loss=21.7]

cuda:0


 67%|██████▋   | 345/518 [05:03<02:30,  1.15it/s, loss=21.7]

cuda:0


 67%|██████▋   | 346/518 [05:04<02:28,  1.16it/s, loss=21.7]

cuda:0


 67%|██████▋   | 347/518 [05:05<02:33,  1.12it/s, loss=21.7]

cuda:0


 67%|██████▋   | 348/518 [05:06<02:28,  1.15it/s, loss=21.7]

cuda:0


 67%|██████▋   | 349/518 [05:07<02:36,  1.08it/s, loss=21.7]

cuda:0


 68%|██████▊   | 350/518 [05:07<02:27,  1.14it/s, loss=21.7]

cuda:0


 68%|██████▊   | 351/518 [05:08<02:26,  1.14it/s, loss=21.7]

cuda:0


 68%|██████▊   | 352/518 [05:09<02:25,  1.14it/s, loss=21.7]

cuda:0


 68%|██████▊   | 353/518 [05:10<02:21,  1.17it/s, loss=21.6]

cuda:0


 68%|██████▊   | 354/518 [05:11<02:17,  1.19it/s, loss=21.6]

cuda:0


 69%|██████▊   | 355/518 [05:12<02:16,  1.19it/s, loss=21.6]

cuda:0


 69%|██████▊   | 356/518 [05:12<02:17,  1.18it/s, loss=21.6]

cuda:0


 69%|██████▉   | 357/518 [05:14<02:27,  1.10it/s, loss=21.6]

cuda:0


 69%|██████▉   | 358/518 [05:14<02:18,  1.15it/s, loss=21.6]

cuda:0


 69%|██████▉   | 359/518 [05:15<02:20,  1.13it/s, loss=21.6]

cuda:0


 69%|██████▉   | 360/518 [05:16<02:16,  1.16it/s, loss=21.6]

cuda:0


 70%|██████▉   | 361/518 [05:17<02:16,  1.15it/s, loss=21.6]

cuda:0


 70%|██████▉   | 362/518 [05:18<02:12,  1.18it/s, loss=21.6]

cuda:0


 70%|███████   | 363/518 [05:19<02:12,  1.17it/s, loss=21.6]

cuda:0


 70%|███████   | 364/518 [05:19<02:09,  1.19it/s, loss=21.6]

cuda:0


 70%|███████   | 365/518 [05:20<02:07,  1.20it/s, loss=21.6]

cuda:0


 71%|███████   | 366/518 [05:21<02:04,  1.22it/s, loss=21.6]

cuda:0


 71%|███████   | 367/518 [05:22<02:02,  1.24it/s, loss=21.6]

cuda:0


 71%|███████   | 368/518 [05:23<02:04,  1.21it/s, loss=21.6]

cuda:0


 71%|███████   | 369/518 [05:24<02:04,  1.19it/s, loss=21.6]

cuda:0


 71%|███████▏  | 370/518 [05:24<02:06,  1.17it/s, loss=21.6]

cuda:0


 72%|███████▏  | 371/518 [05:25<02:11,  1.12it/s, loss=21.6]

cuda:0


 72%|███████▏  | 372/518 [05:26<02:14,  1.08it/s, loss=21.6]

cuda:0


 72%|███████▏  | 373/518 [05:27<02:18,  1.04it/s, loss=21.6]

cuda:0


 72%|███████▏  | 374/518 [05:28<02:12,  1.08it/s, loss=21.6]

cuda:0


 72%|███████▏  | 375/518 [05:29<02:06,  1.13it/s, loss=21.5]

cuda:0


 73%|███████▎  | 376/518 [05:30<02:00,  1.18it/s, loss=21.5]

cuda:0


 73%|███████▎  | 377/518 [05:31<02:06,  1.11it/s, loss=21.5]

cuda:0


 73%|███████▎  | 378/518 [05:32<02:03,  1.14it/s, loss=21.5]

cuda:0


 73%|███████▎  | 379/518 [05:32<01:58,  1.17it/s, loss=21.5]

cuda:0


 73%|███████▎  | 380/518 [05:33<01:55,  1.19it/s, loss=21.5]

cuda:0


 74%|███████▎  | 381/518 [05:34<01:52,  1.22it/s, loss=21.5]

cuda:0


 74%|███████▎  | 382/518 [05:35<01:53,  1.20it/s, loss=21.5]

cuda:0


 74%|███████▍  | 383/518 [05:36<01:57,  1.14it/s, loss=21.5]

cuda:0


 74%|███████▍  | 384/518 [05:37<01:55,  1.16it/s, loss=21.5]

cuda:0


 74%|███████▍  | 385/518 [05:38<02:03,  1.08it/s, loss=21.5]

cuda:0


 75%|███████▍  | 386/518 [05:39<02:00,  1.09it/s, loss=21.5]

cuda:0


 75%|███████▍  | 387/518 [05:40<02:01,  1.08it/s, loss=21.5]

cuda:0


 75%|███████▍  | 388/518 [05:40<01:54,  1.14it/s, loss=21.5]

cuda:0


 75%|███████▌  | 389/518 [05:41<01:53,  1.14it/s, loss=21.5]

cuda:0


 75%|███████▌  | 390/518 [05:42<01:49,  1.17it/s, loss=21.5]

cuda:0


 75%|███████▌  | 391/518 [05:43<01:53,  1.12it/s, loss=21.5]

cuda:0


 76%|███████▌  | 392/518 [05:44<01:47,  1.17it/s, loss=21.5]

cuda:0


 76%|███████▌  | 393/518 [05:45<01:52,  1.11it/s, loss=21.5]

cuda:0


 76%|███████▌  | 394/518 [05:46<01:48,  1.14it/s, loss=21.5]

cuda:0


 76%|███████▋  | 395/518 [05:47<01:51,  1.10it/s, loss=21.5]

cuda:0


 76%|███████▋  | 396/518 [05:47<01:47,  1.14it/s, loss=21.5]

cuda:0


 77%|███████▋  | 397/518 [05:48<01:47,  1.12it/s, loss=21.4]

cuda:0


 77%|███████▋  | 398/518 [05:49<01:43,  1.16it/s, loss=21.4]

cuda:0


 77%|███████▋  | 399/518 [05:50<01:39,  1.20it/s, loss=21.4]

cuda:0


 77%|███████▋  | 400/518 [05:51<01:37,  1.21it/s, loss=21.4]

cuda:0


 77%|███████▋  | 401/518 [05:52<01:35,  1.22it/s, loss=21.4]

cuda:0


 78%|███████▊  | 402/518 [05:52<01:34,  1.23it/s, loss=21.4]

cuda:0


 78%|███████▊  | 403/518 [05:53<01:32,  1.24it/s, loss=21.4]

cuda:0


 78%|███████▊  | 404/518 [05:54<01:30,  1.25it/s, loss=21.4]

cuda:0


 78%|███████▊  | 405/518 [05:55<01:29,  1.26it/s, loss=21.4]

cuda:0


 78%|███████▊  | 406/518 [05:55<01:27,  1.27it/s, loss=21.4]

cuda:0


 79%|███████▊  | 407/518 [05:56<01:31,  1.21it/s, loss=21.4]

cuda:0


 79%|███████▉  | 408/518 [05:57<01:32,  1.19it/s, loss=21.4]

cuda:0


 79%|███████▉  | 409/518 [05:58<01:32,  1.17it/s, loss=21.4]

cuda:0


 79%|███████▉  | 410/518 [05:59<01:32,  1.16it/s, loss=21.4]

cuda:0


 79%|███████▉  | 411/518 [06:00<01:41,  1.06it/s, loss=21.4]

cuda:0


 80%|███████▉  | 412/518 [06:01<01:35,  1.12it/s, loss=21.4]

cuda:0


 80%|███████▉  | 413/518 [06:02<01:36,  1.09it/s, loss=21.4]

cuda:0


 80%|███████▉  | 414/518 [06:03<01:32,  1.12it/s, loss=21.4]

cuda:0


 80%|████████  | 415/518 [06:04<01:28,  1.16it/s, loss=21.4]

cuda:0


 80%|████████  | 416/518 [06:04<01:26,  1.18it/s, loss=21.4]

cuda:0


 81%|████████  | 417/518 [06:05<01:30,  1.12it/s, loss=21.4]

cuda:0


 81%|████████  | 418/518 [06:06<01:25,  1.17it/s, loss=21.4]

cuda:0


 81%|████████  | 419/518 [06:07<01:25,  1.16it/s, loss=21.4]

cuda:0


 81%|████████  | 420/518 [06:08<01:24,  1.16it/s, loss=21.4]

cuda:0


 81%|████████▏ | 421/518 [06:09<01:28,  1.09it/s, loss=21.3]

cuda:0


 81%|████████▏ | 422/518 [06:10<01:26,  1.11it/s, loss=21.3]

cuda:0


 82%|████████▏ | 423/518 [06:11<01:40,  1.06s/it, loss=21.3]

cuda:0


 82%|████████▏ | 424/518 [06:12<01:33,  1.01it/s, loss=21.3]

cuda:0


 82%|████████▏ | 425/518 [06:13<01:29,  1.04it/s, loss=21.3]

cuda:0


 82%|████████▏ | 426/518 [06:14<01:24,  1.09it/s, loss=21.3]

cuda:0


 82%|████████▏ | 427/518 [06:15<01:22,  1.10it/s, loss=21.3]

cuda:0


 83%|████████▎ | 428/518 [06:15<01:18,  1.14it/s, loss=21.3]

cuda:0


 83%|████████▎ | 429/518 [06:16<01:20,  1.10it/s, loss=21.3]

cuda:0


 83%|████████▎ | 430/518 [06:17<01:17,  1.13it/s, loss=21.3]

cuda:0


 83%|████████▎ | 431/518 [06:18<01:14,  1.17it/s, loss=21.3]

cuda:0


 83%|████████▎ | 432/518 [06:19<01:13,  1.17it/s, loss=21.3]

cuda:0


 84%|████████▎ | 433/518 [06:20<01:12,  1.17it/s, loss=21.3]

cuda:0


 84%|████████▍ | 434/518 [06:21<01:10,  1.19it/s, loss=21.3]

cuda:0


 84%|████████▍ | 435/518 [06:21<01:11,  1.16it/s, loss=21.3]

cuda:0


 84%|████████▍ | 436/518 [06:22<01:10,  1.16it/s, loss=21.3]

cuda:0


 84%|████████▍ | 437/518 [06:23<01:07,  1.20it/s, loss=21.3]

cuda:0


 85%|████████▍ | 438/518 [06:24<01:04,  1.23it/s, loss=21.3]

cuda:0


 85%|████████▍ | 439/518 [06:25<01:05,  1.20it/s, loss=21.3]

cuda:0


 85%|████████▍ | 440/518 [06:26<01:05,  1.20it/s, loss=21.3]

cuda:0


 85%|████████▌ | 441/518 [06:26<01:03,  1.21it/s, loss=21.3]

cuda:0


 85%|████████▌ | 442/518 [06:27<01:01,  1.23it/s, loss=21.3]

cuda:0


 86%|████████▌ | 443/518 [06:28<01:04,  1.15it/s, loss=21.3]

cuda:0


 86%|████████▌ | 444/518 [06:29<01:02,  1.18it/s, loss=21.3]

cuda:0


 86%|████████▌ | 445/518 [06:30<01:02,  1.17it/s, loss=21.2]

cuda:0


 86%|████████▌ | 446/518 [06:31<01:04,  1.11it/s, loss=21.2]

cuda:0


 86%|████████▋ | 447/518 [06:32<01:13,  1.03s/it, loss=21.2]

cuda:0


 86%|████████▋ | 448/518 [06:33<01:07,  1.04it/s, loss=21.2]

cuda:0


 87%|████████▋ | 449/518 [06:34<01:07,  1.03it/s, loss=21.2]

cuda:0


 87%|████████▋ | 450/518 [06:35<01:03,  1.08it/s, loss=21.2]

cuda:0


 87%|████████▋ | 451/518 [06:36<00:59,  1.13it/s, loss=21.2]

cuda:0


 87%|████████▋ | 452/518 [06:36<00:57,  1.16it/s, loss=21.2]

cuda:0


 87%|████████▋ | 453/518 [06:37<00:54,  1.18it/s, loss=21.2]

cuda:0


 88%|████████▊ | 454/518 [06:38<00:53,  1.20it/s, loss=21.2]

cuda:0


 88%|████████▊ | 455/518 [06:39<00:51,  1.21it/s, loss=21.2]

cuda:0


 88%|████████▊ | 456/518 [06:40<00:51,  1.21it/s, loss=21.2]

cuda:0


 88%|████████▊ | 457/518 [06:40<00:51,  1.18it/s, loss=21.2]

cuda:0


 88%|████████▊ | 458/518 [06:41<00:50,  1.19it/s, loss=21.2]

cuda:0


 89%|████████▊ | 459/518 [06:42<00:51,  1.15it/s, loss=21.2]

cuda:0


 89%|████████▉ | 460/518 [06:43<00:49,  1.17it/s, loss=21.2]

cuda:0


 89%|████████▉ | 461/518 [06:44<00:54,  1.05it/s, loss=21.2]

cuda:0


 89%|████████▉ | 462/518 [06:45<00:51,  1.08it/s, loss=21.2]

cuda:0


 89%|████████▉ | 463/518 [06:46<00:51,  1.08it/s, loss=21.2]

cuda:0


 90%|████████▉ | 464/518 [06:47<00:48,  1.11it/s, loss=21.2]

cuda:0


 90%|████████▉ | 465/518 [06:48<00:48,  1.10it/s, loss=21.2]

cuda:0


 90%|████████▉ | 466/518 [06:49<00:45,  1.15it/s, loss=21.2]

cuda:0


 90%|█████████ | 467/518 [06:50<00:45,  1.11it/s, loss=21.2]

cuda:0


 90%|█████████ | 468/518 [06:50<00:44,  1.13it/s, loss=21.2]

cuda:0


 91%|█████████ | 469/518 [06:51<00:42,  1.16it/s, loss=21.2]

cuda:0


 91%|█████████ | 470/518 [06:52<00:41,  1.17it/s, loss=21.2]

cuda:0


 91%|█████████ | 471/518 [06:53<00:44,  1.06it/s, loss=21.1]

cuda:0


 91%|█████████ | 472/518 [06:54<00:41,  1.10it/s, loss=21.1]

cuda:0


 91%|█████████▏| 473/518 [06:55<00:39,  1.14it/s, loss=21.1]

cuda:0


 92%|█████████▏| 474/518 [06:56<00:37,  1.18it/s, loss=21.1]

cuda:0


 92%|█████████▏| 475/518 [06:56<00:35,  1.20it/s, loss=21.1]

cuda:0


 92%|█████████▏| 476/518 [06:57<00:35,  1.18it/s, loss=21.1]

cuda:0


 92%|█████████▏| 477/518 [06:58<00:33,  1.21it/s, loss=21.1]

cuda:0


 92%|█████████▏| 478/518 [06:59<00:32,  1.22it/s, loss=21.1]

cuda:0


 92%|█████████▏| 479/518 [07:00<00:32,  1.21it/s, loss=21.1]

cuda:0


 93%|█████████▎| 480/518 [07:01<00:31,  1.21it/s, loss=21.1]

cuda:0


 93%|█████████▎| 481/518 [07:01<00:30,  1.23it/s, loss=21.1]

cuda:0


 93%|█████████▎| 482/518 [07:02<00:29,  1.21it/s, loss=21.1]

cuda:0


 93%|█████████▎| 483/518 [07:03<00:29,  1.17it/s, loss=21.1]

cuda:0


 93%|█████████▎| 484/518 [07:04<00:29,  1.15it/s, loss=21.1]

cuda:0


 94%|█████████▎| 485/518 [07:05<00:29,  1.12it/s, loss=21.1]

cuda:0


 94%|█████████▍| 486/518 [07:06<00:27,  1.15it/s, loss=21.1]

cuda:0


 94%|█████████▍| 487/518 [07:07<00:26,  1.15it/s, loss=21.1]

cuda:0


 94%|█████████▍| 488/518 [07:07<00:25,  1.19it/s, loss=21.1]

cuda:0


 94%|█████████▍| 489/518 [07:08<00:23,  1.21it/s, loss=21.1]

cuda:0


 95%|█████████▍| 490/518 [07:09<00:24,  1.15it/s, loss=21.1]

cuda:0


 95%|█████████▍| 491/518 [07:10<00:22,  1.20it/s, loss=21]

cuda:0


 95%|█████████▍| 492/518 [07:11<00:22,  1.14it/s, loss=21]

cuda:0


 95%|█████████▌| 493/518 [07:12<00:21,  1.14it/s, loss=21]

cuda:0


 95%|█████████▌| 494/518 [07:13<00:20,  1.17it/s, loss=21]

cuda:0


 96%|█████████▌| 495/518 [07:13<00:19,  1.16it/s, loss=21]

cuda:0


 96%|█████████▌| 496/518 [07:14<00:19,  1.15it/s, loss=21]

cuda:0


 96%|█████████▌| 497/518 [07:15<00:18,  1.15it/s, loss=21]

cuda:0


 96%|█████████▌| 498/518 [07:16<00:17,  1.15it/s, loss=21]

cuda:0


 96%|█████████▋| 499/518 [07:17<00:16,  1.16it/s, loss=21]

cuda:0


 97%|█████████▋| 500/518 [07:18<00:16,  1.11it/s, loss=21]

cuda:0


 97%|█████████▋| 501/518 [07:19<00:14,  1.15it/s, loss=21]

cuda:0


 97%|█████████▋| 502/518 [07:20<00:14,  1.07it/s, loss=21]

cuda:0


 97%|█████████▋| 503/518 [07:21<00:13,  1.11it/s, loss=21]

cuda:0


 97%|█████████▋| 504/518 [07:21<00:12,  1.13it/s, loss=21]

cuda:0


 97%|█████████▋| 505/518 [07:22<00:11,  1.18it/s, loss=21]

cuda:0


 98%|█████████▊| 506/518 [07:23<00:10,  1.12it/s, loss=21]

cuda:0


 98%|█████████▊| 507/518 [07:24<00:09,  1.14it/s, loss=21]

cuda:0


 98%|█████████▊| 508/518 [07:25<00:08,  1.14it/s, loss=21]

cuda:0


 98%|█████████▊| 509/518 [07:26<00:07,  1.14it/s, loss=21]

cuda:0


 98%|█████████▊| 510/518 [07:27<00:06,  1.17it/s, loss=21]

cuda:0


 99%|█████████▊| 511/518 [07:27<00:05,  1.18it/s, loss=20.9]

cuda:0


 99%|█████████▉| 512/518 [07:28<00:05,  1.19it/s, loss=20.9]

cuda:0


 99%|█████████▉| 513/518 [07:29<00:04,  1.20it/s, loss=20.9]

cuda:0


 99%|█████████▉| 514/518 [07:30<00:03,  1.16it/s, loss=20.9]

cuda:0


 99%|█████████▉| 515/518 [07:31<00:02,  1.20it/s, loss=20.9]

cuda:0


100%|█████████▉| 516/518 [07:32<00:01,  1.25it/s, loss=20.9]

cuda:0


100%|█████████▉| 517/518 [07:32<00:00,  1.28it/s, loss=20.9]

cuda:0


100%|██████████| 518/518 [07:33<00:00,  1.14it/s, loss=20.9]
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /usr/local/src/pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /usr/local/src/pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


cuda:0


  0%|          | 1/518 [00:02<19:13,  2.23s/it, loss=17.5]

cuda:0


  0%|          | 2/518 [00:03<12:29,  1.45s/it, loss=18.4]

cuda:0


  1%|          | 3/518 [00:04<13:52,  1.62s/it, loss=18.7]

cuda:0


  1%|          | 4/518 [00:05<11:12,  1.31s/it, loss=18.7]

cuda:0


  1%|          | 5/518 [00:06<10:35,  1.24s/it, loss=18.9]

cuda:0


  1%|          | 6/518 [00:07<09:16,  1.09s/it, loss=19.1]

cuda:0


  1%|▏         | 7/518 [00:08<08:27,  1.01it/s, loss=19.3]

cuda:0


  2%|▏         | 8/518 [00:09<07:54,  1.07it/s, loss=19.3]

cuda:0


  2%|▏         | 9/518 [00:10<07:30,  1.13it/s, loss=19.5]

cuda:0


  2%|▏         | 10/518 [00:10<07:19,  1.16it/s, loss=19.4]

cuda:0


  2%|▏         | 11/518 [00:11<07:10,  1.18it/s, loss=19.3]

cuda:0


  2%|▏         | 12/518 [00:12<07:00,  1.20it/s, loss=19.3]

cuda:0


  3%|▎         | 13/518 [00:13<07:15,  1.16it/s, loss=19.3]

cuda:0


  3%|▎         | 14/518 [00:14<07:20,  1.14it/s, loss=19.3]

cuda:0


  3%|▎         | 15/518 [00:15<07:59,  1.05it/s, loss=19.3]

cuda:0


  3%|▎         | 16/518 [00:16<07:41,  1.09it/s, loss=19.3]

cuda:0


  3%|▎         | 17/518 [00:17<07:35,  1.10it/s, loss=19.2]

cuda:0


  3%|▎         | 18/518 [00:18<07:23,  1.13it/s, loss=19.2]

cuda:0


  4%|▎         | 19/518 [00:19<07:40,  1.08it/s, loss=19.2]

cuda:0


  4%|▍         | 20/518 [00:19<07:28,  1.11it/s, loss=19.3]

cuda:0


  4%|▍         | 21/518 [00:20<07:44,  1.07it/s, loss=19.3]

cuda:0


  4%|▍         | 22/518 [00:21<07:17,  1.13it/s, loss=19.2]

cuda:0


  4%|▍         | 23/518 [00:22<07:32,  1.09it/s, loss=19.2]

cuda:0


  5%|▍         | 24/518 [00:23<07:18,  1.13it/s, loss=19.2]

cuda:0


  5%|▍         | 25/518 [00:24<07:12,  1.14it/s, loss=19.1]

cuda:0


  5%|▌         | 26/518 [00:25<07:06,  1.15it/s, loss=19.1]

cuda:0


  5%|▌         | 27/518 [00:26<07:26,  1.10it/s, loss=19.1]

cuda:0


  5%|▌         | 28/518 [00:26<07:10,  1.14it/s, loss=19]

cuda:0


  6%|▌         | 29/518 [00:27<07:02,  1.16it/s, loss=19]

cuda:0


  6%|▌         | 30/518 [00:28<07:19,  1.11it/s, loss=19]

cuda:0


  6%|▌         | 31/518 [00:29<06:59,  1.16it/s, loss=19]

cuda:0


  6%|▌         | 32/518 [00:30<06:53,  1.18it/s, loss=19]

cuda:0


  6%|▋         | 33/518 [00:31<06:43,  1.20it/s, loss=19]

cuda:0


  7%|▋         | 34/518 [00:32<07:00,  1.15it/s, loss=19]

cuda:0


  7%|▋         | 35/518 [00:32<06:47,  1.19it/s, loss=19]

cuda:0


  7%|▋         | 36/518 [00:33<07:13,  1.11it/s, loss=19]

cuda:0


  7%|▋         | 37/518 [00:34<07:09,  1.12it/s, loss=19]

cuda:0


  7%|▋         | 38/518 [00:35<07:45,  1.03it/s, loss=19]

cuda:0


  8%|▊         | 39/518 [00:36<07:32,  1.06it/s, loss=19]

cuda:0


  8%|▊         | 40/518 [00:37<07:48,  1.02it/s, loss=19.1]

cuda:0


  8%|▊         | 41/518 [00:38<07:21,  1.08it/s, loss=19]

cuda:0


  8%|▊         | 42/518 [00:39<07:12,  1.10it/s, loss=19.1]

cuda:0


  8%|▊         | 43/518 [00:40<06:50,  1.16it/s, loss=19.1]

cuda:0


  8%|▊         | 44/518 [00:41<06:51,  1.15it/s, loss=19.1]

cuda:0


  9%|▊         | 45/518 [00:41<06:34,  1.20it/s, loss=19.1]

cuda:0


  9%|▉         | 46/518 [00:42<06:47,  1.16it/s, loss=19.1]

cuda:0


  9%|▉         | 47/518 [00:43<06:33,  1.20it/s, loss=19]

cuda:0


  9%|▉         | 48/518 [00:44<06:42,  1.17it/s, loss=19]

cuda:0


  9%|▉         | 49/518 [00:45<06:36,  1.18it/s, loss=19]

cuda:0


 10%|▉         | 50/518 [00:46<06:40,  1.17it/s, loss=19]

cuda:0


 10%|▉         | 51/518 [00:47<06:39,  1.17it/s, loss=19]

cuda:0


 10%|█         | 52/518 [00:48<07:21,  1.06it/s, loss=19]

cuda:0


 10%|█         | 53/518 [00:49<07:10,  1.08it/s, loss=19]

cuda:0


 10%|█         | 54/518 [00:49<06:48,  1.13it/s, loss=19]

cuda:0


 11%|█         | 55/518 [00:50<06:41,  1.15it/s, loss=18.9]

cuda:0


 11%|█         | 56/518 [00:51<06:43,  1.14it/s, loss=18.9]

cuda:0


 11%|█         | 57/518 [00:52<06:34,  1.17it/s, loss=18.9]

cuda:0


 11%|█         | 58/518 [00:53<07:27,  1.03it/s, loss=18.9]

cuda:0


 11%|█▏        | 59/518 [00:54<07:03,  1.08it/s, loss=18.9]

cuda:0


 12%|█▏        | 60/518 [00:55<06:54,  1.11it/s, loss=18.9]

cuda:0


 12%|█▏        | 61/518 [00:56<06:36,  1.15it/s, loss=18.9]

cuda:0


 12%|█▏        | 62/518 [00:56<06:23,  1.19it/s, loss=18.9]

cuda:0


 12%|█▏        | 63/518 [00:57<06:23,  1.19it/s, loss=18.9]

cuda:0


 12%|█▏        | 64/518 [00:58<06:53,  1.10it/s, loss=18.9]

cuda:0


 13%|█▎        | 65/518 [00:59<06:37,  1.14it/s, loss=18.9]

cuda:0


 13%|█▎        | 66/518 [01:00<06:51,  1.10it/s, loss=18.9]

cuda:0


 13%|█▎        | 67/518 [01:01<06:39,  1.13it/s, loss=18.9]

cuda:0


 13%|█▎        | 68/518 [01:02<06:25,  1.17it/s, loss=18.9]

cuda:0


 13%|█▎        | 69/518 [01:03<06:13,  1.20it/s, loss=18.9]

cuda:0


 14%|█▎        | 70/518 [01:03<06:07,  1.22it/s, loss=18.9]

cuda:0


 14%|█▎        | 71/518 [01:04<05:59,  1.24it/s, loss=18.9]

cuda:0


 14%|█▍        | 72/518 [01:05<05:55,  1.25it/s, loss=18.9]

cuda:0


 14%|█▍        | 73/518 [01:06<05:51,  1.27it/s, loss=18.8]

cuda:0


 14%|█▍        | 74/518 [01:07<06:37,  1.12it/s, loss=18.9]

cuda:0


 14%|█▍        | 75/518 [01:08<06:32,  1.13it/s, loss=18.8]

cuda:0


 15%|█▍        | 76/518 [01:09<07:37,  1.04s/it, loss=18.8]

cuda:0


 15%|█▍        | 77/518 [01:10<07:07,  1.03it/s, loss=18.9]

cuda:0


 15%|█▌        | 78/518 [01:11<07:04,  1.04it/s, loss=18.8]

cuda:0


 15%|█▌        | 79/518 [01:12<06:45,  1.08it/s, loss=18.8]

cuda:0


 15%|█▌        | 80/518 [01:12<06:28,  1.13it/s, loss=18.8]

cuda:0


 16%|█▌        | 81/518 [01:13<06:11,  1.18it/s, loss=18.8]

cuda:0


 16%|█▌        | 82/518 [01:14<06:20,  1.15it/s, loss=18.8]

cuda:0


 16%|█▌        | 83/518 [01:15<06:08,  1.18it/s, loss=18.8]

cuda:0


 16%|█▌        | 84/518 [01:16<06:07,  1.18it/s, loss=18.8]

cuda:0


 16%|█▋        | 85/518 [01:17<06:02,  1.19it/s, loss=18.8]

cuda:0


 17%|█▋        | 86/518 [01:17<05:56,  1.21it/s, loss=18.8]

cuda:0


 17%|█▋        | 87/518 [01:18<05:48,  1.24it/s, loss=18.8]

cuda:0


 17%|█▋        | 88/518 [01:19<05:54,  1.21it/s, loss=18.8]

cuda:0


 17%|█▋        | 89/518 [01:20<05:57,  1.20it/s, loss=18.8]

cuda:0


 17%|█▋        | 90/518 [01:21<06:08,  1.16it/s, loss=18.8]

cuda:0


 18%|█▊        | 91/518 [01:22<06:08,  1.16it/s, loss=18.7]

cuda:0


 18%|█▊        | 92/518 [01:23<06:24,  1.11it/s, loss=18.8]

cuda:0


 18%|█▊        | 93/518 [01:23<06:10,  1.15it/s, loss=18.7]

cuda:0


 18%|█▊        | 94/518 [01:25<06:31,  1.08it/s, loss=18.7]

cuda:0


 18%|█▊        | 95/518 [01:25<06:18,  1.12it/s, loss=18.7]

cuda:0


 19%|█▊        | 96/518 [01:27<07:17,  1.04s/it, loss=18.7]

cuda:0


 19%|█▊        | 97/518 [01:28<06:45,  1.04it/s, loss=18.7]

cuda:0


 19%|█▉        | 98/518 [01:29<06:59,  1.00it/s, loss=18.7]

cuda:0


 19%|█▉        | 99/518 [01:29<06:32,  1.07it/s, loss=18.7]

cuda:0


 19%|█▉        | 100/518 [01:30<06:33,  1.06it/s, loss=18.7]

cuda:0


 19%|█▉        | 101/518 [01:31<06:16,  1.11it/s, loss=18.7]

cuda:0


 20%|█▉        | 102/518 [01:32<06:38,  1.04it/s, loss=18.7]

cuda:0


 20%|█▉        | 103/518 [01:33<06:26,  1.08it/s, loss=18.7]

cuda:0


 20%|██        | 104/518 [01:34<06:13,  1.11it/s, loss=18.7]

cuda:0


 20%|██        | 105/518 [01:35<05:56,  1.16it/s, loss=18.7]

cuda:0


 20%|██        | 106/518 [01:36<06:27,  1.06it/s, loss=18.7]

cuda:0


 21%|██        | 107/518 [01:37<06:12,  1.10it/s, loss=18.7]

cuda:0


 21%|██        | 108/518 [01:37<06:01,  1.13it/s, loss=18.6]

cuda:0


 21%|██        | 109/518 [01:38<06:04,  1.12it/s, loss=18.6]

cuda:0


 21%|██        | 110/518 [01:39<06:15,  1.09it/s, loss=18.6]

cuda:0


 21%|██▏       | 111/518 [01:40<06:10,  1.10it/s, loss=18.6]

cuda:0


 22%|██▏       | 112/518 [01:41<06:15,  1.08it/s, loss=18.6]

cuda:0


 22%|██▏       | 113/518 [01:42<06:05,  1.11it/s, loss=18.6]

cuda:0


 22%|██▏       | 114/518 [01:43<06:11,  1.09it/s, loss=18.6]

cuda:0


 22%|██▏       | 115/518 [01:44<05:58,  1.13it/s, loss=18.6]

cuda:0


 22%|██▏       | 116/518 [01:45<06:15,  1.07it/s, loss=18.6]

cuda:0


 23%|██▎       | 117/518 [01:46<06:01,  1.11it/s, loss=18.6]

cuda:0


 23%|██▎       | 118/518 [01:47<06:17,  1.06it/s, loss=18.6]

cuda:0


 23%|██▎       | 119/518 [01:48<06:00,  1.11it/s, loss=18.6]

cuda:0


 23%|██▎       | 120/518 [01:48<05:48,  1.14it/s, loss=18.6]

cuda:0


 23%|██▎       | 121/518 [01:49<05:34,  1.19it/s, loss=18.6]

cuda:0


 24%|██▎       | 122/518 [01:50<05:43,  1.15it/s, loss=18.6]

cuda:0


 24%|██▎       | 123/518 [01:51<05:40,  1.16it/s, loss=18.6]

cuda:0


 24%|██▍       | 124/518 [01:52<05:40,  1.16it/s, loss=18.6]

cuda:0


 24%|██▍       | 125/518 [01:53<05:35,  1.17it/s, loss=18.6]

cuda:0


 24%|██▍       | 126/518 [01:54<06:16,  1.04it/s, loss=18.6]

cuda:0


 25%|██▍       | 127/518 [01:55<05:53,  1.10it/s, loss=18.6]

cuda:0


 25%|██▍       | 128/518 [01:55<05:49,  1.12it/s, loss=18.6]

cuda:0


 25%|██▍       | 129/518 [01:56<05:38,  1.15it/s, loss=18.6]

cuda:0


 25%|██▌       | 130/518 [01:57<05:49,  1.11it/s, loss=18.6]

cuda:0


 25%|██▌       | 131/518 [01:58<05:45,  1.12it/s, loss=18.6]

cuda:0


 25%|██▌       | 132/518 [01:59<06:04,  1.06it/s, loss=18.6]

cuda:0


 26%|██▌       | 133/518 [02:00<05:50,  1.10it/s, loss=18.6]

cuda:0


 26%|██▌       | 134/518 [02:01<05:58,  1.07it/s, loss=18.6]

cuda:0


 26%|██▌       | 135/518 [02:02<05:43,  1.12it/s, loss=18.6]

cuda:0


 26%|██▋       | 136/518 [02:03<05:46,  1.10it/s, loss=18.6]

cuda:0


 26%|██▋       | 137/518 [02:04<05:40,  1.12it/s, loss=18.6]

cuda:0


 27%|██▋       | 138/518 [02:05<06:02,  1.05it/s, loss=18.6]

cuda:0


 27%|██▋       | 139/518 [02:05<05:40,  1.11it/s, loss=18.6]

cuda:0


 27%|██▋       | 140/518 [02:06<05:41,  1.11it/s, loss=18.6]

cuda:0


 27%|██▋       | 141/518 [02:07<05:25,  1.16it/s, loss=18.6]

cuda:0


 27%|██▋       | 142/518 [02:08<05:44,  1.09it/s, loss=18.6]

cuda:0


 28%|██▊       | 143/518 [02:09<05:29,  1.14it/s, loss=18.6]

cuda:0


 28%|██▊       | 144/518 [02:10<05:22,  1.16it/s, loss=18.5]

cuda:0


 28%|██▊       | 145/518 [02:11<05:14,  1.19it/s, loss=18.5]

cuda:0


 28%|██▊       | 146/518 [02:12<05:18,  1.17it/s, loss=18.5]

cuda:0


 28%|██▊       | 147/518 [02:12<05:13,  1.18it/s, loss=18.5]

cuda:0


 29%|██▊       | 148/518 [02:13<05:39,  1.09it/s, loss=18.5]

cuda:0


 29%|██▉       | 149/518 [02:14<05:33,  1.11it/s, loss=18.5]

cuda:0


 29%|██▉       | 150/518 [02:15<05:23,  1.14it/s, loss=18.5]

cuda:0


 29%|██▉       | 151/518 [02:16<05:16,  1.16it/s, loss=18.5]

cuda:0


 29%|██▉       | 152/518 [02:17<05:24,  1.13it/s, loss=18.5]

cuda:0


 30%|██▉       | 153/518 [02:18<05:14,  1.16it/s, loss=18.5]

cuda:0


 30%|██▉       | 154/518 [02:19<05:15,  1.15it/s, loss=18.5]

cuda:0


 30%|██▉       | 155/518 [02:19<05:10,  1.17it/s, loss=18.5]

cuda:0


 30%|███       | 156/518 [02:20<04:59,  1.21it/s, loss=18.5]

cuda:0


 30%|███       | 157/518 [02:21<05:04,  1.19it/s, loss=18.5]

cuda:0


 31%|███       | 158/518 [02:22<04:55,  1.22it/s, loss=18.5]

cuda:0


 31%|███       | 159/518 [02:23<04:56,  1.21it/s, loss=18.5]

cuda:0


 31%|███       | 160/518 [02:23<04:59,  1.19it/s, loss=18.5]

cuda:0


 31%|███       | 161/518 [02:24<04:51,  1.23it/s, loss=18.5]

cuda:0


 31%|███▏      | 162/518 [02:25<05:15,  1.13it/s, loss=18.5]

cuda:0


 31%|███▏      | 163/518 [02:26<05:07,  1.16it/s, loss=18.5]

cuda:0


 32%|███▏      | 164/518 [02:27<05:08,  1.15it/s, loss=18.5]

cuda:0


 32%|███▏      | 165/518 [02:28<05:02,  1.17it/s, loss=18.5]

cuda:0


 32%|███▏      | 166/518 [02:29<05:01,  1.17it/s, loss=18.5]

cuda:0


 32%|███▏      | 167/518 [02:29<04:53,  1.20it/s, loss=18.5]

cuda:0


 32%|███▏      | 168/518 [02:30<04:55,  1.18it/s, loss=18.5]

cuda:0


 33%|███▎      | 169/518 [02:31<04:49,  1.20it/s, loss=18.5]

cuda:0


 33%|███▎      | 170/518 [02:32<04:52,  1.19it/s, loss=18.5]

cuda:0


 33%|███▎      | 171/518 [02:33<04:51,  1.19it/s, loss=18.5]

cuda:0


 33%|███▎      | 172/518 [02:34<05:08,  1.12it/s, loss=18.5]

cuda:0


 33%|███▎      | 173/518 [02:35<04:58,  1.15it/s, loss=18.5]

cuda:0


 34%|███▎      | 174/518 [02:36<05:01,  1.14it/s, loss=18.5]

cuda:0


 34%|███▍      | 175/518 [02:36<04:56,  1.16it/s, loss=18.5]

cuda:0


 34%|███▍      | 176/518 [02:38<05:38,  1.01it/s, loss=18.5]

cuda:0


 34%|███▍      | 177/518 [02:38<05:14,  1.08it/s, loss=18.5]

cuda:0


 34%|███▍      | 178/518 [02:39<05:21,  1.06it/s, loss=18.5]

cuda:0


 35%|███▍      | 179/518 [02:40<05:10,  1.09it/s, loss=18.5]

cuda:0


 35%|███▍      | 180/518 [02:41<04:59,  1.13it/s, loss=18.5]

cuda:0


 35%|███▍      | 181/518 [02:42<04:46,  1.18it/s, loss=18.5]

cuda:0


 35%|███▌      | 182/518 [02:43<04:52,  1.15it/s, loss=18.5]

cuda:0


 35%|███▌      | 183/518 [02:44<04:59,  1.12it/s, loss=18.4]

cuda:0


 36%|███▌      | 184/518 [02:45<05:34,  1.00s/it, loss=18.4]

cuda:0


 36%|███▌      | 185/518 [02:46<05:12,  1.07it/s, loss=18.4]

cuda:0


 36%|███▌      | 186/518 [02:47<05:12,  1.06it/s, loss=18.4]

cuda:0


 36%|███▌      | 187/518 [02:48<04:57,  1.11it/s, loss=18.4]

cuda:0


 36%|███▋      | 188/518 [02:49<05:36,  1.02s/it, loss=18.4]

cuda:0


 36%|███▋      | 189/518 [02:50<05:11,  1.05it/s, loss=18.4]

cuda:0


 37%|███▋      | 190/518 [02:50<04:53,  1.12it/s, loss=18.4]

cuda:0


 37%|███▋      | 191/518 [02:51<04:45,  1.14it/s, loss=18.4]

cuda:0


 37%|███▋      | 192/518 [02:52<04:34,  1.19it/s, loss=18.4]

cuda:0


 37%|███▋      | 193/518 [02:53<04:29,  1.21it/s, loss=18.4]

cuda:0


 37%|███▋      | 194/518 [02:54<04:49,  1.12it/s, loss=18.4]

cuda:0


 38%|███▊      | 195/518 [02:55<04:36,  1.17it/s, loss=18.4]

cuda:0


 38%|███▊      | 196/518 [02:56<04:57,  1.08it/s, loss=18.4]

cuda:0


 38%|███▊      | 197/518 [02:56<04:43,  1.13it/s, loss=18.4]

cuda:0


 38%|███▊      | 198/518 [02:57<04:52,  1.09it/s, loss=18.4]

cuda:0


 38%|███▊      | 199/518 [02:58<04:55,  1.08it/s, loss=18.4]

cuda:0


 39%|███▊      | 200/518 [02:59<04:43,  1.12it/s, loss=18.4]

cuda:0


 39%|███▉      | 201/518 [03:00<04:41,  1.13it/s, loss=18.4]

cuda:0


 39%|███▉      | 202/518 [03:01<05:05,  1.04it/s, loss=18.4]

cuda:0


 39%|███▉      | 203/518 [03:02<04:49,  1.09it/s, loss=18.4]

cuda:0


 39%|███▉      | 204/518 [03:03<04:42,  1.11it/s, loss=18.4]

cuda:0


 40%|███▉      | 205/518 [03:04<04:38,  1.12it/s, loss=18.4]

cuda:0


 40%|███▉      | 206/518 [03:05<04:34,  1.14it/s, loss=18.4]

cuda:0


 40%|███▉      | 207/518 [03:05<04:22,  1.18it/s, loss=18.4]

cuda:0


 40%|████      | 208/518 [03:06<04:22,  1.18it/s, loss=18.4]

cuda:0


 40%|████      | 209/518 [03:07<04:24,  1.17it/s, loss=18.4]

cuda:0


 41%|████      | 210/518 [03:08<04:27,  1.15it/s, loss=18.4]

cuda:0


 41%|████      | 211/518 [03:09<04:24,  1.16it/s, loss=18.4]

cuda:0


 41%|████      | 212/518 [03:10<04:35,  1.11it/s, loss=18.4]

cuda:0


 41%|████      | 213/518 [03:11<04:44,  1.07it/s, loss=18.4]

cuda:0


 41%|████▏     | 214/518 [03:12<04:31,  1.12it/s, loss=18.4]

cuda:0


 42%|████▏     | 215/518 [03:12<04:21,  1.16it/s, loss=18.4]

cuda:0


 42%|████▏     | 216/518 [03:13<04:19,  1.16it/s, loss=18.4]

cuda:0


 42%|████▏     | 217/518 [03:14<04:11,  1.20it/s, loss=18.4]

cuda:0


 42%|████▏     | 218/518 [03:15<04:06,  1.22it/s, loss=18.4]

cuda:0


 42%|████▏     | 219/518 [03:16<04:28,  1.12it/s, loss=18.4]

cuda:0


 42%|████▏     | 220/518 [03:17<04:21,  1.14it/s, loss=18.4]

cuda:0


 43%|████▎     | 221/518 [03:18<04:34,  1.08it/s, loss=18.4]

cuda:0


 43%|████▎     | 222/518 [03:19<04:25,  1.11it/s, loss=18.4]

cuda:0


 43%|████▎     | 223/518 [03:19<04:17,  1.14it/s, loss=18.4]

cuda:0


 43%|████▎     | 224/518 [03:20<04:19,  1.13it/s, loss=18.3]

cuda:0


 43%|████▎     | 225/518 [03:21<04:24,  1.11it/s, loss=18.3]

cuda:0


 44%|████▎     | 226/518 [03:22<04:11,  1.16it/s, loss=18.3]

cuda:0


 44%|████▍     | 227/518 [03:23<04:19,  1.12it/s, loss=18.3]

cuda:0


 44%|████▍     | 228/518 [03:24<04:17,  1.13it/s, loss=18.3]

cuda:0


 44%|████▍     | 229/518 [03:25<04:06,  1.17it/s, loss=18.3]

cuda:0


 44%|████▍     | 230/518 [03:26<04:13,  1.13it/s, loss=18.3]

cuda:0


 45%|████▍     | 231/518 [03:26<04:02,  1.18it/s, loss=18.3]

cuda:0


 45%|████▍     | 232/518 [03:27<04:08,  1.15it/s, loss=18.3]

cuda:0


 45%|████▍     | 233/518 [03:28<04:02,  1.18it/s, loss=18.3]

cuda:0


 45%|████▌     | 234/518 [03:29<03:55,  1.21it/s, loss=18.3]

cuda:0


 45%|████▌     | 235/518 [03:30<03:49,  1.23it/s, loss=18.3]

cuda:0


 46%|████▌     | 236/518 [03:31<03:57,  1.19it/s, loss=18.3]

cuda:0


 46%|████▌     | 237/518 [03:31<03:54,  1.20it/s, loss=18.3]

cuda:0


 46%|████▌     | 238/518 [03:33<04:21,  1.07it/s, loss=18.3]

cuda:0


 46%|████▌     | 239/518 [03:33<04:09,  1.12it/s, loss=18.3]

cuda:0


 46%|████▋     | 240/518 [03:34<04:17,  1.08it/s, loss=18.3]

cuda:0


 47%|████▋     | 241/518 [03:35<04:06,  1.13it/s, loss=18.3]

cuda:0


 47%|████▋     | 242/518 [03:36<03:57,  1.16it/s, loss=18.3]

cuda:0


 47%|████▋     | 243/518 [03:37<03:54,  1.17it/s, loss=18.3]

cuda:0


 47%|████▋     | 244/518 [03:38<03:47,  1.20it/s, loss=18.3]

cuda:0


 47%|████▋     | 245/518 [03:38<03:50,  1.18it/s, loss=18.3]

cuda:0


 47%|████▋     | 246/518 [03:39<03:46,  1.20it/s, loss=18.3]

cuda:0


 48%|████▊     | 247/518 [03:40<03:44,  1.21it/s, loss=18.3]

cuda:0


 48%|████▊     | 248/518 [03:41<03:54,  1.15it/s, loss=18.3]

cuda:0


 48%|████▊     | 249/518 [03:42<04:01,  1.11it/s, loss=18.3]

cuda:0


 48%|████▊     | 250/518 [03:43<04:32,  1.02s/it, loss=18.3]

cuda:0


 48%|████▊     | 251/518 [03:44<04:16,  1.04it/s, loss=18.3]

cuda:0


 49%|████▊     | 252/518 [03:45<04:22,  1.01it/s, loss=18.3]

cuda:0


 49%|████▉     | 253/518 [03:46<04:14,  1.04it/s, loss=18.3]

cuda:0


 49%|████▉     | 254/518 [03:47<04:04,  1.08it/s, loss=18.3]

cuda:0


 49%|████▉     | 255/518 [03:48<04:07,  1.06it/s, loss=18.3]

cuda:0


 49%|████▉     | 256/518 [03:49<04:17,  1.02it/s, loss=18.3]

cuda:0


 50%|████▉     | 257/518 [03:50<04:21,  1.00s/it, loss=18.3]

cuda:0


 50%|████▉     | 258/518 [03:51<04:08,  1.05it/s, loss=18.3]

cuda:0


 50%|█████     | 259/518 [03:52<03:54,  1.10it/s, loss=18.3]

cuda:0


 50%|█████     | 260/518 [03:53<03:51,  1.12it/s, loss=18.3]

cuda:0


 50%|█████     | 261/518 [03:53<03:44,  1.14it/s, loss=18.2]

cuda:0


 51%|█████     | 262/518 [03:55<04:10,  1.02it/s, loss=18.2]

cuda:0


 51%|█████     | 263/518 [03:55<03:57,  1.08it/s, loss=18.2]

cuda:0


 51%|█████     | 264/518 [03:56<03:56,  1.07it/s, loss=18.2]

cuda:0


 51%|█████     | 265/518 [03:57<03:46,  1.12it/s, loss=18.2]

cuda:0


 51%|█████▏    | 266/518 [03:58<03:46,  1.11it/s, loss=18.2]

cuda:0


 52%|█████▏    | 267/518 [03:59<03:38,  1.15it/s, loss=18.2]

cuda:0


 52%|█████▏    | 268/518 [04:00<03:38,  1.15it/s, loss=18.2]

cuda:0


 52%|█████▏    | 269/518 [04:01<03:30,  1.18it/s, loss=18.2]

cuda:0


 52%|█████▏    | 270/518 [04:01<03:29,  1.18it/s, loss=18.2]

cuda:0


 52%|█████▏    | 271/518 [04:02<03:26,  1.20it/s, loss=18.2]

cuda:0


 53%|█████▎    | 272/518 [04:03<03:20,  1.23it/s, loss=18.2]

cuda:0


 53%|█████▎    | 273/518 [04:04<03:18,  1.24it/s, loss=18.2]

cuda:0


 53%|█████▎    | 274/518 [04:05<03:25,  1.19it/s, loss=18.2]

cuda:0


 53%|█████▎    | 275/518 [04:05<03:23,  1.20it/s, loss=18.2]

cuda:0


 53%|█████▎    | 276/518 [04:07<03:42,  1.09it/s, loss=18.2]

cuda:0


 53%|█████▎    | 277/518 [04:07<03:31,  1.14it/s, loss=18.2]

cuda:0


 54%|█████▎    | 278/518 [04:08<03:36,  1.11it/s, loss=18.2]

cuda:0


 54%|█████▍    | 279/518 [04:09<03:28,  1.15it/s, loss=18.2]

cuda:0


 54%|█████▍    | 280/518 [04:10<03:26,  1.15it/s, loss=18.2]

cuda:0


 54%|█████▍    | 281/518 [04:11<03:20,  1.18it/s, loss=18.2]

cuda:0


 54%|█████▍    | 282/518 [04:12<03:16,  1.20it/s, loss=18.2]

cuda:0


 55%|█████▍    | 283/518 [04:12<03:14,  1.21it/s, loss=18.2]

cuda:0


 55%|█████▍    | 284/518 [04:13<03:13,  1.21it/s, loss=18.2]

cuda:0


 55%|█████▌    | 285/518 [04:14<03:10,  1.22it/s, loss=18.2]

cuda:0


 55%|█████▌    | 286/518 [04:15<03:08,  1.23it/s, loss=18.2]

cuda:0


 55%|█████▌    | 287/518 [04:16<03:09,  1.22it/s, loss=18.2]

cuda:0


 56%|█████▌    | 288/518 [04:17<03:56,  1.03s/it, loss=18.2]

cuda:0


 56%|█████▌    | 289/518 [04:18<03:43,  1.03it/s, loss=18.2]

cuda:0


 56%|█████▌    | 290/518 [04:19<03:41,  1.03it/s, loss=18.2]

cuda:0


 56%|█████▌    | 291/518 [04:20<03:31,  1.08it/s, loss=18.2]

cuda:0


 56%|█████▋    | 292/518 [04:21<03:40,  1.03it/s, loss=18.2]

cuda:0


 57%|█████▋    | 293/518 [04:22<03:44,  1.00it/s, loss=18.2]

cuda:0


 57%|█████▋    | 294/518 [04:23<03:29,  1.07it/s, loss=18.2]

cuda:0


 57%|█████▋    | 295/518 [04:24<03:37,  1.02it/s, loss=18.2]

cuda:0


 57%|█████▋    | 296/518 [04:25<03:28,  1.06it/s, loss=18.2]

cuda:0


 57%|█████▋    | 297/518 [04:25<03:17,  1.12it/s, loss=18.2]

cuda:0


 58%|█████▊    | 298/518 [04:26<03:11,  1.15it/s, loss=18.2]

cuda:0


 58%|█████▊    | 299/518 [04:27<03:25,  1.07it/s, loss=18.2]

cuda:0


 58%|█████▊    | 300/518 [04:28<03:14,  1.12it/s, loss=18.2]

cuda:0


 58%|█████▊    | 301/518 [04:29<03:18,  1.09it/s, loss=18.2]

cuda:0


 58%|█████▊    | 302/518 [04:30<03:08,  1.15it/s, loss=18.2]

cuda:0


 58%|█████▊    | 303/518 [04:31<03:15,  1.10it/s, loss=18.2]

cuda:0


 59%|█████▊    | 304/518 [04:32<03:07,  1.14it/s, loss=18.2]

cuda:0


 59%|█████▉    | 305/518 [04:33<03:14,  1.10it/s, loss=18.2]

cuda:0


 59%|█████▉    | 306/518 [04:34<03:07,  1.13it/s, loss=18.2]

cuda:0


 59%|█████▉    | 307/518 [04:35<03:13,  1.09it/s, loss=18.2]

cuda:0


 59%|█████▉    | 308/518 [04:35<03:06,  1.13it/s, loss=18.2]

cuda:0


 60%|█████▉    | 309/518 [04:36<03:03,  1.14it/s, loss=18.2]

cuda:0


 60%|█████▉    | 310/518 [04:37<02:59,  1.16it/s, loss=18.2]

cuda:0


 60%|██████    | 311/518 [04:38<03:01,  1.14it/s, loss=18.2]

cuda:0


 60%|██████    | 312/518 [04:39<02:56,  1.17it/s, loss=18.2]

cuda:0


 60%|██████    | 313/518 [04:40<03:08,  1.09it/s, loss=18.2]

cuda:0


 61%|██████    | 314/518 [04:41<02:58,  1.14it/s, loss=18.2]

cuda:0


 61%|██████    | 315/518 [04:41<02:57,  1.15it/s, loss=18.2]

cuda:0


 61%|██████    | 316/518 [04:42<02:52,  1.17it/s, loss=18.2]

cuda:0


 61%|██████    | 317/518 [04:43<03:02,  1.10it/s, loss=18.2]

cuda:0


 61%|██████▏   | 318/518 [04:44<02:55,  1.14it/s, loss=18.2]

cuda:0


 62%|██████▏   | 319/518 [04:45<03:00,  1.10it/s, loss=18.2]

cuda:0


 62%|██████▏   | 320/518 [04:46<02:53,  1.14it/s, loss=18.2]

cuda:0


 62%|██████▏   | 321/518 [04:47<03:02,  1.08it/s, loss=18.2]

cuda:0


 62%|██████▏   | 322/518 [04:48<02:57,  1.11it/s, loss=18.2]

cuda:0


 62%|██████▏   | 323/518 [04:49<02:53,  1.13it/s, loss=18.2]

cuda:0


 63%|██████▎   | 324/518 [04:50<02:54,  1.11it/s, loss=18.2]

cuda:0


 63%|██████▎   | 325/518 [04:50<02:49,  1.14it/s, loss=18.2]

cuda:0


 63%|██████▎   | 326/518 [04:51<02:50,  1.13it/s, loss=18.2]

cuda:0


 63%|██████▎   | 327/518 [04:52<02:56,  1.08it/s, loss=18.1]

cuda:0


 63%|██████▎   | 328/518 [04:53<02:59,  1.06it/s, loss=18.1]

cuda:0


 64%|██████▎   | 329/518 [04:55<03:30,  1.11s/it, loss=18.1]

cuda:0


 64%|██████▎   | 330/518 [04:56<03:16,  1.04s/it, loss=18.1]

cuda:0


 64%|██████▍   | 331/518 [04:56<02:59,  1.04it/s, loss=18.1]

cuda:0


 64%|██████▍   | 332/518 [04:57<02:50,  1.09it/s, loss=18.1]

cuda:0


 64%|██████▍   | 333/518 [04:58<02:44,  1.13it/s, loss=18.1]

cuda:0


 64%|██████▍   | 334/518 [04:59<02:37,  1.17it/s, loss=18.1]

cuda:0


 65%|██████▍   | 335/518 [05:00<02:36,  1.17it/s, loss=18.1]

cuda:0


 65%|██████▍   | 336/518 [05:01<02:33,  1.19it/s, loss=18.1]

cuda:0


 65%|██████▌   | 337/518 [05:01<02:37,  1.15it/s, loss=18.1]

cuda:0


 65%|██████▌   | 338/518 [05:02<02:35,  1.16it/s, loss=18.1]

cuda:0


 65%|██████▌   | 339/518 [05:03<02:31,  1.18it/s, loss=18.1]

cuda:0


 66%|██████▌   | 340/518 [05:04<02:28,  1.20it/s, loss=18.1]

cuda:0


 66%|██████▌   | 341/518 [05:05<02:25,  1.21it/s, loss=18.1]

cuda:0


 66%|██████▌   | 342/518 [05:05<02:22,  1.24it/s, loss=18.1]

cuda:0


 66%|██████▌   | 343/518 [05:07<02:34,  1.13it/s, loss=18.1]

cuda:0


 66%|██████▋   | 344/518 [05:07<02:29,  1.17it/s, loss=18.1]

cuda:0


 67%|██████▋   | 345/518 [05:08<02:24,  1.20it/s, loss=18.1]

cuda:0


 67%|██████▋   | 346/518 [05:09<02:22,  1.21it/s, loss=18.1]

cuda:0


 67%|██████▋   | 347/518 [05:10<02:23,  1.19it/s, loss=18.1]

cuda:0


 67%|██████▋   | 348/518 [05:11<02:24,  1.18it/s, loss=18.1]

cuda:0


 67%|██████▋   | 349/518 [05:12<02:28,  1.14it/s, loss=18.1]

cuda:0


 68%|██████▊   | 350/518 [05:12<02:24,  1.16it/s, loss=18.1]

cuda:0


 68%|██████▊   | 351/518 [05:13<02:27,  1.13it/s, loss=18.1]

cuda:0


 68%|██████▊   | 352/518 [05:14<02:26,  1.14it/s, loss=18.1]

cuda:0


 68%|██████▊   | 353/518 [05:15<02:21,  1.16it/s, loss=18.1]

cuda:0


 68%|██████▊   | 354/518 [05:16<02:17,  1.20it/s, loss=18.1]

cuda:0


 69%|██████▊   | 355/518 [05:17<02:14,  1.21it/s, loss=18.1]

cuda:0


 69%|██████▊   | 356/518 [05:17<02:11,  1.23it/s, loss=18.1]

cuda:0


 69%|██████▉   | 357/518 [05:18<02:10,  1.23it/s, loss=18.1]

cuda:0


 69%|██████▉   | 358/518 [05:19<02:11,  1.22it/s, loss=18.1]

cuda:0


 69%|██████▉   | 359/518 [05:20<02:07,  1.24it/s, loss=18.1]

cuda:0


 69%|██████▉   | 360/518 [05:21<02:18,  1.14it/s, loss=18.1]

cuda:0


 70%|██████▉   | 361/518 [05:22<02:16,  1.15it/s, loss=18.1]

cuda:0


 70%|██████▉   | 362/518 [05:23<02:19,  1.12it/s, loss=18.1]

cuda:0


 70%|███████   | 363/518 [05:24<02:15,  1.14it/s, loss=18.1]

cuda:0


 70%|███████   | 364/518 [05:25<02:27,  1.04it/s, loss=18.1]

cuda:0


 70%|███████   | 365/518 [05:26<02:26,  1.04it/s, loss=18.1]

cuda:0


 71%|███████   | 366/518 [05:27<02:49,  1.11s/it, loss=18.1]

cuda:0


 71%|███████   | 367/518 [05:28<02:36,  1.03s/it, loss=18.1]

cuda:0


 71%|███████   | 368/518 [05:29<02:32,  1.01s/it, loss=18.1]

cuda:0


 71%|███████   | 369/518 [05:30<02:21,  1.05it/s, loss=18.1]

cuda:0


 71%|███████▏  | 370/518 [05:31<02:18,  1.07it/s, loss=18.1]

cuda:0


 72%|███████▏  | 371/518 [05:31<02:13,  1.10it/s, loss=18.1]

cuda:0


 72%|███████▏  | 372/518 [05:32<02:09,  1.13it/s, loss=18.1]

cuda:0


 72%|███████▏  | 373/518 [05:33<02:18,  1.05it/s, loss=18]

cuda:0


 72%|███████▏  | 374/518 [05:34<02:11,  1.10it/s, loss=18]

cuda:0


 72%|███████▏  | 375/518 [05:35<02:17,  1.04it/s, loss=18]

cuda:0


 73%|███████▎  | 376/518 [05:36<02:10,  1.09it/s, loss=18]

cuda:0


 73%|███████▎  | 377/518 [05:37<02:13,  1.06it/s, loss=18]

cuda:0


 73%|███████▎  | 378/518 [05:38<02:07,  1.10it/s, loss=18]

cuda:0


 73%|███████▎  | 379/518 [05:39<02:03,  1.12it/s, loss=18]

cuda:0


 73%|███████▎  | 380/518 [05:40<02:00,  1.15it/s, loss=18]

cuda:0


 74%|███████▎  | 381/518 [05:40<01:57,  1.16it/s, loss=18]

cuda:0


 74%|███████▎  | 382/518 [05:41<01:55,  1.18it/s, loss=18]

cuda:0


 74%|███████▍  | 383/518 [05:42<01:55,  1.17it/s, loss=18]

cuda:0


 74%|███████▍  | 384/518 [05:43<01:53,  1.18it/s, loss=18]

cuda:0


 74%|███████▍  | 385/518 [05:44<01:58,  1.12it/s, loss=18]

cuda:0


 75%|███████▍  | 386/518 [05:45<01:54,  1.16it/s, loss=18]

cuda:0


 75%|███████▍  | 387/518 [05:46<01:57,  1.11it/s, loss=18]

cuda:0


 75%|███████▍  | 388/518 [05:47<01:53,  1.15it/s, loss=18]

cuda:0


 75%|███████▌  | 389/518 [05:48<01:57,  1.10it/s, loss=18]

cuda:0


 75%|███████▌  | 390/518 [05:48<01:54,  1.12it/s, loss=18]

cuda:0


 75%|███████▌  | 391/518 [05:49<01:48,  1.17it/s, loss=18]

cuda:0


 76%|███████▌  | 392/518 [05:50<01:47,  1.18it/s, loss=18]

cuda:0


 76%|███████▌  | 393/518 [05:51<01:45,  1.19it/s, loss=18]

cuda:0


 76%|███████▌  | 394/518 [05:52<01:42,  1.21it/s, loss=18]

cuda:0


 76%|███████▋  | 395/518 [05:53<01:44,  1.18it/s, loss=18]

cuda:0


 76%|███████▋  | 396/518 [05:53<01:41,  1.20it/s, loss=18]

cuda:0


 77%|███████▋  | 397/518 [05:54<01:42,  1.18it/s, loss=18]

cuda:0


 77%|███████▋  | 398/518 [05:55<01:42,  1.17it/s, loss=18]

cuda:0


 77%|███████▋  | 399/518 [05:56<01:50,  1.08it/s, loss=18]

cuda:0


 77%|███████▋  | 400/518 [05:57<01:49,  1.08it/s, loss=18]

cuda:0


 77%|███████▋  | 401/518 [05:58<01:59,  1.02s/it, loss=18]

cuda:0


 78%|███████▊  | 402/518 [05:59<01:55,  1.01it/s, loss=18]

cuda:0


 78%|███████▊  | 403/518 [06:00<01:50,  1.04it/s, loss=18]

cuda:0


 78%|███████▊  | 404/518 [06:01<01:46,  1.07it/s, loss=18]

cuda:0


 78%|███████▊  | 405/518 [06:02<01:53,  1.00s/it, loss=18]

cuda:0


 78%|███████▊  | 406/518 [06:03<01:49,  1.02it/s, loss=18]

cuda:0


 79%|███████▊  | 407/518 [06:04<01:53,  1.02s/it, loss=18]

cuda:0


 79%|███████▉  | 408/518 [06:05<01:45,  1.04it/s, loss=18]

cuda:0


 79%|███████▉  | 409/518 [06:06<01:44,  1.04it/s, loss=18]

cuda:0


 79%|███████▉  | 410/518 [06:07<01:39,  1.09it/s, loss=18]

cuda:0


 79%|███████▉  | 411/518 [06:08<01:44,  1.02it/s, loss=18]

cuda:0


 80%|███████▉  | 412/518 [06:09<01:36,  1.09it/s, loss=18]

cuda:0


 80%|███████▉  | 413/518 [06:10<01:37,  1.07it/s, loss=18]

cuda:0


 80%|███████▉  | 414/518 [06:10<01:32,  1.13it/s, loss=17.9]

cuda:0


 80%|████████  | 415/518 [06:11<01:31,  1.12it/s, loss=17.9]

cuda:0


 80%|████████  | 416/518 [06:12<01:27,  1.17it/s, loss=17.9]

cuda:0


 81%|████████  | 417/518 [06:13<01:30,  1.12it/s, loss=17.9]

cuda:0


 81%|████████  | 418/518 [06:14<01:25,  1.17it/s, loss=17.9]

cuda:0


 81%|████████  | 419/518 [06:15<01:23,  1.19it/s, loss=17.9]

cuda:0


 81%|████████  | 420/518 [06:16<01:22,  1.19it/s, loss=17.9]

cuda:0


 81%|████████▏ | 421/518 [06:16<01:19,  1.21it/s, loss=17.9]

cuda:0


 81%|████████▏ | 422/518 [06:17<01:21,  1.18it/s, loss=17.9]

cuda:0


 82%|████████▏ | 423/518 [06:18<01:25,  1.11it/s, loss=17.9]

cuda:0


 82%|████████▏ | 424/518 [06:19<01:22,  1.14it/s, loss=17.9]

cuda:0


 82%|████████▏ | 425/518 [06:20<01:22,  1.12it/s, loss=17.9]

cuda:0


 82%|████████▏ | 426/518 [06:21<01:18,  1.17it/s, loss=17.9]

cuda:0


 82%|████████▏ | 427/518 [06:22<01:20,  1.13it/s, loss=17.9]

cuda:0


 83%|████████▎ | 428/518 [06:23<01:18,  1.14it/s, loss=17.9]

cuda:0


 83%|████████▎ | 429/518 [06:23<01:15,  1.17it/s, loss=17.9]

cuda:0


 83%|████████▎ | 430/518 [06:24<01:13,  1.20it/s, loss=17.9]

cuda:0


 83%|████████▎ | 431/518 [06:25<01:14,  1.17it/s, loss=17.9]

cuda:0


 83%|████████▎ | 432/518 [06:26<01:13,  1.18it/s, loss=17.9]

cuda:0


 84%|████████▎ | 433/518 [06:27<01:11,  1.19it/s, loss=17.9]

cuda:0


 84%|████████▍ | 434/518 [06:28<01:10,  1.20it/s, loss=17.9]

cuda:0


 84%|████████▍ | 435/518 [06:28<01:09,  1.19it/s, loss=17.9]

cuda:0


 84%|████████▍ | 436/518 [06:29<01:10,  1.16it/s, loss=17.9]

cuda:0


 84%|████████▍ | 437/518 [06:30<01:16,  1.06it/s, loss=17.9]

cuda:0


 85%|████████▍ | 438/518 [06:31<01:11,  1.11it/s, loss=17.9]

cuda:0


 85%|████████▍ | 439/518 [06:32<01:13,  1.07it/s, loss=17.9]

cuda:0


 85%|████████▍ | 440/518 [06:33<01:10,  1.10it/s, loss=17.9]

cuda:0


 85%|████████▌ | 441/518 [06:34<01:09,  1.11it/s, loss=17.9]

cuda:0


 85%|████████▌ | 442/518 [06:35<01:07,  1.13it/s, loss=17.9]

cuda:0


 86%|████████▌ | 443/518 [06:36<01:08,  1.10it/s, loss=17.9]

cuda:0


 86%|████████▌ | 444/518 [06:37<01:06,  1.11it/s, loss=17.9]

cuda:0


 86%|████████▌ | 445/518 [06:38<01:05,  1.12it/s, loss=17.9]

cuda:0


 86%|████████▌ | 446/518 [06:38<01:02,  1.15it/s, loss=17.9]

cuda:0


 86%|████████▋ | 447/518 [06:40<01:09,  1.03it/s, loss=17.9]

cuda:0


 86%|████████▋ | 448/518 [06:40<01:04,  1.08it/s, loss=17.9]

cuda:0


 87%|████████▋ | 449/518 [06:41<01:03,  1.08it/s, loss=17.9]

cuda:0


 87%|████████▋ | 450/518 [06:42<00:59,  1.14it/s, loss=17.9]

cuda:0


 87%|████████▋ | 451/518 [06:43<00:59,  1.13it/s, loss=17.9]

cuda:0


 87%|████████▋ | 452/518 [06:44<00:56,  1.18it/s, loss=17.8]

cuda:0


 87%|████████▋ | 453/518 [06:45<00:57,  1.13it/s, loss=17.9]

cuda:0


 88%|████████▊ | 454/518 [06:46<00:54,  1.17it/s, loss=17.8]

cuda:0


 88%|████████▊ | 455/518 [06:46<00:52,  1.20it/s, loss=17.8]

cuda:0


 88%|████████▊ | 456/518 [06:47<00:50,  1.22it/s, loss=17.8]

cuda:0


 88%|████████▊ | 457/518 [06:48<00:50,  1.20it/s, loss=17.8]

cuda:0


 88%|████████▊ | 458/518 [06:49<00:48,  1.23it/s, loss=17.8]

cuda:0


 89%|████████▊ | 459/518 [06:50<00:52,  1.12it/s, loss=17.8]

cuda:0


 89%|████████▉ | 460/518 [06:51<00:50,  1.14it/s, loss=17.8]

cuda:0


 89%|████████▉ | 461/518 [06:52<00:54,  1.05it/s, loss=17.8]

cuda:0


 89%|████████▉ | 462/518 [06:53<00:50,  1.10it/s, loss=17.8]

cuda:0


 89%|████████▉ | 463/518 [06:53<00:49,  1.11it/s, loss=17.8]

cuda:0


 90%|████████▉ | 464/518 [06:54<00:46,  1.16it/s, loss=17.8]

cuda:0


 90%|████████▉ | 465/518 [06:55<00:46,  1.13it/s, loss=17.8]

cuda:0


 90%|████████▉ | 466/518 [06:56<00:44,  1.18it/s, loss=17.8]

cuda:0


 90%|█████████ | 467/518 [06:57<00:45,  1.11it/s, loss=17.8]

cuda:0


 90%|█████████ | 468/518 [06:58<00:43,  1.16it/s, loss=17.8]

cuda:0


 91%|█████████ | 469/518 [06:59<00:42,  1.14it/s, loss=17.8]

cuda:0


 91%|█████████ | 470/518 [06:59<00:40,  1.19it/s, loss=17.8]

cuda:0


 91%|█████████ | 471/518 [07:00<00:42,  1.10it/s, loss=17.8]

cuda:0


 91%|█████████ | 472/518 [07:01<00:41,  1.10it/s, loss=17.8]

cuda:0


 91%|█████████▏| 473/518 [07:03<00:46,  1.04s/it, loss=17.8]

cuda:0


 92%|█████████▏| 474/518 [07:03<00:41,  1.05it/s, loss=17.8]

cuda:0


 92%|█████████▏| 475/518 [07:04<00:41,  1.03it/s, loss=17.8]

cuda:0


 92%|█████████▏| 476/518 [07:05<00:38,  1.10it/s, loss=17.8]

cuda:0


 92%|█████████▏| 477/518 [07:06<00:38,  1.08it/s, loss=17.8]

cuda:0


 92%|█████████▏| 478/518 [07:07<00:35,  1.12it/s, loss=17.8]

cuda:0


 92%|█████████▏| 479/518 [07:08<00:33,  1.17it/s, loss=17.8]

cuda:0


 93%|█████████▎| 480/518 [07:09<00:31,  1.19it/s, loss=17.8]

cuda:0


 93%|█████████▎| 481/518 [07:09<00:30,  1.20it/s, loss=17.8]

cuda:0


 93%|█████████▎| 482/518 [07:10<00:29,  1.21it/s, loss=17.8]

cuda:0


 93%|█████████▎| 483/518 [07:11<00:33,  1.06it/s, loss=17.8]

cuda:0


 93%|█████████▎| 484/518 [07:12<00:31,  1.09it/s, loss=17.8]

cuda:0


 94%|█████████▎| 485/518 [07:14<00:34,  1.05s/it, loss=17.8]

cuda:0


 94%|█████████▍| 486/518 [07:14<00:30,  1.03it/s, loss=17.8]

cuda:0


 94%|█████████▍| 487/518 [07:15<00:28,  1.07it/s, loss=17.8]

cuda:0


 94%|█████████▍| 488/518 [07:16<00:26,  1.13it/s, loss=17.8]

cuda:0


 94%|█████████▍| 489/518 [07:17<00:26,  1.10it/s, loss=17.8]

cuda:0


 95%|█████████▍| 490/518 [07:18<00:24,  1.14it/s, loss=17.8]

cuda:0


 95%|█████████▍| 491/518 [07:19<00:23,  1.13it/s, loss=17.8]

cuda:0


 95%|█████████▍| 492/518 [07:20<00:22,  1.17it/s, loss=17.8]

cuda:0


 95%|█████████▌| 493/518 [07:20<00:21,  1.14it/s, loss=17.8]

cuda:0


 95%|█████████▌| 494/518 [07:21<00:20,  1.17it/s, loss=17.8]

cuda:0


 96%|█████████▌| 495/518 [07:22<00:19,  1.19it/s, loss=17.8]

cuda:0


 96%|█████████▌| 496/518 [07:23<00:18,  1.19it/s, loss=17.8]

cuda:0


 96%|█████████▌| 497/518 [07:24<00:19,  1.08it/s, loss=17.8]

cuda:0


 96%|█████████▌| 498/518 [07:25<00:17,  1.14it/s, loss=17.8]

cuda:0


 96%|█████████▋| 499/518 [07:26<00:16,  1.13it/s, loss=17.8]

cuda:0


 97%|█████████▋| 500/518 [07:27<00:15,  1.16it/s, loss=17.7]

cuda:0


 97%|█████████▋| 501/518 [07:28<00:15,  1.11it/s, loss=17.7]

cuda:0


 97%|█████████▋| 502/518 [07:28<00:13,  1.15it/s, loss=17.7]

cuda:0


 97%|█████████▋| 503/518 [07:29<00:13,  1.14it/s, loss=17.7]

cuda:0


 97%|█████████▋| 504/518 [07:30<00:12,  1.16it/s, loss=17.7]

cuda:0


 97%|█████████▋| 505/518 [07:31<00:10,  1.18it/s, loss=17.7]

cuda:0


 98%|█████████▊| 506/518 [07:32<00:10,  1.19it/s, loss=17.7]

cuda:0


 98%|█████████▊| 507/518 [07:32<00:09,  1.19it/s, loss=17.7]

cuda:0


 98%|█████████▊| 508/518 [07:33<00:08,  1.19it/s, loss=17.7]

cuda:0


 98%|█████████▊| 509/518 [07:35<00:09,  1.03s/it, loss=17.7]

cuda:0


 98%|█████████▊| 510/518 [07:36<00:07,  1.04it/s, loss=17.7]

cuda:0


 99%|█████████▊| 511/518 [07:37<00:06,  1.01it/s, loss=17.7]

cuda:0


 99%|█████████▉| 512/518 [07:38<00:05,  1.06it/s, loss=17.7]

cuda:0


 99%|█████████▉| 513/518 [07:38<00:04,  1.09it/s, loss=17.7]

cuda:0


 99%|█████████▉| 514/518 [07:39<00:03,  1.12it/s, loss=17.7]

cuda:0


 99%|█████████▉| 515/518 [07:40<00:02,  1.18it/s, loss=17.7]

cuda:0


100%|█████████▉| 516/518 [07:41<00:01,  1.22it/s, loss=17.7]

cuda:0


100%|█████████▉| 517/518 [07:41<00:00,  1.26it/s, loss=17.7]

cuda:0


100%|██████████| 518/518 [07:42<00:00,  1.12it/s, loss=17.7]
