In [1]:
import numpy as np
import glob
import pandas as pd
import os
import torch
from tqdm import tqdm
import random
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import torchvision.transforms as transforms
import cv2
import matplotlib.pyplot as plt
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
DATA_DIR = '/run/media/misha/G/mipt-dl/Data'

In [3]:
def get_number_of_classes(the_path):
    train_txts = glob.glob(f"{the_path}/train/*.txt")
    all_labels = set()
    for i, tx in enumerate(train_txts):
        with open(tx, 'r') as f:
            lines = f.readlines()
            labels = [int(x.split(' ')[0]) for x in lines]
            all_labels.update(labels)
    return list(sorted(list(all_labels)))

In [4]:
classes = get_number_of_classes(DATA_DIR)
classes

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [6]:
def colorstr(*input):
    # Colors a string https://en.wikipedia.org/wiki/ANSI_escape_code, i.e.  colorstr('blue', 'hello world')
    *args, string = input if len(input) > 1 else ('blue', 'bold', input[0])  # color arguments, string
    colors = {
        'black': '\033[30m',  # basic colors
        'red': '\033[31m',
        'green': '\033[32m',
        'yellow': '\033[33m',
        'blue': '\033[34m',
        'magenta': '\033[35m',
        'cyan': '\033[36m',
        'white': '\033[37m',
        'bright_black': '\033[90m',  # bright colors
        'bright_red': '\033[91m',
        'bright_green': '\033[92m',
        'bright_yellow': '\033[93m',
        'bright_blue': '\033[94m',
        'bright_magenta': '\033[95m',
        'bright_cyan': '\033[96m',
        'bright_white': '\033[97m',
        'end': '\033[0m',  # misc
        'bold': '\033[1m',
        'underline': '\033[4m'}
    return ''.join(colors[x] for x in args) + f'{string}' + colors['end']
    

PREFIX = colorstr('AutoAnchor: ')

def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
    # Resize and pad image while meeting stride-multiple constraints
    shape = im.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scaleup:  # only scale down, do not scale up (for better val mAP)
        r = min(r, 1.0)

    # Compute padding
    ratio = r, r  # width, height ratios
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
    if auto:  # minimum rectangle
        dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
    elif scaleFill:  # stretch
        dw, dh = 0.0, 0.0
        new_unpad = (new_shape[1], new_shape[0])
        ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios

    dw /= 2  # divide padding into 2 sides
    dh /= 2

    if shape[::-1] != new_unpad:  # resize
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return im, ratio, (dw, dh)

def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0):
    # Convert nx4 boxes from [x, y, w, h] normalized to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[..., 0] = w * (x[..., 0] - x[..., 2] / 2) + padw  # top left x
    y[..., 1] = h * (x[..., 1] - x[..., 3] / 2) + padh  # top left y
    y[..., 2] = w * (x[..., 0] + x[..., 2] / 2) + padw  # bottom right x
    y[..., 3] = h * (x[..., 1] + x[..., 3] / 2) + padh  # bottom right y
    return y

def xyxy2xywhn(x, w=640, h=640, clip=False, eps=0.0):
    # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] normalized where xy1=top-left, xy2=bottom-right
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[..., 0] = ((x[..., 0] + x[..., 2]) / 2) / w  # x center
    y[..., 1] = ((x[..., 1] + x[..., 3]) / 2) / h  # y center
    y[..., 2] = (x[..., 2] - x[..., 0]) / w  # width
    y[..., 3] = (x[..., 3] - x[..., 1]) / h  # height
    return y

def check_anchor_order(m):
    # Check anchor order against stride order for YOLOv5 Detect() module m, and correct if necessary
    a = m.anchors.prod(-1).mean(-1).view(-1)  # mean anchor area per output layer
    da = a[-1] - a[0]  # delta a
    ds = m.stride[-1] - m.stride[0]  # delta s
    if da and (da.sign() != ds.sign()):  # same order
        print(f'{PREFIX}Reversing anchor order')
        m.anchors[:] = m.anchors.flip(0)



def check_anchors(dataset, model, thr=4.0, imgsz=640):
    # Check anchor fit to data, recompute if necessary
    m = model.module.model[-1] if hasattr(model, 'module') else model.model[-1]  # Detect()
    shapes = imgsz * dataset.shapes / dataset.shapes.max(1, keepdims=True)
    scale = np.random.uniform(0.9, 1.1, size=(shapes.shape[0], 1))  # augment scale
    wh = torch.tensor(np.concatenate([l[:, 3:5] * s for s, l in zip(shapes * scale, dataset.labels)])).float()  # wh

    def metric(k):  # compute metric
        r = wh[:, None] / k[None]
        x = torch.min(r, 1 / r).min(2)[0]  # ratio metric
        best = x.max(1)[0]  # best_x
        aat = (x > 1 / thr).float().sum(1).mean()  # anchors above threshold
        bpr = (best > 1 / thr).float().mean()  # best possible recall
        return bpr, aat

    stride = m.stride.to(m.anchors.device).view(-1, 1, 1)  # model strides
    anchors = m.anchors.clone() * stride  # current anchors
    bpr, aat = metric(anchors.cpu().view(-1, 2))
    s = f'\n{PREFIX}{aat:.2f} anchors/target, {bpr:.3f} Best Possible Recall (BPR). '
    if bpr > 0.98:  # threshold to recompute
        print(f'{s}Current anchors are a good fit to dataset ✅')
    else:
        print(f'{s}Anchors are a poor fit to dataset ⚠️, attempting to improve...')
        na = m.anchors.numel() // 2  # number of anchors
        anchors = kmean_anchors(dataset, n=na, img_size=imgsz, thr=thr, gen=1000, verbose=False)
        new_bpr = metric(anchors)[0]
        if new_bpr > bpr:  # replace anchors
            anchors = torch.tensor(anchors, device=m.anchors.device).type_as(m.anchors)
            m.anchors[:] = anchors.clone().view_as(m.anchors)
            check_anchor_order(m)  # must be in pixel-space (not grid-space)
            m.anchors /= stride
            s = f'{PREFIX}Done ✅ (optional: update model *.yaml to use these anchors in the future)'
        else:
            s = f'{PREFIX}Done ⚠️ (original anchors better than new anchors, proceeding with original anchors)'
        print(s)


def kmean_anchors(dataset='./data/coco128.yaml', n=9, img_size=640, thr=4.0, gen=1000, verbose=True):
    """ Creates kmeans-evolved anchors from training dataset

        Arguments:
            dataset: path to data.yaml, or a loaded dataset
            n: number of anchors
            img_size: image size used for training
            thr: anchor-label wh ratio threshold hyperparameter hyp['anchor_t'] used for training, default=4.0
            gen: generations to evolve anchors using genetic algorithm
            verbose: print all results

        Return:
            k: kmeans evolved anchors

        Usage:
            from utils.autoanchor import *; _ = kmean_anchors()
    """
    from scipy.cluster.vq import kmeans

    npr = np.random
    thr = 1 / thr
    
    def metric(k, wh):  # compute metrics
        r = wh[:, None] / k[None]
        x = torch.min(r, 1 / r).min(2)[0]  # ratio metric
        # x = wh_iou(wh, torch.tensor(k))  # iou metric
        return x, x.max(1)[0]  # x, best_x

    def anchor_fitness(k):  # mutation fitness
        _, best = metric(torch.tensor(k, dtype=torch.float32), wh)
        return (best * (best > thr).float()).mean()  # fitness

    def print_results(k, verbose=True):
        k = k[np.argsort(k.prod(1))]  # sort small to large
        x, best = metric(k, wh0)
        bpr, aat = (best > thr).float().mean(), (x > thr).float().mean() * n  # best possible recall, anch > thr
        s = f'{PREFIX}thr={thr:.2f}: {bpr:.4f} best possible recall, {aat:.2f} anchors past thr\n' \
            f'{PREFIX}n={n}, img_size={img_size}, metric_all={x.mean():.3f}/{best.mean():.3f}-mean/best, ' \
            f'past_thr={x[x > thr].mean():.3f}-mean: '
        for x in k:
            s += '%i,%i, ' % (round(x[0]), round(x[1]))
        if verbose:
            print(s[:-2])
        return k

    # Get label wh
    shapes = img_size * dataset.shapes / dataset.shapes.max(1, keepdims=True)
    wh0 = np.concatenate([l[:, 3:5] * s for s, l in zip(shapes, dataset.labels)])  # wh

    # Filter
    i = (wh0 < 3.0).any(1).sum()
    if i:
        print(f'{PREFIX}WARNING ⚠️ Extremely small objects found: {i} of {len(wh0)} labels are <3 pixels in size')
    wh = wh0[(wh0 >= 2.0).any(1)].astype(np.float32)  # filter > 2 pixels
    # wh = wh * (npr.rand(wh.shape[0], 1) * 0.9 + 0.1)  # multiply by random scale 0-1

    # Kmeans init
    try:
        print(f'{PREFIX}Running kmeans for {n} anchors on {len(wh)} points...')
        assert n <= len(wh)  # apply overdetermined constraint
        s = wh.std(0)  # sigmas for whitening
        k = kmeans(wh / s, n, iter=30)[0] * s  # points
        assert n == len(k)  # kmeans may return fewer points than requested if wh is insufficient or too similar
    except Exception:
        print(f'{PREFIX}WARNING ⚠️ switching strategies from kmeans to random init')
        k = np.sort(npr.rand(n * 2)).reshape(n, 2) * img_size  # random init
    wh, wh0 = (torch.tensor(x, dtype=torch.float32) for x in (wh, wh0))
    k = print_results(k, verbose=False)

    # Plot
    # k, d = [None] * 20, [None] * 20
    # for i in tqdm(range(1, 21)):
    #     k[i-1], d[i-1] = kmeans(wh / s, i)  # points, mean distance
    # fig, ax = plt.subplots(1, 2, figsize=(14, 7), tight_layout=True)
    # ax = ax.ravel()
    # ax[0].plot(np.arange(1, 21), np.array(d) ** 2, marker='.')
    # fig, ax = plt.subplots(1, 2, figsize=(14, 7))  # plot wh
    # ax[0].hist(wh[wh[:, 0]<100, 0],400)
    # ax[1].hist(wh[wh[:, 1]<100, 1],400)
    # fig.savefig('wh.png', dpi=200)
    TQDM_BAR_FORMAT = '{l_bar}{bar:10}{r_bar}'  # tqdm bar format
    # Evolve
    f, sh, mp, s = anchor_fitness(k), k.shape, 0.9, 0.1  # fitness, generations, mutation prob, sigma
    pbar = tqdm(range(gen), bar_format=TQDM_BAR_FORMAT)  # progress bar
    for _ in pbar:
        v = np.ones(sh)
        while (v == 1).all():  # mutate until a change occurs (prevent duplicates)
            v = ((npr.random(sh) < mp) * random.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)
        kg = (k.copy() * v).clip(min=2.0)
        fg = anchor_fitness(kg)
        if fg > f:
            f, k = fg, kg.copy()
            pbar.desc = f'{PREFIX}Evolving anchors with Genetic Algorithm: fitness = {f:.4f}'
            if verbose:
                print_results(k, verbose)

    return print_results(k).astype(np.float32)



In [7]:
class StreetDataset(Dataset):
    def __init__(self, the_path, transformations):
        super().__init__()
        self.the_path = the_path
        self.transformations = transformations
        self._post_init()

    def _post_init(self) -> None:
        self.all_labels = glob.glob(f"{self.the_path}/*.txt")
        self.all_imgs = glob.glob(f"{self.the_path}/*.jpg")
        if len(self.all_labels) != len(self.all_imgs):
            raise ValueError("The amount of y and amount of X are not the same")
        self.shapes = np.ones((self.__len__(), 2))
        self.shapes *= 640
        lls = []
        for label_path in self.all_labels:
            bboxes = np.array(self._parse_labels(label_path))
            lls.append(bboxes)
        self.labels = np.array(lls)

    def _parse_labels(self, label_path):
        with open(label_path, 'r') as f:
            lines = f.readlines()
        nl = len(lines)
        # labels = [int(x.split(' ')[0]) for x in lines]
        # labels = torch.as_tensor(labels, dtype=torch.int16)
        bboxes = [x.split(' ')[:] for x in lines]
        bboxes = [[float(x.strip()) for x in row] for row in bboxes]
        bboxes = np.array(bboxes, dtype=np.float32)
        labels_out = torch.zeros((nl, 6))
        labels_out[:, 1:] = torch.from_numpy(bboxes)

        # targets = {'boxes': bboxes, 'labels': labels}
        # print(targets)
        # targets = [{k: torch.tensor(v) for k, v in t.items()} for t in targets]

        return labels_out

    def __getitem__(self, index):
        bboxes = self._parse_labels(self.all_labels[index])
        img_path = self.all_imgs[index]
        # img = Image.open(img_path)
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)        
        # print(img.shape)
        # cv2.imwrite('image_cv.jpg', np.asarray(img))
        # Image.fromarray(img).save('image_pil.jpg')
        # tot = transforms.ToTensor()
        # img = tot(img)
        x = self.transformations(img)
        return x, bboxes

    def __len__(self) -> int:
        return len(self.all_labels)

In [8]:
def get_transforms():
    train_transforms = transforms.Compose([transforms.ToTensor(), transforms.Resize((640, 640)), transforms.RandomHorizontalFlip()])
    val_transforms = transforms.Compose([transforms.ToTensor(), transforms.Resize((640, 640))])
    return train_transforms, val_transforms

In [9]:
train_transforms, val_transforms = get_transforms()

In [10]:
train_ds = StreetDataset(os.path.join(DATA_DIR, 'train'), train_transforms)
val_ds = StreetDataset(os.path.join(DATA_DIR, 'val'), val_transforms)

  self.labels = np.array(lls)


In [11]:
targets = train_ds[0][1]
targets

tensor([[0.0000, 2.0000, 0.4539, 0.4409, 0.0168, 0.0199],
        [0.0000, 2.0000, 0.4329, 0.4434, 0.0168, 0.0249],
        [0.0000, 2.0000, 0.4140, 0.4434, 0.0182, 0.0249],
        [0.0000, 2.0000, 0.4939, 0.4297, 0.0126, 0.0174],
        [0.0000, 2.0000, 0.5009, 0.4421, 0.0126, 0.0174],
        [0.0000, 2.0000, 0.1562, 0.4402, 0.0434, 0.0324],
        [0.0000, 2.0000, 0.3285, 0.4402, 0.0238, 0.0174],
        [0.0000, 7.0000, 0.4666, 0.4228, 0.0084, 0.0224],
        [0.0000, 7.0000, 0.5674, 0.2983, 0.0252, 0.0174],
        [0.0000, 7.0000, 0.5422, 0.3219, 0.0813, 0.0399],
        [0.0000, 7.0000, 0.5625, 0.4029, 0.0098, 0.0224],
        [0.0000, 7.0000, 0.6480, 0.3854, 0.0210, 0.0473],
        [0.0000, 7.0000, 0.7054, 0.3794, 0.0462, 0.0598],
        [0.0000, 2.0000, 0.2305, 0.4458, 0.0406, 0.0149],
        [0.0000, 2.0000, 0.1856, 0.4465, 0.0098, 0.0199],
        [0.0000, 6.0000, 0.5520, 0.3865, 0.0112, 0.0125]])

In [12]:
import random
import torch.nn.functional as F

def custom_collate_fn(batch):
    images = []
    bboxes = []

    for b in batch:
        images.append(b[0])
        bboxes.append(b[1])

    images = torch.stack(images, dim=0)
    return images, bboxes


def collate_fn4(batch):
    im, label, path, shapes = zip(*batch)  # transposed
    n = len(shapes) // 4
    im4, label4, path4, shapes4 = [], [], path[:n], shapes[:n]

    ho = torch.tensor([[0.0, 0, 0, 1, 0, 0]])
    wo = torch.tensor([[0.0, 0, 1, 0, 0, 0]])
    s = torch.tensor([[1, 1, 0.5, 0.5, 0.5, 0.5]])  # scale
    for i in range(n):  # zidane torch.zeros(16,3,720,1280)  # BCHW
        i *= 4
        if random.random() < 0.5:
            im1 = F.interpolate(im[i].unsqueeze(0).float(), scale_factor=2.0, mode='bilinear',
                                align_corners=False)[0].type(im[i].type())
            lb = label[i]
        else:
            im1 = torch.cat((torch.cat((im[i], im[i + 1]), 1), torch.cat((im[i + 2], im[i + 3]), 1)), 2)
            lb = torch.cat((label[i], label[i + 1] + ho, label[i + 2] + wo, label[i + 3] + ho + wo), 0) * s
        im4.append(im1)
        label4.append(lb)

    for i, lb in enumerate(label4):
        lb[:, 0] = i  # add target image index for build_targets()

    return torch.stack(im4, 0), torch.cat(label4, 0), path4, shapes4

In [13]:
train_loader = DataLoader(train_ds, shuffle=True, batch_size=1, num_workers=4, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=1, num_workers=4, collate_fn=custom_collate_fn)

In [14]:
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True, autoshape=False, classes=len(classes))

Using cache found in /home/misha/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2023-1-15 Python-3.10.8 torch-1.13.0+cu117 CUDA:0 (NVIDIA GeForce RTX 2080, 7941MiB)

Overriding model.yaml nc=80 with nc=10

                 from  n    params  module                                  arguments                     
  0                -1  1      3520  models.common.Conv                      [3, 32, 6, 2, 2]              
  1                -1  1     18560  models.common.Conv                      [32, 64, 3, 2]                
  2                -1  1     18816  models.common.C3                        [64, 64, 1]                   
  3                -1  1     73984  models.common.Conv                      [64, 128, 3, 2]               
  4                -1  2    115712  models.common.C3                        [128, 128, 2]                 
  5                -1  1    295424  models.common.Conv                      [128, 256, 3, 2]              
  6                -1  3    625152  mode

In [15]:
torch.save(model.state_dict(), 'yolov5s_untrained.pt')

In [16]:
m = model.module.model[-1] if hasattr(model, 'module') else model.model[-1]  # Detect()
print(m.anchors)
check_anchors(val_ds, model)
print(m.anchors)

tensor([[[ 1.25000,  1.62500],
         [ 2.00000,  3.75000],
         [ 4.12500,  2.87500]],

        [[ 1.87500,  3.81250],
         [ 3.87500,  2.81250],
         [ 3.68750,  7.43750]],

        [[ 3.62500,  2.81250],
         [ 4.87500,  6.18750],
         [11.65625, 10.18750]]], device='cuda:0')

[34m[1mAutoAnchor: [0m0.79 anchors/target, 0.430 Best Possible Recall (BPR). Anchors are a poor fit to dataset ⚠️, attempting to improve...
[34m[1mAutoAnchor: [0mRunning kmeans for 9 anchors on 37075 points...


[34m[1mAutoAnchor: [0mEvolving anchors with Genetic Algorithm: fitness = 0.7948: 100%|██████████| 1000/1000 [00:01<00:00, 916.10it/s]

[34m[1mAutoAnchor: [0mthr=0.25: 0.9933 best possible recall, 5.96 anchors past thr
[34m[1mAutoAnchor: [0mn=9, img_size=640, metric_all=0.398/0.796-mean/best, past_thr=0.531-mean: 286,5, 202,8, 311,9, 158,17, 292,14, 311,24, 308,40, 335,68, 343,139
[34m[1mAutoAnchor: [0mDone ✅ (optional: update model *.yaml to use these anchors in the future)
tensor([[[35.74540,  0.63267],
         [25.28660,  1.03843],
         [38.91613,  1.07827]],

        [[ 9.87360,  1.07002],
         [18.23471,  0.90380],
         [19.41502,  1.47753]],

        [[ 9.62854,  1.24542],
         [10.46634,  2.13949],
         [10.71779,  4.35385]]], device='cuda:0')





In [17]:
class ObjectDetector(nn.Module):
	def __init__(self, base_model, num_classes):
		super(ObjectDetector, self).__init__()
		# initialize the base model and the number of classes
		self.base_model = base_model
		self.num_classes = num_classes

		self.regressor = nn.Sequential(
			nn.Linear(base_model.fc.in_features, 128),
			nn.ReLU(),
			nn.Linear(128, 64),
			nn.ReLU(),
			nn.Linear(64, 32),
			nn.ReLU(),
			nn.Linear(32, 4),
			nn.Sigmoid()
		)
		self.classifier = nn.Sequential(
			nn.Linear(base_model.fc.in_features, 512),
			nn.ReLU(),
			nn.Dropout(),
			nn.Linear(512, 512),
			nn.ReLU(),
			nn.Dropout(),
			nn.Linear(512, self.num_classes)
		)
		self.base_model.fc = nn.Identity()


	def forward(self, x):
		# pass the inputs through the base model and then obtain
		# predictions from two different branches of the network
		features = self.base_model(x)
		bboxes = self.regressor(features)
		class_logits = self.classifier(features)
		return (bboxes, class_logits)

In [18]:
# from torchvision import models
# resnet = models.resnet18(pretrained=True)
# model = ObjectDetector(resnet, 1)
# model = model.to(device)

In [19]:
from loss import ComputeLoss
compute_loss = ComputeLoss(model)
losses = []
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
n = 0
for epoch in range(1):
    for X, targets in train_loader:
        n+=1
        if n>=100:
            break
        model.zero_grad()
        X = X.to(device)
        outputs = model(X)
        loss, loss_items = compute_loss(outputs, targets[0].to(device))
        losses.append(loss.item())
        print(loss)
        loss.backward()
        optimizer.step()
        # print(outputs[0].shape)
        # print(outputs[1].shape)
        # print(outputs[2].shape)
        # predicted_bboxes, predicted_labels, predicted_smthing = outputs[0], outputs[1], outputs[2]
        # print(predicted_bboxes.shape)
        # print(labels[0].shape)
        # print(bboxes[0].shape, predicted_bboxes.shape)
        # print(predicted_labels[1].shape)
        # for bbox in bboxes:
            # print(bbox.shape)
        # for i in range(len(bboxes)):

        # print(bboxes.shape)
        # print(predicted_labels.shape)

        # print(predicted_bboxes.shape, len(bboxes))
        
        # for i in range(len(labels)):
        #     label = labels[i].to(device)
        #     predicted_label = predicted_labels[i].to(device)
        #     print(label.shape)
        #     print(predicted_label.shape)
        #     print(predicted_labels)
        #     classification_loss(predicted_label, label)
        # print(outputs)
        # a = 5
        # break

tensor([4.19515], device='cuda:0', grad_fn=<MulBackward0>)
tensor([2.36874], device='cuda:0', grad_fn=<MulBackward0>)
tensor([3.78387], device='cuda:0', grad_fn=<MulBackward0>)
tensor([2.46075], device='cuda:0', grad_fn=<MulBackward0>)
tensor([3.77365], device='cuda:0', grad_fn=<MulBackward0>)
tensor([3.62432], device='cuda:0', grad_fn=<MulBackward0>)
tensor([2.72543], device='cuda:0', grad_fn=<MulBackward0>)
tensor([2.40383], device='cuda:0', grad_fn=<MulBackward0>)
tensor([3.54688], device='cuda:0', grad_fn=<MulBackward0>)
tensor([3.51231], device='cuda:0', grad_fn=<MulBackward0>)
tensor([2.24263], device='cuda:0', grad_fn=<MulBackward0>)
tensor([3.36809], device='cuda:0', grad_fn=<MulBackward0>)
tensor([3.18413], device='cuda:0', grad_fn=<MulBackward0>)
tensor([3.30956], device='cuda:0', grad_fn=<MulBackward0>)
tensor([2.09331], device='cuda:0', grad_fn=<MulBackward0>)
tensor([3.53503], device='cuda:0', grad_fn=<MulBackward0>)
tensor([3.97658], device='cuda:0', grad_fn=<MulBackward0

In [20]:
import matplotlib.pyplot as plt

In [32]:
cv_img = cv2.imread(train_ds.all_imgs[1])
inf_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', autoshape=False, classes=len(classes))

Using cache found in /home/misha/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2023-1-15 Python-3.10.8 torch-1.13.0+cu117 CUDA:0 (NVIDIA GeForce RTX 2080, 7941MiB)

Overriding model.yaml nc=80 with nc=10

                 from  n    params  module                                  arguments                     
  0                -1  1      3520  models.common.Conv                      [3, 32, 6, 2, 2]              
  1                -1  1     18560  models.common.Conv                      [32, 64, 3, 2]                
  2                -1  1     18816  models.common.C3                        [64, 64, 1]                   
  3                -1  1     73984  models.common.Conv                      [64, 128, 3, 2]               
  4                -1  2    115712  models.common.C3                        [128, 128, 2]                 
  5                -1  1    295424  models.common.Conv                      [128, 256, 3, 2]              
  6                -1  3    625152  mode

In [33]:
inf_model.load_state_dict(torch.load('yolov5s_untrained.pt'))

<All keys matched successfully>

In [36]:
inf_model(train_ds[0][0].unsqueeze(0).to(device))

[tensor([[[[[-1.59492e+00, -1.04936e+00,  1.01636e+00,  ..., -3.70942e+00, -1.95839e+00, -2.00335e+00],
            [-1.29904e+00, -8.57357e-01,  1.37752e+00,  ..., -3.15033e+00, -2.53233e+00, -1.92315e+00],
            [-1.31988e+00,  3.50413e-01,  1.17287e+00,  ..., -4.09144e+00, -2.28672e+00, -1.12478e+00],
            ...,
            [-1.16982e+00, -1.28806e+00,  1.47513e+00,  ..., -3.95985e+00, -2.23873e+00, -2.78596e+00],
            [-9.57968e-02, -2.23272e+00,  1.99048e+00,  ..., -3.70358e+00, -2.94877e+00, -3.19296e+00],
            [ 6.82479e-01, -2.32830e+00,  1.60807e+00,  ..., -2.80107e+00, -2.69234e+00, -3.64964e+00]],
 
           [[-1.49507e+00,  1.17318e-01,  7.88828e-01,  ..., -4.59834e+00, -2.99275e+00, -1.28546e+00],
            [-7.39582e-01, -3.56187e-01,  1.00553e+00,  ..., -3.80096e+00, -3.27851e+00, -2.17852e+00],
            [-9.78547e-01,  1.14395e+00,  5.55822e-01,  ..., -4.57600e+00, -3.03635e+00, -1.32640e+00],
            ...,
            [-7.09222e-01, 

In [42]:
from common import DetectMultiBackend

In [44]:
dddd = torch.load(weights)

In [48]:
'stride' in dddd

False

In [41]:

save_img = True
is_file = False
is_url = False
webcam = False
screenshot = False

# Directories
save_dir = './save_dir'
weights = 'yolov5s_untrained.pt'
os.makedirs(save_dir, exist_ok=True)

# Load model
model = DetectMultiBackend(weights, device=device)
stride, names, pt = model.stride, model.names, model.pt


KeyError: 'model'

In [None]:
imgsz = check_img_size(imgsz, s=stride)  # check image size
# Dataloader
bs = 1  # batch_size
if webcam:
    view_img = check_imshow(warn=True)
    dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt, vid_stride=vid_stride)
    bs = len(dataset)
elif screenshot:
    dataset = LoadScreenshots(source, img_size=imgsz, stride=stride, auto=pt)
else:
    dataset = LoadImages(source, img_size=imgsz, stride=stride, auto=pt, vid_stride=vid_stride)
vid_path, vid_writer = [None] * bs, [None] * bs

# Run inference
model.warmup(imgsz=(1 if pt or model.triton else bs, 3, *imgsz))  # warmup
seen, windows, dt = 0, [], (Profile(), Profile(), Profile())
for path, im, im0s, vid_cap, s in dataset:
    with dt[0]:
        im = torch.from_numpy(im).to(model.device)
        im = im.half() if model.fp16 else im.float()  # uint8 to fp16/32
        im /= 255  # 0 - 255 to 0.0 - 1.0
        if len(im.shape) == 3:
            im = im[None]  # expand for batch dim

    # Inference
    with dt[1]:
        visualize = increment_path(save_dir / Path(path).stem, mkdir=True) if visualize else False
        pred = model(im, augment=augment, visualize=visualize)

    # NMS
    with dt[2]:
        pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)

    # Second-stage classifier (optional)
    # pred = utils.general.apply_classifier(pred, classifier_model, im, im0s)

    # Process predictions
    for i, det in enumerate(pred):  # per image
        seen += 1
        if webcam:  # batch_size >= 1
            p, im0, frame = path[i], im0s[i].copy(), dataset.count
            s += f'{i}: '
        else:
            p, im0, frame = path, im0s.copy(), getattr(dataset, 'frame', 0)

        p = Path(p)  # to Path
        save_path = str(save_dir / p.name)  # im.jpg
        txt_path = str(save_dir / 'labels' / p.stem) + ('' if dataset.mode == 'image' else f'_{frame}')  # im.txt
        s += '%gx%g ' % im.shape[2:]  # print string
        gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
        imc = im0.copy() if save_crop else im0  # for save_crop
        annotator = Annotator(im0, line_width=line_thickness, example=str(names))
        if len(det):
            # Rescale boxes from img_size to im0 size
            det[:, :4] = scale_boxes(im.shape[2:], det[:, :4], im0.shape).round()

            # Print results
            for c in det[:, 5].unique():
                n = (det[:, 5] == c).sum()  # detections per class
                s += f"{n} {names[int(c)]}{'s' * (n > 1)}, "  # add to string

            # Write results
            for *xyxy, conf, cls in reversed(det):
                if save_txt:  # Write to file
                    xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
                    line = (cls, *xywh, conf) if save_conf else (cls, *xywh)  # label format
                    with open(f'{txt_path}.txt', 'a') as f:
                        f.write(('%g ' * len(line)).rstrip() % line + '\n')

                if save_img or save_crop or view_img:  # Add bbox to image
                    c = int(cls)  # integer class
                    label = None if hide_labels else (names[c] if hide_conf else f'{names[c]} {conf:.2f}')
                    annotator.box_label(xyxy, label, color=colors(c, True))
                if save_crop:
                    save_one_box(xyxy, imc, file=save_dir / 'crops' / names[c] / f'{p.stem}.jpg', BGR=True)

        # Stream results
        im0 = annotator.result()
        if view_img:
            if platform.system() == 'Linux' and p not in windows:
                windows.append(p)
                cv2.namedWindow(str(p), cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)  # allow window resize (Linux)
                cv2.resizeWindow(str(p), im0.shape[1], im0.shape[0])
            cv2.imshow(str(p), im0)
            cv2.waitKey(1)  # 1 millisecond

        # Save results (image with detections)
        if save_img:
            if dataset.mode == 'image':
                cv2.imwrite(save_path, im0)
            else:  # 'video' or 'stream'
                if vid_path[i] != save_path:  # new video
                    vid_path[i] = save_path
                    if isinstance(vid_writer[i], cv2.VideoWriter):
                        vid_writer[i].release()  # release previous video writer
                    if vid_cap:  # video
                        fps = vid_cap.get(cv2.CAP_PROP_FPS)
                        w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                        h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                    else:  # stream
                        fps, w, h = 30, im0.shape[1], im0.shape[0]
                    save_path = str(Path(save_path).with_suffix('.mp4'))  # force *.mp4 suffix on results videos
                    vid_writer[i] = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
                vid_writer[i].write(im0)


In [35]:
first_tensor = train_ds[0][0]
first_tensor = first_tensor.unsqueeze(0)
first_tensor = first_tensor.to(device)
first_output = model(first_tensor)

In [38]:
len(first_output)

3

In [43]:
first_output[0].shape

torch.Size([1, 3, 80, 80, 15])

In [41]:
np.mean(losses[0:100])

2.442574153789319

In [48]:
np.mean(losses[12000:12100])

1.8756494843959808

In [19]:
losses

[3.724456787109375,
 3.9800469875335693,
 3.890322208404541,
 2.520813226699829,
 3.9133477210998535,
 3.8348779678344727,
 3.6501495838165283,
 3.783374071121216,
 0.019466619938611984,
 0.01832880824804306,
 3.9570531845092773,
 3.6686127185821533,
 3.216564893722534,
 3.660982131958008,
 3.1975789070129395,
 3.255096912384033,
 2.3065571784973145,
 1.9826027154922485,
 2.9358198642730713,
 3.473113775253296,
 2.0954573154449463,
 1.8336900472640991,
 3.3758974075317383,
 3.2132506370544434,
 3.5856399536132812,
 1.9642516374588013,
 3.7702932357788086,
 2.1478891372680664,
 2.4491376876831055,
 3.943010091781616,
 2.756061315536499,
 2.7761402130126953,
 2.437709331512451,
 2.8360986709594727,
 0.009144200943410397,
 2.658423900604248,
 1.428532600402832,
 2.3744349479675293,
 1.0361278057098389,
 2.5058655738830566,
 1.6161329746246338,
 2.889284610748291,
 2.3266687393188477,
 1.4848957061767578,
 2.3071250915527344,
 2.682664155960083,
 2.520766496658325,
 1.5415633916854858,
 2.