In [1]:
import numpy as np
import glob
import pandas as pd
import os
import torch
from tqdm import tqdm
import random
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import torchvision.transforms as transforms
import cv2
import matplotlib.pyplot as plt
from ssd_utils import generate_dboxes
from ssd_transform import SSDTransformer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# torch.multiprocessing.set_start_method('spawn')

In [2]:
# device = 'cpu'

In [3]:
DATA_DIR = '/run/media/misha/G/mipt-dl/Data'

In [4]:
def get_number_of_classes(the_path):
    train_txts = glob.glob(f"{the_path}/train/*.txt")
    all_labels = set()
    for i, tx in enumerate(train_txts):
        with open(tx, 'r') as f:
            lines = f.readlines()
            labels = [int(x.split(' ')[0]) for x in lines]
            all_labels.update(labels)
    return list(sorted(list(all_labels)))

In [5]:
classes = get_number_of_classes(DATA_DIR)
classes

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [6]:
num_classes = len(classes)

In [7]:
from PIL import Image
from torchvision.ops import box_convert
class StreetDataset(Dataset):
    def __init__(self, the_path, transformations):
        super().__init__()
        self.the_path = the_path
        self.transformations = transformations
        self._post_init()

    def _post_init(self) -> None:
        self.all_labels = glob.glob(f"{self.the_path}/*.txt")
        self.all_imgs = glob.glob(f"{self.the_path}/*.jpg")
        if len(self.all_labels) != len(self.all_imgs):
            raise ValueError("The amount of y and amount of X are not the same")

    def _parse_labels(self, label_path, width, height):
        with open(label_path, 'r') as f:
            lines = f.readlines()
        nl = len(lines)
        labels = [int(x.split(' ')[0]) for x in lines]
        labels = torch.as_tensor(labels, dtype=torch.long)
        bboxes = [x.split(' ')[1:] for x in lines]
        bboxes = [[float(x.strip()) for x in row] for row in bboxes]
        bboxes = torch.as_tensor(bboxes, dtype=torch.float32)

        bboxes[..., 0] *= width
        bboxes[..., 1] *= height

        bboxes[..., 2] *= width
        bboxes[..., 3] *= height
        bboxes = box_convert(bboxes, 'cxcywh', 'xyxy')

        return bboxes, labels

    def __getitem__(self, index):
        img_path = self.all_imgs[index]
        img = Image.open(img_path)
        width, height = img.size
        img = img.resize((300, 300))
        bboxes, labels = self._parse_labels(self.all_labels[index], 300, 300)
        # img = cv2.imread(img_path)
        # img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        x, (height, width), boxes, labels = self.transformations(img, (300, 300), bboxes, labels)
        return x, (300, 300), boxes, labels

    def __len__(self) -> int:
        return len(self.all_labels)

In [8]:
def get_yolo_transforms():
    train_transforms = transforms.Compose([transforms.ToTensor(), transforms.Resize((640, 640)), transforms.RandomHorizontalFlip()])
    val_transforms = transforms.Compose([transforms.ToTensor(), transforms.Resize((640, 640))])
    return train_transforms, val_transforms
    
def get_ssd_transforms(dboxes):
    train_transforms =  SSDTransformer(dboxes, (300, 300), val=False)
    val_transforms =  SSDTransformer(dboxes, (300, 300), val=True)
    return train_transforms, val_transforms

In [9]:
dboxes = generate_dboxes(model="ssd")
train_transforms, val_transforms = get_ssd_transforms(dboxes)

In [10]:
train_ds = StreetDataset(os.path.join(DATA_DIR, 'train'), train_transforms)
val_ds = StreetDataset(os.path.join(DATA_DIR, 'val'), val_transforms)

In [11]:
targets = train_ds[0][2]
targets

tensor([[1.6561e+02, 1.1595e+02, 3.3625e+00, 3.7362e+00],
        [4.0000e-02, 1.3333e-02, 7.0000e-02, 7.0000e-02],
        [6.6667e-02, 1.3333e-02, 7.0000e-02, 7.0000e-02],
        ...,
        [5.0000e-01, 5.0000e-01, 9.5577e-01, 9.5577e-01],
        [5.0000e-01, 5.0000e-01, 1.0000e+00, 6.1518e-01],
        [5.0000e-01, 5.0000e-01, 6.1518e-01, 1.0000e+00]])

In [12]:
import random
import torch.nn.functional as F
from torch.utils.data.dataloader import default_collate
import os

def ssd_collate_fn(batch):
    items = list(zip(*batch))
    items[0] = default_collate([i for i in items[0] if torch.is_tensor(i)])
    items[1] = list([i for i in items[1] if i])
    items[2] = default_collate([i for i in items[2] if torch.is_tensor(i)])
    items[3] = default_collate([i for i in items[3] if torch.is_tensor(i)])
    return items

def custom_collate_fn(batch):
    images = []
    bboxes = []

    for b in batch:
        images.append(b[0])
        bboxes.append(b[1])

    images = torch.stack(images, dim=0)
    return images, bboxes



In [13]:
train_loader = DataLoader(train_ds, shuffle=True, batch_size=4, num_workers=1, collate_fn=ssd_collate_fn, pin_memory=True)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=4, num_workers=1, collate_fn=ssd_collate_fn, pin_memory=True)

In [14]:
import torch
import torch.nn as nn
from torchvision.models.resnet import resnet50

class Base(nn.Module):
    def __init__(self):
        super().__init__()

    def init_weights(self):
        layers = [*self.additional_blocks, *self.loc, *self.conf]
        for layer in layers:
            for param in layer.parameters():
                if param.dim() > 1:
                    nn.init.xavier_uniform_(param)

    def bbox_view(self, src, loc, conf):
        ret = []
        for s, l, c in zip(src, loc, conf):
            ret.append((l(s).view(s.size(0), 4, -1), c(s).view(s.size(0), self.num_classes, -1)))

        locs, confs = list(zip(*ret))
        locs, confs = torch.cat(locs, 2), torch.cat(confs, 2)
        return locs, confs


class ResNet(nn.Module):
    def __init__(self):
        super().__init__()
        backbone = resnet50(pretrained=True)
        self.out_channels = [1024, 512, 512, 256, 256, 256]
        self.feature_extractor = nn.Sequential(*list(backbone.children())[:7])

        conv4_block1 = self.feature_extractor[-1][0]
        conv4_block1.conv1.stride = (1, 1)
        conv4_block1.conv2.stride = (1, 1)
        conv4_block1.downsample[0].stride = (1, 1)

    def forward(self, x):
        x = self.feature_extractor(x)
        return x


class SSD(Base):
    def __init__(self, backbone=ResNet(), num_classes=81):
        super().__init__()

        self.feature_extractor = backbone
        self.num_classes = num_classes
        self._build_additional_features(self.feature_extractor.out_channels)
        self.num_defaults = [4, 6, 6, 6, 4, 4]
        self.loc = []
        self.conf = []

        for nd, oc in zip(self.num_defaults, self.feature_extractor.out_channels):
            self.loc.append(nn.Conv2d(oc, nd * 4, kernel_size=3, padding=1))
            self.conf.append(nn.Conv2d(oc, nd * self.num_classes, kernel_size=3, padding=1))

        self.loc = nn.ModuleList(self.loc)
        self.conf = nn.ModuleList(self.conf)
        self.init_weights()

    def _build_additional_features(self, input_size):
        self.additional_blocks = []
        for i, (input_size, output_size, channels) in enumerate(
                zip(input_size[:-1], input_size[1:], [256, 256, 128, 128, 128])):
            if i < 3:
                layer = nn.Sequential(
                    nn.Conv2d(input_size, channels, kernel_size=1, bias=False),
                    nn.BatchNorm2d(channels),
                    nn.ReLU(inplace=True),
                    nn.Conv2d(channels, output_size, kernel_size=3, padding=1, stride=2, bias=False),
                    nn.BatchNorm2d(output_size),
                    nn.ReLU(inplace=True),
                )
            else:
                layer = nn.Sequential(
                    nn.Conv2d(input_size, channels, kernel_size=1, bias=False),
                    nn.BatchNorm2d(channels),
                    nn.ReLU(inplace=True),
                    nn.Conv2d(channels, output_size, kernel_size=3, bias=False),
                    nn.BatchNorm2d(output_size),
                    nn.ReLU(inplace=True),
                )

            self.additional_blocks.append(layer)

        self.additional_blocks = nn.ModuleList(self.additional_blocks)


    def forward(self, x):
        x = self.feature_extractor(x)
        detection_feed = [x]
        for l in self.additional_blocks:
            x = l(x)
            detection_feed.append(x)
        locs, confs = self.bbox_view(detection_feed, self.loc, self.conf)
        return locs, confs


feature_maps = {}



In [15]:
class Loss(nn.Module):
    """
        Implements the loss as the sum of the followings:
        1. Confidence Loss: All labels, with hard negative mining
        2. Localization Loss: Only on positive labels
        Suppose input dboxes has the shape 8732x4
    """

    def __init__(self, dboxes):
        super(Loss, self).__init__()
        self.scale_xy = 1.0 / dboxes.scale_xy
        self.scale_wh = 1.0 / dboxes.scale_wh

        self.sl1_loss = nn.SmoothL1Loss(reduce=False)
        self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim=0), requires_grad=False)
        self.con_loss = nn.CrossEntropyLoss(reduce=False)

    def loc_vec(self, loc):
        # print(self.dboxes.device)
        # print(loc.device)
        # print('-------')
        gxy = self.scale_xy * (loc[:, :2, :] - self.dboxes[:, :2, :]) / self.dboxes[:, 2:, ]
        gwh = self.scale_wh * (loc[:, 2:, :] / self.dboxes[:, 2:, :]).log()
        return torch.cat((gxy, gwh), dim=1)

    def forward(self, ploc, plabel, gloc, glabel):
        """
            ploc, plabel: Nx4x8732, Nxlabel_numx8732
                predicted location and labels
            gloc, glabel: Nx4x8732, Nx8732
                ground truth location and labels
        """
        
        mask = glabel > 0
        pos_num = mask.sum(dim=1)

        vec_gd = self.loc_vec(gloc)

        ploc = ploc.to(device)
        vec_gd = vec_gd.to(device)
        # gloc = gloc.to(device)
        # plabel = plabel.to(device)
        # glabel = glabel.to(device)

        # sum on four coordinates, and mask
        sl1 = self.sl1_loss(ploc, vec_gd).sum(dim=1)
        sl1 = (mask.float() * sl1).sum(dim=1)

        # hard negative mining
        con = self.con_loss(plabel, glabel)

        # postive mask will never selected
        con_neg = con.clone()
        con_neg[mask] = 0
        _, con_idx = con_neg.sort(dim=1, descending=True)
        _, con_rank = con_idx.sort(dim=1)

        # number of negative three times positive
        neg_num = torch.clamp(3 * pos_num, max=mask.size(1)).unsqueeze(-1)
        neg_mask = con_rank < neg_num

        closs = (con * (mask.float() + neg_mask.float())).sum(dim=1)

        # avoid no object detected
        total_loss = sl1 + closs
        num_mask = (pos_num > 0).float()
        pos_num = pos_num.float().clamp(min=1e-6)
        ret = (total_loss * num_mask / pos_num).mean(dim=0)
        return ret

In [16]:
model = SSD(backbone=ResNet(), num_classes=len(classes))
model = model.to(device)

In [17]:
criterion = Loss(dboxes)
losses = []
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-6)
n = 0
for epoch in range(10):
    for img, img_size, gloc, glabel in train_loader:
        n+=1
        img = img.to(device)
        # gloc = gloc.to(device)
        glabel = glabel.to(device)

        ploc, plabel = model(img)
        ploc, plabel = ploc.float(), plabel.float()
        ploc = ploc.to(device)
        plabel = plabel.to(device)
        gloc = gloc.transpose(1, 2)
        loss = criterion(ploc, plabel, gloc, glabel)
        if n%500==0:
            print(epoch, n, loss)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # break



0 500 tensor(35084.2266, device='cuda:0', grad_fn=<MeanBackward1>)
0 1000 tensor(44378.8789, device='cuda:0', grad_fn=<MeanBackward1>)
0 1500 tensor(44884.4648, device='cuda:0', grad_fn=<MeanBackward1>)
0 2000 tensor(30974.7188, device='cuda:0', grad_fn=<MeanBackward1>)
0 2500 tensor(35461.8477, device='cuda:0', grad_fn=<MeanBackward1>)
0 3000 tensor(36975.9648, device='cuda:0', grad_fn=<MeanBackward1>)
0 3500 tensor(54529.6641, device='cuda:0', grad_fn=<MeanBackward1>)
0 4000 tensor(38608.8516, device='cuda:0', grad_fn=<MeanBackward1>)
0 4500 tensor(45602.2969, device='cuda:0', grad_fn=<MeanBackward1>)
0 5000 tensor(41748.2617, device='cuda:0', grad_fn=<MeanBackward1>)
1 5500 tensor(41492.6484, device='cuda:0', grad_fn=<MeanBackward1>)
1 6000 tensor(38215.3438, device='cuda:0', grad_fn=<MeanBackward1>)
1 6500 tensor(52145.4883, device='cuda:0', grad_fn=<MeanBackward1>)
1 7000 tensor(42834.6562, device='cuda:0', grad_fn=<MeanBackward1>)
1 7500 tensor(37955.7500, device='cuda:0', grad_f

In [18]:
dboxes.scale_xy

0.1

In [20]:
torch.save(model.state_dict(), 'ssd_trained.pt')