In [1]:
import torch as T
from torch.nn import functional as F
from torch import nn
import cv2
from PIL import Image, ImageOps, ImageEnhance, __version__ as PILLOW_VERSION
import matplotlib.pyplot as plt
import timm
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler, SequentialSampler
from torchvision import transforms
import torchvision.transforms.functional as tvf
import numpy as np
import os
import time
import math
from prettytable import PrettyTable
import pandas as pd
from ast import literal_eval
from tqdm import tqdm
from copy import deepcopy
from pcgrad import PCGrad

device = 'cpu'
#device = T.device('cuda' if T.cuda.is_available() else 'cpu')

class Encoder(nn.Module):
    def __init__(self, backbone = 'resnet18', device = 'cuda'):
        super(Encoder, self).__init__()
        self.backbone = timm.create_model(backbone, pretrained = True)
        self.List = list(self.backbone.children())[:-2]
        self.device = device
    def forward(self,X):
        outputs = []
        X = X.to(self.device).float()
        for i,layer in enumerate(self.List):
            X = layer(X)
            if i>1:
                outputs.append(X)
        return outputs
 
class objdet_Decoder(nn.Module):
    '''series of convs ==> final output heatmap'''
    def __init__(self, n_classes, stride = 2, device = 'cuda'):
        super(objdet_Decoder, self).__init__()
        self.upsample = nn.Upsample(scale_factor=2, mode = 'bilinear')
        self.conv1 = nn.Conv2d(512,256,(3,3),padding = 1)  # 16
        self.conv2 = nn.Conv2d(256,128,(3,3),padding = 1)  #32
        self.conv3 = nn.Conv2d(128,64,(3,3),padding = 1) #64
        self.conv4 = nn.Conv2d(64,32,(3,3),padding = 1) #128
        self.hmap = nn.Conv2d(32,n_classes,(1,1)) #128
        self.regs = nn.Conv2d(32,2,(1,1))
        self.w_h_ = nn.Conv2d(32,2,(1,1))
        self.to(device)
    def forward(self,X):
        X = self.upsample(X[-1])
        X = F.relu(self.conv1(X))
        X = self.upsample(X)
        X = F.relu(self.conv2(X))
        X = self.upsample(X)
        X = F.relu(self.conv3(X))
        X = self.upsample(X)
        X = F.relu(self.conv4(X))
        return [[T.sigmoid(self.hmap(X)), T.sigmoid(self.regs(X)), T.sigmoid(self.w_h_(X))]]
        
        
class DoubleConv(nn.Module):
    """(convolution => [BN] => ReLU) * 2"""
 
    def __init__(self, in_channels, out_channels, mid_channels=None):
        super(DoubleConv,self).__init__()
        if not mid_channels:
            mid_channels = out_channels
        self.double_conv = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )
 
    def forward(self, x):
        return self.double_conv(x)
        
class up(nn.Module):
    '''down samling--->double conv'''
    def __init__(self,in_channels, out_channels,last_layer=False):
        super(up,self).__init__()
        self.upsample = nn.Upsample(scale_factor=2, mode = 'bilinear')
        if last_layer:
            self.conv = DoubleConv(in_channels*2,out_channels)
        else:
            self.conv = DoubleConv(in_channels*3//2,out_channels)   #since we are concatenating 
    def forward(self,x1,x2):
        x1 = self.upsample(x1)
        X = T.cat([x1,x2],dim=1)
        X = self.conv(X)
        return X
        
class seg_decoder(nn.Module):
    def __init__(self, n_classes = 23, device="cuda"):
        super(seg_decoder, self).__init__()
        
        self.up1 = up(512,256)
        self.up2 = up(256,128)
        self.up3 = up(128,64)
        self.up4 = up(64,32,last_layer=True)
        self.out_conv = nn.Conv2d(32,n_classes,(3,3),padding=1)
        self.to(device)
    
    def forward(self,outputs):
        X = self.up1(outputs[-1],outputs[-2])
        X = self.up2(X,outputs[-3])
        X = self.up3(X,outputs[-4])
        X = self.up4(X,outputs[-6])
        X = self.out_conv(X)
        return X
     
class MTL_Model(nn.Module):
    def __init__(self,n_classes = 35,device='cuda'):
        super(MTL_Model,self).__init__()
        self.encoder = Encoder(device=device)
        self.seg_decoder = seg_decoder(n_classes ,device=device)
        self.dep_decoder = seg_decoder(n_classes = 1,device=device)
        self.obj_decoder = objdet_Decoder(n_classes = 15,device=device)
        self.to(device)
        
    def forward(self,X):
        outputs = self.encoder(X)
        seg_maps = self.seg_decoder(outputs)
        depth_maps = self.dep_decoder(outputs)
        detection_maps = self.obj_decoder(outputs)
        return (seg_maps, T.sigmoid(depth_maps),detection_maps)

PALETTE = {
    (128, 64,128)  : 0 , #'road' 
    (250,170,160) : 1 , #'parking'  
    ( 81,  0, 81) : 2 ,#drivable fallback
    (244, 35,232) : 3 , #sidewalk
    (230,150,140) : 4 , #rail track
    (152,251,152) : 5 ,#non-drivable fallback
    (220, 20, 60) : 6 ,#person
    (246, 198, 145) : 7 ,#animal
    (255,  0,  0) : 8 , #rider
    (  0,  0,230) : 9 ,#motorcycle
    (119, 11, 32) : 10 ,  #bicycle
    (255, 204, 54) : 11,#autorickshaw
    (  0,  0,142) : 12,  #car
    (  0,  0, 70) : 13, #truck
    (  0, 60,100) : 14,    #bus
    (  0,  0, 90) : 15,#caravan
    (  0,  0,110) : 16,#trailer
    (  0, 80,100) : 17,#train
    (136, 143, 153) : 18,#vehicle fallback
    (220, 190, 40) : 19,#curb
    (102,102,156) : 20,#wall
    (190,153,153) : 21,#fence
    (180,165,180) : 22,#guard rail
    (174, 64, 67) : 23,#billboard
    (220,220,  0) : 24,#traffic sign
    (250,170, 30) : 25,#traffic light
    (153,153,153) : 26,#pole
    (169, 187, 214) : 27,#obs-str-bar-fallback
    ( 70, 70, 70) : 28,#building
    (150,100,100) : 29,#bridge
    (150,120, 90) : 30,#tunnel
    (107,142, 35) : 31,#vegetation
    ( 70,130,180) : 32,#sky
    (169, 187, 214) : 33,#fallback background
    (  0,  0,  0) : 34#unlabeled
}

def convert_from_color_segmentation(arr_3d):
    arr_3d = np.array(arr_3d)
    arr_2d = np.zeros((arr_3d.shape[0], arr_3d.shape[1]), dtype=np.uint8)
    palette = PALETTE
    for i in range(0, arr_3d.shape[0]):
        for j in range(0, arr_3d.shape[1]):
            key = (arr_3d[i, j, 2], arr_3d[i, j, 1], arr_3d[i, j, 0])
            arr_2d[i, j] = palette.get(key,34) # default value if key was not found is 0

    return arr_2d

def labels_to_cityscapes_palette(array):
    result = np.zeros((array.shape[0], array.shape[1], 3))
    for value, key in PALETTE.items():
        result[np.where(array == key)] = (value[2],value[1],value[0])
    return result/255

def to_one_hot(mask, n_classes=35):
    one_hot = np.zeros((mask.shape[0], mask.shape[1], n_classes))
    for i, unique_value in enumerate(np.unique(mask)):
        one_hot[:, :, unique_value][mask == unique_value] = 1
    return one_hot    

class FocalLoss(nn.Module):
    def __init__(self, alpha=4, gamma=2, logits=False, reduce=True):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.logits = logits
        self.reduce = reduce

    def forward(self, inputs, targets):
        BCE_loss = nn.CrossEntropyLoss()(inputs, targets)

        pt = T.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

        if self.reduce:
            return T.mean(F_loss)
        else:
            return F_loss
    
class DiceLoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(DiceLoss, self).__init__()

    def forward(self, inputs, targets, eps=1e-7):
        inputs = F.softmax(inputs, dim=1)
        targets = targets.type(inputs.type())
        intersection = T.sum(inputs * targets, (0, 2, 3))
        cardinality = T.sum(inputs + targets, (0, 2, 3))
        dice_loss = (2. * intersection / (cardinality + eps)).mean()
        return (1 - dice_loss)
    
class DiceFocalLoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(DiceFocalLoss, self).__init__()
        self.criterion = FocalLoss()
        self.dice_loss = DiceLoss()
        
    def forward(self, inputs, targets):
        targets_ = T.argmax(targets, dim=1)
        floss = self.criterion(inputs, targets_.long())
        dice_loss = self.dice_loss(inputs,targets)
        Dice_BCE = floss + dice_loss
        return Dice_BCE

class DepthLoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(DepthLoss, self).__init__()
    def im_gradient_loss(self,d_batch, n_pixels):
        a = T.Tensor([[[[1, 0, -1],
                            [2, 0, -2],
                            [1, 0, -1]]]])
                      
        b = T.Tensor([[[[1, 2, 1],
                            [0, 0, 0],
                            [-1, -2, -1]]]])
        
        a = a.to(device)
        b = b.to(device)

        G_x = F.conv2d(d_batch, a, padding=1).to(device)
        G_y = F.conv2d(d_batch, b, padding=1).to(device)
        
        G = T.pow(G_x,2)+ T.pow(G_y,2)
    
        return G.view(-1, n_pixels).mean(dim=1).mean()

    def forward(self,preds, actual_depth):
        
        n_pixels = actual_depth.shape[2]*actual_depth.shape[3]
        preds = preds*1000
        preds[preds<=0] = 0.00001
        actual_depth[actual_depth==0] = 0.00001
        d = T.log(preds) - T.log(actual_depth)
        grad_loss_term = self.im_gradient_loss(d, n_pixels)
        term_1 = T.pow(d.view(-1, n_pixels),2).mean(dim=1).mean() #pixel wise mean, then batch sum
        term_2 = (T.pow(d.view(-1, n_pixels).sum(dim=1),2)/(2*(n_pixels**2))).mean()
        loss1 = term_1 - term_2 + grad_loss_term
        loss2 = F.mse_loss(preds,actual_depth,reduction='mean')
        return loss1 + loss2
    
class DetectionLoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(DetectionLoss, self).__init__()
        
    def forward(self,obj, hmap, regs, w_h_):
        regs = [self._tranpose_and_gather_feature(r, obj['inds']) for r in regs]
        w_h_ = [self._tranpose_and_gather_feature(r, obj['inds']) for r in w_h_]
        hmap_loss = self._neg_loss(hmap, obj['hmap'])
        reg_loss = self._reg_loss(regs, obj['regs'], obj['ind_masks'])
        w_h_loss = self._reg_loss(w_h_, obj['w_h_'], obj['ind_masks'])
        loss =  0.5*hmap_loss +  reg_loss +  w_h_loss 
        return loss 
    
    def _neg_loss(self,preds, targets):
        pos_inds = targets.eq(1).float()
        neg_inds = targets.lt(1).float()
        neg_weights = T.pow(1 - targets, 4)
        loss = 0
        for pred in preds:
            pred = T.clamp(T.sigmoid(pred), min=1e-4, max=1 - 1e-4)
            pos_loss = T.log(pred) * T.pow(1 - pred, 2) * pos_inds
            neg_loss = T.log(1 - pred) * T.pow(pred, 2) * neg_weights * neg_inds

            num_pos = pos_inds.float().sum()
            pos_loss = pos_loss.sum()
            neg_loss = neg_loss.sum()

            if num_pos == 0:
                loss = loss - neg_loss
            else:
                loss = loss - (pos_loss + neg_loss) / num_pos
        return loss / len(preds)
    
    def _reg_loss(self,regs, gt_regs, mask):
        mask = mask[:, :, None].expand_as(gt_regs).float()
        loss = sum(F.l1_loss(r * mask, gt_regs * mask, reduction='sum') / (mask.sum() + 1e-4) for r in regs)
        return loss / len(regs)
    
    def _gather_feature(self,feat, ind, mask=None):
        dim = feat.size(2)
        ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim)
        feat = feat.gather(1, ind)
        if mask is not None:
            mask = mask.unsqueeze(2).expand_as(feat)
            feat = feat[mask]
            feat = feat.view(-1, dim)
        return feat
    
    def _tranpose_and_gather_feature(self,feat, ind):
        feat = feat.permute(0, 2, 3, 1).contiguous()
        feat = feat.view(feat.size(0), -1, feat.size(3))
        feat = _gather_feature(feat, ind)
        return feat

def _neg_loss(preds, targets):
    pos_inds = targets.eq(1).float()
    neg_inds = targets.lt(1).float()

    neg_weights = T.pow(1 - targets, 4)

    loss = 0
    for pred in preds:
        pred = T.clamp(pred, min=1e-4, max=1 - 1e-4)
        pos_loss = T.log(pred) * T.pow(1 - pred, 2) * pos_inds
        neg_loss = T.log(1 - pred) * T.pow(pred, 2) * neg_weights * neg_inds

        num_pos = pos_inds.float().sum()
        pos_loss = pos_loss.sum()
        neg_loss = neg_loss.sum()
        if num_pos == 0:
            loss = loss - neg_loss
        else:
            loss = loss - (pos_loss + neg_loss) / num_pos
    return loss / len(preds)


def _reg_loss(regs, gt_regs, mask):
    mask = mask[:, :, None].expand_as(gt_regs).float()
    loss = sum(F.l1_loss(r * mask, gt_regs * mask, reduction='sum') / (mask.sum() + 1e-4) for r in regs)
    return loss / len(regs)

input_size_x,input_size_y = (640, 480)
MODEL_SCALE = 2

def _gather_feature(feat, ind, mask=None):
    dim = feat.size(2)
    ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim)
    feat = feat.gather(1, ind)
    if mask is not None:
        mask = mask.unsqueeze(2).expand_as(feat)
        feat = feat[mask]
        feat = feat.view(-1, dim)
    return feat

def _tranpose_and_gather_feature(feat, ind):
    feat = feat.permute(0, 2, 3, 1).contiguous()
    feat = feat.view(feat.size(0), -1, feat.size(3))
    feat = _gather_feature(feat, ind)
    return feat

def gaussian2D(shape, sigma=1):
    m, n = [(ss - 1.) / 2. for ss in shape]
    y, x = np.ogrid[-m:m + 1, -n:n + 1]

    h = np.exp(-(x * x + y * y) / (2 * sigma * sigma))
    h[h < np.finfo(h.dtype).eps * h.max()] = 0
    return h

def gaussian_radius(det_size, min_overlap=0.7):
    height, width = det_size

    a1 = 1
    b1 = (height + width)
    c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
    sq1 = np.sqrt(b1 ** 2 - 4 * a1 * c1)
    r1 = (b1 - sq1) / (2 * a1)

    a2 = 4
    b2 = 2 * (height + width)
    c2 = (1 - min_overlap) * width * height
    sq2 = np.sqrt(b2 ** 2 - 4 * a2 * c2)
    r2 = (b2 - sq2) / (2 * a2)

    a3 = 4 * min_overlap
    b3 = -2 * min_overlap * (height + width)
    c3 = (min_overlap - 1) * width * height
    sq3 = np.sqrt(b3 ** 2 - 4 * a3 * c3)
    r3 = (b3 + sq3) / 2
    return min(r1, r2, r3)

def draw_umich_gaussian(heatmap, center, radius, k=1):
    diameter = 2 * radius + 1
    gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6)

    x, y = int(center[0]), int(center[1])

    height, width = heatmap.shape[0:2]

    left, right = min(x, radius), min(width - x, radius + 1)
    top, bottom = min(y, radius), min(height - y, radius + 1)

    masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
    masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:radius + right]
    if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0:  # TODO debug
        np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap)
    return heatmap

def convert(obj,width,height):
    x_scale = 640 / width
    y_scale = 480 / height
    x_c = int(np.round(((obj[0]+obj[2])/2)*x_scale))
    y_c = int(np.round(((obj[1]+obj[3])/2)*y_scale))
    w = int(np.round((obj[2]-obj[0])*x_scale))
    h = int(np.round((obj[3]-obj[1])*y_scale))
    box = [x_c,y_c,w,h]
    return box

def make_hm_regr(target,width,height,num_classes = 15,input_size_x = 640,input_size_y = 480,MODEL_SCALE=2,max_objs=240,gaussian_iou = 0.7):
    hmap = np.zeros((num_classes, input_size_y//MODEL_SCALE, input_size_x//MODEL_SCALE), dtype=np.float32)
    w_h_ = np.zeros((max_objs, 2), dtype=np.float32)
    regs = np.zeros((max_objs, 2), dtype=np.float32)
    inds = np.zeros((max_objs,), dtype=np.int64)
    ind_masks = np.zeros((max_objs,), dtype=np.uint8)
    boxes = literal_eval(target["bbox"])
    classes = {"bicycle":0,"bus":1,"traffic sign":2,"train":3,"motorcycle":4,"car":5,"traffic light":6,"person":7,"vehicle fallback":8,"truck":9,"autorickshaw":10,"animal":11,"caravan":12,"rider":13,"trailer":14}

    for i,a in enumerate(boxes):
        box_ = a["bbox"]
        box = convert(box_,width,height)
        if (box[0]>640) or (box[1]>480):
            continue
        center = np.array([(box[0]),(box[1])], dtype=np.float32)
        obj_c = np.array([(box[0]//MODEL_SCALE),(box[1]//MODEL_SCALE)], dtype=np.float32)
        obj_c_int = obj_c.astype(np.int32)
        h = box[3]
        w = box[2]
        if h > 0 and w > 0:
            radius = max(0, int(gaussian_radius((math.ceil(h), math.ceil(w)), gaussian_iou)))
            hmap[classes[a["label"]],:,:] = draw_umich_gaussian(hmap[classes[a["label"]],:,:], obj_c_int, radius)   
            w_h_[i] =  w/input_size_x, h/input_size_y
            regs[i] = center - (obj_c_int*MODEL_SCALE)
            inds[i] = ((obj_c_int[1]) * (input_size_x//MODEL_SCALE)) + (obj_c_int[0])
            ind_masks[i] = 1
    return {'hmap': hmap, 'w_h_': w_h_, 'regs': regs, 'inds': inds, 'ind_masks': ind_masks}

In [2]:
class MTL_Model(nn.Module):
    def __init__(self,n_classes = 35,device='cuda'):
        super(MTL_Model,self).__init__()
        self.encoder = Encoder(device=device)
        self.seg_decoder = seg_decoder(n_classes ,device=device)
        self.dep_decoder = seg_decoder(n_classes = 1,device=device)
        self.obj_decoder = objdet_Decoder(n_classes = 15,device=device)
        self.to(device)
        
    def forward(self,X):
        outputs = self.encoder(X)
        for i in outputs:
            print(i.shape)
        seg_maps = self.seg_decoder(outputs)
        depth_maps = self.dep_decoder(outputs)
        detection_maps = self.obj_decoder(outputs)
        return (seg_maps, T.sigmoid(depth_maps),detection_maps)

x = T.rand(2,3,480,640)
x.shape
model = MTL_Model(device="cpu")

In [3]:
y = model(x)

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


torch.Size([2, 64, 240, 320])
torch.Size([2, 64, 120, 160])
torch.Size([2, 64, 120, 160])
torch.Size([2, 128, 60, 80])
torch.Size([2, 256, 30, 40])
torch.Size([2, 512, 15, 20])


  "See the documentation of nn.Upsample for details.".format(mode)


In [1]:
class MTL(Dataset):
    def __init__(self, filename=None, input_size=(640, 480), output_size=(320, 240), n_classes=15):
        super().__init__()
        self.filename = filename
        self.n_classes = n_classes
        self.max_objs = 240
        self.gaussian_iou = 0.7
        self.dataset = pd.read_csv(self.filename)
        self.input_size = input_size
        self.output_size = output_size
        self.input_size_x = self.input_size[0]
        self.input_size_y = self.input_size[1]
        self.MODEL_SCALE = self.input_size[0]//self.output_size[0]
        self.preprocess = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
        self.resize1 = transforms.Compose([transforms.Resize(self.input_size)])
        self.resize2 = transforms.Compose([transforms.Resize(self.output_size)])


    def __len__(self): return len(self.dataset)
    
    def __getitem_internal__(self, idx, preprocess=True):
        target = self.dataset.iloc[idx]
        rgb_image = cv2.imread(target["Path"])
        height, width, channels = rgb_image.shape
        rgb_image = cv2.resize(rgb_image,self.input_size)
        obj = make_hm_regr(target,width,height,self.n_classes,self.input_size_x,self.input_size_y,self.MODEL_SCALE,self.max_objs,self.gaussian_iou)
        seg_mask = np.load(target["Seg_Path"])
        depth_image = np.load(target["Depth_path"])
        depth_image = cv2.resize(depth_image,self.output_size)
        seg_mask = cv2.resize(seg_mask,self.output_size)
        one_hot_segmask = to_one_hot(seg_mask)
        if preprocess:
            rgb_image = self.preprocess(np.array(rgb_image))
            one_hot_segmask = transforms.ToTensor()(np.array(one_hot_segmask))
            depth_image = transforms.ToTensor()(np.array(depth_image))
        else:
            rgb_image = transforms.ToTensor()(np.array(rgb_image))
            one_hot_segmask = transforms.ToTensor()(np.array(one_hot_segmask))
            depth_image = transforms.ToTensor()(np.array(depth_image))
            seg_mask = transforms.ToTensor()(np.array(seg_mask))
        return (rgb_image,seg_mask,one_hot_segmask,depth_image, obj)

    def __getitem__(self, idx):
        return self.__getitem_internal__(idx, True)
    
    def raw(self, idx):
        return self.__getitem_internal__(idx, False)
    
model = MTL_Model(device = device)
print(device)
#model.load_state_dict(T.load("/home/b170007ec/Programs/MTL/DSD_MTL/Models/model-1.713353544473648.pth",map_location=T.device('cpu')))

def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad: continue
        param = parameter.numel()
        table.add_row([name, param])
        total_params+=param
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print(pytorch_total_params)
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(pytorch_total_params)
    print()
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
#count_parameters(model)
train_dataloader = MTL("/home/b170007ec/Programs/MTL/DSD_MTL/Dataset/train_dataset.csv")
print("Train :",train_dataloader.__len__())
val_dataloader = MTL("/home/b170007ec/Programs/MTL/DSD_MTL/Dataset/val_dataset.csv")
print("Val :",val_dataloader.__len__())

NameError: name 'Dataset' is not defined

In [3]:
diceloss = DiceFocalLoss()
depthloss = DepthLoss()

def loss_fn(y_pred, y_true, obj, hmap, regs, w_h_):
    (pred_seg, pred_depth) = y_pred
    (true_seg, true_depth) = y_true
    dice = diceloss(pred_seg, true_seg)
    depth = depthloss(pred_depth, true_depth)
    #detect = detectionloss(obj, hmap, regs, w_h_)
    regs = [_tranpose_and_gather_feature(r, obj['inds']) for r in regs]
    w_h_ = [_tranpose_and_gather_feature(r, obj['inds']) for r in w_h_]
    hmap_loss = _neg_loss(hmap, obj['hmap'])
    reg_loss = _reg_loss(regs, obj['regs'], obj['ind_masks'])
    w_h_loss = _reg_loss(w_h_, obj['w_h_'], obj['ind_masks'])
    detect =  0.5*hmap_loss +  reg_loss +  w_h_loss 
    return dice+depth+detect, dice, depth, detect 

@T.no_grad()
def validation(model, loader, loss_fn):
    vlosses = []
    dice_vloss = []
    depth_vloss = []
    detect_vloss = []
    model.eval()
    for rgb,seg_mask,seg,depth,obj in loader:
        rgb,seg,depth = rgb.to(device), seg.to(device), depth.to(device)
        obj['hmap'], obj['w_h_'], obj['regs'], obj['inds'], obj['ind_masks'] = obj['hmap'].to(device), obj['w_h_'].to(device), obj['regs'].to(device), obj['inds'].to(device), obj['ind_masks'].to(device)
        y_pred = model(rgb)
        hmap, regs, w_h_ = zip(*y_pred[2])
        y_true = (seg,depth)
        loss, v_dice, v_depth, v_detect = loss_fn((y_pred[0],y_pred[1]), y_true, obj, hmap, regs, w_h_)
        dice_vloss.append(v_dice.item())
        depth_vloss.append(v_depth.item())
        detect_vloss.append(v_detect.item())
        vlosses.append(loss.item())
    return np.array(vlosses).mean(), np.array(dice_vloss).mean(), np.array(depth_vloss).mean(), np.array(detect_vloss).mean()

batch_size = 25
EPOCHES = 250

train_loader = DataLoader(train_dataloader,batch_size=batch_size,shuffle=False, num_workers=0, sampler=SubsetRandomSampler(list(range(train_dataloader.__len__()))),
                             drop_last=False)
val_loader = DataLoader(val_dataloader,batch_size=batch_size,shuffle=False,
                              num_workers=0,
                              sampler=SubsetRandomSampler(list(range(len(val_dataloader.dataset)))),
                             drop_last=False)
raw_line0 = r'''Epoch[{}]    |    Lr:{}'''
raw_line1 = r'''Train Loss:[SEG:{}+DEPTH:{}+DETECT:{}] | Val Loss:[SEG:{}+DEPTH:{}+DETECT:{}]'''
raw_line3 = r'''TOTAL Train loss: {}  |  TOTAL Val loss: {}  |  Time:{:.1f} min '''
import time
from tqdm import tqdm
import matplotlib.pyplot as plt

optimizer = PCGrad(T.optim.Adam([
                {'params': model.parameters()}]
                , lr=0.0001))
scheduler = T.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min',factor=0.1,patience=10,verbose=True)
best_loss = None


for epoch in range(1, EPOCHES+1):
    losses = []
    dice_loss = []
    depth_loss = []
    detect_loss = []
    start_time = time.time()
    t = tqdm(train_loader)
    model.train()
    for i,(rgb,seg_mask,seg,depth,obj) in enumerate(t):
        rgb,seg,depth = rgb.to(device), seg.to(device), depth.to(device)
        obj['hmap'], obj['w_h_'], obj['regs'], obj['inds'], obj['ind_masks'] = obj['hmap'].to(device), obj['w_h_'].to(device), obj['regs'].to(device), obj['inds'].to(device), obj['ind_masks'].to(device)
        optimizer.zero_grad()
        y_pred = model(rgb)
        hmap, regs, w_h_ = zip(*y_pred[2])
        loss, dice, depth, detect = loss_fn((y_pred[0],y_pred[1]), (seg,depth), obj, hmap, regs, w_h_)
        losses = [dice,depth,detect]
#         loss.backward()
        optimizer.pc_backward(losses) 
        optimizer.step()
        dice_loss.append(dice.item())
        depth_loss.append(depth.item())
        detect_loss.append(detect.item())
        losses.append(loss.item())
    vloss, vdice, vdepth, vdetect = validation(model, val_loader, loss_fn)
    print(raw_line0.format(epoch,optimizer.param_groups[0]["lr"]))
    print(raw_line1.format(np.array(dice_loss).mean(),np.array(depth_loss).mean(),np.array(detect_loss).mean(),vdice,vdepth,vdetect))
    print(raw_line3.format(np.array(losses).mean(),vloss,(time.time()-start_time)/60**1))
    
    if best_loss == None:
        best_loss = vloss
        T.save(model.state_dict(), '/home/b170007ec/Programs/MTL/VQ_MTL/Models/model_dsd-{}.pth'.format(best_loss))
        print("saving model ..")
    if vloss < best_loss:
        best_loss = vloss
        T.save(model.state_dict(), '/home/b170007ec/Programs/MTL/VQ_MTL/Models/model_dsd-{}.pth'.format(best_loss))
        print("saving model ..")
    scheduler.step(vloss)

In [4]:
model.load_state_dict(T.load("/home/b170007ec/Programs/MTL/DSD_MTL/MTL_V3/model_v3-1.8697537092062144.pth",map_location=T.device('cpu')))

<All keys matched successfully>

In [5]:
def showbox(img, hm, off,regr,box_=None):
    sample = cv2.resize(img,(640, 480))
    boxes = ctdet_decode(hm,off,regr)
    
    classes = {0:"bicycle",1:"bus",2:"traffic sign",3:"train",4:"motorcycle",5:"car",6:"traffic light",7:"person",8:"vehicle fallback",9:"truck",10:"autorickshaw",11:"animal",12:"caravan",13:"rider",14:"trailer"}
    font = cv2.FONT_HERSHEY_SIMPLEX
    fontScale = 0.5
    thickness = 1
    color =(250, 0, 0)
    for box in boxes:
        cv2.rectangle(sample,
                      (int(box[0]-(box[2]/2)), int(box[1]-(box[3]/2))),
                      (int(box[0]+(box[2]/2)), int(box[1]+(box[3]/2))),
                      color, 2)
    return sample

def _nms(heat, kernel=7):
    hmax = F.max_pool2d(heat, kernel, stride=1, padding=(kernel - 1) // 2)
    keep = (hmax == heat).float()
    return heat * keep


def _topk(scores, K=40, threshold=0.2):
    batch, cat, height, width = scores.size()

    topk_scores, topk_inds = T.topk(scores.view(batch, cat, -1), K)

    topk_inds = topk_inds % (height * width)
    topk_ys = (topk_inds / width).int().float()
    topk_xs = (topk_inds % width).int().float()

    topk_score, topk_ind = T.topk(topk_scores.view(batch, -1), K)
    topk_clses = (topk_ind / K).int()
    topk_inds = _gather_feature(topk_inds.view(batch, -1, 1), topk_ind).view(batch, K)
    topk_ys = _gather_feature(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K)
    topk_xs = _gather_feature(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K)
    mask = T.where(topk_score>threshold, True, False)
    return topk_score[:,mask[0]], topk_inds[:,mask[0]], topk_clses[:,mask[0]], topk_ys[:,mask[0]], topk_xs[:,mask[0]], len(topk_score[:,mask[0]][0])


def ctdet_decode(hmap, regs, w_h_, K=40):
    batch, cat, height, width = hmap.shape
    batch = 1
    input_size_x = 640
    input_size_y = 480
    hmap = _nms(hmap)  # perform nms on heatmaps

    scores, inds, clses, ys, xs, M = _topk(hmap, K=K)
    regs = _tranpose_and_gather_feature(regs, inds)
    regs = regs.view(batch, M, 2)
    xs = xs.view(batch, M, 1)*MODEL_SCALE + regs[:, :, 0:1]
    ys = ys.view(batch, M, 1)*MODEL_SCALE + regs[:, :, 1:2]
    w_h_ = _tranpose_and_gather_feature(w_h_, inds)
    w_h_ = w_h_.view(batch, M, 2)

    clses = clses.view(batch, M, 1).float()
    scores = scores.view(batch, M, 1)
    bboxes = T.cat([xs ,ys  ,w_h_[..., 0:1]*input_size_x ,w_h_[..., 1:2]*input_size_y ], dim=2)
    detections = T.cat([bboxes, scores, clses], dim=2)
    return detections.cpu().numpy()[0]

In [6]:
class MTL_TEST_DETECTOR(Dataset):
    def __init__(self, filename=None, input_size=(640, 480), output_size=(320, 240)):
        super().__init__()
        self.filename = filename
        self.dataset = pd.read_csv(self.filename)
        self.input_size = input_size
        self.output_size = output_size
        self.input_size_x = self.input_size[0]
        self.input_size_y = self.input_size[1]
        self.MODEL_SCALE = self.input_size[0]//self.output_size[0]
        self.preprocess = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
        self.resize1 = transforms.Compose([transforms.Resize(self.input_size)])
        self.resize2 = transforms.Compose([transforms.Resize(self.output_size)])


    def __len__(self): return len(self.dataset)
    
    def __getitem_internal__(self, idx, preprocess=True):
        target = self.dataset.iloc[idx]
        rgb_image = cv2.imread(target["Path"])
        height, width, channels = rgb_image.shape
        rgb_image = cv2.resize(rgb_image,self.input_size)
        boxes = literal_eval(target["bbox"])
        b = []
        for i,a in enumerate(boxes):
            box_ = a["bbox"]
            box = convert(box_,width,height)
            b.append(box)
        if preprocess:
            rgb_image = self.preprocess(np.array(rgb_image))
        else:
            rgb_image = transforms.ToTensor()(np.array(rgb_image))
        return (rgb_image, b)

    def __getitem__(self, idx):
        return self.__getitem_internal__(idx, True)
    
    def raw(self, idx):
        return self.__getitem_internal__(idx, False)

In [7]:
# test_dataloader = MTL_TEST_DETECTOR("/home/b170007ec/Programs/MTL/DSD_MTL/Dataset/val_dataset.csv")
# model.eval()
# device = T.device("cpu")
# j=10
# for i in range(j,j+30,5):
#     matrix = []
#     for a in range(5):
#         rgb ,box = test_dataloader[i+a]
#         rgb_raw,_ = test_dataloader.raw(i+a)
#         rgb_raw = rgb_raw.permute(1,2,0)
#         rgb = T.unsqueeze(rgb, 0)
#         rgb = rgb.to(device)
#         y_pred = model(rgb)
#         y_pred_ = F.softmax(y_pred[0],dim=1)
#         pseg = T.squeeze(y_pred_,0)
#         pseg = T.argmax(pseg, dim=0)
#         pdepth = T.squeeze(y_pred[1],0)
#         pdepth = pdepth.permute(1,2,0)
#         pdepth = pdepth.reshape(240,320)
#         hmap, regs, w_h_ = zip(*y_pred[2])
#         d = showbox(rgb_raw.numpy(), hmap[0].detach(), regs[0].detach(),w_h_[0].detach())
#         matrix.append([rgb_raw.numpy(),labels_to_cityscapes_palette(pseg.cpu().detach().numpy()),pdepth.cpu().detach().numpy(),d])
    
#     fig, ax = plt.subplots(5, 4,figsize=(40,40))
#     for k in range(5):
#         for j in range(4):
#             ax[k][j].imshow(matrix[k][j])
#             ax[k][j].set_xticks([])
#             ax[k][j].set_yticks([])
#     plt.savefig("END_TO_END_MTL_{}".format(i))
#     plt.show()

In [8]:
import sklearn
from sklearn.metrics import confusion_matrix
def compute_errors(gt, pred):
    pred *= 1000
    thresh = np.maximum((gt / pred), (pred / gt))
    a1 = (thresh < 1.25).mean()
    a2 = (thresh < (1.25 ** 2)).mean()
    a3 = (thresh < (1.25 ** 3)).mean()
    abs_rel = np.mean(np.abs(gt - pred) / gt)
    rmse = (gt - pred) ** 2
    rmse = np.sqrt(rmse.mean())
    log_10 = (np.abs(np.log10(gt)-np.log10(pred))).mean()
    return a1, a2, a3, abs_rel, rmse, log_10

def check_size(eval_segm, gt_segm):
    h_e, w_e = segm_size(eval_segm)
    h_g, w_g = segm_size(gt_segm)

    if (h_e != h_g) or (w_e != w_g):
        raise EvalSegErr("DiffDim: Different dimensions of matrices!")

'''
Exceptions
'''
class EvalSegErr(Exception):
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)
    
def extract_masks(segm, cl, n_cl):
    h, w  = segm_size(segm)
    masks = np.zeros((n_cl, h, w))

    for i, c in enumerate(cl):
        masks[i, :, :] = segm == c

    return masks

def segm_size(segm):
    try:
        height = segm.shape[0]
        width  = segm.shape[1]
    except IndexError:
        raise

    return height, width

def extract_both_masks(eval_segm, gt_segm, cl, n_cl):
    eval_mask = extract_masks(eval_segm, cl, n_cl)
    gt_mask   = extract_masks(gt_segm, cl, n_cl)

    return eval_mask, gt_mask

def extract_classes(segm):
    cl = np.unique(segm)
    n_cl = len(cl)

    return cl, n_cl


def IOU_SCORE(eval_segm, gt_segm):
    intersection = np.logical_and(gt_segm, eval_segm)
    union = np.logical_or(gt_segm, eval_segm)
    iou_score = np.sum(intersection) / np.sum(union)
    
    check_size(eval_segm, gt_segm)

    cl, n_cl = extract_classes(gt_segm)
    eval_mask, gt_mask = extract_both_masks(eval_segm, gt_segm, cl, n_cl)
    
    sum_n_ii = 0
    sum_t_i  = 0

    for i, c in enumerate(cl):
        curr_eval_mask = eval_mask[i, :, :]
        curr_gt_mask = gt_mask[i, :, :]

        sum_n_ii += np.sum(np.logical_and(curr_eval_mask, curr_gt_mask))
        sum_t_i  += np.sum(curr_gt_mask)
 
    if (sum_t_i == 0):
        pixel_accuracy_ = 0
    else:
        pixel_accuracy_ = sum_n_ii / sum_t_i
    
    return iou_score, pixel_accuracy_

def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
    """
    Calculates intersection over union
    Parameters:
        boxes_preds (tensor): Predictions of Bounding Boxes (BATCH_SIZE, 4)
        boxes_labels (tensor): Correct labels of Bounding Boxes (BATCH_SIZE, 4)
        box_format (str): midpoint/corners, if boxes (x,y,w,h) or (x1,y1,x2,y2)
    Returns:
        tensor: Intersection over union for all examples
    """

    if box_format == "midpoint":
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2
        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2

    if box_format == "corners":
        box1_x1 = boxes_preds[..., 0:1]
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4]  # (N, 1)
        box2_x1 = boxes_labels[..., 0:1]
        box2_y1 = boxes_labels[..., 1:2]
        box2_x2 = boxes_labels[..., 2:3]
        box2_y2 = boxes_labels[..., 3:4]

    x1 = T.max(box1_x1, box2_x1)
    y1 = T.max(box1_y1, box2_y1)
    x2 = T.min(box1_x2, box2_x2)
    y2 = T.min(box1_y2, box2_y2)

    # .clamp(0) is for the case when they do not intersect
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    return intersection / (box1_area + box2_area - intersection + 1e-6)

class MTL_VAL(Dataset):
    def __init__(self, filename=None, input_size=(640, 480), output_size=(320, 240), n_classes=15):
        super().__init__()
        self.filename = filename
        self.n_classes = n_classes
        self.max_objs = 240
        self.gaussian_iou = 0.7
        self.dataset = pd.read_csv(self.filename)
        self.input_size = input_size
        self.output_size = output_size
        self.input_size_x = self.input_size[0]
        self.input_size_y = self.input_size[1]
        self.MODEL_SCALE = self.input_size[0]//self.output_size[0]
        self.preprocess = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
        self.resize1 = transforms.Compose([transforms.Resize(self.input_size)])
        self.resize2 = transforms.Compose([transforms.Resize(self.output_size)])


    def __len__(self): return len(self.dataset)
    
    def __getitem_internal__(self, idx, preprocess=True):
        target = self.dataset.iloc[idx]
        rgb_image = cv2.imread(target["Path"])
        height, width, channels = rgb_image.shape
        rgb_image = cv2.resize(rgb_image,self.input_size)
        obj = make_hm_regr(target,width,height,self.n_classes,self.input_size_x,self.input_size_y,self.MODEL_SCALE,self.max_objs,self.gaussian_iou)
        seg_mask = np.load(target["Seg_Path"])
        depth_image = np.load(target["Depth_path"])
        depth_image = cv2.resize(depth_image,self.output_size)
        seg_mask = cv2.resize(seg_mask,self.output_size)
        one_hot_segmask = to_one_hot(seg_mask)
        boxes = literal_eval(target["bbox"])
        b = []
        classes = {"bicycle":0,"bus":1,"traffic sign":2,"train":3,"motorcycle":4,"car":5,"traffic light":6,"person":7,"vehicle fallback":8,"truck":9,"autorickshaw":10,"animal":11,"caravan":12,"rider":13,"trailer":14}
        for i,a in enumerate(boxes):
            box_ = a["bbox"]
            x_scale = 640 / width
            y_scale = 480 / height
            x1 = int(box_[0]*x_scale)
            y1 = int(box_[1]*y_scale)
            x2 = int(box_[2]*x_scale)
            y2 = int(box_[3]*y_scale)
            b.append([x1,y1,x2,y2,classes[a["label"]]])
        if preprocess:
            rgb_image = self.preprocess(np.array(rgb_image))
            one_hot_segmask = transforms.ToTensor()(np.array(one_hot_segmask))
            depth_image = transforms.ToTensor()(np.array(depth_image))
        else:
            rgb_image = transforms.ToTensor()(np.array(rgb_image))
            one_hot_segmask = transforms.ToTensor()(np.array(one_hot_segmask))
            depth_image = transforms.ToTensor()(np.array(depth_image))
            seg_mask = transforms.ToTensor()(np.array(seg_mask))
        return (rgb_image,seg_mask,one_hot_segmask,depth_image, b)

    def __getitem__(self, idx):
        return self.__getitem_internal__(idx, True)
    
    def raw(self, idx):
        return self.__getitem_internal__(idx, False)

test_dataloader = MTL_VAL("/home/b170007ec/Programs/MTL/DSD_MTL/Dataset/val_dataset.csv")

from tqdm import tqdm
from mean_average_precision import MetricBuilder
metric_fn = MetricBuilder.build_evaluation_metric("map_2d", async_mode=True, num_classes=15)
A1 = 0
A2 = 0
A3 = 0
ABS_REL = 0
RMSE = 0
LOG_10 = 0
IOU = 0
detection_iou = []
PIXCEL_ACC = 0
model.eval()
i = 0
for k in tqdm(range(100)):
    pred = []
    true = []
    rgb,seg_mask,one_hot_segmask,depth, obj = test_dataloader[k]
    rgb = T.unsqueeze(rgb, 0)
    rgb = rgb.to(device)
    y_pred = model(rgb)
    y_pred_ = F.softmax(y_pred[0],dim=1)
    pseg = T.squeeze(y_pred_,0)
    pseg = T.argmax(pseg, dim=0)
    pdepth = T.squeeze(y_pred[1],0)
    pdepth = pdepth.permute(1,2,0)
    pdepth = pdepth.reshape(240,320)
    hmap, regs, w_h_ = zip(*y_pred[2])
    pred_detections = ctdet_decode(hmap[0].detach(),regs[0].detach(),w_h_[0].detach())
    a1, a2, a3, abs_rel, rmse, log_10 = compute_errors(depth.numpy(), pdepth.cpu().detach().numpy())
    iou,pixel_accuracy_ = IOU_SCORE(pseg.cpu().detach().numpy(), seg_mask)
    for o in obj:
        true.append([o[0],o[1],o[2],o[3],o[4],0,0])
    for p in pred_detections:
        pred.append([int(p[0]-(p[2]/2)),int(p[1]-(p[3]/2)),int(p[0]+(p[2]/2)),int(p[1]+(p[3]/2)),p[5],p[4]])
    for u in pred:
        best_iou = 0
        for v in true:
            if u[4] == v[4]:
                iou_ = intersection_over_union(
                    T.tensor(u[:4]),
                    T.tensor(v[:4]),
                    box_format="midpoint",
                )
                if iou_ > best_iou:
                    best_iou = iou_
        detection_iou.append(best_iou)
    metric_fn.add(np.array(pred), np.array(true))
    A1 += a1
    A2 += a2
    A3 += a3
    ABS_REL += abs_rel
    RMSE += rmse
    LOG_10 += log_10
    IOU += iou
    PIXCEL_ACC+=pixel_accuracy_
    i+=1
    
A1 /= i
A2 /= i
A3 /= i
ABS_REL /= i
RMSE /= i
LOG_10 /= i
IOU /= i
PIXCEL_ACC /= i
seg_metric_line = r'''Segmentation :     IOU :{}
                        Pixcel Acc :{}'''
depth_metric_line = r'''Depth Estimation : A1 :{} 
                   A2 :{} 
                   A3 :{} 
                   ABS_REL :{} 
                   RMSE :{} 
                   LOG_10 :{}'''
print(seg_metric_line.format(IOU,PIXCEL_ACC))
print(depth_metric_line.format(A1,A2,A3,ABS_REL,RMSE,LOG_10))
print(f"VOC PASCAL mAP in all points: {metric_fn.value(iou_thresholds=0.5)['mAP']}")
print("Detection IOU :",sum(detection_iou)/len(detection_iou))

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
  "See the documentation of nn.Upsample for details.".format(mode)
100%|██████████| 100/100 [18:28<00:00, 11.09s/it]


Segmentation :     IOU :0.9793402734016757
                        Pixcel Acc :0.943042317708333
Depth Estimation : A1 :0.8524420572916663 
                   A2 :0.9671630208333329 
                   A3 :0.9899037760416664 
                   ABS_REL :0.1191964515298605 
                   RMSE :0.030864654602482916 
                   LOG_10 :0.05106191769242287
VOC PASCAL mAP in all points: 0.2563420832157135
Detection IOU : tensor([0.7261])


In [14]:
# import torch as T
# from collections import Counter


# def mean_average_precision(
#     pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=15
# ):
#     """
#     Calculates mean average precision 
#     Parameters:
#         pred_boxes (list): list of lists containing all bboxes with each bboxes
#         specified as [train_idx, class_prediction, prob_score, x1, y1, x2, y2]
#         true_boxes (list): Similar as pred_boxes except all the correct ones 
#         iou_threshold (float): threshold where predicted bboxes is correct
#         box_format (str): "midpoint" or "corners" used to specify bboxes
#         num_classes (int): number of classes
#     Returns:
#         float: mAP value across all classes given a specific IoU threshold 
#     """

#     # list storing all AP for respective classes
#     average_precisions = []
#     average_iou = []
#     # used for numerical stability later on
#     epsilon = 1e-6

#     for c in range(num_classes):
#         detections = []
#         ground_truths = []

#         # Go through all predictions and targets,
#         # and only add the ones that belong to the
#         # current class c
#         for detection in pred_boxes:
#             if detection[1] == c:
#                 detections.append(detection)

#         for true_box in true_boxes:
#             if true_box[1] == c:
#                 ground_truths.append(true_box)

#         # find the amount of bboxes for each training example
#         # Counter here finds how many ground truth bboxes we get
#         # for each training example, so let's say img 0 has 3,
#         # img 1 has 5 then we will obtain a dictionary with:
#         # amount_bboxes = {0:3, 1:5}
#         amount_bboxes = Counter([gt[0] for gt in ground_truths])

#         # We then go through each key, val in this dictionary
#         # and convert to the following (w.r.t same example):
#         # ammount_bboxes = {0:torch.tensor[0,0,0], 1:torch.tensor[0,0,0,0,0]}
#         for key, val in amount_bboxes.items():
#             amount_bboxes[key] = T.zeros(val)

#         # sort by box probabilities which is index 2
#         detections.sort(key=lambda x: x[2], reverse=True)
#         TP = T.zeros((len(detections)))
#         FP = T.zeros((len(detections)))
#         total_true_bboxes = len(ground_truths)
        
#         # If none exists for this class then we can safely skip
#         if total_true_bboxes == 0:
#             continue

#         for detection_idx, detection in enumerate(detections):
#             # Only take out the ground_truths that have the same
#             # training idx as detection
#             ground_truth_img = [
#                 bbox for bbox in ground_truths if bbox[0] == detection[0]
#             ]

#             num_gts = len(ground_truth_img)
#             best_iou = 0

#             for idx, gt in enumerate(ground_truth_img):
#                 iou = intersection_over_union(
#                     T.tensor(detection[3:]),
#                     T.tensor(gt[3:]),
#                     box_format=box_format,
#                 )

#                 if iou > best_iou:
#                     best_iou = iou
#                     best_gt_idx = idx
#             average_iou.append(best_iou)
#             if best_iou > iou_threshold:
#                 # only detect ground truth detection once
#                 if amount_bboxes[detection[0]][best_gt_idx] == 0:
#                     # true positive and add this bounding box to seen
#                     TP[detection_idx] = 1
#                     amount_bboxes[detection[0]][best_gt_idx] = 1
#                 else:
#                     FP[detection_idx] = 1

#             # if IOU is lower then the detection is a false positive
#             else:
#                 FP[detection_idx] = 1

#         TP_cumsum = T.cumsum(TP, dim=0)
#         FP_cumsum = T.cumsum(FP, dim=0)
#         recalls = TP_cumsum / (total_true_bboxes + epsilon)
#         precisions = TP_cumsum / (TP_cumsum + FP_cumsum + epsilon)
#         precisions = T.cat((T.tensor([1]), precisions))
#         recalls = T.cat((T.tensor([0]), recalls))
#         # torch.trapz for numerical integration
#         average_precisions.append(T.trapz(precisions, recalls))
#     print(average_iou)
#     return sum(average_precisions) / len(average_precisions), sum(average_iou)/len(average_iou)

In [1]:
# import cv2
# import numpy as np
# import os
# from tqdm import tqdm
# # choose codec according to format needed
# fourcc = cv2.VideoWriter_fourcc(*'MP4V')
# video=cv2.VideoWriter('result_v2.mp4', fourcc, 30,(2000,1500))
# l = os.listdir("/home/b170007ec/Programs/MTL/DSD_MTL/video2")
# for j in tqdm(range(len(l))):
#     img = cv2.imread("/home/b170007ec/Programs/MTL/DSD_MTL/video2/Predictions_{}.png".format(j))
#     video.write(img)

# video.release()

100%|██████████| 2001/2001 [03:25<00:00,  9.75it/s]


In [22]:
# modelA = MTL_Model(device = "cuda")
# modelA_dict = modelA.state_dict()
# modelB = MTL_ModelB(device = "cuda")
# modelB.load_state_dict(T.load("/home/b170007ec/Programs/MTL/Bhanu/Models_SDD/model-1.713353544473648.pth",map_location=T.device('cuda')))
# modelB_dict = modelB.state_dict()
# pretrained_dict = modelB_dict
# model_dict = modelA_dict
# # 1. filter out unnecessary keys
# pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
# # 2. overwrite entries in the existing state dict
# model_dict.update(pretrained_dict)
# # 3. load the new state dict
# modelA.load_state_dict(model_dict)
# T.save(modelA.state_dict(), '/home/b170007ec/Programs/MTL/VQ_MTL/Models/modelvq-base.pth')

In [None]:
# import cv2
# fps = 30
# preprocess = transforms.Compose([
#                 transforms.ToTensor(),
#                 transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
#         ])
# video_capture = cv2.VideoCapture("/home/b170007ec/Programs/MTL/DSD_MTL/video.mp4")
# size=(1280,960)
# frames = 0
# model.eval()
# while(video_capture.isOpened()):
#     ret, frame = video_capture.read()
#     if frames == 110:
#         rgb_raw = cv2.resize(frame,(640, 480))
#         rgb = preprocess(rgb_raw)
#         rgb = T.unsqueeze(rgb, 0)
#         rgb = rgb.to(device)
#         y_pred = model(rgb)
#         y_pred_ = F.softmax(y_pred[0],dim=1)
#         pseg = T.squeeze(y_pred_,0)
#         pseg = T.argmax(pseg, dim=0)
#         pseg = labels_to_cityscapes_palette(pseg.cpu().detach().numpy())
#         pdepth = T.squeeze(y_pred[1],0)
#         pdepth = pdepth.permute(1,2,0)
#         pdepth = pdepth.reshape(240,320)

#         hmap, regs, w_h_ = zip(*y_pred[2])
#         d = showbox(rgb_raw, hmap[0].detach(), regs[0].detach(),w_h_[0].detach())
#         fig = plt.figure(figsize =(8, 8))
#         plt.imshow(rgb_raw)
#         plt.tick_params(left = False, right = False , labelleft = False ,
#                 labelbottom = False, bottom = False)
#         plt.savefig("rgb.png")
#         plt.show()
#         fig = plt.figure(figsize =(8, 8))
#         plt.imshow(pseg)
#         plt.tick_params(left = False, right = False , labelleft = False ,
#                 labelbottom = False, bottom = False)
#         plt.savefig("seg.png")
#         plt.show()
#         fig = plt.figure(figsize =(8, 8))
#         plt.imshow(pdepth.cpu().detach().numpy(),cmap='magma')
#         plt.tick_params(left = False, right = False , labelleft = False ,
#                 labelbottom = False, bottom = False)
#         plt.savefig("dep.png")
#         plt.show()
#         fig = plt.figure(figsize =(8, 8))
#         plt.imshow(d)
#         plt.tick_params(left = False, right = False , labelleft = False ,
#                 labelbottom = False, bottom = False)
#         plt.savefig("det.png")
#         plt.show()
#         break
#     frames += 1

In [None]:
# import statistics
  
# def get_model_scores(pred_boxes):
#     """Creates a dictionary of from model_scores to image ids.
#     Args:
#         pred_boxes (dict): dict of dicts of 'boxes' and 'scores'
#     Returns:
#         dict: keys are model_scores and values are image ids (usually filenames)
#     """
#     model_score={}
#     for img_id, val in pred_boxes.items():
#         for score in val['scores']:
#             if score not in model_score.keys():
#                 model_score[score]=[img_id]
#             else:
#                 model_score[score].append(img_id)
#     return model_score

# def calc_iou( gt_bbox, pred_bbox):
#     x_topleft_gt, y_topleft_gt, x_bottomright_gt, y_bottomright_gt= gt_bbox[0],gt_bbox[1],gt_bbox[0]+gt_bbox[2],gt_bbox[1]+gt_bbox[3]
#     x_topleft_p, y_topleft_p, x_bottomright_p, y_bottomright_p= pred_bbox[0],pred_bbox[1],pred_bbox[0]+pred_bbox[2],pred_bbox[1]+pred_bbox[3]
    
#     if (x_topleft_gt > x_bottomright_gt) or (y_topleft_gt> y_bottomright_gt):
#         raise AssertionError("Ground Truth Bounding Box is not correct")
#     if (x_topleft_p > x_bottomright_p) or (y_topleft_p> y_bottomright_p):
#         raise AssertionError("Predicted Bounding Box is not correct",x_topleft_p, x_bottomright_p,y_topleft_p,y_bottomright_gt)
        
#     #if the GT bbox and predcited BBox do not overlap then iou=0
#     if(x_bottomright_gt< x_topleft_p):
#         # If bottom right of x-coordinate  GT  bbox is less than or above the top left of x coordinate of  the predicted BBox
#         return 0.0
#     if(y_bottomright_gt< y_topleft_p):  # If bottom right of y-coordinate  GT  bbox is less than or above the top left of y coordinate of  the predicted BBox
#         return 0.0
#     if(x_topleft_gt> x_bottomright_p): # If bottom right of x-coordinate  GT  bbox is greater than or below the bottom right  of x coordinate of  the predcited BBox
#         return 0.0
#     if(y_topleft_gt> y_bottomright_p): # If bottom right of y-coordinate  GT  bbox is greater than or below the bottom right  of y coordinate of  the predcited BBox
#         return 0.0
    
#     GT_bbox_area = (x_bottomright_gt -  x_topleft_gt + 1) * (  y_bottomright_gt -y_topleft_gt + 1)
#     Pred_bbox_area =(x_bottomright_p - x_topleft_p + 1 ) * ( y_bottomright_p -y_topleft_p + 1)
    
#     x_top_left =np.max([x_topleft_gt, x_topleft_p])
#     y_top_left = np.max([y_topleft_gt, y_topleft_p])
#     x_bottom_right = np.min([x_bottomright_gt, x_bottomright_p])
#     y_bottom_right = np.min([y_bottomright_gt, y_bottomright_p])
    
#     intersection_area = (x_bottom_right- x_top_left + 1) * (y_bottom_right-y_top_left  + 1)
    
#     union_area = (GT_bbox_area + Pred_bbox_area - intersection_area)
   
#     return intersection_area/union_area

# def calc_precision_recall(image_results):
#     """Calculates number of true_pos, false_pos, false_neg from single batch of boxes.
#     Args:
#         gt_boxes (list of list of floats): list of locations of ground truth
#             objects as [xmin, ymin, xmax, ymax]
#         pred_boxes (dict): dict of dicts of 'boxes' (formatted like `gt_boxes`)
#             and 'scores'
#         iou_thr (float): value of IoU to consider as threshold for a
#             true prediction.
#     Returns:
#         dict: true positives (int), false positives (int), false negatives (int)
#     """
#     true_positive=0
#     false_positive=0
#     false_negative=0
#     for img_id, res in image_results.items():
#         true_positive +=res['true_positive']
#         false_positive += res['false_positive']
#         false_negative += res['false_negative']
#         try:
#             precision = true_positive/(true_positive+ false_positive)
#         except ZeroDivisionError:
#             precision=0.0
#         try:
#             recall = true_positive/(true_positive + false_negative)
#         except ZeroDivisionError:
#             recall=0.0
#     return (precision, recall)

# def get_single_image_results(gt_boxes, pred_boxes, iou_thr):
#     """Calculates number of true_pos, false_pos, false_neg from single batch of boxes.
#     Args:
#         gt_boxes (list of list of floats): list of locations of ground truth
#             objects as [xmin, ymin, xmax, ymax]
#         pred_boxes (dict): dict of dicts of 'boxes' (formatted like `gt_boxes`)
#             and 'scores'
#         iou_thr (float): value of IoU to consider as threshold for a
#             true prediction.
#     Returns:
#         dict: true positives (int), false positives (int), false negatives (int)
#     """
#     all_pred_indices= range(len(pred_boxes))
#     all_gt_indices=range(len(gt_boxes))
#     if len(all_pred_indices)==0:
#         tp=0
#         fp=0
#         fn=0
#         return {'true_positive':tp, 'false_positive':fp, 'false_negative':fn}
#     if len(all_gt_indices)==0:
#         tp=0
#         fp=0
#         fn=0
#         return {'true_positive':tp, 'false_positive':fp, 'false_negative':fn}
    
#     gt_idx_thr=[]
#     pred_idx_thr=[]
#     ious=[]
#     for ipb, pred_box in enumerate(pred_boxes):
#         for igb, gt_box in enumerate(gt_boxes):
#             iou= calc_iou(gt_box, pred_box)
            
#             if iou >iou_thr:
#                 gt_idx_thr.append(igb)
#                 pred_idx_thr.append(ipb)
#                 ious.append(iou)
#     iou_sort = np.argsort(ious)[::1]
#     print(statistics.mean(ious))
#     if len(iou_sort)==0:
#         tp=0
#         fp=0
#         fn=0
#         return {'true_positive':tp, 'false_positive':fp, 'false_negative':fn}
#     else:
#         gt_match_idx=[]
#         pred_match_idx=[]
#         for idx in iou_sort:
#             gt_idx=gt_idx_thr[idx]
#             pr_idx= pred_idx_thr[idx]
#             # If the boxes are unmatched, add them to matches
#             if(gt_idx not in gt_match_idx) and (pr_idx not in pred_match_idx):
#                 gt_match_idx.append(gt_idx)
#                 pred_match_idx.append(pr_idx)
#         tp= len(gt_match_idx)
#         fp= len(pred_boxes) - len(pred_match_idx)
#         fn = len(gt_boxes) - len(gt_match_idx)
#     return {'true_positive': tp, 'false_positive': fp, 'false_negative': fn}



# def  get_avg_precision_at_iou(gt_boxes, pred_bb, iou_thr=0.5):
#     model_scores = get_model_scores(pred_bb)
#     sorted_model_scores= sorted(model_scores.keys())
# # Sort the predicted boxes in descending order (lowest scoring boxes first):
#     for img_id in pred_bb.keys():
        
#         arg_sort = np.argsort(pred_bb[img_id]['scores'])
#         pred_bb[img_id]['scores'] = np.array(pred_bb[img_id]['scores'])[arg_sort].tolist()
#         pred_bb[img_id]['boxes'] = np.array(pred_bb[img_id]['boxes'])[arg_sort].tolist()
    
#     pred_boxes_pruned = deepcopy(pred_bb)
#     precisions = []
#     recalls = []
#     model_thrs = []
#     img_results = {}
# # Loop over model score thresholds and calculate precision, recall
#     for ithr, model_score_thr in enumerate(sorted_model_scores[:-1]):
#             # On first iteration, define img_results for the first time:
#         print("Mode score : ", model_score_thr)
#         img_ids = gt_boxes.keys() if ithr == 0 else model_scores[model_score_thr]
#         for img_id in img_ids:
#             gt_boxes_img = gt_boxes[img_id]
#             box_scores = pred_boxes_pruned[img_id]['scores']
#             start_idx = 0
#             for score in box_scores:
#                 if score <= model_score_thr:
#                     pred_boxes_pruned[img_id]
#                     start_idx += 1
#                 else:
#                     break 
#             # Remove boxes, scores of lower than threshold scores:
#             pred_boxes_pruned[img_id]['scores']= pred_boxes_pruned[img_id]['scores'][start_idx:]
#             pred_boxes_pruned[img_id]['boxes']= pred_boxes_pruned[img_id]['boxes'][start_idx:]
# # Recalculate image results for this image
#             print(img_id)
#             img_results[img_id] = get_single_image_results(gt_boxes_img, pred_boxes_pruned[img_id]['boxes'], iou_thr=0.5)
# # calculate precision and recall
#         prec, rec = calc_precision_recall(img_results)
#         precisions.append(prec)
#         recalls.append(rec)
#         model_thrs.append(model_score_thr)
#     precisions = np.array(precisions)
#     recalls = np.array(recalls)
#     prec_at_rec = []
#     for recall_level in np.linspace(0.0, 1.0, 11):
#         try:
#             args= np.argwhere(recalls>recall_level).flatten()
#             prec= max(precisions[args])
#             print(recalls,"Recall")
#             print(      recall_level,"Recall Level")
#             print(       args, "Args")
#             print(       prec, "precision")
#         except ValueError:
#             prec=0.0
#         prec_at_rec.append(prec)
#     avg_prec = np.mean(prec_at_rec) 
#     return {
#         'avg_prec': avg_prec,
#         'precisions': precisions,
#         'recalls': recalls,
#         'model_thrs': model_thrs}