# PyTorch Implementation

In [375]:
from matplotlib import pyplot as plt
import collections
from PIL import Image, ImageFont, ImageDraw, ImageEnhance


# ``collections.Iterable`` was an alias removed in Python 3.10; the ABC lives
# in ``collections.abc``. ``Resize`` below uses this name to validate ``size``.
from collections.abc import Iterable
class PennFudanDataset(object):
    """Penn-Fudan pedestrian images paired with instance-segmentation targets.

    ``root`` must contain a ``PNGImages`` and a ``PedMasks`` directory. The
    file names of both directories are sorted and paired by position.

    Args:
        root: dataset root directory.
        transforms: callable that takes and returns a
            ``{'img': ..., 'target': ...}`` sample dict, or ``None``.
    """

    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        # load all image files, sorting them to
        # ensure that they are aligned
        self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages"))))
        self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks"))))

    @staticmethod
    def _boxes_from_masks(masks):
        """Return ``[xmin, ymin, xmax, ymax]`` for every binary mask in ``masks``."""
        boxes = []
        for instance_mask in masks:
            rows, cols = np.where(instance_mask)
            boxes.append([int(cols.min()), int(rows.min()),
                          int(cols.max()), int(rows.max())])
        return boxes

    def __getitem__(self, idx):
        """Load sample ``idx`` and run it through ``self.transforms``.

        Returns:
            ``(img, target, mask_pil_copy, target_out)`` where ``target`` is
            the full annotation dict, ``mask_pil_copy`` is the copy of the
            mask produced by the transforms (or an untouched copy of the
            loaded mask when no transform provides one) and ``target_out``
            holds only the ``boxes`` / ``labels`` / ``masks`` entries.
        """
        # load images and masks
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
        img = Image.open(img_path).convert("RGB")
        # note that we haven't converted the mask to RGB,
        # because each color corresponds to a different instance
        # with 0 being background
        mask_pil = Image.open(mask_path)
        # Fallback for the third return value: without it the method raised
        # NameError (transforms is None) or KeyError (pipeline without
        # Resize / RandomHorizontalFlip, which are the ones that set
        # 'mask_pil_copy').
        mask_pil_copy = mask_pil.copy()

        # convert the PIL Image into a numpy array
        mask = np.array(mask_pil)
        # instances are encoded as different colors; id 0 is the background
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[obj_ids != 0]

        # split the color-encoded mask into a set of binary masks
        masks = mask == obj_ids[:, None, None]

        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = self._boxes_from_masks(masks)

        # convert everything into a torch.Tensor; the reshape keeps the
        # (N, 4) layout when the image contains no instance at all
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        # there is only one class
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
            "mask_pil": mask_pil,
            "image_address": img_path,
            "mask_address": mask_path,
        }
        sample = {'img': img, 'target': target}

        if self.transforms is not None:
            sample = self.transforms(sample)
            img = sample['img']
            target = sample['target']
            mask_pil_copy = target.get('mask_pil_copy', mask_pil_copy)

        target_out = {
            'boxes': target['boxes'],
            'labels': target['labels'],
            'masks': target['masks'],
        }
        return img, target, mask_pil_copy, target_out

    def __len__(self):
        """Number of images found under ``PNGImages``."""
        return len(self.imgs)


class ToTensor(object):
    """Tensor-conversion step for ``{'img', 'target'}`` sample dicts.

    Both ``sample['img']`` and ``sample['target']['mask_pil']`` are passed
    through ``F.to_tensor`` and written back in place; every other entry of
    the sample is left untouched.

    NOTE(review): ``F`` is presumably ``torchvision.transforms.functional``,
    whose ``to_tensor`` rescales 8-bit images to the range [0.0, 1.0] -
    confirm against the notebook's import cell.
    """

    def __call__(self, sample):
        """
        Args:
            sample (dict): holds the image under ``'img'`` and the annotation
                dict (with a ``'mask_pil'`` entry) under ``'target'``.

        Returns:
            dict: the same ``sample`` object with both entries converted.
        """
        picture, annotations = sample['img'], sample['target']

        picture_tensor = F.to_tensor(picture)
        # 'mask_pil_copy' is deliberately not converted; it stays as it is.
        annotations['mask_pil'] = F.to_tensor(annotations['mask_pil'])

        sample['img'] = picture_tensor
        sample['target'] = annotations
        return sample

    def __repr__(self):
        return type(self).__name__
    
    
    
class RandomHorizontalFlip(object):
    """Horizontally flip image and mask of a sample with a given probability.

    Whether or not the flip happens, the annotation entries (``boxes``,
    ``labels``, ``masks``, ``area``, ``iscrowd``) are recomputed from the
    resulting mask and an annotated ``mask_pil_copy`` is stored, so the
    output always has the same set of keys.

    Args:
        p (float): probability of the image being flipped. Default value is 0.5
    """

    def __init__(self, p=0.5):
        self.p = p

    @staticmethod
    def _boxes_from_masks(masks):
        """Return ``[xmin, ymin, xmax, ymax]`` for every binary mask in ``masks``."""
        boxes = []
        for instance_mask in masks:
            rows, cols = np.where(instance_mask)
            # plain ints, as Resize does before handing them to ImageDraw
            boxes.append([int(cols.min()), int(rows.min()),
                          int(cols.max()), int(rows.max())])
        return boxes

    def __call__(self, sample):
        """
        Args:
            sample (dict): ``{'img': PIL Image, 'target': dict}`` where
                ``target['mask_pil']`` is the instance mask as a PIL Image.

        Returns:
            dict: the same ``sample`` object, randomly flipped, with the
            annotation entries of ``target`` refreshed.
        """
        img = sample['img']
        target = sample['target']
        mask_pil = target['mask_pil']

        # The flip used to be applied unconditionally, ignoring ``p``.
        if random.random() < self.p:
            img = F.hflip(img)
            mask_pil = F.hflip(mask_pil)
        mask_pil_copy = mask_pil.copy()

        mask = np.array(mask_pil)
        # instances are encoded as different colors; id 0 is the background
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[obj_ids != 0]

        # split the color-encoded mask into a set of binary masks
        masks = mask == obj_ids[:, None, None]

        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = self._boxes_from_masks(masks)

        # Visualisation copy: black background, one random RGB triple per
        # instance, and the bounding boxes drawn on top.
        palette = [0, 0, 0]
        palette.extend(random.randint(0, 255) for _ in range(num_objs * 3))
        mask_pil_copy.putpalette(palette)
        draw = ImageDraw.Draw(mask_pil_copy)
        for xmin, ymin, xmax, ymax in boxes:
            draw.rectangle(((xmin, ymin), (xmax, ymax)), outline=128)

        # convert everything into a torch.Tensor; the reshape keeps the
        # (N, 4) layout when the mask contains no instance
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        # there is only one class
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["area"] = area
        target["iscrowd"] = iscrowd
        target['mask_pil'] = mask_pil
        target['mask_pil_copy'] = mask_pil_copy

        sample['img'] = img
        sample['target'] = target
        return sample

    def __repr__(self):
        return self.__class__.__name__ + '(p={})'.format(self.p)

    
class Resize(object):
    """Resize image and instance mask of a sample and rebuild its annotations.

    Args:
        size (sequence or int): Desired output size. A two-element sequence
            is handed to ``PIL.Image.resize`` unchanged, i.e. it is read as
            ``(width, height)``. If size is an int, the smaller edge of the
            image is matched to this number and the other edge is scaled to
            keep the aspect ratio.
        interpolation (int, optional): Resampling filter used for the image.
            Default is ``PIL.Image.BICUBIC``. The mask is always resized with
            ``PIL.Image.NEAREST`` so that no new instance ids are invented
            along object borders.
    """

    def __init__(self, size, interpolation=Image.BICUBIC):
        assert isinstance(size, int) or (isinstance(size, Iterable) and len(size) == 2)
        self.size = size
        self.interpolation = interpolation

    def _output_size(self, img):
        """Return the ``(width, height)`` tuple to pass to ``PIL.Image.resize``."""
        if not isinstance(self.size, int):
            return tuple(self.size)
        width, height = img.size
        if width <= height:
            return self.size, int(self.size * height / width)
        return int(self.size * width / height), self.size

    @staticmethod
    def _boxes_from_masks(masks):
        """Return ``[xmin, ymin, xmax, ymax]`` for every binary mask in ``masks``."""
        boxes = []
        for instance_mask in masks:
            rows, cols = np.where(instance_mask)
            boxes.append([int(cols.min()), int(rows.min()),
                          int(cols.max()), int(rows.max())])
        return boxes

    def __call__(self, sample):
        """
        Args:
            sample (dict): ``{'img': PIL Image, 'target': dict}`` where
                ``target['mask_pil']`` is the instance mask as a PIL Image.

        Returns:
            dict: the same ``sample`` object with the rescaled image and the
            annotation entries of ``target`` recomputed from the rescaled
            mask.
        """
        img = sample['img']
        target = sample['target']

        out_size = self._output_size(img)
        img_resize = img.resize(out_size, self.interpolation)
        # Label image: any smoothing filter would blend neighbouring ids.
        mask_pil_resize = target['mask_pil'].resize(out_size, Image.NEAREST)

        mask_pil_copy = mask_pil_resize.copy()
        mask = np.array(mask_pil_resize)
        # instances are encoded as different colors; id 0 is the background
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[obj_ids != 0]

        # split the color-encoded mask into a set of binary masks
        masks = mask == obj_ids[:, None, None]

        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = self._boxes_from_masks(masks)

        # Visualisation copy: black background, one random RGB triple per
        # instance, and the bounding boxes drawn on top.
        palette = [0, 0, 0]
        palette.extend(random.randint(0, 255) for _ in range(num_objs * 3))
        mask_pil_copy.putpalette(palette)
        draw = ImageDraw.Draw(mask_pil_copy)
        for xmin, ymin, xmax, ymax in boxes:
            draw.rectangle(((xmin, ymin), (xmax, ymax)), outline=128)

        # convert everything into a torch.Tensor; the reshape keeps the
        # (N, 4) layout when the mask contains no instance
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        # there is only one class
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["area"] = area
        target["iscrowd"] = iscrowd
        target['mask_pil'] = mask_pil_resize
        target['mask_pil_copy'] = mask_pil_copy
        sample['img'] = img_resize
        sample['target'] = target
        return sample

    def __repr__(self):
        # The lookup table is not defined in this notebook section; fall back
        # to the raw filter value instead of failing inside repr().
        try:
            interpolate_str = _pil_interpolation_to_str[self.interpolation]
        except (NameError, KeyError):
            interpolate_str = str(self.interpolation)
        return self.__class__.__name__ + '(size={0}, interpolation={1})'.format(self.size, interpolate_str)

In [376]:
from matplotlib.pyplot import imshow
from IPython.display import display # to display images
from mpl_toolkits.axes_grid1 import ImageGrid
   

def show_images(sample_batch, filename='images.jpg'):
    """Tile the images of a collated batch into one grid and save it to disk.

    Args:
        sample_batch: collated batch whose first element is the list/batch
            of image tensors.
        filename: output path of the grid image. Defaults to ``'images.jpg'``,
            the path that used to be hard-coded.
    """
    images_batch = sample_batch[0]
    grid = utils.make_grid(images_batch)
    # Move the first axis last before saving
    # (presumably channels-first -> channels-last; confirm ``utils`` is
    # ``torchvision.utils``).
    plt.imsave(filename, grid.numpy().transpose((1, 2, 0)))

def show_masks(sample_batch):
    """Save every mask image of a collated batch as ``mask<i>.png``.

    Args:
        sample_batch: collated batch whose third element is the list of mask
            images; each must offer a ``save(path)`` method.

    The loop follows the real batch length; it used to assume exactly four
    masks and raised IndexError on a shorter batch. The empty matplotlib
    figure that was opened here and never drawn on is no longer created.
    """
    for index, mask_image in enumerate(sample_batch[2]):
        mask_image.save('mask{}.png'.format(index))
        
def pallete_segments(maskin):
    """Colourise the first plane of a label mask with one random colour per id.

    Args:
        maskin: array-like indexed as ``[plane, row, col]``; only plane 0 is
            used. Value 0 is rendered black.

    Returns:
        numpy.ndarray: float array of shape ``(rows, cols, 3)`` whose entries
        are integer-valued colour components in ``[0, 255]``.

    Colours are looked up by the position of each value among the unique
    values instead of by the raw value, so ids need not be contiguous (the
    old value-indexed table raised IndexError for e.g. ids ``{0, 5}``).
    """
    plane = np.asarray(maskin)[0]
    # ravel() keeps the inverse index 1-D regardless of the NumPy version
    uniq, inverse = np.unique(plane.ravel(), return_inverse=True)
    colors = np.random.randint(0, 256, size=(uniq.size, 3)).astype(np.float64)
    colors[uniq == 0] = 0  # background stays black
    return colors[inverse].reshape(plane.shape + (3,))





def get_transform(train):
    """Compose the sample pipeline used by the dataset.

    With a truthy ``train`` the pipeline is Resize to (300, 300), then
    RandomHorizontalFlip(0.5), then ToTensor; otherwise it is ToTensor only.
    """
    steps = [Resize((300,300)), RandomHorizontalFlip(0.5)] if train else []
    steps.append(ToTensor())
    return T.Compose(steps)

def my_collate(batch):
    """Regroup a batch of 4-field samples into four parallel lists.

    Returns ``[data, target, mask, target_out]`` where each entry lists the
    corresponding field (positions 0 to 3) of every item in ``batch``.
    """
    return [[entry[field] for entry in batch] for field in range(4)]

In [377]:
full_dataset = PennFudanDataset('/home/ashish95/Pictures/Dataset/PennFudanPed', get_transform(train=True))
#dataset_test = PennFudanDataset('/home/ashish95/Pictures/Dataset/PennFudanPed', Trfs)

# split the dataset in train and test set: the last 50 shuffled indices are
# held out. Both subsets must wrap the full dataset; building the test subset
# from the already-reduced training subset would index the wrong samples (or
# run out of range) and overlap with the training data.
indices = torch.randperm(len(full_dataset)).tolist()
dataset = torch.utils.data.Subset(full_dataset, indices[:-50])
dataset_test = torch.utils.data.Subset(full_dataset, indices[-50:])

# define training and validation data loaders
data_loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=4, collate_fn=my_collate)

data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=1, shuffle=False, num_workers=4, collate_fn=my_collate)


In [378]:
# Preview the third batch of the training loader: print the source paths and
# write the image grid / mask files to disk.
for i, sample in enumerate(data_loader):
    if i != 2:
        continue
    image, target, masks = sample[0], sample[1], sample[2]

    # iterate over the actual batch instead of assuming four items
    for item in target:
        print(item['image_address'])
    show_images(sample)

    for item in target:
        print(item['mask_address'])
    show_masks(sample)

    # nothing else is shown, so stop instead of loading the remaining batches
    break


/home/ashish95/Pictures/Dataset/PennFudanPed/PNGImages/PennPed00090.png
/home/ashish95/Pictures/Dataset/PennFudanPed/PNGImages/PennPed00011.png
/home/ashish95/Pictures/Dataset/PennFudanPed/PNGImages/PennPed00079.png
/home/ashish95/Pictures/Dataset/PennFudanPed/PNGImages/FudanPed00022.png
/home/ashish95/Pictures/Dataset/PennFudanPed/PedMasks/PennPed00090_mask.png
/home/ashish95/Pictures/Dataset/PennFudanPed/PedMasks/PennPed00011_mask.png
/home/ashish95/Pictures/Dataset/PennFudanPed/PedMasks/PennPed00079_mask.png
/home/ashish95/Pictures/Dataset/PennFudanPed/PedMasks/FudanPed00022_mask.png


<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

In [359]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
# Faster R-CNN detector (ResNet-50 + FPN) created with pretrained=True; the
# download log printed below this cell shows the checkpoint being fetched.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# presumably background + pedestrian - confirm against the dataset labels
num_classes = 2
# Input width of the existing box classifier, reused for the new head.
in_features = model.roi_heads.box_predictor.cls_score.in_features
# Swap the box predictor for a fresh one sized for ``num_classes`` outputs.
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /home/ashish95/.cache/torch/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 167502836/167502836 [03:59<00:00, 699255.42it/s] 


In [379]:
def get_model_instance_segmentation(num_classes):
    """Build a Mask R-CNN (ResNet-50 + FPN) with heads sized for ``num_classes``.

    The model is created with ``pretrained=True``; its box predictor and mask
    predictor are then replaced by new ones with ``num_classes`` outputs.

    Args:
        num_classes: number of output classes for both new heads.

    Returns:
        The modified torchvision detection model.
    """
    # Imported here so the function does not depend on another notebook cell
    # having brought the predictor classes into scope (MaskRCNNPredictor is
    # not imported anywhere in this section).
    from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
    from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

    # load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)

    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # now get the number of input features for the mask classifier
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                       hidden_layer,
                                                       num_classes)

    return model

In [None]:
# Run the detector over the training loader in evaluation mode and print the
# raw predictions of every batch.
num_epochs = 1
# Switching to eval mode once is enough; it used to be repeated per batch.
model.eval()
# No gradients are needed for printing predictions.
with torch.no_grad():
    # Distinct loop names: the inner loop used to overwrite the epoch counter.
    for epoch in range(num_epochs):
        for batch_idx, sample in enumerate(data_loader):
            image, target_in, mask, target_out = sample[0], sample[1], sample[2], sample[3]
            # NOTE(review): in eval mode torchvision detection models return
            # predictions and are documented to need only the images -
            # confirm whether passing target_out is intended.
            predictions = model(image, target_out)
            print(predictions)
        

[{'boxes': tensor([[162.0173,  97.1730, 171.4453, 115.9252],
        [ 75.1344, 137.4904,  82.7737, 159.0323],
        [ 84.8555, 130.3780, 130.6535, 142.6487],
        [ 87.8385, 129.6041, 116.3362, 136.3241],
        [183.3962, 162.7813, 191.6299, 189.1281],
        [  4.0828, 130.7319, 102.8573, 156.8064],
        [ 87.7609, 139.0770, 112.5542, 167.5904],
        [158.8852,  96.7684, 169.2105, 120.2165],
        [186.5894, 164.6309, 194.7240, 189.7834],
        [ 37.9428, 128.5624,  90.5403, 253.3686],
        [ 42.2832, 145.7981,  59.8041, 158.6195],
        [ 83.1859, 132.7010,  91.1846, 151.1511],
        [  1.9236, 129.5305,  52.1875, 140.5801],
        [180.3875, 160.9858, 189.7662, 191.5862],
        [ 68.0642, 135.2209,  93.8407, 160.4700],
        [ 56.5828, 135.4446, 133.4342, 168.8324],
        [ 69.9078, 121.3707,  98.0842, 189.2449],
        [179.0718, 164.5763, 209.2260, 193.9974],
        [ 88.6800, 131.4789, 116.4250, 140.4637],
        [ 54.8546, 133.5273, 100.2065, 