# Assignment 2

In [38]:
from __future__ import division, print_function, unicode_literals
import numpy as np
import torch
import torch.utils.data
import torchvision.transforms as transforms
from torch.autograd import Variable
import matplotlib.pyplot as plt

#added by me
import torchvision
import torch.nn as nn
import torchvision.models as models
import os
from PIL import Image
import unicodedata
from IPython import display
import time
import xml.etree.ElementTree as ET
import random
#end of added by me

%matplotlib inline
plt.ion()
# Import other modules if required

resnet_input = 224 #size of resnet18 input images
use_gpu = True
num_of_classes = 21
window_batch_size = 20
window_threshold = 12
epsilon = 0.000001

In [39]:
# Choose your hyper-parameters using validation data
batch_size = 1
num_epochs = 5
learning_rate =  0.005
hyp_momentum = 0.9



## Build the data
Use the following links to locally download the data:
<br/>Training and validation:
<br/>http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
<br/>Testing data:
<br/>http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
<br/>The dataset consists of images from 20 classes, with detection annotations included. The JPEGImages folder houses the images, and the Annotations folder has the object-wise labels for the objects in one xml file per image. You have to extract the object information, ie. the [xmin, ymin] (the top left x,y co-ordinates) and the [xmax, ymax] (the bottom right x,y co-ordinates) of only the objects belonging to the given 20 classes(aeroplane, bicycle, boat, bottle, bus, car, cat, chair, cow, dining table, dog, horse, motorbike, person, potted plant, sheep, train, TV). For parsing the xml file, you can import xml.etree.ElementTree for you. <br/>
<br/> Organize the data as follows:
<br/> For every image in the dataset, extract/crop the object patch from the image one by one using their respective co-ordinates:[xmin, ymin, xmax, ymax], resize the image to resnet_input, and store it with its class label information. Do the same for training/validation and test datasets. <br/>
##### Important
You also have to collect data for an extra background class which stands for the class of an object which is not a part of any of the 20 classes. For this, you can crop and resize any random patches from an image. A good idea is to extract patches that have low "intersection over union" with any object present in the image frame from the 20 Pascal VOC classes. The number of background images should be roughly around those of other class objects' images. Hence the total classes turn out to be 21. This is important for applying the sliding window method later.

In [40]:
classes = ('__background__',
           'aeroplane', 'bicycle', 'bird', 'boat',
           'bottle', 'bus', 'car', 'cat', 'chair',
           'cow', 'diningtable', 'dog', 'horse',
           'motorbike', 'person', 'pottedplant',
           'sheep', 'sofa', 'train', 'tvmonitor')


In [41]:
def getRandomWindow(width, height):
    xa = random.randint(0,width)
    xb = random.randint(0,width)
    ya = random.randint(0,height)
    yb = random.randint(0,height)
    x1 = min(xa,xb)
    x2 = max(xa,xb)
    y1 = min(ya,yb)
    y2 = max(ya,yb)
    return (x1,y1,x2,y2)

In [42]:
def maxiou(rwindow, boxes, threshold=0.2):
    minWindowSize = 60
    if(rwindow[2] - rwindow[0] < minWindowSize or rwindow[3] - rwindow[1] < minWindowSize ):
        return 0
    x1 = boxes[:,0]
    y1 = boxes[:,1]
    x2 = boxes[:,2]
    y2 = boxes[:,3]
    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    xx1 = np.maximum(rwindow[0], x1[:])
    yy1 = np.maximum(rwindow[1], y1[:])
    xx2 = np.minimum(rwindow[2], x2[:])
    yy2 = np.minimum(rwindow[3], y2[:])
    # compute the width and height of the bounding box
    w = np.maximum(0, xx2 - xx1 + 1)
    h = np.maximum(0, yy2 - yy1 + 1)
    # compute the ratio of overlap
    overlap = (w * h) / area[:]
    #print(max(overlap))
    if(max(overlap) < threshold):
        return 1
    else:
        return 0
    

In [43]:
def build_dataset(train, rd, rd_annot):
    # Begin
    new_map = list()
    class_count = np.zeros(num_of_classes)
    for subdir, dirs, files in os.walk(rd):
        for File in files:
            xmlFile = File.split('.')[0] + '.xml'
            xmlFileDest = rd_annot + '/' + xmlFile
            tree = ET.parse(xmlFileDest)
            root = tree.getroot()
            sz = root.find('size')
            width = int(sz.find('width').text)
            height = int(sz.find('height').text)

            objinfile = list()
            objlist = list()
            for obj in root.iter('object'):
                objclass = obj.find('name').text
                classid = classes.index(objclass)
                bndbox = obj.find('bndbox')
                xmin = int(bndbox.find('xmin').text)
                ymin = int(bndbox.find('ymin').text)
                xmax = int(bndbox.find('xmax').text)
                ymax = int(bndbox.find('ymax').text)
                    
                    #if(train == True):
                new_map = new_map + [(File, (xmin, ymin, xmax, ymax), classid)] #objclass is a string name of class.
                objinfile = objinfile + [(xmin, ymin, xmax, ymax)]
                    #else:
                        #objlist = objlist + [((xmin, ymin, xmax, ymax), classid)]
                class_count[classid]+=1
                if(train == True):
                    rwindow = getRandomWindow(width, height)
                    #print(rwindow)
                    if( maxiou(rwindow, np.array(objinfile)) > 0):
                        #print(classes[0])
                        #add background image
                        new_map = new_map + [(File, rwindow, 0)]
                        class_count[0]+=1
                        #else:
                 #   self.map = self.map + [(File, objlist)]
    return new_map

In [44]:
class voc_dataset(torch.utils.data.Dataset): # Extend PyTorch's Dataset class
    def __init__(self, root_dir, train, transform=None):
        # Begin
        self.root_dir = root_dir
        self.train = train
        self.transform  = transform
        self.map = list()
        
        if(train == True):
            rd = root_dir + "/VOCdevkit_train/VOC2007/JPEGImages"
            rd_annot = root_dir + "/VOCdevkit_train/VOC2007/Annotations"
            
        else:
            rd = root_dir + "/VOCdevkit_test/VOC2007/JPEGImages"
            rd_annot = root_dir + "/VOCdevkit_test/VOC2007/Annotations"
        
        self.map = build_dataset(train, rd, rd_annot)
                
                       
        #print("min height and width")
        #print(h,w)
        #print("num of examples per class:")
        #print(classes)
        #print(class_count)
               
        
    def __len__(self):
        # Begin
        return len(self.map)
        
    def __getitem__(self, idx):
       # Begin
        if(self.train == True):
            rd = self.root_dir + "/VOCdevkit_train/VOC2007/JPEGImages"
            rd_annot = self.root_dir + "/VOCdevkit_train/VOC2007/Annotations"
            
        else:
            rd = self.root_dir + "/VOCdevkit_test/VOC2007/JPEGImages"
            rd_annot = self.root_dir + "/VOCdevkit_test/VOC2007/Annotations"
            #return (im, self.map[1])
        img = Image.open(rd + '/' + self.map[idx][0])
        area = self.map[idx][1] #(xmin, ymin, xmax, ymax)
        img = img.crop(area)  #taking only the part specified by the (xmin, ymin, xmax, ymax)
        if self.transform is not None:
            im = self.transform(img)
        return (im, self.map[idx][2])
        

In [45]:
class voc_test_dataset(torch.utils.data.Dataset): # Extend PyTorch's Dataset class
    def __init__(self, root_dir, train, transform=None):
        # Begin
        self.root_dir = root_dir
        self.train = train
        self.transform  = transform
        self.map = list()
        
        if(train == True):
            rd = root_dir + "/VOCdevkit_train/VOC2007/JPEGImages"
            rd_annot = root_dir + "/VOCdevkit_train/VOC2007/Annotations"
            
        else:
            rd = root_dir + "/VOCdevkit_test/VOC2007/JPEGImages"
            rd_annot = root_dir + "/VOCdevkit_test/VOC2007/Annotations"
        
        class_count = np.zeros(num_of_classes)
        for subdir, dirs, files in os.walk(rd):
            for File in files:
                xmlFile = File.split('.')[0] + '.xml'
                xmlFileDest = rd_annot + '/' + xmlFile
                tree = ET.parse(xmlFileDest)
                root = tree.getroot()
                sz = root.find('size')
                width = int(sz.find('width').text)
                height = int(sz.find('height').text)
                
                objinfile = list()
                objlist = list()
                for obj in root.iter('object'):
                    objclass = obj.find('name').text
                    classid = classes.index(objclass)
                    bndbox = obj.find('bndbox')
                    xmin = int(bndbox.find('xmin').text)
                    ymin = int(bndbox.find('ymin').text)
                    xmax = int(bndbox.find('xmax').text)
                    ymax = int(bndbox.find('ymax').text)
                    
                    if(train == True):
                        self.map = self.map + [(File, (xmin, ymin, xmax, ymax), classid)] #objclass is a string name of class.
                        objinfile = objinfile + [(xmin, ymin, xmax, ymax)]
                    else:
                        objlist = objlist + [((xmin, ymin, xmax, ymax), classid)]
                    class_count[classid]+=1
                if(train == True):
                    rwindow = getRandomWindow(width, height)
                    #print(rwindow)
                    if( maxiou(rwindow, np.array(objinfile)) > 0):
                        #print(classes[0])
                        #add background image
                        self.map = self.map + [(File, rwindow, 0)]
                        class_count[0]+=1
                else:
                    self.map = self.map + [(File, objlist)]
                       
        #print("min height and width")
        #print(h,w)
        print("num of examples per class:")
        print(classes)
        print(class_count)
               
        
    def __len__(self):
        # Begin
        return len(self.map)
        
    def __getitem__(self, idx):
       # Begin
        if(self.train == True):
            rd = self.root_dir + "/VOCdevkit_train/VOC2007/JPEGImages"
            rd_annot = self.root_dir + "/VOCdevkit_train/VOC2007/Annotations"
            img = Image.open(rd + '/' + self.map[idx][0])
            area = self.map[idx][1] #(xmin, ymin, xmax, ymax)
            img = img.crop(area)  #taking only the part specified by the (xmin, ymin, xmax, ymax)
            if self.transform is not None:
                im = self.transform(img)
            return (im, self.map[idx][2])
        else:
            rd = self.root_dir + "/VOCdevkit_test/VOC2007/JPEGImages"
            rd_annot = self.root_dir + "/VOCdevkit_test/VOC2007/Annotations"
            img = Image.open(rd + '/' + self.map[idx][0])
            if self.transform is not None:
                img = self.transform(img)
            #print("objlist:")
            #print(self.map[1])
            return (img, self.map[idx][1])
        
        

## Train the netwok
<br/>You can train the network on the created dataset. This will yield a classification network on the 21 classes of the VOC dataset. 

In [46]:
composed_transform = transforms.Compose([transforms.Scale((resnet_input,resnet_input)),
                                         transforms.ToTensor() ])
composed_test_transform = transforms.Compose([
                                         transforms.ToTensor() ])
revert_transform = transforms.Compose([
                                         transforms.ToPILImage() ])
#transforms.RandomHorizontalFlip() was removed by me from above.
train_dataset = voc_dataset(root_dir='.', train=True, transform=composed_transform) # Supply proper root_dir
test_dataset = voc_dataset(root_dir='.', train=False, transform=composed_transform) # Supply proper root_dir
test_pipe_dataset = voc_test_dataset(root_dir='.', train=False)#, transform=composed_test_transform) # Supply proper root_dir

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
test_pipe_loader = torch.utils.data.DataLoader(dataset=test_pipe_dataset, batch_size=batch_size, shuffle=True)

num of examples per class:
('__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor')
[    0.   311.   389.   576.   393.   657.   254.  1541.   370.  1374.
   329.   299.   530.   395.   369.  5227.   592.   311.   396.   302.
   361.]


### Fine-tuning
Use the pre-trained network to fine-tune the network in the following section:

In [47]:
resnet18 = models.resnet18(pretrained=True)

resnet18.fc = nn.Linear(resnet18.fc.in_features, 21)

# Add code for using CUDA here
if(torch.cuda.is_available() and use_gpu):
    resnet18.cuda()

In [48]:
criterion = nn.CrossEntropyLoss()
# Update if any errors occur
optimizer = torch.optim.SGD(resnet18.parameters(), learning_rate, hyp_momentum)

In [49]:
def train():
    # Begin
    x = list()
    y = list()
    trdata_batchsize = len(train_dataset)//batch_size
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):  
            # Convert torch tensor to Variable
            images = Variable(images)
            labels = Variable(labels)
            if(use_gpu):
                images=images.cuda()
                labels=labels.cuda()
            # Forward + Backward + Optimize
            optimizer.zero_grad()  # zero the gradient buffer
            outputs = resnet18(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            if (i+1) % 256 == 0: 
                print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' 
                       %(epoch+1, num_epochs, i+1, trdata_batchsize, loss.data[0]))
                x.append((epoch*trdata_batchsize) + 1+i)
                y.append(loss.data[0])
                plt.plot(x,y,color = 'red')
                plt.title('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' 
                       %(epoch+1, num_epochs, i+1, trdata_batchsize, loss.data[0]))
                plt.xlabel("Batch Number")
                plt.ylabel("Cross Entropy Loss")
                display.clear_output(wait=True)
                display.display(plt.gcf())
            #if i == 400:
             #   break
        #break    #to run just 1 epoch
    plt.savefig('Loss_1.png')

In [50]:
#load the weights you got by training it earlier.
resnet18.load_state_dict(torch.load('assignment2_0005_20_model.pkl'))

In [51]:
#%time train()

In [52]:
#saving the weights for future analysis
#torch.save(resnet18.state_dict(), 'assignment2_2_model.pkl')


In [53]:
#load saved model
#resnet18.load_state_dict(torch.load('assignment2_2_model.pkl'))

In [54]:
def test_accuracy(model):
    # Write loops for testing the model on the test set
    # You should also print out the accuracy of the model
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = Variable(images)
        
        #doing this on cpu due to gpu memory leak.
        if(use_gpu):
            images = images.cuda()
        
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted.cpu() == labels.cpu()).sum()
    print('Accuracy of the network on the 10000 test images: %d %%' % (100 * correct / total))

In [55]:
#%time test_accuracy(resnet18)
# 62% acc after 5 epochs, lrate = 0.005
# 65% acc after 10 epochs, lrate = 0.005


# Testing and Accuracy Calculation
For applying detection, use a slding window method to test the above trained trained network on the detection task:<br/>
Take some windows of varying size and aspect ratios and slide it through the test image (considering some stride of pixels) from left to right, and top to bottom, detect the class scores for each of the window, and keep only those which are above a certain threshold value. There is a similar approach used in the paper -Faster RCNN by Ross Girshick, where he uses three diferent scales/sizes and three different aspect ratios, making a total of nine windows per pixel to slide. You need to write the code and use it in testing code to find the predicted boxes and their classes.

Apply non_maximum_supression to reduce the number of boxes. You are free to choose the threshold value for non maximum supression, but choose wisely [0,1].

In [56]:
def non_maximum_supression(boxes,threshold = 0.4):
    # 
    # if there are no boxes, return an empty list
    if len(boxes) == 0:
        return []

    # if the bounding boxes integers, convert them to floats --
    # this is important since we'll be doing a bunch of divisions
    if boxes.dtype.kind == "i":
        boxes = boxes.astype("float")
 
    # initialize the list of picked indexes	
    pick = []
 
    # grab the coordinates of the bounding boxes
    x1 = boxes[:,0]
    y1 = boxes[:,1]
    x2 = boxes[:,2]
    y2 = boxes[:,3]

    # compute the area of the bounding boxes and sort the bounding
    # boxes by the bottom-right y-coordinate of the bounding box
    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    idxs = np.argsort(y2)

    # keep looping while some indexes still remain in the indexes
    # list
    while len(idxs) > 0:
        # grab the last index in the indexes list and add the
        # index value to the list of picked indexes
        last = len(idxs) - 1
        i = idxs[last]
        pick.append(i)
 
        # find the largest (x, y) coordinates for the start of
        # the bounding box and the smallest (x, y) coordinates
        # for the end of the bounding box
        xx1 = np.maximum(x1[i], x1[idxs[:last]])
        yy1 = np.maximum(y1[i], y1[idxs[:last]])
        xx2 = np.minimum(x2[i], x2[idxs[:last]])
        yy2 = np.minimum(y2[i], y2[idxs[:last]])

        # compute the width and height of the bounding box
        w = np.maximum(0, xx2 - xx1 + 1)
        h = np.maximum(0, yy2 - yy1 + 1)

        # compute the ratio of overlap
        overlap = (w * h) / (area[idxs[:last]])

        # delete all indexes from the index list that have
        idxs = np.delete(idxs, np.concatenate(([last],
            np.where(overlap > threshold)[0])))
 
    # return only the bounding boxes that were picked using the
    # integer data type
    #return boxes[pick].astype("int")
    #print(pick)
    return pick

In [57]:
def sliding_window(sz):
    
    # Begin
    stride = 20   #subject to alot of change
    #for image in images:
    #image = revert_transform(image)
    #image = transforms.ToPILImage(image)
    res = np.array([(50,100),(100,50),(100,100)])/2  #may wish to change to powers of 2.
    res = res.astype(int)
    #sz = image.size
    area_set = list()
    #cropped_images = list()
    for i in range(5):
        res = 2*res
        #print("changing window scale")
        for w,h in res:
            #print("changing window apect ratio")
            #print(w,h, sz, 0,sz[0] - w, stride, 0,sz[1] - h,stride )
            point_set = set((x,y) for x in range(0,sz[0] - w, stride) for y in range(0,sz[1] - h,stride))
            #print("point set:")
            #print(len(point_set))
            for cx,cy in point_set:
                area = (cx, cy, cx + w, cy + h)
                area_set = area_set + [area]
                #cropped_images = cropped_images + [ (composed_transform(image.crop(area)), area)]
    #print("returning cropped areas")
    return area_set
    #cropped_images = Variable(cropped_images)
        #outputs = resnet18(cropped_images)
            

In [58]:
def imshow(img):
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))


In [59]:
def maxiou_accept(rwindow, boxes):
    minWindowSize = 60
    if(rwindow[2] - rwindow[0] < minWindowSize or rwindow[3] - rwindow[1] < minWindowSize ):
        return -1,0 
    x1 = boxes[:,0]
    y1 = boxes[:,1]
    x2 = boxes[:,2]
    y2 = boxes[:,3]
    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    xx1 = np.maximum(rwindow[0], x1[:])
    yy1 = np.maximum(rwindow[1], y1[:])
    xx2 = np.minimum(rwindow[2], x2[:])
    yy2 = np.minimum(rwindow[3], y2[:])
    
    rwindow_area = (rwindow[3] - rwindow[1] + 1)*(rwindow[2] - rwindow[0] + 1)
    # compute the width and height of the bounding box
    w = np.maximum(0, xx2 - xx1 + 1)
    h = np.maximum(0, yy2 - yy1 + 1)
    # compute the ratio of overlap
    print(w, h, w*h, area[:], rwindow_area)
    overlap = (w * h) / (area[:]+rwindow_area - (w * h))
    #print(max(overlap))
    val = np.max(overlap)
    ix = np.argmax(overlap)
    #print("val,ix")
    #print(val,ix)
    #print("overlap")
    #print(overlap)
    return (val, ix)

In [60]:
def find_class_ap(class_areas, objlist, threshold=0.3):  #objlist:np.array, class_areas: dict
    obj_dict = {}
    class_ap = np.zeros(num_of_classes)
    class_bin = np.zeros(num_of_classes)
    for i in range(len(objlist)):
        if objlist[i][1] in obj_dict:
            obj_dict[objlist[i][1]] = obj_dict[objlist[i][1]] + [objlist[i][0]]
        else:
            obj_dict[objlist[i][1]] = [objlist[i][0]]
            class_bin[objlist[i][1]] = 1
    
    #print("obj_dict is:")
    #print(obj_dict)
    for key in class_areas:
        #print("key in class area is {}".format(key))
        y = class_areas[key][:,1]
        idx = np.argsort(-y)
        ap = 0.0
        if key in obj_dict:
            #print("key in obj_dict : {}".format(key))
            count = 0.0
            for i in range(len(idx)):
                #print("error is here")
                #print(maxiou_accept(class_areas[key][idx[i],0], np.array(obj_dict[key])))
                (overlap, ix) = maxiou_accept(class_areas[key][idx[i],0], np.array(obj_dict[key]))  #ix for if u want to erase that true window
                
                if(overlap > threshold):
                    count = count + 1
                    ap = ap + count/(i+1)
            ap = ap/len(idx)
        class_ap[key] = ap
    #class_ap = class_ap/(len(class_areas) + epsilon)
    return (class_ap, class_bin)
    

Test the trained model on the test dataset.

In [61]:
# def test(resnet18):
#     # Write loops for testing the model on the test set
#     # You should also print out the accuracy of the model
#     correct = 0
#     total = 0
    
# #     a = test_pipe_dataset.__getitem__(4)
# #     print("hey")
# #     print(a)
# #     print("yo")
#     for images, objlists in test_pipe_loader:   #presently only 1 image
       
#         for image in images:
            
#             objlists[0] = np.array(objlists[0])
#             print(objlists[0][0].tuple())
#             print("poop")
#             print(objlists[0].tolist())
#             #print(type())
#             print(objlists[0].numpy.tolist())
#             imshow(torchvision.utils.make_grid(image))
#             image_tensor = image
#             image = revert_transform(image)
#             image.save("image/" + "test" + ".png","PNG")
#             sz = image.size
#             area_set = sliding_window(sz)
#             win_batch = list()
#             output_area_map = list()
#             area_batch = list()
#             area_predictions = torch.LongTensor()
#             area_score = torch.LongTensor()
#             for area in area_set:
#                 #win_batch = list()
#                 im = image.crop(area)
#                 im = composed_transform(im)
#                 win_batch = win_batch + [im]
#                 #area_batch = area_batch + [area]
#                 if(len(win_batch)%window_batch_size == 0 or area == area_set[-1]):
#                     windows = torch.cat(win_batch)
#                     windows = windows.view(-1, 3, resnet_input, resnet_input)
#                     del win_batch[:]
#                     windows = Variable(windows)
#                     if(use_gpu):
#                         windows = windows.cuda()
#                     outputs = resnet18(windows)
#                     #print(outputs.cpu())
#                     #print(torch.nn.functional.softmax(outputs.data))
#                     #probability, predicted = torch.max(torch.nn.functional.softmax(outputs.data).data, 1)
#                     probability, predicted = torch.max(outputs.data, 1)
#                     #print(probability)
#                     backs = (probability < window_threshold)
#                     predicted[backs] = 0 # background.
#                     #print("baba baba")
#                     #print(type(probability.cpu().long()))
#                     #print(type(predicted.cpu()))
#                     #print(type(area_predictions))
#                     area_predictions = torch.cat([area_predictions, predicted.cpu()])
#                     area_score = torch.cat([area_score, probability.cpu().long()])
#                     #print("preditctions:")
#                     #print(predicted)
#             class_areas = {}
#             for i in range(len(area_predictions)):
#                 if area_predictions[i] == 0:
#                     continue
#                 if area_predictions[i] in class_areas:
#                     class_areas[area_predictions[i]] = class_areas[area_predictions[i]] + [(area_set[i], area_score[i])]
#                 else:
#                     class_areas[area_predictions[i]] = [(area_set[i], area_score[i])]
#             print(class_areas)
#             for key in class_areas:
                
#                 pick = non_maximum_supression( np.array([i[0] for i in class_areas[key]]) )
#                 class_areas[key] = np.array(class_areas[key])
#                 print(class_areas[key])
#                 print(pick)
#                 class_areas[key] = class_areas[key][pick]
                
#             print(class_areas)
#             for key in class_areas:
#                 print("class is:")
#                 print(key)
#                 for area, score in class_areas[key]:
#                     #img_crp = image.crop
#                     image.crop(area).save("image/"+str(area)+ "_"+ str(score)+ "_" + str(key) + ".png","PNG")
            
# #             for (ar, cl) in objlists[0]:
# #                 image.crop(ar).save("image/" + "truth_" + str(ar) + "_" + str(cl) + ".png", "PNG")
            
#             ans_ap = find_class_ap(class_areas, objlists[0].numpy.tolist())
#             print(ans_ap)
#             break
#         break
#             #image_windows = Variable(cropped_images)
#             #if(use_gpu):
#             #    image_windows = image_windows.cuda()
#             #outputs = resnet18(image_windows)
            

            
#             #_, predicted = torch.max(outputs.data, 1)
#             #total += labels.size(0)
#             #correct += (predicted.cpu() == labels.cpu()).sum()

In [62]:
temp = {}
def test(resnet18):
    # Write loops for testing the model on the test set
    # You should also print out the accuracy of the model
    correct = 0
    total = 0
    
    count = 0
    num_of_test_samples = len(test_pipe_dataset)
    #for images, objlist in test_pipe_loader:   #presently only 1 image   
    #for image in images:
    class_bin = np.zeros(num_of_classes) + epsilon
    class_map = np.zeros(num_of_classes)
    for idx in range(num_of_test_samples):
        idx = random.randint(0,num_of_test_samples)
        os.system('mkdir image/images_' + str(idx) + '/' )
        image , objlist = test_pipe_dataset.__getitem__(idx)
        image.save("image/images_" + str(idx) + '/' + "test" + ".png","PNG")
        sz = image.size
        area_set = sliding_window(sz)
        win_batch = list()
        output_area_map = list()
        area_batch = list()
        area_predictions = torch.LongTensor()
        area_score = torch.LongTensor()
        for area in area_set:
            im = image.crop(area)
            im = composed_transform(im)
            win_batch = win_batch + [im]
            if(len(win_batch)%window_batch_size == 0 or area == area_set[-1]):
                windows = torch.cat(win_batch)
                windows = windows.view(-1, 3, resnet_input, resnet_input)
                del win_batch[:]   #free up space
                windows = Variable(windows)
                if(use_gpu):
                    windows = windows.cuda()
                outputs = resnet18(windows)
                score, predicted = torch.max(outputs.data, 1)
                backs = (score < window_threshold)
                predicted[backs] = 0 # background.
                area_predictions = torch.cat([area_predictions, predicted.cpu()])
                area_score = torch.cat([area_score, score.cpu().long()])
        class_areas = {}
        for i in range(len(area_predictions)):
            if area_predictions[i] == 0:
                continue
            if area_predictions[i] in class_areas:
                class_areas[area_predictions[i]] = class_areas[area_predictions[i]] + [(area_set[i], area_score[i])]
            else:
                class_areas[area_predictions[i]] = [(area_set[i], area_score[i])]
        #print(class_areas)
        for key in class_areas:

            pick = non_maximum_supression( np.array([i[0] for i in class_areas[key]]) )
            class_areas[key] = np.array(class_areas[key])
            #print(class_areas[key])
            #print(pick)
            class_areas[key] = class_areas[key][pick]

        #print(class_areas)
        
        for key in class_areas:
            #print("class is:")
            #print(key)
            for area, score in class_areas[key]:
                #img_crp = image.crop
                image.crop(area).save("image/images_" + str(idx) + '/'+str(area)+ "_"+ str(score)+ "_" + str(classes[key]) + ".png","PNG")

        for (ar, cl) in objlist:
            image.crop(ar).save("image/images_" + str(idx) + '/' + "truth_" + str(ar) + "_" + str(classes[cl]) + ".png", "PNG")
        temp = class_areas
        ans_ap, cls_bin = find_class_ap(class_areas, np.array(objlist))
        class_bin += cls_bin
        class_map += ans_ap
        
        print("count = ")
        print(count)
        print("ans_ap: for one image")
        print(ans_ap)
        #print(cls_bin)
        print("class map till this point , count is: ")
        print(class_map)
        #print("class_bin :")
        #print(class_bin)
        count+=1
        if(count%10 == 0):
            break
    class_map = class_map / class_bin
    print("final class map is")
    print(class_map)
    print("final class bin is")
    print(class_bin)
        #break
            

In [63]:
%time test(resnet18)


[100] [164] [16400] [23275] 20301
[101] [178] [17978] [81090] 20301
count = 
0
ans_ap: for one image
[ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.]
class map till this point , count is: 
[ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.]
[0] [0] [0] [6930] 10201
[0] [70] [0] [6930] 20301
[20] [42] [840] [6930] 20301
[94] [141] [13254] [17750] 20301
count = 
1
ans_ap: for one image
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.
  0.  0.  0.]
class map till this point , count is: 
[ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.
  0.  0.  0.]
[  0 101] [201 201] [    0 20301] [37599 29106] 20301
[151  45] [201 201] [30351  9045] [37599 29106] 40401
[ 0 82] [201 198] [    0 16236] [37599 29106] 40401
[32 85] [156 138] [ 4992 11730] [37599 29106] 20301
count = 
2
ans_ap: for one image
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.
  0.  0. 