# Evaluation for SVT

Replicate Fig. 9 in IJCV paper Jaderberg et al. (2016)

## 1. Get Ready for SVT-50, SVT test dataset with fixed lexicon

In [1]:
import sys
caffe_root = '../'  # this file should be run from {caffe_root}/examples (otherwise change this line)

import os
if os.path.isdir('./testsets/SVT-50'):
    print 'File found.'
else:
    print 'SVT-50 file not found'

File found.


* Parse XML

In [31]:
import xml.etree.ElementTree as ET
# Parse the SVT-50 test annotations; `root` is the top-level element whose
# children are iterated when the data list is built.
tree = ET.parse('./testsets/SVT-50/test.xml')
root = tree.getroot()

# Uncomment the loop below to print the XML contents.


#for child in root:
#    for child1 in child:
#        print child1.tag, child1.text
#        for child2 in child1:
#            print child2.tag, child2.attrib
#            for child3 in child2:
#                print child3.text


* Define the `Lexicon` class that holds one record of the data list

In [32]:
class Lexicon:
    """Annotation record for one image, optionally narrowed to one tagged rectangle.

    The constructor fills the image-level fields; ``rectangle`` and
    ``rectpoint`` add the geometry and ground truth of a single tagged
    rectangle; ``withoutlex`` / ``withlex`` store prediction results.
    """

    def __init__(self, xml):
        """Read the image-level fields from an image element.

        ``xml`` must have ``imageName``, ``address``, ``lex`` and
        ``Resolution`` children. The lexicon text is split on commas; the
        resolution attributes are stored exactly as read (strings, or None
        when the attribute is absent).
        """
        self.imageName = xml.find('imageName').text
        self.address = xml.find('address').text
        self.lex = xml.find('lex').text.split(",")
        resolution = xml.find('Resolution')
        self.image_width = resolution.get('x')
        self.image_height = resolution.get('y')

    def rectangle(self, rect):
        """Store the integer geometry and the ground-truth tag of one rectangle element."""
        self.rect_width = int(rect.get('width'))
        self.rect_height = int(rect.get('height'))
        self.rect_x = int(rect.get('x'))
        self.rect_y = int(rect.get('y'))
        self.gt = rect.find('tag').text

    def rectpoint(self):
        """Derive the crop corners from the stored rectangle.

        The left-top corner is clamped to be non-negative; the right-bottom
        corner is the unclamped origin plus the rectangle size.
        """
        self.rect_lt_x = max(self.rect_x, 0)
        self.rect_lt_y = max(self.rect_y, 0)
        self.rect_rb_x = self.rect_x + self.rect_width
        self.rect_rb_y = self.rect_y + self.rect_height

    def withoutlex(self, label):
        """Remember the label predicted without a lexicon."""
        self.label = label

    def withlex(self, label):
        """Remember the lexicon-constrained label and the size of the lexicon."""
        self.lexlabel = label
        self.lexnum = len(self.lex)

    def set_image_number(self, number):
        """Attach the caller-supplied running number to this record."""
        self.image_number = number

* Create datalist

In [33]:
import copy
# Build one Lexicon record per tagged word rectangle found in the XML.
dataList = []
# NOTE: despite its name, image_number is a running index over word
# rectangles (it is incremented once per rectangle), not over images.
image_number = 0
for child in root:
    # Image-level fields are parsed once and reused for every rectangle of the image.
    imageInfo = Lexicon(child)
    for child1 in child.find('taggedRectangles'):
        # Shallow copy: each record gets its own rectangle fields but shares
        # the image-level attribute objects (e.g. the lex list) with its siblings.
        rectInfo = copy.copy(imageInfo)
        rectInfo.rectangle(child1)
        rectInfo.rectpoint()    
        rectInfo.set_image_number(image_number)
        dataList.append(rectInfo)
        image_number += 1


## 2. Get Ready for Models

* First, set up Python, `numpy`, and `matplotlib`.

In [34]:
# set up Python environment: numpy for numerical routines, and matplotlib for plotting
import numpy as np
import matplotlib.pyplot as plt
import time
# display plots in this notebook
%matplotlib inline

# set display defaults, grouped by rc section
plt.rc('figure', figsize=(10, 10))    # large images
plt.rc('image',
       interpolation='nearest',       # don't interpolate: show square pixels
       cmap='gray')                   # use grayscale output rather than a (potentially misleading) color heatmap

* Load `caffe`.

In [35]:
# The caffe module needs to be on the Python path;
#  we'll add it here explicitly.
import sys
caffe_root = '../'  # this file should be run from {caffe_root}/examples (otherwise change this line)
# Put {caffe_root}python at the front of sys.path so it is searched first.
sys.path.insert(0, caffe_root + 'python')

import caffe
# If you get "No module named _caffe", either you have not built pycaffe or you have the wrong path.

* Assume you have the vgg_dictnet.

In [36]:
import os
if os.path.isfile('models/DICTNET-VGG/dictnet_vgg_mtoc.caffemodel'):
    print 'VGG Models found.'
else:
    print 'Models not found.'

VGG Models found.


* Load net and set up input preprocessing

* Set Caffe to CPU or GPU mode and load the net from disk.

In [37]:
# Run on the GPU; swap the two mode lines below to run on the CPU instead.
caffe.set_device(0)  # if we have multiple GPUs, pick the first one
caffe.set_mode_gpu()
#caffe.set_mode_cpu()

# Network definition and trained weights, relative to the working directory.
model_def = 'models/dictnet_vgg_deploy.prototxt'
model_weights = 'models/DICTNET-VGG/dictnet_vgg_mtoc.caffemodel'

net = caffe.Net(model_def,      # defines the structure of the model
                model_weights,  # contains the trained weights
                caffe.TEST)     # use test mode (e.g., don't perform dropout)

## 3. Classification

* Now we're ready to perform classification. Each cropped word image is classified on its own, so we use a batch size of 1.

In [38]:
# create transformer for the input called 'data'
transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
# Show the shape of the input blob the transformer was built for.
print net.blobs['data'].data.shape


# Scale raw image values by 255.0 (presumably the loaded images are in [0, 1] -- confirm).
transformer.set_raw_scale('data', 255.0)
transformer.set_transpose('data', (2,0,1))  # move image channels to outermost dimension

(1, 1, 32, 100)


In [39]:
# Fix the input blob to (batch size 1, 1 gray channel, height 32, width 100).
# This can be skipped when the default shape is acceptable, and changed
# later, e.g. for different batch sizes.
net.blobs['data'].reshape(1, 1, 32, 100)

In [40]:
# Print the number of records in dataList, i.e. how many predictions will be made.
print len(dataList)

647


In [41]:
def debug_model_info(net):
    for layer_name, param in net.params.iteritems():
        print 'layer:', layer_name, " ", str(param[0].data.shape)
    
#print net.layers[1].blobs[0].na,e
debug_model_info(net)

layer: conv1   (64, 1, 5, 5)
layer: conv2   (128, 64, 5, 5)
layer: conv3   (256, 128, 3, 3)
layer: conv3_5   (512, 256, 3, 3)
layer: conv4   (512, 512, 3, 3)
layer: fc1   (4096, 512, 4, 13)
layer: fc2   (4096, 4096, 1, 1)
layer: fc_class   (88172, 4096, 1, 1)


* Perform classification

In [42]:
def classify(data,number=0):
    """Run the network on the word crop of each record and collect the outputs.

    Uses the notebook-level ``net`` and ``transformer`` objects together with
    the ``caffe``, ``np`` and ``copy`` modules.

    Args:
        data: indexable sequence of records exposing ``imageName`` and the
            crop corners ``rect_lt_x``, ``rect_lt_y``, ``rect_rb_x``,
            ``rect_rb_y``.
        number: how many leading records to classify; 0 (the default) means
            all of them. Values larger than ``len(data)`` are clamped, so an
            oversized request no longer raises IndexError after every
            available record has already been processed.

    Returns:
        A list with one deep-copied ``net.forward()`` result per classified
        record, in the order of ``data``.
    """
    if number == 0:
        length = len(data)
    else:
        length = min(number, len(data))

    output = []
    for itera in range(length):
        record = data[itera]

        # Image paths in the records are relative to the SVT-50 directory.
        img_path = './testsets/SVT-50/' + record.imageName

        # Load the full image (False: presumably requests a single-channel
        # load, matching the 1-channel network input -- confirm) and cut out
        # the word rectangle as rows [top:bottom], columns [left:right].
        image = caffe.io.load_image(img_path, False)
        image_rectangle = image[record.rect_lt_y:record.rect_rb_y,
                                record.rect_lt_x:record.rect_rb_x]

        # Apply the transformer, then remove the crop's own mean.
        transformed_image = transformer.preprocess('data', image_rectangle)
        transformed_image -= np.mean(transformed_image)

        # copy the image data into the memory allocated for the net
        net.blobs['data'].data[...] = transformed_image

        # Deep copy so the stored result cannot alias anything a later
        # forward pass overwrites.
        output.append(copy.deepcopy(net.forward()))

    return output

In [43]:
# Classify every record in dataList (passing 0 means "all records"); the
# trailing comma keeps "done" on the same output line.
print "classifying......",
output = classify(dataList,0)
print "done"
#print output[10]['prob'][0].argmax()

classifying...... done


* Accuracy with lexicon and without lexicon

* Load the DictNet class labels (the vocabulary words in `dictnet_vgg_labels.txt`)

In [44]:
# Label file for the network outputs; `labels` is indexed by class id in the
# evaluation cells below.
labels_file = './dictnet_vgg_labels.txt'
# This check only warns: the load below is still attempted when the file is missing.
if not os.path.exists(labels_file):
    print 'label file does not exist'

# Read the file as an array of strings, using tab as the field delimiter.
labels = np.loadtxt(labels_file, str, delimiter='\t')

* Without lexicon. A hit means the label with the highest probability over all classes equals the ground truth.

In [45]:
import itertools
def lexoff(out,data):
    wiou_lex_hit = 0
    wiou_lex_non_hit = 0
    non_hit_case = []
    
    for output,datalist in itertools.izip(out,data):
    
        output_prob = output['prob'][0]  # the output probability vector for the first image in the batch

        #print 'predicted class is:', output_prob.argmax()
        #print 'predicted prob is:', output_prob.max()
        #print 'output label:', labels[output_prob.argmax()]
        #print 'grount truth:', datalist.gt.lower(),'\n'
        datalist.withoutlex(labels[output_prob.argmax()])
        
        if labels[output_prob.argmax()] == datalist.gt.lower():
            wiou_lex_hit += 1
        else:
            wiou_lex_non_hit +=1
            non_hit_case.append(datalist.image_number)
            
    print 'hit:',wiou_lex_hit,' not hit:',wiou_lex_non_hit
    
    wiou_accuracy = wiou_lex_hit / float(wiou_lex_hit+wiou_lex_non_hit)
    print 'accuracy: ', wiou_accuracy
    print 'non_hit_case: ', non_hit_case
    

In [46]:
# Evaluate without a lexicon: prints hits, misses, accuracy and the numbers of the missed records.
lexoff(output,dataList)

hit: 531  not hit: 116
accuracy:  0.820710973725
non_hit_case:  [11, 12, 20, 25, 32, 41, 45, 59, 60, 63, 91, 92, 93, 94, 95, 96, 112, 113, 114, 117, 126, 135, 138, 147, 156, 162, 164, 167, 172, 186, 193, 201, 202, 204, 205, 206, 213, 219, 220, 231, 234, 238, 241, 248, 256, 271, 277, 283, 305, 318, 320, 327, 334, 336, 343, 344, 346, 350, 352, 358, 366, 368, 372, 377, 379, 380, 390, 398, 399, 400, 401, 402, 404, 406, 407, 409, 429, 435, 451, 455, 472, 473, 478, 486, 487, 488, 490, 491, 492, 502, 505, 507, 509, 515, 521, 537, 544, 553, 558, 559, 567, 571, 582, 588, 589, 601, 607, 608, 610, 612, 618, 623, 626, 633, 635, 637]


* With lexicon. A hit means the label with the highest probability among the words of the image's fixed lexicon equals the ground truth.

In [47]:
import itertools
def lexon(out,data):
    with_lex_hit = 0
    with_lex_non_hit = 0
    gt_in_lexicons = 0
    non_hit_case = []
    
    for output,datalist in itertools.izip(out,data):
        
        gt_in_lexicons_case = 0
        
        output_prob = output['prob'][0]  # the output probability vector for the first image in the batch
        max_prob = 0
        # read prob of every lexicon
        for lexi in datalist.lex:
            count = 0
            if lexi.lower() == datalist.gt.lower():
                gt_in_lexicons_case = 1

            # check lexicon in dictnet_vgg_labels and prob
            for index in labels:
                if index == lexi.lower(): 
                    number = count
                    if  output_prob[number][0][0] > max_prob:
                        max_lexi = lexi.lower()
                        max_prob = output_prob[number][0][0]
                        max_arg = index
                        max_class = number  
                count += 1
        
        #print 'total lexicons number: ',len(datalist.lex)        
        #print 'class: ', max_class 
        #print 'prob: ', max_prob
        #print 'class label: ',  max_arg
        #print 'grount truth: ', datalist.gt.lower(),'\n'
        datalist.withlex(max_arg)
        
        if max_arg == datalist.gt.lower():
            with_lex_hit += 1
        else:
            with_lex_non_hit +=1
            non_hit_case.append(datalist.image_number)
        
        gt_in_lexicons += gt_in_lexicons_case
        
    print 'hit:',with_lex_hit,' not hit:',with_lex_non_hit
    print 'gt_in_lexicons: ', gt_in_lexicons
    
    with_accuracy = with_lex_hit / float(with_lex_hit+with_lex_non_hit)
    print 'accuracy: ', with_accuracy
    print 'non_hit_case: ', non_hit_case

In [48]:
# Evaluate with each image's fixed lexicon: prints hits, misses, lexicon coverage, accuracy and the missed record numbers.
lexon(output,dataList)

hit: 627  not hit: 20
gt_in_lexicons:  647
accuracy:  0.969088098918
non_hit_case:  [94, 95, 126, 205, 320, 343, 352, 366, 368, 400, 401, 407, 435, 486, 491, 582, 612, 623, 633, 635]
