In [None]:
import numpy as np
import cv2
import random
import json
import pickle
from pycocotools.coco import COCO
import matplotlib.pyplot as plt
import colorsys
from matplotlib.widgets import Button
import ipywidgets as widgets
from IPython.display import display
import time
%matplotlib notebook

In [None]:
def draw_bboxes(img, bboxes, classes, xy_is_center=True):
    """Draw one labelled rectangle per bounding box, coloured by class.

    Args:
        img: NumPy image array whose shape starts with (height, width).
        bboxes: Sequence of ``(x, y, w, h)`` boxes. ``x``/``w`` are scaled by
            the image width and ``y``/``h`` by the image height, so they are
            expected as fractions of the image size.
        classes: Class label for each box, matched to ``bboxes`` by position
            (entries beyond the shorter sequence are ignored). Labels must be
            hashable; they are rendered with ``str()``.
        xy_is_center: If True, ``(x, y)`` is the box centre; otherwise it is
            the box's top-left corner.

    Returns:
        The image as returned by the last OpenCV drawing call, or ``img``
        itself when there is nothing to draw.
    """
    # NumPy images are indexed (row, column, ...) == (height, width, ...).
    img_h, img_w = img.shape[:2]

    # One colour per distinct class. dict.fromkeys keeps first-seen order, so
    # the class -> colour mapping is stable between runs (set order is not).
    class_to_index = {label: i for i, label in enumerate(dict.fromkeys(classes))}
    n_classes = len(class_to_index)
    palette = [
        colorsys.hsv_to_rgb(i / n_classes, 0.5, 0.5) for i in range(n_classes)
    ]
    # colorsys yields floats in [0, 1]. That range suits float images, but on
    # integer images it collapses to (almost) black, so rescale to 0-255.
    if np.issubdtype(img.dtype, np.integer):
        palette = [
            tuple(int(round(255 * channel)) for channel in color)
            for color in palette
        ]

    # Keep lines at least 1 px thick and text visible on small images
    # (a font scale of 0 draws nothing).
    thickness = max(img_w // 256, 1)
    font_scale = max(img_h // 512, 0.5)
    text_offset = int(font_scale * 20)

    for (x, y, w, h), label in zip(bboxes, classes):
        color = palette[class_to_index[label]]
        if xy_is_center:
            start = int(img_w * (x - w / 2)), int(img_h * (y - h / 2))
            end = int(img_w * (x + w / 2)), int(img_h * (y + h / 2))
        else:
            start = int(img_w * x), int(img_h * y)
            end = int(img_w * (x + w)), int(img_h * (y + h))
        img = cv2.rectangle(img, start, end, color, thickness)

        # Shift the label down so it sits inside the top edge of the box.
        text_origin = (start[0], start[1] + text_offset)
        img = cv2.putText(
            img, str(label), text_origin, cv2.FONT_HERSHEY_SIMPLEX, font_scale, color
        )
    return img

In [None]:
# ---------------------------------------------------------------------------
# Model configuration (encoder-decoder)
# ---------------------------------------------------------------------------

# Size of the feature embeddings output by the encoder and consumed by the
# decoder (depends on the pretrained model used for encoding).
HIDDEN_SIZE = 768

# Temperature of the temperature-softmax used when sampling xy.
TEMPERATURE = 0.4

# NOTE(review): presumably the index of the GPU device to run on; confirm
# against the code that consumes it.
GPU_INDEX = 1

# ---------------------------------------------------------------------------
# Dataset hyperparameters
# ---------------------------------------------------------------------------

IMAGE_SIZE = (256, 256)

# Caption selection: True -> use one caption, False -> use all the captions.
UQ_CAP = True

# Maximum number of objects to use from the dataset.
MAX_OBJECTS = 10

# Normalize the pictures to the range [0, 1].
NORMALIZE_INPUT = True

# Use attention in the decoder.
USE_ATTENTION = False

# Size of the grid used over the picture to approximate the bounding boxes.
XY_DISTRIBUTION_SIZE = 32

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------

# Epoch to resume training from. The STARTING_EPOCH-th model, i.e. the model
# with index STARTING_EPOCH - 1, is loaded from CHECKPOINTS_PATH. With a value
# of 0 or less, no epoch is loaded.
STARTING_EPOCH = 80

# Number of epochs to train.
EPOCHS = 30

# Print information about the model every n steps.
PRINT_EVERY = 500

# Mode switch: True -> training mode, False -> validation mode.
IS_TRAINING = False

# Path where the epochs and average losses are saved.
CHECKPOINTS_PATH = "./checkpoints/1"

# Use the pretrained encoder.
PRETRAINED_ENCODER = False

# Freeze the weights of the encoder.
FREEZE_ENCODER = False

# Path of the pretrained encoder.
ENCODER_PATH = None

LEARNING_RATE = 5e-5

In [None]:
# Sentence encoder; FREEZE_ENCODER is the config flag for freezing its weights.
encoder = SentenceEncoder(FREEZE_ENCODER)

# Build the decoder from the configuration constants (HIDDEN_SIZE, IS_TRAINING,
# USE_ATTENTION, XY_DISTRIBUTION_SIZE, TEMPERATURE) instead of lowercase names
# that no visible cell defines, so the cell does not depend on leftover kernel
# state for those values.
# NOTE(review): `vocab` and `bidirectional` have no counterpart among the
# configuration constants; they must be defined by an earlier cell -- confirm.
decoder = DecoderRNN(
    vocab,
    HIDDEN_SIZE,
    IS_TRAINING,
    use_attention=USE_ATTENTION,
    bidirectional=bidirectional,
    xy_distribution_size=XY_DISTRIBUTION_SIZE,
    temperature=TEMPERATURE,
)