# Import COCO dataset and REFER assets

In [None]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

# Put the Drive folders on the import path, in the same order as before
# (presumably this is where the `refer` module imported below lives -- confirm).
import sys
sys.path.extend([
    '/content/gdrive/MyDrive/coco',
    '/content/gdrive/MyDrive',
])

from refer import REFER
import numpy as np
import skimage.io as sio
import matplotlib.pyplot as plt
import os
from PIL import Image

# Root folder handed to REFER, plus the dataset / split-scheme selection.
data_root = '/content/gdrive/MyDrive/coco'  # contains refclef, refcoco, refcoco+, refcocog and images
dataset, splitBy = 'refcoco', 'unc'
refer = REFER(data_root, dataset, splitBy)

# Report how many expressions, refs and images were loaded.
print('dataset [%s_%s] contains: ' % (dataset, splitBy))
print('%s expressions for %s refs in %s images.' % (
    len(refer.Sents),
    len(refer.getRefIds()),
    len(refer.getImgIds()),
))

# Ref ids for the train, eval ('val') and test splits, fetched in that order.
TRAIN_IDS, EVAL_IDS, TEST_IDS = [
    refer.getRefIds(split=split_name) for split_name in ('train', 'val', 'test')
]

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
loading dataset refcoco into memory...
creating index...
index created.
DONE (t=13.38s)
dataset [refcoco_unc] contains: 
142210 expressions for 50000 refs in 19994 images.


# Import CLIP assets and model

In [None]:
# For CLIP model
!pip install sentence-transformers --quiet

import sentence_transformers
from sentence_transformers import SentenceTransformer, util
import glob
import torch
import pickle
import zipfile
from IPython.display import display
from IPython.display import Image as IPImage
from collections import defaultdict
import tqdm.notebook as tq
from io import BytesIO
#torch.set_num_threads(4)

from torch import cuda
# Use the GPU when torch reports one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# Load the CLIP checkpoint and move it to the chosen device.
# The same model is used below to embed both images and text.
CLIP_model = SentenceTransformer('clip-ViT-B-32')
CLIP_model = CLIP_model.to(device)

cuda


  "Argument interpolation should be of type InterpolationMode instead of int. "


# Compute and Gather Training Data



## 1. Compute CLIP embeddings of COCO images
* Stored in a dictionary that has ImageId as key and list of features as the value.

In [None]:
def get_clip_embedding(raw_input):
    """Encode `raw_input` with the global `CLIP_model` and return a tensor.

    Callers in this notebook pass either a PIL image or a raw sentence string;
    both go through the same `encode` call with `convert_to_tensor=True`.
    """
    return CLIP_model.encode(raw_input, convert_to_tensor=True)

def compute_img_embeddings(img_diction, ref_split):
  """Append the CLIP embedding of each ref's image to `img_diction`.

  For every ref id in `ref_split`, the ref's image is looked up and one
  embedding is appended to `img_diction[image_id]`, so an image referenced by
  k refs ends up with k entries (same as before).

  Changes relative to the previous version:
    * The image file is opened in a `with` block so its file handle is closed
      after encoding instead of leaking one handle per ref.
    * Each image is encoded only once per call; refs that share an image reuse
      the already computed embedding (the very same tensor object is appended
      again) instead of re-running the model on an identical input.

  Args:
    img_diction: Mapping image_id -> list of embeddings. It is indexed with
      `[]` and appended to, so it should create missing lists on access
      (callers in this notebook pass `defaultdict(list)`).
    ref_split: Iterable of ref ids (keys of `refer.Refs`).

  Returns:
    None. `img_diction` is filled in place.
  """
  # image_id -> embedding computed during this call.
  encoded = {}

  for ref_id in tq.tqdm(ref_split):
    img_id = refer.Refs[ref_id]['image_id']

    if img_id not in encoded:
      full_path = refer.IMAGE_DIR + '/' + refer.Imgs[img_id]['file_name']
      # PIL loads pixel data lazily, so the encode call must happen while the
      # file is still open; the context manager closes it right afterwards.
      with Image.open(full_path) as img:
        encoded[img_id] = get_clip_embedding(img)

    img_diction[img_id].append(encoded[img_id])

  return None

In [None]:
# TRAIN IMG FEATURES ~ 4 hours

# ImgId to list of features that have been computed
#train_img_feats = defaultdict(list)

#compute_img_embeddings(train_img_feats, TRAIN_IDS)
# Save coco train image embeddings to file
#with open('train_img_embeddings.pickle', 'wb') as handle:
 # pickle.dump(train_img_feats, handle)

# Open
#with open('/content/gdrive/MyDrive/train_img_embeddings.pickle', 'rb') as handle:
 # test = pickle.load(handle)

In [None]:
# TEST IMG FEATURES

# image_id -> list of CLIP image embeddings for the test split.
test_img_feats = defaultdict(list)
compute_img_embeddings(test_img_feats, TEST_IDS)

# Persist the test-split image embeddings to Drive.
with open('/content/gdrive/MyDrive/embeddings/test_img_embeddings.pickle', 'wb') as out_file:
  pickle.dump(test_img_feats, out_file)

HBox(children=(FloatProgress(value=0.0, max=3785.0), HTML(value='')))




In [None]:
# EVAL IMG FEATURES

# image_id -> list of CLIP image embeddings for the eval split.
eval_img_feats = defaultdict(list)
compute_img_embeddings(eval_img_feats, EVAL_IDS)

# Persist the eval-split image embeddings to Drive.
with open('/content/gdrive/MyDrive/embeddings/eval_img_embeddings.pickle', 'wb') as out_file:
  pickle.dump(eval_img_feats, out_file)

HBox(children=(FloatProgress(value=0.0, max=3811.0), HTML(value='')))




## 2. Compute CLIP embeddings of COCO annotations
* Stored in a dictionary that has AnnId as key and list of features as the value.

In [None]:
def _positional_features(bb, iw, ih):
  """Build the (1, 7) positional-feature array for one bounding box.

  Args:
    bb: Bounding box as (x, y, w, h), in the same pixel units as `iw` / `ih`.
    iw: Image width.
    ih: Image height.

  Returns:
    numpy array of shape (1, 7) holding, in order: relative x1, relative y1,
    relative x2, relative y2, box area / image area, image side ratio iw / ih,
    and the distance between box centre and image centre divided by the
    image half-diagonal.
  """
  x, y, w, h = bb
  # Image centre and box centre.
  cx, cy = iw / 2, ih / 2
  bcx, bcy = x + w / 2, y + h / 2
  distance = np.sqrt((bcx - cx)**2 + (bcy - cy)**2) / np.sqrt(cx**2 + cy**2)
  # TODO: Reshape pos_feats into a flat numpy array
  return np.array([
      x / iw,               # x1, relative
      y / ih,               # y1, relative
      (x + w) / iw,         # x2, relative
      (y + h) / ih,         # y2, relative
      (w * h) / (iw * ih),  # area
      iw / ih,              # ratio of image sides (= orientation)
      distance,             # distance from centre (normalised)
  ]).reshape(1, 7)


def get_annotation_embeddings(ann_embeddings, ref_split, show=False):
  """Embed every annotated region of each image reachable from `ref_split`.

  Each distinct image is processed once. For every annotation of that image,
  the bounding box is cropped out, converted to RGB, round-tripped through an
  in-memory JPEG and encoded with `get_clip_embedding`. Two items are then
  appended to `ann_embeddings[ann_id]`, in this order: the crop embedding and
  the (1, 7) positional-feature array from `_positional_features`.

  Skipped silently (as before): images whose pixel array has fewer than three
  dimensions, and empty crops.

  Changes relative to the previous version:
    * `except Exception` replaces the bare `except`, so KeyboardInterrupt /
      SystemExit are no longer swallowed and a long run can be interrupted.
    * Errors are isolated per annotation. Previously one failing annotation
      abandoned all remaining annotations of that image, and because the image
      was already marked as processed they were never retried.
    * Crops are tested with `sub.size == 0`, which also catches zero-width
      boxes (`len(sub)` only looked at the number of rows).
    * The number of skipped items and the first error are printed at the end
      instead of being dropped without a trace.

  Args:
    ann_embeddings: Mapping ann_id -> list. It is indexed with `[]` and
      appended to, so it should create missing lists on access (callers in
      this notebook pass `defaultdict(list)`).
    ref_split: Iterable of ref ids (keys of `refer.Refs`).
    show: When True, display each crop with matplotlib.

  Returns:
    None. `ann_embeddings` is filled in place.
  """
  processed_img_ids = set()
  skipped = 0         # refs / annotations dropped because of an error
  first_error = None  # kept only for the summary message

  for ref_id in tq.tqdm(ref_split):
    # Image-level work: a failure here skips the whole ref, as before.
    try:
      img_id = refer.Refs[ref_id]['image_id']

      # Handle each image once, no matter how many refs point at it.
      if img_id in processed_img_ids:
        continue
      processed_img_ids.add(img_id)

      main_img = refer.Imgs[img_id]
      main_img_data = sio.imread(refer.IMAGE_DIR + '/' + main_img['file_name'])

      # Arrays with fewer than three dimensions are not processed.
      if len(main_img_data.shape) < 3:
        continue

      ih, iw, _ = main_img_data.shape

      # All annotations for the current image id.
      img_anns = refer.imgToAnns[img_id]
    except Exception as exc:
      skipped += 1
      if first_error is None:
        first_error = exc
      continue

    # Annotation-level work: a failure only drops that one annotation.
    for curr_ann in img_anns:
      try:
        ann_id = curr_ann['id']
        bb = refer.Anns[ann_id]['bbox']
        bbox = [int(b) for b in bb]

        # Pull the sub-image out of the main image (rows = y, columns = x).
        sub = main_img_data[bbox[1]:bbox[1]+bbox[3], bbox[0]:bbox[0]+bbox[2]]
        if sub.size == 0:
          continue

        pos_feats = _positional_features(bb, iw, ih)
        rgb_im = Image.fromarray(sub).convert("RGB")

        # Round-trip the crop through an in-memory JPEG before encoding it.
        with BytesIO() as f:
          rgb_im.save(f, format='JPEG')
          f.seek(0)
          newjpg = Image.open(f)
          subimg_embedding = get_clip_embedding(newjpg)

          if show:
            plt.figure()
            ax = plt.gca()
            ax.imshow(newjpg)
            plt.show()

        # Add features: embedding first, positional features second.
        ann_embeddings[ann_id].append(subimg_embedding)
        ann_embeddings[ann_id].append(pos_feats)
      except Exception as exc:
        skipped += 1
        if first_error is None:
          first_error = exc
        continue

  if skipped:
    print('get_annotation_embeddings: skipped %d item(s) due to errors; first error: %r'
          % (skipped, first_error))
  return None

In [None]:
# TRAIN ANN EMBEDDINGS (original timing note: 2:28 hrs, min)
# ann_id -> [crop embedding, positional features] for the train split.
train_ann_embeddings = defaultdict(list)
get_annotation_embeddings(train_ann_embeddings, TRAIN_IDS)

# Persist the train-split annotation embeddings to Drive.
with open('/content/gdrive/MyDrive/embeddings/train_ann_embeddingsII.pickle', 'wb') as out_file:
  pickle.dump(train_ann_embeddings, out_file)

In [None]:
# TEST ANN EMBEDDINGS
# ann_id -> [crop embedding, positional features] for the test split.
test_ann_emb = defaultdict(list)
get_annotation_embeddings(test_ann_emb, TEST_IDS)

# Persist the test-split annotation embeddings to Drive.
with open('/content/gdrive/MyDrive/embeddings/test_ann_embeddings.pickle', 'wb') as out_file:
  pickle.dump(test_ann_emb, out_file)

HBox(children=(FloatProgress(value=0.0, max=3785.0), HTML(value='')))




In [None]:
# EVAL ANN EMBEDDINGS
# ann_id -> [crop embedding, positional features] for the eval split.
eval_ann_emb = defaultdict(list)
get_annotation_embeddings(eval_ann_emb, EVAL_IDS)

# Persist the eval-split annotation embeddings to Drive.
with open('/content/gdrive/MyDrive/embeddings/eval_ann_embeddings.pickle', 'wb') as out_file:
  pickle.dump(eval_ann_emb, out_file)

HBox(children=(FloatProgress(value=0.0, max=3811.0), HTML(value='')))




## 3. Compute CLIP embeddings of COCO referring expressions
* Stored in a dictionary that has SentId (`sent_id`) as key and list of features as the value.

In [None]:
def compute_sentence_embeddings(sent_diction, ref_split):
  """Append a CLIP embedding for every sentence of every ref in `ref_split`.

  For each ref, each entry of its 'sentences' list is encoded from its 'raw'
  text, and the embedding is appended to `sent_diction[sent_id]`.

  Args:
    sent_diction: Mapping sent_id -> list of embeddings. It is indexed with
      `[]` and appended to, so it should create missing lists on access
      (callers in this notebook pass `defaultdict(list)`).
    ref_split: Iterable of ref ids (keys of `refer.Refs`).

  Returns:
    None. `sent_diction` is filled in place.
  """
  progress = tq.tqdm(ref_split)
  for rid in progress:
    sentences = refer.Refs[rid]['sentences']
    for entry in sentences:
      # Encode first, then touch the dictionary, so a failed encode does not
      # leave an empty list behind for that sentence id.
      embedding = get_clip_embedding(entry['raw'])
      sent_diction[entry['sent_id']].append(embedding)
  return None

In [None]:
# TEST SENT
# sent_id -> list of CLIP text embeddings for the test split.
test_sent_embeddings = defaultdict(list)
compute_sentence_embeddings(test_sent_embeddings, TEST_IDS)

# Persist the test-split sentence embeddings to Drive.
with open('/content/gdrive/MyDrive/embeddings/test_sentence_embeddings.pickle', 'wb') as out_file:
  pickle.dump(test_sent_embeddings, out_file)

HBox(children=(FloatProgress(value=0.0, max=3785.0), HTML(value='')))




In [None]:
# EVAL SENT
# sent_id -> list of CLIP text embeddings for the eval split.
eval_sent_emb = defaultdict(list)
compute_sentence_embeddings(eval_sent_emb, EVAL_IDS)

# Persist the eval-split sentence embeddings to Drive.
with open('/content/gdrive/MyDrive/embeddings/eval_sentence_embeddings.pickle', 'wb') as out_file:
  pickle.dump(eval_sent_emb, out_file)

HBox(children=(FloatProgress(value=0.0, max=3811.0), HTML(value='')))




In [None]:
# Sanity check: number of distinct sentence ids that received an embedding in the test split.
len(test_sent_embeddings)

10752