# Importing Libraries


In [None]:
# Mount Google Drive so the files under /content/gdrive used below are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import sys
sys.path.append('/content/gdrive/MyDrive/coco')
sys.path.append('/content/gdrive/MyDrive')

from refer import REFER
import numpy as np
import skimage.io as sio
import matplotlib.pyplot as plt
import os
from PIL import Image

In [None]:
# Load the referring-expression dataset index (about 12 seconds on a high-RAM GPU runtime).
data_root = '/content/gdrive/MyDrive/coco'  # contains refclef, refcoco, refcoco+, refcocog and images
dataset, splitBy = 'refcoco', 'unc'
refer = REFER(data_root, dataset, splitBy)

# Report how many expressions, refs and images were loaded.
summary_counts = (len(refer.Sents), len(refer.getRefIds()), len(refer.getImgIds()))
print ('dataset [%s_%s] contains: ' % (dataset, splitBy))
print ('%s expressions for %s refs in %s images.' % summary_counts)

loading dataset refcoco into memory...
creating index...
index created.
DONE (t=14.39s)
dataset [refcoco_unc] contains: 
142210 expressions for 50000 refs in 19994 images.


In [None]:
# Ref-id lists for the train, validation and test splits (in that order).
TRAIN_IDS, EVAL_IDS, TEST_IDS = (
    refer.getRefIds(split=split_name) for split_name in ('train', 'val', 'test')
)

In [None]:
# For CLIP model and cosine function
!pip install sentence-transformers --quiet

[K     |████████████████████████████████| 81kB 5.5MB/s 
[K     |████████████████████████████████| 2.1MB 9.1MB/s 
[K     |████████████████████████████████| 1.2MB 50.3MB/s 
[K     |████████████████████████████████| 3.3MB 52.0MB/s 
[K     |████████████████████████████████| 901kB 51.2MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [None]:
import sentence_transformers
from sentence_transformers import SentenceTransformer, util
import glob
import torch
import pickle
import zipfile
from collections import defaultdict
import tqdm.notebook as tq
from io import BytesIO

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
# NOTE(review): this cell repeats the device selection of the preceding cell. The recorded
# outputs differ ('cuda' there, 'cpu' here), so the two were presumably run on different
# runtimes — consider keeping only one of them.
import torch
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cpu'

# Importing Embedded Features

In [None]:
# 1 min, 53 seconds

def _read_pickle(path):
  """Unpickle and return the object stored in the file at `path`."""
  with open(path, 'rb') as handle:
    return pickle.load(handle)

# COCO train main image embeddings
main_img_emb = _read_pickle('/content/gdrive/MyDrive/embeddings/train_img_embeddings.pickle')

# COCO train set phrase embeddings
sentence_emb = _read_pickle('/content/gdrive/MyDrive/embeddings/train_sentence_embeddings.pickle')

# COCO train annotation features
ann_features = _read_pickle('/content/gdrive/MyDrive/embeddings/train_ann_embeddings.pickle')

In [None]:
# NOTE(review): check_cuda is not defined in any cell visible in this notebook; unless it is
# provided elsewhere this call raises NameError on a fresh kernel — TODO restore or remove.
check_cuda()

1497088

In [None]:
def compute_cosine(eOne, eTwo):
    """Return the cosine-similarity scores between `eOne` and `eTwo`.

    Thin wrapper around `util.cos_sim`; the result is returned unchanged.
    """
    return util.cos_sim(eOne, eTwo)

# Generating Positive Samples from Feature Engineering

In [None]:
def create_positive_samples(ref_split, main_img_features, ann_img_features, sentence_features):
  """Build one positive sample for every sentence of every ref in `ref_split`.

  For each ref, the image embedding, the embedding and positional features of the
  ref's own annotation, and the embeddings of its sentences are looked up. Each
  sentence yields one 1-D sample: the concatenation of image embedding, annotation
  embedding, sentence embedding, flattened positional features, image/sentence
  cosine and annotation/sentence cosine (in that order).

  Args:
    ref_split: iterable of ref ids (keys of `refer.Refs`).
    main_img_features: mapping image_id -> sequence whose element 0 is the image embedding.
    ann_img_features: mapping ann_id -> sequence whose element 0 is the annotation
      embedding and element 1 the positional features (a numpy array).
    sentence_features: mapping sent_id -> sequence whose element 0 is the sentence embedding.

  Returns:
    The samples stacked into one tensor (one row per sample), moved to `device`.

  Refs whose image or annotation features are missing are skipped and counted; the
  counts are printed once the loop finishes. Sentences without an embedding are
  skipped silently.
  """
  # A lookup "fails" when the id is absent or the stored entry cannot be indexed.
  # Catching only these (instead of a bare except) keeps KeyboardInterrupt and
  # genuine bugs visible during this long-running loop.
  lookup_errors = (KeyError, IndexError, TypeError)

  all_samples = []

  img_fails = 0
  ann_fails = 0
  ann_pos_fails = 0

  for ref_id in tq.tqdm(ref_split):
    curr_ref = refer.Refs[ref_id]

    img_id = curr_ref['image_id']
    ann_id = curr_ref['ann_id']
    sent_ids = curr_ref['sent_ids']

    # Extract features from the appropriate dictionaries
    try:
      img_emb = main_img_features[img_id][0]
    except lookup_errors:
      img_fails += 1
      continue

    try:
      ann_emb = ann_img_features[ann_id][0]
    except lookup_errors:
      ann_fails += 1
      continue
    try:
      ann_pos_feat = ann_img_features[ann_id][1]
    except lookup_errors:
      ann_pos_fails += 1
      continue

    # Create tensor from annotation positional features
    ann_pos_t = torch.flatten(torch.from_numpy(ann_pos_feat)).to(device)

    sent_embs = []
    for s_id in sent_ids:
      try:
        sent_embs.append(sentence_features[s_id][0])
      except lookup_errors:
        # Sentence embedding not found. Skip this sentence
        continue

    # Now, for each sentence embedding, generate a positive sample. Compute cosines and concat all feature tensors
    for sent_embedding in sent_embs:
      img_cos = torch.flatten(compute_cosine(img_emb, sent_embedding))
      ann_cos = torch.flatten(compute_cosine(ann_emb, sent_embedding))

      # All 6 features now ready to be joined into one tensor
      current_sample = torch.cat((img_emb, ann_emb, sent_embedding, ann_pos_t, img_cos, ann_cos), 0).to(device)
      all_samples.append(current_sample)

  print(f"Image fails: {img_fails}\nAnn fails: {ann_fails}\nAnn position fails: {ann_pos_fails}")

  # Stack all sample tensors into one tensor of N x num_features
  return torch.stack(all_samples).to(device)

# Creating positive samples feature and target labels

In [None]:
# Generating all positive samples took about 1 minute; ~120k samples created.
# NOTE(review): the name `test` is rebound further down to the random negative samples.
test = create_positive_samples(TRAIN_IDS, main_img_emb, ann_features, sentence_emb)
test.shape # torch.Size([120264, 1545])

HBox(children=(FloatProgress(value=0.0, max=42404.0), HTML(value='')))


Image fails: 0
Ann fails: 126
Ann position fails: 0


torch.Size([120264, 1545])

In [None]:
# Creating label tensor for positive samples: one label of 1.0 per row of the
# positive-sample tensor. The count is derived from the tensor itself so the labels
# cannot drift out of sync with the samples (120264 rows in the recorded run).
positive_labels = torch.ones(test.shape[0])
positive_labels.shape

torch.Size([1202645])

In [None]:
# Save the positive-sample tensor (as returned by create_positive_samples) to Drive
torch.save(test, '/content/gdrive/MyDrive/Training_Tensors/positive_samples.pt')

In [None]:
# Loading positive samples from file. 7 seconds
posFromFile = torch.load('/content/gdrive/MyDrive/Training_Tensors/positive_samples.pt')

In [None]:
# Load the positive-sample tensor, make sure it lives on the CPU, and save that copy.
# 29 seconds
# map_location='cpu' lets a tensor that was saved from the GPU be loaded on a runtime
# without CUDA as well.
positive_t = torch.load('/content/gdrive/MyDrive/Training_Tensors/positive_samples.pt', map_location='cpu')

positive_cpu = positive_t.to('cpu')

# Save the CPU tensor to the cpu_tensors folder
torch.save(positive_cpu, '/content/gdrive/MyDrive/cpu_tensors/positive_samples.pt')

# Creating Related Negative Samples

In [None]:
# Split the training ref ids into two halves so they can be processed as separate batches

train_ref_ids = refer.getRefIds(split='train')
mid = len(train_ref_ids) // 2
batch_one, batch_two = train_ref_ids[:mid], train_ref_ids[mid:]

print(f"batch_one size: {len(batch_one)}\nbatch_two size: {len(batch_two)}")

batch_one size: 21202
batch_two size: 21202


In [None]:
def related_negative_samples(ref_split, main_img_features, ann_img_features, sentence_features):
  """Build "related" negative samples: sentences paired with the wrong annotation of their own image.

  For every ref in `ref_split`, each annotation of the ref's image other than the
  ref's own annotation is combined with each of the ref's sentence embeddings.
  A sample is the 1-D concatenation of image embedding, annotation embedding,
  sentence embedding, flattened positional features, image/sentence cosine and
  annotation/sentence cosine (in that order).

  Args:
    ref_split: iterable of ref ids (keys of `refer.Refs`).
    main_img_features: mapping image_id -> sequence whose element 0 is the image embedding.
    ann_img_features: mapping ann_id -> sequence whose element 0 is the annotation
      embedding and element 1 the positional features (a numpy array).
    sentence_features: mapping sent_id -> sequence whose element 0 is the sentence embedding.

  Returns:
    The samples stacked into one tensor (one row per sample).

  Missing image features skip the whole ref, missing annotation features skip that
  annotation; both are counted and the counts printed at the end. Sentences without
  an embedding are skipped silently.
  """
  # A lookup "fails" when the id is absent or the stored entry cannot be indexed.
  # Catching only these (instead of a bare except) keeps KeyboardInterrupt and
  # genuine bugs visible during this long-running loop.
  lookup_errors = (KeyError, IndexError, TypeError)

  all_samples = []

  img_fails = 0
  ann_fails = 0
  ann_pos_fails = 0

  for ref_id in tq.tqdm(ref_split):
    curr_ref = refer.Refs[ref_id]

    img_id = curr_ref['image_id']
    matching_ann_id = curr_ref['ann_id']
    sent_ids = curr_ref['sent_ids']

    # Get main image embedding
    try:
      img_emb = main_img_features[img_id][0]
    except lookup_errors:
      img_fails += 1
      continue

    # Load embeddings for sentences tied to current reference
    sent_embs = []
    for s_id in sent_ids:
      try:
        sent_embs.append(sentence_features[s_id][0])
      except lookup_errors:
        # Sentence embedding not found. Skip this sentence
        continue

    # Get all annotations for current image_id
    img_anns = refer.imgToAnns[img_id]

    for related_ann in img_anns:
      current_ann_id = related_ann['id']

      # Don't include positive sample
      if current_ann_id == matching_ann_id:
        continue

      # Attempt to pull out ann embedding
      try:
        ann_emb = ann_img_features[current_ann_id][0]
      except lookup_errors:
        ann_fails += 1
        continue

      try:
        ann_pos_feat = ann_img_features[current_ann_id][1]
      except lookup_errors:
        ann_pos_fails += 1
        continue

      # Create tensor from annotation positional features
      ann_pos_t = torch.flatten(torch.from_numpy(ann_pos_feat)).to(device)

      # Generate a negative sample for each sentence embedding.
      for sent_embedding in sent_embs:
        img_cos = torch.flatten(compute_cosine(img_emb, sent_embedding))
        ann_cos = torch.flatten(compute_cosine(ann_emb, sent_embedding))

        # All 6 features now ready to be joined into one tensor
        current_sample = torch.cat((img_emb, ann_emb, sent_embedding, ann_pos_t, img_cos, ann_cos), 0)
        all_samples.append(current_sample)

  print(f"Image fails: {img_fails}\nAnn fails: {ann_fails}\nAnn position fails: {ann_pos_fails}")

  # Stack all sample tensors into one tensor of N x num_features
  return torch.stack(all_samples)

### Create two batches of related negative samples

In [None]:
# Batch One. 13 minutes
# Uses related_negative_samples (the function defined above) and binds the result to
# the same name that is printed and saved below.
neg_samples_rel_one = related_negative_samples(batch_one, main_img_emb, ann_features, sentence_emb)
print(neg_samples_rel_one.shape) # torch.Size([578495, 1545])

# Saving related negative samples to file. 1min 27 s
torch.save(neg_samples_rel_one, '/content/gdrive/MyDrive/Training_Tensors/negative_related_batch_one.pt')

HBox(children=(FloatProgress(value=0.0, max=21202.0), HTML(value='')))


Image fails: 0
Ann fails: 1192
Ann position fails: 0


torch.Size([578495, 1545])

In [None]:
# Batch Two: related negative samples for the second half of the training refs
#torch.cuda.empty_cache()

neg_samples_rel_two = related_negative_samples(batch_two, main_img_emb, ann_features, sentence_emb)
print(neg_samples_rel_two.shape) # torch.Size([567127, 1545])

# Save sample tensor to file
torch.save(neg_samples_rel_two, '/content/gdrive/MyDrive/Training_Tensors/negative_related_batch_two.pt')

HBox(children=(FloatProgress(value=0.0, max=21202.0), HTML(value='')))


Image fails: 0
Ann fails: 843
Ann position fails: 0
torch.Size([567127, 1545])


In [None]:
# Load the batch-one tensor, make sure it lives on the CPU, and save that copy.
# map_location='cpu' lets a tensor that was saved from the GPU be loaded on a runtime
# without CUDA as well.

neg_batch_one = torch.load('/content/gdrive/MyDrive/Training_Tensors/negative_related_batch_one.pt', map_location='cpu')

neg_batch_one_cpu = neg_batch_one.to('cpu')

# Save the CPU tensor to the cpu_tensors folder
torch.save(neg_batch_one_cpu, '/content/gdrive/MyDrive/cpu_tensors/negative_related_batch_one.pt')

In [None]:
# Load the batch-two tensor, make sure it lives on the CPU, and save that copy.
# map_location='cpu' lets a tensor that was saved from the GPU be loaded on a runtime
# without CUDA as well.

neg_batch_two = torch.load('/content/gdrive/MyDrive/Training_Tensors/negative_related_batch_two.pt', map_location='cpu')

neg_batch_two_cpu = neg_batch_two.to('cpu')

# Save the CPU tensor to the cpu_tensors folder
torch.save(neg_batch_two_cpu, '/content/gdrive/MyDrive/cpu_tensors/negative_related_batch_two.pt')

# Create Random Negative Samples

In [None]:
def get_random_sample(ref_split, main_img_features, ann_img_features, curr_img_id, sample_count, sent_embedding):
  """Draw `sample_count` random negative samples for a single sentence embedding.

  Each sample pairs `sent_embedding` with the first annotation of a randomly chosen
  training image other than `curr_img_id`. A sample is the 1-D concatenation of image
  embedding, annotation embedding, sentence embedding, flattened positional features,
  image/sentence cosine and annotation/sentence cosine (in that order).

  Args:
    ref_split: accepted for call compatibility; not used (images are always drawn
      from the global `TRAIN_IDS`).
    main_img_features: mapping image_id -> sequence whose element 0 is the image embedding.
    ann_img_features: mapping ann_id -> sequence whose element 0 is the annotation
      embedding and element 1 the positional features (a numpy array).
    curr_img_id: image id that must not be drawn.
    sample_count: number of samples to return.
    sent_embedding: sentence embedding to pair with the random annotations.

  Returns:
    A list of `sample_count` sample tensors.

  Draws whose image or annotation features are unavailable are retried with a new
  random image, so the loop only ends once enough valid samples were built.
  """
  all_samples = []

  # Pool of image ids to draw from. It is materialised as a list because the random
  # module needs a sequence: sampling directly from a set was deprecated in Python 3.9
  # and raises TypeError from 3.11 on. Sorting keeps draws reproducible under a seed.
  train_img_ids = sorted(set(refer.getImgIds(TRAIN_IDS)))

  while len(all_samples) < sample_count:
    # Get a random image that's not the current image
    rand_img = random.choice(train_img_ids)
    while curr_img_id == rand_img:
      rand_img = random.choice(train_img_ids)

    # Get the first annotation of the random image and its features. Only lookup
    # failures trigger a retry, so KeyboardInterrupt can still stop the loop.
    try:
      ann_id = refer.imgToAnns[rand_img][0]['id']
      ann_emb = ann_img_features[ann_id][0]
      ann_pos_feat = ann_img_features[ann_id][1]
      img_emb = main_img_features[rand_img][0]
    except (KeyError, IndexError, TypeError):
      continue

    # Compute sample tensor and append to list
    ann_pos_t = torch.flatten(torch.from_numpy(ann_pos_feat)).to(device)
    img_cos = torch.flatten(compute_cosine(img_emb, sent_embedding))
    ann_cos = torch.flatten(compute_cosine(ann_emb, sent_embedding))

    # All 6 features now ready to be joined into one tensor
    current_sample = torch.cat((img_emb, ann_emb, sent_embedding, ann_pos_t, img_cos, ann_cos), 0)
    all_samples.append(current_sample)

  return all_samples

In [None]:
import random

def random_negative_samples(ref_split, main_img_features, ann_img_features, sentence_features, ratio):
  """Build random negative samples: `ratio` per sentence of every ref in `ref_split`.

  Args:
    ref_split: iterable of ref ids (keys of `refer.Refs`).
    main_img_features: image-feature mapping, passed through to `get_random_sample`.
    ann_img_features: annotation-feature mapping, passed through to `get_random_sample`.
    sentence_features: mapping sent_id -> sequence whose element 0 is the sentence embedding.
    ratio: number of random negatives to draw for each sentence.

  Returns:
    The samples stacked into one tensor (one row per sample).

  Sentences without an embedding are skipped silently.
  """
  all_samples = []

  for ref_id in tq.tqdm(ref_split):

    curr_ref = refer.Refs[ref_id]
    curr_img_id = curr_ref['image_id']

    # Load embeddings for sentences tied to current reference
    sent_embs = []
    for s_id in curr_ref['sent_ids']:
      try:
        sent_embs.append(sentence_features[s_id][0])
      except (KeyError, IndexError, TypeError):
        # Sentence embedding not found. Skip this sentence
        continue

    # For each sentence in ref, get ratio number of random negative samples for that sentence
    for sent_embedding in sent_embs:
      curr_samples = get_random_sample(ref_split, main_img_features, ann_img_features, curr_img_id, ratio, sent_embedding)
      # extend() appends in place; rebuilding the list with `+` on every iteration
      # would copy all previously collected samples each time.
      all_samples.extend(curr_samples)

  return torch.stack(all_samples)

In [None]:
# Two random negative samples per sentence (ratio=2).
# NOTE(review): this rebinds `test`, which earlier held the positive samples.
test = random_negative_samples(TRAIN_IDS, main_img_emb, ann_features, sentence_emb, 2)
test.shape

HBox(children=(FloatProgress(value=0.0, max=42404.0), HTML(value='')))




torch.Size([241248, 1545])

In [None]:
# Save the random negative samples (as returned by random_negative_samples) to Drive

torch.save(test, '/content/gdrive/MyDrive/Training_Tensors/negative_random.pt')

In [None]:
# CPU copy of the random negative samples
negative_random = test.to('cpu') # 56.38 (presumably the recorded run time — TODO confirm)

In [None]:
# Display the tensor to eyeball its values and dtype
negative_random

tensor([[ 0.1825, -0.0498,  0.1209,  ...,  0.2689,  0.1613,  0.1632],
        [-0.4963, -0.2756,  0.0073,  ...,  0.1969,  0.1791,  0.1613],
        [ 0.0104, -0.0083, -0.2167,  ...,  0.0634,  0.1826,  0.1926],
        ...,
        [-0.6372, -0.3318,  0.1387,  ...,  0.2103,  0.1704,  0.1693],
        [-0.2534,  0.1180, -0.1149,  ...,  0.4029,  0.2049,  0.1772],
        [ 0.1938,  0.2064, -0.0591,  ...,  0.5591,  0.1623,  0.1868]],
       dtype=torch.float64)

In [None]:
# Save the CPU copy of the random negative samples to the cpu_tensors folder
torch.save(negative_random, '/content/gdrive/MyDrive/cpu_tensors/negative_random.pt')

In [None]:
# Loading the random negative tensor from file. 7 seconds
neg_file = torch.load('/content/gdrive/MyDrive/cpu_tensors/negative_random.pt')

# Stacking all training tensors and training labels to cpu

In [None]:
# Load the CPU copies of the positive and related-negative tensors.
# The random negatives (neg_three) are currently left out.
pos = torch.load('/content/gdrive/MyDrive/cpu_tensors/positive_samples.pt')
neg_one = torch.load('/content/gdrive/MyDrive/cpu_tensors/negative_related_batch_one.pt')
neg_two = torch.load('/content/gdrive/MyDrive/cpu_tensors/negative_related_batch_two.pt')
#neg_three = torch.load('/content/gdrive/MyDrive/cpu_tensors/negative_random.pt')

In [None]:
# Concatenating the positive samples and both related-negative batches along the row dimension
# (positives first, then batch one, then batch two):
all_samples = torch.cat([pos, neg_one, neg_two], dim=0)

In [None]:
# Save the combined sample tensor to the cpu_tensors folder
torch.save(all_samples, '/content/gdrive/MyDrive/cpu_tensors/all_samples.pt')

In [None]:
# Creating label vectors: one label of 1.0 per positive sample. The count is taken
# from the loaded positive tensor so labels and samples always have matching lengths.
positive_labels = torch.ones(pos.shape[0])
positive_labels.shape

# Structure of a ref

In [None]:
50000
{'sent_ids': [100272, 100273, 100274],
 'file_name': 'COCO_train2014_000000173056_1.jpg',
 'ann_id': 275551,
 'ref_id': 35254,
 'image_id': 173056,
 'split': 'train',
 'sentences': [{'tokens': ['white', 'brown', 'sheep', 'right'],
   'raw': 'white brown sheep right',
   'sent_id': 100272,
   'sent': 'white brown sheep right'},
  {'tokens': ['black', 'sheep', 'on', 'right'],
   'raw': 'black sheep on right',
   'sent_id': 100273,
   'sent': 'black sheep on right'},
  {'tokens': ['sheep', 'on', 'the', 'right'],
   'raw': 'sheep on the right',
   'sent_id': 100274,
   'sent': 'sheep on the right'}],
 'category_id': 20}

{'ann_id': 275551,
 'category_id': 20,
 'file_name': 'COCO_train2014_000000173056_1.jpg',
 'image_id': 173056,
 'ref_id': 35254,
 'sent_ids': [100272, 100273, 100274],
 'sentences': [{'raw': 'white brown sheep right',
   'sent': 'white brown sheep right',
   'sent_id': 100272,
   'tokens': ['white', 'brown', 'sheep', 'right']},
  {'raw': 'black sheep on right',
   'sent': 'black sheep on right',
   'sent_id': 100273,
   'tokens': ['black', 'sheep', 'on', 'right']},
  {'raw': 'sheep on the right',
   'sent': 'sheep on the right',
   'sent_id': 100274,
   'tokens': ['sheep', 'on', 'the', 'right']}],
 'split': 'train'}

# Testing Embeddings and Cosine function. Ensuring embeddings are valid

In [None]:
# Sanity check that image embedding still works: pick one image and build its file path
test_img_id = 581857
test_img = refer.Imgs[test_img_id]

img_path = '/'.join((refer.IMAGE_DIR, test_img['file_name']))
img_path

In [None]:
# Open the test image and render it on a fresh figure
img_j = Image.open(img_path)

ax = plt.figure().gca()
ax.imshow(img_j)
plt.show()

In [None]:
# CLIP embeds both images and text into the same vector space; load it onto `device`.
CLIP_model = SentenceTransformer('clip-ViT-B-32').to(device)

def get_clip_embedding(raw_input):
    """Encode `raw_input` (text or an image) with the CLIP model and return the embedding as a tensor."""
    return CLIP_model.encode(raw_input, convert_to_tensor=True)

HBox(children=(FloatProgress(value=0.0, max=327977311.0), HTML(value='')))




  "Argument interpolation should be of type InterpolationMode instead of int. "


In [None]:
# Testing the cosine helper on two text embeddings and checking the result shapes
pe1 = get_clip_embedding("Man in black jacket")
pe2 = get_clip_embedding("Person wearing a coat")

test_cos = compute_cosine(pe1, pe2)

# Flatten the score so it can be concatenated with other 1-D feature tensors
cos_flat = torch.flatten(test_cos)
print("Cos vector contents:", test_cos)

print("OG cos vector shape: " , test_cos.shape)
print("Flattend cos vector shape:", cos_flat.shape)
cos_flat

Cos vector contents: tensor([[0.8354]], device='cuda:0')
OG cos vector shape:  torch.Size([1, 1])
Flattend cos vector shape: torch.Size([1])


tensor([0.8354], device='cuda:0')

In [None]:
# Compare the embedding's shape before and after flattening
print("pe1 shape:", pe1.shape)

pe_flat = torch.flatten(pe1)
print("pe1 flat shape:", pe_flat.shape)

pe1 shape: torch.Size([512])
pe1 flat shape: torch.Size([512])


In [None]:
# Getting the CLIP embedding of the test image opened above
img_emb = get_clip_embedding(img_j)

In [None]:
# Peek at one entry of the image-embedding dict: print its key and keep its embedding.
first_key = next(iter(main_img_emb), None)  # None when the dict is empty

test_embed = None
if first_key is not None:
  print(first_key)
  test_embed = main_img_emb[first_key][0]

test_embed

In [None]:
# Compare a free-text phrase against the test image embedding
phrase = "People at a library"

pe = get_clip_embedding(phrase)
#pe.to(device)
compute_cosine(pe, img_emb)