# Exploration of the MMT-Retrieval repo

In [1]:
import os
HOME_DIR = "/home/ec2-user/SageMaker"
os.chdir(f"{HOME_DIR}/MMT-Retrieval/")

In [2]:
# %pip (unlike !pip) installs into the environment of the running kernel
%pip install -r requirements.txt

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
# %pip (unlike !pip) installs into the environment of the running kernel
%pip install anytree

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
from mmt_retrieval.model.models import OSCAR, UNITER, M3P, ClassificationHead, Pooling
from mmt_retrieval import MultimodalTransformer

# Loading pre-trained Model

## M3P

In [6]:
# Load pre-trained model M3P
path_model_m3p = os.path.join(HOME_DIR, "model_m3p/0_M3P")
pretrained_model = M3P(model_path = path_model_m3p)

2021-08-08 13:50:02 - Lock 139649952230760 acquired on /home/ec2-user/.cache/huggingface/transformers/87683eb92ea383b0475fecf99970e950a03c9ff5e51648d6eee56fb754612465.ab95cf27f9419a99cce4f19d09e655aba382a2bafe2fe26d0cc24c18cf1a1af6.lock


Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

2021-08-08 13:50:03 - Lock 139649952230760 released on /home/ec2-user/.cache/huggingface/transformers/87683eb92ea383b0475fecf99970e950a03c9ff5e51648d6eee56fb754612465.ab95cf27f9419a99cce4f19d09e655aba382a2bafe2fe26d0cc24c18cf1a1af6.lock
2021-08-08 13:50:03 - Lock 139649952230760 acquired on /home/ec2-user/.cache/huggingface/transformers/9df9ae4442348b73950203b63d1b8ed2d18eba68921872aee0c3a9d05b9673c6.00628a9eeb8baf4080d44a0abe9fe8057893de20c7cb6e6423cddbf452f7d4d8.lock


Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

2021-08-08 13:50:04 - Lock 139649952230760 released on /home/ec2-user/.cache/huggingface/transformers/9df9ae4442348b73950203b63d1b8ed2d18eba68921872aee0c3a9d05b9673c6.00628a9eeb8baf4080d44a0abe9fe8057893de20c7cb6e6423cddbf452f7d4d8.lock
2021-08-08 13:50:04 - Lock 139649952230760 acquired on /home/ec2-user/.cache/huggingface/transformers/daeda8d936162ca65fe6dd158ecce1d8cb56c17d89b78ab86be1558eaef1d76a.a984cf52fc87644bd4a2165f1e07e0ac880272c1e82d648b4674907056912bd7.lock


Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

2021-08-08 13:50:05 - Lock 139649952230760 released on /home/ec2-user/.cache/huggingface/transformers/daeda8d936162ca65fe6dd158ecce1d8cb56c17d89b78ab86be1558eaef1d76a.a984cf52fc87644bd4a2165f1e07e0ac880272c1e82d648b4674907056912bd7.lock


In [8]:
pretrained_model.tokenize("Pizza con champiñones")

[78011, 158, 91714, 35984, 1444]

In [21]:
pretrained_model.tokenize("Pizza con jamón dulche")

[78011, 158, 79, 52960, 24532, 1430]

### Load image features

In [168]:
from mmt_retrieval.data.image_embeddings import ImageDict

In [169]:
imagedict = ImageDict()

In [170]:
%%time
# Build the feature-file path from HOME_DIR instead of repeating the absolute prefix.
# NOTE(review): topk presumably caps how many images are read from the file — confirm in ImageDict.load_obj_tsv.
imagedict.load_obj_tsv(
    fname=os.path.join(
        HOME_DIR,
        "1_data/flickr30k_features/tiny/train_flickr30k_resnet101_faster_rcnn_genome.tsv.1"
    ),
    topk=10
)

CPU times: user 38.2 ms, sys: 67 µs, total: 38.2 ms
Wall time: 35 ms


In [174]:
value = imagedict["1213185795"]

In [175]:
imagedict["1213185795"]

OrderedDict([('img_id', '1213185795'),
             ('img_w', 500),
             ('img_h', 333),
             ('num_boxes', 39),
             ('boxes',
              array([[178.77786 ,   0.      , 499.5     , 189.06422 ],
                     [286.2556  ,  91.49405 , 409.02844 , 224.29764 ],
                     [  0.      ,  17.339499, 211.28693 , 317.61197 ],
                     [  6.189648,  34.51717 , 108.51078 , 292.2112  ],
                     [319.35147 , 286.56503 , 466.34363 , 332.445   ],
                     [323.2654  ,  30.65785 , 477.2827  ,  78.58263 ],
                     [263.98987 , 235.54553 , 499.5     , 332.445   ],
                     [  0.      ,   0.      , 316.1118  ,  54.290844],
                     [203.84828 , 190.30222 , 499.5     , 277.82635 ],
                     [229.45714 , 221.64037 , 275.42932 , 300.42758 ],
                     [119.19319 ,   0.      , 427.99127 , 332.445   ],
                     [ 42.994564,   9.276467, 231.20543 , 332.445  

In [176]:
# coordinates of boxes
imagedict["1213185795"]["boxes"].shape

(39, 4)

In [177]:
# boxes features
imagedict["1213185795"]["features"].shape

(39, 2048)

In [178]:
len(imagedict)

10

# Run train test

See the application notebook in `MMT-Retrieval/examples/applications/Image_Search.ipynb`. \
**See the code in `examples/experiments/run_train_test.py`.**

In [5]:
!pwd

/home/ec2-user/SageMaker/MMT-Retrieval


In [6]:
import torch
import yaml
import numpy as np
import random
from datetime import datetime

In [7]:
# Read the experiment config; the context manager closes the file handle once parsing is done.
with open("examples/experiments/super_config.yaml") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

In [9]:
# Model and data paths, all derived from HOME_DIR so that relocating the workspace means editing one constant
DATA_DIR = os.path.join(HOME_DIR, "1_data")
path_model_m3p = os.path.join(HOME_DIR, "model_m3p/0_M3P")
path_image_feature_folder = os.path.join(DATA_DIR, "flickr30k_features/tiny")
path_flickr30k_original_folder = os.path.join(DATA_DIR, "flickr30k_entities")
path_flickr_split_folder = os.path.join(DATA_DIR, "flickr30k_split_folder")

In [18]:
# ######################################

#   IMPORTANT!! modify config paths here

# ######################################
# Manual modifications
config["model_path"] = "david-test"

# model name
config["model"]["name"] = "m3p"

# Model path of the pretrained weights
config["model"]["model_path"] = path_model_m3p
config["model"]["pretrained_model_path"] = path_model_m3p

# Path of the flickr features downloaded from  
# https://drive.google.com/uc?export=download&id=11OD_qq7ITBarJwWZfi0bWIRw3HPEaHwE 
#(source: https://github.com/jnhwkim/ban-vqa/blob/master/tools/download_flickr.sh)
config["data"]["image_feature_folder"] = path_image_feature_folder
config["data"]["flickr30k_original_folder"] = path_flickr30k_original_folder
config["data"]["flickr_split_folder"] = path_flickr_split_folder

In [19]:
# Seed
torch.manual_seed(config["seed"])
np.random.seed(config["seed"])
random.seed(config["seed"])

In [20]:
# Model config
model_config = config["model"]

# Creating folders
model_folder_name = f"{model_config['name']}-{datetime.now().strftime('%Y-%m-%d_%H-%M')}"
model_save_path = os.path.join(config["model_path"], model_folder_name)
os.makedirs(model_save_path, exist_ok=True)

In [21]:
# <TODO: CHANGE HERE> Put the path of our own model


### Train

In [22]:
# Indicate to Train
config.get("do_train", True)

'Train'

#### Output of "build_model"

In [23]:
model_config

{'model_path': '/home/ec2-user/SageMaker/model_m3p/0_M3P',
 'model_path_cross': 'YOUR_CROSS_ENCODER',
 'model_path_embedding': 'YOUR_EMBEDDING_MODEL',
 'name': 'm3p',
 'pretrained_model_path': '/home/ec2-user/SageMaker/model_m3p/0_M3P',
 'max_seq_length': 70,
 'max_image_seq_len': 50,
 'input_key': 'pooled_cls_token_embeddings',
 'classifier_type': 'linear',
 'scaling_factor': 1}

In [24]:
# Build model
model = build_model(model_config)

2021-08-08 16:03:29 - Use pytorch device: cpu


In [25]:
model.tokenize("Coca-cola")

[72625, 9, 21854]

In [26]:
model.tokenize("Hamburguesa con queso y patatas")

[33112, 21049, 11, 158, 41, 991, 113, 33156, 1073]

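#### Deep dive into "build_model"

Note: `build_model` is defined in the next cell but is already called in the "Output" section above. On a fresh kernel, run the definition cell first.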

In [15]:
def build_model(model_config):
    """
    Assemble a MultimodalTransformer out of an embedding backbone, a classification head and a pooling layer.

    :param model_config: dict-like model section of the experiment config. "name" selects the backbone by
        substring ("oscar", "m3p", "villa" / "uniter"); "pretrained_model_path" is passed to the backbone.
        Optional keys read with defaults: "max_seq_length", "max_image_seq_len", "half_layers",
        "input_key", "classifier_type", "scaling_factor", "mean", "cls", "max".
    :return: the assembled MultimodalTransformer, or None when "legacy" occurs in the name
        (this function has no branch that builds legacy models)
    :raises ValueError: if the name matches none of the supported backbones
    """
    model_name = model_config["name"]
    if "legacy" in model_name:
        # No legacy branch exists here; keep the previous (implicit) None result, but make it explicit.
        return None

    max_image_seq_len = model_config.get("max_image_seq_len", 50)
    if "oscar" in model_name:
        embedding_model = OSCAR(model_config["pretrained_model_path"],
                                max_seq_length=model_config.get("max_seq_length", 70),
                                max_image_seq_len=max_image_seq_len)
    elif "m3p" in model_name:
        # M3P falls back to a longer text length (128) than the other backbones (70)
        embedding_model = M3P(model_config["pretrained_model_path"],
                              max_seq_length=model_config.get("max_seq_length", 128),
                              max_image_seq_len=max_image_seq_len)
    elif "villa" in model_name or "uniter" in model_name:
        embedding_model = UNITER(model_config["pretrained_model_path"],
                                 max_seq_length=model_config.get("max_seq_length", 70),
                                 max_image_seq_len=max_image_seq_len)
    else:
        # Previously this fell through and failed later with an unbound `embedding_model`
        raise ValueError(
            f"Unsupported model name {model_name!r}: expected it to contain one of "
            "'oscar', 'm3p', 'villa' or 'uniter'"
        )

    if model_config.get("half_layers", False):
        # Keep every second encoder layer (indices 0, 2, 4, ...)
        encoder = embedding_model.auto_model.encoder
        encoder.layer = torch.nn.ModuleList(
            [layer for i, layer in enumerate(encoder.layer) if i % 2 == 0]
        )

    # Embedding width handed to both the classification head and the pooling layer
    hidden_size = 768
    class_head = ClassificationHead(2, model_config.get("input_key", "pooled_cls_token_embeddings"), hidden_size,
                                    model_config.get("classifier_type", "linear"),
                                    model_config.get("scaling_factor", 2))
    pooling_model = Pooling(hidden_size,
                            pooling_mode_mean_tokens=model_config.get("mean", True),
                            pooling_mode_cls_token=model_config.get("cls", False),
                            pooling_mode_max_tokens=model_config.get("max", False))
    return MultimodalTransformer(modules=[embedding_model, class_head, pooling_model])

In [80]:
# <TODO: DEEP DIVE INTO M3P initialization> (see model/models.py)
# Same construction as the "m3p" branch of build_model, run on its own so the module can be inspected.
embedding_model = M3P(
    model_config["pretrained_model_path"],
    max_seq_length=model_config.get("max_seq_length", 128),
    max_image_seq_len=model_config.get("max_image_seq_len", 50))

In [37]:
embedding_model

M3P(
  (auto_model): TransformerModel(
    (position_embeddings): Embedding(514, 768)
    (embeddings): Embedding(250002, 768, padding_idx=1)
    (layer_norm_emb): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (image_embeddings): BertImageEmbeddings(
      (image_embeddings): Linear(in_features=2048, out_features=768, bias=True)
      (image_distbution_embeddings): Linear(in_features=1600, out_features=768, bias=True)
      (image_location_embeddings): Linear(in_features=5, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (refine_embeddings): AoA_Refiner_Core(
      (layers): ModuleList(
        (0): AoA_Refiner_Layer(
          (self_attn): MultiHeadedDotAttention(
            (linears): ModuleList(
              (0): Linear(in_features=768, out_features=768, bias=True)
              (1): Linear(in_features=768, out_features=768, bias=True)
              (2): Linea

In [38]:
# <TODO: DEEP DIVE INTO ClassificationHead initialization> (see model/models.py)
# Same arguments as the ClassificationHead call inside build_model, run on its own so the head can be inspected.
class_head = ClassificationHead(
    2, model_config.get("input_key", "pooled_cls_token_embeddings"), 768,
    model_config.get("classifier_type", "linear"),
    model_config.get("scaling_factor", 2)
)

In [39]:
class_head

ClassificationHead(
  (_dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
)

In [40]:
model_config.get("input_key", "pooled_cls_token_embeddings")

'pooled_cls_token_embeddings'

In [41]:
model_config.get("classifier_type", "linear")

'linear'

In [42]:
model_config.get("scaling_factor", 2)

1

In [43]:
# <TODO: DEEP DIVE INTO Pooling initialization> (see model/models.py)
# Same arguments as the Pooling call inside build_model, run on its own so the layer can be inspected.
pooling_model = Pooling(
    768,
   pooling_mode_mean_tokens=model_config.get("mean", True),
   pooling_mode_cls_token=model_config.get("cls", False),
   pooling_mode_max_tokens=model_config.get("max", False)
)

In [44]:
model_config.get("cls", False)

False

In [45]:
model_config.get("mean", True)

True

In [46]:
pooling_model

Pooling()

## Task in train_config

In [27]:
from examples.experiments.process_data import get_sampler, DATA_LOADER

In [28]:
train_config = config["training"]

In [29]:
tasks = train_config["tasks"]

In [30]:
tasks

[{'name': 'flickr30k',
  'tiny': False,
  'batchsize': 128,
  'loss': {'name': 'triplet', 'margin': 0.1},
  'data_args': {'jit_loading': False, 'languages': ['en', 'de', 'cs', 'fr']}},
 {'name': 'flickr30k',
  'tiny': False,
  'batchsize': 128,
  'loss': {'name': 'joint'},
  'data_args': {'jit_loading': False, 'languages': ['en', 'de', 'cs', 'fr']}},
 {'name': 'flickr30k',
  'batchsize': 16,
  'tiny': False,
  'loss': {'name': 'ance', 'margin': 0.1},
  'data_args': {'topk': 50, 'negative_examples': 7, 'sim_batchsize': 512}},
 {'name': 'flickr30k',
  'tiny': False,
  'batchsize': 128,
  'do_hard': True,
  'loss': {'name': 'joint'},
  'data_args': {'sim_batchsize': 512, 'topk': 50, 'hard_p': 0.05}}]

In [31]:
task = tasks[0]

In [32]:
task["name"]

'flickr30k'

In [33]:
# Defining get_data
def get_data(data_config, name):
    """
    Resolve the data section that applies to one task (kept for backwards compatibility with old configs).

    :param data_config: data section of the config; may be nested under an "all" key or under task names
    :param name: task name to look up when there is no "all" entry
    :return: data_config["all"] if that key exists, otherwise data_config[name] if that key exists,
        otherwise data_config itself
    """
    # "all" takes precedence over a task-specific entry
    for candidate_key in ("all", name):
        if candidate_key in data_config:
            return data_config[candidate_key]
    # Flat config: the section itself already is the task's data config
    return data_config

In [34]:
task["name"]

'flickr30k'

In [35]:
DATA_LOADER[task["name"]]

<function examples.experiments.process_data.get_flickr30k(config, split, tiny, image_dict, **args)>

In [36]:
config["data"]

{'mscoco_folder': 'SEE_README',
 'image_feature_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_features/tiny',
 'flickr_split_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_split_folder',
 'flickr30k_original_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_entities',
 'cc_original_folder': 'SEE_README',
 'cc_feature_folder': 'SEE_README',
 'multi30k_original_folder': 'SEE_README'}

In [37]:
# Inspecting get_flickr30k function (filling arguments of get_flickr30k) <ARGUMENTS>
config_get = get_data(config["data"], task["name"])
split = "train"
tiny = task.get("tiny", False)
image_dict = model.image_dict

In [38]:
get_data(config["data"], task["name"])

{'mscoco_folder': 'SEE_README',
 'image_feature_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_features/tiny',
 'flickr_split_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_split_folder',
 'flickr30k_original_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_entities',
 'cc_original_folder': 'SEE_README',
 'cc_feature_folder': 'SEE_README',
 'multi30k_original_folder': 'SEE_README'}

In [40]:
%%time
# Look up the loader registered for this task and call it for the "train" split.
# `joint` is True only when the task's loss is named "joint"; any task-level data_args are forwarded as keywords.
load_task_data = DATA_LOADER[task["name"]]
task_data_config = get_data(config["data"], task["name"])
use_tiny_split = task.get("tiny", False)
dataset = load_task_data(
    task_data_config,
    "train",
    use_tiny_split,
    model.image_dict,
    joint=task["loss"]["name"] == "joint",
    **task.get("data_args", {})
)

2021-08-08 16:04:56 - Flickr30k Image Feature Split does not exist. Creating in /home/ec2-user/SageMaker/1_data/flickr30k_split_folder
reading  /home/ec2-user/SageMaker/1_data/flickr30k_features/tiny/test_flickr30k_resnet101_faster_rcnn_genome.tsv.3
line 0
reading  /home/ec2-user/SageMaker/1_data/flickr30k_features/tiny/train_flickr30k_resnet101_faster_rcnn_genome.tsv.1
line 0
line 500
reading  /home/ec2-user/SageMaker/1_data/flickr30k_features/tiny/train_flickr30k_resnet101_faster_rcnn_genome.tsv.2
line 0
line 500
reading  /home/ec2-user/SageMaker/1_data/flickr30k_features/tiny/val_flickr30k_resnet101_faster_rcnn_genome.tsv.3
line 0
2021-08-08 16:04:59 - Start to load Faster-RCNN detected objects from /home/ec2-user/SageMaker/1_data/flickr30k_split_folder/train_flickr30k_resnet101_faster_rcnn_genome.tsv
CPU times: user 9.96 s, sys: 1.23 s, total: 11.2 s
Wall time: 11.6 s


In [None]:
# see how this code created a folder named flickr30k_split_folder

In [42]:
dataset

<mmt_retrieval.data.datasets.ImageSentenceTripletDataset at 0x7f153b6b1e48>

In [46]:
# Caption keys
len(dataset.caption_keys)

148915

In [57]:
# Captions 
len(dataset.captions)

148915

In [47]:
dataset.caption_keys[:5]

['762678729_1', '762678729_2', '762678729_3', '762678729_4', '762678729_5']

In [60]:
dataset.captions["762678729_1"]

"a group of people , restrained by fencing , wait at the entrance to a tent ; there are signs at the entrance that read ' tent closed ' ."

In [62]:
item = 0
pos_caption = dataset.caption_keys[item]
print(pos_caption)
pos_image = dataset.caption2image[pos_caption]
print(pos_image)

762678729_1
762678729


In [63]:
# Label
label = torch.LongTensor([int(pos_image)])
label

tensor([762678729])

In [64]:
# Caption of image, paired with None as in the dataset[0] output.
# Stored under a new name so `pos_caption` keeps holding the caption key and this cell can be re-run.
pos_caption_pair = (dataset.captions[pos_caption], None)
pos_caption_pair

("a group of people , restrained by fencing , wait at the entrance to a tent ; there are signs at the entrance that read ' tent closed ' .",
 None)

In [65]:
# No tag
tag = dataset.img2tag[pos_image] if dataset.tags else None
print(tag)

None


In [66]:
print((tag, pos_image))

(None, '762678729')


In [68]:
# Testing __getitem__
dataset[0]

([[("a group of people , restrained by fencing , wait at the entrance to a tent ; there are signs at the entrance that read ' tent closed ' .",
    None)]],
 [[(None, '762678729')]],
 tensor([762678729]))

### Get sampler

In [69]:
get_sampler

<function examples.experiments.process_data.get_sampler(name, dataset, batchsize, **args)>

In [None]:
# Uses the class MultidatasetImageSentenceTripletSampler in data > datasets.py

In [70]:
task["name"]

'flickr30k'

In [76]:
task.get("batchsize")

128

In [80]:
sampler, batch_sampler, shuffle, batch_size = get_sampler(
    task["name"], 
    dataset,
    task.get("batchsize"), 
    **task.get("data_args", {}))

In [81]:
sampler

In [82]:
batch_sampler

In [83]:
shuffle

True

In [84]:
batch_size

128

### DataLoader

In [92]:
from torch.utils.data import DataLoader

In [86]:
dataset

<mmt_retrieval.data.datasets.ImageSentenceTripletDataset at 0x7f153b6b1e48>

In [87]:
shuffle

True

In [89]:
batch_sampler

In [90]:
train_config.get("num_workers", 0)

0

In [93]:
# DataLoader
dataloader = DataLoader(
    dataset, 
    shuffle=shuffle, 
    batch_size=batch_size, 
    sampler=sampler, 
    batch_sampler=batch_sampler, 
    num_workers=train_config.get("num_workers", 0))

In [94]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f159250f7b8>

### Loss definition

In [97]:
from mmt_retrieval.losses.losses import BatchHardTripletLoss

In [98]:
task["loss"]["name"] == "triplet"

True

In [101]:
task["loss"].get("margin", 0.2)

0.1

In [99]:
loss = BatchHardTripletLoss(model=model, margin=task["loss"].get("margin", 0.2))

In [102]:
training_tasks = [(dataloader, loss)]

### Optimizer

In [114]:
dev_config = train_config["dev"]

#### Evaluator

In [110]:
from mmt_retrieval.evaluation.evaluations import EmbeddingImageTextRetrievalEvaluator, CrossEncoderImageTextRetrievalEvaluator, \
    RetrieveRerankImageTextRetrievalEvaluator

In [111]:
def get_evaluator(data_config, config, model, split):
    """
    Build one image-text retrieval evaluator per task listed in config["tasks"].

    :param data_config: data section of the experiment config (resolved per task through get_data)
    :param config: evaluation config; "tasks" is required, "batchsize" is read for the cross-encoder
        and retrieve-rerank evaluators
    :param model: model whose image_dict is handed to the data loader
    :param split: split name used in the evaluator name and as the split to load, unless a task
        sets "overwrite_split"
    :return: list with one evaluator per task, in task order (empty list when there are no tasks)
    :raises ValueError: if a task names an evaluator that is not supported here
    """
    embedding_names = ("embedding_itr", "itr")
    cross_encoder_names = ("ce_itr", "joint_itr")
    rerank_names = ("rr_itr",)
    supported_names = embedding_names + cross_encoder_names + rerank_names

    split_evaluators = []
    for task in config["tasks"]:
        split_eval_name = task["evaluator"]["name"]
        # Reject unknown evaluator names before loading any data for the task
        if split_eval_name not in supported_names:
            raise ValueError(
                f"Unsupported evaluator {split_eval_name!r} for task {task['name']!r}; "
                f"expected one of {supported_names}"
            )

        load_split = task.get("overwrite_split", split)
        split_dataset = DATA_LOADER[task["name"]](get_data(data_config, task["name"]), load_split,
                                                  task.get("tiny", False), model.image_dict,
                                                  return_dict=True, **task.get("data_args", {}))
        file_name = f"{split}-{task['name']}-{split_eval_name}"
        # Positional arguments shared by all three evaluator classes
        evaluator_args = (split_dataset["images"], split_dataset["captions"],
                          split_dataset["imageid2captions"], split_dataset.get("tags", None))

        if split_eval_name in embedding_names:
            evaluator = EmbeddingImageTextRetrievalEvaluator(*evaluator_args,
                                                             name=file_name, show_progress_bar=True,
                                                             batched_sim=task.get("batched_sim", 0))
        elif split_eval_name in cross_encoder_names:
            evaluator = CrossEncoderImageTextRetrievalEvaluator(*evaluator_args,
                                                                name=file_name, batch_size=config["batchsize"],
                                                                show_progress_bar=True)
        else:
            evaluator = RetrieveRerankImageTextRetrievalEvaluator(*evaluator_args,
                                                                  name=file_name, batch_size=config["batchsize"],
                                                                  show_progress_bar=True,
                                                                  retrieve=task.get("retrieve", 10),
                                                                  scoring=task.get("scoring", "standard"),
                                                                  scoring_factor=task.get("scoring_factor", 0.5))
        # Previously every evaluator was built and then dropped, so the function always returned None
        split_evaluators.append(evaluator)

    # NOTE(review): the dev config also carries "main_score_function", which is not used here; presumably the
    # evaluators are meant to be combined into a single score — confirm against examples/experiments/run_train_test.py
    return split_evaluators

In [115]:
dev_config

{'tasks': [{'name': 'flickr30k',
   'tiny': False,
   'evaluator': {'name': 'embedding_itr'},
   'data_args': {'languages': ['en']}},
  {'name': 'flickr30k',
   'tiny': True,
   'evaluator': {'name': 'ce_itr'},
   'data_args': {'captions_per_image': 1, 'languages': ['en']}}],
 'batchsize': 512,
 'main_score_function': 'mean'}

In [113]:
config["data"]

{'mscoco_folder': 'SEE_README',
 'image_feature_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_features/tiny',
 'flickr_split_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_split_folder',
 'flickr30k_original_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_entities',
 'cc_original_folder': 'SEE_README',
 'cc_feature_folder': 'SEE_README',
 'multi30k_original_folder': 'SEE_README'}

In [112]:
dev_evaluator = get_evaluator(config["data"], dev_config, model, "dev")

2021-08-08 16:32:27 - Flickr30k Image Feature Split exists. Loading from /home/ec2-user/SageMaker/1_data/flickr30k_split_folder
2021-08-08 16:32:27 - Start to load Faster-RCNN detected objects from /home/ec2-user/SageMaker/1_data/flickr30k_split_folder/dev_flickr30k_resnet101_faster_rcnn_genome.tsv
2021-08-08 16:32:28 - Flickr30k Image Feature Split exists. Loading from /home/ec2-user/SageMaker/1_data/flickr30k_split_folder
2021-08-08 16:32:28 - Start to load Faster-RCNN detected objects from /home/ec2-user/SageMaker/1_data/flickr30k_split_folder/dev_flickr30k_resnet101_faster_rcnn_genome.tsv


##### Deep dive into get_evaluator

In [116]:
# Arguments: bind the parameter names of get_evaluator so that its body can be stepped through cell by cell.
# WARNING: this rebinds the global `config` to `dev_config`, so the full experiment config is no longer
# reachable as `config` after this cell. Re-running this cell (or any earlier cell that reads config["data"],
# config["model"], ...) then fails with a KeyError, because dev_config only has the keys
# "tasks", "batchsize" and "main_score_function". Reload the yaml config before re-running.
data_config, config, model, split = config["data"], dev_config, model, "dev"

In [117]:
split_evaluators = []

In [118]:
config["tasks"]

[{'name': 'flickr30k',
  'tiny': False,
  'evaluator': {'name': 'embedding_itr'},
  'data_args': {'languages': ['en']}},
 {'name': 'flickr30k',
  'tiny': True,
  'evaluator': {'name': 'ce_itr'},
  'data_args': {'captions_per_image': 1, 'languages': ['en']}}]

In [120]:
task = config["tasks"][0]
task

{'name': 'flickr30k',
 'tiny': False,
 'evaluator': {'name': 'embedding_itr'},
 'data_args': {'languages': ['en']}}

In [122]:
load_split = task.get("overwrite_split", split)
load_split

'dev'

In [123]:
split_dataset = DATA_LOADER[task["name"]](
    get_data(data_config, task["name"]), 
    load_split, 
    task.get("tiny", False),
    model.image_dict, 
    return_dict=True, 
    **task.get("data_args", {}))

2021-08-08 16:39:07 - Flickr30k Image Feature Split exists. Loading from /home/ec2-user/SageMaker/1_data/flickr30k_split_folder
2021-08-08 16:39:07 - Start to load Faster-RCNN detected objects from /home/ec2-user/SageMaker/1_data/flickr30k_split_folder/dev_flickr30k_resnet101_faster_rcnn_genome.tsv


In [125]:
split_dataset.keys()

dict_keys(['images', 'captions', 'imageid2captions', 'tags'])

In [128]:
len(split_dataset["images"])

1000

In [129]:
split_dataset["images"][:5]

['2000387055', '6911576445', '4984550402', '2814913390', '3541915243']

In [130]:
split_eval_name = task["evaluator"]["name"]
split_eval_name

'embedding_itr'

In [131]:
file_name = f"{split}-{task['name']}-{split_eval_name}"
file_name

'dev-flickr30k-embedding_itr'

In [132]:
split_eval_name == "embedding_itr" or split_eval_name == "itr"

True

In [133]:
evaluator = EmbeddingImageTextRetrievalEvaluator(
    split_dataset["images"], 
    split_dataset["captions"], 
    split_dataset["imageid2captions"],
    split_dataset.get("tags", None),
    name=file_name, show_progress_bar=True,
    batched_sim=task.get("batched_sim", 0)
)

In [141]:
split_dataset["imageid2captions"]["2000387055"]

{'2000387055_1',
 '2000387055_2',
 '2000387055_3',
 '2000387055_4',
 '2000387055_5'}

In [142]:
split_dataset["captions"]["2000387055_1"]

'A girl red and white striped , one-sleeved outfit stands with her arms raised in the air .'

In [134]:
evaluator

<mmt_retrieval.evaluation.evaluations.EmbeddingImageTextRetrievalEvaluator at 0x7f159250f400>

**CAPTION EMBEDDINGS**

In [150]:
%%time
caption_embeddings = model.encode(
    sentences=evaluator.captions[:10], 
    show_progress_bar=True, 
    batch_size=3, 
    convert_to_tensor=True)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

CPU times: user 1.12 s, sys: 15 ms, total: 1.13 s
Wall time: 304 ms


In [151]:
caption_embeddings

tensor([[-0.1977,  0.0885, -0.0476,  ...,  0.3186,  0.3787,  1.2237],
        [ 0.1109, -0.5627, -0.2971,  ..., -0.1173,  0.3215,  1.0074],
        [-0.0926,  0.0981, -0.0644,  ..., -0.3189,  0.5240,  1.6126],
        ...,
        [ 0.1945, -0.0589, -0.4145,  ...,  0.3135,  0.8455,  0.0203],
        [ 0.5727,  0.4543, -0.2724,  ...,  0.0539,  0.4742,  0.3150],
        [ 1.2888,  0.1356, -0.4167,  ..., -0.9703,  0.8579,  1.5636]])

In [152]:
caption_embeddings.shape

torch.Size([10, 768])

In [153]:
evaluator.captions[:3]

['A girl red and white striped , one-sleeved outfit stands with her arms raised in the air .',
 'A girl in a red and white outfit is standing inside a display with other holiday items .',
 'Social awkward girl looking outside a window onto the street outside .']

**IMAGE EMBEDDINGS**

In [155]:
evaluator.tags

In [156]:
evaluator.images[:5]

['2000387055', '6911576445', '4984550402', '2814913390', '3541915243']

In [159]:
model.encode?

[0;31mSignature:[0m
[0mmodel[0m[0;34m.[0m[0mencode[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0msentences[0m[0;34m:[0m[0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m][0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mimages[0m[0;34m:[0m[0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbatch_size[0m[0;34m:[0m[0mint[0m[0;34m=[0m[0;36m32[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mshow_progress_bar[0m[0;34m:[0m[0mbool[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moutput_value[0m[0;34m:[0m[0mstr[0m[0;34m=[0m[0;34m'multimodal_embedding'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcross_product_input[0m[0;34m:[0m[0mbool[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0

In [179]:
list(model.image_dict.keys())[:5]

['1213185795', '202175131', '4898266768', '5737435305', '23016250']

In [188]:
%%time
image_embeddings = model.encode(
    sentences=evaluator.tags, 
    #images=evaluator.images[:5], 
    images=list(model.image_dict.keys())[:10],
    show_progress_bar=False, 
    batch_size=3, 
    convert_to_tensor=True
)

CPU times: user 1.34 s, sys: 37.7 ms, total: 1.38 s
Wall time: 361 ms


In [189]:
image_embeddings

tensor([[ 1.4355,  0.3462,  0.3862,  ..., -0.0315, -0.5879,  0.7955],
        [ 0.7523, -0.3277, -0.1638,  ...,  0.2090, -0.5337, -0.4010],
        [ 0.7111,  0.1779, -0.0382,  ..., -0.2678, -0.3330,  1.2341],
        ...,
        [ 1.2927,  0.0137, -0.0959,  ...,  0.2119, -0.7002,  0.9523],
        [ 1.0059,  0.4814, -0.0499,  ..., -0.1109, -0.3053,  1.8183],
        [ 1.3537,  0.1951,  0.1604,  ..., -0.0261, -0.4137,  1.3153]])

In [190]:
image_embeddings.shape

torch.Size([10, 768])

#### Optimizer

In [None]:


# `transformers` is not imported by any of the cells above, so import it here before use
import transformers

# Optimizer class plus its keyword arguments; lr and eps fall back to 2e-5 / 1e-6 when train_config omits them
optimizer_class = transformers.AdamW
optimizer_params = {"lr": train_config.get("lr", 2e-5), "eps": train_config.get("eps", 1e-6)}