# Exploration of the MMT-Retrieval repo

In [1]:
import os
HOME_DIR = "/home/ec2-user/SageMaker"
# Work from the repository checkout so the relative paths used in later cells
# (requirements.txt, examples/experiments/...) resolve.
os.chdir(f"{HOME_DIR}/MMT-Retrieval/")

In [2]:
!pip install -r requirements.txt

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
!pip install anytree

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
from mmt_retrieval.model.models import OSCAR, UNITER, M3P, ClassificationHead, Pooling
from mmt_retrieval import MultimodalTransformer

# Loading pre-trained Model

## M3P (no execute)

In [6]:
# Load pre-trained model M3P
path_model_m3p = os.path.join(HOME_DIR, "model_m3p/0_M3P")
pretrained_model = M3P(model_path = path_model_m3p)

2021-08-08 13:50:02 - Lock 139649952230760 acquired on /home/ec2-user/.cache/huggingface/transformers/87683eb92ea383b0475fecf99970e950a03c9ff5e51648d6eee56fb754612465.ab95cf27f9419a99cce4f19d09e655aba382a2bafe2fe26d0cc24c18cf1a1af6.lock


Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

2021-08-08 13:50:03 - Lock 139649952230760 released on /home/ec2-user/.cache/huggingface/transformers/87683eb92ea383b0475fecf99970e950a03c9ff5e51648d6eee56fb754612465.ab95cf27f9419a99cce4f19d09e655aba382a2bafe2fe26d0cc24c18cf1a1af6.lock
2021-08-08 13:50:03 - Lock 139649952230760 acquired on /home/ec2-user/.cache/huggingface/transformers/9df9ae4442348b73950203b63d1b8ed2d18eba68921872aee0c3a9d05b9673c6.00628a9eeb8baf4080d44a0abe9fe8057893de20c7cb6e6423cddbf452f7d4d8.lock


Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

2021-08-08 13:50:04 - Lock 139649952230760 released on /home/ec2-user/.cache/huggingface/transformers/9df9ae4442348b73950203b63d1b8ed2d18eba68921872aee0c3a9d05b9673c6.00628a9eeb8baf4080d44a0abe9fe8057893de20c7cb6e6423cddbf452f7d4d8.lock
2021-08-08 13:50:04 - Lock 139649952230760 acquired on /home/ec2-user/.cache/huggingface/transformers/daeda8d936162ca65fe6dd158ecce1d8cb56c17d89b78ab86be1558eaef1d76a.a984cf52fc87644bd4a2165f1e07e0ac880272c1e82d648b4674907056912bd7.lock


Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

2021-08-08 13:50:05 - Lock 139649952230760 released on /home/ec2-user/.cache/huggingface/transformers/daeda8d936162ca65fe6dd158ecce1d8cb56c17d89b78ab86be1558eaef1d76a.a984cf52fc87644bd4a2165f1e07e0ac880272c1e82d648b4674907056912bd7.lock


In [8]:
pretrained_model.tokenize("Pizza con champiñones")

[78011, 158, 91714, 35984, 1444]

In [21]:
pretrained_model.tokenize("Pizza con jamón dulche")

[78011, 158, 79, 52960, 24532, 1430]

### Load image features (no execute)

In [168]:
from mmt_retrieval.data.image_embeddings import ImageDict

In [169]:
imagedict = ImageDict()

In [170]:
%%time
imagedict.load_obj_tsv(
    fname="/home/ec2-user/SageMaker/1_data/flickr30k_features/tiny/train_flickr30k_resnet101_faster_rcnn_genome.tsv.1",
    topk=10
)

CPU times: user 38.2 ms, sys: 67 µs, total: 38.2 ms
Wall time: 35 ms


In [174]:
value = imagedict["1213185795"]

In [175]:
imagedict["1213185795"]

OrderedDict([('img_id', '1213185795'),
             ('img_w', 500),
             ('img_h', 333),
             ('num_boxes', 39),
             ('boxes',
              array([[178.77786 ,   0.      , 499.5     , 189.06422 ],
                     [286.2556  ,  91.49405 , 409.02844 , 224.29764 ],
                     [  0.      ,  17.339499, 211.28693 , 317.61197 ],
                     [  6.189648,  34.51717 , 108.51078 , 292.2112  ],
                     [319.35147 , 286.56503 , 466.34363 , 332.445   ],
                     [323.2654  ,  30.65785 , 477.2827  ,  78.58263 ],
                     [263.98987 , 235.54553 , 499.5     , 332.445   ],
                     [  0.      ,   0.      , 316.1118  ,  54.290844],
                     [203.84828 , 190.30222 , 499.5     , 277.82635 ],
                     [229.45714 , 221.64037 , 275.42932 , 300.42758 ],
                     [119.19319 ,   0.      , 427.99127 , 332.445   ],
                     [ 42.994564,   9.276467, 231.20543 , 332.445  

In [176]:
# coordinates of boxes
imagedict["1213185795"]["boxes"].shape

(39, 4)

In [177]:
# boxes features
imagedict["1213185795"]["features"].shape

(39, 2048)

In [178]:
len(imagedict)

10

# Run train test (execute)

See the application notebook in MMT-Retrieval/examples/applications/Image_Search.ipynb \
**See the code in examples/experiments/run_train_test.py**

In [5]:
!pwd

/home/ec2-user/SageMaker/MMT-Retrieval


In [6]:
import torch
import yaml
import numpy as np
import random
from datetime import datetime

In [7]:
# Read the experiment configuration; the context manager closes the file handle
# that the bare open(...) call used to leave open.
with open("examples/experiments/super_config.yaml") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

In [8]:
# Model paths
path_model_m3p = os.path.join(HOME_DIR, "model_m3p/0_M3P")
path_image_feature_folder = "/home/ec2-user/SageMaker/1_data/flickr30k_features/tiny"
path_flickr30k_original_folder = "/home/ec2-user/SageMaker/1_data/flickr30k_entities"
path_flickr_split_folder = "/home/ec2-user/SageMaker/1_data/flickr30k_split_folder"

In [9]:
# ######################################

#   IMPORTANT!! modify config paths here

# ######################################
# Manual modifications
config["model_path"] = "david-test"

# model name
config["model"]["name"] = "m3p"

# Model path of the pretrained weights
config["model"]["model_path"] = path_model_m3p
config["model"]["pretrained_model_path"] = path_model_m3p

# Path of the flickr features downloaded from  
# https://drive.google.com/uc?export=download&id=11OD_qq7ITBarJwWZfi0bWIRw3HPEaHwE 
#(source: https://github.com/jnhwkim/ban-vqa/blob/master/tools/download_flickr.sh)
config["data"]["image_feature_folder"] = path_image_feature_folder
config["data"]["flickr30k_original_folder"] = path_flickr30k_original_folder
config["data"]["flickr_split_folder"] = path_flickr_split_folder

In [10]:
# Seed
torch.manual_seed(config["seed"])
np.random.seed(config["seed"])
random.seed(config["seed"])

In [11]:
# Model config
model_config = config["model"]

# Creating folders
model_folder_name = f"{model_config['name']}-{datetime.now().strftime('%Y-%m-%d_%H-%M')}"
model_save_path = os.path.join(config["model_path"], model_folder_name)
os.makedirs(model_save_path, exist_ok=True)

In [12]:
# <TODO: CHANGE HERE> Put the path of our own model


### Train

In [13]:
# Indicate to Train
config.get("do_train", True)

'Train'

#### Output of "build_model"

In [14]:
model_config

{'model_path': '/home/ec2-user/SageMaker/model_m3p/0_M3P',
 'model_path_cross': 'YOUR_CROSS_ENCODER',
 'model_path_embedding': 'YOUR_EMBEDDING_MODEL',
 'name': 'm3p',
 'pretrained_model_path': '/home/ec2-user/SageMaker/model_m3p/0_M3P',
 'max_seq_length': 70,
 'max_image_seq_len': 50,
 'input_key': 'pooled_cls_token_embeddings',
 'classifier_type': 'linear',
 'scaling_factor': 1}

In [15]:
def build_model(model_config):
    """
    Build the MultimodalTransformer described by ``model_config``.

    This reduced version of the builder only knows how to create an M3P
    embedding model.

    :param model_config: dict holding ``name`` and, for M3P,
        ``pretrained_model_path``; the optional keys read here are
        ``max_seq_length``, ``max_image_seq_len``, ``half_layers``,
        ``input_key``, ``classifier_type``, ``scaling_factor``, ``mean``,
        ``cls`` and ``max``.
    :return: a ``MultimodalTransformer`` made of the embedding model, a
        classification head and a pooling layer, or ``None`` when the name
        contains ``"legacy"`` (that case is not handled here).
    :raises ValueError: if the name does not select a supported embedding model.
    """
    model_name = model_config["name"]
    if "legacy" in model_name:
        # Legacy models are not built here; the implicit None of the original
        # code is kept, just made explicit.
        return None

    if "m3p" in model_name:
        embedding_model = M3P(model_config["pretrained_model_path"],
                              max_seq_length=model_config.get("max_seq_length", 128),
                              max_image_seq_len=model_config.get("max_image_seq_len", 50))
    else:
        # Without this guard the code below failed with an UnboundLocalError
        # on `embedding_model`, which hid the real cause.
        raise ValueError(f"Unsupported model name: {model_name!r} (expected a name containing 'm3p')")

    if model_config.get("half_layers", False):
        # Keep only the even-indexed encoder layers (0, 2, 4, ...).
        encoder = embedding_model.auto_model.encoder
        encoder.layer = torch.nn.ModuleList(
            [layer for i, layer in enumerate(encoder.layer) if i % 2 == 0])

    # 2 and 768 are passed positionally; the printed head in this notebook shows
    # Linear(in_features=768, out_features=2).
    class_head = ClassificationHead(2, model_config.get("input_key", "pooled_cls_token_embeddings"), 768,
                                    model_config.get("classifier_type", "linear"),
                                    model_config.get("scaling_factor", 2))
    pooling_model = Pooling(768,
                            pooling_mode_mean_tokens=model_config.get("mean", True),
                            pooling_mode_cls_token=model_config.get("cls", False),
                            pooling_mode_max_tokens=model_config.get("max", False))
    return MultimodalTransformer(modules=[embedding_model, class_head, pooling_model])

In [16]:
# Build model
model = build_model(model_config)

2021-08-08 18:53:04 - Use pytorch device: cpu


In [17]:
model.tokenize("Coca-cola")

[72625, 9, 21854]

In [18]:
model.tokenize("Hamburguesa con queso y patatas")

[33112, 21049, 11, 158, 41, 991, 113, 33156, 1073]

#### Deep dive "build_model" (no execute)

In [15]:
def build_model(model_config):
    """
    Build the MultimodalTransformer described by ``model_config``.

    The embedding model is chosen by substring match on ``model_config["name"]``:
    ``"oscar"`` -> OSCAR, ``"m3p"`` -> M3P, ``"villa"`` or ``"uniter"`` -> UNITER.

    :param model_config: dict holding ``name`` and ``pretrained_model_path``;
        the optional keys read here are ``max_seq_length``,
        ``max_image_seq_len``, ``half_layers``, ``input_key``,
        ``classifier_type``, ``scaling_factor``, ``mean``, ``cls`` and ``max``.
    :return: a ``MultimodalTransformer`` made of the embedding model, a
        classification head and a pooling layer, or ``None`` when the name
        contains ``"legacy"`` (that case is not handled here).
    :raises ValueError: if the name does not select a supported embedding model.
    """
    model_name = model_config["name"]
    if "legacy" in model_name:
        # Legacy models are not built here; the implicit None of the original
        # code is kept, just made explicit.
        return None

    max_image_seq_len = model_config.get("max_image_seq_len", 50)
    if "oscar" in model_name:
        embedding_model = OSCAR(model_config["pretrained_model_path"],
                                max_seq_length=model_config.get("max_seq_length", 70),
                                max_image_seq_len=max_image_seq_len)
    elif "m3p" in model_name:
        embedding_model = M3P(model_config["pretrained_model_path"],
                              max_seq_length=model_config.get("max_seq_length", 128),
                              max_image_seq_len=max_image_seq_len)
    elif "villa" in model_name or "uniter" in model_name:
        embedding_model = UNITER(model_config["pretrained_model_path"],
                                 max_seq_length=model_config.get("max_seq_length", 70),
                                 max_image_seq_len=max_image_seq_len)
    else:
        # Without this guard the code below failed with an UnboundLocalError
        # on `embedding_model`, which hid the real cause.
        raise ValueError(
            f"Unsupported model name: {model_name!r} "
            "(expected a name containing 'oscar', 'm3p', 'villa' or 'uniter')")

    if model_config.get("half_layers", False):
        # Keep only the even-indexed encoder layers (0, 2, 4, ...).
        encoder = embedding_model.auto_model.encoder
        encoder.layer = torch.nn.ModuleList(
            [layer for i, layer in enumerate(encoder.layer) if i % 2 == 0])

    # 2 and 768 are passed positionally; the printed head in this notebook shows
    # Linear(in_features=768, out_features=2).
    class_head = ClassificationHead(2, model_config.get("input_key", "pooled_cls_token_embeddings"), 768,
                                    model_config.get("classifier_type", "linear"),
                                    model_config.get("scaling_factor", 2))
    pooling_model = Pooling(768,
                            pooling_mode_mean_tokens=model_config.get("mean", True),
                            pooling_mode_cls_token=model_config.get("cls", False),
                            pooling_mode_max_tokens=model_config.get("max", False))
    return MultimodalTransformer(modules=[embedding_model, class_head, pooling_model])

In [80]:
# <TODO: DEEP DIVE INTO M3P initialization> (see model/models.py)
embedding_model = M3P(
    model_config["pretrained_model_path"],
    max_seq_length=model_config.get("max_seq_length", 128),
    max_image_seq_len=model_config.get("max_image_seq_len", 50))

In [37]:
embedding_model

M3P(
  (auto_model): TransformerModel(
    (position_embeddings): Embedding(514, 768)
    (embeddings): Embedding(250002, 768, padding_idx=1)
    (layer_norm_emb): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (image_embeddings): BertImageEmbeddings(
      (image_embeddings): Linear(in_features=2048, out_features=768, bias=True)
      (image_distbution_embeddings): Linear(in_features=1600, out_features=768, bias=True)
      (image_location_embeddings): Linear(in_features=5, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (refine_embeddings): AoA_Refiner_Core(
      (layers): ModuleList(
        (0): AoA_Refiner_Layer(
          (self_attn): MultiHeadedDotAttention(
            (linears): ModuleList(
              (0): Linear(in_features=768, out_features=768, bias=True)
              (1): Linear(in_features=768, out_features=768, bias=True)
              (2): Linea

In [38]:
# <TODO: DEEP DIVE INTO ClassificationHead initialization> (see model/models.py)
class_head = ClassificationHead(
    2, model_config.get("input_key", "pooled_cls_token_embeddings"), 768,
    model_config.get("classifier_type", "linear"),
    model_config.get("scaling_factor", 2)
)

In [39]:
class_head

ClassificationHead(
  (_dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
)

In [40]:
model_config.get("input_key", "pooled_cls_token_embeddings")

'pooled_cls_token_embeddings'

In [41]:
model_config.get("classifier_type", "linear")

'linear'

In [42]:
model_config.get("scaling_factor", 2)

1

In [43]:
# <TODO: DEEP DIVE INTO Pooling initialization> (see model/models.py)
pooling_model = Pooling(
    768,
   pooling_mode_mean_tokens=model_config.get("mean", True),
   pooling_mode_cls_token=model_config.get("cls", False),
   pooling_mode_max_tokens=model_config.get("max", False)
)

In [44]:
model_config.get("cls", False)

False

In [45]:
model_config.get("mean", True)

True

In [46]:
pooling_model

Pooling()

## Task in train_config (execute)

In [19]:
from examples.experiments.process_data import get_sampler, DATA_LOADER

In [20]:
train_config = config["training"]

In [21]:
tasks = train_config["tasks"]

In [22]:
tasks

[{'name': 'flickr30k',
  'tiny': False,
  'batchsize': 128,
  'loss': {'name': 'triplet', 'margin': 0.1},
  'data_args': {'jit_loading': False, 'languages': ['en', 'de', 'cs', 'fr']}},
 {'name': 'flickr30k',
  'tiny': False,
  'batchsize': 128,
  'loss': {'name': 'joint'},
  'data_args': {'jit_loading': False, 'languages': ['en', 'de', 'cs', 'fr']}},
 {'name': 'flickr30k',
  'batchsize': 16,
  'tiny': False,
  'loss': {'name': 'ance', 'margin': 0.1},
  'data_args': {'topk': 50, 'negative_examples': 7, 'sim_batchsize': 512}},
 {'name': 'flickr30k',
  'tiny': False,
  'batchsize': 128,
  'do_hard': True,
  'loss': {'name': 'joint'},
  'data_args': {'sim_batchsize': 512, 'topk': 50, 'hard_p': 0.05}}]

In [24]:
task = tasks[0]

In [25]:
task["name"]

'flickr30k'

In [26]:
# Defining get_data
def get_data(data_config, name):
    """
    Pick the data section to use for a task (kept for backwards compatibility
    with old configs).

    Lookup order: the ``"all"`` entry, then the entry stored under ``name``,
    and finally the whole ``data_config`` when neither key is present.

    :param data_config: the ``data`` mapping of the experiment config
    :param name: task name to look up
    :return: ``data_config["all"]``, ``data_config[name]`` or ``data_config`` itself
    """
    for candidate_key in ("all", name):
        if candidate_key in data_config:
            return data_config[candidate_key]
    return data_config

In [27]:
task["name"]

'flickr30k'

In [28]:
DATA_LOADER[task["name"]]

<function examples.experiments.process_data.get_flickr30k(config, split, tiny, image_dict, **args)>

In [29]:
config["data"]

{'mscoco_folder': 'SEE_README',
 'image_feature_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_features/tiny',
 'flickr_split_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_split_folder',
 'flickr30k_original_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_entities',
 'cc_original_folder': 'SEE_README',
 'cc_feature_folder': 'SEE_README',
 'multi30k_original_folder': 'SEE_README'}

In [30]:
# Inspecting get_flickr30k function (filling arguments of get_flickr30k) <ARGUMENTS>
config_get = get_data(config["data"], task["name"])
split = "train"
tiny = task.get("tiny", False)
image_dict = model.image_dict

In [31]:
get_data(config["data"], task["name"])

{'mscoco_folder': 'SEE_README',
 'image_feature_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_features/tiny',
 'flickr_split_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_split_folder',
 'flickr30k_original_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_entities',
 'cc_original_folder': 'SEE_README',
 'cc_feature_folder': 'SEE_README',
 'multi30k_original_folder': 'SEE_README'}

In [32]:
%%time
# ENSURE we delete the folder flickr30k_split_folder
# Look up the loader registered for this task and call it for the "train"
# split; `joint` is True only when the task uses the "joint" loss, and any
# task-level data_args are forwarded as keyword arguments.
load_task_dataset = DATA_LOADER[task["name"]]
task_data_args = task.get("data_args", {})
dataset = load_task_dataset(
    get_data(config["data"], task["name"]),
    "train",
    task.get("tiny", False),
    model.image_dict,
    joint=(task["loss"]["name"] == "joint"),
    **task_data_args
)

2021-08-08 18:54:23 - Flickr30k Image Feature Split exists. Loading from /home/ec2-user/SageMaker/1_data/flickr30k_split_folder
2021-08-08 18:54:23 - Start to load Faster-RCNN detected objects from /home/ec2-user/SageMaker/1_data/flickr30k_split_folder/train_flickr30k_resnet101_faster_rcnn_genome.tsv
CPU times: user 6.44 s, sys: 252 ms, total: 6.69 s
Wall time: 6.69 s


In [33]:
# see how this code created a folder named flickr30k_split_folder

In [34]:
dataset

<mmt_retrieval.data.datasets.ImageSentenceTripletDataset at 0x7f511bfa7dd8>

In [35]:
# Caption keys
len(dataset.caption_keys)

148915

In [36]:
# Captions 
len(dataset.captions)

148915

In [37]:
dataset.caption_keys[:5]

['4328445479_1',
 '4328445479_2',
 '4328445479_4',
 '4328445479_5',
 '4328445479_3']

In [42]:
for vv in dataset.caption_keys[:5]:
    print(vv, ": ", dataset.captions[vv])

4328445479_1 :  Several kids in what seems to be a classroom , working on a problem together or potentially giving a presentation .
4328445479_2 :  Two girls and three boys are presenting in front of a dry erase board .
4328445479_4 :  Students are in a classroom , reading something off a paper .
4328445479_5 :  Five children stand in front of a whiteboard .
4328445479_3 :  Five students stand in front of a whiteboard in a classroom .


In [43]:
item = 0
pos_caption = dataset.caption_keys[item]
print(pos_caption)
pos_image = dataset.caption2image[pos_caption]
print(pos_image)

4328445479_1
4328445479


In [44]:
# Label
label = torch.LongTensor([int(pos_image)])
label

tensor([4328445479])

In [45]:
# Caption text stored under the positive caption key, paired with None.
# It is bound to a new name so that `pos_caption` keeps holding the caption key
# and this cell can be re-run without a KeyError.
pos_caption_pair = (dataset.captions[pos_caption], None)
pos_caption_pair

('Several kids in what seems to be a classroom , working on a problem together or potentially giving a presentation .',
 None)

In [46]:
# No tag
tag = dataset.img2tag[pos_image] if dataset.tags else None
print(tag)

None


In [47]:
print((tag, pos_image))

(None, '4328445479')


In [48]:
# Testing __getitem__
dataset[0]

([[('Several kids in what seems to be a classroom , working on a problem together or potentially giving a presentation .',
    None)]],
 [[(None, '4328445479')]],
 tensor([4328445479]))

### Get sampler

In [49]:
get_sampler

<function examples.experiments.process_data.get_sampler(name, dataset, batchsize, **args)>

In [50]:
# Uses the class MultidatasetImageSentenceTripletSampler in data > datasets.py

In [51]:
task["name"]

'flickr30k'

In [52]:
task.get("batchsize")

128

In [53]:
sampler, batch_sampler, shuffle, batch_size = get_sampler(
    task["name"], 
    dataset,
    task.get("batchsize"), 
    **task.get("data_args", {}))

In [54]:
sampler

In [55]:
batch_sampler

In [56]:
shuffle

True

In [57]:
batch_size

128

### DataLoader

In [58]:
from torch.utils.data import DataLoader

In [59]:
dataset

<mmt_retrieval.data.datasets.ImageSentenceTripletDataset at 0x7f511bfa7dd8>

In [60]:
shuffle

True

In [61]:
batch_sampler

In [62]:
train_config.get("num_workers", 0)

0

In [63]:
# DataLoader
dataloader = DataLoader(
    dataset, 
    shuffle=shuffle, 
    batch_size=batch_size, 
    sampler=sampler, 
    batch_sampler=batch_sampler, 
    num_workers=train_config.get("num_workers", 0))

In [64]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f5022ea3908>

### Loss definition

In [65]:
from mmt_retrieval.losses.losses import BatchHardTripletLoss

In [66]:
task["loss"]["name"] == "triplet"

True

In [67]:
task["loss"].get("margin", 0.2)

0.1

In [68]:
loss = BatchHardTripletLoss(model=model, margin=task["loss"].get("margin", 0.2))

In [69]:
training_tasks = [(dataloader, loss)]

### Optimizer

In [70]:
dev_config = train_config["dev"]

#### Evaluator

In [71]:
from mmt_retrieval.evaluation.evaluations import EmbeddingImageTextRetrievalEvaluator, CrossEncoderImageTextRetrievalEvaluator, \
    RetrieveRerankImageTextRetrievalEvaluator

In [72]:
def get_evaluator(data_config, config, model, split):
    """
    Build one retrieval evaluator per task in ``config["tasks"]`` and wrap them
    in a ``SequentialEvaluator``.

    :param data_config: the ``data`` mapping of the experiment config
    :param config: evaluation config holding ``tasks``, ``batchsize`` (read by
        the cross-encoder and retrieve-rerank evaluators) and, optionally,
        ``main_score_function``
    :param model: model whose ``image_dict`` is handed to the data loader
    :param split: split to load unless a task sets ``overwrite_split``
    :return: a ``SequentialEvaluator`` over all task evaluators
    :raises ValueError: if a task names an evaluator this function does not know
    """
    # Local import: the notebook only imports SequentialEvaluator in a later cell.
    from sentence_transformers.evaluation import SequentialEvaluator

    split_evaluators = []
    for task in config["tasks"]:
        load_split = task.get("overwrite_split", split)
        split_dataset = DATA_LOADER[task["name"]](get_data(data_config, task["name"]), load_split, task.get("tiny", False),
                                                  model.image_dict, return_dict=True, **task.get("data_args", {}))
        split_eval_name = task["evaluator"]["name"]
        file_name = f"{split}-{task['name']}-{split_eval_name}"
        # Positional arguments shared by all three evaluator classes.
        dataset_args = (split_dataset["images"], split_dataset["captions"],
                        split_dataset["imageid2captions"], split_dataset.get("tags", None))
        if split_eval_name in ("embedding_itr", "itr"):
            evaluator = EmbeddingImageTextRetrievalEvaluator(*dataset_args,
                                                             name=file_name, show_progress_bar=True,
                                                             batched_sim=task.get("batched_sim", 0))
        elif split_eval_name in ("ce_itr", "joint_itr"):
            evaluator = CrossEncoderImageTextRetrievalEvaluator(*dataset_args,
                                                                name=file_name, batch_size=config["batchsize"], show_progress_bar=True)
        elif split_eval_name == "rr_itr":
            evaluator = RetrieveRerankImageTextRetrievalEvaluator(*dataset_args,
                                                                  name=file_name, batch_size=config["batchsize"], show_progress_bar=True,
                                                                  retrieve=task.get("retrieve", 10), scoring=task.get("scoring", "standard"),
                                                                  scoring_factor=task.get("scoring_factor", 0.5))
        else:
            # An unknown name would otherwise leave `evaluator` unbound, or stale
            # from the previous loop iteration.
            raise ValueError(f"Unknown evaluator name: {split_eval_name!r}")
        # The original body dropped every evaluator and returned None.
        split_evaluators.append(evaluator)

    # Combine the per-evaluator scores: their mean when the config asks for
    # "mean", otherwise the score of the last evaluator.
    if config.get("main_score_function", "normal") == "mean":
        main_score_function = lambda scores: np.mean(scores)
    else:
        main_score_function = lambda scores: scores[-1]
    return SequentialEvaluator(split_evaluators, main_score_function=main_score_function)

In [74]:
dev_config

{'tasks': [{'name': 'flickr30k',
   'tiny': False,
   'evaluator': {'name': 'embedding_itr'},
   'data_args': {'languages': ['en']}},
  {'name': 'flickr30k',
   'tiny': True,
   'evaluator': {'name': 'ce_itr'},
   'data_args': {'captions_per_image': 1, 'languages': ['en']}}],
 'batchsize': 512,
 'main_score_function': 'mean'}

In [75]:
config["data"]

{'mscoco_folder': 'SEE_README',
 'image_feature_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_features/tiny',
 'flickr_split_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_split_folder',
 'flickr30k_original_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_entities',
 'cc_original_folder': 'SEE_README',
 'cc_feature_folder': 'SEE_README',
 'multi30k_original_folder': 'SEE_README'}

In [77]:
dev_evaluator = get_evaluator(config["data"], dev_config, model, "train")

2021-08-08 18:59:13 - Flickr30k Image Feature Split exists. Loading from /home/ec2-user/SageMaker/1_data/flickr30k_split_folder
2021-08-08 18:59:13 - Start to load Faster-RCNN detected objects from /home/ec2-user/SageMaker/1_data/flickr30k_split_folder/train_flickr30k_resnet101_faster_rcnn_genome.tsv
2021-08-08 18:59:18 - Flickr30k Image Feature Split exists. Loading from /home/ec2-user/SageMaker/1_data/flickr30k_split_folder
2021-08-08 18:59:18 - Start to load Faster-RCNN detected objects from /home/ec2-user/SageMaker/1_data/flickr30k_split_folder/train_flickr30k_resnet101_faster_rcnn_genome.tsv


In [78]:
dev_evaluator

##### Deep dive into get_evaluator

In [79]:
# Arguments
# NOTE(review): this rebinds the notebook-level `config` to `dev_config` (and
# sets `split` to "train"). Every later `config[...]` / `config.get(...)` reads
# the dev config, and re-running this cell fails because `dev_config` has no
# "data" key.
data_config, config, model, split = config["data"], dev_config, model, "train"

In [80]:
split_evaluators = []

In [81]:
config["tasks"]

[{'name': 'flickr30k',
  'tiny': False,
  'evaluator': {'name': 'embedding_itr'},
  'data_args': {'languages': ['en']}},
 {'name': 'flickr30k',
  'tiny': True,
  'evaluator': {'name': 'ce_itr'},
  'data_args': {'captions_per_image': 1, 'languages': ['en']}}]

In [82]:
task = config["tasks"][0]
task

{'name': 'flickr30k',
 'tiny': False,
 'evaluator': {'name': 'embedding_itr'},
 'data_args': {'languages': ['en']}}

In [83]:
load_split = task.get("overwrite_split", split)
load_split

'train'

In [84]:
get_data(data_config, task["name"])

{'mscoco_folder': 'SEE_README',
 'image_feature_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_features/tiny',
 'flickr_split_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_split_folder',
 'flickr30k_original_folder': '/home/ec2-user/SageMaker/1_data/flickr30k_entities',
 'cc_original_folder': 'SEE_README',
 'cc_feature_folder': 'SEE_README',
 'multi30k_original_folder': 'SEE_README'}

In [85]:
task.get("data_args", {})

{'languages': ['en']}

In [86]:
load_split = "train"

In [87]:
split_dataset = DATA_LOADER[task["name"]](
    get_data(data_config, task["name"]), 
    load_split, 
    task.get("tiny", False),
    model.image_dict, 
    return_dict=True, 
    **task.get("data_args", {}))

2021-08-08 18:59:34 - Flickr30k Image Feature Split exists. Loading from /home/ec2-user/SageMaker/1_data/flickr30k_split_folder
2021-08-08 18:59:34 - Start to load Faster-RCNN detected objects from /home/ec2-user/SageMaker/1_data/flickr30k_split_folder/train_flickr30k_resnet101_faster_rcnn_genome.tsv


In [88]:
split_dataset.keys()

dict_keys(['images', 'captions', 'imageid2captions', 'tags'])

In [89]:
len(split_dataset["images"])

29783

In [90]:
split_dataset["images"][:5]

['4612733800', '404591376', '22822422', '3289433994', '2472634822']

In [91]:
split_eval_name = task["evaluator"]["name"]
split_eval_name

'embedding_itr'

In [92]:
file_name = f"{split}-{task['name']}-{split_eval_name}"
file_name

'train-flickr30k-embedding_itr'

In [93]:
split_eval_name == "embedding_itr" or split_eval_name == "itr"

True

In [94]:
evaluator = EmbeddingImageTextRetrievalEvaluator(
    split_dataset["images"], 
    split_dataset["captions"], 
    split_dataset["imageid2captions"],
    split_dataset.get("tags", None),
    name=file_name, show_progress_bar=True,
    batched_sim=task.get("batched_sim", 0)
)

In [104]:
len(split_dataset["images"])

29783

In [95]:
di = split_dataset["imageid2captions"]

In [96]:
di["4328445479"]

{'4328445479_1',
 '4328445479_2',
 '4328445479_3',
 '4328445479_4',
 '4328445479_5'}

In [97]:
split_dataset["imageid2captions"]["4328445479_1"]

{'4328445479'}

In [98]:
split_dataset["captions"]["4328445479_1"]

'Several kids in what seems to be a classroom , working on a problem together or potentially giving a presentation .'

In [99]:
split_dataset["captions"]["4328445479_4"]

'Students are in a classroom , reading something off a paper .'

In [100]:
evaluator

<mmt_retrieval.evaluation.evaluations.EmbeddingImageTextRetrievalEvaluator at 0x7f5124a9f8d0>

In [101]:
# Choose how the scores of the wrapped evaluators are reduced to one number:
# their mean when the config asks for "mean", otherwise the last score.
# (The comparison used to be evaluated and thrown away, so "mean" was ignored.)
if config.get("main_score_function", "normal") == "mean":
    msf = lambda scores: np.mean(scores)
else:
    msf = lambda scores: scores[-1]

In [102]:
# Finally it is wrapped with the SequentialEvaluator class
from sentence_transformers.evaluation import SequentialEvaluator
split_evaluators = [evaluator]
dev_evaluator = SequentialEvaluator(split_evaluators, main_score_function=msf)

In [103]:
dev_evaluator

<sentence_transformers.evaluation.SequentialEvaluator.SequentialEvaluator at 0x7f5124a9f390>

**CAPTION EMBEDDINGS**

In [123]:
%%time
caption_embeddings = model.encode(
    sentences=evaluator.captions[:10], 
    show_progress_bar=True, 
    batch_size=3, 
    convert_to_tensor=True)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

CPU times: user 810 ms, sys: 7.09 ms, total: 817 ms
Wall time: 224 ms


In [124]:
caption_embeddings

tensor([[-0.7562,  0.1262,  0.5512,  ..., -0.0147,  1.1224,  0.6292],
        [-0.8599, -0.3763,  0.4521,  ..., -0.1440,  0.0118, -0.7298],
        [-1.0304, -0.3263,  0.2968,  ..., -0.3083,  0.5256,  0.5384],
        ...,
        [-0.4071, -0.5545,  0.0050,  ..., -0.4316, -0.2530, -0.4787],
        [-0.4062, -0.4642, -0.0715,  ..., -0.0750, -0.2105, -0.4227],
        [ 0.5167, -0.3092,  0.3547,  ..., -0.2126, -0.0420, -0.3917]])

In [125]:
caption_embeddings.shape

torch.Size([10, 768])

In [127]:
evaluator.captions[:5]

['Several kids in what seems to be a classroom , working on a problem together or potentially giving a presentation .',
 'Two girls and three boys are presenting in front of a dry erase board .',
 'Five students stand in front of a whiteboard in a classroom .',
 'Students are in a classroom , reading something off a paper .',
 'Five children stand in front of a whiteboard .']

**IMAGE EMBEDDINGS**

In [132]:
evaluator.tags

In [133]:
evaluator.images[:5]

['4328445479', '4361912325', '7036852925', '540503255', '4591344824']

In [134]:
list(model.image_dict.keys())[:5]

['1213185795', '202175131', '4898266768', '5737435305', '23016250']

In [131]:
%%time
image_embeddings = model.encode(
    sentences=evaluator.tags, 
    #images=evaluator.images[:5], 
    images=list(model.image_dict.keys())[:10],
    show_progress_bar=False, 
    batch_size=3, 
    convert_to_tensor=True
)

CPU times: user 937 ms, sys: 0 ns, total: 937 ms
Wall time: 251 ms


In [135]:
image_embeddings

tensor([[ 1.4355,  0.3462,  0.3862,  ..., -0.0315, -0.5879,  0.7955],
        [ 0.7523, -0.3277, -0.1638,  ...,  0.2090, -0.5337, -0.4010],
        [ 0.7111,  0.1779, -0.0382,  ..., -0.2678, -0.3330,  1.2341],
        ...,
        [ 1.2927,  0.0137, -0.0959,  ...,  0.2119, -0.7002,  0.9523],
        [ 1.0059,  0.4814, -0.0499,  ..., -0.1109, -0.3053,  1.8183],
        [ 1.3537,  0.1951,  0.1604,  ..., -0.0261, -0.4137,  1.3153]])

In [136]:
image_embeddings.shape

torch.Size([10, 768])

##### Cosine similarity of embeddings

In [137]:
from sentence_transformers.util import pytorch_cos_sim

In [138]:
cos_scores = pytorch_cos_sim(image_embeddings, caption_embeddings)

In [139]:
cos_scores

tensor([[-0.0251, -0.0448, -0.0634, -0.0090, -0.0274, -0.2069, -0.1470, -0.1726,
         -0.2069,  0.0179],
        [-0.0533, -0.0960, -0.0917, -0.1056, -0.0514, -0.0543,  0.0043, -0.0180,
         -0.0609,  0.0549],
        [-0.0354, -0.0900, -0.0730, -0.0534, -0.0291, -0.1378, -0.0231, -0.0629,
         -0.0901,  0.0275],
        [-0.0904, -0.0544, -0.0817, -0.0955, -0.0322, -0.0858, -0.0256, -0.0384,
         -0.0724,  0.0594],
        [-0.0546, -0.0904, -0.0872, -0.0522, -0.0448, -0.0861, -0.0441, -0.0692,
         -0.0682,  0.0661],
        [-0.0285, -0.0057, -0.0162, -0.0089,  0.0241, -0.0915, -0.0457, -0.0464,
         -0.0900,  0.0597],
        [-0.0204, -0.0469, -0.0381, -0.0329, -0.0140, -0.1505, -0.0813, -0.0935,
         -0.1502,  0.0409],
        [-0.0662, -0.0669, -0.0772, -0.0305, -0.0598, -0.1120, -0.0631, -0.0857,
         -0.1210,  0.0545],
        [-0.0299, -0.1095, -0.0692, -0.0423, -0.0313, -0.1224, -0.0529, -0.0873,
         -0.1183,  0.0388],
        [-0.0381, -

In [140]:
cos_scores.shape

torch.Size([10, 10])

In [141]:
mrr_at_k = [10]
ndcg_at_k = [10]
recall_at_k = [1, 3, 5, 10]
map_at_k = [10]
mR_ks = [1, 5, 10]

In [142]:
max_k = max(max(mrr_at_k), max(ndcg_at_k), max(recall_at_k), max(map_at_k))
max_k

10

In [143]:
min(max_k, len(cos_scores[0]) - 1)

9

In [144]:
# Image to Text: for every row of cos_scores (one row per image) keep the k
# highest-scoring columns, with k capped at (number of columns - 1).
# sorted=False means the k hits come back in no particular order.
num_hits = min(max_k, len(cos_scores[0]) - 1)
top_k_hits = torch.topk(cos_scores, k=num_hits, dim=1, largest=True, sorted=False)
cos_scores_top_k_values, cos_scores_top_k_idx = top_k_hits

In [145]:
cos_scores_top_k_values.shape

torch.Size([10, 9])

In [146]:
cos_scores_top_k_idx

tensor([[1, 0, 9, 3, 4, 6, 2, 7, 5],
        [5, 9, 7, 6, 4, 0, 2, 8, 1],
        [1, 0, 2, 3, 4, 9, 6, 7, 8],
        [1, 9, 7, 6, 4, 2, 8, 5, 0],
        [5, 9, 8, 3, 4, 0, 6, 7, 2],
        [1, 9, 4, 0, 2, 3, 6, 7, 8],
        [1, 0, 2, 3, 4, 9, 6, 7, 8],
        [1, 0, 9, 3, 4, 6, 7, 2, 5],
        [1, 0, 2, 3, 4, 9, 6, 7, 8],
        [1, 9, 4, 3, 7, 0, 6, 2, 8]])

In [147]:
i2t_cos_scores_top_k_values = cos_scores_top_k_values.tolist()
i2t_cos_scores_top_k_idx = cos_scores_top_k_idx.tolist()

In [148]:
# One empty hit list per image (image-to-text) and per caption (text-to-image).
num_images = len(image_embeddings)
num_captions = len(caption_embeddings)
i2t_result_list = [[] for _ in range(num_images)]
t2i_result_list = [[] for _ in range(num_captions)]

In [149]:
# index of the batch iteration
start_idx = 0

# For every image row, translate the top-k column positions into caption ids and
# store them with their scores. Each slot is assigned (not appended to), so
# re-running the cell does not duplicate hits.
for query_itr, (top_positions, top_scores) in enumerate(
        zip(i2t_cos_scores_top_k_idx, i2t_cos_scores_top_k_values)):
    i2t_result_list[start_idx + query_itr] = [
        {'captions_id': evaluator.captions_ids[caption_pos], 'score': score}
        for caption_pos, score in zip(top_positions, top_scores)
    ]

In [150]:
# For each image: the top-k captions of the batch with their cosine similarity
# to that image (image embedding vs. caption embeddings). The hits are not
# ordered by score because topk was called with sorted=False.
i2t_result_list

[[{'captions_id': '4328445479_2', 'score': -0.044752687215805054},
  {'captions_id': '4328445479_1', 'score': -0.025120362639427185},
  {'captions_id': '4361912325_5', 'score': 0.01792328804731369},
  {'captions_id': '4328445479_4', 'score': -0.008965553715825081},
  {'captions_id': '4328445479_5', 'score': -0.02743445336818695},
  {'captions_id': '4361912325_2', 'score': -0.14695048332214355},
  {'captions_id': '4328445479_3', 'score': -0.06335169821977615},
  {'captions_id': '4361912325_3', 'score': -0.17259559035301208},
  {'captions_id': '4361912325_1', 'score': -0.20686817169189453}],
 [{'captions_id': '4361912325_1', 'score': -0.05434556305408478},
  {'captions_id': '4361912325_5', 'score': 0.05491766333580017},
  {'captions_id': '4361912325_3', 'score': -0.01797068491578102},
  {'captions_id': '4361912325_2', 'score': 0.004297451116144657},
  {'captions_id': '4328445479_5', 'score': -0.05144333094358444},
  {'captions_id': '4328445479_1', 'score': -0.05332810431718826},
  {'capt

#### Optimizer

In [151]:
import transformers

In [152]:
# Optimizer hyper-parameters; the literals are the fallbacks used when train_config has no entry
optimizer_params = dict(
    lr=train_config.get("lr", 2e-5),
    eps=train_config.get("eps", 1e-6),
)
# NOTE(review): transformers.AdamW is deprecated in recent transformers releases in favour of
# torch.optim.AdamW -- confirm against the version pinned in requirements.txt.
optimizer_class = transformers.AdamW

In [153]:
dev_evaluator

<sentence_transformers.evaluation.SequentialEvaluator.SequentialEvaluator at 0x7fe76d5d05f8>

### Model fit

#### Arguments 

In [154]:
# training_tasks = [(dataloader, loss)]
# evaluator = dev_evaluator (a SequentialEvaluator from sentence_transformers)

In [155]:
# Bind the training arguments as plain notebook variables so the steps below can use them.
# Literals passed to .get() are the fallbacks used when the config has no entry.

# -- what to train and how to evaluate it
train_objectives = training_tasks
optimizer_model = model
# NOTE(review): this re-binds `evaluator`, which an earlier cell read `.captions_ids` from;
# re-running that cell afterwards would see dev_evaluator instead -- confirm this is intended.
evaluator = dev_evaluator

# -- schedule
epochs = train_config.get("epochs", 1)
steps_per_epoch = train_config.get("steps_per_epoch", None)
gradient_accumulation = train_config.get("gradient_accumulation", 1)
evaluation_steps = train_config.get("evaluation_steps", 1000)
warmup_steps = train_config.get("warmup_steps", 0.1)

# -- output and mixed precision
output_path = model_save_path
use_amp = train_config.get("use_amp", True)

# -- optimizer (the first two lines re-bind names to themselves; they only require that
#    the optimizer cell has been run before this one)
optimizer_params = optimizer_params
optimizer_class = optimizer_class
load_optimizer_scheduler_path = train_config.get("load_optimizer_scheduler_path", None)
save_latest_model = train_config.get("save_latest_model", False)
weight_decay = train_config.get("weight_decay", 0.01)
max_grad_norm = train_config.get("max_grad_norm", 1)

# -- logging and task scheduling
use_wandb = False
logging_steps = config.get("wandb", {}).get("logging_steps", 0)
dataset_callback = None
objectives_sequence = train_config.get("task_sequence", None)

# default
output_path_ignore_not_empty = False
scheduler = 'WarmupLinear'

In [156]:
# Show the schedule-related settings, one value per line
print(
    steps_per_epoch,
    gradient_accumulation,
    evaluation_steps,
    warmup_steps,
    output_path,
    sep="\n",
)

40000
1
5000
0.0
david-test/m3p-2021-08-08_18-17


In [157]:
# Show the AMP / optimizer / checkpoint settings, one value per line
print(
    use_amp,
    optimizer_params,
    optimizer_class,
    load_optimizer_scheduler_path,
    save_latest_model,
    sep="\n",
)

True
{'lr': 2e-05, 'eps': 1e-06}
<class 'transformers.optimization.AdamW'>
None
False


In [158]:
# Show the remaining training settings, one value per line
print(
    weight_decay,
    max_grad_norm,
    use_wandb,
    logging_steps,
    dataset_callback,
    objectives_sequence,
    sep="\n",
)

0.05
1
False
0
None
[0, 0, 1]


#### Fit

In [159]:
# Record the wandb switch on the model
model.use_wandb = use_wandb

if use_amp:
    # The mixed-precision helpers are only set up when AMP is switched on
    from torch.cuda.amp import autocast
    scaler = torch.cuda.amp.GradScaler()

# Last expression of the cell: the model returned by .to() is what gets displayed
model.to(model._target_device)



MultimodalTransformer(
  (0): M3P(
    (auto_model): TransformerModel(
      (position_embeddings): Embedding(514, 768)
      (embeddings): Embedding(250002, 768, padding_idx=1)
      (layer_norm_emb): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (image_embeddings): BertImageEmbeddings(
        (image_embeddings): Linear(in_features=2048, out_features=768, bias=True)
        (image_distbution_embeddings): Linear(in_features=1600, out_features=768, bias=True)
        (image_location_embeddings): Linear(in_features=5, out_features=768, bias=True)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (refine_embeddings): AoA_Refiner_Core(
        (layers): ModuleList(
          (0): AoA_Refiner_Layer(
            (self_attn): MultiHeadedDotAttention(
              (linears): ModuleList(
                (0): Linear(in_features=768, out_features=768, bias=True)
                (1): Linear(in_f

In [160]:
if output_path is not None:
    os.makedirs(output_path, exist_ok=True)
    # Refuse to reuse a directory that already has content, unless explicitly allowed
    if not output_path_ignore_not_empty:
        if os.listdir(output_path):
            raise ValueError("Output directory ({}) already exists and is not empty.".format(
                output_path))
    if save_latest_model:
        latest_checkpoint_dir = os.path.join(output_path, "latest_checkpoint")
        os.makedirs(latest_checkpoint_dir, exist_ok=True)

# train_objectives holds (dataloader, loss) tuples; keep only the dataloaders here
dataloaders = [loader for loader, _ in train_objectives]

In [161]:
os.path.join(output_path, "latest_checkpoint")

'david-test/m3p-2021-08-08_18-17/latest_checkpoint'

In [162]:
# Use smart batching: every training dataloader collates its batches
# with the model's own smart_batching_collate method
for dataloader in dataloaders:
    dataloader.collate_fn = model.smart_batching_collate

In [163]:
# Move every loss module to the model's target device
device = model._target_device
loss_models = [objective_loss for _, objective_loss in train_objectives]

for loss_model in loss_models:
    loss_model.to(device)

In [164]:
# Very low starting value for the best score (presumably so the first evaluation always beats it)
model.best_score = -9999999

# Without an explicit step budget, an epoch is as long as the shortest dataloader
if steps_per_epoch in (None, 0):
    steps_per_epoch = min(len(dataloader) for dataloader in dataloaders)
steps_per_epoch

40000

In [165]:
num_train_steps = int(steps_per_epoch * epochs)
param_optimizer = list(optimizer_model.named_parameters())

# Parameters whose name contains one of these substrings get no weight decay.
# NOTE(review): matching is by substring of the parameter name; the model printout shows a
# LayerNorm registered as `layer_norm_emb`, whose weight would not match 'LayerNorm.weight'
# -- confirm this is intended.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

# Split the named parameters into the two groups in a single pass (order is preserved)
decay_params, no_decay_params = [], []
for param_name, param in param_optimizer:
    if any(marker in param_name for marker in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': weight_decay},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

In [166]:
optimizer_grouped_parameters[0]["params"][0].shape

torch.Size([514, 768])

In [167]:
optimizer_grouped_parameters[0]["params"][1].shape

torch.Size([250002, 768])

In [168]:
# Show the total number of training steps and the no-decay name patterns
print(num_train_steps, no_decay, sep="\n")

40000
['bias', 'LayerNorm.bias', 'LayerNorm.weight']


In [169]:
optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params)

In [170]:
# A float warmup_steps is treated as a fraction of the total number of training steps
# and converted to an absolute (truncated) step count; other types are left unchanged.
if isinstance(warmup_steps, float):
    warmup_steps = int(warmup_steps*num_train_steps)

In [171]:
# `scheduler` starts out as the schedule *name* (a string) and is re-bound to the scheduler
# object below. Keep the name in its own variable so that re-running this cell does not pass
# the already-built scheduler object back in as if it were a name.
if isinstance(scheduler, str):
    scheduler_name = scheduler
scheduler = model._get_scheduler(optimizer, scheduler=scheduler_name, warmup_steps=warmup_steps, t_total=num_train_steps)
# Called with the configured path (None in the recorded run)
model._load_optimizer_scheduler(load_optimizer_scheduler_path, optimizer, scheduler)

In [172]:
# Step counter for the whole run
global_step = 0
# One iterator per training dataloader; batches are later pulled from them with next()
data_iterators = [iter(dataloader) for dataloader in dataloaders]
data_iterators

[<torch.utils.data.dataloader._SingleProcessDataLoaderIter at 0x7fe74db11f28>]

In [173]:
# train_objectives is a list of (dataloader, loss) tuples
num_train_objectives = len(train_objectives)
# Without an explicit sequence, visit every objective once, in order
if objectives_sequence is None:
    objectives_sequence = range(num_train_objectives)
# NOTE(review): in the recorded run objectives_sequence is [0, 0, 1] while data_iterators and
# tr_losses each show a single entry, so index 1 looks out of range for loss_models /
# data_iterators -- confirm the task_sequence in the config.

In [174]:
skip_scheduler = False
# For every objective index: how many times it occurs in objectives_sequence
logging_loss_scaling = [
    sum(1 for scheduled in objectives_sequence if scheduled == objective_idx)
    for objective_idx in range(num_train_objectives)
]

In [175]:
load_optimizer_scheduler_path

#### Starting epoch

In [176]:
start_epoch = 0
start_step = 0

In [177]:
# One running loss tensor (on the model's device) and one logged scalar per loss model
num_loss_models = len(loss_models)
tr_losses = [torch.tensor(0.0).to(model._target_device) for _ in range(num_loss_models)]
logging_losses_scalar = [0.0 for _ in range(num_loss_models)]

In [178]:
# Show the freshly initialised loss accumulators
print(tr_losses, logging_losses_scalar, sep="\n")

[tensor(0.)]
[0.0]


In [179]:
#  for epoch in trange(start_epoch, epochs, desc="Epoch"):
epoch = 0

In [180]:
training_steps = start_step
training_steps

0

In [181]:
# Start of the epoch: reset gradients on every loss model and switch it to training mode
for loss_model in loss_models:
    loss_model.zero_grad()
    loss_model.train()

In [182]:
steps_per_epoch

40000

In [183]:
objectives_sequence

[0, 0, 1]

In [184]:
train_idx = objectives_sequence[0]

In [185]:
# Select the loss model, batch iterator and running loss that belong to objective train_idx
loss_model = loss_models[train_idx]
data_iterator = data_iterators[train_idx]
tr_loss = tr_losses[train_idx]

In [186]:
type(loss_model)

mmt_retrieval.losses.losses.BatchHardTripletLoss

In [187]:
type(data_iterator)

torch.utils.data.dataloader._SingleProcessDataLoaderIter

In [188]:
type(tr_loss)

torch.Tensor

In [189]:
gradient_accumulation

1

In [190]:
acc_idx = 0

In [192]:
data_iterator

<torch.utils.data.dataloader._SingleProcessDataLoaderIter at 0x7fe74db11f28>

In [191]:
# Pull one collated batch from the iterator.
# NOTE(review): the recorded run failed here with KeyError: '5293561334'. smart_batching_collate
# looks image ids up in self.image_dict, so presumably this id has no entry there -- TODO confirm.
data = next(data_iterator)

KeyError: '5293561334'

#### Collate

In [105]:
# When we call model.encode we are triggering smart_batching_collate_eval

In [106]:
def smart_batching_collate(self, batch):
    """
    Collate a list of dataset items into per-slot feature tensors plus stacked labels.

    Every item is laid out as (block_0, ..., block_k, label). A block is a sequence of
    slots, and a slot is an iterable of entries that are indexed as entry[0] (text or
    None) and entry[1] (key into self.image_dict or None). The number of blocks and
    the number of slots per block are read from the first item of the batch.

    :param batch:
        list of items laid out as described above
    :return:
        dict with 'features' (one dict of concatenated tensors per slot, in block/slot
        order) and 'labels' (all collected labels combined with torch.stack)
    """
    first_item = batch[0]
    num_blocks = len(first_item) - 1
    num_pairs = [len(first_item[block_idx]) for block_idx in range(num_blocks)]

    # Per (block, slot): the collected (tokens, image_features) inputs and the running
    # [max text length, max number of image features] seen in that slot.
    slot_inputs = [[[] for _ in range(count)] for count in num_pairs]
    slot_max_lens = [[[0, 0] for _ in range(count)] for count in num_pairs]

    labels = []
    for item in batch:
        blocks, label = item[:-1], item[-1]
        for block_idx, block in enumerate(blocks):
            for slot_idx, pair in enumerate(block):
                for entry in pair:
                    lens = slot_max_lens[block_idx][slot_idx]
                    tokens = image_features = None
                    if entry[0] is not None:
                        tokens = self.tokenize(entry[0])
                        lens[0] = max(lens[0], self._text_length(tokens))
                    if entry[1] is not None:
                        image_features = self.image_dict[entry[1]]
                        lens[1] = max(lens[1], len(image_features['features']))
                    slot_inputs[block_idx][slot_idx].append((tokens, image_features))
        # A list label contributes all of its elements, anything else a single label
        labels += label if isinstance(label, list) else [label]

    features = []
    for block_inputs, block_lens in zip(slot_inputs, slot_max_lens):
        for inputs, (max_text_len, max_image_len) in zip(block_inputs, block_lens):
            gathered = {}
            for tokens, image_features in inputs:
                pair_features = self.get_features(tokens, image_features, max_text_len, max_image_len)
                for feature_name in pair_features:
                    gathered.setdefault(feature_name, []).append(pair_features[feature_name])
            # Concatenate the per-entry tensors of every feature into one tensor per slot
            features.append({name: torch.cat(chunks) for name, chunks in gathered.items()})

    return {'features': features, 'labels': torch.stack(labels)}

In [114]:
token = model.tokenize("Pizza con jamón dulche")

In [115]:
label = "4328445479"

In [116]:
# NOTE(review): smart_batching_collate reads every item as (block, ..., label), iterates each
# block into pairs and each pair into entries indexed as p[0] / p[1]. Here the only "block" is
# the tokenizer output itself, which is why the replay further down stops with
# "TypeError: 'int' object is not iterable" (pair == 78011).
batch = [(token, label)]

In [117]:
# Replay of the set-up at the top of smart_batching_collate, using the hand-built batch
labels = []
# Everything in an item except its last element (the label) counts as a block
num_blocks = len(batch[0]) - 1
# Number of pairs per block, read from the first item only
num_pairs = [len(batch[0][i]) for i in range(num_blocks)]
# Per (block, pair): collected inputs and the running [text, image] maximum lengths
paired_blocks = [[[] for _ in range(num_pairs[i])] for i in range(num_blocks)]
max_seq_lens = [[[0, 0] for _ in range(num_pairs[i])] for i in range(num_blocks)]

In [118]:
# Replay of the first loop of smart_batching_collate on the hand-built batch.
# At notebook top level there is no `self`; the method is bound to `model`
# (see dataloader.collate_fn = model.smart_batching_collate), so its helpers are called there.
for item in batch:
    # The last element of an item is its label, everything before it is a block
    blocks, label = item[:-1], item[-1]
    for i, block in enumerate(blocks):
        for j, pair in enumerate(block):
            # Every `pair` must itself be iterable and yield entries indexed as p[0] / p[1]
            for p in pair:
                tokens = None
                image_features = None
                if p[0] is not None:
                    tokens = model.tokenize(p[0])
                    max_seq_lens[i][j][0] = max(max_seq_lens[i][j][0], model._text_length(tokens))
                if p[1] is not None:
                    image_features = model.image_dict[p[1]]
                    max_seq_lens[i][j][1] = max(max_seq_lens[i][j][1], len(image_features['features']))
                paired_blocks[i][j].append((tokens, image_features))
    # A list label contributes all of its elements, anything else a single label
    if isinstance(label, list):
        labels.extend(label)
    else:
        labels.append(label)

TypeError: 'int' object is not iterable

In [119]:
pair

78011