# DEEP DIVE MODEL

In [1]:
import os
# Switch the working directory to the repository checkout — presumably so the
# project-local imports below (`params`, `retrieval.*`) and the relative
# `options/...` YAML path resolve.
# NOTE(review): hard-coded absolute path; adjust it when running elsewhere.
os.chdir("/home/ec2-user/SageMaker/foodi-ml/")

In [2]:
#!pip install -r requirements.txt

In [3]:
#import nltk
#nltk.download('punkt')

In [4]:
import os
import torch
from tqdm import tqdm

import params
from retrieval.train import train
from retrieval.utils import helper
from retrieval.model import loss
from retrieval.model.model import Retrieval
from retrieval.data.loaders import get_loaders
from retrieval.utils.logger import create_logger
from retrieval.utils.helper import load_model
from retrieval.utils.file_utils import load_yaml_opts, parse_loader_name

In [5]:
from addict import Dict

# Functions

In [6]:
def get_data_path(opt):
    """Resolve the dataset root directory.

    The ``DATA_PATH`` environment variable wins whenever it is defined;
    otherwise the value of ``opt.dataset.data_path`` is used.

    Args:
        opt: options object exposing ``opt.dataset.data_path``.

    Returns:
        The dataset path taken from the environment or from ``opt``.

    Raises:
        Exception: if ``DATA_PATH`` is not defined and
            ``opt.dataset.data_path`` is empty/falsy.
    """
    # Environment variable takes precedence over the options file.
    if 'DATA_PATH' in os.environ:
        return os.environ['DATA_PATH']

    configured_path = opt.dataset.data_path
    if configured_path:
        return configured_path

    raise Exception('''
                DATA_PATH not specified.
                Please, run "$ export DATA_PATH=/path/to/dataset"
                or add path to yaml file
            ''')

In [7]:
def get_tokenizers(train_loader):
    """Return the training dataset's tokenizer(s) as a list.

    Reads ``train_loader.dataset.tokenizer``. A list (or list subclass) is
    returned unchanged; any other object is wrapped in a one-element list.

    Args:
        train_loader: loader object exposing ``.dataset.tokenizer``.

    Returns:
        list: the tokenizer(s).
    """
    tokenizers = train_loader.dataset.tokenizer
    # isinstance instead of an exact type comparison, so that a list
    # subclass is passed through rather than nested inside another list.
    if not isinstance(tokenizers, list):
        tokenizers = [tokenizers]
    return tokenizers

In [8]:
def set_criterion(opt, model):
    """Build the loss criteria described by ``opt`` and attach them to ``model``.

    When ``opt.criterion`` contains a ``'name'`` key, the options are logged
    and both criteria are created through ``loss.get_loss``. Otherwise both
    are ``loss.ContrastiveLoss`` instances, configured from ``opt.criterion``
    and ``opt.ml_criterion`` respectively.

    Nothing is returned; the criteria are handed to ``set_model_criterion``.
    """
    if 'name' not in opt.criterion:
        mm_loss = loss.ContrastiveLoss(**opt.criterion)
        ml_loss = loss.ContrastiveLoss(**opt.ml_criterion)
    else:
        logger.info(opt.criterion)
        mm_loss = loss.get_loss(**opt.criterion)
        # NOTE(review): this branch configures the multilanguage criterion
        # from opt.criterion as well (not opt.ml_criterion) — confirm intended.
        ml_loss = loss.get_loss(**opt.criterion)

    set_model_criterion(
        opt,
        model,
        multilanguage_criterion=ml_loss,
        multimodal_criterion=mm_loss,
    )


def set_model_criterion(opt, model, multilanguage_criterion, multimodal_criterion):
    """Attach the loss criteria to ``model``.

    ``model.mm_criterion`` always receives ``multimodal_criterion``.
    ``model.ml_criterion`` receives ``multilanguage_criterion`` only when
    ``opt.dataset.adapt.data`` is non-empty; otherwise it is set to ``None``.
    """
    model.mm_criterion = multimodal_criterion
    adapt_sources = opt.dataset.adapt.data
    model.ml_criterion = (
        multilanguage_criterion if len(adapt_sources) > 0 else None
    )

# Run

In [9]:
# Default dataset location. setdefault keeps a DATA_PATH that was already
# exported before the notebook started instead of silently overwriting it.
os.environ.setdefault("DATA_PATH", "/home/ec2-user/SageMaker/data/")

In [10]:
options = "options/adapt/foodi-ml/i2t.yaml"

In [11]:
# Minimal stand-in for the command-line arguments read later in the notebook.
# `local_rank` is set explicitly: an addict Dict returns an empty Dict for a
# missing attribute, so `args.local_rank` would otherwise hand `{}` to
# get_loaders instead of an integer rank.
args = Dict({
    "options": options,
    "local_rank": 0,  # assumes a single-process run — TODO confirm against get_loaders
})
opt = load_yaml_opts(args.options)

In [12]:
logger = create_logger(level='debug' if opt.engine.debug else 'info')
#logger.info(f'Used args   : \n{args}')
#logger.info(f'Used options: \n{opt}')

In [13]:
# Get path of the data
data_path = get_data_path(opt)

In [14]:
# Get loaders
train_loader, val_loaders, adapt_loaders = get_loaders(data_path, args.local_rank, opt)

2021-08-17 11:37:16,910 - [INFO    ] - Loaded vocab containing 2487 tokens
2021-08-17 11:37:16,911 - [INFO    ] - Loaded from .vocab_cache/foodiml_vocab.json.
2021-08-17 11:37:16,911 - [INFO    ] - Created tokenizer with init 2487 tokens.
2021-08-17 11:37:16,955 - [INFO    ] - [FoodiML] Loaded 8011 images and 8011 annotations.
2021-08-17 11:37:16,960 - [INFO    ] - Loaded vocab containing 2487 tokens
2021-08-17 11:37:16,960 - [INFO    ] - Loaded from .vocab_cache/foodiml_vocab.json.
2021-08-17 11:37:16,960 - [INFO    ] - Created tokenizer with init 2487 tokens.
2021-08-17 11:37:16,997 - [INFO    ] - [FoodiML] Loaded 0 images and 0 annotations.
2021-08-17 11:37:16,998 - [INFO    ] - Adapt loaders: 0


In [15]:
tokenizers = get_tokenizers(train_loader)

In [16]:
model = Retrieval(**opt.model, tokenizers=tokenizers)

2021-08-17 11:37:18,512 - [INFO    ] - Image encoder created: ('full_image',)
2021-08-17 11:37:18,804 - [INFO    ] - Text encoder created: gru_glove
2021-08-17 11:37:18,874 - [INFO    ] - Created similarity: AdaptiveEmbeddingI2T(
  (norm): Normalization(
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  )
  (adapt_txt): ADAPT(
    (fc_gamma): Sequential(
      (0): Linear(in_features=2048, out_features=2048, bias=True)
    )
    (fc_beta): Sequential(
      (0): Linear(in_features=2048, out_features=2048, bias=True)
    )
  )
  (fovea): Fovea(smooth=10,train_smooth: False)
)
2021-08-17 11:37:21,803 - [INFO    ] - Setting devices: img: cuda,txt: cuda, loss: cuda
2021-08-17 11:37:21,804 - [INFO    ] - Using similarity: ('adapt_i2t',)


<font color='red'> **Deep dive on Retrieval --------------------- (1)**</font>

In [17]:
from retrieval.model.model import Retrieval

In [18]:
opt.model['img_enc']['name']

'full_image'

In [19]:
model = Retrieval(**opt.model, tokenizers=tokenizers)

2021-08-17 11:37:23,273 - [INFO    ] - Image encoder created: ('full_image',)
2021-08-17 11:37:23,563 - [INFO    ] - Text encoder created: gru_glove
2021-08-17 11:37:23,633 - [INFO    ] - Created similarity: AdaptiveEmbeddingI2T(
  (norm): Normalization(
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  )
  (adapt_txt): ADAPT(
    (fc_gamma): Sequential(
      (0): Linear(in_features=2048, out_features=2048, bias=True)
    )
    (fc_beta): Sequential(
      (0): Linear(in_features=2048, out_features=2048, bias=True)
    )
  )
  (fovea): Fovea(smooth=10,train_smooth: False)
)
2021-08-17 11:37:23,779 - [INFO    ] - Setting devices: img: cuda,txt: cuda, loss: cuda
2021-08-17 11:37:23,780 - [INFO    ] - Using similarity: ('adapt_i2t',)


In [20]:
txt_enc = model.txt_enc
img_enc = model.img_enc

In [21]:
txt_pool = model.txt_pool 
img_pool = model.img_pool

In [22]:
gen_loader = iter(train_loader)
batch = next(gen_loader)

In [23]:
batch = next(gen_loader)

#### Image embeddings

In [24]:
type(batch) == dict

True

In [25]:
input_batch = batch['image']
input_batch = input_batch.to(model.img_enc.device)

In [26]:
img_tensor = img_enc(input_batch)

In [27]:
print("input_batch: \t", input_batch.shape)
print("img_tensor: \t", img_tensor.shape)

input_batch: 	 torch.Size([10, 3, 224, 224])
img_tensor: 	 torch.Size([10, 2048, 49])


In [28]:
img_embed  = model.embed_image_features(img_tensor)

In [29]:
features = img_enc.cnn(input_batch)

In [30]:
features.shape

torch.Size([10, 2048, 7, 7])

#### Text embeddings

In [31]:
input_batch = batch

In [32]:
txt_tensor, lengths = txt_enc(input_batch)

In [33]:
txt_tensor.shape

torch.Size([10, 31, 2048])

In [34]:
txt_tensor = model.embed_caption_features(txt_tensor, lengths)

In [35]:
txt_tensor.shape

torch.Size([10, 31, 2048])

In [36]:
lengths

[13, 9, 31, 10, 6, 11, 29, 10, 7, 10]

In [37]:
batch['caption'][0].shape

torch.Size([10, 31])

#### Forward batch

In [38]:
print(batch['image'].shape)
print(batch['caption'][0].shape)

torch.Size([10, 3, 224, 224])
torch.Size([10, 31])


In [39]:
img_embed, txt_embed = model.forward_batch(batch)

In [40]:
print("img_embed", img_embed.shape)
print("txt_embed", txt_embed.shape)

img_embed torch.Size([10, 2048, 49])
txt_embed torch.Size([10, 31, 2048])


<font color='red'> **Deep dive on Similarity --------------------- (2)**</font>

In [41]:
from retrieval.model.similarity import similarity as sim

In [42]:
sim_obj = sim.AdaptiveEmbeddingI2T(
    **opt.model["similarity"]['params']
)

In [43]:
similarity = sim.Similarity(
    device=img_embed.device,
    similarity_object=sim_obj)

2021-08-17 11:37:35,073 - [INFO    ] - Created similarity: AdaptiveEmbeddingI2T(
  (norm): Normalization(
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  )
  (adapt_txt): ADAPT(
    (fc_gamma): Sequential(
      (0): Linear(in_features=2048, out_features=2048, bias=True)
    )
    (fc_beta): Sequential(
      (0): Linear(in_features=2048, out_features=2048, bias=True)
    )
  )
  (fovea): Fovea(smooth=10,train_smooth: False)
)


In [44]:
# Forward AdaptiveI2T
# Swap the last two axes of the caption embeddings
# (printed in this notebook as [10, 31, 2048] -> [10, 2048, 31]).
cap_embedp = txt_embed.permute(0, 2, 1)
# permute(0, 1, 2) keeps the axis order unchanged, so img_embedp has the
# same layout as img_embed.
img_embedp = img_embed.permute(0, 1, 2)

In [76]:
BB, LT, KK = img_embed.shape

In [79]:
LT == model.latent_size

True

In [45]:
print("cap_embedp", cap_embedp.shape)
print("img_embedp", img_embedp.shape)

cap_embedp torch.Size([10, 2048, 31])
img_embedp torch.Size([10, 2048, 49])


In [47]:
cap_embedp = model.similarity.similarity.norm(cap_embedp)

In [48]:
cap_embedp.shape

torch.Size([10, 2048, 31])

In [49]:
sims = torch.zeros(
    img_embedp.shape[0], cap_embedp.shape[0]
).to(similarity.device)

In [50]:
sims.shape

torch.Size([10, 10])

In [51]:
# Global image representation: average over the last axis.
# Computed from `img_embed` (cell 44 only applies an identity permute to it)
# rather than from `img_embedp` itself, so re-running this cell does not
# average an already-reduced tensor a second time.
img_embedp = img_embed.mean(-1)

In [52]:
img_embedp.shape

torch.Size([10, 2048])

In [53]:
for i, img_tensor in enumerate(img_embedp):
    break

In [54]:
img_tensor.shape

torch.Size([2048])

In [55]:
img_vector = img_tensor.unsqueeze(0)

In [56]:
img_vector.shape

torch.Size([1, 2048])

In [57]:
cap_embedp.shape

torch.Size([10, 2048, 31])

In [58]:
txt_output = model.similarity.similarity.adapt_txt(value=cap_embedp, query=img_vector)

In [59]:
txt_output.shape

torch.Size([10, 2048, 31])

In [60]:
txt_output = model.similarity.similarity.fovea(txt_output)

In [61]:
txt_output.shape

torch.Size([10, 2048, 31])

In [62]:
txt_vector = txt_output.max(dim=-1)[0]

In [63]:
txt_vector.shape

torch.Size([10, 2048])

In [64]:
from retrieval.model.similarity.measure import l2norm, cosine_sim

In [65]:
txt_vector = l2norm(txt_vector, dim=-1)
img_vector = l2norm(img_vector, dim=-1)

In [66]:
print("txt_vector", txt_vector.shape)
print("img_vector", img_vector.shape)

txt_vector torch.Size([10, 2048])
img_vector torch.Size([1, 2048])


In [67]:
# NOTE(review): this rebinds `sim`, which cell 41 imported as the alias of the
# `similarity` module. Once this cell has run, `sim.AdaptiveEmbeddingI2T`
# (cell 42) no longer resolves until cell 41 is executed again.
sim = cosine_sim(img_vector, txt_vector)

In [68]:
sim.shape

torch.Size([1, 10])

In [69]:
# `sim` is [1, n_captions] (see the shape printed above), so the singleton
# axis is axis 0; squeeze(-1) left the tensor unchanged.
sim = sim.squeeze(0)
sims[i,:] = sim

In [70]:
sims.shape

torch.Size([10, 10])

In [71]:
# Image-to-text similarity matrix: one row per image, one column per caption.
adaptive_sim = model.similarity.similarity  # loop-invariant lookup, hoisted
sims = torch.zeros(
    img_embedp.shape[0], cap_embedp.shape[0],
    device=model.similarity.device,  # allocate directly on the target device
)

# Loop
for i, img_tensor in enumerate(img_embedp):
    # Add a leading axis so the single image vector is [1, dim].
    img_vector = img_tensor.unsqueeze(0)
    txt_output = adaptive_sim.adapt_txt(value=cap_embedp, query=img_vector)
    txt_output = adaptive_sim.fovea(txt_output)
    # Max over the last axis; [0] selects the values (not the indices).
    txt_vector = txt_output.max(dim=-1)[0]
    txt_vector = l2norm(txt_vector, dim=-1)
    img_vector = l2norm(img_vector, dim=-1)
    # Named `sim_row` (not `sim`) so the `sim` module alias is not rebound
    # by this cell; squeeze(0) drops the leading singleton axis of the
    # [1, n_captions] result.
    sim_row = cosine_sim(img_vector, txt_vector).squeeze(0)
    sims[i, :] = sim_row

In [72]:
sims

tensor([[0.6651, 0.6212, 0.6875, 0.6416, 0.6255, 0.6668, 0.6969, 0.6564, 0.6269,
         0.6352],
        [0.5615, 0.5528, 0.5802, 0.5712, 0.5328, 0.5683, 0.5993, 0.5580, 0.5433,
         0.5589],
        [0.6139, 0.5952, 0.6394, 0.6137, 0.5862, 0.6269, 0.6562, 0.6014, 0.5851,
         0.5999],
        [0.6303, 0.6114, 0.6458, 0.6136, 0.5981, 0.6273, 0.6579, 0.6276, 0.5954,
         0.6078],
        [0.6168, 0.6007, 0.6551, 0.6016, 0.5901, 0.6332, 0.6643, 0.6190, 0.5978,
         0.5954],
        [0.6279, 0.6105, 0.6594, 0.6227, 0.6050, 0.6505, 0.6662, 0.6333, 0.6054,
         0.6117],
        [0.6615, 0.6251, 0.6819, 0.6493, 0.6370, 0.6686, 0.7018, 0.6481, 0.6418,
         0.6342],
        [0.6455, 0.6314, 0.6830, 0.6615, 0.6256, 0.6717, 0.6945, 0.6605, 0.6277,
         0.6413],
        [0.6522, 0.6242, 0.6739, 0.6367, 0.6214, 0.6589, 0.6996, 0.6566, 0.6106,
         0.6315],
        [0.6272, 0.5965, 0.6526, 0.6178, 0.6004, 0.6255, 0.6654, 0.6303, 0.5992,
         0.5966]], device='c

<font color='red'> **Finish Deep dive on Similarity --------------------- (2)**</font>

<font color='red'> **Finish Deep dive on Retrieval --------------------- (1)**</font>