# DEEP DIVE MODEL

In [1]:
import os
# Work from the project directory: the options file used below is given as a
# relative path. NOTE(review): this absolute path is machine-specific; adjust
# it when running outside the original environment.
os.chdir("/home/ec2-user/SageMaker/foodi-ml/")

In [2]:
#!pip install -r requirements.txt

In [3]:
import os
import torch
from tqdm import tqdm

import params
from retrieval.train import train
from retrieval.utils import helper
from retrieval.model import loss
from retrieval.model.model import Retrieval
from retrieval.data.loaders import get_loaders
from retrieval.utils.logger import create_logger
from retrieval.utils.helper import load_model
from retrieval.utils.file_utils import load_yaml_opts, parse_loader_name

In [4]:
from addict import Dict

# Functions

In [5]:
def get_data_path(opt):
    """Return the dataset root directory.

    The ``DATA_PATH`` environment variable takes precedence when it is set;
    otherwise ``opt.dataset.data_path`` is returned.

    Raises:
        Exception: if ``DATA_PATH`` is not set and ``opt.dataset.data_path``
            is empty/falsy.
    """
    # Environment variable wins over the options file.
    if 'DATA_PATH' in os.environ:
        return os.environ['DATA_PATH']

    configured_path = opt.dataset.data_path
    if configured_path:
        return configured_path

    raise Exception('''
                DATA_PATH not specified.
                Please, run "$ export DATA_PATH=/path/to/dataset"
                or add path to yaml file
            ''')

In [6]:
def get_tokenizers(train_loader):
    """Return the tokenizer(s) of the training dataset as a list.

    Args:
        train_loader: loader whose ``dataset.tokenizer`` attribute holds either
            a single tokenizer or a list of tokenizers.

    Returns:
        list: the tokenizers; a lone tokenizer is wrapped in a one-element
        list, an existing list is returned unchanged.
    """
    tokenizers = train_loader.dataset.tokenizer
    # isinstance (rather than an exact type comparison) keeps list subclasses
    # as they are instead of nesting them inside another list.
    if not isinstance(tokenizers, list):
        tokenizers = [tokenizers]
    return tokenizers

In [7]:
def set_criterion(opt, model):
    """Build the multimodal and multilanguage criteria and attach them to ``model``.

    When ``opt.criterion`` contains a ``'name'`` key, both criteria are created
    with ``loss.get_loss(**opt.criterion)``. Otherwise ``loss.ContrastiveLoss``
    is used, configured from ``opt.criterion`` (multimodal) and
    ``opt.ml_criterion`` (multilanguage). Nothing is returned: the criteria are
    handed to ``set_model_criterion``.

    Relies on the notebook-level ``logger`` and ``loss`` names being defined
    before it is called.

    NOTE(review): in the named-loss case ``opt.ml_criterion`` is never read —
    confirm that sharing ``opt.criterion`` for both criteria is intended.
    """
    use_named_loss = 'name' in opt.criterion
    if use_named_loss:
        logger.info(opt.criterion)
        build_loss = loss.get_loss
    else:
        build_loss = loss.ContrastiveLoss

    mm_criterion = build_loss(**opt.criterion)
    ml_options = opt.criterion if use_named_loss else opt.ml_criterion
    ml_criterion = build_loss(**ml_options)

    set_model_criterion(opt, model, ml_criterion, mm_criterion)


def set_model_criterion(opt, model, multilanguage_criterion, multimodal_criterion):
    """Attach the loss criteria to ``model``.

    ``model.mm_criterion`` always receives ``multimodal_criterion``.
    ``model.ml_criterion`` receives ``multilanguage_criterion`` only when
    ``opt.dataset.adapt.data`` is non-empty; otherwise it is set to ``None``.
    """
    model.mm_criterion = multimodal_criterion
    model.ml_criterion = None

    adapt_sets = opt.dataset.adapt.data
    if len(adapt_sets) == 0:
        # No adapt datasets configured: leave the multilanguage criterion unset.
        return
    model.ml_criterion = multilanguage_criterion

# Run

In [8]:
# Default dataset location for the original environment. A DATA_PATH that is
# already exported is kept, so the notebook can be pointed at another dataset
# without editing this cell.
if "DATA_PATH" not in os.environ:
    os.environ["DATA_PATH"] = "/home/ec2-user/SageMaker/data/"

In [9]:
# YAML options file to load; the path is relative to the current working directory.
options = "options/adapt/foodi-ml/i2t.yaml"

In [10]:
# Arguments read by the cells below. `local_rank` is listed explicitly because
# the loaders cell passes `args.local_rank` on: an addict `Dict` hands back an
# empty `Dict` for a missing attribute instead of raising, so omitting it would
# silently pass `{}` as the rank. 0 is the rank of a single-process run.
args = Dict({
    "options": options,
    "local_rank": 0,
})
opt = load_yaml_opts(args.options)

In [11]:
# Logging verbosity follows the `engine.debug` flag of the loaded options.
log_level = 'debug' if opt.engine.debug else 'info'
logger = create_logger(level=log_level)
#logger.info(f'Used args   : \n{args}')
#logger.info(f'Used options: \n{opt}')

In [12]:
# Resolve the dataset root: the DATA_PATH environment variable when set,
# otherwise `opt.dataset.data_path` (see get_data_path).
data_path = get_data_path(opt)

In [13]:
# Build the training, validation and adapt loaders from the dataset root and
# the loaded options. In the recorded run the log reports 0 adapt loaders.
train_loader, val_loaders, adapt_loaders = get_loaders(data_path, args.local_rank, opt)

2021-08-16 18:56:03,824 - [INFO    ] - Loaded vocab containing 2487 tokens
2021-08-16 18:56:03,824 - [INFO    ] - Loaded from .vocab_cache/foodiml_vocab.json.
2021-08-16 18:56:03,825 - [INFO    ] - Created tokenizer with init 2487 tokens.
2021-08-16 18:56:03,868 - [INFO    ] - [FoodiML] Loaded 8011 images and 8011 annotations.
2021-08-16 18:56:03,873 - [INFO    ] - Loaded vocab containing 2487 tokens
2021-08-16 18:56:03,873 - [INFO    ] - Loaded from .vocab_cache/foodiml_vocab.json.
2021-08-16 18:56:03,873 - [INFO    ] - Created tokenizer with init 2487 tokens.
2021-08-16 18:56:03,910 - [INFO    ] - [FoodiML] Loaded 0 images and 0 annotations.
2021-08-16 18:56:03,910 - [INFO    ] - Adapt loaders: 0


In [14]:
# Tokenizer(s) of the training dataset, always as a list (see get_tokenizers).
tokenizers = get_tokenizers(train_loader)

In [15]:
# Instantiate the retrieval model from the `model` section of the options,
# passing it the training tokenizers.
model = Retrieval(**opt.model, tokenizers=tokenizers)

2021-08-16 18:56:04,555 - [INFO    ] - Image encoder created: ('resnet50',)
2021-08-16 18:56:04,662 - [INFO    ] - Text encoder created: gru_glove
2021-08-16 18:56:04,663 - [INFO    ] - Created similarity: Cosine()
2021-08-16 18:56:07,547 - [INFO    ] - Setting devices: img: cuda,txt: cuda, loss: cuda
2021-08-16 18:56:07,548 - [INFO    ] - Using similarity: ('cosine',)


<font color='red'>**Deep dive on Retrieval --------------------- (1)**</font>

In [16]:
from retrieval.model.model import Retrieval

In [17]:
# Select the image encoder by name before rebuilding the model below.
# NOTE(review): the first build above already logged 'resnet50' as the image
# encoder, so this may be a no-op — confirm whether the override is needed.
opt.model['img_enc']['name'] = 'resnet50'

In [18]:
# Rebuild the model with the (possibly modified) options; this replaces the
# `model` instance created earlier in the notebook.
model = Retrieval(**opt.model, tokenizers=tokenizers)

2021-08-16 18:56:08,177 - [INFO    ] - Image encoder created: ('resnet50',)
2021-08-16 18:56:08,283 - [INFO    ] - Text encoder created: gru_glove
2021-08-16 18:56:08,284 - [INFO    ] - Created similarity: Cosine()
2021-08-16 18:56:08,337 - [INFO    ] - Setting devices: img: cuda,txt: cuda, loss: cuda
2021-08-16 18:56:08,338 - [INFO    ] - Using similarity: ('cosine',)


In [19]:
# Shortcuts to the model's text and image encoders.
txt_enc, img_enc = model.txt_enc, model.img_enc

In [20]:
# Shortcuts to the model's `txt_pool` and `img_pool` attributes.
txt_pool, img_pool = model.txt_pool, model.img_pool

In [21]:
# Pull one batch from the training loader to step through the model by hand.
gen_loader = iter(train_loader)
batch = next(gen_loader)

In [22]:
# Move on to the next batch; the one fetched above is discarded.
# NOTE: every re-run of this cell advances `gen_loader`, so the batch
# inspected below depends on how many times this cell has been executed.
batch = next(gen_loader)

#### Image embeddings

In [44]:
# The batch is expected to be a dict (it is indexed by 'image' / 'caption'
# below); isinstance is the idiomatic type check and also accepts dict subclasses.
isinstance(batch, dict)

True

In [45]:
# Take the image tensor from the batch and move it to the image encoder's device.
input_batch = batch['image'].to(model.img_enc.device)

In [46]:
# Run the image encoder on the batch of images.
# Recorded shapes: input (10, 3, 224, 224) -> output (10, 49, 1024).
img_tensor = img_enc(input_batch)

In [50]:
# Report the encoder's input and output shapes side by side.
for _label, _shape in (
    ("input_batch: \t", input_batch.shape),
    ("img_tensor: \t", img_tensor.shape),
):
    print(_label, _shape)

input_batch: 	 torch.Size([10, 3, 224, 224])
img_tensor: 	 torch.Size([10, 49, 1024])


In [29]:
# Pass the raw image-encoder output through the model's `embed_image_features` step.
# NOTE(review): this cell's execution count (29) is lower than that of the cell
# defining `img_tensor` (46); re-run top to bottom to confirm it still works.
img_embed  = model.embed_image_features(img_tensor)

In [51]:
# Show the image encoder's module structure (the recorded output is cut off).
img_enc

FullImageEncoder(
  (cnn): BaseFeatures(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace)
        (downsample): Sequential(
         

In [52]:
# Check the encoder's `proj_regions` attribute (True in the recorded run).
img_enc.proj_regions

True

#### Text embeddings

In [31]:
# The text encoder below is called with the whole batch dict rather than a
# single tensor. Note this rebinds `input_batch`, which previously held the
# image tensor.
input_batch = batch

In [32]:
# Run the text encoder; it returns the text features together with `lengths`
# (a list with one integer per caption in the recorded run — presumably the
# unpadded caption lengths; confirm against the encoder).
txt_tensor, lengths = txt_enc(input_batch)

In [33]:
# Shape of the raw text-encoder output — (10, 25, 1024) in the recorded run.
txt_tensor.shape

torch.Size([10, 25, 1024])

In [34]:
# Pass the raw text features through the model's `embed_caption_features` step.
# NOTE: this overwrites `txt_tensor` with its own transformed value, so the
# cell is not idempotent — re-run the text-encoder cell before re-running it.
txt_tensor = model.embed_caption_features(txt_tensor, lengths)

In [35]:
# Shape after `embed_caption_features` — still (10, 25, 1024) in the recorded run.
txt_tensor.shape

torch.Size([10, 25, 1024])

In [36]:
# Inspect the lengths returned by the text encoder (10 values, maximum 25 in the recorded run).
lengths

[12, 8, 13, 12, 19, 20, 25, 11, 24, 8]

In [37]:
# Shape of the first element of the batch's 'caption' entry — (10, 25) in the recorded run.
batch['caption'][0].shape

torch.Size([10, 25])

#### Forward batch

In [38]:
# Let the model produce both embeddings for the batch in a single call;
# this rebinds `img_embed` from the manual step above.
img_embed, txt_embed = model.forward_batch(batch)

In [39]:
# Image embedding shape from forward_batch — (10, 49, 1024) in the recorded run.
img_embed.shape

torch.Size([10, 49, 1024])

In [40]:
# Text embedding shape from forward_batch — (10, 25, 1024) in the recorded run.
txt_embed.shape

torch.Size([10, 25, 1024])

In [42]:
# In the recorded run this call fails: both embeddings are still 3-D (see the
# shapes above) and the call ends up in `.t()`, which only accepts tensors
# with at most 2 dimensions. The error is caught and printed so the notebook
# still runs top to bottom.
# TODO(review): reduce the embeddings to 2-D before this call (or use a
# similarity that supports 3-D inputs) — confirm the intended pipeline.
try:
    sim_matrix = model.get_sim_matrix(img_embed, txt_embed)
except RuntimeError as err:
    sim_matrix = None
    print(f"RuntimeError: {err}")
sim_matrix

RuntimeError: t() expects a tensor with <= 2 dimensions, but self is 3D

<font color='red'>**End of deep dive on Retrieval --------------------- (1)**</font>