# Deep Dive Evaluation during Training

Execution (running the project's `run.py` from a shell):

```{bash}
cd /home/ec2-user/SageMaker/foodi-ml
source activate python3
export DATA_PATH=/home/ec2-user/SageMaker/data/
python run.py options/adapt/foodi-ml/i2t.yaml

#nvidia-smi
```

In [1]:
# Work from the repository root: the relative `options/...` YAML path used
# further down is resolved against the current working directory.
# Presumably this is also what makes `params` and `retrieval` importable —
# confirm against the kernel's sys.path.
import os
os.chdir("/home/ec2-user/SageMaker/foodi-ml/")

In [2]:
import os
import numpy as np
import torch
from tqdm import tqdm
from addict import Dict

import params
from retrieval.train import train
from retrieval.utils import helper
from retrieval.model import loss
from retrieval.model.model import Retrieval
from retrieval.data.loaders import get_loaders
from retrieval.utils.logger import create_logger
from retrieval.utils.helper import load_model
from retrieval.utils.file_utils import load_yaml_opts, parse_loader_name

# Functions

In [3]:
def get_data_path(opt):
    """Resolve the dataset root directory.

    The ``DATA_PATH`` environment variable takes precedence. When it is not
    set, ``opt.dataset.data_path`` is used instead.

    Args:
        opt: options object exposing ``dataset.data_path``.

    Returns:
        The dataset path taken from the environment or from ``opt``.

    Raises:
        Exception: if ``DATA_PATH`` is not in the environment and
            ``opt.dataset.data_path`` is empty/falsy.
    """
    # Environment wins; `opt` is not consulted in that case.
    if 'DATA_PATH' in os.environ:
        return os.environ['DATA_PATH']

    yaml_path = opt.dataset.data_path
    if yaml_path:
        return yaml_path

    raise Exception('''
                DATA_PATH not specified.
                Please, run "$ export DATA_PATH=/path/to/dataset"
                or add path to yaml file
            ''')

In [4]:
def get_tokenizers(train_loader):
    """Return the training dataset's tokenizer(s) as a list.

    Args:
        train_loader: object exposing ``dataset.tokenizer``, which may be a
            single tokenizer or a list of tokenizers.

    Returns:
        list: ``dataset.tokenizer`` itself when it is already a list
        (including list subclasses), otherwise a one-element list wrapping it.
    """
    tokenizers = train_loader.dataset.tokenizer
    # isinstance rather than an exact type comparison, so that a list subclass
    # is passed through instead of being nested inside another list.
    if not isinstance(tokenizers, list):
        tokenizers = [tokenizers]
    return tokenizers

In [5]:
def set_criterion(opt, model):
    """Build the loss criteria described by ``opt`` and attach them to ``model``.

    When ``opt.criterion`` contains a ``'name'`` key, both the multimodal and
    the multilanguage criterion are created with ``loss.get_loss`` from the
    same ``opt.criterion`` options (which are also logged). Otherwise both are
    ``loss.ContrastiveLoss`` instances, built from ``opt.criterion`` and
    ``opt.ml_criterion`` respectively.

    The criteria are attached through ``set_model_criterion``; nothing is
    returned. Relies on a module-level ``logger`` being defined before the call.
    """
    criterion_opts = opt.criterion
    if 'name' not in criterion_opts:
        mm_loss = loss.ContrastiveLoss(**criterion_opts)
        ml_loss = loss.ContrastiveLoss(**opt.ml_criterion)
    else:
        logger.info(criterion_opts)
        mm_loss = loss.get_loss(**criterion_opts)
        ml_loss = loss.get_loss(**criterion_opts)
    set_model_criterion(
        opt,
        model,
        multilanguage_criterion=ml_loss,
        multimodal_criterion=mm_loss,
    )


def set_model_criterion(opt, model, multilanguage_criterion, multimodal_criterion):
    """Attach the criteria to ``model``.

    ``model.mm_criterion`` always receives ``multimodal_criterion``.
    ``model.ml_criterion`` receives ``multilanguage_criterion`` only when
    ``opt.dataset.adapt.data`` is non-empty; otherwise it is set to ``None``.
    """
    model.mm_criterion = multimodal_criterion
    model.ml_criterion = (
        multilanguage_criterion if len(opt.dataset.adapt.data) > 0 else None
    )

# Run

In [6]:
# Point the loaders at the dataset root; get_data_path() prefers this
# environment variable over the value in the YAML file.
os.environ["DATA_PATH"] = "/home/ec2-user/SageMaker/data/"

# Experiment configuration (path is relative to the working directory).
options = "options/adapt/foodi-ml/i2t.yaml"

# Wrap the options path in an addict Dict so it can be read as args.options.
args = {"options": options}
args = Dict(args)
opt = load_yaml_opts(args.options)

# Module-level logger; set_criterion() reads this global.
logger = create_logger(level='debug' if opt.engine.debug else 'info')

# Get path of the data
data_path = get_data_path(opt)

# Get loaders
# NOTE(review): `args` only has an "options" key, so `args.local_rank` is not an
# explicit rank; addict's Dict presumably hands back an empty Dict for the
# missing key — confirm get_loaders() accepts that.
train_loader, val_loaders, adapt_loaders = get_loaders(data_path, args.local_rank, opt)

2021-08-18 11:34:59,924 - [INFO    ] - Loaded vocab containing 2487 tokens
2021-08-18 11:34:59,924 - [INFO    ] - Loaded from .vocab_cache/foodiml_vocab.json.
2021-08-18 11:34:59,925 - [INFO    ] - Created tokenizer with init 2487 tokens.
2021-08-18 11:34:59,967 - [INFO    ] - [FoodiML] Loaded 5608 images and 5608 annotations.
2021-08-18 11:34:59,971 - [INFO    ] - Loaded vocab containing 2487 tokens
2021-08-18 11:34:59,971 - [INFO    ] - Loaded from .vocab_cache/foodiml_vocab.json.
2021-08-18 11:34:59,972 - [INFO    ] - Created tokenizer with init 2487 tokens.
2021-08-18 11:35:00,010 - [INFO    ] - [FoodiML] Loaded 2403 images and 2403 annotations.
2021-08-18 11:35:00,013 - [INFO    ] - Adapt loaders: 0


In [7]:
tokenizers = get_tokenizers(train_loader)

In [8]:
model = Retrieval(**opt.model, tokenizers=tokenizers)

2021-08-18 11:35:02,507 - [INFO    ] - Image encoder created: ('full_image',)
2021-08-18 11:35:02,800 - [INFO    ] - Text encoder created: gru_glove
2021-08-18 11:35:02,871 - [INFO    ] - Created similarity: AdaptiveEmbeddingI2T(
  (norm): Normalization(
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  )
  (adapt_txt): ADAPT(
    (fc_gamma): Sequential(
      (0): Linear(in_features=2048, out_features=2048, bias=True)
    )
    (fc_beta): Sequential(
      (0): Linear(in_features=2048, out_features=2048, bias=True)
    )
  )
  (fovea): Fovea(smooth=10,train_smooth: False)
)
2021-08-18 11:35:05,829 - [INFO    ] - Setting devices: img: cuda,txt: cuda, loss: cuda
2021-08-18 11:35:05,829 - [INFO    ] - Using similarity: ('adapt_i2t',)


In [9]:
# Output function handed to the Trainer as `sysoutlog`: tqdm.write when
# model.master is truthy, otherwise an identity function that prints nothing.
print_fn = tqdm.write if model.master else (lambda x: x)
print_fn

<bound method tqdm.write of <class 'tqdm._tqdm.tqdm'>>

In [10]:
set_criterion(opt, model)

In [11]:
# Trainer
# Trainer
# Build the project's Trainer around the model. All options come from the YAML
# config; messages are routed through print_fn. world_size is hard-coded to 1
# (see the TODO).
trainer = train.Trainer(
    model=model,
    args=opt,
    sysoutlog=print_fn,
    path=opt.exp.outpath,
    world_size=1 # TODO
)

In [12]:
# Configure optimisation from the YAML options. Gradient-norm and histogram
# logging are switched off for this walkthrough.
trainer.setup_optim(
        lr=opt.optimizer.lr,
        lr_scheduler=opt.optimizer.lr_scheduler,
        clip_grad=opt.optimizer.grad_clip,
        log_grad_norm=False,
        log_histograms=False,
        optimizer=opt.optimizer,
        freeze_modules=opt.model.freeze_modules
)

2021-08-18 11:17:28,445 - [INFO    ] - lr 0.001
2021-08-18 11:17:28,446 - [INFO    ] - [0.5, 2.0, 4000]
2021-08-18 11:17:28,446 - [INFO    ] - [10000, 20000, 3000]


Freezing model.txt_enc.embed.glove
lr: 0.001, #layers: 478, #params: 99,845,812
Total Params: 102,349,912, 


# Save

In [13]:
#trainer.save(
#    path = "runs",
#    is_best = True,
#    epoch = 0,
#)

# Load

In [14]:
#trainer.load()

# Trainer

In [22]:
# Bind the loaders and engine options to the argument-style names used by the
# cells below. (`train_loader=train_loader` is a no-op kept for symmetry.)
train_loader=train_loader
valid_loaders=val_loaders
lang_loaders=adapt_loaders
nb_epochs=opt.engine.nb_epochs
valid_interval=opt.engine.valid_interval
log_interval=opt.engine.print_freq

In [23]:
print(train_loader)
print(valid_loaders)
print(lang_loaders)
print(nb_epochs)
print(valid_interval)
print(log_interval)

<torch.utils.data.dataloader.DataLoader object at 0x7f2f19473128>
[<torch.utils.data.dataloader.DataLoader object at 0x7f2f17a0efd0>]
[]
1
500
10


# Train epoch(Deep dive) -------------------------------- START

In [24]:
# The two bare names below have no effect (only a cell's last expression is
# displayed); they just list the inputs of the step being walked through.
train_loader
lang_loaders
epoch = 0
valid_loaders=val_loaders
# Overrides the log_interval taken from opt.engine.print_freq above (printed as 10).
log_interval=50
valid_interval=500

In [25]:
batch = next(iter(train_loader))

## Run Evaluation (Deep Dive) ----------------- START

### Evaluate Loaders (Deep Dive) ----------------- START

In [26]:
### Deep dive evaluate_loaders
# metrics, val_metric = self.evaluate_loaders(valid_loaders)

In [27]:
loaders = valid_loaders

In [28]:
loader_metrics = {}
final_sum = 0.
nb_loaders = len(loaders)

In [29]:
loader = loaders[0]

In [30]:
loader_name = str(loader.dataset)

#### Predict Loader (Deep Dive) ----------------- START

In [32]:
### Deep dive predict_loader
# img_emb, txt_emb, lens = evaluation.predict_loader(self.model, loader, self.device)

In [72]:
# arguments
model = trainer.model
data_loader = loader

In [73]:
img_embs, cap_embs, cap_lens = None, None, None
max_n_word = 77
model.eval()
print("Done")

Done


In [74]:
genload = iter(data_loader)
batch = next(genload)

In [75]:
len(batch['caption'][0])

32

In [33]:
ids = batch['index']
cap, lengths = batch['caption']
img_emb, cap_emb = model.forward_batch(batch)

In [34]:
batch['index']

array([2197, 4367, 3558, 5274, 2360, 3384, 5528,  636, 3013, 1841, 4718,
       4909,  348, 1521, 1983, 4447, 1248, 1648, 1692, 4815, 1114, 3026,
        615, 1953, 3752, 5025,  460, 4635, 3327,  922, 3286,  737])

In [78]:
img_emb.shape

torch.Size([32, 2048, 49])

In [79]:
cap_emb.shape

torch.Size([32, 36, 2048])

In [80]:
is_tensor = True
img_embs = np.zeros((len(data_loader.dataset), img_emb.size(1), img_emb.size(2)))
cap_embs = np.zeros((len(data_loader.dataset), max_n_word, cap_emb.size(2)))
cap_lens = [0] * len(data_loader.dataset)

In [81]:
max_n_word

77

In [82]:
len(cap_lens)

2403

In [83]:
img_embs.shape

(2403, 2048, 49)

In [84]:
cap_embs.shape

(2403, 77, 2048)

In [85]:
# cache embeddings
# Write this batch into the pre-allocated arrays at the indices given by `ids`
# (taken from batch['index']).
img_embs[ids] = img_emb.data.cpu().numpy()
if is_tensor:
    # Only the first max(lengths) positions are written; the remainder of the
    # max_n_word axis keeps its zero initialisation.
    cap_embs[ids,:max(lengths),:] = cap_emb.data.cpu().numpy()
else:
    # Not taken in this walkthrough: is_tensor is set to True above.
    cap_embs[ids,] = cap_emb.data.cpu().numpy()
# NOTE(review): cap_lens is not filled in by any cell of this walkthrough and
# stays all zeros; the `lens` used further down comes from predict_loader().

In [86]:
img_embs.shape

(2403, 2048, 49)

In [93]:
cap_embs.shape

(2403, 77, 2048)

In [95]:
len(cap_lens)

2403

#### Predict Loader (Deep Dive) ----------------- END

In [35]:
from retrieval.train.evaluation import predict_loader

In [36]:
img_emb, txt_emb, lens = predict_loader(trainer.model, loader, trainer.device)

                                                       

In [37]:
print(img_emb.shape)
print(txt_emb.shape)
print(len(lens))

(2403, 2048, 49)
(2403, 77, 2048)
2403


In [38]:
from retrieval.train.evaluation import evaluate

In [106]:
result = evaluate(
    model=trainer.model, 
    img_emb=img_emb,
    txt_emb=txt_emb, 
    lengths=lens,
    device=trainer.device, 
    shared_size=128
)

                                                       

In [107]:
result

{'pred_time': 1.3503006880000612,
 'sim_time': 233.0349687279995,
 'i2t_r1': 0.08322929671244278,
 'i2t_r5': 0.16645859342488556,
 'i2t_r10': 0.4577611319184353,
 'i2t_medr': 1205.0,
 'i2t_meanr': 1200.3354140657511,
 't2i_r1': 0.0,
 't2i_r5': 0.20807324178110695,
 't2i_r10': 0.4577611319184353,
 't2i_medr': 1187.0,
 't2i_meanr': 1197.8322929671244,
 'rsum': 1.373283395755306}

#### Evaluate (Deep Dive) ----------------- START

In [39]:
model = trainer.model
img_emb = img_emb
txt_emb = txt_emb
lengths = lens
device = trainer.device
shared_size=128
return_sims=False

In [40]:
model.eval()
_metrics_ = ('r1', 'r5', 'r10', 'medr', 'meanr')

In [41]:
img_emb = torch.FloatTensor(img_emb).to(device)
txt_emb = torch.FloatTensor(txt_emb).to(device)

In [45]:
lengths = np.array(lengths)

In [35]:
#torch.save(img_emb, 'tmp/imb_emb.pt')
#torch.save(txt_emb, 'tmp/txt_emb.pt')
#np.save('tmp/lengths', lengths)

In [10]:
# Reload cached embeddings/lengths from tmp/. The only code in this notebook
# that writes these files is the save cell above, which is commented out, so
# this cell needs that cell to have been run once beforehand.
# NOTE(review): this rebinds `lengths` from the np.array built earlier to a
# plain list.
img_emb = torch.load('tmp/imb_emb.pt')
txt_emb = torch.load('tmp/txt_emb.pt')
lengths = list(np.load('tmp/lengths.npy'))

In [14]:
img_emb.shape

torch.Size([12, 2048, 49])

In [15]:
img_emb = img_emb[:12,:,:]
txt_emb = txt_emb[:12,:,:]
lengths = lengths[:12]

In [17]:
%%time
sims = model.get_sim_matrix_shared(
    embed_a=img_emb, 
    embed_b=txt_emb,
    lens=lengths, 
    shared_size=128
)

                                             

CPU times: user 9.84 ms, sys: 9.34 ms, total: 19.2 ms
Wall time: 18.1 ms




In [18]:
similarity = model.similarity

In [28]:
# Mirror the keyword arguments of the model.get_sim_matrix_shared(...) call as
# plain variables so its body can be stepped through cell by cell.
embed_a = img_emb
embed_b = txt_emb
# No trailing comma here: `lens=lengths,` would bind a 1-tuple `(lengths,)`
# instead of the lengths themselves.
lens = lengths
# Shard size for this walkthrough (the timed call above used 128).
shared_size = 12

In [29]:
img_embed = embed_a
cap_embed = embed_b
lens=lengths
shared_size=shared_size

In [30]:
# Number of shards per side: ceiling division of the item count by shared_size,
# so a final partial shard is still counted.
n_im_shard = (len(img_embed)-1)//shared_size + 1
n_cap_shard = (len(cap_embed)-1)//shared_size + 1

In [40]:
n_cap_shard

1

In [31]:
d = torch.zeros(len(img_embed), len(cap_embed)).cpu()

In [34]:
for i in range(1):
    print(i)

0


In [41]:
i = 0
j = 0

In [42]:
im_start = shared_size*i
im_end = min(shared_size*(i+1), len(img_embed))

In [43]:
im_start

0

In [44]:
im_end

12

In [45]:
cap_start = shared_size*j
cap_end = min(shared_size*(j+1), len(cap_embed))

In [46]:
cap_start

0

In [47]:
cap_end

12

In [50]:
im = img_embed[im_start:im_end]
s = cap_embed[cap_start:cap_end]
l = lens[cap_start:cap_end]

In [49]:
im.shape

torch.Size([12, 2048, 49])

In [51]:
s.shape

torch.Size([12, 77, 2048])

In [52]:
len(l)

12

In [53]:
sim = model.similarity.forward(im, s, l)

In [55]:
sim.shape
# rows are image

torch.Size([12, 12])

In [56]:
d.shape

torch.Size([12, 12])

In [57]:
sim2 = model.similarity.forward(img_embed, cap_embed, lens)

In [64]:
from retrieval.utils.layers import tensor_to_numpy

In [66]:
sim = tensor_to_numpy(sim)

In [67]:
sim.shape

(12, 12)

##### Image 2 Text metrics (Deep Dive) --- START

In [68]:
#i2t_metrics = i2t(sims)
#t2i_metrics = t2i(sims)

In [91]:
npts, ncaps = sim.shape
captions_per_image = ncaps // npts

In [92]:
ranks = np.zeros(npts)
top1 = np.zeros(npts)

In [127]:
index = 0

In [128]:
# similarity of a given image embedding, to all of the caption embeddings of validation
sims[index] 

tensor([0.4724, 0.5718, 0.5298, 0.5168, 0.5247, 0.5420, 0.5529, 0.4668, 0.5212,
        0.5628, 0.5902, 0.5679], grad_fn=<SelectBackward>)

In [129]:
# Caption indices sorted by decreasing similarity to image `index`:
# inds[0] is the index of the caption with the HIGHEST similarity.
inds = np.argsort(sim[index])[::-1]
inds

array([10,  1, 11,  9,  6,  5,  2,  4,  8,  3,  0,  7])

In [None]:
# Ideally, the first entry of `inds` would be the index of the image's own caption (0 for image 0).
# That is why we apply "where": it finds the position of that caption index (here 0) in the `inds` array.
# This position is the "rank"; the lower the better, with 0 being ideal.

In [113]:
# Ascendingly
np.argsort([1,3,4,5,0])

array([4, 0, 1, 2, 3])

In [114]:
# Descendingly
np.argsort([1,3,4,5,0])[::-1]

array([3, 2, 1, 0, 4])

In [96]:
# Score
rank = 1e20
begin = captions_per_image * index
end = captions_per_image * index + captions_per_image
print((begin, end))

(0, 1)


In [97]:
# Search `inds` for the position of the ground-truth caption index/indices.
for i in range(begin, end, 1):
    # `i` runs over the caption indices assigned to image `index`
    # (begin..end-1). The position of `i` inside `inds` is the 0-based rank at
    # which the TRUE caption was retrieved for this query; ideally that rank is
    # 0. Keep the best (smallest) rank found.
    tmp = np.where(inds == i)[0][0]
    if tmp < rank:
        rank = tmp

In [99]:
ranks[index] = rank
top1[index] = inds[0]

In [103]:
ranks

array([10.,  1.,  6.,  9.,  7.,  4.,  5., 11.,  9.,  2.,  0.,  1.])

In [100]:
# Image-to-text retrieval: for every image (row of `sim`) record the best rank
# reached by any of its own captions, plus the index of the top-1 caption.
npts, ncaps = sim.shape
captions_per_image = ncaps // npts

ranks = np.zeros(npts)
top1 = np.zeros(npts)
for index, row in enumerate(sim):
    # Caption indices ordered from most to least similar to this image.
    inds = np.argsort(row)[::-1]
    begin = captions_per_image * index
    end = begin + captions_per_image
    # Position of each ground-truth caption within the ordering; keep the
    # smallest. 1e20 is the sentinel used when there is no caption to look up.
    rank = min(
        (np.where(inds == i)[0][0] for i in range(begin, end)),
        default=1e20,
    )
    ranks[index] = rank
    top1[index] = inds[0]

In [104]:
ranks

array([10.,  1.,  6.,  9.,  7.,  4.,  5., 11.,  9.,  2.,  0.,  1.])

In [105]:
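# For image embedding 0, the most similar caption embedding is 10 (inds[0] == 10);
# its own caption (index 0) only appears at position 10 of `inds`, hence ranks[0] == 10.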

In [111]:
# Recall@K: percentage of queries whose 0-based rank is below K.
n_queries = len(ranks)
r1 = np.round(100.0 * np.count_nonzero(ranks < 1) / n_queries, 2)
r5 = np.round(100.0 * np.count_nonzero(ranks < 5) / n_queries, 2)
r10 = np.round(100.0 * np.count_nonzero(ranks < 10) / n_queries, 2)
# Median and mean rank are reported 1-based, hence the +1.
medr = np.round(np.floor(np.median(ranks)) + 1, 2)
meanr = np.round(np.mean(ranks) + 1, 2)

In [112]:
# Report the metrics computed in the previous cell, one per line.
for label, value in (
    ('r1', np.round(r1, 2)),
    ('r5', r5),
    ('r10', r10),
    ('medr', medr),
    ('meanr', meanr),
):
    print(label, value)

r1 8.33
r5 41.67
r10 83.33
medr 6.0
meanr 6.42


##### Image 2 Text metrics (Deep Dive) --- END

In [131]:
from retrieval.train.evaluation import t2i, i2t

In [133]:
t2i_metrics = t2i(sim)
i2t_metrics = i2t(sim)

In [135]:
t2i_metrics

(8.333333333333334, 33.333333333333336, 75.0, 7.0, 7.083333333333333)

In [134]:
i2t_metrics

(8.333333333333334,
 41.666666666666664,
 83.33333333333333,
 6.0,
 6.416666666666667)

In [136]:
rsum = np.sum(i2t_metrics[:3]) + np.sum(t2i_metrics[:3])

In [137]:
rsum

250.0

In [140]:
_metrics_ = ('r1', 'r5', 'r10', 'medr', 'meanr')

In [141]:
# Label each metric value with its direction prefix and metric name.
# NOTE(review): this rebinds i2t_metrics / t2i_metrics from the tuples returned
# by i2t()/t2i() to dicts, so the cell is not idempotent — run a second time,
# zip() would pair the metric names with the dicts' keys, not the metric values.
i2t_metrics = {f'i2t_{k}': v for k, v in zip(_metrics_, i2t_metrics)}
t2i_metrics = {f't2i_{k}': v for k, v in zip(_metrics_, t2i_metrics)}

In [142]:
i2t_metrics

{'i2t_r1': 8.333333333333334,
 'i2t_r5': 41.666666666666664,
 'i2t_r10': 83.33333333333333,
 'i2t_medr': 6.0,
 'i2t_meanr': 6.416666666666667}

In [None]:
# return metrics

#### Evaluate (Deep Dive) ----------------- END

In [143]:
#result = evaluation.evaluate(
#                model=self.model, img_emb=img_emb,
#                txt_emb=txt_emb, lengths=lens,
#                device=self.device, shared_size=128)

### Evaluate Loaders (Deep Dive) ----------------- END

## Run Evaluation (Deep Dive) ----------------- END

# Train epoch(Deep dive) -------------------------------- END