# Validation pipeline 

In [1]:
import torch 

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

device

device(type='cuda')

In [2]:
from rich import print

In [3]:
# Read the images from of the dataset 
import os 

img_folder = 'flicker_8k/photos'

if not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0:
    os.makedirs(img_folder, exist_ok=True)
    

In [4]:
import json 

data = [] 

with open("flicker_8k/flicker_8k.jsonl") as filino:

    for file_i in filino:

        dic_obj = json.loads(file_i)
        data.append(dic_obj)

In [5]:
print("Dataset size is: ", len(data) )

In [6]:
print(data[:10])

In [7]:
Check_id_duplication = [] 

In [8]:
for idx, data_obj in enumerate(data):

    Check_id_duplication.append(data_obj["id"])

In [9]:
# If the len is 1000, there is no duplicates

len(set(Check_id_duplication)) == 8091

False

In [10]:
# data = [
#     {'image_id': 0, 'id': 391895, 'caption': 'رجل يرتدي خوذة حمراء على دراجة بخارية صغيرة على طريق ترابي'},
#     {'image_id': 1, 'id': 522418, 'caption': 'امرأة ترتدي شبكة على رأسها تقطع كعكة'},
#     {'image_id': 2, 'id': 184613, 'caption': 'طفل يحمل مظلة مزهرة ويأكل ثورًا'},
# ]

# Sort the list of dictionaries based on the 'id' key
sorted_data = sorted(data, key=lambda x: x['id'])

print(sorted_data[:20])
# # Print the sorted list
# for item in sorted_data:
#     print(item)

In [11]:
# get only 10 examples
# sorted_data

In [12]:
len(sorted_data)

8091

In [13]:
print(sorted_data[:10])

In [14]:
image_name_list = []

for lin in sorted_data:
    # print(lin["image_name"])
    image_name_list.append(lin["image_name"])

In [15]:
sorted_data[0]

{'caption_ar': 'طفلة صغيرة تتسلق إلى مسرح خشبي',
 'image_name': '1000268201_693b08cb0e.jpg',
 'id': '1000268201_693b08cb0e'}

In [16]:
# Create a mapping dictionary between the ids and paths

id2path = {}


for im_path, sort_sample in zip(image_name_list, sorted_data):


    # print(json.loads(lin)["text"])
    # print(im_path.split("_")[-1].split(".")[0])

    input_str = im_path.split(".")[0]

    # Check the ids

    if sort_sample['id'] != input_str:
        print("stop ........................................................")
    id2path[input_str] = im_path

    # print(result)

In [17]:
# id2path

In [18]:
# Check if each image file exists in the folder

folder_path = "flicker_8k/photos"

missing_images = []

for image_path in image_name_list:
    full_image_path = os.path.join(folder_path, image_path)
    if not os.path.exists(full_image_path):
        missing_images.append(image_path)

if missing_images:
    print("The following images are missing:")
    for image_path in missing_images:
        print(image_path)
else:
    print("All images are present in the folder.")

In [19]:
# Delete the images that are not included on the testing dataset 

import os


not_exist_paths = []
exist_paths = [] 

# Get a list of all files in the folder
all_files = os.listdir(folder_path)

# Remove any files in the folder that are not in the list of image paths
for file_name in all_files:
    if file_name not in image_name_list:
        file_path = os.path.join(folder_path, file_name)
        os.remove(file_path)
        # print(f"Removed: {file_path}")
        not_exist_paths.append(file_path)

    elif file_name in image_name_list:

        exist_paths.append(file_name)


destroy_images = set(not_exist_paths).difference(set(exist_paths))


print("img_names", len(all_files))
print("destroy_images", len(destroy_images))
print("not_exist_paths", len(not_exist_paths))
print("remaining images", len(all_files)- len(destroy_images))

# print("Finished removing unwanted images.")

Define the the text model 

In [20]:
!pip install multilingual-clip

from multilingual_clip import pt_multilingual_clip
import transformers


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model_name = 'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus'

text_model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(model_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)




In [21]:
tokenizer("Test", padding=True, return_tensors='pt')

{'input_ids': tensor([[   0, 8647,    2]]), 'attention_mask': tensor([[1, 1, 1]])}

In [22]:
text_model


MultilingualCLIP(
  (transformer): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0): XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)

In [23]:
import numpy as np

In [24]:
language_model = lambda queries:  np.asarray(text_model.forward(queries, tokenizer).detach().to('cpu'))  

### Define the image model 

In [25]:
# clip_model, compose = clip.load('RN50x4')
import torch
import open_clip

device = "cuda" if torch.cuda.is_available() else "cpu"

print("Device: ", device)

clip_model, _, compose = open_clip.create_model_and_transforms('ViT-B-16-plus-240', pretrained="laion400m_e32")
# image_tokenizer = open_clip.get_tokenizer('ViT-B-16-plus-240')
# clip_model.to(device)


In [26]:
compose

Compose(
    Resize(size=240, interpolation=bicubic, max_size=None, antialias=None)
    CenterCrop(size=(240, 240))
    <function _convert_to_rgb at 0x7fac51d84940>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [27]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [28]:
clip_model.to(device)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 896, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((896,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0): ResidualAttentionBlock(
          (ln_1): LayerNorm((896,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=896, out_features=896, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((896,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=896, out_features=3584, bias=True)
            (gelu): GELU()
            (c_proj): Linear(in_features=3584, out_features=896, bias=True)
          )
          (ls_2): Identity()
        )
        (1): ResidualAttentionBlock(
          (ln_1): LayerNorm((896,), eps=1e-05, elementwise_affine

### Defind  the image model 

In [29]:
image_model = lambda images: np.asarray(clip_model.encode_image(images.to(device)).float().detach().to('cpu'))


# Utils

In [30]:
# Define the needed libraries in the code 

from tqdm.notebook import tqdm
import os 
from PIL import Image

### Defind a dataset class for images 

In [31]:

class CustomDataSet(torch.utils.data.Dataset):
    def __init__(self, main_dir, transform):
        self.main_dir = main_dir
        self.transform = transform
        self.total_imgs = image_name_list
        print(self.total_imgs[:20])

    def __len__(self):
        return len(self.total_imgs)

    def get_image_name(self, idx):

        return self.total_imgs[idx]

    def __getitem__(self, idx):
        img_loc = os.path.join(self.main_dir, self.total_imgs[idx])
        image = Image.open(img_loc)

        return self.transform(image)

### Defind a dataset class for text dataset  

In [32]:
class SimpleTextDataset(torch.utils.data.Dataset):

    def __init__(self, texts):
        """Define  the class init"""
        self.texts = texts

    def __len__(self):
        """Return the length of the text dataset"""
        return len(self.texts)

    def __getitem__(self, idx):
        """Get the item based on index"""
        return self.texts[idx]

In [33]:
def text_encoder(text):
    """Normalize the text embeddings"""
    embedding = language_model(text)
    embedding = embedding / np.linalg.norm(embedding)

    return embedding

def precompute_text_features(loader):
    """Compute the text embeddings of the whole dataset based on the loader provided"""
    text_features = []

    for _, (texts) in enumerate(tqdm(loader)):

        embedding = language_model(texts)
        embedding = embedding / np.linalg.norm(embedding)

        text_features.extend(embedding)

    return np.array(text_features)

In [34]:
def precompute_image_features(loader):
    image_features = []
    
    for i, (images) in enumerate(tqdm(loader)):

        features = image_model(images)

        features = features / np.linalg.norm(features)
        image_features.extend(features)

    return np.array(image_features)

In [35]:
def show_images(image_list):
    for im_path in image_list:
        print(im_path)
        display(Image.open(im_path))

In [36]:
# text = 'بجعة تطفو أسفل النهر بالقارب'

# image_paths = find_image(text, dataset, image_features, n=3)
# show_images(image_paths)

Build the image dataset 

In [37]:
dataset = CustomDataSet("flicker_8k/photos", transform=compose)

In [38]:
# check if the image_paths sorted_data in the same order of the image dataset:


for i, item in enumerate(sorted_data):

    if item['image_name'] != dataset.get_image_name(i):
        print("stop")
        break


In [39]:
len(dataset)

8091

### Define the image_loder

In [40]:
image_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=16,
    shuffle=False,
    num_workers=0,
    drop_last=False)

### Define the text_loder

In [41]:
lang = "ar"

if lang =="ar":
    captions_lang = "caption_ar"
else:
    captions_lang = "caption_en"


print(f"The language used in the captions is {captions_lang}")

In [42]:
text_dataset = SimpleTextDataset([elem[captions_lang] for elem in sorted_data])

text_loader = torch.utils.data.DataLoader(
    text_dataset,
    batch_size=64,
    shuffle=False)

In [43]:
print("We are processing: ", captions_lang)

In [44]:
# Check this to utalize the GPU memory in the images 
# https://discuss.pytorch.org/t/not-using-multiprocessing-but-getting-cuda-error-re-forked-subprocess/54610/8

In [45]:
import numpy as np

In [46]:
image_features = precompute_image_features(image_loader)

  0%|          | 0/506 [00:00<?, ?it/s]

In [47]:
image_emb_path = 'image_features_flicker_images_XLM_Roberta_Large_Vit_B_16Plus_ar.pickle'

In [48]:
text_emb_path = 'text_features_flicker_images_XLM_Roberta_Large_Vit_B_16Plus_ar.pickle'

In [49]:
import pickle


with open(image_emb_path, 'wb') as handle:
    pickle.dump(image_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [50]:
import pickle

with open(image_emb_path, 'rb') as handle:
    image_features_new = pickle.load(handle)

image_features_new

array([[-0.00551481, -0.0018802 , -0.00828313, ...,  0.00217237,
         0.0019792 , -0.00400357],
       [-0.01181199,  0.0030166 , -0.00503209, ..., -0.00669394,
        -0.00999732,  0.01249228],
       [-0.00420342, -0.0069913 , -0.01194882, ...,  0.00300192,
        -0.01094185, -0.0049613 ],
       ...,
       [-0.00384364, -0.0054126 , -0.02167754, ...,  0.00039435,
        -0.00500899,  0.00201166],
       [ 0.00860667, -0.02255196,  0.00203453, ...,  0.01478415,
        -0.01755573, -0.01129133],
       [ 0.01163081,  0.00379059, -0.02416092, ..., -0.00695446,
        -0.0144442 , -0.0283859 ]], dtype=float32)

In [51]:
text_features = precompute_text_features(text_loader)

text_features

  0%|          | 0/127 [00:00<?, ?it/s]

array([[ 6.02509733e-03,  3.18110525e-03, -9.91252344e-03, ...,
        -1.18462602e-03,  3.49959149e-03, -9.19437793e-04],
       [-2.65762582e-03, -4.22366150e-03, -7.05972314e-03, ...,
        -1.59219455e-03, -5.03420178e-03,  6.22146996e-03],
       [-1.56358190e-04, -2.93192978e-04, -7.91358575e-03, ...,
         2.84977723e-03,  6.26614690e-03, -9.17871576e-03],
       ...,
       [-8.85711779e-05,  3.90802743e-03, -1.25089735e-02, ...,
        -7.61612260e-04,  6.51917467e-03,  2.39236234e-03],
       [ 3.03399260e-03,  5.07207587e-03, -1.43755833e-02, ...,
        -5.07260393e-03, -7.91140739e-03, -5.83504280e-03],
       [ 9.75695997e-03,  5.50761260e-03, -1.95871405e-02, ...,
         1.18678226e-03, -2.10715894e-04, -8.62668827e-03]], dtype=float32)

In [52]:
import pickle


with open(text_emb_path, 'wb') as handle:
    pickle.dump(text_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [53]:

with open(text_emb_path, 'rb') as handle:
    text_features_new = pickle.load(handle)

text_features_new

array([[ 6.02509733e-03,  3.18110525e-03, -9.91252344e-03, ...,
        -1.18462602e-03,  3.49959149e-03, -9.19437793e-04],
       [-2.65762582e-03, -4.22366150e-03, -7.05972314e-03, ...,
        -1.59219455e-03, -5.03420178e-03,  6.22146996e-03],
       [-1.56358190e-04, -2.93192978e-04, -7.91358575e-03, ...,
         2.84977723e-03,  6.26614690e-03, -9.17871576e-03],
       ...,
       [-8.85711779e-05,  3.90802743e-03, -1.25089735e-02, ...,
        -7.61612260e-04,  6.51917467e-03,  2.39236234e-03],
       [ 3.03399260e-03,  5.07207587e-03, -1.43755833e-02, ...,
        -5.07260393e-03, -7.91140739e-03, -5.83504280e-03],
       [ 9.75695997e-03,  5.50761260e-03, -1.95871405e-02, ...,
         1.18678226e-03, -2.10715894e-04, -8.62668827e-03]], dtype=float32)

In [54]:
image_features_new

array([[-0.00551481, -0.0018802 , -0.00828313, ...,  0.00217237,
         0.0019792 , -0.00400357],
       [-0.01181199,  0.0030166 , -0.00503209, ..., -0.00669394,
        -0.00999732,  0.01249228],
       [-0.00420342, -0.0069913 , -0.01194882, ...,  0.00300192,
        -0.01094185, -0.0049613 ],
       ...,
       [-0.00384364, -0.0054126 , -0.02167754, ...,  0.00039435,
        -0.00500899,  0.00201166],
       [ 0.00860667, -0.02255196,  0.00203453, ...,  0.01478415,
        -0.01755573, -0.01129133],
       [ 0.01163081,  0.00379059, -0.02416092, ..., -0.00695446,
        -0.0144442 , -0.0283859 ]], dtype=float32)

In [55]:
def get_path_coco(image_id):
    # image_id = int(image_id)
    # print(type(image_id))

    im_path = id2path[image_id]
    
    return f"flicker_8k/photos/{im_path}" # f"photos/val2014/COCO_val2014_{image_id:012d}.jpg"

In [56]:
import numpy as np

In [58]:
def compare_embeddings(logit_scale, img_embs, txt_embs):
  # normalized features
  image_features = img_embs / img_embs.norm(dim=-1, keepdim=True)
  text_features = txt_embs / txt_embs.norm(dim=-1, keepdim=True)

  logits_per_text = logit_scale * text_features @ image_features.t()
  
  return logits_per_text

In [68]:
def compute_mrr(data, dataset, n):
    """Compute the MRR for the data based on n"""
    collect_rr = []
    pbar = tqdm(total=len(data), position=0, leave=True)
    found = np.matmul(text_features_new, image_features_new.T)

    for index, distances in enumerate(found): # It return the rows, one by one

        pbar.update(1)
        image_path = get_path_coco(data[index]["id"])

        collect_rr.append(new_rr(distances, image_path, dataset, n,index))


    pbar.close()
    print(100*"=")
    
    return np.average(collect_rr)


def new_rr(distances, target_image, dataset, n,index):
    """Calculate the RR for the given target image"""
    image_paths = []

    idxs = distances.argsort()[-n:][::-1] # Get the indcies for the images distances based on n

    for idx in idxs:
        image_paths.append('flicker_8k/photos/' + dataset.get_image_name(idx))

    if target_image in image_paths:

        return 1/(image_paths.index(target_image) + 1)
    else:
        return 0


def internal_hits(distances, target_image, dataset, n):
    """Calculate the hits of the target images based on the existance of it or not"""
    image_paths = []
    idxs = distances.argsort()[-n:][::-1]

    if target_image in idxs:
        return 1
    else:
        return 0

def compute_hits(data, dataset, n):

    collect_rr = []

    pbar = tqdm(total=len(data), position=0, leave=True)

    found = np.matmul(text_features_new, image_features_new.T)

    for index, distances in enumerate(found):
        pbar.update(1)
        # image_path = get_path_coco(data[index]["id"])
        image_path = index # get_path_coco(data[index]["id"])
        
        collect_rr.append(internal_hits(distances, image_path, dataset, n))
        # collect_rr_testing.append(internal_hits(distances, image_path, dataset, n))

    
    pbar.close()
    # print(len(collect_rr_testing))
    return np.average(collect_rr)

In [69]:
print('MRR@1:', compute_mrr(sorted_data, dataset, 1))

  0%|          | 0/8091 [00:00<?, ?it/s]

In [70]:
print('MRR@5:', compute_mrr(sorted_data, dataset, 5))

  0%|          | 0/8091 [00:00<?, ?it/s]

In [71]:
print('MRR@10:', compute_mrr(sorted_data, dataset,10))

  0%|          | 0/8091 [00:00<?, ?it/s]

## Evaluation based on Recall metric

In [72]:
image_features_new.shape

(8091, 640)

In [73]:
text_features_new.shape

(8091, 640)

In [74]:
image_features_new_pt = torch.from_numpy(image_features_new)

text_features_new_pt = torch.from_numpy(text_features_new)

text_to_image_map = torch.LongTensor(list(range(text_features_new.shape[0])))
print(text_to_image_map.shape) # .type(torch.int64)

print(text_to_image_map.unsqueeze(1).shape)

In [75]:
torch.set_printoptions(precision=8)

In [76]:
# https://github.com/openai/CLIP/issues/115

import torch
from torchvision.datasets import CocoCaptions
import torch.utils.data as dutils
from typing import List
import clip

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

k_vals= [1, 5, 10, 50]


def recall_at_k(k_vals, image_encodings,text_encodings,text_to_image_map):
    print("Encoding all data...")
 
    num_text = text_encodings.shape[0]
    
    # text-to-image recall
    print("Text-to-image recall...")


    dist_matrix = text_encodings @ image_encodings.T  # dist_matrix[i] gives logits for ith text

    inds = torch.argsort(dist_matrix, dim=1, descending=True)
    inds = inds.to(device)
    text_to_image_recall = []

    

    text_to_image_map = text_to_image_map.to(device)
    
    for k in k_vals:
        # Extract top k indices only
        topk = inds[:, :k]

        text_to_image_map_new = text_to_image_map.repeat(k, 1).t()

        correct = torch.eq(topk, text_to_image_map_new).any(dim=1)  #  value along dimension 1 (which typically corresponds to rows in a 2D tensor) ###### any(dim=1) >> check if True over the row 
        
        num_correct = correct.sum().item()

        text_to_image_recall.append(num_correct / num_text)

    print(text_to_image_recall)

    print("Done.")
    return text_to_image_recall

In [77]:
t2i= recall_at_k(k_vals=k_vals, image_encodings=image_features_new_pt,text_encodings=text_features_new_pt,text_to_image_map=text_to_image_map)

print("Text-to-image Recall@K")

print("Returned value: ", t2i)
for k, x in zip(k_vals, t2i):
    print(k, " ", (x/100) * 100)
    # print(f" R@{k}: {100*x:.2f}%")
