In [1]:
import os
import numpy as np
import pandas as pd
import pprint

import pickle # Load refs and annotations
from typing import Any, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, TensorDataset

import torchvision
import torchmetrics


import pytorch_lightning as pl
from pytorch_lightning.utilities.types import STEP_OUTPUT

from transformers import T5Tokenizer, T5ForConditionalGeneration 

from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

from ultralytics import YOLO

import clip
from PIL import Image, ImageDraw
import cv2
import numpy as np


import matplotlib.pyplot as plt
import torch
import numpy as np
import os
import json

from torch.utils.data import DataLoader
from torchvision import transforms

import sklearn.metrics


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from transformers import BertTokenizer, BertModel

# Example sentences as a list of strings
sentences = [
    "This is the first sentence.",
    "And this is the second one.",
    "Finally, here's the third sentence."
]

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize and encode sentences using BERT
encoded_inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**encoded_inputs)



# Get the tensor containing the sentence representations (pooled outputs)
sentence_tensor = outputs.pooler_output

print("Encoded Sentence Tensor:")
print(sentence_tensor)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Encoded Sentence Tensor:
tensor([[-0.9051, -0.4523, -0.6808,  ..., -0.5678, -0.7033,  0.9266],
        [-0.8462, -0.2268, -0.4501,  ..., -0.0893, -0.6163,  0.8899],
        [-0.8705, -0.4604, -0.8941,  ..., -0.8175, -0.6987,  0.8999]])


In [3]:
from torch.utils.data import Dataset

import json

    
def getcaption(elem):
    li = []
    for e in elem["sentences"]:
        li.append(e['raw'])
    return li

class RefCOCOg(Dataset):
    """
    Args:
        dataset: a list of dictionaries containing:
        {
            'file_name': # path of the image, images will be loaded on the fly
            'caption': # referring caption
            'ann_id': # annotation ID (one per caption), taken from 'file_name'
            'bbox': # coordinates (xmin, ymin, xmax, ymax) of the bounding box
        }
    """
    def __init__(self, refs, annotations, split="train"):

        self.dataset = [{"file_name": os.path.join("../refcocog/images/", f'{"_".join(elem["file_name"].split("_")[:3])}.jpg'),
                            "caption": elem["sentences"][0]["raw"],
                            "captions": getcaption(elem),
                            "ann_id": int(elem["file_name"].split("_")[3][:-4]),
                            "bbox": annotations[int(elem["file_name"].split("_")[3][:-4])]}
                        for elem in [d for d in refs if d["split"]==split]]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]

    def __call__(self, idx):
        print(json.dumps(self.dataset[idx], indent=4))

In [4]:
# Load refs and annotations
import pickle

with open("../refcocog/annotations/refs(umd).p", "rb") as fp:
  refs = pickle.load(fp)

# 'annotations' will be a dict object mapping the 'annotation_id' to the 'bbox' to make search faster
with open("../refcocog/annotations/instances.json", "rb") as fp:
  data = json.load(fp)
  annotations = dict(sorted({ann["id"]: ann["bbox"] for ann in data["annotations"]}.items()))

In [5]:
import matplotlib.pyplot as plt
import torch
import numpy as np
import os

from torch.utils.data import DataLoader
from torchvision import transforms
from PIL import Image, ImageDraw

def pad_image(image):
    """
    Performs bottom-right padding of the original image to 640x640 (max size of images in the dataset).
    Bottom-right padding prevents corruption of bounding boxes.

    ### Arguments
    image: a PIL.Image to transform
    """
    original_width, original_height = image.size
    padded_width, padded_height = 640, 640

    pad_width = padded_width - original_width
    pad_height = padded_height - original_height

    padded_image = Image.new(image.mode, (padded_width, padded_height), (0, 0, 0))
    padded_image.paste(image, (0, 0))

    return padded_image

def collate_fn(batch):
    images = []
    for sample in batch:
        image = Image.open(sample["file_name"]).convert("RGB")
        image = pad_image(image=image)
        images.append(transform(image))
    images = torch.stack(images, dim=0)

    data = {}
    for key in batch[0].keys():
        if key != "file_name":
            data[key] = [sample[key] for sample in batch]
    return images, data

transform = transforms.Compose([
    transforms.ToTensor(),
])

# create dataset and dataloader
dataset = RefCOCOg(refs, annotations, split="test")
print(dataset[0])
print("---------------------------------------------------")
#plt.imshow(Image.open(dataset[2]["file_name"]))
dataloader = DataLoader(dataset, batch_size=1, collate_fn=collate_fn)

{'file_name': '../refcocog/images/COCO_train2014_000000380440.jpg', 'caption': 'the man in yellow coat', 'captions': ['the man in yellow coat', 'Skiier in red pants.'], 'ann_id': 491042, 'bbox': [374.31, 65.06, 136.04, 201.94]}
---------------------------------------------------


In [6]:
print(dataset[0]['captions'])

['the man in yellow coat', 'Skiier in red pants.']


In [7]:
encoded_inputs = tokenizer(dataset[0]['captions'], padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**encoded_inputs)

# Get the tensor containing the sentence representations (pooled outputs)
sentence_tensor = outputs.pooler_output

print("Encoded Sentence Tensor:")
print(sentence_tensor)

Encoded Sentence Tensor:
tensor([[-0.8468, -0.2413, -0.3398,  ...,  0.1472, -0.6613,  0.9029],
        [-0.7858, -0.3866, -0.8730,  ..., -0.8093, -0.6661,  0.9216]])


In [8]:
sentence_tensor = outputs.pooler_output

# Flatten the tensor to get a single tensor with dim 1
flattened_tensor = sentence_tensor.view(-1)
print("Flattened Tensor:")
print(flattened_tensor)

Flattened Tensor:
tensor([-0.8468, -0.2413, -0.3398,  ..., -0.8093, -0.6661,  0.9216])


In [9]:
device =  "cpu"
model, preprocess = clip.load("RN50", device=device)
#text = clip.tokenize(q).to(device)

In [10]:
# Models
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

# Images
img = dataset[0]['file_name']
# Query
q = dataset[0]['caption']

# Inference
results = yolo_model(img)

# Results
print("\n CAPTION: ", q)
print("RESULTS: ", results.xyxy[0].cpu().numpy())

Using cache found in /home/matea/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2023-8-28 Python-3.11.3 torch-2.0.1+cu117 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 5938MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 



 CAPTION:  the man in yellow coat
RESULTS:  [[     376.61      67.176      511.16      262.77     0.93054           0]
 [     230.67       43.14       371.3      307.89     0.93043           0]
 [     244.57      257.03       377.8      343.93     0.70337          30]
 [     346.77      212.96      518.62      268.03     0.44683          30]]


In [18]:
bert_features_transposed = sentence_tensor.T

In [20]:

from sklearn.metrics.pairwise import cosine_similarity

In [25]:
best_bbox = []
max_prob = 0
for bbox in results.xyxy[0].cpu().numpy():
    print(bbox)
    temp = cv2.imread(dataset[0]['file_name'])
    #print(temp)
    image = np.zeros((temp.shape[0], temp.shape[1], temp.shape[2]), dtype=np.uint8)
    image[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] = temp[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])]
    image = preprocess(Image.fromarray(image)).unsqueeze(0).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image)
        #text_features = model.encode_text(flattened_tensor)
        
        print(image_features.shape[1])
        print(sentence_tensor.shape[0])

        num_rows_image = image_features.shape[0]
        bert_features_expanded = bert_features_transposed.expand(num_rows_image, -1, -1)
        image_features_expanded = image_features.expand(sentence_tensor.shape[0], -1) 

        #logits_per_image, logits_per_text = model(image, flattened_tensor)
        similarity_score = (100.0 * image_features_expanded @ bert_features_expanded.T).softmax(dim=1)
        #similarity_scores = cosine_similarity(bert_features_expanded.T, image_features)

        prob = similarity_score.cpu().numpy()[0][0]
        if prob > max_prob:
            #print("ciao")
            max_prob = prob
            best_bbox = bbox


[     376.61      67.176      511.16      262.77     0.93054           0]
1024
2


RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [2, 1024] but got: [2, 768].