## Before you start

Let's make sure that we have access to GPU. We can use `nvidia-smi` command to do that. In case of any problems navigate to `Edit` -> `Notebook settings` -> `Hardware accelerator`, set it to `GPU`, and then click `Save`.

In [1]:
!nvidia-smi

Sat Apr 22 10:24:16 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:65:00.0 Off |                  Off |
|  0%   42C    P8    14W / 450W |     19MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [2]:
import os
HOME = os.getcwd()
print(HOME)

/src/notebooks/GDINO


## Install Grounding DINO 🦕 

In [3]:
import os

CONFIG_PATH = os.path.join(HOME, "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py")
print(CONFIG_PATH, "; exist:", os.path.isfile(CONFIG_PATH))

/src/notebooks/GDINO/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py ; exist: True


In [4]:
import os

WEIGHTS_NAME = "groundingdino_swint_ogc.pth"
WEIGHTS_PATH = os.path.join(HOME, "weights", WEIGHTS_NAME)
print(WEIGHTS_PATH, "; exist:", os.path.isfile(WEIGHTS_PATH))

/src/notebooks/GDINO/weights/groundingdino_swint_ogc.pth ; exist: True


## Load Grounding DINO Model

In [5]:
%cd {HOME}/GroundingDINO

from groundingdino.util.inference import load_model, load_image, predict, annotate

model = load_model(CONFIG_PATH, WEIGHTS_PATH)

/src/notebooks/GDINO/GroundingDINO


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


final text_encoder_type: bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Grounding DINO Demo

In [6]:
import numpy as np
from typing import Tuple, List
import groundingdino.datasets.transforms as T
from PIL import Image
import cv2
import supervision as sv
def create_frame_list(filepath):
    """
    This function takes in a video filepath.
    Reads in the video frame by frame using cv2.
    Converts the frame into a GDINO suitable format.
    Returns a list of frames
    """
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    vidcap = cv2.VideoCapture(filepath)
    success,raw_image = vidcap.read()
    source_frames = []
    frames = []
    while success:
        frame = Image.fromarray(raw_image)
        frame_transformed, _ = transform(frame, None)
        frames.append(frame_transformed)
        source_frames.append(raw_image)
        success,raw_image = vidcap.read()
    return source_frames, frames

In [27]:
def generate_bounding_boxes(frames, captions, real_caption, BOX_TRESHOLD = 0.5, TEXT_TRESHOLD = 0.3):
    """
    This function takes in a list of frames in a BLIP suitable format.
    Generates a caption for each frame.
    Returns a dictionary with the key being the frame id and the value the caption for that frame.
    """
    DINO_outputs = {}
    for i, frame in enumerate(frames):
        DINO_outputs[i] = {}
        boxes, logits, phrases = predict(
        model=model, 
        image=frame, 
        caption=captions[str(i)][0], 
        box_threshold=BOX_TRESHOLD, 
        text_threshold=TEXT_TRESHOLD
    )

        DINO_outputs[i]["BLIP_caption"] = {"boxes":boxes.tolist(), 
                                           "logits":logits.tolist(), 
                                           "phrases":phrases}
        
        boxes, logits, phrases = predict(
        model=model, 
        image=frame, 
        caption=real_caption, 
        box_threshold=BOX_TRESHOLD, 
        text_threshold=TEXT_TRESHOLD
    )
        DINO_outputs[i]["real_caption"] = {"boxes":boxes.tolist(), 
                                           "logits":logits.tolist(), 
                                           "phrases":phrases}
        
        find_AP(DINO_outputs[i]["real_caption"], DINO_outputs[i]["BLIP_caption"])
        
    return DINO_outputs

In [8]:
import json 
def read_BLIP_captions(caption_path = "../../video_caption_generator/video_captions.json"):
    # Opening JSON file
    f = open(caption_path)

    # returns JSON object as 
    # a dictionary
    captions = json.load(f)
    return captions

In [9]:
def run_DINO_model(filepath, captions, real_caption):
    _, frames = create_frame_list(filepath)
    return generate_bounding_boxes(frames, captions, real_caption, BOX_TRESHOLD = 0.5, TEXT_TRESHOLD = 0.3)

In [10]:
def is_video(filename, filetypes=[".mp4", ".gif"]):
    """
    Checks if a file is a video based on the file extension.
    Returns True if the file extension matches the selected types else False.
    """
    for filetype in filetypes:
        if filename.endswith(filetype):
            return True, filename.split(filetype)[0]
    return False, None

In [41]:
"ipynb_checkpoints".find(".")

-1

In [34]:
complete_DINO_outputs = {}
video_directory = "../../generated_videos"
captions = read_BLIP_captions()
isvideo, real_caption = is_video(video_directory)
if isvideo:
    complete_DINO_outputs[video_directory] = run_DINO_model(video_directory)
else:
    for model_type in os.listdir(video_directory):
        filepath = f"{video_directory}/{model_type}/"
        if model_type.find(".")==-1:
            complete_DINO_outputs[model_type] = {}
            print(model_type)
            for video_name in os.listdir(filepath):
                isvideo, real_caption = is_video(video_name)
                if isvideo and model_type in captions:
                    text_prompt = real_caption.replace("_", " ")
                    if text_prompt in captions[model_type]:
                        complete_DINO_outputs[model_type][text_prompt] = run_DINO_model(
                            filepath+video_name, 
                            captions[model_type][text_prompt], 
                            text_prompt)
output_path = "test.json"
output_file = open(file=output_path, mode="w", encoding="utf-8") 
json.dump(complete_DINO_outputs, output_file, indent=4)
output_file.close()

ModelScopeText2VideoSynthesis
clip_vqgan
Aphantasia
VideoCrafter
VideoFusion
CogVideo
Tune-a-video
Original_Videos_Meta_Google
.ipynb_checkpoints


In [62]:
import numpy
import sklearn.metrics

def precision_recall_curve(y_true, pred_scores, thresholds):
    precisions = []
    recalls = []
    
    for threshold in thresholds:
        y_pred = ["positive" if score >= threshold else "negative" for score in pred_scores]

        precision = sklearn.metrics.precision_score(y_true=y_true, y_pred=y_pred, pos_label="positive")
        recall = sklearn.metrics.recall_score(y_true=y_true, y_pred=y_pred, pos_label="positive")
        
        precisions.append(precision)
        recalls.append(recall)

    return precisions, recalls

def average_precision(precisions, recalls):
    precisions.append(1)
    recalls.append(0)

    precisions = numpy.array(precisions)
    recalls = numpy.array(recalls)

    ap = numpy.sum((recalls[:-1] - recalls[1:]) * precisions[:-1])
    
    return ap

def IoU(boxA, boxB):
    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])
    # compute the area of intersection rectangle
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
    # compute the area of both the prediction and ground-truth
    # rectangles
    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the interesection area
    iou = interArea / float(boxAArea + boxBArea - interArea)
    # return the intersection over union value
    return iou


def find_AP(real_caption, BLIP_caption):
    y_true = []
    pred_scores = []
    thresholds=numpy.arange(start=0.2, stop=0.7, step=0.05)
    for real_boxes, real_phrases in zip(real_caption["boxes"], real_caption["phrases"]):
        for BLIP_boxes, BLIP_phrases in zip(BLIP_caption["boxes"], BLIP_caption["phrases"]):
            iou_value = IoU(real_boxes, BLIP_boxes)
            pred_scores.append(iou_value)
            sim_score = compute_bert_similarity(real_phrases, BLIP_phrases)
            if sim_score > 0.5:
                y_true.append("positive")
            else:
                y_true.append("negative")
    
    precisions, recalls = precision_recall_curve(y_true, pred_scores, thresholds)
    ap =  average_precision(precisions, recalls)
    
    return ap

In [None]:
def compute_bert_similarity(sentence_1, sentence_2):
    # Preprocess texts
    sentence_1 = preprocess_text(sentence_1)
    sentence_2 = preprocess_text(sentence_2)
    
    # Tokenize input sentences
    encoded_sentence_1 = tokenizer(sentence_1, padding=True, truncation=True, return_tensors='pt')
    encoded_sentence_2 = tokenizer(sentence_2, padding=True, truncation=True, return_tensors='pt')
    
    # Generate sentence embeddings
    with torch.no_grad():
        outputs = model(**encoded_sentence_1)
        embeddings_1 = outputs.last_hidden_state[:, 0, :]
        
        outputs = model(**encoded_sentence_2)
        embeddings_2 = outputs.last_hidden_state[:, 0, :]
    
    # Compute similarity score
    similarity_score = torch.nn.functional.cosine_similarity(embeddings_1, embeddings_2)
    
    return similarity_score.item()

In [64]:
real_caption, BLIP_caption

({'boxes': [[0.5000684261322021,
    0.46889203786849976,
    0.9999943971633911,
    0.5703166723251343]],
  'logits': [0.8146625757217407],
  'phrases': ['a large shark swimming']},
 {'boxes': [[0.5000467300415039,
    0.4707260727882385,
    0.9999933242797852,
    0.5702130794525146]],
  'logits': [0.6924726366996765],
  'phrases': ['a shark swimming']})

In [68]:
for i in range(len(complete_DINO_outputs["ModelScopeText2VideoSynthesis"]['A shark swimming in clear Caribbean ocean'])):
    real_caption, BLIP_caption = complete_DINO_outputs["ModelScopeText2VideoSynthesis"
                                                      ]['A shark swimming in clear Caribbean ocean'][i].values()
    print(find_AP(real_caption, BLIP_caption))

0.0
1.0
1.0
1.0
1.0
0.0
0.0
1.0
1.0
0.0
1.0
1.0
1.0
0.0
0.0
1.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [36]:
for key in complete_DINO_outputs:
    print(len(complete_DINO_outputs[key]))

35
0
0
35
35
0
35
0
0


In [16]:
output_path = "test.json"
output_file = open(file=output_path, mode="w", encoding="utf-8") 
json.dump(complete_DINO_outputs, output_file, indent=4)
output_file.close()

TypeError: Object of type Tensor is not JSON serializable

In [26]:
complete_DINO_outputs["ModelScopeText2VideoSynthesis"]["A shark swimming in clear Caribbean ocean"][13]["BLIP_caption"]["boxes"].tolist()

[]

In [None]:
def run_DINO_model(filepath, captions, real_caption):
    _, frames = create_frame_list(filepath)
    return generate_bounding_boxes(frames, captions, real_caption, BOX_TRESHOLD = 0.5, TEXT_TRESHOLD = 0.3)

In [None]:
import json 
def read_BLIP_captions(caption_path = "../../video_caption_generator/video_captions.json"):
    # Opening JSON file
    f = open(caption_path)

    # returns JSON object as 
    # a dictionary
    captions = json.load(f)
    return captions

In [None]:
path = "../../generated_videos/clip_vqgan/A dog wearing a Superhero outfit with red cape flying through the sky.mp4"
source_frames, frame = create_frame_list(path)

In [None]:
import os
import supervision as sv

IMAGE_NAME = "dog-3.jpeg"
IMAGE_PATH = os.path.join(HOME, "data", IMAGE_NAME)

In [None]:
captions.keys()

In [None]:
print(captions["VideoCrafter"]["A_dog_wearing_a_Superhero_outfit_with_red_cape_flying_through_the_sky".replace("_", " ")]["0"][0])
print("A_dog_wearing_a_Superhero_outfit_with_red_cape_flying_through_the_sky".replace("_", " "))

In [None]:
TEXT_PROMPT = captions["VideoCrafter"]["A_dog_wearing_a_Superhero_outfit_with_red_cape_flying_through_the_sky".replace("_", " ")]["0"][0]
BOX_TRESHOLD = 0.5
TEXT_TRESHOLD = 0.3

path = "../../generated_videos/VideoCrafter/A_dog_wearing_a_Superhero_outfit_with_red_cape_flying_through_the_sky.mp4"
source_frames, frame = create_frame_list(path)

image_source, image = source_frames[0], frame[0]

boxes, logits, phrases = predict(
    model=model, 
    image=image, 
    caption=TEXT_PROMPT, 
    box_threshold=BOX_TRESHOLD, 
    text_threshold=TEXT_TRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

%matplotlib inline  
sv.plot_image(annotated_frame, (16, 16))
print(boxes, logits, phrases)

In [None]:
TEXT_PROMPT = "A_small_domesticated_carnivorous_mammal_with_soft_fur,_a_short_snout,_and_retractable_claws".replace("_", " ")
BOX_TRESHOLD = 0.6
TEXT_TRESHOLD = 0.3

path = "../../generated_videos/Tune-a-video/A_small_domesticated_carnivorous_mammal_with_soft_fur,_a_short_snout,_and_retractable_claws.gif"
source_frames, frame = create_frame_list(path)

image_source, image = source_frames[5], frame[5]

boxes, logits, phrases = predict(
    model=model, 
    image=image, 
    caption=TEXT_PROMPT, 
    box_threshold=BOX_TRESHOLD, 
    text_threshold=TEXT_TRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

%matplotlib inline  
sv.plot_image(annotated_frame, (16, 16))
print(boxes, logits, phrases)

In [None]:
boxes, logits, phrases

In [None]:
TEXT_PROMPT = "dog . superhero . sky . cape"
BOX_TRESHOLD = 0.6
TEXT_TRESHOLD = 0.3

path = "../../generated_videos/clip_vqgan/A dog wearing a Superhero outfit with red cape flying through the sky.mp4"
source_frames, frame = create_frame_list(path)

image_source, image = source_frames[2], frame[2]

boxes, logits, phrases = predict(
    model=model, 
    image=image, 
    caption=TEXT_PROMPT, 
    box_threshold=BOX_TRESHOLD, 
    text_threshold=TEXT_TRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

%matplotlib inline  
sv.plot_image(annotated_frame, (16, 16))
print(boxes, logits, phrases)

In [None]:
import os
import supervision as sv

IMAGE_NAME = "dog-3.jpeg"
IMAGE_PATH = os.path.join(HOME, "data", IMAGE_NAME)

TEXT_PROMPT = "chair with man sitting on it"
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

image_source, image = load_image(IMAGE_PATH)

boxes, logits, phrases = predict(
    model=model, 
    image=image, 
    caption=TEXT_PROMPT, 
    box_threshold=BOX_TRESHOLD, 
    text_threshold=TEXT_TRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

%matplotlib inline  
sv.plot_image(annotated_frame, (16, 16))

In [None]:
import os
import supervision as sv

IMAGE_NAME = "dog-3.jpeg"
IMAGE_PATH = os.path.join(HOME, "data", IMAGE_NAME)

TEXT_PROMPT = "chair, dog, table, shoe, light bulb, coffee, hat, glasses, car, tail, umbrella"
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

image_source, image = load_image(IMAGE_PATH)

boxes, logits, phrases = predict(
    model=model, 
    image=image, 
    caption=TEXT_PROMPT, 
    box_threshold=BOX_TRESHOLD, 
    text_threshold=TEXT_TRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

%matplotlib inline  
sv.plot_image(annotated_frame, (16, 16))

In [None]:
import os
import supervision as sv

IMAGE_NAME = "dog-2.jpeg"
IMAGE_PATH = os.path.join(HOME, "data", IMAGE_NAME)

TEXT_PROMPT = "glass"
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

image_source, image = load_image(IMAGE_PATH)

boxes, logits, phrases = predict(
    model=model, 
    image=image, 
    caption=TEXT_PROMPT, 
    box_threshold=BOX_TRESHOLD, 
    text_threshold=TEXT_TRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

%matplotlib inline  
sv.plot_image(annotated_frame, (16, 16))

In [None]:
import os
import supervision as sv

IMAGE_NAME = "dog-2.jpeg"
IMAGE_PATH = os.path.join(HOME, "data", IMAGE_NAME)

TEXT_PROMPT = "glass most to the right"
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

image_source, image = load_image(IMAGE_PATH)

boxes, logits, phrases = predict(
    model=model, 
    image=image, 
    caption=TEXT_PROMPT, 
    box_threshold=BOX_TRESHOLD, 
    text_threshold=TEXT_TRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

%matplotlib inline  
sv.plot_image(annotated_frame, (16, 16))

In [None]:
import os
import supervision as sv

IMAGE_NAME = "dog-2.jpeg"
IMAGE_PATH = os.path.join(HOME, "data", IMAGE_NAME)

TEXT_PROMPT = "straw"
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

image_source, image = load_image(IMAGE_PATH)

boxes, logits, phrases = predict(
    model=model, 
    image=image, 
    caption=TEXT_PROMPT, 
    box_threshold=BOX_TRESHOLD, 
    text_threshold=TEXT_TRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

%matplotlib inline  
sv.plot_image(annotated_frame, (16, 16))

In [None]:
import os
import supervision as sv

IMAGE_NAME = "dog-4.jpeg"
IMAGE_PATH = os.path.join(HOME, "data", IMAGE_NAME)

TEXT_PROMPT = "mens shadow"
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

image_source, image = load_image(IMAGE_PATH)

boxes, logits, phrases = predict(
    model=model, 
    image=image, 
    caption=TEXT_PROMPT, 
    box_threshold=BOX_TRESHOLD, 
    text_threshold=TEXT_TRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

%matplotlib inline  
sv.plot_image(annotated_frame, (16, 16))