In [1]:
import os

from imagebind import data
import torch
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType
from imagebind.models.imagebind_model import ImageBindModel


# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Load the "nielsr/imagebind-huge" checkpoint and put the model in eval mode.
model = ImageBindModel.from_pretrained("nielsr/imagebind-huge")
model.eval()
# Last expression of the cell: moves the model to `device` and displays it.
model.to(device)

# print(
#     "Audio x Text: ",
#     torch.softmax(embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1),
# )
# print(
#     "Vision x Audio: ",
#     torch.softmax(embeddings[ModalityType.VISION] @ embeddings[ModalityType.AUDIO].T, dim=-1),
# )

# Expected output:
#
# Vision x Text:
# tensor([[9.9761e-01, 2.3694e-03, 1.8612e-05],
#         [3.3836e-05, 9.9994e-01, 2.4118e-05],
#         [4.7997e-05, 1.3496e-02, 9.8646e-01]])
#
# Audio x Text:
# tensor([[1., 0., 0.],
#         [0., 1., 0.],
#         [0., 0., 1.]])
#
# Vision x Audio:
# tensor([[0.8070, 0.1088, 0.0842],
#         [0.1036, 0.7884, 0.1079],
#         [0.0018, 0.0022, 0.9960]])

  from .autonotebook import tqdm as notebook_tqdm


ImageBindModel(
  (modality_preprocessors): ModuleDict(
    (vision): RGBDTPreprocessor(
      (cls_token): tensor((1, 1, 1280), requires_grad=True)
      
      (rgbt_stem): PatchEmbedGeneric(
        (proj): Sequential(
          (0): PadIm2Video()
          (1): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
        )
      )
      (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
        (pos_embed): tensor((1, 257, 1280), requires_grad=True)
        
      )
    )
    (text): TextPreprocessor(
      (pos_embed): tensor((1, 77, 1024), requires_grad=True)
      (mask): tensor((77, 77), requires_grad=False)
      
      (token_embedding): Embedding(49408, 1024)
    )
    (audio): AudioPreprocessor(
      (cls_token): tensor((1, 1, 768), requires_grad=True)
      
      (rgbt_stem): PatchEmbedGeneric(
        (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10), bias=False)
        (norm_layer): LayerNorm((768,), eps=1e-05, elementwise_affine=

### <em><strong>TEXT PREPARATION</strong></em>
#### <em>Text Embeddings</em>

In [None]:
import os
# Read one text prompt per line, skipping blank lines. The encoding is given
# explicitly so the result does not depend on the platform default; the
# labels are stored as UTF-8 strings later on.
with open("imagenet_no_other_sim.txt", "r", encoding="utf-8") as file:
    text_list = [line.strip() for line in file if line.strip()]

batch_size = 32
text_embeddings = []

for batch_idx, start in enumerate(range(0, len(text_list), batch_size)):
    text_batch = text_list[start:start + batch_size]
    text_inputs = {
        ModalityType.TEXT: data.load_and_transform_text(text_batch, device)
    }
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        embeddings = model(text_inputs)

    # Iterating the batch tensor yields one embedding per text. The rows are
    # moved to the CPU because they are only written to disk afterwards, and
    # NumPy/h5py cannot read tensors that live on a CUDA device.
    text_embeddings.extend(embeddings[ModalityType.TEXT].cpu())

    print(f"Batch {batch_idx} is completed")

#### <em>Process & Save </em>

In [None]:
import numpy as np
import h5py

# Merge the collected embeddings into one 2-D tensor with one row per text.
num_texts = len(text_list)
text_embeddings_tensor = torch.cat(text_embeddings).view(num_texts, -1)

# Variable-length UTF-8 string dtype so h5py can store the text labels.
utf8_dtype = h5py.string_dtype(encoding='utf-8')
text_np = np.array(text_list, dtype=utf8_dtype)

In [None]:
# h5py stores NumPy data, and a tensor on a CUDA device cannot be converted to
# NumPy implicitly. Convert explicitly, and do it before the file is opened in
# 'w' mode so a conversion failure does not leave a truncated file behind.
text_features = text_embeddings_tensor.detach().cpu().numpy()

with h5py.File('text_embeddings_imagebind.h5', 'w') as f:
    f.create_dataset('text_features', data=text_features)
    f.create_dataset('text_list', data=text_np,  dtype=h5py.string_dtype())

#### <em>Load & Check Embeddings</em>

In [6]:
# Read the saved embeddings and their labels back from disk.
with h5py.File('text_embeddings_imagebind.h5', 'r') as f:
    loaded_embeddings = torch.tensor(f['text_features'][:])
    # The stored strings are decoded as UTF-8 into Python str.
    loaded_text_list = [t.decode('utf-8') for t in f['text_list'][:]]

# One embedding row is expected per label.
assert loaded_embeddings.shape[0] == len(loaded_text_list), "embedding/label count mismatch"

# `loaded_text_list` is a plain list and has no `.shape`; report its length.
loaded_embeddings.shape, len(loaded_text_list), type(loaded_embeddings), type(loaded_text_list)

NameError: name 'h5py' is not defined

### <em><strong>IMAGE PREPARATION</strong></em>
#### <em>Image Embeddings</em>

In [15]:
video_names = ['video_11']
# The frames of a video live in a directory named after the video.
frame_dir = video_names[0]
# os.listdir returns entries in arbitrary order; sort so the frame order (and
# therefore the row order of the saved embeddings) is reproducible.
# NOTE(review): lexicographic order matches temporal order only if the frame
# file names are zero-padded -- confirm against the extraction script.
image_names = sorted(os.listdir(frame_dir))
image_paths = [os.path.join(frame_dir, image_name) for image_name in image_names]
len(image_paths)

314

In [4]:
# NOTE(review): `text_embeddings` is a misnomer in this cell -- it collects the
# VISION embeddings of the frames and replaces any earlier list of that name.
# The name is kept because the cell building `text_embeddings_list` reads it.
batch_size = 32
text_embeddings = []

for batch_idx, start in enumerate(range(0, len(image_paths), batch_size)):
    frame_batch = image_paths[start:start + batch_size]
    vision_inputs = {
        ModalityType.VISION: data.load_and_transform_vision_data(frame_batch, device)
    }
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        embeddings = model(vision_inputs)

    # Iterating the batch tensor appends one embedding per frame.
    text_embeddings.extend(embeddings[ModalityType.VISION])

    print(f"Batch {batch_idx} is completed")

Batch 0 is completed
Batch 1 is completed
Batch 2 is completed
Batch 3 is completed
Batch 4 is completed
Batch 5 is completed
Batch 6 is completed
Batch 7 is completed
Batch 8 is completed
Batch 9 is completed


In [13]:
import h5py
# Merge the collected frame embeddings into one 2-D array (one row per image
# name) on the CPU. It is wrapped in a list so it can be zipped with
# `video_names`, one array per video.
frame_matrix = torch.cat(text_embeddings).view(len(image_names), -1)
text_embeddings_list = [frame_matrix.to('cpu').numpy()]

In [16]:
# Write one top-level dataset per video, named after the video.
with h5py.File('video_embeddings.h5', 'w') as h5_out:
    for name, frame_features in zip(video_names, text_embeddings_list):
        h5_out.create_dataset(name, data=frame_features)

In [18]:
# Reopen the file read-only and show the dataset summary stored for video_11.
with h5py.File('video_embeddings.h5', 'r') as h5_in:
    stored = h5_in['video_11']
    print(stored)

<HDF5 dataset "video_11": shape (314, 1024), type "<f4">


In [None]:
# Quick sanity check of what was loaded: container types and sizes.
(
    type(loaded_text_list),
    type(loaded_embeddings),
    len(loaded_text_list),
    loaded_embeddings.shape,
)

In [None]:
# concepts_text = torch.add(concepts_text , torch.multiply( torch.min(concepts_text) , -1 )) / torch.max(concepts_text, dim=1, keepdim=True).values
# concepts_audio = torch.add(concepts_audio , torch.multiply( torch.min(concepts_audio) , -1 )) / torch.max(concepts_audio, dim=1, keepdim=True).values


# ----------------

In [3]:
import h5py
import numpy as np
# HDF5 file that the helper cells below read and modify.
file_name = "eccv16_dataset_tvsum_google_pool5_concepts.h5"

In [9]:

def get_video_names(h5py_path):
    """Return the names of the top-level entries of an HDF5 file.

    Args:
        h5py_path: Path of the HDF5 file to open read-only.

    Returns:
        A list with the file's top-level keys.
    """
    # The `with` block closes the file on exit, so no explicit close() is needed.
    with h5py.File(h5py_path, 'r') as f:
        return list(f.keys())

In [10]:
def add_or_overwrite_dataset_in_h5py_group(h5py_path, group_name, dataset_name, dataset):
    """Store `dataset` under `group_name/dataset_name`, replacing any existing one.

    The file is opened in append mode. If the group does not exist, nothing is
    written and a message saying so is printed instead of the success message.

    Args:
        h5py_path: Path of the HDF5 file to modify.
        group_name: Name of an existing group in the file.
        dataset_name: Name of the dataset to create inside that group.
        dataset: Data handed to `create_dataset`.
    """
    with h5py.File(h5py_path, 'a') as f:
        if group_name not in f:
            # Previously this case fell through and still reported success.
            print(f"Group {group_name} not found in {h5py_path}; {dataset_name} was not written.")
            return

        video_group = f[group_name]
        # Delete the old dataset first so it can be re-created under the same name.
        if dataset_name in video_group:
            del video_group[dataset_name]

        video_group.create_dataset(dataset_name, data=dataset)

    print(f"{dataset_name} field added to video group {group_name}.")

In [11]:
def show_h5py_file(h5py_path):
    """Print the name of every group and dataset in an HDF5 file.

    Args:
        h5py_path: Path of the HDF5 file to open read-only.
    """
    # The `with` block closes the file on exit, so no explicit close() is needed.
    with h5py.File(h5py_path, 'r') as f:
        f.visit(print)  # Prints all groups and datasets

In [12]:
def clear_file(h5py_path, list_of_desired_concepts):
    """Delete every entry of each top-level group that is not in the keep-list.

    Args:
        h5py_path: Path of the HDF5 file, opened in append mode.
        list_of_desired_concepts: Entry names to keep inside every group.
    """
    with h5py.File(h5py_path, 'a') as h5_file:
        for video_name in h5_file.keys():
            group = h5_file[video_name]

            # Snapshot the names to remove first so the group is not modified
            # while its keys are being iterated.
            stale = []
            for key in group.keys():
                if key not in list_of_desired_concepts:
                    stale.append(key)

            for key in stale:
                del group[key]
                print(f"Deleted '{key}' from {video_name}")
        
# Entries to keep in every video group; anything else is deleted.
desired_concepts = [
    'change_points',
    "features",
    "gtscore",
    "gtsummary",
    "n_frame_per_seg",
    "n_frames",
    "n_steps",
    "picks",
    "user_summary",
]
clear_file(file_name, desired_concepts)


Deleted 'concept' from video_1
Deleted 'concept' from video_10
Deleted 'concept' from video_11
Deleted 'concept' from video_12
Deleted 'concept' from video_13
Deleted 'concept' from video_14
Deleted 'concept' from video_15
Deleted 'concept' from video_16
Deleted 'concept' from video_17
Deleted 'concept' from video_18
Deleted 'concept' from video_19
Deleted 'concept' from video_2
Deleted 'concept' from video_20
Deleted 'concept' from video_21
Deleted 'concept' from video_22
Deleted 'concept' from video_23
Deleted 'concept' from video_24
Deleted 'concept' from video_25
Deleted 'concept' from video_26
Deleted 'concept' from video_27
Deleted 'concept' from video_28
Deleted 'concept' from video_29
Deleted 'concept' from video_3
Deleted 'concept' from video_30
Deleted 'concept' from video_31
Deleted 'concept' from video_32
Deleted 'concept' from video_33
Deleted 'concept' from video_34
Deleted 'concept' from video_35
Deleted 'concept' from video_36
Deleted 'concept' from video_37
Deleted 'co

In [13]:
def get_length_of_frames(h5py_path, video_name):
    """Return the length of the "picks" dataset of one video group.

    Args:
        h5py_path: Path of the HDF5 file to open read-only.
        video_name: Name of the top-level group to inspect.

    Returns:
        `len()` of the group's "picks" entry.
    """
    with h5py.File(h5py_path, 'r') as h5_file:
        picks = h5_file[video_name]["picks"]
        return len(picks)
    

In [14]:
# Fill every video group with PLACEHOLDER "concepts" features: uniform random
# values in [0, 1), one row per entry of the video's "picks" dataset.
# A seeded generator makes the placeholder data reproducible across re-runs.
CONCEPT_DIM = 600  # number of columns of each "concepts" dataset
rng = np.random.default_rng(0)

for video_name in get_video_names(file_name):
    n = get_length_of_frames(file_name, video_name)
    embeddings = rng.random((n, CONCEPT_DIM))
    add_or_overwrite_dataset_in_h5py_group(file_name, video_name, "concepts", embeddings)


concepts field added to video group video_1.
concepts field added to video group video_10.
concepts field added to video group video_11.
concepts field added to video group video_12.
concepts field added to video group video_13.
concepts field added to video group video_14.
concepts field added to video group video_15.
concepts field added to video group video_16.
concepts field added to video group video_17.
concepts field added to video group video_18.
concepts field added to video group video_19.
concepts field added to video group video_2.
concepts field added to video group video_20.
concepts field added to video group video_21.
concepts field added to video group video_22.
concepts field added to video group video_23.
concepts field added to video group video_24.
concepts field added to video group video_25.
concepts field added to video group video_26.
concepts field added to video group video_27.
concepts field added to video group video_28.
concepts field added to video group 

In [15]:
# Print every group and dataset path currently stored in the file.
show_h5py_file(file_name)

video_1
video_1/change_points
video_1/concepts
video_1/features
video_1/gtscore
video_1/gtsummary
video_1/n_frame_per_seg
video_1/n_frames
video_1/n_steps
video_1/picks
video_1/user_summary
video_10
video_10/change_points
video_10/concepts
video_10/features
video_10/gtscore
video_10/gtsummary
video_10/n_frame_per_seg
video_10/n_frames
video_10/n_steps
video_10/picks
video_10/user_summary
video_11
video_11/change_points
video_11/concepts
video_11/features
video_11/gtscore
video_11/gtsummary
video_11/n_frame_per_seg
video_11/n_frames
video_11/n_steps
video_11/picks
video_11/user_summary
video_12
video_12/change_points
video_12/concepts
video_12/features
video_12/gtscore
video_12/gtsummary
video_12/n_frame_per_seg
video_12/n_frames
video_12/n_steps
video_12/picks
video_12/user_summary
video_13
video_13/change_points
video_13/concepts
video_13/features
video_13/gtscore
video_13/gtsummary
video_13/n_frame_per_seg
video_13/n_frames
video_13/n_steps
video_13/picks
video_13/user_summary
video_

In [5]:
# Open the file only for this lookup so the handle is released afterwards.
# The handle is deliberately not bound to `data`: that name is the
# `imagebind.data` module imported at the top, which the embedding cells call.
with h5py.File(file_name, 'r') as h5_file:
    print(h5_file["video_1"]['user_summary'])

<HDF5 dataset "user_summary": shape (20, 10597), type "<f4">