#### **Import Libraries**

In [1]:
import os
import faiss
import numpy as np
import json
from tqdm import tqdm

#### **Parsing Data Path**

In [2]:
def parse_data_path(feature_dir):
    """Index the files stored under ``feature_dir``.

    The expected layout is ``<feature_dir>/<part>/<feature_id>.<ext>``.

    Args:
        feature_dir: Root directory whose sub-directories each hold the
            files of one part.

    Returns:
        dict: ``{part_name: {feature_id: file_path}}``. ``feature_id`` is the
        file name up to its first ``'.'`` and ``file_path`` is
        ``f'{feature_dir}/{part_name}/{file_name}'``. Parts and files are
        inserted in sorted order.

    Raises:
        OSError: If ``feature_dir`` (or one of its part folders) cannot be
            listed.
    """
    all_feature_paths = dict()
    for feature_part in sorted(os.listdir(feature_dir)):
        feature_part_path = f'{feature_dir}/{feature_part}'
        # A stray file next to the part folders is not a part; listing it
        # below would raise NotADirectoryError.
        if not os.path.isdir(feature_part_path):
            continue
        part_paths = dict()
        for feature_path in sorted(os.listdir(feature_part_path)):
            feature_path_full = f'{feature_part_path}/{feature_path}'
            # Nested folders inside a part are not feature files.
            if not os.path.isfile(feature_path_full):
                continue
            feature_id = feature_path.split('.')[0]
            part_paths[feature_id] = feature_path_full
        all_feature_paths[feature_part] = part_paths
    return all_feature_paths

#### **Utils**

In [3]:
def load_numpy(numpy_data_path):
    """Read the array stored at ``numpy_data_path`` with ``np.load`` and return it."""
    return np.load(numpy_data_path)

def reading_json_file(json_path):
    """Load and return the JSON document stored at ``json_path``.

    The file is decoded as UTF-8 rather than with the platform's default
    encoding, so non-ASCII content is read the same way on every machine.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).
    """
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

def convert_dict(begin_index, original_dict):
    """Return a new dict keyed by ``begin_index + int(key)`` for every key of ``original_dict``.

    Values are carried over unchanged and the input mapping is not modified.
    """
    shifted = {}
    for raw_key, item in original_dict.items():
        shifted[begin_index + int(raw_key)] = item
    return shifted

def save_bin_file(embeddings, bin_file):
    """Build a flat inner-product FAISS index over ``embeddings`` and write it to ``bin_file``.

    Args:
        embeddings: 2-D array-like; its second dimension is used as the index
            dimension and its rows are added to the index in order.
        bin_file: Destination path handed to ``faiss.write_index``.
    """
    # FAISS consumes C-contiguous float32 data. Converting here keeps
    # float16/float64 or non-contiguous inputs from depending on how the
    # installed FAISS wrapper treats them.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    faiss.write_index(index, bin_file)

#### **Create CLIP/L14 Bin File**

In [None]:
# This notebook builds the .bin index files from extracted model features (CLIP, BLIP, BEiT).
# To build the files for a model, cd into the folder that holds that model's features inside the dataset.
# The cells of this section use ./features, ./id2image and ./dict relative to that folder.
%cd D:/AIC2024/dataset/clip/clip-vit-l14-laion400m

In [None]:
# Index the embedding files and the id2image JSON files, each as {part: {id: path}}.
all_clip_paths = parse_data_path('./features')
all_id2img_paths = parse_data_path('./id2image')

In [18]:
# Collect every video's embeddings into one matrix and merge the per-video
# id2image mappings into a single dict with globally unique integer keys.
global_embeddings = []
global_id2image = {}
begin_index = 0  # number of mapping entries collected so far

for clip_part in sorted(all_clip_paths.keys()):
    clip_dict = all_clip_paths[clip_part]
    id2img_dict = all_id2img_paths[clip_part]
    for video_id in tqdm(sorted(clip_dict.keys())):
        numpy_data_path = clip_dict[video_id]
        id2image_path = id2img_dict[video_id]
        numpy_data = load_numpy(numpy_data_path)
        id2image_data = reading_json_file(id2image_path)
        # Shift this video's keys by the running offset.
        id2image_data = convert_dict(begin_index, id2image_data)
        # The shifted keys only line up with row positions in the concatenated
        # matrix when each video has exactly one entry per embedding row.
        # NOTE(review): assumes the JSON keys are 0-based row indices - confirm.
        if numpy_data.shape[0] != len(id2image_data):
            raise ValueError(
                f'Row/entry mismatch for {clip_part}/{video_id}: '
                f'{numpy_data_path} has {numpy_data.shape[0]} rows but '
                f'{id2image_path} has {len(id2image_data)} entries'
            )
        global_embeddings.append(numpy_data)
        global_id2image.update(id2image_data)
        begin_index += len(id2image_data)
global_embeddings = np.concatenate(global_embeddings, axis=0)

100%|██████████| 31/31 [00:02<00:00, 10.97it/s]
100%|██████████| 31/31 [00:03<00:00,  8.30it/s]
100%|██████████| 30/30 [00:02<00:00, 10.97it/s]
100%|██████████| 30/30 [00:02<00:00, 10.91it/s]
100%|██████████| 31/31 [00:02<00:00, 11.70it/s]
100%|██████████| 31/31 [00:02<00:00, 10.88it/s]
100%|██████████| 31/31 [00:02<00:00, 11.27it/s]
100%|██████████| 30/30 [00:07<00:00,  4.16it/s]
100%|██████████| 29/29 [00:03<00:00,  7.62it/s]
100%|██████████| 29/29 [00:03<00:00,  7.63it/s]
100%|██████████| 30/30 [00:02<00:00, 10.88it/s]
100%|██████████| 30/30 [00:02<00:00, 11.63it/s]


In [19]:
# Write the id -> image mapping (JSON) and the FAISS index (.bin) into ./dict.
save_dir = './dict'
# exist_ok makes this a no-op when the folder is already there.
os.makedirs(save_dir, exist_ok=True)
global_bin_save_path = os.path.join(save_dir, 'l14_laion400m.bin')
global_id2img_save_path = os.path.join(save_dir, 'l14_laion400m.json')
# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# pinned to UTF-8 instead of being left to the platform default.
with open(global_id2img_save_path, 'w', encoding='utf-8') as file:
    json.dump(global_id2image, file, ensure_ascii=False, indent=4)
save_bin_file(global_embeddings, bin_file=global_bin_save_path)

#### **Create CLIP/H14 Bin File**

In [None]:
# This notebook builds the .bin index files from extracted model features (CLIP, BLIP, BEiT).
# To build the files for a model, cd into the folder that holds that model's features inside the dataset.
# The cells of this section use ./features, ./id2image and ./dict relative to that folder.
%cd D:/AIC2024/dataset/clip/clip-vit-h14-laion2b-s32b-b79k

In [None]:
# Index the embedding files and the id2image JSON files, each as {part: {id: path}}.
all_clip_paths = parse_data_path('./features')
all_id2img_paths = parse_data_path('./id2image')

In [None]:
# Collect every video's embeddings into one matrix and merge the per-video
# id2image mappings into a single dict with globally unique integer keys.
global_embeddings = []
global_id2image = {}
begin_index = 0  # number of mapping entries collected so far

for clip_part in sorted(all_clip_paths.keys()):
    clip_dict = all_clip_paths[clip_part]
    id2img_dict = all_id2img_paths[clip_part]
    for video_id in tqdm(sorted(clip_dict.keys())):
        numpy_data_path = clip_dict[video_id]
        id2image_path = id2img_dict[video_id]
        numpy_data = load_numpy(numpy_data_path)
        id2image_data = reading_json_file(id2image_path)
        # Shift this video's keys by the running offset.
        id2image_data = convert_dict(begin_index, id2image_data)
        # The shifted keys only line up with row positions in the concatenated
        # matrix when each video has exactly one entry per embedding row.
        # NOTE(review): assumes the JSON keys are 0-based row indices - confirm.
        if numpy_data.shape[0] != len(id2image_data):
            raise ValueError(
                f'Row/entry mismatch for {clip_part}/{video_id}: '
                f'{numpy_data_path} has {numpy_data.shape[0]} rows but '
                f'{id2image_path} has {len(id2image_data)} entries'
            )
        global_embeddings.append(numpy_data)
        global_id2image.update(id2image_data)
        begin_index += len(id2image_data)
global_embeddings = np.concatenate(global_embeddings, axis=0)

In [None]:
# Write the id -> image mapping (JSON) and the FAISS index (.bin) into ./dict.
save_dir = './dict'
# exist_ok makes this a no-op when the folder is already there.
os.makedirs(save_dir, exist_ok=True)
global_bin_save_path = os.path.join(save_dir, 'h14_laion2b.bin')
global_id2img_save_path = os.path.join(save_dir, 'h14_laion2b.json')
# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# pinned to UTF-8 instead of being left to the platform default.
with open(global_id2img_save_path, 'w', encoding='utf-8') as file:
    json.dump(global_id2image, file, ensure_ascii=False, indent=4)
save_bin_file(global_embeddings, bin_file=global_bin_save_path)

#### **Create BLIP Bin File**

In [None]:
# This notebook builds the .bin index files from extracted model features (CLIP, BLIP, BEiT).
# To build the files for a model, cd into the folder that holds that model's features inside the dataset.
# The cells of this section use ./features, ./id2image and ./dict relative to that folder.
%cd D:/AIC2024/dataset/blip/blip2-vitL

In [None]:
# Index the embedding files and the id2image JSON files, each as {part: {id: path}}.
all_blip_paths = parse_data_path('./features')
all_id2img_paths = parse_data_path('./id2image')

In [None]:
# Collect every video's embeddings into one matrix and merge the per-video
# id2image mappings into a single dict with globally unique integer keys.
global_embeddings = []
global_id2image = {}
begin_index = 0  # number of mapping entries collected so far

for blip_part in sorted(all_blip_paths.keys()):
    blip_dict = all_blip_paths[blip_part]
    id2img_dict = all_id2img_paths[blip_part]
    for video_id in tqdm(sorted(blip_dict.keys())):
        numpy_data_path = blip_dict[video_id]
        id2image_path = id2img_dict[video_id]
        numpy_data = load_numpy(numpy_data_path)
        id2image_data = reading_json_file(id2image_path)
        # Shift this video's keys by the running offset.
        id2image_data = convert_dict(begin_index, id2image_data)
        # The shifted keys only line up with row positions in the concatenated
        # matrix when each video has exactly one entry per embedding row.
        # NOTE(review): assumes the JSON keys are 0-based row indices - confirm.
        if numpy_data.shape[0] != len(id2image_data):
            raise ValueError(
                f'Row/entry mismatch for {blip_part}/{video_id}: '
                f'{numpy_data_path} has {numpy_data.shape[0]} rows but '
                f'{id2image_path} has {len(id2image_data)} entries'
            )
        global_embeddings.append(numpy_data)
        global_id2image.update(id2image_data)
        begin_index += len(id2image_data)
global_embeddings = np.concatenate(global_embeddings, axis=0)

In [None]:
# Write the id -> image mapping (JSON) and the FAISS index (.bin) into ./dict.
save_dir = './dict'
# exist_ok makes this a no-op when the folder is already there.
os.makedirs(save_dir, exist_ok=True)
global_bin_save_path = os.path.join(save_dir, 'blip_vit.bin')
global_id2img_save_path = os.path.join(save_dir, 'blip_vit.json')
# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# pinned to UTF-8 instead of being left to the platform default.
with open(global_id2img_save_path, 'w', encoding='utf-8') as file:
    json.dump(global_id2image, file, ensure_ascii=False, indent=4)
save_bin_file(global_embeddings, bin_file=global_bin_save_path)

#### **Create BEIT base model Bin File**

In [4]:
# This notebook builds the .bin index files from extracted model features (CLIP, BLIP, BEiT).
# To build the files for a model, cd into the folder that holds that model's features inside the dataset.
# The cells of this section use ./features, ./id2image and ./dict relative to that folder.
%cd D:/AIC2024/dataset/beit/base

D:\AIC2024\dataset\beit


In [5]:
# Index the embedding files and the id2image JSON files, each as {part: {id: path}}.
all_blip_paths = parse_data_path('./features')
all_id2img_paths = parse_data_path('./id2image')

In [6]:
# Collect every video's embeddings into one matrix and merge the per-video
# id2image mappings into a single dict with globally unique integer keys.
global_embeddings = []
global_id2image = {}
begin_index = 0  # number of mapping entries collected so far

for blip_part in sorted(all_blip_paths.keys()):
    blip_dict = all_blip_paths[blip_part]
    id2img_dict = all_id2img_paths[blip_part]
    for video_id in tqdm(sorted(blip_dict.keys())):
        numpy_data_path = blip_dict[video_id]
        id2image_path = id2img_dict[video_id]
        numpy_data = load_numpy(numpy_data_path)
        id2image_data = reading_json_file(id2image_path)
        # Shift this video's keys by the running offset.
        id2image_data = convert_dict(begin_index, id2image_data)
        # The shifted keys only line up with row positions in the concatenated
        # matrix when each video has exactly one entry per embedding row.
        # NOTE(review): assumes the JSON keys are 0-based row indices - confirm.
        if numpy_data.shape[0] != len(id2image_data):
            raise ValueError(
                f'Row/entry mismatch for {blip_part}/{video_id}: '
                f'{numpy_data_path} has {numpy_data.shape[0]} rows but '
                f'{id2image_path} has {len(id2image_data)} entries'
            )
        global_embeddings.append(numpy_data)
        global_id2image.update(id2image_data)
        begin_index += len(id2image_data)
global_embeddings = np.concatenate(global_embeddings, axis=0)

100%|██████████| 31/31 [00:01<00:00, 17.34it/s]
100%|██████████| 31/31 [00:01<00:00, 16.82it/s]
100%|██████████| 30/30 [00:01<00:00, 17.14it/s]
100%|██████████| 30/30 [00:01<00:00, 24.69it/s]
100%|██████████| 31/31 [00:01<00:00, 26.86it/s]
100%|██████████| 31/31 [00:01<00:00, 26.38it/s]
100%|██████████| 31/31 [00:01<00:00, 24.70it/s]
100%|██████████| 30/30 [00:01<00:00, 24.89it/s]
100%|██████████| 29/29 [00:01<00:00, 16.62it/s]
100%|██████████| 29/29 [00:01<00:00, 25.94it/s]
100%|██████████| 30/30 [00:02<00:00, 11.82it/s]
100%|██████████| 30/30 [00:02<00:00, 12.43it/s]


In [7]:
# Write the id -> image mapping (JSON) and the FAISS index (.bin) into ./dict.
save_dir = './dict'
# exist_ok makes this a no-op when the folder is already there.
os.makedirs(save_dir, exist_ok=True)
global_bin_save_path = os.path.join(save_dir, 'base_beit.bin')
global_id2img_save_path = os.path.join(save_dir, 'base_beit.json')
# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# pinned to UTF-8 instead of being left to the platform default.
with open(global_id2img_save_path, 'w', encoding='utf-8') as file:
    json.dump(global_id2image, file, ensure_ascii=False, indent=4)
save_bin_file(global_embeddings, bin_file=global_bin_save_path)

#### **Create BEIT large model Bin File**

In [None]:
# This notebook builds the .bin index files from extracted model features (CLIP, BLIP, BEiT).
# To build the files for a model, cd into the folder that holds that model's features inside the dataset.
# The cells of this section use ./features, ./id2image and ./dict relative to that folder.
%cd D:/AIC2024/dataset/beit/large

In [None]:
# Index the embedding files and the id2image JSON files, each as {part: {id: path}}.
all_blip_paths = parse_data_path('./features')
all_id2img_paths = parse_data_path('./id2image')

In [None]:
# Collect every video's embeddings into one matrix and merge the per-video
# id2image mappings into a single dict with globally unique integer keys.
global_embeddings = []
global_id2image = {}
begin_index = 0  # number of mapping entries collected so far

for blip_part in sorted(all_blip_paths.keys()):
    blip_dict = all_blip_paths[blip_part]
    id2img_dict = all_id2img_paths[blip_part]
    for video_id in tqdm(sorted(blip_dict.keys())):
        numpy_data_path = blip_dict[video_id]
        id2image_path = id2img_dict[video_id]
        numpy_data = load_numpy(numpy_data_path)
        id2image_data = reading_json_file(id2image_path)
        # Shift this video's keys by the running offset.
        id2image_data = convert_dict(begin_index, id2image_data)
        # The shifted keys only line up with row positions in the concatenated
        # matrix when each video has exactly one entry per embedding row.
        # NOTE(review): assumes the JSON keys are 0-based row indices - confirm.
        if numpy_data.shape[0] != len(id2image_data):
            raise ValueError(
                f'Row/entry mismatch for {blip_part}/{video_id}: '
                f'{numpy_data_path} has {numpy_data.shape[0]} rows but '
                f'{id2image_path} has {len(id2image_data)} entries'
            )
        global_embeddings.append(numpy_data)
        global_id2image.update(id2image_data)
        begin_index += len(id2image_data)
global_embeddings = np.concatenate(global_embeddings, axis=0)

In [None]:
# Write the id -> image mapping (JSON) and the FAISS index (.bin) into ./dict.
save_dir = './dict'
# exist_ok makes this a no-op when the folder is already there.
os.makedirs(save_dir, exist_ok=True)
global_bin_save_path = os.path.join(save_dir, 'large_beit.bin')
global_id2img_save_path = os.path.join(save_dir, 'large_beit.json')
# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# pinned to UTF-8 instead of being left to the platform default.
with open(global_id2img_save_path, 'w', encoding='utf-8') as file:
    json.dump(global_id2image, file, ensure_ascii=False, indent=4)
save_bin_file(global_embeddings, bin_file=global_bin_save_path)