#### **Import Libraries**

In [1]:
import os
import scipy
import pickle
import json
import bm25s
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

  from .autonotebook import tqdm as notebook_tqdm


resource module not available on Windows


In [2]:
%cd D:/AIC2024/dataset

D:\AIC2024\dataset


#### **Utils**

In [3]:
def sorted_by_id(keyframe_paths):
    """Return the keyframe paths ordered by the integer id in their file name.

    The id is the part of the base name before the first '.', so
    'L01/V001/0012.jpg' has id 12. Both '/' and the backslash are treated as
    directory separators so that Windows-style paths sort correctly instead
    of failing in ``int()``.

    Args:
        keyframe_paths: Iterable of path strings.

    Returns:
        A new list of the same path strings in ascending id order. Paths
        sharing an id keep their input order (the sort is stable).

    Raises:
        ValueError: If a file name does not start with an integer id.
    """
    def keyframe_id(keyframe_path):
        # Normalise separators first, then keep only the last path component.
        keyframe_filename = keyframe_path.replace('\\', '/').rsplit('/', 1)[-1]
        return int(keyframe_filename.split('.', 1)[0])

    return sorted(keyframe_paths, key=keyframe_id)

def sort_dict_by_filename(feature_dict):
    """Return a new dict with the entries of ``feature_dict`` in keyframe-id order.

    The ordering of the keys (keyframe paths) is delegated to ``sorted_by_id``;
    values are carried over unchanged.
    """
    ordered = {}
    for keyframe_path in sorted_by_id(list(feature_dict.keys())):
        ordered[keyframe_path] = feature_dict[keyframe_path]
    return ordered

#### **Metadata Encoder**

In [4]:
class MetadataEncoder:
    """Build and persist TF-IDF / BM25 indexes over per-keyframe metadata text.

    Typical use is ``extract_metadata_*`` followed by the matching
    ``save_context_*`` for the same ``data_type``. Each ``extract_metadata_*``
    call overwrites ``self.id2image_fps``, ``self.metadata_features`` and
    ``self.transform``, so always save right after extracting.

    Args:
        metadata_dirs: Mapping from data type name to the directory that
            holds its metadata files.
        all_datatype: List of the data type names handled by this encoder
            (stored only; not read by the methods below).
        ngram_range: ``(min_n, max_n)`` passed to ``TfidfVectorizer``.
    """

    def __init__(self, metadata_dirs, all_datatype, ngram_range=(1, 1)):
        self.metadata_dirs = metadata_dirs
        self.all_datatype = all_datatype
        self.ngram_range = ngram_range

    def load_context(self, metadata_dir, data_type):
        """Load one metadata field for every keyframe found under ``metadata_dir``.

        The expected layout is ``metadata_dir/<part>/<video file>``, where each
        video file is a JSON object mapping a keyframe path to a dict that
        contains ``data_type``. Parts and video files are visited in sorted
        name order, and keyframes inside a file in ``sort_dict_by_filename``
        order.

        Args:
            metadata_dir: Root directory of the metadata files.
            data_type: Key to read from each keyframe's metadata dict.

        Returns:
            ``(id2image_fps, metadata_features)``: a dict mapping the running
            index to the keyframe path, and the list of field values in the
            same order.

        Raises:
            KeyError: If a keyframe entry has no ``data_type`` field.
        """
        feature_data_paths = []
        for part_name in sorted(os.listdir(metadata_dir)):
            part_data_dir = os.path.join(metadata_dir, part_name)
            # Skip stray files next to the part directories.
            if not os.path.isdir(part_data_dir):
                continue
            for video_name in sorted(os.listdir(part_data_dir)):
                video_data_path = os.path.join(part_data_dir, video_name)
                # Skip sub-directories; only regular files are parsed.
                if os.path.isfile(video_data_path):
                    feature_data_paths.append(video_data_path)

        keyframe_paths, metadata_features = [], []
        for feature_data_path in feature_data_paths:
            # Explicit UTF-8: do not depend on the platform default encoding.
            with open(feature_data_path, 'r', encoding='utf-8') as f:
                metadata_dict = json.load(f)
            sorted_metadata_dict = sort_dict_by_filename(metadata_dict)
            keyframe_paths.extend(sorted_metadata_dict.keys())
            metadata_features.extend(
                metadata_feature[data_type]
                for metadata_feature in sorted_metadata_dict.values()
            )

        id2image_fps = dict(enumerate(keyframe_paths))
        return id2image_fps, metadata_features

    def _save_id2image_fps(self, save_path, data_type):
        """Write ``self.id2image_fps`` to ``id2image_fps_<data_type>.json`` in ``save_path``."""
        json_file_path = os.path.join(save_path, f'id2image_fps_{data_type}.json')
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding is pinned to UTF-8. json.dump writes the int keys as strings.
        with open(json_file_path, "w", encoding='utf-8') as file:
            json.dump(self.id2image_fps, file, ensure_ascii=False, indent=4)

    def save_context_tfidf(self, save_path, data_type):
        """Persist the TF-IDF artefacts produced by ``extract_metadata_tfidf``.

        Writes the sparse matrix (``.npz``), the pickled vectorizer (``.pkl``)
        and the index-to-keyframe JSON into ``save_path``, creating the
        directory if needed.
        """
        os.makedirs(save_path, exist_ok=True)
        save_context_matrix_path = os.path.join(save_path, f'sparse_context_matrix_{data_type}.npz')
        save_transform_path = os.path.join(save_path, f'transform_{data_type}.pkl')

        scipy.sparse.save_npz(save_context_matrix_path, self.context_matrix)
        with open(save_transform_path, 'wb') as f:
            pickle.dump(self.transform, f)
        self._save_id2image_fps(save_path, data_type)

    def save_context_bm25(self, save_path, data_type):
        """Persist the BM25 index produced by ``extract_metadata_bm25``.

        Calls the retriever's ``save`` with the raw metadata strings as
        ``corpus`` and writes the index-to-keyframe JSON into ``save_path``,
        creating the directory if needed.
        """
        os.makedirs(save_path, exist_ok=True)
        self.transform.save(save_path, corpus=self.metadata_features)
        self._save_id2image_fps(save_path, data_type)

    def extract_metadata_tfidf(self, data_type):
        """Fit a TF-IDF vectorizer on the ``data_type`` field of every keyframe.

        Sets ``self.id2image_fps``, ``self.metadata_features``,
        ``self.transform`` (the fitted vectorizer) and ``self.context_matrix``
        (its CSR document-term matrix).
        """
        metadata_dir = self.metadata_dirs[data_type]
        self.id2image_fps, self.metadata_features = self.load_context(metadata_dir, data_type)
        # Use the configured n-gram range instead of a hard-coded (1, 1).
        self.transform = TfidfVectorizer(
            input='content',
            ngram_range=self.ngram_range,
            token_pattern=r"(?u)\b[\w\d]+\b",
        )
        self.context_matrix = self.transform.fit_transform(self.metadata_features).tocsr()

    def extract_metadata_bm25(self, data_type):
        """Build a BM25 (``method="lucene"``) index on the ``data_type`` field.

        Sets ``self.id2image_fps``, ``self.metadata_features`` and
        ``self.transform`` (the indexed ``bm25s.BM25`` retriever).
        """
        metadata_dir = self.metadata_dirs[data_type]
        self.id2image_fps, self.metadata_features = self.load_context(metadata_dir, data_type)
        tokenized_context = bm25s.tokenize(self.metadata_features)
        self.transform = bm25s.BM25(method="lucene")
        self.transform.index(tokenized_context)

#### **Metadata Paths Definition**

In [5]:
# Object- and color-derived fields live in two separate feature directories
# (relative to the current working directory).
object_path = './metadata/object/features'
color_path = './metadata/color/features'

# Map every metadata field to the directory that stores it.
metadata_dirs = {
    **dict.fromkeys(['object_bbox', 'object_class', 'object_number'], object_path),
    **dict.fromkeys(['color_bbox', 'color_class'], color_path),
}
# Field names in the same order as the mapping above.
all_datatype = list(metadata_dirs)

context_encoder = MetadataEncoder(
    metadata_dirs=metadata_dirs,
    all_datatype=all_datatype,
    ngram_range=(1, 1),
)

#### **Encoding using TF-IDF**

In [6]:
# Fit and persist one TF-IDF index per object-derived metadata field.
metadata_types = ['object_bbox', 'object_class', 'object_number']
for metadata_type in metadata_types:
    context_encoder.extract_metadata_tfidf(data_type=metadata_type)
    # Each field gets its own output directory.
    save_path = f'./metadata/object/dict/tf-idf/{metadata_type}'
    context_encoder.save_context_tfidf(save_path=save_path, data_type=metadata_type)

In [7]:
# Fit and persist one TF-IDF index per color-derived metadata field.
metadata_types = ['color_bbox', 'color_class']
for metadata_type in metadata_types:
    context_encoder.extract_metadata_tfidf(data_type=metadata_type)
    # Each field gets its own output directory.
    save_path = f'./metadata/color/dict/tf-idf/{metadata_type}'
    context_encoder.save_context_tfidf(save_path=save_path, data_type=metadata_type)

#### **Encoding using BM25**

In [8]:
# Build and persist one BM25 index per object-derived metadata field.
metadata_types = ['object_bbox', 'object_class', 'object_number']
for metadata_type in metadata_types:
    context_encoder.extract_metadata_bm25(data_type=metadata_type)
    # Each field gets its own output directory.
    save_path = f'./metadata/object/dict/bm25/{metadata_type}'
    context_encoder.save_context_bm25(save_path=save_path, data_type=metadata_type)

Finding newlines for mmindex: 100%|██████████| 50.1M/50.1M [00:04<00:00, 11.8MB/s]
Finding newlines for mmindex: 100%|██████████| 9.05M/9.05M [00:05<00:00, 1.67MB/s]
Finding newlines for mmindex: 100%|██████████| 6.52M/6.52M [00:04<00:00, 1.47MB/s]


In [9]:
# Build and persist one BM25 index per color-derived metadata field.
metadata_types = ['color_bbox', 'color_class']
for metadata_type in metadata_types:
    context_encoder.extract_metadata_bm25(data_type=metadata_type)
    # Each field gets its own output directory.
    save_path = f'./metadata/color/dict/bm25/{metadata_type}'
    context_encoder.save_context_bm25(save_path=save_path, data_type=metadata_type)

Finding newlines for mmindex: 100%|██████████| 210M/210M [00:05<00:00, 41.0MB/s]   
Finding newlines for mmindex: 100%|██████████| 156M/156M [00:04<00:00, 38.7MB/s] 
