In [None]:
!pip install annoy

In [None]:
from annoy import AnnoyIndex
import numpy as np
import json
import re
import os
import time

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
class Annoy_Processing:
  """Nearest-neighbour search over keyframe CLIP features using an Annoy index.

  Expected database layout (the feature folders live under ``root_database``):

      Database/
      |
      |-- KeyFramesC00_V00/
      |   `-- C00_V0000/
      |       `-- 000000.jpg   - all keyframes extracted from the videos
      |
      `-- CLIPFeatures_C00_V00/
          `-- C00_V0000.npy    - CLIP features of one video's keyframes, stored in a single .npy file

  Row ``i`` of a ``.npy`` file is assumed to belong to the ``i``-th keyframe of the
  matching keyframe directory when the file names are sorted numerically.
  """

  def __init__(self, root_database, json_path: str):
    """Store the database root and load the keyframe-id -> info mapping.

    Args:
      root_database: Directory that contains the ``CLIPFeatures_*`` folders.
      json_path: JSON file mapping keyframe ids to dicts that hold at least
        an ``"image_path"`` entry.
    """
    self.root_database = root_database
    self.id2img_fps = self.load_json_file(json_path)

    ## Scaling (kept as an attribute; the current pipeline does not apply it)
    self.sc = StandardScaler()
    ## PCA (fitted once per index build, only when a reduced dimension is requested)
    self.pca = PCA(n_components = 100)

  def load_json_file(self, json_path: str):
    """Read ``json_path`` and return its top-level object with keys cast to ``int``."""
    with open(json_path, 'r', encoding='utf-8') as f:
      js = json.load(f)

    return {int(k): v for k, v in js.items()}

  def _feature_path(self, image_path):
    """Derive the ``.npy`` feature file path that belongs to a keyframe image path."""
    parts = image_path.split('/')  ## Ex: Database/KeyFramesC00_V00/C00_V0000/000000.jpg

    batch_name = parts[-3].split('_')[-1]  ## Ex: V00
    video_id = re.sub(r'_V\d+', '', parts[-2])  ## Ex: C00
    clip_name = f"CLIPFeatures_{video_id}_{batch_name}"  ## Ex: CLIPFeatures_C00_V00
    npy_name = parts[-2] + '.npy'  ## Ex: C00_V0000.npy

    return os.path.join(self.root_database, clip_name, npy_name)

  def _load_raw_feature(self, keyframe_id, feats_cache, rows_cache):
    """Return the stored feature row of one keyframe.

    ``feats_cache`` (feature path -> loaded array) and ``rows_cache`` (keyframe
    directory -> {file name: row}) are filled lazily so that each feature file
    and each directory is read only once per index build.
    """
    image_path = self.id2img_fps[keyframe_id]["image_path"]

    feat_path = self._feature_path(image_path)
    if feat_path not in feats_cache:
      feats_cache[feat_path] = np.load(feat_path)

    frame_dir, frame_name = os.path.split(image_path)
    if frame_dir not in rows_cache:
      # Only numerically named files can be ordered against the feature rows;
      # stray entries (hidden files, thumbnails, ...) are ignored instead of
      # crashing the int() conversion.
      names = [n for n in os.listdir(frame_dir) if n.split('.')[0].isdecimal()]
      names.sort(key=lambda n: int(n.split('.')[0]))
      rows_cache[frame_dir] = {n: row for row, n in enumerate(names)}

    row = rows_cache[frame_dir].get(frame_name)
    if row is None:
      raise ValueError(f"{frame_name!r} is not in keyframe directory {frame_dir!r}")

    return feats_cache[feat_path][row]

  def _reduce(self, raw, feature_shape):
    """Bring the stacked raw features to ``feature_shape`` columns.

    Vectors that already have ``feature_shape`` columns are returned untouched.
    Otherwise ``self.pca`` is fitted ONCE on all vectors so that every item is
    projected into the same basis (fitting per feature file would make the
    projected vectors incomparable).

    Raises:
      ValueError: if the features are not 2-D, if ``feature_shape`` exceeds the
        raw dimension, or if there are fewer items than requested components.
    """
    if raw.ndim != 2:
      raise ValueError(f"expected one 1-D feature per item, got stacked shape {raw.shape}")

    n_items, raw_dim = raw.shape
    if feature_shape == raw_dim:
      return raw
    if feature_shape > raw_dim:
      raise ValueError(f"feature_shape={feature_shape} exceeds the stored feature dimension {raw_dim}")
    if n_items < feature_shape:
      raise ValueError(f"PCA to {feature_shape} components needs at least {feature_shape} items, got {n_items}")

    self.pca.set_params(n_components=feature_shape)
    return self.pca.fit_transform(raw)

  def buildAnnoyIndex(self, id_query, ls_id_db, metric, ntrees, feature_shape):
    """Build an Annoy index holding the query keyframe followed by the database keyframes.

    Annoy item ``0`` is ``id_query``; item ``i`` (``i >= 1``) is ``ls_id_db[i - 1]``.
    ``ls_id_db`` is not modified.

    Args:
      id_query: Keyframe id of the query.
      ls_id_db: Keyframe ids to search in.
      metric: Metric name passed to ``AnnoyIndex``.
      ntrees: Number of trees passed to ``index.build``.
      feature_shape: Dimension of the indexed vectors (see ``_reduce``).

    Returns:
      The built ``AnnoyIndex``.
    """
    ####### MERGE ID_QUERY WITH ID DATABASE #######
    item_ids = [id_query, *ls_id_db]

    ####### LOAD FEATURES #######
    feats_cache, rows_cache = {}, {}
    raw = np.stack([self._load_raw_feature(kf_id, feats_cache, rows_cache) for kf_id in item_ids])
    vectors = self._reduce(raw, feature_shape)

    ####### BUILD ANNOY #######
    index = AnnoyIndex(feature_shape, metric)
    for position, vector in enumerate(vectors):
      index.add_item(position, vector)
    index.build(ntrees)

    return index

  def annoy_search(self, id_query, list_id_database, metric, ntrees, topk, feature_shape=512):
    """Return the ``topk`` nearest keyframes to ``id_query`` among query + database.

    The query itself is part of the index, so it is normally the first hit.

    Returns:
      A tuple ``(idx_image, infos_query, image_paths)``: the keyframe ids of the
      hits, their info dicts from the JSON mapping, and their image paths.
    """
    # Annoy only knows positions; keep the position -> keyframe id table here.
    item_ids = [id_query, *list_id_database]

    ##### Build Annoy #####
    start_time = time.time()
    annoy_idx = self.buildAnnoyIndex(id_query, list(list_id_database), metric, ntrees, feature_shape)
    print(f'Time Build for {len(list_id_database)+1} samples: {time.time()-start_time}')

    ##### Searching #####
    start_time = time.time()
    positions = annoy_idx.get_nns_by_item(0, topk)  # item 0 is the query
    print(f'Time Search for Topk={topk}: {time.time()-start_time}')

    ##### Get Infos #####
    idx_image = [item_ids[p] for p in positions]
    infos_query = [self.id2img_fps[kf_id] for kf_id in idx_image]
    image_paths = [info['image_path'] for info in infos_query]

    return idx_image, infos_query, image_paths

In [None]:
# Demo run: search the neighbours of keyframe 0 among keyframes 1..499.
root_database = '/content/drive/MyDrive/Merge_Database'
json_path = '/content/drive/MyDrive/Video_Retrieval/faiss_merge_colab/keyframes_id.json'

## Test Samples ##
id_query = 0
list_id_database = list(range(1,500))

## Params ##
metric = "manhattan"
ntrees = 10
topk = 200
# Index dimensionality; 100 matches PCA(n_components=100) set up in Annoy_Processing.__init__.
feature_shape = 100

## Annoy ##
my_annoy = Annoy_Processing(root_database, json_path)
idx_image, infos_query, image_paths = my_annoy.annoy_search(id_query, list_id_database, metric, ntrees, topk=topk, feature_shape=feature_shape)

In [None]:
def main():
  """Run a sample Annoy search and return its result.

  Returns:
    The tuple ``(idx_image, infos_query, image_paths)`` produced by
    ``Annoy_Processing.annoy_search``.
  """
  root_database = '/content/drive/MyDrive/Merge_Database'
  json_path = '/content/drive/MyDrive/Video_Retrieval/faiss_merge_colab/keyframes_id.json'

  ## Test Samples ##
  id_query = 0
  list_id_database = list(range(1,500))

  ## Params ##
  metric = "manhattan"
  ntrees = 10
  topk = 200
  # Index dimensionality; 100 matches PCA(n_components=100) set up in Annoy_Processing.__init__.
  feature_shape = 100

  ## Annoy Implement ##
  my_annoy = Annoy_Processing(root_database, json_path)
  return my_annoy.annoy_search(id_query, list_id_database, metric, ntrees, topk=topk, feature_shape=feature_shape)

if __name__ == "__main__":
  main()