## Import libraries

In [1]:
import os
import numpy as np
import json
from qdrant_client import QdrantClient
from qdrant_client.http import models
from app.utils import get_video_pack_files, get_keyframe_data
from app.PATH import CLIP_FEATURES_PATH, MAP_KEYFRAMES_PATH

  from .autonotebook import tqdm as notebook_tqdm


## Create Qdrant client

In [2]:
# Connection settings for the hosted Qdrant cluster. The API key is read from
# the QDRANT_TOKEN environment variable so it never appears in the notebook.
_qdrant_settings = {
    "url": "https://9bf65806-b1f1-498b-b309-079694a5a23b.us-east4-0.gcp.cloud.qdrant.io:6333",
    "api_key": os.getenv("QDRANT_TOKEN"),
    "timeout": 60,
}
client = QdrantClient(**_qdrant_settings)

## Constants

In [7]:
# Video packs handled by the upsert cell below (1 through 20).
# Alternative range kept for reference (packs 21-30): list(range(21, 31))
VIDEO_PACK_LIST = list(range(1, 21))
# Name of the Qdrant collection that every later cell operates on.
COLLECTION_NAME = "my_collection"
print(VIDEO_PACK_LIST)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


## Create collection

In [8]:
# Create the collection only when it is missing, so re-running this cell is safe.
existing_names = {info.name for info in client.get_collections().collections}
if COLLECTION_NAME in existing_names:
    print(f"Collection '{COLLECTION_NAME}' already exists.")
else:
    # indexing_threshold=0 is used for the initial load; the "Enable indexing"
    # section later raises it to 20000. Presumably this keeps index building
    # switched off during the bulk upload - confirm against the Qdrant docs.
    bulk_load_optimizers = models.OptimizersConfigDiff(indexing_threshold=0)
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(size=512, distance=models.Distance.COSINE),
        optimizers_config=bulk_load_optimizers,
    )
    print(f"Collection '{COLLECTION_NAME}' created.")

Collection 'my_collection' already exists.


## Upsert data automatically

In [None]:
# Upload every feature file of every pack in VIDEO_PACK_LIST to the collection,
# one upsert per video file, with one point per keyframe.
if not os.path.exists(CLIP_FEATURES_PATH):
    print("The folder does not exist")
else:
    # Continue numbering after the points that are already stored.
    # NOTE(review): this assumes the existing ids are exactly 0..count-1. If
    # points were ever deleted or inserted with other ids, the new ids can
    # collide with existing points and overwrite them - confirm before re-running.
    start_id = client.count(
        collection_name=COLLECTION_NAME,
        count_filter=models.Filter(
            must=[]
        ),
        exact=True,
    ).count

    print(os.listdir(CLIP_FEATURES_PATH)[:5])
    for pack in VIDEO_PACK_LIST:
        # Zero-pad to at least two digits ("K01", "K12"). The previous if/elif
        # only assigned a name for pack < 100, so a larger pack number silently
        # reused the name from the previous iteration.
        video_pack = f"K{pack:02d}"
        print(f"Process {video_pack}")
        files_per_video_pack = get_video_pack_files(CLIP_FEATURES_PATH, video_pack)
        # A list of .npy files, each file contains a list of 512 features vectors,
        # each vector is embedded from a keyframe of the video that has the same name as the file

        if len(files_per_video_pack) == 0:
            print(f"No files found for video pack {video_pack}")
            continue

        for file in files_per_video_pack:
            # Process each video features in the pack
            feature = np.load(os.path.join(CLIP_FEATURES_PATH, file))
            # Require a 2-D (frames x 512) array; the ndim check keeps a 1-D
            # array from raising IndexError on shape[1].
            if feature.ndim != 2 or feature.shape[1] != 512:
                print(f"File {file} does not have 512 features, skipping")
                continue

            # Insert the feature into the collection
            file_name = file[:-4]  # strip the ".npy" extension
            num_frames = feature.shape[0]
            # Keyframe table for this video; row i is read for feature row i.
            # NOTE(review): assumes it has at least num_frames rows - confirm.
            df = get_keyframe_data(MAP_KEYFRAMES_PATH, file_name)
            payloads = [
                {
                    "pack": file_name[:3],
                    "video": file_name[4:],
                    "frame_index": df.iloc[i]["frame_idx"],
                    # 1-based frame number, zero-padded to three digits ("001.jpg").
                    "frame": f"{i + 1:03d}.jpg",
                }
                for i in range(num_frames)
            ]

            client.upsert(
                collection_name=COLLECTION_NAME,
                points=models.Batch(
                    # Materialise the ids as a real list instead of passing a range object.
                    ids=list(range(start_id, start_id + num_frames)),
                    vectors=feature.tolist(),
                    payloads=payloads,
                ),
            )
            start_id += num_frames

['K01_V001.npy', 'K01_V002.npy', 'K01_V003.npy', 'K01_V004.npy', 'K01_V005.npy']
Process K01
Process K02
Process K03
Process K04
Process K05
Process K06
Process K07
Process K08
Process K09
Process K10
Process K11
Process K12
Process K13
Process K14
Process K15
Process K16
Process K17
Process K18
Process K19
Process K20


## Upsert data manually (1 pack at a time)

In [None]:
# start_id = 0

In [None]:
# video_pack = "L30" # manually iterate from this list ["L21", "L22", "L23", ..., "L30"]

# files_per_video_pack = get_video_pack_files(CLIP_FEATURES_PATH, video_pack)
# # A list of .npy files, each file contains a list of 512 features vectors,
# # each vector is embedded from a keyframe of the video that has the same name as the file

# if len(files_per_video_pack) == 0:
#     print(f"No files found for video pack {video_pack}")
# else:
#     for file_name in files_per_video_pack:
#         feature = np.load(os.path.join(CLIP_FEATURES_PATH, file_name))
#         if feature.shape[1] != 512:
#             print(f"File {file_name} does not have 512 features, skipping")
#             continue
#         print(f"Processing file {file_name} with shape {feature.shape}")

#         # Insert the feature into the collection
#         num_frames = feature.shape[0]
#         payloads = [
#             {"origin": file_name[:-4], "frame_id": i + 1}
#             for i in range(num_frames)
#         ]

#         client.upsert(
#             collection_name=COLLECTION_NAME,
#             points=models.Batch(
#                 ids=range(start_id, start_id + feature.shape[0]),
#                 vectors=feature.tolist(),
#                 payloads=payloads
#             )
#         )
#         start_id += feature.shape[0]

## Check collection status

In [16]:
def count_points_without_vectors(client, collection_name, batch_size=1000):
    """Count the points in a collection whose stored vector is missing.

    Pages through the whole collection with ``client.scroll`` and counts the
    returned records whose ``vector`` attribute is ``None``.

    Args:
        client: Object exposing a Qdrant-style ``scroll`` method that returns
            a ``(records, next_offset)`` pair.
        collection_name: Name of the collection to scan.
        batch_size: Number of points requested per ``scroll`` call.

    Returns:
        int: Number of scanned points that have no vector.
    """
    offset = None
    count = 0
    while True:
        points, offset = client.scroll(
            collection_name=collection_name,
            with_vectors=True,
            # Only the vectors are inspected, so do not transfer the payloads.
            with_payload=False,
            offset=offset,
            limit=batch_size,
        )
        count += sum(1 for point in points if point.vector is None)
        # A None offset marks the last page.
        if offset is None:
            break
    return count

# Scan the whole collection and report how many points lack a vector.
no_vector_count = count_points_without_vectors(
    client=client,
    collection_name=COLLECTION_NAME,
)
print(f"Points with no vectors: {no_vector_count}")

Points with no vectors: 0


## Enable indexing

In [17]:
# Raise the indexing threshold from the 0 used at collection creation to 20000.
# Uses `optimizers_config`, the same keyword as create_collection above;
# `optimizer_config` is the client's legacy spelling of this argument.
client.update_collection(
    collection_name=COLLECTION_NAME,
    optimizers_config=models.OptimizersConfigDiff(indexing_threshold=20000),
)

True

In [28]:
# Build keyword indexes on the two payload fields that later cells filter on.
client.create_payload_index(
    collection_name=COLLECTION_NAME,
    field_name="pack",
    field_schema=models.PayloadSchemaType.KEYWORD,
)
client.create_payload_index(
    collection_name=COLLECTION_NAME,
    field_name="video",
    field_schema=models.PayloadSchemaType.KEYWORD,
)

UpdateResult(operation_id=989, status=<UpdateStatus.COMPLETED: 'completed'>)

In [19]:
# Show the collection description; check the `status` field in the output:
#   green  - good
#   yellow - danger, wait before using the collection
#   red    - bad
collection_info = client.get_collection(COLLECTION_NAME)
print(collection_info)

status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=382299 points_count=382299 segments_count=5 config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=512, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_con

## Delete collection

In [None]:
# client.delete_collection(collection_name=COLLECTION_NAME)

True

## Delete all points

In [None]:
# client.delete(
#     collection_name=COLLECTION_NAME,
#     points_selector=models.FilterSelector(
#         filter=models.Filter(  # match all points
#             must=[]
#         )
#     )
# )

UpdateResult(operation_id=112, status=<UpdateStatus.COMPLETED: 'completed'>)

## Count all points

In [None]:
# Exact count of the points in the collection; the filter has an empty `must`
# list, i.e. it places no condition on the points.
match_all_points = models.Filter(must=[])
client.count(
    collection_name=COLLECTION_NAME,
    count_filter=match_all_points,
    exact=True,
)
# Counts noted on earlier runs:
# 177321
# 382299

CountResult(count=382299)

## Retrieve a list of points

In [22]:
# Fetch a few points by id to spot-check their payloads.
sample_ids = [0, 9, 99]
client.retrieve(collection_name=COLLECTION_NAME, ids=sample_ids)

[Record(id=0, payload={'pack': 'L21', 'video': 'V001', 'frame_index': 0.0, 'frame': '001.jpg'}, vector=None, shard_key=None, order_value=None),
 Record(id=9, payload={'pack': 'L21', 'video': 'V001', 'frame_index': 1131.0, 'frame': '010.jpg'}, vector=None, shard_key=None, order_value=None),
 Record(id=99, payload={'pack': 'L21', 'video': 'V001', 'frame_index': 12450.0, 'frame': '100.jpg'}, vector=None, shard_key=None, order_value=None)]

In [None]:
# `query_vector` is expected to be a Python list of 512 floats. No cell in this
# notebook defines it, so fall back to the stored vector of point 0 when it is
# missing; this keeps the cell runnable after Restart & Run All.
if "query_vector" not in globals():
    query_vector = client.retrieve(
        collection_name=COLLECTION_NAME,
        ids=[0],
        with_vectors=True,
    )[0].vector

client.search(
    collection_name=COLLECTION_NAME,
    query_vector=query_vector,
    with_payload=True,
    with_vectors=True,
    limit=20,
)

## Update payload

In [25]:
# Packs whose "<pack>_video_tags.json" files are read by the next cell.
packs = ["L25", "L26"]
# Filled by the next cell: pack name -> sorted list of unique tags.
all_tags = {}

In [29]:
# For every pack, attach each video's tags to all of that video's points and
# collect the pack's unique tags in `all_tags`.
for pack in packs:
    # Use a context manager so the file is closed, and read it as UTF-8 to
    # match how all_tags.json is written later in the notebook.
    with open(f"{pack}_video_tags.json", "r", encoding="utf-8") as tags_file:
        data = json.load(tags_file)

    for origin_video in sorted(data):
        # The key is split at fixed positions: first three characters are the
        # pack, everything after the fourth is the video (presumably keys look
        # like "L25_V001" - confirm against the JSON files).
        # A separate name is used so the outer loop variable `pack` is not
        # overwritten; previously `all_tags` below was keyed by whatever
        # prefix the last video happened to have.
        video_pack = origin_video[:3]
        video = origin_video[4:]
        client.set_payload(
            collection_name=COLLECTION_NAME,
            payload={"tags": data[origin_video]},
            points=models.Filter(
                must=[
                    models.FieldCondition(key="pack", match=models.MatchValue(value=video_pack)),
                    models.FieldCondition(key="video", match=models.MatchValue(value=video)),
                ]
            ),
        )

    # Unique tags across all videos of this pack, sorted.
    all_tags[pack] = sorted({tag for tags in data.values() for tag in tags})

In [23]:
# Spot-check: up to 10 points of video V011 in pack K20, payloads included.
video_filter = models.Filter(
    must=[
        models.FieldCondition(key=field, match=models.MatchValue(value=value))
        for field, value in (("pack", "K20"), ("video", "V011"))
    ],
)
hits = client.scroll(
    collection_name=COLLECTION_NAME,
    scroll_filter=video_filter,
    with_payload=True,
    limit=10,
)
hits

([Record(id=375226, payload={'pack': 'K20', 'video': 'V011', 'frame_index': 0.0, 'frame': '001.jpg'}, vector=None, shard_key=None, order_value=None),
  Record(id=375227, payload={'pack': 'K20', 'video': 'V011', 'frame_index': 90.0, 'frame': '002.jpg'}, vector=None, shard_key=None, order_value=None),
  Record(id=375228, payload={'pack': 'K20', 'video': 'V011', 'frame_index': 265.0, 'frame': '003.jpg'}, vector=None, shard_key=None, order_value=None),
  Record(id=375229, payload={'pack': 'K20', 'video': 'V011', 'frame_index': 343.0, 'frame': '004.jpg'}, vector=None, shard_key=None, order_value=None),
  Record(id=375230, payload={'pack': 'K20', 'video': 'V011', 'frame_index': 397.0, 'frame': '005.jpg'}, vector=None, shard_key=None, order_value=None),
  Record(id=375231, payload={'pack': 'K20', 'video': 'V011', 'frame_index': 450.0, 'frame': '006.jpg'}, vector=None, shard_key=None, order_value=None),
  Record(id=375232, payload={'pack': 'K20', 'video': 'V011', 'frame_index': 474.0, 'frame':

In [75]:
# Persist the per-pack tag lists as UTF-8 JSON; ensure_ascii=False writes
# non-ASCII characters as-is instead of \u escapes.
with open("all_tags.json", "w", encoding="utf-8") as tags_out:
    tags_out.write(json.dumps(all_tags, ensure_ascii=False))

In [31]:
# Connectivity check with a second client object. `os` and `QdrantClient` come
# from the import cell at the top of the notebook.
qdrant_client = QdrantClient(
    url="https://9bf65806-b1f1-498b-b309-079694a5a23b.us-east4-0.gcp.cloud.qdrant.io:6333",
    # Read the key from the environment, as the main `client` does, instead of
    # embedding it in the notebook.
    # NOTE(review): the key that was hard-coded here has been exposed in the
    # notebook history and should be rotated.
    api_key=os.getenv("QDRANT_TOKEN"),
)

print(qdrant_client.get_collections())

collections=[CollectionDescription(name='my_collection')]
