In [None]:
# Instal dependency
!pip install git+https://github.com/openai/CLIP.git

In [2]:
# Import modules
import os
import clip
import glob
import torch
import numpy as np
from PIL import Image
from tqdm import tqdm

# Parse data path

In [3]:
# Root directory that holds one sub-directory per data part.
keyframes_dir = './Keyframes'

# Nested mapping: data part -> video id -> sorted list of keyframe image paths.
all_keyframe_paths = dict()
for part in sorted(os.listdir(keyframes_dir)):
    # Build the path from the real directory name; the shortened `data_part`
    # below is only a dictionary key and need not exist on disk.
    data_part_path = f'{keyframes_dir}/{part}'
    # Ignore stray files and hidden entries (e.g. `.ipynb_checkpoints`).
    if part.startswith('.') or not os.path.isdir(data_part_path):
        continue
    data_part = part.split('_')[-1] # L01, L02 for ex
    part_videos = all_keyframe_paths.setdefault(data_part, dict())

    for video_dir in sorted(os.listdir(data_part_path)):
        video_path = f'{data_part_path}/{video_dir}'
        if video_dir.startswith('.') or not os.path.isdir(video_path):
            continue
        # Key each video by the last `_`-separated token of its directory name.
        video_id = video_dir.split('_')[-1]
        part_videos[video_id] = sorted(glob.glob(f'{video_path}/*.jpg'))

# Model

In [None]:
##### Load Model #####
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# `clip.load` hands back the model together with the `preprocess` callable
# that is applied to every image before it is encoded.
model, preprocess = clip.load("ViT-B/16", device=device)

In [None]:
bs = 4  # number of keyframes encoded per forward pass
save_dir = './CLIP_features'
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(save_dir, exist_ok=True)


def encode_batch(image_paths):
    """Encode a batch of image files into L2-normalised CLIP image features.

    Relies on the notebook-level `model`, `preprocess` and `device`.

    Args:
        image_paths: Paths of the images to encode in one forward pass.

    Returns:
        A float32 NumPy array with one row per entry of `image_paths`.
    """
    batch = []
    for image_path in image_paths:
        # The context manager closes the file handle once the image has been
        # preprocessed, instead of leaving it open until garbage collection.
        with Image.open(image_path) as image:
            batch.append(preprocess(image))
    batch = torch.stack(batch).to(device)

    with torch.no_grad():
        feats = model.encode_image(batch)
        feats = feats / feats.norm(dim=-1, keepdim=True)

    # One device-to-host copy per batch rather than one per image; the reshape
    # keeps each image's feature flattened to a single row.
    return feats.cpu().numpy().astype(np.float32).reshape(len(image_paths), -1)


for key, video_keyframe_paths in all_keyframe_paths.items():
    os.makedirs(os.path.join(save_dir, key), exist_ok=True)

    for video_id in tqdm(sorted(video_keyframe_paths.keys()), desc=key):
        video_keyframe_path = video_keyframe_paths[video_id]
        # Support batchsize inferencing
        video_feats = [
            encode_batch(video_keyframe_path[i:i+bs])
            for i in range(0, len(video_keyframe_path), bs)
        ]
        if video_feats:
            video_feats = np.concatenate(video_feats, axis=0)
        else:
            # No keyframes for this video: save the same empty array that
            # np.save would build from an empty list.
            video_feats = np.asarray(video_feats)

        np.save(f'{save_dir}/{key}/{video_id}.npy', video_feats)