# Use a pretrained backbone to compute video embeddings

In [1]:
import torch
import torchvision
import pytorchvideo

In [2]:
# Release unused cached GPU memory that may be left over from earlier runs in this kernel.
torch.cuda.empty_cache()

# Pick the compute device. The second GPU (index 1) is preferred when the host
# actually has one; otherwise fall back to the first GPU, and finally to the CPU.
# Hard-coding "cuda:1" whenever CUDA is available breaks on single-GPU machines.
gpu_count = torch.cuda.device_count() if torch.cuda.is_available() else 0
if gpu_count > 1:
    device = "cuda:1"
elif gpu_count == 1:
    device = "cuda:0"
else:
    device = "cpu"
print("Device: " + device)
print(f"Devices count: {torch.cuda.device_count()}")

Device: cuda:1
Devices count: 2


In [3]:
from pathlib import Path

In [4]:
import json
import pandas
import numpy

In [5]:
from tqdm import tqdm

In [6]:
from misc.utils_mvit import *



In [9]:
# Locations of the test split: the dataset root, the folder holding the video
# files, and the CSV with per-video metadata.
data_path = Path("./data/test_data_yappy")
videos_path = data_path.joinpath("test_dataset")
metadata_path = data_path.joinpath("test.csv")

In [10]:
# List the entries under videos_path and keep them sorted so that the order of
# the embeddings is deterministic across runs.
# NOTE(review): max_size is presumably a per-file size cap in bytes (200 MiB) —
# confirm against walk_directory in misc.utils_mvit.
filenames = sorted(walk_directory(videos_path, max_size=200*1024*1024))

## Model

In [11]:
# Preprocessing helper passed to encode_videos further down.
# NOTE(review): VideoTransform is presumably provided by the star import from
# misc.utils_mvit — confirm what it does to each clip.
video_transform = VideoTransform()

In [12]:
# Name of the torchvision video architecture used as the embedding backbone.
# Alternative backbone: "mvit_v1_b".
model_name = "mvit_v2_s"
# Resolve the weights enum from model_name instead of hard-coding the V2-S class,
# so that switching model_name never pairs a builder with another model's weights.
weights = torchvision.models.get_model_weights(model_name).KINETICS400_V1
model = getattr(torchvision.models.video, model_name)(weights=weights)

In [13]:
# Place the backbone on the selected device and switch it to evaluation mode.
model = model.to(device).eval()

In [14]:
# Run every listed video through the backbone in batches of 4.
# The preprocessing preset is looked up from model_name so it always belongs to
# the backbone that was actually loaded (previously it was taken from the
# MViT V1-B weights while the model was MViT V2-S).
# NOTE(review): encode_videos comes from misc.utils_mvit; the two returned values
# are presumably per-video embeddings and durations — confirm there.
embeddings, durations = encode_videos(
    videos_path,
    tqdm(filenames),  # wrap the file list so progress is displayed
    model,
    video_transform,
    torchvision.models.get_model_weights(model_name).KINETICS400_V1.transforms(),
    device,
    batch_size=4
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [3:26:19<00:00, 12.38s/it]


## Save embeddings

In [15]:
import os

In [16]:
# Outputs go into a folder named after the backbone, next to the dataset.
embeddings_path = data_path.joinpath(model_name)
# Create the folder (and any missing parents); an existing folder is fine.
embeddings_path.mkdir(parents=True, exist_ok=True)

In [17]:
# Target files: the serialized embeddings and the CSV listing the matching uuids.
embeddings_path_torch = embeddings_path.joinpath("embeddings.pt")
embeddings_uuid_path = embeddings_path.joinpath("embeddings_uuid.csv")

In [18]:
# Serialize the embeddings returned by encode_videos to embeddings.pt.
torch.save(embeddings, embeddings_path_torch)

In [19]:
# The uuid of a video is the part of its filename before the first dot.
# Rows follow the order of `filenames`.
uuids = [name.split('.')[0] for name in filenames]
embeddings_uuid = pandas.DataFrame(uuids, columns=["uuid"])

In [20]:
# Write the uuid table to CSV, omitting the DataFrame index column.
embeddings_uuid.to_csv(embeddings_uuid_path, index=False)