In [1]:
import torch
import torchvision
import pytorchvideo

In [2]:
torch.cuda.empty_cache()
#device = "cpu"
device = "cuda:1" if torch.cuda.is_available() else "cpu"
print("Device: " + device)
print(f"Devices count: {torch.cuda.device_count()}")

Device: cuda:1
Devices count: 2


In [3]:
from pathlib import Path

In [4]:
import faiss
import json
import pandas
import numpy

In [5]:
from tqdm import tqdm

In [6]:
from misc.utils import *
from misc.predict import *



In [7]:
data_path = Path("./data")
videos_path = data_path / "train_dataset"
metadata_path = data_path / "train.csv"

In [8]:
model_name = "mvit_v2_s"
embeddings_path = data_path / model_name

In [9]:
embeddings_path_torch = embeddings_path / "embeddings.pt"
embeddings_uuid_path  = embeddings_path / "embeddings_uuid.csv"

## Load metadata

In [10]:
pandas.set_option('display.width', 100)
pandas.set_option('display.max_colwidth', 100)

In [11]:
metadata_train = pandas.read_csv(metadata_path, index_col="uuid")
embeddings_uuid = pandas.read_csv(embeddings_uuid_path)

In [12]:
id_to_uuid = embeddings_uuid["uuid"].to_numpy()
uuid_to_id = {value: index for index, value in enumerate(id_to_uuid)}

In [13]:
metadata_train.head()

Unnamed: 0_level_0,created,link,is_duplicate,duplicate_for,is_hard
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
23fac2f2-7f00-48cb-b3ac-aac8caa3b6b4,2024-06-01 00:05:43,https://s3.ritm.media/yappy-db-duplicates/23fac2f2-7f00-48cb-b3ac-aac8caa3b6b4.mp4,False,,False
2fa37210-3c25-4a87-88f2-1242c2c8a699,2024-06-01 00:11:01,https://s3.ritm.media/yappy-db-duplicates/2fa37210-3c25-4a87-88f2-1242c2c8a699.mp4,False,,False
31cc33d5-95de-4799-ad01-87c8498d1bde,2024-06-01 00:13:20,https://s3.ritm.media/yappy-db-duplicates/31cc33d5-95de-4799-ad01-87c8498d1bde.mp4,False,,False
03abd0ec-609e-4eea-9f2a-b6b7442bc881,2024-06-01 00:27:23,https://s3.ritm.media/yappy-db-duplicates/03abd0ec-609e-4eea-9f2a-b6b7442bc881.mp4,False,,False
22ee0045-004b-4c7e-98f2-77e5e02e2f15,2024-06-01 00:30:23,https://s3.ritm.media/yappy-db-duplicates/22ee0045-004b-4c7e-98f2-77e5e02e2f15.mp4,False,,False


In [14]:
_not_in_index = set(id_to_uuid) - set(metadata_train.index)

In [15]:
_not_in_index

set()

In [16]:
#_dummy_data = pandas.DataFrame([['2020-06-01 00:05:43', '', False, numpy.nan, False] for _ in range(len(_not_in_index))], index=list(_not_in_index), columns=metadata_train.columns)
#%metadata_train = pandas.concat([metadata_train, _dummy_data], axis=0)

## Load embeddings

In [17]:
torch_embeddings = torch.load(embeddings_path_torch)
torch_embeddings.shape

torch.Size([2000, 400])

In [18]:
torch_embeddings

tensor([[-5.3610e-01,  2.2627e+00,  1.5577e+00,  ...,  5.0714e-01,
         -1.5675e-03,  3.8101e-01],
        [ 5.0548e-01,  1.2463e+00, -9.8039e-01,  ..., -7.6966e-01,
         -1.1751e-01,  1.0210e+00],
        [ 1.0693e-02, -1.8525e-01, -4.1511e-01,  ..., -6.0840e-01,
          1.3939e-01,  6.0030e-01],
        ...,
        [-1.6418e-01,  2.8743e+00,  2.2072e-01,  ..., -1.1432e-01,
         -6.0881e-01,  1.8747e-01],
        [-5.4414e-01,  7.0293e-01,  1.8913e+00,  ...,  1.4516e+00,
         -6.6224e-01,  6.2969e-01],
        [-8.4023e-01,  5.6275e+00,  3.3273e+00,  ...,  1.8207e+00,
         -6.5340e-01, -4.3275e-01]])

In [19]:
embeddings = torch_embeddings.cpu().numpy().copy()
faiss.normalize_L2(embeddings)

In [20]:
index = faiss.IndexFlatIP(torch_embeddings.shape[-1])

In [21]:
index.add(embeddings)

## Predict

In [22]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

In [23]:
y_score, distances, indices, processed_metadata = predict(index, embeddings, id_to_uuid, metadata_train)

In [47]:
treshold = 0.935
y_pred = y_score > treshold
y_true = metadata_train.loc[id_to_uuid]["is_duplicate"].to_numpy()# * metadata_train.loc[id_to_uuid]["duplicate_for"].isin(id_to_uuid).to_numpy()

In [50]:
(metadata_train.loc[id_to_uuid]["is_duplicate"].to_numpy() * (~metadata_train.loc[id_to_uuid]["duplicate_for"].isin(id_to_uuid).to_numpy())).sum()

0

In [48]:
print(accuracy_score(y_true, y_pred))
print(precision_score(y_true, y_pred))
print(recall_score(y_true, y_pred))
print(f1_score(y_true, y_pred))
print(roc_auc_score(y_true, y_score))

0.989
0.96875
0.5961538461538461
0.7380952380952381
0.9131999289211815


In [27]:
loaded_metadata_train = metadata_train.loc[id_to_uuid]

In [28]:
loaded_metadata_train["y_score"] = y_score
loaded_metadata_train["duplicate_for_pred"] = id_to_uuid[indices]

ValueError: 2

In [None]:
loaded_metadata_train[loaded_metadata_train["is_duplicate"] > y_pred][["duplicate_for", "duplicate_for_pred", "y_score"]]

In [None]:
loaded_metadata_train[loaded_metadata_train["is_duplicate"] > y_pred]["link"]

In [None]:
metadata_train.loc[loaded_metadata_train[loaded_metadata_train["is_duplicate"] > y_pred]["duplicate_for"]]["link"]

In [None]:
top_k = 3
distances, indices = index.search(embeddings, top_k)

In [None]:
distances

In [None]:
duplicate_candidates = distances[:,1] > 0.95

In [None]:
indices_1 = duplicate_candidates.nonzero()[0]
indices_2 = indices[indices_1][:,1]

In [None]:
uuids_1 = numpy.array(id_to_uuid)[indices_1]
uuids_2 = numpy.array(id_to_uuid)[indices_2]

In [None]:
index = 20
print(metadata_train[metadata_train["uuid"] == uuids_1[index]]["link"])
print(metadata_train[metadata_train["uuid"] == uuids_2[index]]["link"])
print(distances[indices_1][index][1])