In [77]:
import os
from tqdm import tqdm
import optuna

import torch
from models.model import SimilarityRecognizer
from src.preprocess import Preprocess

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

from src.serialization import read_obj, save_obj
from src.config import S3_MODEL_PATH, LOCAL_MODEL_PATH, BUCKET_URL
from src.utils import download_file_from_s3, mp4
from src.video_analysis import compute_similarities, get_features

In [2]:
# Run on the GPU when torch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
# Fetch the file from S3 only when nothing exists at LOCAL_MODEL_PATH yet.
local_copy_exists = os.path.exists(LOCAL_MODEL_PATH)
if not local_copy_exists:
    download_file_from_s3(BUCKET_URL, S3_MODEL_PATH, LOCAL_MODEL_PATH)

In [4]:
# Build the recognizer, move it to the selected device, then load the
# pretrained weights from the checkpoint file.
# NOTE(review): this path is hard-coded, while the download cell writes to
# LOCAL_MODEL_PATH — confirm both refer to the same file.
model = SimilarityRecognizer(model_type="base", batch_size=8)
model = model.to(device)
model.load_pretrained_weights("checkpoints/best_model_base_224_16x16_rgb.pth")
# Kept as the last expression so the cell still displays the module tree.
model.eval()

SimilarityRecognizer(
  (feature_extractor): Timesformer(
    (model): VisionTransformer(
      (dropout): Dropout(p=0.0, inplace=False)
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (time_drop): Dropout(p=0.0, inplace=False)
      (blocks): ModuleList(
        (0): Block(
          (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (attn): Attention(
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (proj): Linear(in_features=768, out_features=768, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
            (attn_drop): Dropout(p=0.0, inplace=False)
          )
          (temporal_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (temporal_attn): Attention(
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (proj): Linear(in_features=768, 

In [5]:
# Directory holding the training videos, resolved against the current
# working directory.
working_dir = os.getcwd()
data_path = os.path.join(working_dir, 'train_data_yappy', 'train_dataset')

# Metadata table describing the training videos.
train_csv_path = 'train_data_yappy/train.csv'
train = pd.read_csv(train_csv_path)

In [6]:
# Parse the 'created' column into datetimes; the frame is later sorted by it.
train = train.assign(created=pd.to_datetime(train['created']))

In [7]:
train

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for,is_hard
0,2024-06-01 00:05:43,23fac2f2-7f00-48cb-b3ac-aac8caa3b6b4,https://s3.ritm.media/yappy-db-duplicates/23fa...,False,,False
1,2024-06-01 00:11:01,2fa37210-3c25-4a87-88f2-1242c2c8a699,https://s3.ritm.media/yappy-db-duplicates/2fa3...,False,,False
2,2024-06-01 00:13:20,31cc33d5-95de-4799-ad01-87c8498d1bde,https://s3.ritm.media/yappy-db-duplicates/31cc...,False,,False
3,2024-06-01 00:27:23,03abd0ec-609e-4eea-9f2a-b6b7442bc881,https://s3.ritm.media/yappy-db-duplicates/03ab...,False,,False
4,2024-06-01 00:30:23,22ee0045-004b-4c7e-98f2-77e5e02e2f15,https://s3.ritm.media/yappy-db-duplicates/22ee...,False,,False
...,...,...,...,...,...,...
22758,2024-09-12 13:46:57,0efe756a-e965-40c1-94db-de7f3e6649a9,https://s3.ritm.media/yappy-db-duplicates/0efe...,True,131846f3-6f5c-497a-a2fa-95cfb3929301,False
22759,2024-09-12 14:46:13,caec3b94-e356-4576-b00a-515e0df1dfc3,https://s3.ritm.media/yappy-db-duplicates/caec...,True,3b5eb15a-c6d7-4214-8dd6-c029564ff11d,False
22760,2024-09-13 09:08:42,c5b69151-f240-4e27-a5c9-c41f79a167e9,https://s3.ritm.media/yappy-db-duplicates/c5b6...,True,17ecc94a-f28a-40d5-b438-86b6e82a2fef,False
22761,2024-09-13 14:52:21,6d3233b6-f8de-49ba-8697-bb30dbf825f7,https://s3.ritm.media/yappy-db-duplicates/6d32...,True,1838f7a7-ef2a-4141-a125-90fb5bf0c5a2,False


In [8]:
# Names of the entries in data_path, cut at the first '.'.
train_files = {entry.split('.')[0] for entry in os.listdir(data_path)}

In [29]:
# Keep only the rows whose uuid matches an entry found in data_path,
# ordered by creation time.
has_local_file = train['uuid'].isin(train_files)
filtred_train = train[has_local_file].sort_values(by='created')

In [30]:
filtred_train

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for,is_hard
0,2024-06-01 00:05:43,23fac2f2-7f00-48cb-b3ac-aac8caa3b6b4,https://s3.ritm.media/yappy-db-duplicates/23fa...,False,,False
1,2024-06-01 00:11:01,2fa37210-3c25-4a87-88f2-1242c2c8a699,https://s3.ritm.media/yappy-db-duplicates/2fa3...,False,,False
2,2024-06-01 00:13:20,31cc33d5-95de-4799-ad01-87c8498d1bde,https://s3.ritm.media/yappy-db-duplicates/31cc...,False,,False
3,2024-06-01 00:27:23,03abd0ec-609e-4eea-9f2a-b6b7442bc881,https://s3.ritm.media/yappy-db-duplicates/03ab...,False,,False
4,2024-06-01 00:30:23,22ee0045-004b-4c7e-98f2-77e5e02e2f15,https://s3.ritm.media/yappy-db-duplicates/22ee...,False,,False
...,...,...,...,...,...,...
10253,2024-07-07 09:50:11,67457d6c-6175-400d-89fe-7243d73b7874,https://s3.ritm.media/yappy-db-duplicates/6745...,True,44bec7ea-5636-44d6-866c-7cdd7f9a4696,False
10290,2024-07-07 13:38:24,f0968632-e0b3-4407-9c99-3258e75e2959,https://s3.ritm.media/yappy-db-duplicates/f096...,True,3fb52c06-65ae-4ea6-9f57-c4b3f9ac7163,False
10543,2024-07-08 10:32:40,7282037d-678f-4397-b6c7-c0c8819533da,https://s3.ritm.media/yappy-db-duplicates/7282...,True,03f52b57-5e60-441f-8c41-b78f9222752e,False
10690,2024-07-08 22:21:52,6e85101c-da03-4416-89f1-0284da3bb478,https://s3.ritm.media/yappy-db-duplicates/6e85...,True,455d9167-5327-45b7-832a-249ca1354778,False


In [31]:
# Rows with a non-null 'duplicate_for', i.e. those that point at another video.
points_at_original = filtred_train['duplicate_for'].notna()
duplicates_df = filtred_train[points_at_original]
duplicates_df

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for,is_hard
97,2024-06-01 10:27:10,5eb4127e-5694-492b-963c-6688522e9ad2,https://s3.ritm.media/yappy-db-duplicates/5eb4...,True,3726bb2d-3323-41f8-8eb2-0d7cf095d62b,False
181,2024-06-01 19:09:52,b5f191e6-42e0-43f5-8773-560643de17fb,https://s3.ritm.media/yappy-db-duplicates/b5f1...,True,314d2988-eb85-4581-8416-da998e036afe,False
195,2024-06-01 20:53:41,025ee26a-7391-4f60-878a-7fc1928a967b,https://s3.ritm.media/yappy-db-duplicates/025e...,True,0fd09c1b-e19e-4b6e-84e6-35f9d4fc6f72,False
212,2024-06-01 22:04:03,a18324cf-b2ad-41e2-86b8-e6923c5fdc36,https://s3.ritm.media/yappy-db-duplicates/a183...,True,2da574f2-5ae7-4d85-9f5b-c2970a315c1c,False
221,2024-06-01 23:12:31,2253aaa4-b29c-4b7d-b9cc-9286d23c44e8,https://s3.ritm.media/yappy-db-duplicates/2253...,True,3a20cbaf-798a-4b25-b2c1-e41e1c6ecd09,False
...,...,...,...,...,...,...
10253,2024-07-07 09:50:11,67457d6c-6175-400d-89fe-7243d73b7874,https://s3.ritm.media/yappy-db-duplicates/6745...,True,44bec7ea-5636-44d6-866c-7cdd7f9a4696,False
10290,2024-07-07 13:38:24,f0968632-e0b3-4407-9c99-3258e75e2959,https://s3.ritm.media/yappy-db-duplicates/f096...,True,3fb52c06-65ae-4ea6-9f57-c4b3f9ac7163,False
10543,2024-07-08 10:32:40,7282037d-678f-4397-b6c7-c0c8819533da,https://s3.ritm.media/yappy-db-duplicates/7282...,True,03f52b57-5e60-441f-8c41-b78f9222752e,False
10690,2024-07-08 22:21:52,6e85101c-da03-4416-89f1-0284da3bb478,https://s3.ritm.media/yappy-db-duplicates/6e85...,True,455d9167-5327-45b7-832a-249ca1354778,False


In [12]:
len(duplicates_df)

469

In [13]:
filtred_train[filtred_train['is_hard']]

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for,is_hard
8,2024-06-01 00:41:33,1e9efc51-a74c-4f32-b03e-71905f8d6dd1,https://s3.ritm.media/yappy-db-duplicates/1e9e...,False,,True
77,2024-06-01 08:28:26,3dd424ce-f0e5-4727-88a3-8f318b612afd,https://s3.ritm.media/yappy-db-duplicates/3dd4...,False,,True
211,2024-06-01 22:03:27,d444f2d0-a7cd-4c9b-bc56-8d5ef88ec015,https://s3.ritm.media/yappy-db-duplicates/d444...,False,,True
356,2024-06-02 11:23:19,49292bf9-dc53-44d5-9980-a658ab3a3921,https://s3.ritm.media/yappy-db-duplicates/4929...,False,,True
472,2024-06-02 20:54:43,6a816304-7c1b-4c16-81c6-09f46bb0ad63,https://s3.ritm.media/yappy-db-duplicates/6a81...,False,,True
553,2024-06-03 04:13:06,1f4df893-663e-4ca3-a6ca-46819aebd8a3,https://s3.ritm.media/yappy-db-duplicates/1f4d...,False,,True
581,2024-06-03 06:43:43,f41954d9-dd9f-4844-bea4-4a86b5ce2ae9,https://s3.ritm.media/yappy-db-duplicates/f419...,False,,True
748,2024-06-03 21:05:43,992321a1-06b2-4f2c-a2d1-aa7f37a14da1,https://s3.ritm.media/yappy-db-duplicates/9923...,False,,True
939,2024-06-04 13:11:37,68e41e6f-2d9b-409d-a3b8-581dff56a722,https://s3.ritm.media/yappy-db-duplicates/68e4...,False,,True
952,2024-06-04 13:53:38,bf2c9cd9-6fd2-4afd-bd90-b2a5d132cd52,https://s3.ritm.media/yappy-db-duplicates/bf2c...,False,,True


In [14]:
# '5eb4127e-5694-492b-963c-6688522e9ad2' in all_features 

In [15]:
# preprocess = Preprocess(clip_len=8, out_size=224, frame_interval=1, channels=1)

# video_names = filtred_train['uuid'].to_list()

# video_paths = [os.path.join(data_path, mp4(name)) for name in video_names]

# all_features = read_obj('features_pickled/grey_features.pkl')

# grey_features = get_features(preprocess, model, features=all_features, video_paths=video_paths)

# save_obj(grey_features, 'features_pickled/grey_features_2500')

In [16]:
# Load the previously saved features object from disk.
# NOTE(review): read_obj presumably unpickles this file — only load files
# from a trusted source.
all_features = read_obj('features_pickled/grey_features_2500.pkl')

In [18]:
# Similarities for one video taken from the 'is_hard' rows, printed from the
# highest score to the lowest.
hard_video_id = '1e9efc51-a74c-4f32-b03e-71905f8d6dd1'
hard_similarities = compute_similarities(
    video_id=hard_video_id,
    all_features=all_features,
    model=model,
)
ranked_pairs = sorted(hard_similarities.items(), key=lambda pair: -pair[1])
sorted_dict = dict(ranked_pairs)
print(sorted_dict)

In [26]:
# Print the similarity between every labelled duplicate and the video it
# points at.
# The score is computed before formatting: reusing the f-string's own quote
# character inside a replacement field, and spreading a field over several
# lines, only parses on Python 3.12+ and is a SyntaxError on older versions.
for _, row in duplicates_df.iterrows():
    pair_similarity = compute_similarities(video_id=row['uuid'],
                                           second_video_id=row['duplicate_for'],
                                           all_features=all_features,
                                           model=model)
    print(f"Similarity between {row['uuid']} and {row['duplicate_for']}: {pair_similarity}")

Similarity between 5eb4127e-5694-492b-963c-6688522e9ad2 and 3726bb2d-3323-41f8-8eb2-0d7cf095d62b: 0.9994394183158875
Similarity between b5f191e6-42e0-43f5-8773-560643de17fb and 314d2988-eb85-4581-8416-da998e036afe: 0.9997285008430481
Similarity between 025ee26a-7391-4f60-878a-7fc1928a967b and 0fd09c1b-e19e-4b6e-84e6-35f9d4fc6f72: 0.9932383894920349
Similarity between a18324cf-b2ad-41e2-86b8-e6923c5fdc36 and 2da574f2-5ae7-4d85-9f5b-c2970a315c1c: 0.8075734376907349
Similarity between 2253aaa4-b29c-4b7d-b9cc-9286d23c44e8 and 3a20cbaf-798a-4b25-b2c1-e41e1c6ecd09: 0.9990720748901367
Similarity between 99c8cd59-5995-4981-8346-460d40e4eed3 and 1a713f01-e2e1-48e8-83f3-2c90938ba197: 0.9964303970336914
Similarity between 5acd8e68-99fe-43fd-b9f9-b2279bdc9372 and 0af145dd-e5f1-4cc1-a755-4e9a52ea2435: 0.9971468448638916
Similarity between 05d72fc9-89a3-47bf-bb1b-6db4fc6f2b56 and 090cb968-02eb-4aba-8b94-f6124050df59: 0.9905136823654175
Similarity between 36e972c3-7134-41ce-b37f-e227fece4575 and 57ee

In [32]:
filtred_train

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for,is_hard
0,2024-06-01 00:05:43,23fac2f2-7f00-48cb-b3ac-aac8caa3b6b4,https://s3.ritm.media/yappy-db-duplicates/23fa...,False,,False
1,2024-06-01 00:11:01,2fa37210-3c25-4a87-88f2-1242c2c8a699,https://s3.ritm.media/yappy-db-duplicates/2fa3...,False,,False
2,2024-06-01 00:13:20,31cc33d5-95de-4799-ad01-87c8498d1bde,https://s3.ritm.media/yappy-db-duplicates/31cc...,False,,False
3,2024-06-01 00:27:23,03abd0ec-609e-4eea-9f2a-b6b7442bc881,https://s3.ritm.media/yappy-db-duplicates/03ab...,False,,False
4,2024-06-01 00:30:23,22ee0045-004b-4c7e-98f2-77e5e02e2f15,https://s3.ritm.media/yappy-db-duplicates/22ee...,False,,False
...,...,...,...,...,...,...
10253,2024-07-07 09:50:11,67457d6c-6175-400d-89fe-7243d73b7874,https://s3.ritm.media/yappy-db-duplicates/6745...,True,44bec7ea-5636-44d6-866c-7cdd7f9a4696,False
10290,2024-07-07 13:38:24,f0968632-e0b3-4407-9c99-3258e75e2959,https://s3.ritm.media/yappy-db-duplicates/f096...,True,3fb52c06-65ae-4ea6-9f57-c4b3f9ac7163,False
10543,2024-07-08 10:32:40,7282037d-678f-4397-b6c7-c0c8819533da,https://s3.ritm.media/yappy-db-duplicates/7282...,True,03f52b57-5e60-441f-8c41-b78f9222752e,False
10690,2024-07-08 22:21:52,6e85101c-da03-4416-89f1-0284da3bb478,https://s3.ritm.media/yappy-db-duplicates/6e85...,True,455d9167-5327-45b7-832a-249ca1354778,False


In [64]:
# Compute the similarities for the loaded features and persist the result so
# later cells can index it by video id.
similarities_target = 'pickled_data/similarities'
all_similarities = compute_similarities(
    all_features=all_features,
    model=model,
)

save_obj(all_similarities, similarities_target)

In [81]:
def objective(trial):
    """Optuna objective: F1 score of sequential duplicate detection.

    Samples ``similarity_threshold`` from [0, 1] and replays ``filtred_train``
    in row order. Each video is compared with the videos kept so far in
    ``db_videos``: if its best similarity is below the threshold it is added
    to ``db_videos``, otherwise it is predicted to be a duplicate of the
    best-matching kept video.

    Labels are binarised before scoring. The target is 1 when
    ``duplicate_for`` is set. The prediction is 1 when the video was flagged
    as a duplicate, except that a true duplicate matched to the wrong video
    is scored as 0.

    Reads the notebook-level ``filtred_train`` and ``all_similarities``.

    Args:
        trial: the Optuna trial used to sample the threshold.

    Returns:
        The F1 score of the binarised predictions.
    """
    similarity_threshold = trial.suggest_float("similarity_threshold", 0, 1)

    video_ids = filtred_train['uuid'].to_list()
    # Missing 'duplicate_for' values (NaN) turn into the string 'nan' here.
    y_true = list(map(str, filtred_train['duplicate_for'].to_list()))

    db_videos = []
    y_pred = []

    for new_video_id in tqdm(video_ids, ncols=70, total=len(video_ids)):
        max_similarity = float('-inf')
        max_similarity_video = None

        # Best match among the videos kept so far; the strict '>' keeps the
        # earliest one on ties.
        for old_video in db_videos:
            similarity = all_similarities[new_video_id][old_video]

            if similarity > max_similarity:
                max_similarity = similarity
                max_similarity_video = old_video

        if max_similarity < similarity_threshold:
            # Nothing similar enough: keep the video as a new reference.
            db_videos.append(new_video_id)
            y_pred.append('nan')
        else:
            y_pred.append(max_similarity_video)

    new_y_true, new_y_pred = [], []

    for true, pred in zip(y_true, y_pred):
        is_true_duplicate = true != 'nan'
        is_pred_duplicate = pred != 'nan'

        new_y_true.append(1 if is_true_duplicate else 0)

        if is_true_duplicate and is_pred_duplicate:
            # Flagged correctly as a duplicate: only counts when the matched
            # video is the labelled one.
            new_y_pred.append(1 if true == pred else 0)
        else:
            new_y_pred.append(1 if is_pred_duplicate else 0)

    return f1_score(new_y_true, new_y_pred)

# Seed the sampler so re-running the search proposes the same thresholds and
# reports the same best trial. TPE is already Optuna's default sampler, so
# only the seeding changes.
SEARCH_SEED = 42

study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEARCH_SEED),
)
study.optimize(objective, n_trials=100)

print("Best hyperparameters: ", study.best_params)
print("Best f1_score: ", study.best_value)

[I 2024-09-28 16:22:31,827] A new study created in memory with name: no-name-2601676b-9ce4-4c7a-857d-9b1664f750f5
100%|██████████████████████████| 2500/2500 [00:00<00:00, 14041.99it/s]
[I 2024-09-28 16:22:32,017] Trial 0 finished with value: 0.0008009611533840609 and parameters: {'similarity_threshold': 0.11758916501104155}. Best is trial 0 with value: 0.0008009611533840609.
100%|███████████████████████████| 2500/2500 [00:00<00:00, 6029.27it/s]
[I 2024-09-28 16:22:32,441] Trial 1 finished with value: 0.38275862068965516 and parameters: {'similarity_threshold': 0.9977618675154444}. Best is trial 1 with value: 0.38275862068965516.
100%|███████████████████████████| 2500/2500 [00:00<00:00, 7595.45it/s]
[I 2024-09-28 16:22:32,779] Trial 2 finished with value: 0.43422913719943423 and parameters: {'similarity_threshold': 0.6863327954868816}. Best is trial 2 with value: 0.43422913719943423.
100%|██████████████████████████| 2500/2500 [00:00<00:00, 16318.12it/s]
[I 2024-09-28 16:22:32,941] Trial

Best hyperparameters:  {'similarity_threshold': 0.8601731272679782}
Best f1_score:  0.96


In [None]:
# Re-run the sequential duplicate detection with the best threshold reported
# by the Optuna study and compute its F1 score.
similarity_threshold = 0.8601731272679782

# NOTE(review): `df` is not used anywhere in this cell — confirm whether the
# evaluation was meant to run on the hard subset or whether it can be removed.
df = filtred_train[filtred_train['is_hard']]

db_videos = []
duplicate_videos = []

# Missing 'duplicate_for' values (NaN) turn into the string 'nan' here.
y_true = list(map(str, filtred_train['duplicate_for'].to_list()))
y_pred = []

for index, row in tqdm(filtred_train.iterrows(), ncols=70, total=len(filtred_train)):
    new_video_id = row['uuid']

    max_similarity = float('-inf')
    max_similarity_video = None

    # Best match among the videos kept so far.
    for old_video in db_videos:
        similarity = all_similarities[new_video_id][old_video]

        if similarity > max_similarity:
            max_similarity = similarity
            max_similarity_video = old_video

    if max_similarity < similarity_threshold:
        # Nothing similar enough: keep the video as a new reference.
        db_videos.append(new_video_id)
        y_pred.append('nan')
    else:
        duplicate_videos.append(new_video_id)
        y_pred.append(max_similarity_video)

new_y_true, new_y_pred = [], []

for true, pred in zip(y_true, y_pred):
    is_true_nan = true == 'nan'
    is_pred_nan = pred == 'nan'

    new_y_true.append(1 if not is_true_nan else 0)
    new_y_pred.append(1 if not is_pred_nan else 0)

    # This check must run for every pair, inside the loop: a true duplicate
    # matched to the wrong video is scored as a miss, exactly as `objective`
    # does. Outside the loop it would only correct the last pair.
    if not is_true_nan and not is_pred_nan:
        new_y_pred[-1] = 1 if true == pred else 0

score = f1_score(new_y_true, new_y_pred)
score