In [1]:
import cv2
import glob
import gradio as gr
import json
import numpy as np
import os
import pandas as pd
import sys
import torch
import warnings

from catboost import CatBoostClassifier

from collections import Counter
from PIL import Image

from pymilvus import MilvusClient

from tqdm import tqdm_notebook as tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.datasets import VisionDataset
from torchvision.models import resnet50, ResNet50_Weights
from videohash import VideoHash

warnings.filterwarnings('ignore')

In [2]:
given_videos = glob.glob(r'/Users/wallander/Downloads/train_data_yappy/train_dataset/*.mp4')
len(given_videos)

2000

In [3]:
description = pd.read_csv('/Users/wallander/Downloads/train_data_yappy/train.csv')
description.head()

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for,is_hard
0,2024-06-01 00:05:43,23fac2f2-7f00-48cb-b3ac-aac8caa3b6b4,https://s3.ritm.media/yappy-db-duplicates/23fa...,False,,False
1,2024-06-01 00:11:01,2fa37210-3c25-4a87-88f2-1242c2c8a699,https://s3.ritm.media/yappy-db-duplicates/2fa3...,False,,False
2,2024-06-01 00:13:20,31cc33d5-95de-4799-ad01-87c8498d1bde,https://s3.ritm.media/yappy-db-duplicates/31cc...,False,,False
3,2024-06-01 00:27:23,03abd0ec-609e-4eea-9f2a-b6b7442bc881,https://s3.ritm.media/yappy-db-duplicates/03ab...,False,,False
4,2024-06-01 00:30:23,22ee0045-004b-4c7e-98f2-77e5e02e2f15,https://s3.ritm.media/yappy-db-duplicates/22ee...,False,,False


In [4]:
def get_path(video_path, root):
    try:
        return os.path.join(root, video_path + '.mp4')
    except:
        return None

root = '/Users/wallander/Downloads/train_data_yappy/train_dataset'

description['video_1'] = description['uuid'].apply(lambda x: get_path(x, root))
# description['video_2'] = description['duplicate_for'].apply(lambda x: get_path(x, root))
description = description[(description['video_1'].isin(given_videos))].copy()
description.head()

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for,is_hard,video_1
0,2024-06-01 00:05:43,23fac2f2-7f00-48cb-b3ac-aac8caa3b6b4,https://s3.ritm.media/yappy-db-duplicates/23fa...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...
1,2024-06-01 00:11:01,2fa37210-3c25-4a87-88f2-1242c2c8a699,https://s3.ritm.media/yappy-db-duplicates/2fa3...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...
2,2024-06-01 00:13:20,31cc33d5-95de-4799-ad01-87c8498d1bde,https://s3.ritm.media/yappy-db-duplicates/31cc...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...
3,2024-06-01 00:27:23,03abd0ec-609e-4eea-9f2a-b6b7442bc881,https://s3.ritm.media/yappy-db-duplicates/03ab...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...
4,2024-06-01 00:30:23,22ee0045-004b-4c7e-98f2-77e5e02e2f15,https://s3.ritm.media/yappy-db-duplicates/22ee...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...


In [5]:
description['is_duplicate'].value_counts()

is_duplicate
False    1948
True       52
Name: count, dtype: int64

In [6]:
description['is_hard'].value_counts()

is_hard
False    1988
True       12
Name: count, dtype: int64

In [7]:
description[description['is_hard'] == True]['is_duplicate'].value_counts()

is_duplicate
False    12
Name: count, dtype: int64

In [8]:
description['uuid'].nunique()

2000

In [9]:
video_idx = description['uuid'].to_dict()
inverse_video_idx = {value: key for key, value in video_idx.items()}
description['duplicate_id'] = description['duplicate_for'].map(inverse_video_idx)

## Gradio stand

In [10]:
hard_pair = description[description['is_duplicate'] == True].sample(1)
video_1 = description.loc[212, 'video_1']
video_2 = description.loc[153, 'video_1']

with gr.Blocks() as demo:
    with gr.Row():
        gr.Video(video_1)
        gr.Video(video_2)
        # gr.Video('/Users/wallander/Downloads/train_data_yappy/train_dataset/3d8304d8-b202-4c1c-bcb8-998fb7f767ae.mp4')
        
demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




## Video processing

In [11]:
class BasicVisionDataset(VisionDataset):
    def __init__(self, images, transform=None, target_transform=None):
        if isinstance(images, np.ndarray):
            transform.transforms.insert(0, transforms.ToPILImage())
        super(BasicVisionDataset, self).__init__(root=None, transform=transform, target_transform=target_transform)
        self.images = images

    def __getitem__(self, index):
        return torch.unsqueeze(self.transform(self.images[index]), 0)

    def __len__(self):
        return len(self.targets)

In [12]:
class FEDataset(Dataset):

    def __init__(self, images, root_dir=None, transform=None):
        """
        Arguments:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
            
        self.images = images
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        return self.transform(self.images[idx])

In [13]:
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )
])

In [14]:
def get_vidcap_param(video_fp, params):
    vidcap = cv2.VideoCapture(video_fp)
    return [vidcap.get(param) for param in params]

In [15]:
video_params = [
    cv2.CAP_PROP_FPS,
    cv2.CAP_PROP_FRAME_COUNT,
    cv2.CAP_PROP_FRAME_WIDTH,
    cv2.CAP_PROP_FRAME_HEIGHT,
    cv2.CAP_PROP_BITRATE,
]
video_features = ['fps', 'n_frames', 'w', 'h', 'bitrate',]
features = ['video_similarity', 'r', 'duration'] + video_features

description['params'] = description['video_1'].apply(lambda x: get_vidcap_param(x, video_params))
description['idx'] = description.index
description[video_features] = pd.DataFrame(description['params'].tolist(), index=description.index)

In [16]:
if os.path.exists('vectors.npz'):
    values = np.load('vectors.npz')

with open('keys.json', 'r') as file:
    d = json.load(file)

In [17]:
db_entries = []

for i, (filename, arr) in  enumerate(d.items()):
    vectors = values[arr]
    db_entries += [{'id': i, 'vector': vector, 'name': filename, 'frame': j} for j, vector in enumerate(vectors)]

## Milvus

In [19]:
client = MilvusClient('./milvus_demo.db')

collection = 'test_collection'

if client.has_collection(collection_name=collection):
    client.drop_collection(collection_name=collection)
    
client.create_collection(
    collection_name=collection,
    dimension=1000,
)

n_chunks = 3
for n in range(n_chunks):
    chunk = db_entries[n::n_chunks]
    db = client.insert(
        collection_name=collection,
        data=chunk
    )

## Similarity

In [76]:
thr = 0.75
video_similarity = {}

for _, row in description.iterrows():
    counter = Counter()
    created_ts = row['created']
    video_filename = row['uuid']
    video = values[d[row['video_1']]]
    idx = description[(description['uuid'] != video_filename) & (description['created'] <= created_ts)].index.tolist()
    if idx == []:
        continue
    
    res = client.search(
        collection_name=collection,
        data=video,
        limit=1,
        filter=f'id in {idx}',
        metric_type='cosine',
        params={
            'radius': thr,
            'range_filter': thr + .1,
        },
        output_fields=['id'],
    )
    
    for entry in res:
        counter[entry[0]['id']] += entry[0]['distance']
    video_similarity[video_filename] = counter.most_common()[0]

In [77]:
description['video_similarity'] = description['uuid'].apply(lambda x: video_similarity.get(x, [None])[-1])
description['similar_video'] = description['uuid'].apply(lambda x: video_similarity.get(x, [None])[0])
description['idx'] = description.index
description.head()

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for,is_hard,video_1,duplicate_id,params,idx,fps,n_frames,w,h,bitrate,video_similarity,similar_video,r,duration
0,2024-06-01 00:05:43,23fac2f2-7f00-48cb-b3ac-aac8caa3b6b4,https://s3.ritm.media/yappy-db-duplicates/23fa...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 228.0, 720.0, 1280.0, 4883.0]",0,30.0,228.0,720.0,1280.0,4883.0,,,,7.6
1,2024-06-01 00:11:01,2fa37210-3c25-4a87-88f2-1242c2c8a699,https://s3.ritm.media/yappy-db-duplicates/2fa3...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 580.0, 720.0, 1280.0, 2141.0]",1,30.0,580.0,720.0,1280.0,2141.0,11.239502,0.0,0.019378,19.333333
2,2024-06-01 00:13:20,31cc33d5-95de-4799-ad01-87c8498d1bde,https://s3.ritm.media/yappy-db-duplicates/31cc...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 1389.0, 720.0, 1280.0, 1990.0]",2,30.0,1389.0,720.0,1280.0,1990.0,37.820294,1.0,0.027228,46.3
3,2024-06-01 00:27:23,03abd0ec-609e-4eea-9f2a-b6b7442bc881,https://s3.ritm.media/yappy-db-duplicates/03ab...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[29.97, 600.0, 720.0, 1280.0, 2534.0]",3,29.97,600.0,720.0,1280.0,2534.0,16.014596,2.0,0.026691,20.02002
4,2024-06-01 00:30:23,22ee0045-004b-4c7e-98f2-77e5e02e2f15,https://s3.ritm.media/yappy-db-duplicates/22ee...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 163.0, 720.0, 1280.0, 1348.0]",4,30.0,163.0,720.0,1280.0,1348.0,1.502037,1.0,0.009215,5.433333


In [78]:
description['r'] = description['video_similarity'] / description['n_frames']
description['duration'] = description['n_frames'] / description['fps']
description_ = description.merge(description, how='left', left_on=description['similar_video'], right_on=description['idx'])

df_train, df_test = train_test_split(description_, stratify=description_[['is_duplicate_x', 'is_hard_x']], random_state=42)

## ML

In [79]:
# video_params = [
#     cv2.CAP_PROP_FPS,
#     cv2.CAP_PROP_FRAME_COUNT,
#     cv2.CAP_PROP_FRAME_WIDTH,
#     cv2.CAP_PROP_FRAME_HEIGHT,
#     cv2.CAP_PROP_BITRATE,
# ]
# df_train['params'] = df_train['video_1'].apply(lambda x: get_vidcap_param(x, video_params))
# df_test['params'] = df_test['video_1'].apply(lambda x: get_vidcap_param(x, video_params))
# description['params'] = description['video_1'].apply(lambda x: get_vidcap_param(x, video_params))
df_train.columns

Index(['key_0', 'created_x', 'uuid_x', 'link_x', 'is_duplicate_x',
       'duplicate_for_x', 'is_hard_x', 'video_1_x', 'duplicate_id_x',
       'params_x', 'idx_x', 'fps_x', 'n_frames_x', 'w_x', 'h_x', 'bitrate_x',
       'video_similarity_x', 'similar_video_x', 'r_x', 'duration_x',
       'created_y', 'uuid_y', 'link_y', 'is_duplicate_y', 'duplicate_for_y',
       'is_hard_y', 'video_1_y', 'duplicate_id_y', 'params_y', 'idx_y',
       'fps_y', 'n_frames_y', 'w_y', 'h_y', 'bitrate_y', 'video_similarity_y',
       'similar_video_y', 'r_y', 'duration_y'],
      dtype='object')

In [83]:
video_features = ['fps', 'n_frames', 'w', 'h', 'bitrate', 'duration']
features = ';'.join([f'{feature}_x;{feature}_y' for feature in video_features]).split(';') + ['video_similarity']
features

['fps_x',
 'fps_y',
 'n_frames_x',
 'n_frames_y',
 'w_x',
 'w_y',
 'h_x',
 'h_y',
 'bitrate_x',
 'bitrate_y',
 'duration_x',
 'duration_y',
 'video_similarity']

In [84]:
df_train.columns

Index(['key_0', 'created_x', 'uuid_x', 'link_x', 'is_duplicate_x',
       'duplicate_for_x', 'is_hard_x', 'video_1_x', 'duplicate_id_x',
       'params_x', 'idx_x', 'fps_x', 'n_frames_x', 'w_x', 'h_x', 'bitrate_x',
       'video_similarity_x', 'similar_video_x', 'r_x', 'duration_x',
       'created_y', 'uuid_y', 'link_y', 'is_duplicate_y', 'duplicate_for_y',
       'is_hard_y', 'video_1_y', 'duplicate_id_y', 'params_y', 'idx_y',
       'fps_y', 'n_frames_y', 'w_y', 'h_y', 'bitrate_y', 'video_similarity_y',
       'similar_video_y', 'r_y', 'duration_y'],
      dtype='object')

In [110]:
features = [
    'fps_x', 'n_frames_x', 'w_x', 'h_x', 'bitrate_x', 'video_similarity_x', 'duration_x', 'r_x', #'similar_video_x',
    'fps_y', 'n_frames_y', 'w_y', 'h_y', 'bitrate_y', #'video_similarity_y', 'duration_y', 'r_y', 'similar_video_y',
]

In [111]:
X_train = df_train[features].fillna(0).values
y_train = df_train['is_duplicate_x']
X_test = df_test[features].fillna(0).values
y_test = df_test['is_duplicate_x']

In [112]:
model = LogisticRegression(class_weight={0: 1, 1: 20})
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_score(y_test, y_pred)

0.028169014084507043

In [113]:
df_train.columns

Index(['key_0', 'created_x', 'uuid_x', 'link_x', 'is_duplicate_x',
       'duplicate_for_x', 'is_hard_x', 'video_1_x', 'duplicate_id_x',
       'params_x', 'idx_x', 'fps_x', 'n_frames_x', 'w_x', 'h_x', 'bitrate_x',
       'video_similarity_x', 'similar_video_x', 'r_x', 'duration_x',
       'created_y', 'uuid_y', 'link_y', 'is_duplicate_y', 'duplicate_for_y',
       'is_hard_y', 'video_1_y', 'duplicate_id_y', 'params_y', 'idx_y',
       'fps_y', 'n_frames_y', 'w_y', 'h_y', 'bitrate_y', 'video_similarity_y',
       'similar_video_y', 'r_y', 'duration_y'],
      dtype='object')

In [123]:
model = CatBoostClassifier(
    iterations=1500,
    learning_rate=0.01,
    auto_class_weights='Balanced',
    depth=6,
    random_state=42,
    verbose=False,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_score(y_test, y_pred)

0.08

In [124]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.98      0.98      0.98       487
        True       0.08      0.08      0.08        13

    accuracy                           0.95       500
   macro avg       0.53      0.53      0.53       500
weighted avg       0.95      0.95      0.95       500



In [125]:
model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
# model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

def video_preprocess(video_fp):
    vidcap = cv2.VideoCapture(video_fp)
    fps = int(vidcap.get(cv2.CAP_PROP_FPS))
    n_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))

    frames = np.stack([vidcap.read()[1] for i in range(n_frames) if (i % fps) == 0])
    vidcap.release()
    
    dataset = FEDataset(frames, transform=transform)
    dataloader = DataLoader(dataset=dataset, batch_size=8)
    output = torch.vstack([model(batch) for batch in dataloader]).detach().numpy()
    
    return output

In [126]:
%%time
video_preprocess(description.loc[0, 'video_1']).shape

CPU times: user 1.19 s, sys: 424 ms, total: 1.61 s
Wall time: 252 ms


(8, 1000)

In [127]:
given_videos = glob.glob(r'/Users/wallander/Downloads/train_data_yappy/train_dataset/*.mp4')
len(given_videos)

2000

In [128]:
data = {}

for video in tqdm(given_videos):
    data[video] = video_preprocess(video)

print(sys.getsizeof(data))

  0%|          | 0/2000 [00:00<?, ?it/s]

NameError: name 'sys' is not defined

In [None]:
np.savez('vectors.npz', 

In [147]:
vectors = list(data.values())
keys = data.keys()
    
np.savez('vectors.npz', *vectors)

with open('keys.json', 'w') as file:
    d = {key: f'arr_{i}' for i, key in enumerate(keys)}
    json.dump(d, file)