In [1]:
import cv2
import glob
import gradio as gr
import numpy as np
import os
import pandas as pd
import torch

from collections import Counter
from PIL import Image

from pymilvus import MilvusClient

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.datasets import VisionDataset
from torchvision.models import resnet50, ResNet50_Weights
from videohash import VideoHash

In [2]:
given_videos = glob.glob(r'/Users/wallander/Downloads/train_data_yappy/train_dataset/*.mp4')
len(given_videos)

2000

In [3]:
description = pd.read_csv('/Users/wallander/Downloads/train_data_yappy/train.csv')
description.head()

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for,is_hard
0,2024-06-01 00:05:43,23fac2f2-7f00-48cb-b3ac-aac8caa3b6b4,https://s3.ritm.media/yappy-db-duplicates/23fa...,False,,False
1,2024-06-01 00:11:01,2fa37210-3c25-4a87-88f2-1242c2c8a699,https://s3.ritm.media/yappy-db-duplicates/2fa3...,False,,False
2,2024-06-01 00:13:20,31cc33d5-95de-4799-ad01-87c8498d1bde,https://s3.ritm.media/yappy-db-duplicates/31cc...,False,,False
3,2024-06-01 00:27:23,03abd0ec-609e-4eea-9f2a-b6b7442bc881,https://s3.ritm.media/yappy-db-duplicates/03ab...,False,,False
4,2024-06-01 00:30:23,22ee0045-004b-4c7e-98f2-77e5e02e2f15,https://s3.ritm.media/yappy-db-duplicates/22ee...,False,,False


In [4]:
def get_path(video_path, root):
    try:
        return os.path.join(root, video_path + '.mp4')
    except:
        return None

root = '/Users/wallander/Downloads/train_data_yappy/train_dataset'

description['video_1'] = description['uuid'].apply(lambda x: get_path(x, root))
description['video_2'] = description['duplicate_for'].apply(lambda x: get_path(x, root))
description = description[(description['video_1'].isin(given_videos))].copy()
description.head()

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for,is_hard,video_1,video_2
0,2024-06-01 00:05:43,23fac2f2-7f00-48cb-b3ac-aac8caa3b6b4,https://s3.ritm.media/yappy-db-duplicates/23fa...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,
1,2024-06-01 00:11:01,2fa37210-3c25-4a87-88f2-1242c2c8a699,https://s3.ritm.media/yappy-db-duplicates/2fa3...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,
2,2024-06-01 00:13:20,31cc33d5-95de-4799-ad01-87c8498d1bde,https://s3.ritm.media/yappy-db-duplicates/31cc...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,
3,2024-06-01 00:27:23,03abd0ec-609e-4eea-9f2a-b6b7442bc881,https://s3.ritm.media/yappy-db-duplicates/03ab...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,
4,2024-06-01 00:30:23,22ee0045-004b-4c7e-98f2-77e5e02e2f15,https://s3.ritm.media/yappy-db-duplicates/22ee...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,


In [5]:
description['is_duplicate'].value_counts()

is_duplicate
False    1948
True       52
Name: count, dtype: int64

In [6]:
description['is_hard'].value_counts()

is_hard
False    1988
True       12
Name: count, dtype: int64

In [7]:
description[description['is_hard'] == True]['is_duplicate'].value_counts()

is_duplicate
False    12
Name: count, dtype: int64

In [8]:
description['uuid'].nunique()

2000

In [141]:
video_idx = description['uuid'].to_dict()
inverse_video_idx = {value: key for key, value in video_idx.items()}
description['duplicate_id'] = description['duplicate_for'].map(inverse_video_idx)

## Gradio stand

In [111]:
hard_pair = description[description['is_duplicate'] == True].sample(1)
# video_1 = hard_pair['video_1'].values[0]
# video_2 = hard_pair['video_2'].values[0]
video_1 = description.loc[212, 'video_1']
video_2 = description.loc[153, 'video_1']

with gr.Blocks() as demo:
    with gr.Row():
        gr.Video(video_1)
        gr.Video(video_2)
        # gr.Video('/Users/wallander/Downloads/train_data_yappy/train_dataset/3d8304d8-b202-4c1c-bcb8-998fb7f767ae.mp4')
        
demo.launch()

Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




## Video processing

In [11]:
model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)

In [12]:
class BasicVisionDataset(VisionDataset):
    def __init__(self, images, transform=None, target_transform=None):
        if isinstance(images, np.ndarray):
            transform.transforms.insert(0, transforms.ToPILImage())
        super(BasicVisionDataset, self).__init__(root=None, transform=transform, target_transform=target_transform)
        self.images = images

    def __getitem__(self, index):
        return torch.unsqueeze(self.transform(self.images[index]), 0)

    def __len__(self):
        return len(self.targets)

In [13]:
class FEDataset(Dataset):

    def __init__(self, images, root_dir=None, transform=None):
        """
        Arguments:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
            
        self.images = images
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        return self.transform(self.images[idx])

In [14]:
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )
])

In [196]:
def video_to_vec(video_fp, transform):
    vidcap = cv2.VideoCapture(video_fp)  
    fps = int(vidcap.get(cv2.CAP_PROP_FPS))
    n_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    frames = np.stack([vidcap.read()[1] for i in range(n_frames) if (i % fps) == 0])
    
    dataset = FEDataset(frames, transform=transform)
    dataloader = DataLoader(dataset=dataset, batch_size=16)
    output = torch.vstack([model(batch) for batch in dataloader])

    return output.detach().numpy()


def get_vectors(id, step, vectors):
    l = step * (id // step)
    r = step * (id // step + 1)
    return np.load(os.path.join(vectors, f'{l}_{r}.npz'), allow_pickle=True)[f'arr_{id % step}']

def get_vidcap_param(video_fp, params):
    vidcap = cv2.VideoCapture(video_fp)
    return [vidcap.get(param) for param in params]

In [16]:
steps = 101
step = 2000 // (steps - 1)
ls = np.linspace(0, 2000, steps).astype(int)

vectors = 'vectors'
for l, r in zip(ls[:-1], ls[1:]):
    if not os.path.exists(vectors):
        os.mkdir(vectors)
    path = os.path.join(vectors, f'{l}_{r}.npz') 
    if os.path.exists(path):
        continue
    else:
        buf = description.iloc[l:r]['video_1'].apply(lambda x: video_to_vec(x, transform)).to_list()
        np.savez(path, *buf)

In [17]:
def get_duplicate_vectors(x, step, vectors):
    try:
        return get_vectors(x, step, vectors)
    except:
        return None


mapping = {filename: i for i, filename in enumerate(description['video_1'].values)}
description['vector'] = description['video_1'].map(mapping).apply(lambda x: get_vectors(x, step, vectors))

## Milvus

In [18]:
client = MilvusClient('./milvus_demo.db')

In [20]:
if client.has_collection(collection_name='demo_collection'):
    client.drop_collection(collection_name='demo_collection')
    
client.create_collection(
    collection_name='demo_collection',
    dimension=1000,
)

In [21]:
videos = description['vector'].values
data = []

for i, video in enumerate(videos):
    data += [{'id': i, 'vector': fr} for _, fr in enumerate(video)]
    
db = client.insert(
    collection_name='demo_collection',
    data=data
)

## Similarity

In [142]:
df_test = pd.read_csv('test.csv')
df_test['vector'] = df_test['video_1'].map(mapping).apply(lambda x: get_vectors(x, step, vectors))
df_test['duplicate_id'] = df_test['duplicate_for'].map(inverse_video_idx)

In [156]:
df_train = pd.read_csv('train.csv')
df_train['vector'] = df_train['video_1'].map(mapping).apply(lambda x: get_vectors(x, step, vectors))
df_train['duplicate_id'] = df_train['duplicate_for'].map(inverse_video_idx)

In [158]:
train_video_similarity = {}
thr = 0.7

for i, row in df_train.iterrows():
    counter = Counter()
    created_ts = row['created']
    video_filename = row['uuid']
    video = row['vector']
    idx = description[(description['uuid'] != video_filename) & (description['created'] <= created_ts)].index.tolist()
    if idx == []:
        continue
    
    res = client.search(
        collection_name='demo_collection',
        data=video, # replace with your query vector
        limit=1,
        filter=f'id in {idx}',
        metric_type='cosine',
        params={
            'radius': thr,
            'range_filter': thr + .01,
        },
        output_fields=['id'],
    )
    
    for entry in res:
        counter[entry[0]['id']] += entry[0]['distance']
    train_video_similarity[video_filename] = counter.most_common()[0]

In [160]:
test_video_similarity = {}

for i, row in df_test.iterrows():
    counter = Counter()
    created_ts = row['created']
    video_filename = row['uuid']
    video = row['vector']
    idx = description[(description['uuid'] != video_filename) & (description['created'] <= created_ts)].index.tolist()
    if idx == []:
        continue
    
    res = client.search(
        collection_name='demo_collection',
        data=video, # replace with your query vector
        limit=1,
        filter=f'id in {idx}',
        metric_type='cosine',
        params={
            'radius': thr,
            'range_filter': thr + .01,
        },
        output_fields=['id'],
    )
    
    for entry in res:
        counter[entry[0]['id']] += entry[0]['distance']
    test_video_similarity[video_filename] = counter.most_common()[0]

In [164]:
df_train['video_similarity'] = df_train['uuid'].apply(lambda x: train_video_similarity.get(x, [None])[-1])
df_test['video_similarity'] = df_test['uuid'].apply(lambda x: test_video_similarity.get(x, [None])[-1])
df_test.head()

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for,is_hard,video_1,video_2,vector_1,vector_2,vector,duplicate_id,video_similarity
0,2024-06-04 04:48:49,215d0b86-2bab-4081-970c-3e59d7761ccb,https://s3.ritm.media/yappy-db-duplicates/215d...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,[[ 0.3488465 -0.71843207 -1.097622 ... 1.0...,,"[[0.3488465, -0.71843207, -1.097622, -1.280640...",,2.259133
1,2024-06-01 14:58:19,2925df14-c98c-47f5-a4cf-6b21d41be9b9,https://s3.ritm.media/yappy-db-duplicates/2925...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,[[-0.71260405 0.36375737 0.19422503 ... 0.1...,,"[[-0.71260405, 0.36375737, 0.19422503, -0.3882...",,1.379539
2,2024-06-07 04:56:08,2fac9bfa-9326-4fa2-92b5-313cc50f9068,https://s3.ritm.media/yappy-db-duplicates/2fac...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,[[ 0.59485203 0.6931638 1.7225157 ... -1.0...,,"[[0.59485203, 0.6931638, 1.7225157, 0.3593938,...",,1.901853
3,2024-06-05 18:09:20,3e47012b-2fde-4fef-8622-7d84379dfc7d,https://s3.ritm.media/yappy-db-duplicates/3e47...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,[[ 1.2379974 0.46831685 1.1437972 ... 0.0...,,"[[1.2379974, 0.46831685, 1.1437972, 0.08837265...",,1.378546
4,2024-06-05 03:22:27,33895280-4519-4af4-b650-e55de2cf54cb,https://s3.ritm.media/yappy-db-duplicates/3389...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,[[-1.5078713 -0.5003367 -2.0516872 ... 0.0...,,"[[-1.5078713, -0.5003367, -2.0516872, -3.01824...",,2.450636


In [165]:
df_test.query('is_duplicate == True').head()
# description.query('is_hard == True').head()

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for,is_hard,video_1,video_2,vector_1,vector_2,vector,duplicate_id,video_similarity
16,2024-06-06 06:45:16,63b899be-9547-44c2-91b2-e1b71510a614,https://s3.ritm.media/yappy-db-duplicates/63b8...,True,4a14964b-3ccf-4673-93fc-86c37a489333,False,/Users/wallander/Downloads/train_data_yappy/tr...,/Users/wallander/Downloads/train_data_yappy/tr...,[[ 1.1087284e+00 -6.6176802e-01 -5.6514573e-01...,,"[[1.1087284, -0.661768, -0.56514573, 0.0236847...",61.0,9.193621
140,2024-06-03 01:57:24,05d72fc9-89a3-47bf-bb1b-6db4fc6f2b56,https://s3.ritm.media/yappy-db-duplicates/05d7...,True,090cb968-02eb-4aba-8b94-f6124050df59,False,/Users/wallander/Downloads/train_data_yappy/tr...,/Users/wallander/Downloads/train_data_yappy/tr...,[[-2.2799623 -1.1117591 -1.7731289 ... -1.2...,,"[[-2.2799623, -1.1117591, -1.7731289, -2.51834...",384.0,13.010233
141,2024-06-07 14:39:54,4e13f784-dc74-4532-b944-1789b3a95af1,https://s3.ritm.media/yappy-db-duplicates/4e13...,True,0e0bc479-6a06-4901-8e71-99463edc5e52,False,/Users/wallander/Downloads/train_data_yappy/tr...,/Users/wallander/Downloads/train_data_yappy/tr...,[[-1.0500007 0.01436962 -1.0920494 ... -0.1...,,"[[-1.0500007, 0.014369617, -1.0920494, -1.3740...",179.0,1.939005
241,2024-06-06 07:50:45,b7defd6d-f2ba-4dec-b341-0a04f471a721,https://s3.ritm.media/yappy-db-duplicates/b7de...,True,1454c25b-fe4c-4073-a73a-9fba3fb048c2,False,/Users/wallander/Downloads/train_data_yappy/tr...,/Users/wallander/Downloads/train_data_yappy/tr...,[[-0.9018568 -1.4180013 0.0425047 ... -1.6...,,"[[-0.9018568, -1.4180013, 0.0425047, -0.176348...",132.0,1.70911
248,2024-06-06 06:46:30,1ad0536e-8422-409f-9e4a-3dc64f5e260a,https://s3.ritm.media/yappy-db-duplicates/1ad0...,True,3b447013-9202-415d-ae15-b1d76ce8e060,False,/Users/wallander/Downloads/train_data_yappy/tr...,/Users/wallander/Downloads/train_data_yappy/tr...,[[-2.0423083 -0.8111954 -2.1818283 ... 0.7...,,"[[-2.0423083, -0.8111954, -2.1818283, -2.18543...",196.0,8.922762


In [192]:
df_train['created']

0       2024-06-04 10:14:10
1       2024-06-04 17:18:48
2       2024-06-05 23:58:42
3       2024-06-03 02:06:10
4       2024-06-02 22:09:25
               ...         
1495    2024-06-06 07:46:08
1496    2024-06-05 12:01:59
1497    2024-06-06 15:20:52
1498    2024-06-05 11:14:33
1499    2024-06-08 17:14:23
Name: created, Length: 1500, dtype: object

## ML

In [170]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [206]:
df_train['params'] = df_train['video_1'].apply(lambda x: get_vidcap_param(
    x, [cv2.CAP_PROP_FPS, cv2.CAP_PROP_FRAME_COUNT, cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT]
))
df_test['params'] = df_test['video_1'].apply(lambda x: get_vidcap_param(
    x, [cv2.CAP_PROP_FPS, cv2.CAP_PROP_FRAME_COUNT, cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT]
))

In [210]:
X_train = df_train[['video_similarity', 'fps', 'n_frames', 'w', 'h']].values
y_train = df_train['is_duplicate']
X_test = df_test[['video_similarity', 'fps', 'n_frames', 'w', 'h']].fillna(0).values
y_test = df_test['is_duplicate']

In [211]:
model = LogisticRegression(class_weight={0: 1, 1: 20})
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_score(y_test, y_pred)

0.27906976744186046

In [208]:
df_train[['fps','n_frames', 'w', 'h']] = pd.DataFrame(df_train['params'].tolist(), index=df_train.index)
df_test[['fps','n_frames', 'w', 'h']] = pd.DataFrame(df_test['params'].tolist(), index=df_test.index)

In [212]:
from catboost import CatBoostClassifier

In [222]:
model = CatBoostClassifier(
    iterations=1500,
    learning_rate=0.01,
    auto_class_weights='Balanced',
    depth=8,
    random_state=42,
    verbose=False,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_score(y_test, y_pred)

0.375