In [1]:
import cv2
import glob
import gradio as gr
import json
import numpy as np
import os
import pandas as pd
import sys
import torch
import warnings

from catboost import CatBoostClassifier

from collections import Counter
from PIL import Image

from pymilvus import MilvusClient

from tqdm import tqdm_notebook as tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.datasets import VisionDataset
from torchvision.models import resnet50, ResNet50_Weights
from videohash import VideoHash

warnings.filterwarnings('ignore')

In [2]:
given_videos = glob.glob(r'/Users/wallander/Downloads/train_data_yappy/train_dataset/*.mp4')
len(given_videos)

2000

In [3]:
description = pd.read_csv('/Users/wallander/Downloads/train_data_yappy/train.csv')
description.head()

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for,is_hard
0,2024-06-01 00:05:43,23fac2f2-7f00-48cb-b3ac-aac8caa3b6b4,https://s3.ritm.media/yappy-db-duplicates/23fa...,False,,False
1,2024-06-01 00:11:01,2fa37210-3c25-4a87-88f2-1242c2c8a699,https://s3.ritm.media/yappy-db-duplicates/2fa3...,False,,False
2,2024-06-01 00:13:20,31cc33d5-95de-4799-ad01-87c8498d1bde,https://s3.ritm.media/yappy-db-duplicates/31cc...,False,,False
3,2024-06-01 00:27:23,03abd0ec-609e-4eea-9f2a-b6b7442bc881,https://s3.ritm.media/yappy-db-duplicates/03ab...,False,,False
4,2024-06-01 00:30:23,22ee0045-004b-4c7e-98f2-77e5e02e2f15,https://s3.ritm.media/yappy-db-duplicates/22ee...,False,,False


In [4]:
def get_path(video_path, root):
    try:
        return os.path.join(root, video_path + '.mp4')
    except:
        return None

root = '/Users/wallander/Downloads/train_data_yappy/train_dataset'

description['video_1'] = description['uuid'].apply(lambda x: get_path(x, root))
# description['video_2'] = description['duplicate_for'].apply(lambda x: get_path(x, root))
description = description[(description['video_1'].isin(given_videos))].copy()
description.head()

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for,is_hard,video_1
0,2024-06-01 00:05:43,23fac2f2-7f00-48cb-b3ac-aac8caa3b6b4,https://s3.ritm.media/yappy-db-duplicates/23fa...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...
1,2024-06-01 00:11:01,2fa37210-3c25-4a87-88f2-1242c2c8a699,https://s3.ritm.media/yappy-db-duplicates/2fa3...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...
2,2024-06-01 00:13:20,31cc33d5-95de-4799-ad01-87c8498d1bde,https://s3.ritm.media/yappy-db-duplicates/31cc...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...
3,2024-06-01 00:27:23,03abd0ec-609e-4eea-9f2a-b6b7442bc881,https://s3.ritm.media/yappy-db-duplicates/03ab...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...
4,2024-06-01 00:30:23,22ee0045-004b-4c7e-98f2-77e5e02e2f15,https://s3.ritm.media/yappy-db-duplicates/22ee...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...


In [5]:
description['is_duplicate'].value_counts()

is_duplicate
False    1948
True       52
Name: count, dtype: int64

In [6]:
description['is_hard'].value_counts()

is_hard
False    1988
True       12
Name: count, dtype: int64

In [7]:
description[description['is_hard'] == True]['is_duplicate'].value_counts()

is_duplicate
False    12
Name: count, dtype: int64

In [8]:
description['uuid'].nunique()

2000

In [9]:
video_idx = description['uuid'].to_dict()
inverse_video_idx = {value: key for key, value in video_idx.items()}
description['duplicate_id'] = description['duplicate_for'].map(inverse_video_idx)

## Gradio stand

In [10]:
hard_pair = description[description['is_duplicate'] == True].sample(1)
video_1 = description.loc[212, 'video_1']
video_2 = description.loc[153, 'video_1']

with gr.Blocks() as demo:
    with gr.Row():
        gr.Video(video_1)
        gr.Video(video_2)
        # gr.Video('/Users/wallander/Downloads/train_data_yappy/train_dataset/3d8304d8-b202-4c1c-bcb8-998fb7f767ae.mp4')
        
demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




## Video processing

In [11]:
class BasicVisionDataset(VisionDataset):
    def __init__(self, images, transform=None, target_transform=None):
        if isinstance(images, np.ndarray):
            transform.transforms.insert(0, transforms.ToPILImage())
        super(BasicVisionDataset, self).__init__(root=None, transform=transform, target_transform=target_transform)
        self.images = images

    def __getitem__(self, index):
        return torch.unsqueeze(self.transform(self.images[index]), 0)

    def __len__(self):
        return len(self.targets)

In [12]:
class FEDataset(Dataset):

    def __init__(self, images, root_dir=None, transform=None):
        """
        Arguments:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
            
        self.images = images
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        return self.transform(self.images[idx])

In [13]:
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )
])

In [14]:
def get_vidcap_param(video_fp, params):
    vidcap = cv2.VideoCapture(video_fp)
    return [vidcap.get(param) for param in params]

In [15]:
video_params = [
    cv2.CAP_PROP_FPS,
    cv2.CAP_PROP_FRAME_COUNT,
    cv2.CAP_PROP_FRAME_WIDTH,
    cv2.CAP_PROP_FRAME_HEIGHT,
    cv2.CAP_PROP_BITRATE,
]
video_features = ['fps', 'n_frames', 'w', 'h', 'bitrate',]
features = ['video_similarity', 'r', 'duration'] + video_features

description['params'] = description['video_1'].apply(lambda x: get_vidcap_param(x, video_params))
description['idx'] = description.index
description[video_features] = pd.DataFrame(description['params'].tolist(), index=description.index)

In [16]:
if os.path.exists('vectors.npz'):
    values = np.load('vectors.npz')

with open('keys.json', 'r') as file:
    d = json.load(file)

In [122]:
# d

In [128]:
db_entries = []

for i, (filename, arr) in  enumerate(d.items()):
    vectors = values[arr]
    db_entries += [{'id': i, 'vector': vector, 'name': filename, 'frame': j} for j, vector in enumerate(vectors)]

In [129]:
vectors.shape

(38, 1000)

## Milvus

In [130]:
client = MilvusClient('./milvus_demo.db')

collection = 'test_collection'

if client.has_collection(collection_name=collection):
    client.drop_collection(collection_name=collection)
    
client.create_collection(
    collection_name=collection,
    dimension=1000,
)

n_chunks = 3
for n in range(n_chunks):
    chunk = db_entries[n::n_chunks]
    db = client.insert(
        collection_name=collection,
        data=chunk
    )

## Similarity

In [283]:
thr = 0.9
video_similarity = {}

for _, row in description.iterrows():
    counter = Counter()
    created_ts = row['created']
    video_filename = row['uuid']
    video = values[d[row['video_1']]]
    idx = description[(description['uuid'] != video_filename) & (description['created'] <= created_ts)].index.tolist()
    if idx == []:
        continue
    
    res = client.search(
        collection_name=collection,
        data=video,
        limit=1,
        filter=f'id in {idx}',
        metric_type='cosine',
        # params={
        #     'radius': thr,
        #     'range_filter': thr + .1,
        # },
        output_fields=['id', 'name'],
    )
    
    # for entry in res:
    #     counter[entry[0]['entity']['name']] += entry[0]['distance'] if entry[0]['distance'] > thr else 0
    # video_similarity[video_filename] = counter.most_common()[0]
    video_similarity[video_filename] = [res[0][0]['distance'], res[0][0]['entity']['name']]
    # break

In [284]:
# # video, idx,
res, video_similarity

(data: ["[{'id': 1902, 'distance': 0.9999993443489075, 'entity': {'name': '/Users/wallander/Downloads/train_data_yappy/train_dataset/17f4b0bf-135a-496c-ac7f-a83519d4f500.mp4', 'id': 1902}}]", "[{'id': 1902, 'distance': 1.0, 'entity': {'name': '/Users/wallander/Downloads/train_data_yappy/train_dataset/17f4b0bf-135a-496c-ac7f-a83519d4f500.mp4', 'id': 1902}}]", "[{'id': 1902, 'distance': 0.9999997615814209, 'entity': {'name': '/Users/wallander/Downloads/train_data_yappy/train_dataset/17f4b0bf-135a-496c-ac7f-a83519d4f500.mp4', 'id': 1902}}]", "[{'id': 1902, 'distance': 0.9999999403953552, 'entity': {'name': '/Users/wallander/Downloads/train_data_yappy/train_dataset/17f4b0bf-135a-496c-ac7f-a83519d4f500.mp4', 'id': 1902}}]", "[{'id': 1902, 'distance': 0.9999998211860657, 'entity': {'name': '/Users/wallander/Downloads/train_data_yappy/train_dataset/17f4b0bf-135a-496c-ac7f-a83519d4f500.mp4', 'id': 1902}}]", "[{'id': 1902, 'distance': 1.0000003576278687, 'entity': {'name': '/Users/wallander/Dow

In [285]:
res = client.search(
    collection_name=collection,
    data=video,
    limit=1,
    filter=f'',
    metric_type='cosine',
    output_fields=['id', 'name'],
)
description

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for,is_hard,video_1,duplicate_id,params,idx,fps,n_frames,w,h,bitrate,video_similarity,similar_video,r,duration
0,2024-06-01 00:05:43,23fac2f2-7f00-48cb-b3ac-aac8caa3b6b4,https://s3.ritm.media/yappy-db-duplicates/23fa...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 228.0, 720.0, 1280.0, 4883.0]",0,30.00,228.0,720.0,1280.0,4883.0,,,,7.600000
1,2024-06-01 00:11:01,2fa37210-3c25-4a87-88f2-1242c2c8a699,https://s3.ritm.media/yappy-db-duplicates/2fa3...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 580.0, 720.0, 1280.0, 2141.0]",1,30.00,580.0,720.0,1280.0,2141.0,0.320983,/Users/wallander/Downloads/train_data_yappy/tr...,0.000553,19.333333
2,2024-06-01 00:13:20,31cc33d5-95de-4799-ad01-87c8498d1bde,https://s3.ritm.media/yappy-db-duplicates/31cc...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 1389.0, 720.0, 1280.0, 1990.0]",2,30.00,1389.0,720.0,1280.0,1990.0,0.823112,/Users/wallander/Downloads/train_data_yappy/tr...,0.000593,46.300000
3,2024-06-01 00:27:23,03abd0ec-609e-4eea-9f2a-b6b7442bc881,https://s3.ritm.media/yappy-db-duplicates/03ab...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[29.97, 600.0, 720.0, 1280.0, 2534.0]",3,29.97,600.0,720.0,1280.0,2534.0,0.661135,/Users/wallander/Downloads/train_data_yappy/tr...,0.001102,20.020020
4,2024-06-01 00:30:23,22ee0045-004b-4c7e-98f2-77e5e02e2f15,https://s3.ritm.media/yappy-db-duplicates/22ee...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 163.0, 720.0, 1280.0, 1348.0]",4,30.00,163.0,720.0,1280.0,1348.0,0.659400,/Users/wallander/Downloads/train_data_yappy/tr...,0.004045,5.433333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,2024-06-08 17:56:16,3b63aa7e-3d54-4299-954a-593d6ba248dc,https://s3.ritm.media/yappy-db-duplicates/3b63...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 1064.0, 720.0, 1280.0, 1880.0]",1995,30.00,1064.0,720.0,1280.0,1880.0,1.000000,/Users/wallander/Downloads/train_data_yappy/tr...,0.000940,35.466667
1996,2024-06-08 18:00:35,0d382bc5-683e-4bd9-9986-6b21abaea2bb,https://s3.ritm.media/yappy-db-duplicates/0d38...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 186.0, 720.0, 1280.0, 1424.0]",1996,30.00,186.0,720.0,1280.0,1424.0,1.000001,/Users/wallander/Downloads/train_data_yappy/tr...,0.005376,6.200000
1997,2024-06-08 18:22:40,2efeecac-5fb3-42ce-b7f3-a06f975cc2b3,https://s3.ritm.media/yappy-db-duplicates/2efe...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 997.0, 720.0, 1280.0, 2666.0]",1997,30.00,997.0,720.0,1280.0,2666.0,0.999999,/Users/wallander/Downloads/train_data_yappy/tr...,0.001003,33.233333
1998,2024-06-08 18:28:33,238ed91f-5175-4f1a-92f6-dca4fb270cf4,https://s3.ritm.media/yappy-db-duplicates/238e...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[24.0, 176.0, 720.0, 1280.0, 1334.0]",1998,24.00,176.0,720.0,1280.0,1334.0,1.000000,/Users/wallander/Downloads/train_data_yappy/tr...,0.005682,7.333333


In [286]:
description['video_similarity'] = description['uuid'].apply(lambda x: video_similarity.get(x, [None])[0])
description['similar_video'] = description['uuid'].apply(lambda x: video_similarity.get(x, [None])[-1])
description['idx'] = description.index
description.sample(10)

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for,is_hard,video_1,duplicate_id,params,idx,fps,n_frames,w,h,bitrate,video_similarity,similar_video,r,duration
440,2024-06-02 18:13:33,1069528b-2a2a-45ed-9dd0-e6b1cb7ae70b,https://s3.ritm.media/yappy-db-duplicates/1069...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 351.0, 720.0, 1280.0, 1816.0]",440,30.0,351.0,720.0,1280.0,1816.0,0.82574,/Users/wallander/Downloads/train_data_yappy/tr...,0.002353,11.7
1614,2024-06-07 05:17:50,36ebe430-8feb-4c36-9477-425a09a11217,https://s3.ritm.media/yappy-db-duplicates/36eb...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 292.0, 720.0, 1280.0, 2980.0]",1614,30.0,292.0,720.0,1280.0,2980.0,1.0,/Users/wallander/Downloads/train_data_yappy/tr...,0.003425,9.733333
1183,2024-06-05 11:48:53,4982f999-6d42-448e-bb19-9996deef483c,https://s3.ritm.media/yappy-db-duplicates/4982...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 260.0, 720.0, 1280.0, 1453.0]",1183,30.0,260.0,720.0,1280.0,1453.0,0.840034,/Users/wallander/Downloads/train_data_yappy/tr...,0.003231,8.666667
107,2024-06-01 12:26:43,45272750-03f9-4366-8330-175b832a2688,https://s3.ritm.media/yappy-db-duplicates/4527...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[25.0, 897.0, 720.0, 1280.0, 3312.0]",107,25.0,897.0,720.0,1280.0,3312.0,0.789044,/Users/wallander/Downloads/train_data_yappy/tr...,0.00088,35.88
1642,2024-06-07 07:20:54,0817c766-1b90-4b31-95be-0e7eca9c7ec9,https://s3.ritm.media/yappy-db-duplicates/0817...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 1722.0, 720.0, 1280.0, 1250.0]",1642,30.0,1722.0,720.0,1280.0,1250.0,1.0,/Users/wallander/Downloads/train_data_yappy/tr...,0.000581,57.4
1709,2024-06-07 14:39:54,4e13f784-dc74-4532-b944-1789b3a95af1,https://s3.ritm.media/yappy-db-duplicates/4e13...,True,0e0bc479-6a06-4901-8e71-99463edc5e52,False,/Users/wallander/Downloads/train_data_yappy/tr...,179.0,"[30.0, 153.0, 720.0, 1280.0, 471.0]",1709,30.0,153.0,720.0,1280.0,471.0,1.000001,/Users/wallander/Downloads/train_data_yappy/tr...,0.006536,5.1
1762,2024-06-07 18:47:55,1181c715-3943-4df2-a852-a031ce3fde38,https://s3.ritm.media/yappy-db-duplicates/1181...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 843.0, 720.0, 1280.0, 3975.0]",1762,30.0,843.0,720.0,1280.0,3975.0,0.713906,/Users/wallander/Downloads/train_data_yappy/tr...,0.000847,28.1
287,2024-06-02 05:37:28,45f0ea8f-43f4-4dab-9f73-2085373aa350,https://s3.ritm.media/yappy-db-duplicates/45f0...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 390.0, 720.0, 1280.0, 2584.0]",287,30.0,390.0,720.0,1280.0,2584.0,0.76078,/Users/wallander/Downloads/train_data_yappy/tr...,0.001951,13.0
327,2024-06-02 09:01:55,1986b063-b0d8-46d6-9921-23aa3868bb85,https://s3.ritm.media/yappy-db-duplicates/1986...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 372.0, 720.0, 1280.0, 2287.0]",327,30.0,372.0,720.0,1280.0,2287.0,0.64056,/Users/wallander/Downloads/train_data_yappy/tr...,0.001722,12.4
1714,2024-06-07 14:58:24,028e840c-a4e4-4e68-9461-0e2f754430b3,https://s3.ritm.media/yappy-db-duplicates/028e...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 652.0, 720.0, 1280.0, 2866.0]",1714,30.0,652.0,720.0,1280.0,2866.0,1.0,/Users/wallander/Downloads/train_data_yappy/tr...,0.001534,21.733333


In [310]:
description['r'] = description['video_similarity'] / description['n_frames']
description['duration'] = description['n_frames'] / description['fps']
description_ = description.merge(description, how='left', left_on=description['similar_video'], right_on=description['video_1'])
description_['created_x'] = pd.to_datetime(description_['created_x'], format='%Y-%m-%d %H:%M:%S')
description_['created_y'] = pd.to_datetime(description_['created_y'], format='%Y-%m-%d %H:%M:%S')
description_['td'] = (description_['created_y'] - description_['created_x']).dt.total_seconds()

df_train, df_test = train_test_split(description_, stratify=description_[['is_duplicate_x', 'is_hard_x']], random_state=492)

## ML

In [311]:
df_train

Unnamed: 0,key_0,created_x,uuid_x,link_x,is_duplicate_x,duplicate_for_x,is_hard_x,video_1_x,duplicate_id_x,params_x,...,fps_y,n_frames_y,w_y,h_y,bitrate_y,video_similarity_y,similar_video_y,r_y,duration_y,td
1961,/Users/wallander/Downloads/train_data_yappy/tr...,2024-06-08 15:25:26,0e3e3a98-a4f4-4647-a38e-9e16b5d1d0a6,https://s3.ritm.media/yappy-db-duplicates/0e3e...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[29.97, 1796.0, 720.0, 1280.0, 1522.0]",...,29.97,1796.0,720.0,1280.0,1522.0,1.000000,/Users/wallander/Downloads/train_data_yappy/tr...,0.000557,59.926593,0.0
1413,/Users/wallander/Downloads/train_data_yappy/tr...,2024-06-06 09:42:01,3c21a653-5701-4aa0-99e6-2787a08f50fe,https://s3.ritm.media/yappy-db-duplicates/3c21...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 664.0, 720.0, 1280.0, 1435.0]",...,30.00,664.0,720.0,1280.0,1435.0,1.000000,/Users/wallander/Downloads/train_data_yappy/tr...,0.001506,22.133333,0.0
1314,/Users/wallander/Downloads/train_data_yappy/tr...,2024-06-05 23:43:40,268a36fe-3406-4171-ad22-805430e6462d,https://s3.ritm.media/yappy-db-duplicates/268a...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[60.0, 3543.0, 720.0, 1280.0, 532.0]",...,60.00,3543.0,720.0,1280.0,532.0,1.000000,/Users/wallander/Downloads/train_data_yappy/tr...,0.000282,59.050000,0.0
1427,/Users/wallander/Downloads/train_data_yappy/tr...,2024-06-06 11:46:44,1c9a3fed-c814-4bf8-b219-1cfe3801a0bb,https://s3.ritm.media/yappy-db-duplicates/1c9a...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 1767.0, 720.0, 1280.0, 830.0]",...,30.00,1767.0,720.0,1280.0,830.0,1.000000,/Users/wallander/Downloads/train_data_yappy/tr...,0.000566,58.900000,0.0
1891,/Users/wallander/Downloads/train_data_yappy/tr...,2024-06-08 08:06:00,07105104-8b7a-40f1-bece-8cdf6da19ee4,https://s3.ritm.media/yappy-db-duplicates/0710...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 1325.0, 720.0, 1280.0, 2118.0]",...,30.00,1325.0,720.0,1280.0,2118.0,1.000000,/Users/wallander/Downloads/train_data_yappy/tr...,0.000755,44.166667,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1644,/Users/wallander/Downloads/train_data_yappy/tr...,2024-06-07 07:45:29,494ae872-feea-48d0-ae37-a0e33f7721be,https://s3.ritm.media/yappy-db-duplicates/494a...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 1515.0, 720.0, 1280.0, 2655.0]",...,30.00,1515.0,720.0,1280.0,2655.0,0.999999,/Users/wallander/Downloads/train_data_yappy/tr...,0.000660,50.500000,0.0
1654,/Users/wallander/Downloads/train_data_yappy/tr...,2024-06-07 08:42:40,3af5ec91-f4df-4c8f-acd4-f15e12feab31,https://s3.ritm.media/yappy-db-duplicates/3af5...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 153.0, 720.0, 1280.0, 3932.0]",...,30.00,153.0,720.0,1280.0,3932.0,1.000000,/Users/wallander/Downloads/train_data_yappy/tr...,0.006536,5.100000,0.0
1652,/Users/wallander/Downloads/train_data_yappy/tr...,2024-06-07 08:28:18,53e7d63b-c60f-478d-95a7-539831045fe0,https://s3.ritm.media/yappy-db-duplicates/53e7...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 492.0, 720.0, 1280.0, 1530.0]",...,30.00,492.0,720.0,1280.0,1530.0,1.000000,/Users/wallander/Downloads/train_data_yappy/tr...,0.002033,16.400000,0.0
486,/Users/wallander/Downloads/train_data_yappy/tr...,2024-06-02 21:47:58,3587358a-1171-41e4-a551-fd70b30ed619,https://s3.ritm.media/yappy-db-duplicates/3587...,False,,False,/Users/wallander/Downloads/train_data_yappy/tr...,,"[30.0, 576.0, 720.0, 1280.0, 1144.0]",...,29.97,331.0,720.0,1280.0,1517.0,1.000000,/Users/wallander/Downloads/train_data_yappy/tr...,0.003021,11.044378,142731.0


In [312]:
df_train.columns

Index(['key_0', 'created_x', 'uuid_x', 'link_x', 'is_duplicate_x',
       'duplicate_for_x', 'is_hard_x', 'video_1_x', 'duplicate_id_x',
       'params_x', 'idx_x', 'fps_x', 'n_frames_x', 'w_x', 'h_x', 'bitrate_x',
       'video_similarity_x', 'similar_video_x', 'r_x', 'duration_x',
       'created_y', 'uuid_y', 'link_y', 'is_duplicate_y', 'duplicate_for_y',
       'is_hard_y', 'video_1_y', 'duplicate_id_y', 'params_y', 'idx_y',
       'fps_y', 'n_frames_y', 'w_y', 'h_y', 'bitrate_y', 'video_similarity_y',
       'similar_video_y', 'r_y', 'duration_y', 'td'],
      dtype='object')

In [313]:
video_features = ['fps', 'n_frames', 'w', 'h', 'bitrate', 'duration']
features = ';'.join([f'{feature}_x;{feature}_y' for feature in video_features]).split(';') + ['video_similarity']
features

['fps_x',
 'fps_y',
 'n_frames_x',
 'n_frames_y',
 'w_x',
 'w_y',
 'h_x',
 'h_y',
 'bitrate_x',
 'bitrate_y',
 'duration_x',
 'duration_y',
 'video_similarity']

In [314]:
df_train.columns

Index(['key_0', 'created_x', 'uuid_x', 'link_x', 'is_duplicate_x',
       'duplicate_for_x', 'is_hard_x', 'video_1_x', 'duplicate_id_x',
       'params_x', 'idx_x', 'fps_x', 'n_frames_x', 'w_x', 'h_x', 'bitrate_x',
       'video_similarity_x', 'similar_video_x', 'r_x', 'duration_x',
       'created_y', 'uuid_y', 'link_y', 'is_duplicate_y', 'duplicate_for_y',
       'is_hard_y', 'video_1_y', 'duplicate_id_y', 'params_y', 'idx_y',
       'fps_y', 'n_frames_y', 'w_y', 'h_y', 'bitrate_y', 'video_similarity_y',
       'similar_video_y', 'r_y', 'duration_y', 'td'],
      dtype='object')

In [315]:
features = [
    'fps_x', 'n_frames_x', 'w_x', 'h_x', 'bitrate_x', 'duration_x', 'r_x', 'video_similarity_x', # 'similar_video_x',
    'fps_y', 'n_frames_y', 'w_y', 'h_y', 'bitrate_y', 'duration_y', 'r_y', # 'video_similarity_y', # 'similar_video_y',
    'td'
]

In [316]:
X_train = df_train[df_train['is_hard_x'] == False][features].fillna(0).values
y_train = df_train[df_train['is_hard_x'] == False]['is_duplicate_x']
X_test = df_test[features].fillna(0).values
y_test = df_test['is_duplicate_x']

In [317]:
model = LogisticRegression(class_weight={0: 1, 1: 20})
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_score(y_test, y_pred)

0.11538461538461539

In [318]:
df_train.columns

Index(['key_0', 'created_x', 'uuid_x', 'link_x', 'is_duplicate_x',
       'duplicate_for_x', 'is_hard_x', 'video_1_x', 'duplicate_id_x',
       'params_x', 'idx_x', 'fps_x', 'n_frames_x', 'w_x', 'h_x', 'bitrate_x',
       'video_similarity_x', 'similar_video_x', 'r_x', 'duration_x',
       'created_y', 'uuid_y', 'link_y', 'is_duplicate_y', 'duplicate_for_y',
       'is_hard_y', 'video_1_y', 'duplicate_id_y', 'params_y', 'idx_y',
       'fps_y', 'n_frames_y', 'w_y', 'h_y', 'bitrate_y', 'video_similarity_y',
       'similar_video_y', 'r_y', 'duration_y', 'td'],
      dtype='object')

In [322]:
model = CatBoostClassifier(
    iterations=2500,
    learning_rate=0.01,
    auto_class_weights='SqrtBalanced',
    depth=7,
    random_state=42,
    verbose=False,
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_score(y_test, y_pred)

0.125

In [323]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.98      1.00      0.99       487
        True       0.33      0.08      0.12        13

    accuracy                           0.97       500
   macro avg       0.65      0.54      0.56       500
weighted avg       0.96      0.97      0.96       500



In [324]:
np.where((y_test == y_pred) & (y_test == 1), 1 ,0).sum()

1

In [215]:
model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
# model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

def video_preprocess(video_fp):
    vidcap = cv2.VideoCapture(video_fp)
    fps = int(vidcap.get(cv2.CAP_PROP_FPS))
    n_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))

    frames = np.stack([vidcap.read()[1] for i in range(n_frames) if (i % fps) == 0])
    vidcap.release()
    
    dataset = FEDataset(frames, transform=transform)
    dataloader = DataLoader(dataset=dataset, batch_size=8)
    output = torch.vstack([model(batch) for batch in dataloader]).detach().numpy()
    
    return output

In [216]:
%%time
video_preprocess(description.loc[0, 'video_1']).shape

CPU times: user 1.14 s, sys: 552 ms, total: 1.69 s
Wall time: 249 ms


(8, 1000)

In [217]:
given_videos = glob.glob(r'/Users/wallander/Downloads/train_data_yappy/train_dataset/*.mp4')
len(given_videos)

2000

In [221]:
vectors = []
d = {}
counter = 0

for video in tqdm(given_videos):
    vectors.append(video_preprocess(video))
    d[video] = f'arr_{counter}'
    counter += 1
    
np.savez('vectors.npz', *vectors)

with open('keys.json', 'w') as file:
    json.dump(d, file)  

  0%|          | 0/2000 [00:00<?, ?it/s]