In [None]:
# Mount Google Drive at /content/drive; every data path used by the cells
# below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NOTE(review): pinning numpy==1.24.3 makes pip report dependency conflicts with
# packages already present in the runtime (jax, tensorflow, ... — see the recorded
# output). Confirm the pin is still required.
# NOTE(review): librosa and ffmpeg-python are not imported by any cell visible
# in this notebook — verify they are needed.
!pip install numpy==1.24.3 librosa ffmpeg-python --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m115.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.3 which is incompatible.
albumentations 2.0.5 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.
blosc2 3.3.0 requires numpy>=1.26, but you have numpy 1.24.3 which is incompatible.
jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.
albucore 0.0.23 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.3 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.3 which i

In [None]:
!pip install webrtcvad
!apt-get install -y ffmpeg

Collecting webrtcvad
  Downloading webrtcvad-2.0.10.tar.gz (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: webrtcvad
  Building wheel for webrtcvad (setup.py) ... [?25l[?25hdone
  Created wheel for webrtcvad: filename=webrtcvad-2.0.10-cp311-cp311-linux_x86_64.whl size=73499 sha256=8910f1fb780fe56a0d5e85ffb017c6de5f86d50b4a4e63352599f876a3bf734d
  Stored in directory: /root/.cache/pip/wheels/94/65/3f/292d0b656be33d1c801831201c74b5f68f41a2ae465ff2ee2f
Successfully built webrtcvad
Installing collected packages: webrtcvad
Successfully installed webrtcvad-2.0.10
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.

In [None]:
import os
import shutil
import wave
import contextlib
import webrtcvad
import collections
from tqdm import tqdm
import numpy as np


# Source folder scanned for .mp4 clips and destination folder that receives the
# clips passing the speech filter.
video_dir = "/content/drive/MyDrive/urfunny2_videos_copied/urfunny2_videos"
filtered_output_dir = "/content/drive/MyDrive/urfunny2_audio_filtered_videos_vad"
os.makedirs(filtered_output_dir, exist_ok=True)  # no error if the folder already exists


# A clip is kept when its speech ratio (see calculate_vad_ratio) is >= this value.
SPEECH_RATIO_THRESHOLD = 0.85
# Number of videos per batch in the main loop; each batch gets its own progress bar and summary line.
BATCH_SIZE = 100


def read_wave(path):
    """Load the raw PCM payload of a mono, 16-bit WAV file.

    Args:
        path: Location of the WAV file to read.

    Returns:
        A ``(pcm_bytes, sample_rate)`` tuple.

    Raises:
        AssertionError: If the file is not single-channel, does not use
            2-byte samples, or its sample rate is not one of
            8000/16000/32000/48000 Hz.
    """
    reader = wave.open(path, 'rb')
    with contextlib.closing(reader):
        channel_count = reader.getnchannels()
        assert channel_count == 1
        bytes_per_sample = reader.getsampwidth()
        assert bytes_per_sample == 2
        sample_rate = reader.getframerate()
        assert sample_rate in (8000, 16000, 32000, 48000)
        frame_total = reader.getnframes()
        return reader.readframes(frame_total), sample_rate

def write_wav(pcm_data, sample_rate, out_path):
    """Store raw PCM bytes as a mono, 16-bit WAV file.

    Args:
        pcm_data: Raw sample bytes to write as the audio payload.
        sample_rate: Frame rate recorded in the WAV header.
        out_path: Destination path; opened in write mode.
    """
    writer = wave.open(out_path, 'wb')
    with contextlib.closing(writer):
        # (nchannels, sampwidth, framerate, nframes, comptype, compname);
        # the frame count in the header is fixed up when the data is written.
        writer.setparams((1, 2, sample_rate, 0, 'NONE', 'not compressed'))
        writer.writeframes(pcm_data)

def extract_audio_ffmpeg(video_path, output_wav_path):
    """Extract the audio track of a video as a 16 kHz mono WAV file via ffmpeg.

    ffmpeg is invoked with an argument list instead of a shell command string,
    so paths containing quotes, spaces or shell metacharacters are passed
    through literally and cannot break (or inject into) the command.

    Args:
        video_path: Path of the input video.
        output_wav_path: Path of the WAV file to create (overwritten if present).

    Returns:
        True when ffmpeg exits with status 0; False on any non-zero exit status
        or when the ffmpeg executable cannot be started.
    """
    import subprocess  # local import so this definition stays self-contained

    cmd = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-i", video_path,
        "-ar", "16000", "-ac", "1", "-vn",
        output_wav_path,
    ]
    try:
        return subprocess.run(cmd).returncode == 0
    except OSError:
        # ffmpeg missing or not executable: report failure like a non-zero exit.
        return False

def calculate_vad_ratio(wav_path):
    """Return the fraction of complete 30 ms frames that the VAD marks as speech.

    Only complete frames take part in the ratio. A trailing partial frame is
    never passed to the detector, so it is excluded from the denominator as
    well (counting it would bias the ratio downwards).

    Args:
        wav_path: Path of a WAV file accepted by ``read_wave``.

    Returns:
        A float in [0.0, 1.0]. 0.0 is returned when the audio holds no complete
        frame, and also when reading or classification fails (the error is
        printed rather than raised).
    """
    try:
        audio, sample_rate = read_wave(wav_path)
        vad = webrtcvad.Vad(2)  # aggressiveness mode handed to webrtcvad
        frame_duration = 30  # ms
        # Frame length in bytes: samples per frame times 2 bytes per 16-bit sample.
        frame_size = int(sample_rate * frame_duration / 1000) * 2
        # Stop before any trailing chunk that is shorter than a full frame.
        last_start = len(audio) - frame_size + 1
        frames = [audio[i:i + frame_size] for i in range(0, last_start, frame_size)]
        if not frames:
            return 0.0
        voiced = sum(1 for frame in frames if vad.is_speech(frame, sample_rate))
        return voiced / len(frames)
    except Exception as e:
        # Best effort: a clip that cannot be analysed is scored as silent.
        print(f"[ERROR] VAD failed for {wav_path}: {e}")
        return 0.0


# Gather every .mp4 in the source folder in sorted order.
video_files = sorted(name for name in os.listdir(video_dir) if name.endswith(".mp4"))
total_videos = len(video_files)
print(f" Total videos to process: {total_videos}")

kept_total = 0
dropped_total = 0

# Walk the list in BATCH_SIZE chunks; each chunk reports its own statistics.
for batch_number, batch_start in enumerate(range(0, total_videos, BATCH_SIZE), start=1):
    batch = video_files[batch_start:batch_start + BATCH_SIZE]
    print(f"\n Processing batch {batch_number} ({len(batch)} videos)...\n")

    kept_count = 0
    dropped_count = 0

    for video_file in tqdm(batch):
        video_path = os.path.join(video_dir, video_file)
        temp_wav = "temp_vad.wav"

        # A clip whose audio cannot be extracted is counted as dropped.
        if not extract_audio_ffmpeg(video_path, temp_wav):
            print(f"[ERROR] Failed to extract audio: {video_file}")
            dropped_count += 1
            continue

        ratio = calculate_vad_ratio(temp_wav)
        os.remove(temp_wav)

        if ratio >= SPEECH_RATIO_THRESHOLD:
            # Enough speech: copy the original video into the filtered folder.
            shutil.copy(video_path, os.path.join(filtered_output_dir, video_file))
            kept_count += 1
        else:
            dropped_count += 1

    kept_total += kept_count
    dropped_total += dropped_count

    print(f" Batch done: Kept {kept_count}, Dropped {dropped_count}")

print("\n All Batches Completed!")
print(f" Total Kept: {kept_total} (speech_ratio >= {SPEECH_RATIO_THRESHOLD})")
print(f" Total Dropped: {dropped_total} (too silent or failed)")


 Total videos to process: 10166

 Processing batch 1 (100 videos)...



100%|██████████| 100/100 [00:58<00:00,  1.72it/s]


 Batch done: Kept 77, Dropped 23

 Processing batch 2 (100 videos)...



  2%|▏         | 2/100 [02:33<2:27:30, 90.31s/it]

[ERROR] Failed to extract audio: 10127.mp4


  3%|▎         | 3/100 [03:28<1:59:54, 74.17s/it]

[ERROR] Failed to extract audio: 10129.mp4


  4%|▍         | 4/100 [04:37<1:55:12, 72.00s/it]

[ERROR] Failed to extract audio: 1013.mp4


100%|██████████| 100/100 [05:15<00:00,  3.15s/it]


 Batch done: Kept 72, Dropped 28

 Processing batch 3 (100 videos)...



100%|██████████| 100/100 [00:31<00:00,  3.20it/s]


 Batch done: Kept 81, Dropped 19

 Processing batch 4 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.76it/s]


 Batch done: Kept 73, Dropped 27

 Processing batch 5 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.90it/s]


 Batch done: Kept 66, Dropped 34

 Processing batch 6 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.78it/s]


 Batch done: Kept 73, Dropped 27

 Processing batch 7 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.08it/s]


 Batch done: Kept 69, Dropped 31

 Processing batch 8 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.55it/s]


 Batch done: Kept 75, Dropped 25

 Processing batch 9 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.97it/s]


 Batch done: Kept 53, Dropped 47

 Processing batch 10 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.58it/s]


 Batch done: Kept 67, Dropped 33

 Processing batch 11 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.92it/s]


 Batch done: Kept 78, Dropped 22

 Processing batch 12 (100 videos)...



100%|██████████| 100/100 [00:18<00:00,  5.39it/s]


 Batch done: Kept 62, Dropped 38

 Processing batch 13 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.17it/s]


 Batch done: Kept 71, Dropped 29

 Processing batch 14 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.04it/s]


 Batch done: Kept 75, Dropped 25

 Processing batch 15 (100 videos)...



100%|██████████| 100/100 [00:22<00:00,  4.53it/s]


 Batch done: Kept 62, Dropped 38

 Processing batch 16 (100 videos)...



100%|██████████| 100/100 [00:18<00:00,  5.32it/s]


 Batch done: Kept 70, Dropped 30

 Processing batch 17 (100 videos)...



100%|██████████| 100/100 [00:22<00:00,  4.48it/s]


 Batch done: Kept 64, Dropped 36

 Processing batch 18 (100 videos)...



100%|██████████| 100/100 [00:22<00:00,  4.49it/s]


 Batch done: Kept 68, Dropped 32

 Processing batch 19 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.18it/s]


 Batch done: Kept 63, Dropped 37

 Processing batch 20 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.59it/s]


 Batch done: Kept 57, Dropped 43

 Processing batch 21 (100 videos)...



100%|██████████| 100/100 [00:22<00:00,  4.53it/s]


 Batch done: Kept 51, Dropped 49

 Processing batch 22 (100 videos)...



100%|██████████| 100/100 [00:23<00:00,  4.23it/s]


 Batch done: Kept 48, Dropped 52

 Processing batch 23 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.13it/s]


 Batch done: Kept 66, Dropped 34

 Processing batch 24 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.77it/s]


 Batch done: Kept 71, Dropped 29

 Processing batch 25 (100 videos)...



100%|██████████| 100/100 [00:24<00:00,  4.00it/s]


 Batch done: Kept 49, Dropped 51

 Processing batch 26 (100 videos)...



100%|██████████| 100/100 [00:18<00:00,  5.28it/s]


 Batch done: Kept 38, Dropped 62

 Processing batch 27 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.89it/s]


 Batch done: Kept 60, Dropped 40

 Processing batch 28 (100 videos)...



100%|██████████| 100/100 [00:18<00:00,  5.30it/s]


 Batch done: Kept 74, Dropped 26

 Processing batch 29 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.25it/s]


 Batch done: Kept 78, Dropped 22

 Processing batch 30 (100 videos)...



100%|██████████| 100/100 [00:22<00:00,  4.42it/s]


 Batch done: Kept 47, Dropped 53

 Processing batch 31 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.56it/s]


 Batch done: Kept 69, Dropped 31

 Processing batch 32 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.55it/s]


 Batch done: Kept 64, Dropped 36

 Processing batch 33 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.85it/s]


 Batch done: Kept 69, Dropped 31

 Processing batch 34 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.87it/s]


 Batch done: Kept 58, Dropped 42

 Processing batch 35 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.09it/s]


 Batch done: Kept 59, Dropped 41

 Processing batch 36 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.71it/s]


 Batch done: Kept 64, Dropped 36

 Processing batch 37 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.05it/s]


 Batch done: Kept 54, Dropped 46

 Processing batch 38 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.81it/s]


 Batch done: Kept 72, Dropped 28

 Processing batch 39 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.58it/s]


 Batch done: Kept 65, Dropped 35

 Processing batch 40 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.21it/s]


 Batch done: Kept 59, Dropped 41

 Processing batch 41 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.18it/s]


 Batch done: Kept 78, Dropped 22

 Processing batch 42 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.86it/s]


 Batch done: Kept 42, Dropped 58

 Processing batch 43 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.71it/s]


 Batch done: Kept 68, Dropped 32

 Processing batch 44 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.09it/s]


 Batch done: Kept 68, Dropped 32

 Processing batch 45 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.63it/s]


 Batch done: Kept 58, Dropped 42

 Processing batch 46 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.19it/s]


 Batch done: Kept 50, Dropped 50

 Processing batch 47 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.63it/s]


 Batch done: Kept 55, Dropped 45

 Processing batch 48 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.91it/s]


 Batch done: Kept 63, Dropped 37

 Processing batch 49 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.06it/s]


 Batch done: Kept 87, Dropped 13

 Processing batch 50 (100 videos)...



100%|██████████| 100/100 [00:23<00:00,  4.33it/s]


 Batch done: Kept 52, Dropped 48

 Processing batch 51 (100 videos)...



100%|██████████| 100/100 [00:22<00:00,  4.46it/s]


 Batch done: Kept 49, Dropped 51

 Processing batch 52 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.72it/s]


 Batch done: Kept 69, Dropped 31

 Processing batch 53 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.03it/s]


 Batch done: Kept 74, Dropped 26

 Processing batch 54 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.64it/s]


 Batch done: Kept 66, Dropped 34

 Processing batch 55 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.95it/s]


 Batch done: Kept 59, Dropped 41

 Processing batch 56 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.80it/s]


 Batch done: Kept 56, Dropped 44

 Processing batch 57 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.18it/s]


 Batch done: Kept 59, Dropped 41

 Processing batch 58 (100 videos)...



100%|██████████| 100/100 [00:22<00:00,  4.44it/s]


 Batch done: Kept 54, Dropped 46

 Processing batch 59 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.99it/s]


 Batch done: Kept 78, Dropped 22

 Processing batch 60 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.19it/s]


 Batch done: Kept 59, Dropped 41

 Processing batch 61 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.99it/s]


 Batch done: Kept 59, Dropped 41

 Processing batch 62 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.22it/s]


 Batch done: Kept 58, Dropped 42

 Processing batch 63 (100 videos)...



100%|██████████| 100/100 [00:18<00:00,  5.27it/s]


 Batch done: Kept 65, Dropped 35

 Processing batch 64 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.26it/s]


 Batch done: Kept 69, Dropped 31

 Processing batch 65 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.85it/s]


 Batch done: Kept 66, Dropped 34

 Processing batch 66 (100 videos)...



100%|██████████| 100/100 [00:22<00:00,  4.38it/s]


 Batch done: Kept 71, Dropped 29

 Processing batch 67 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.67it/s]


 Batch done: Kept 61, Dropped 39

 Processing batch 68 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.89it/s]


 Batch done: Kept 68, Dropped 32

 Processing batch 69 (100 videos)...



100%|██████████| 100/100 [00:28<00:00,  3.57it/s]


 Batch done: Kept 80, Dropped 20

 Processing batch 70 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.64it/s]


 Batch done: Kept 67, Dropped 33

 Processing batch 71 (100 videos)...



100%|██████████| 100/100 [00:22<00:00,  4.41it/s]


 Batch done: Kept 74, Dropped 26

 Processing batch 72 (100 videos)...



100%|██████████| 100/100 [00:22<00:00,  4.51it/s]


 Batch done: Kept 52, Dropped 48

 Processing batch 73 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.64it/s]


 Batch done: Kept 46, Dropped 54

 Processing batch 74 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.83it/s]


 Batch done: Kept 50, Dropped 50

 Processing batch 75 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.05it/s]


 Batch done: Kept 78, Dropped 22

 Processing batch 76 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.93it/s]


 Batch done: Kept 65, Dropped 35

 Processing batch 77 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.97it/s]


 Batch done: Kept 58, Dropped 42

 Processing batch 78 (100 videos)...



100%|██████████| 100/100 [00:23<00:00,  4.31it/s]


 Batch done: Kept 59, Dropped 41

 Processing batch 79 (100 videos)...



100%|██████████| 100/100 [00:18<00:00,  5.53it/s]


 Batch done: Kept 87, Dropped 13

 Processing batch 80 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.77it/s]


 Batch done: Kept 57, Dropped 43

 Processing batch 81 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.75it/s]


 Batch done: Kept 46, Dropped 54

 Processing batch 82 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.80it/s]


 Batch done: Kept 58, Dropped 42

 Processing batch 83 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.99it/s]


 Batch done: Kept 71, Dropped 29

 Processing batch 84 (100 videos)...



100%|██████████| 100/100 [00:18<00:00,  5.30it/s]


 Batch done: Kept 69, Dropped 31

 Processing batch 85 (100 videos)...



100%|██████████| 100/100 [00:22<00:00,  4.42it/s]


 Batch done: Kept 70, Dropped 30

 Processing batch 86 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.73it/s]


 Batch done: Kept 69, Dropped 31

 Processing batch 87 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.95it/s]


 Batch done: Kept 49, Dropped 51

 Processing batch 88 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.73it/s]


 Batch done: Kept 73, Dropped 27

 Processing batch 89 (100 videos)...



100%|██████████| 100/100 [00:23<00:00,  4.32it/s]


 Batch done: Kept 76, Dropped 24

 Processing batch 90 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.92it/s]


 Batch done: Kept 65, Dropped 35

 Processing batch 91 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.16it/s]


 Batch done: Kept 71, Dropped 29

 Processing batch 92 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.67it/s]


 Batch done: Kept 44, Dropped 56

 Processing batch 93 (100 videos)...



100%|██████████| 100/100 [00:19<00:00,  5.02it/s]


 Batch done: Kept 69, Dropped 31

 Processing batch 94 (100 videos)...



100%|██████████| 100/100 [00:22<00:00,  4.45it/s]


 Batch done: Kept 67, Dropped 33

 Processing batch 95 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.74it/s]


 Batch done: Kept 84, Dropped 16

 Processing batch 96 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.65it/s]


 Batch done: Kept 80, Dropped 20

 Processing batch 97 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.87it/s]


 Batch done: Kept 73, Dropped 27

 Processing batch 98 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.71it/s]


 Batch done: Kept 55, Dropped 45

 Processing batch 99 (100 videos)...



100%|██████████| 100/100 [00:23<00:00,  4.22it/s]


 Batch done: Kept 66, Dropped 34

 Processing batch 100 (100 videos)...



100%|██████████| 100/100 [00:21<00:00,  4.71it/s]


 Batch done: Kept 55, Dropped 45

 Processing batch 101 (100 videos)...



100%|██████████| 100/100 [00:20<00:00,  4.80it/s]


 Batch done: Kept 74, Dropped 26

 Processing batch 102 (66 videos)...



100%|██████████| 66/66 [00:12<00:00,  5.14it/s]

 Batch done: Kept 51, Dropped 15

 All Batches Completed!
 Total Kept: 6560 (speech_ratio >= 0.85)
 Total Dropped: 3606 (too silent or failed)





In [None]:
import os
import cv2
import shutil
from tqdm import tqdm

# Input: the folder written by the speech (VAD) filter cell above.
# Output: folder receiving the clips that also pass the face-presence filter.
video_dir = "/content/drive/MyDrive/urfunny2_audio_filtered_videos_vad"
filtered_output_dir = "/content/drive/MyDrive/urfunny2_filtered_videos_visual_opencv"
os.makedirs(filtered_output_dir, exist_ok=True)

# Haar cascade loaded from OpenCV's bundled data directory; created once and
# read as a module-level global by face_detect_ratio.
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

def face_detect_ratio(video_path, sample_rate=2, min_ratio=0.6):
    """Return the fraction of sampled frames in which at least one face is detected.

    Frames are read sequentially and every ``interval``-th frame is analysed,
    where ``interval = max(1, int(fps // sample_rate))``.

    Args:
        video_path: Path of the video to analyse.
        sample_rate: Target number of analysed frames per second of video.
        min_ratio: Not used by this function; retained so existing calls keep
            working. The caller applies its own threshold to the result.

    Returns:
        ``face_frames / total_sampled`` as a float, or 0.0 when no frame could
        be sampled (for example when the video cannot be read).
    """
    cap = cv2.VideoCapture(video_path)
    total_sampled = 0
    face_frames = 0
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        interval = max(1, int(fps // sample_rate))
        idx = 0

        while True:
            success, frame = cap.read()
            if not success:
                break
            if idx % interval == 0:
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                # Positional arguments are scaleFactor=1.3 and minNeighbors=5.
                faces = face_cascade.detectMultiScale(gray, 1.3, 5)
                if len(faces) > 0:
                    face_frames += 1
                total_sampled += 1
            idx += 1
    finally:
        # Release the capture even when frame processing raises, so a failing
        # clip does not leak its decoder handle.
        cap.release()

    if total_sampled == 0:
        return 0.0
    return face_frames / total_sampled

# Gather the clips produced by the previous stage in sorted order.
video_files = sorted(name for name in os.listdir(video_dir) if name.endswith(".mp4"))
print(f" Total videos to process: {len(video_files)}")
kept = 0
dropped = 0

for vid in tqdm(video_files):
    path = os.path.join(video_dir, vid)
    try:
        # Keep the clip when a face is found in at least 60% of sampled frames.
        has_enough_faces = face_detect_ratio(path) >= 0.6
        if has_enough_faces:
            shutil.copy(path, os.path.join(filtered_output_dir, vid))
    except Exception as e:
        # Any failure while analysing or copying counts the clip as dropped.
        print(f"[ERROR] {vid}: {e}")
        dropped += 1
        continue

    if has_enough_faces:
        kept += 1
    else:
        dropped += 1

print(f"\n Done! Kept: {kept}, Dropped: {dropped}")


 Total videos to process: 6609


100%|██████████| 6609/6609 [3:35:35<00:00,  1.96s/it]


 Done! Kept: 1329, Dropped: 5280





In [None]:
import cv2
import os
import numpy as np
from tqdm import tqdm
import shutil

# NOTE(review): this input path differs from the folder written by the face
# filter cell ("urfunny2_filtered_videos_visual_opencv"). Confirm the folder was
# renamed or copied on Drive; otherwise this stage reads a different set of clips.
video_dir = "/content/drive/MyDrive/urfunny2_filtered_videos"
output_dir = "/content/drive/MyDrive/urfunny2_final_videos_motion_secondwise"
os.makedirs(output_dir, exist_ok=True)

# Two consecutive one-second samples count as "static" when their mean absolute
# gray-level difference is below this value.
PIXEL_DIFF_THRESHOLD = 5.0
# A clip is flagged static when at least this fraction of sample pairs is static.
STATIC_RATIO_THRESHOLD = 0.8
# Clips shorter than this many seconds are not analysed (static_video_detect returns False).
MIN_DURATION = 3

def static_video_detect(video_path):
    """Decide whether a video is mostly static, sampling one frame per second.

    One grayscale frame is taken at the start of every whole second. Each pair
    of consecutive samples is "static" when its mean absolute pixel difference
    is below ``PIXEL_DIFF_THRESHOLD``; the video is static when the share of
    static pairs reaches ``STATIC_RATIO_THRESHOLD``.

    Args:
        video_path: Path of the video to analyse.

    Returns:
        True when the video should be treated as static. This includes videos
        that report no positive frame rate and videos yielding fewer than two
        readable samples. False for videos shorter than ``MIN_DURATION``
        seconds and for videos with enough motion.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if not fps > 0:
            # No usable frame rate (e.g. the file could not be opened): the
            # duration is undefined, so treat it like a clip with no samples
            # instead of dividing by zero.
            return True

        duration = total_frames / fps
        if duration < MIN_DURATION:
            return False

        sampled_frames = []
        for second in range(int(duration)):
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(fps * second))
            ret, frame = cap.read()
            if not ret:
                continue
            sampled_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
    finally:
        # Always free the capture, including when decoding raises.
        cap.release()

    if len(sampled_frames) < 2:
        return True

    # Frames are unsigned 8-bit: subtracting them directly wraps around
    # (10 - 12 == 254), so widen to a signed type before taking the difference.
    diffs = [
        np.mean(np.abs(curr.astype(np.int16) - prev.astype(np.int16)))
        for prev, curr in zip(sampled_frames, sampled_frames[1:])
    ]
    static_count = sum(d < PIXEL_DIFF_THRESHOLD for d in diffs)
    static_ratio = static_count / len(diffs)

    return bool(static_ratio >= STATIC_RATIO_THRESHOLD)

# Gather the candidate clips in sorted order.
video_files = sorted(name for name in os.listdir(video_dir) if name.endswith('.mp4'))
kept = 0
dropped = 0

for fname in tqdm(video_files, desc="Second-wise Motion Filtering"):
    fpath = os.path.join(video_dir, fname)

    # Static clips are only counted and reported; they are not copied.
    if static_video_detect(fpath):
        dropped += 1
        print(f"[DROP] {fname} - Too many static seconds")
        continue

    shutil.copy(fpath, os.path.join(output_dir, fname))
    kept += 1

print(f"\n Second-wise filtering completed: Kept {kept}, Dropped {dropped}")


Second-wise Motion Filtering:  15%|█▍        | 194/1329 [01:54<09:45,  1.94it/s]

[DROP] 11561.mp4 - Too many static seconds


Second-wise Motion Filtering:  16%|█▌        | 208/1329 [01:57<03:55,  4.76it/s]

[DROP] 11749.mp4 - Too many static seconds


Second-wise Motion Filtering:  27%|██▋       | 360/1329 [03:13<07:42,  2.09it/s]

[DROP] 13458.mp4 - Too many static seconds


Second-wise Motion Filtering:  64%|██████▍   | 857/1329 [06:56<02:55,  2.68it/s]

[DROP] 486.mp4 - Too many static seconds


Second-wise Motion Filtering:  65%|██████▌   | 869/1329 [07:01<03:39,  2.10it/s]

[DROP] 49.mp4 - Too many static seconds


Second-wise Motion Filtering:  70%|██████▉   | 928/1329 [07:26<04:19,  1.55it/s]

[DROP] 5531.mp4 - Too many static seconds


Second-wise Motion Filtering: 100%|██████████| 1329/1329 [10:33<00:00,  2.10it/s]


 Second-wise filtering completed: Kept 1323, Dropped 6





In [None]:
import os
import cv2
import numpy as np
from tqdm import tqdm
import shutil
import pandas as pd

# Input: the folder written by the motion filter cell; output: clips that are
# not dominated by slide-like frames.
video_dir = "/content/drive/MyDrive/urfunny2_final_videos_motion_secondwise"
output_dir = "/content/drive/MyDrive/urfunny2_final_videos_no_ppt"
os.makedirs(output_dir, exist_ok=True)


# Step, in seconds, between the timestamps sampled by video_most_ppt.
SAMPLE_RATE = 1
# A clip is dropped when more than this fraction of its sampled frames is slide-like.
PPT_THRESHOLD = 0.3
# A frame is slide-like only if fewer than this fraction of its pixels are Canny edges ...
PIXEL_EDGE_RATIO = 0.02
# ... and the standard deviation over all of its pixel values is below this.
COLOR_STD_THRESHOLD = 15

def ppt_frame_detect(image):
    """Tell whether a single frame looks like a presentation slide.

    A frame qualifies when it has both few edges (share of non-zero Canny
    pixels below ``PIXEL_EDGE_RATIO``) and little variation in pixel values
    (standard deviation over the whole image below ``COLOR_STD_THRESHOLD``).

    Args:
        image: Frame as returned by ``cv2.VideoCapture.read``; it is converted
            with ``cv2.COLOR_BGR2GRAY`` before edge detection.

    Returns:
        A truthy value when both conditions hold, falsy otherwise.
    """
    edge_map = cv2.Canny(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), 50, 150)
    edge_pixels = (edge_map > 0).sum()
    few_edges = edge_pixels / edge_map.size < PIXEL_EDGE_RATIO
    flat_colors = np.std(image) < COLOR_STD_THRESHOLD
    return few_edges and flat_colors

def video_most_ppt(video_path):
    """Decide whether a video is dominated by slide-like frames.

    One frame is sampled every ``SAMPLE_RATE`` seconds (seeking by timestamp)
    and classified with ``ppt_frame_detect``.

    Args:
        video_path: Path of the video to analyse.

    Returns:
        True when the share of slide-like frames among the successfully read
        samples exceeds ``PPT_THRESHOLD``. False otherwise, including when no
        frame could be sampled (unreadable video, zero frame rate, or a
        duration under one second).
    """
    cap = cv2.VideoCapture(video_path)
    sampled_ppt_count = 0
    total_sampled = 0
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Whole seconds of video; 0 when the frame rate is unavailable.
        duration = int(total_frames / fps) if fps > 0 else 0

        for t in range(0, duration, SAMPLE_RATE):
            cap.set(cv2.CAP_PROP_POS_MSEC, t * 1000)
            ret, frame = cap.read()
            if not ret:
                continue
            total_sampled += 1
            if ppt_frame_detect(frame):
                sampled_ppt_count += 1
    finally:
        # Release the capture even if classification of a frame raises.
        cap.release()

    if total_sampled == 0:
        return False
    return (sampled_ppt_count / total_sampled) > PPT_THRESHOLD

# Gather the candidate clips in sorted order.
video_files = sorted(name for name in os.listdir(video_dir) if name.endswith(".mp4"))
kept = 0
dropped = 0

for fname in tqdm(video_files, desc="PPT Filtering"):
    fpath = os.path.join(video_dir, fname)

    # Slide-dominated clips are only counted; everything else is copied over.
    if video_most_ppt(fpath):
        dropped += 1
        continue

    shutil.copy(fpath, os.path.join(output_dir, fname))
    kept += 1

# One-row table summarising this stage.
summary_columns = {
    "Total Videos": [len(video_files)],
    "Kept": [kept],
    "Dropped (PPT Dominated)": [dropped],
}
summary_df = pd.DataFrame(summary_columns)
print(summary_df)

PPT Filtering: 100%|██████████| 1323/1323 [10:06<00:00,  2.18it/s]

   Total Videos  Kept  Dropped (PPT Dominated)
0          1323  1322                        1





In [None]:
import os

# Count the .mp4 files left in the output folder of the last filtering stage.
final_video_dir = "/content/drive/MyDrive/urfunny2_final_videos_no_ppt"
video_files = [
    entry
    for entry in os.listdir(final_video_dir)
    if entry.endswith(".mp4")
]
print(f" Total videos remaining after all filters: {len(video_files)}")


 Total videos remaining after all filters: 1322
