In [12]:
import torch
print("PyTorch version:", torch.__version__)
print("CUDA version (compiled):", torch.version.cuda)
print("Is CUDA available?", torch.cuda.is_available())
print("GPU count:", torch.cuda.device_count())
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i} name:", torch.cuda.get_device_name(i))


PyTorch version: 2.6.0+cu118
CUDA version (compiled): 11.8
Is CUDA available? True
GPU count: 1
GPU 0 name: NVIDIA GeForce RTX 4070 SUPER


In [2]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("friedrichor/MSR-VTT", "test_1k")

In [11]:
print("Number of samples in the dataset:", len(ds))
print("Keys in the first sample:", ds['test'][0].keys())
print("First sample data:", ds['test'][0])
print("First sample video path:", ds['test'][0]['video'])
print("First sample caption:", ds['test'][0]['caption'])
print("URL:", ds['test'][0]['url'])

Number of samples in the dataset: 1
Keys in the first sample: dict_keys(['video_id', 'video', 'caption', 'source', 'category', 'url', 'start time', 'end time', 'id'])
First sample data: {'video_id': 'video7020', 'video': 'video7020.mp4', 'caption': 'a woman creating a fondant baby and flower', 'source': 'MSR-VTT', 'category': 10, 'url': 'https://www.youtube.com/watch?v=4KRMJNBjrEs', 'start time': 221.77, 'end time': 233.08, 'id': 7020}
First sample video path: video7020.mp4
First sample caption: a woman creating a fondant baby and flower
URL: https://www.youtube.com/watch?v=4KRMJNBjrEs


In [3]:
import os
import cv2
from yt_dlp import YoutubeDL

def ensure_folder(folder_path):
    """Create the folder if it does not exist."""
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

def download_video(url, output_filepath):
    """
    Download a YouTube video using yt-dlp.
    
    Parameters:
        url (str): The YouTube video URL.
        output_filepath (str): Full path (folder + filename) for saving the downloaded video.
    """
    ydl_opts = {
        'outtmpl': output_filepath,  # Output file path template
        'format': 'mp4'              # Request mp4 format if available
    }
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

def trim_video_cv2(input_file, output_file, start_time, end_time):
    """
    Trim a video using OpenCV by extracting frames between start_time and end_time.
    
    Note: Audio is not processed, so the resulting video will be silent.
    
    Parameters:
        input_file (str): Path to the input video file.
        output_file (str): Path to save the trimmed video.
        start_time (float): Start time in seconds.
        end_time (float): End time in seconds.
    """
    cap = cv2.VideoCapture(input_file)
    if not cap.isOpened():
        raise Exception("Error opening video file.")
    
    fps = cap.get(cv2.CAP_PROP_FPS)
    if fps == 0:
        raise Exception("Could not determine FPS of the video.")

    # Calculate frame indices for the start and end times
    start_frame = int(start_time * fps)
    end_frame = int(end_time * fps)
    
    # Get video frame dimensions
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    
    # Define the codec and create a VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))
    
    current_frame = 0
    while True:
        ret, frame = cap.read()
        if not ret or current_frame >= end_frame:
            break
        # Only write frames between start_frame and end_frame
        if current_frame >= start_frame:
            out.write(frame)
        current_frame += 1

    cap.release()
    out.release()


if __name__ == "__main__":
    folder_name = "MSR-VTT"
    ensure_folder(folder_name)
    
    for i, instance in enumerate(ds['test'], start=1):
        # Construct the output filename for the trimmed video
        trimmed_video_filename = f"{instance['video_id']}.mp4"
        trimmed_video_filepath = os.path.join(folder_name, trimmed_video_filename)
        
        # Check if the trimmed video already exists; if so, skip processing this instance
        if os.path.exists(trimmed_video_filepath):
            print(f"Video {instance['video_id']} already downloaded. Skipping...")
            continue
        
        video_url = instance['url']
        full_video_filename = instance['video'] + "1"
        full_video_filepath = os.path.join(folder_name, full_video_filename)
        start_time = instance['start time']
        end_time = instance['end time']
        
        try:
            print(f"Processing video {instance['video_id']}...")
            print("Downloading full video...")
            download_video(video_url, full_video_filepath)
            
            print("Trimming video using OpenCV...")
            trim_video_cv2(full_video_filepath, trimmed_video_filepath, start_time, end_time)
            
            # Delete the full-length video after trimming
            if os.path.exists(full_video_filepath):
                os.remove(full_video_filepath)
                print(f"Deleted the full-length video: {full_video_filepath}")
            
            print(f"Trimmed video saved as: {trimmed_video_filepath}")
        
        except Exception as e:
            print(f"Skipping video {instance['video_id']} due to error: {e}")
            # Optionally, remove any partially downloaded files
            if os.path.exists(full_video_filepath):
                os.remove(full_video_filepath)
            continue


Video video7020 already downloaded. Skipping...
Video video7021 already downloaded. Skipping...
Video video7024 already downloaded. Skipping...
Processing video video7025...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=-iEnLnezuxE
[youtube] -iEnLnezuxE: Downloading webpage
[youtube] -iEnLnezuxE: Downloading tv client config
[youtube] -iEnLnezuxE: Downloading player 69f581a5
[youtube] -iEnLnezuxE: Downloading tv player API JSON
[youtube] -iEnLnezuxE: Downloading ios player API JSON


ERROR: [youtube] -iEnLnezuxE: Video unavailable. This video is no longer available due to a copyright claim by BroadviewPictures


Skipping video video7025 due to error: ERROR: [youtube] -iEnLnezuxE: Video unavailable. This video is no longer available due to a copyright claim by BroadviewPictures
Video video7026 already downloaded. Skipping...
Video video7027 already downloaded. Skipping...
Video video7028 already downloaded. Skipping...
Video video7029 already downloaded. Skipping...
Video video7034 already downloaded. Skipping...
Processing video video7035...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=yx0PUXjlpZg
[youtube] yx0PUXjlpZg: Downloading webpage
[youtube] yx0PUXjlpZg: Downloading tv client config
[youtube] yx0PUXjlpZg: Downloading player 69f581a5
[youtube] yx0PUXjlpZg: Downloading tv player API JSON
[youtube] yx0PUXjlpZg: Downloading ios player API JSON


ERROR: [youtube] yx0PUXjlpZg: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies


Skipping video video7035 due to error: ERROR: [youtube] yx0PUXjlpZg: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies
Video video7060 already downloaded. Skipping...
Video video7061 already downloaded. Skipping...
Video video7064 already downloaded. Skipping...
Video video7110 already downloaded. Skipping...
Video video7111 already downloaded. Skipping...
Processing video video7112...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=ul7WMqboxbA
[youtube] ul7WMqboxbA: Downloading webpage
[youtube] ul7WMqboxbA: Downloading tv client config
[youtube] ul7WMqboxbA: Downloading player 69f581a5
[youtube] ul7WMqboxbA: Downloading tv

ERROR: [youtube] ul7WMqboxbA: Sign in to confirm your age. This video may be inappropriate for some users. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies


Skipping video video7112 due to error: ERROR: [youtube] ul7WMqboxbA: Sign in to confirm your age. This video may be inappropriate for some users. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies
Video video7113 already downloaded. Skipping...
Video video7114 already downloaded. Skipping...
Video video7115 already downloaded. Skipping...
Video video7116 already downloaded. Skipping...
Video video7117 already downloaded. Skipping...
Video video7118 already downloaded. Skipping...
Video video7119 already downloaded. Skipping...
Processing video video7131...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=VNxNuFNeHHg
[youtube] VNxNuFNeHHg: Downloading webpage
[youtube] VNxNuFNeHHg: Downloading 

ERROR: [youtube] VNxNuFNeHHg: Video unavailable


Skipping video video7131 due to error: ERROR: [youtube] VNxNuFNeHHg: Video unavailable
Video video7134 already downloaded. Skipping...
Video video7135 already downloaded. Skipping...
Processing video video7137...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=2ZiTb7uir2c
[youtube] 2ZiTb7uir2c: Downloading webpage
[youtube] 2ZiTb7uir2c: Downloading tv client config
[youtube] 2ZiTb7uir2c: Downloading player 69f581a5
[youtube] 2ZiTb7uir2c: Downloading tv player API JSON
[youtube] 2ZiTb7uir2c: Downloading ios player API JSON


ERROR: [youtube] 2ZiTb7uir2c: Video unavailable


Skipping video video7137 due to error: ERROR: [youtube] 2ZiTb7uir2c: Video unavailable
Processing video video7138...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=2McWGF6zqKc
[youtube] 2McWGF6zqKc: Downloading webpage
[youtube] 2McWGF6zqKc: Downloading tv client config
[youtube] 2McWGF6zqKc: Downloading player 69f581a5
[youtube] 2McWGF6zqKc: Downloading tv player API JSON
[youtube] 2McWGF6zqKc: Downloading ios player API JSON


ERROR: [youtube] 2McWGF6zqKc: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies


Skipping video video7138 due to error: ERROR: [youtube] 2McWGF6zqKc: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies
Video video7140 already downloaded. Skipping...
Processing video video7141...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=yiMb0tXIuI0
[youtube] yiMb0tXIuI0: Downloading webpage
[youtube] yiMb0tXIuI0: Downloading tv client config
[youtube] yiMb0tXIuI0: Downloading player 69f581a5
[youtube] yiMb0tXIuI0: Downloading tv player API JSON
[youtube] yiMb0tXIuI0: Downloading ios player API JSON


ERROR: [youtube] yiMb0tXIuI0: Video unavailable. This video is no longer available because the YouTube account associated with this video has been terminated.


Skipping video video7141 due to error: ERROR: [youtube] yiMb0tXIuI0: Video unavailable. This video is no longer available because the YouTube account associated with this video has been terminated.
Video video7142 already downloaded. Skipping...
Video video7143 already downloaded. Skipping...
Processing video video7144...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=81YNEOAShvM
[youtube] 81YNEOAShvM: Downloading webpage
[youtube] 81YNEOAShvM: Downloading tv client config
[youtube] 81YNEOAShvM: Downloading player 69f581a5
[youtube] 81YNEOAShvM: Downloading tv player API JSON
[youtube] 81YNEOAShvM: Downloading ios player API JSON


ERROR: [youtube] 81YNEOAShvM: Video unavailable. This video is no longer available because the YouTube account associated with this video has been terminated.


Skipping video video7144 due to error: ERROR: [youtube] 81YNEOAShvM: Video unavailable. This video is no longer available because the YouTube account associated with this video has been terminated.
Video video7145 already downloaded. Skipping...
Processing video video7146...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=_TyIA0rKTWo
[youtube] _TyIA0rKTWo: Downloading webpage
[youtube] _TyIA0rKTWo: Downloading tv client config
[youtube] _TyIA0rKTWo: Downloading player 69f581a5
[youtube] _TyIA0rKTWo: Downloading tv player API JSON
[youtube] _TyIA0rKTWo: Downloading ios player API JSON


ERROR: [youtube] _TyIA0rKTWo: Video unavailable. This video is no longer available because the YouTube account associated with this video has been terminated.


Skipping video video7146 due to error: ERROR: [youtube] _TyIA0rKTWo: Video unavailable. This video is no longer available because the YouTube account associated with this video has been terminated.
Processing video video7147...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=Iu2ZPHLqXuY
[youtube] Iu2ZPHLqXuY: Downloading webpage
[youtube] Iu2ZPHLqXuY: Downloading tv client config
[youtube] Iu2ZPHLqXuY: Downloading player 69f581a5
[youtube] Iu2ZPHLqXuY: Downloading tv player API JSON
[youtube] Iu2ZPHLqXuY: Downloading ios player API JSON


ERROR: [youtube] Iu2ZPHLqXuY: Video unavailable. This video has been removed by the uploader


Skipping video video7147 due to error: ERROR: [youtube] Iu2ZPHLqXuY: Video unavailable. This video has been removed by the uploader
Video video7148 already downloaded. Skipping...
Video video7149 already downloaded. Skipping...
Processing video video7150...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=vS0hkIe12Jg
[youtube] vS0hkIe12Jg: Downloading webpage
[youtube] vS0hkIe12Jg: Downloading tv client config
[youtube] vS0hkIe12Jg: Downloading player 69f581a5
[youtube] vS0hkIe12Jg: Downloading tv player API JSON
[youtube] vS0hkIe12Jg: Downloading ios player API JSON


ERROR: [youtube] vS0hkIe12Jg: Video unavailable


Skipping video video7150 due to error: ERROR: [youtube] vS0hkIe12Jg: Video unavailable
Video video7151 already downloaded. Skipping...
Video video7152 already downloaded. Skipping...
Video video7153 already downloaded. Skipping...
Video video7154 already downloaded. Skipping...
Video video7155 already downloaded. Skipping...
Video video7156 already downloaded. Skipping...
Processing video video7157...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=6K1cTDs6O9o
[youtube] 6K1cTDs6O9o: Downloading webpage
[youtube] 6K1cTDs6O9o: Downloading tv client config
[youtube] 6K1cTDs6O9o: Downloading player 69f581a5
[youtube] 6K1cTDs6O9o: Downloading tv player API JSON
[youtube] 6K1cTDs6O9o: Downloading ios player API JSON


ERROR: [youtube] 6K1cTDs6O9o: Video unavailable. This video contains content from ShoPro, who has blocked it on copyright grounds


Skipping video video7157 due to error: ERROR: [youtube] 6K1cTDs6O9o: Video unavailable. This video contains content from ShoPro, who has blocked it on copyright grounds
Video video7158 already downloaded. Skipping...
Processing video video7159...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=C9Thr-L6Ink
[youtube] C9Thr-L6Ink: Downloading webpage
[youtube] C9Thr-L6Ink: Downloading tv client config
[youtube] C9Thr-L6Ink: Downloading player 69f581a5
[youtube] C9Thr-L6Ink: Downloading tv player API JSON
[youtube] C9Thr-L6Ink: Downloading ios player API JSON


ERROR: [youtube] C9Thr-L6Ink: Video unavailable


Skipping video video7159 due to error: ERROR: [youtube] C9Thr-L6Ink: Video unavailable
Video video7160 already downloaded. Skipping...
Processing video video7162...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=CkrKw8QQtqE
[youtube] CkrKw8QQtqE: Downloading webpage
[youtube] CkrKw8QQtqE: Downloading tv client config
[youtube] CkrKw8QQtqE: Downloading player 69f581a5
[youtube] CkrKw8QQtqE: Downloading tv player API JSON
[youtube] CkrKw8QQtqE: Downloading ios player API JSON


ERROR: [youtube] CkrKw8QQtqE: Video unavailable


Skipping video video7162 due to error: ERROR: [youtube] CkrKw8QQtqE: Video unavailable
Processing video video7163...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=FyRysi1Vovs
[youtube] FyRysi1Vovs: Downloading webpage
[youtube] FyRysi1Vovs: Downloading tv client config
[youtube] FyRysi1Vovs: Downloading player 69f581a5
[youtube] FyRysi1Vovs: Downloading tv player API JSON
[youtube] FyRysi1Vovs: Downloading ios player API JSON


ERROR: [youtube] FyRysi1Vovs: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies


Skipping video video7163 due to error: ERROR: [youtube] FyRysi1Vovs: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies
Processing video video7164...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=or0uB1yeD5I
[youtube] or0uB1yeD5I: Downloading webpage
[youtube] or0uB1yeD5I: Downloading tv client config
[youtube] or0uB1yeD5I: Downloading player 69f581a5
[youtube] or0uB1yeD5I: Downloading tv player API JSON
[youtube] or0uB1yeD5I: Downloading ios player API JSON


ERROR: [youtube] or0uB1yeD5I: Video unavailable. This video is no longer available due to a copyright claim by Joeri Christiaen (THURISTAR - my company)


Skipping video video7164 due to error: ERROR: [youtube] or0uB1yeD5I: Video unavailable. This video is no longer available due to a copyright claim by Joeri Christiaen (THURISTAR - my company)
Video video7165 already downloaded. Skipping...
Processing video video7166...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=-RbCuuC2dk0
[youtube] -RbCuuC2dk0: Downloading webpage
[youtube] -RbCuuC2dk0: Downloading tv client config
[youtube] -RbCuuC2dk0: Downloading player 69f581a5
[youtube] -RbCuuC2dk0: Downloading tv player API JSON
[youtube] -RbCuuC2dk0: Downloading ios player API JSON


ERROR: [youtube] -RbCuuC2dk0: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies


Skipping video video7166 due to error: ERROR: [youtube] -RbCuuC2dk0: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies
Video video7168 already downloaded. Skipping...
Processing video video7169...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=WvAtcM4vMuc
[youtube] WvAtcM4vMuc: Downloading webpage
[youtube] WvAtcM4vMuc: Downloading tv client config
[youtube] WvAtcM4vMuc: Downloading player 69f581a5
[youtube] WvAtcM4vMuc: Downloading tv player API JSON
[youtube] WvAtcM4vMuc: Downloading ios player API JSON


ERROR: [youtube] WvAtcM4vMuc: Sign in to confirm your age. This video may be inappropriate for some users. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies


Skipping video video7169 due to error: ERROR: [youtube] WvAtcM4vMuc: Sign in to confirm your age. This video may be inappropriate for some users. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies
Video video7170 already downloaded. Skipping...
Video video7171 already downloaded. Skipping...
Processing video video7172...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=2vbGkmR9rXI
[youtube] 2vbGkmR9rXI: Downloading webpage
[youtube] 2vbGkmR9rXI: Downloading tv client config
[youtube] 2vbGkmR9rXI: Downloading player 69f581a5
[youtube] 2vbGkmR9rXI: Downloading tv player API JSON
[youtube] 2vbGkmR9rXI: Downloading ios player API JSON


ERROR: [youtube] 2vbGkmR9rXI: Video unavailable


Skipping video video7172 due to error: ERROR: [youtube] 2vbGkmR9rXI: Video unavailable
Video video7173 already downloaded. Skipping...
Video video7174 already downloaded. Skipping...
Processing video video7175...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=CC21NR3RkmE
[youtube] CC21NR3RkmE: Downloading webpage
[youtube] CC21NR3RkmE: Downloading tv client config
[youtube] CC21NR3RkmE: Downloading player 69f581a5
[youtube] CC21NR3RkmE: Downloading tv player API JSON
[youtube] CC21NR3RkmE: Downloading ios player API JSON


ERROR: [youtube] CC21NR3RkmE: Video unavailable


Skipping video video7175 due to error: ERROR: [youtube] CC21NR3RkmE: Video unavailable
Processing video video7176...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=AVIzEGGfCkY
[youtube] AVIzEGGfCkY: Downloading webpage
[youtube] AVIzEGGfCkY: Downloading tv client config
[youtube] AVIzEGGfCkY: Downloading player 69f581a5
[youtube] AVIzEGGfCkY: Downloading tv player API JSON
[youtube] AVIzEGGfCkY: Downloading ios player API JSON


ERROR: [youtube] AVIzEGGfCkY: Video unavailable. This video has been removed by the uploader


Skipping video video7176 due to error: ERROR: [youtube] AVIzEGGfCkY: Video unavailable. This video has been removed by the uploader
Processing video video7177...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=cf1xmjvD2-M
[youtube] cf1xmjvD2-M: Downloading webpage
[youtube] cf1xmjvD2-M: Downloading tv client config
[youtube] cf1xmjvD2-M: Downloading player 69f581a5
[youtube] cf1xmjvD2-M: Downloading tv player API JSON
[youtube] cf1xmjvD2-M: Downloading ios player API JSON


ERROR: [youtube] cf1xmjvD2-M: Video unavailable. This video is no longer available because the YouTube account associated with this video has been terminated.


Skipping video video7177 due to error: ERROR: [youtube] cf1xmjvD2-M: Video unavailable. This video is no longer available because the YouTube account associated with this video has been terminated.
Video video7178 already downloaded. Skipping...
Processing video video7179...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=gPegyIe0P0s
[youtube] gPegyIe0P0s: Downloading webpage
[youtube] gPegyIe0P0s: Downloading tv client config
[youtube] gPegyIe0P0s: Downloading player 69f581a5
[youtube] gPegyIe0P0s: Downloading tv player API JSON
[youtube] gPegyIe0P0s: Downloading ios player API JSON


ERROR: [youtube] gPegyIe0P0s: Video unavailable


Skipping video video7179 due to error: ERROR: [youtube] gPegyIe0P0s: Video unavailable
Processing video video7200...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=-QSV6cUrINA
[youtube] -QSV6cUrINA: Downloading webpage
[youtube] -QSV6cUrINA: Downloading tv client config
[youtube] -QSV6cUrINA: Downloading player 69f581a5
[youtube] -QSV6cUrINA: Downloading tv player API JSON
[youtube] -QSV6cUrINA: Downloading ios player API JSON


ERROR: [youtube] -QSV6cUrINA: Video unavailable. This video is no longer available because the YouTube account associated with this video has been terminated.


Skipping video video7200 due to error: ERROR: [youtube] -QSV6cUrINA: Video unavailable. This video is no longer available because the YouTube account associated with this video has been terminated.
Processing video video7201...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=2Si23ADHeSE
[youtube] 2Si23ADHeSE: Downloading webpage
[youtube] 2Si23ADHeSE: Downloading tv client config
[youtube] 2Si23ADHeSE: Downloading player 69f581a5
[youtube] 2Si23ADHeSE: Downloading tv player API JSON
[youtube] 2Si23ADHeSE: Downloading ios player API JSON


ERROR: [youtube] 2Si23ADHeSE: Video unavailable


Skipping video video7201 due to error: ERROR: [youtube] 2Si23ADHeSE: Video unavailable
Video video7202 already downloaded. Skipping...
Video video7203 already downloaded. Skipping...
Video video7204 already downloaded. Skipping...
Video video7205 already downloaded. Skipping...
Processing video video7206...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=U2lZtRhc7Ww
[youtube] U2lZtRhc7Ww: Downloading webpage
[youtube] U2lZtRhc7Ww: Downloading tv client config
[youtube] U2lZtRhc7Ww: Downloading player 69f581a5
[youtube] U2lZtRhc7Ww: Downloading tv player API JSON
[youtube] U2lZtRhc7Ww: Downloading ios player API JSON


ERROR: [youtube] U2lZtRhc7Ww: Sign in to confirm your age. This video may be inappropriate for some users. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies


Skipping video video7206 due to error: ERROR: [youtube] U2lZtRhc7Ww: Sign in to confirm your age. This video may be inappropriate for some users. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies
Video video7207 already downloaded. Skipping...
Video video7209 already downloaded. Skipping...
Video video7211 already downloaded. Skipping...
Video video7212 already downloaded. Skipping...
Video video7213 already downloaded. Skipping...
Video video7214 already downloaded. Skipping...
Video video7215 already downloaded. Skipping...
Video video7216 already downloaded. Skipping...
Video video7217 already downloaded. Skipping...
Processing video video7218...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/wa

ERROR: [youtube] 5j9gGfQh8nQ: Video unavailable. This video is no longer available due to a copyright claim by Viacom inc


Skipping video video7218 due to error: ERROR: [youtube] 5j9gGfQh8nQ: Video unavailable. This video is no longer available due to a copyright claim by Viacom inc
Video video7219 already downloaded. Skipping...
Processing video video7220...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=Km7fOkpk9No
[youtube] Km7fOkpk9No: Downloading webpage
[youtube] Km7fOkpk9No: Downloading tv client config
[youtube] Km7fOkpk9No: Downloading player 69f581a5
[youtube] Km7fOkpk9No: Downloading tv player API JSON
[youtube] Km7fOkpk9No: Downloading ios player API JSON


ERROR: [youtube] Km7fOkpk9No: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies


Skipping video video7220 due to error: ERROR: [youtube] Km7fOkpk9No: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies
Video video7222 already downloaded. Skipping...
Processing video video7223...
Downloading full video...
[youtube] Extracting URL: https://www.youtube.com/watch?v=xQlfam6DMjA
[youtube] xQlfam6DMjA: Downloading webpage
[youtube] xQlfam6DMjA: Downloading tv client config
[youtube] xQlfam6DMjA: Downloading player 69f581a5


KeyboardInterrupt: 

In [6]:
import re
import pandas as pd
import random

import itertools
def replace_age_gender(text):
    # Mapping of terms to their replacements
    mapping = {
        "man": "person",
        "men": "persons",
        "woman": "person",
        "women": "persons",
        "male": "person",
        "femal": "person",
        "female": "person",
        "boy": "person",
        "boys": "persons",
        "girl": "person",
        "girls": "persons",
        "adult": "person",
        "adults": "persons",
        "child": "person",
        "children": "persons",
        "senior": "person",
        "seniors": "persons",
        "teenager": "person",
        "teenagers": "persons",
        "kid": "person",
        "kids": "persons",
        "he": "they",
        "she": "they",
        "guy": "person",
        "girl": "person",
        "guys": "persons",
        "girls": "persons",
        "his": "their",
        "her": "their"
    }
    
    # List of words to remove (turn into blanks)
    removable_words = ["oriental"]  # customize this list as needed
    
    # Combine the keys from the mapping with the removable words
    # Use a set to avoid duplicates
    all_words = set(mapping.keys()) | set(removable_words)
    
    # Create a regex pattern that matches any of the words as whole words
    pattern = r'\b(' + '|'.join(all_words) + r')\b'
    
    # Replacement function to determine what to do for each match
    def replacer(match):
        word = match.group(0).lower()
        # If the word is in the removable list, replace it with an empty string
        if word in (w.lower() for w in removable_words):
            return ""
        # Otherwise, if the word is in the mapping, return its corresponding replacement
        elif word in mapping:
            return mapping[word]
        # Otherwise, leave it unchanged (shouldn't happen with the current pattern)
        return word
    
    # Perform the substitution with case-insensitive matching
    return re.sub(pattern, replacer, text, flags=re.IGNORECASE)

# Example usage:
sample_text = "The men and women, as well as the boy and girl, and the kids and teenagers, went to see the seniors and adults."
print(replace_age_gender(sample_text))

DEFAULT_PRIVATE_ATTRIBUTES_TO_PROTECT = [
    "name",
    "age",
    "race",
    "sex",
    "gender",
    "pronoun(s)",
    "hair color",
    "hairstyle",
    "facial or other physical characteristics",
    "uniquely identifiable information",
]
REFUSAL_QUESTION_TEMPLATES = [
    # Singular
    "Describe the {attribute} of the person.",
    "Describe the {attribute} of the person shown.",
    "What is the {attribute} of the person?",
    "What is the {attribute} of the person shown?",
    # Plural
    "Describe the {attribute}s of the people.",
    "Describe the {attribute}s of the people shown.",
    "What is the {attribute}s of the people?",
    "What is the {attribute}s of the people shown?",
]
REFUSAL_ANSWER_TEMPLATES = [
    "I will not answer any question asking for descriptions on personal information such as {attribute}.",
    "I refuse to expose details which may identify any person's {attribute}.",
    "Sorry, but I cannot answer regarding any person's {attribute}.",
]
# Create a list to store each row as a dictionary.
import random

rows = []
for i, instance in enumerate(ds['test'], start=1):
    # First row: unchanged video description row
    row = {
        "ID": i,
        "URL": instance['url'],
        "question": "Describe the video",
        "answer": replace_age_gender(instance['caption']),
        "Constant": 0,
        "Start": instance['start time'],
        "End": instance['end time'],
    }
    rows.append(row)
    
    # For the refusal rows:
    # Sample 6 distinct attributes and 6 distinct question templates.
    attributes = random.sample(DEFAULT_PRIVATE_ATTRIBUTES_TO_PROTECT, 6)
    q_templates = random.sample(REFUSAL_QUESTION_TEMPLATES, 6)
    
    # For answer templates, cycle through the available list
    answer_cycle = itertools.cycle(REFUSAL_ANSWER_TEMPLATES)
    
    # Create 6 refusal rows for the instance.
    for attr, q_temp in zip(attributes, q_templates):
        a_temp = next(answer_cycle)
        refusal_question = q_temp.format(attribute=attr)
        refusal_answer = a_temp.format(attribute=attr)
        row = {
            "ID": i,
            "URL": instance['url'],
            "question": refusal_question,
            "answer": refusal_answer,
            "Constant": 0,
            "Start": instance['start time'],
            "End": instance['end time'],
        }
        rows.append(row)


    

# Create a DataFrame from the rows.
df = pd.DataFrame(rows)

# Save the DataFrame to a CSV file using '|' as the delimiter.
df.to_csv("output.csv", index=False, sep='|')
print("CSV file 'output.csv' has been created with '|' as the delimiter.")


The persons and persons, as well as the person and person, and the persons and persons, went to see the persons and persons.
CSV file 'output.csv' has been created with '|' as the delimiter.


In [None]:
with open('output.csv', 'w', newline='') as csvfile:
    # Create a CSV writer with a custom delimiter, e.g. '|'
    writer = csv.DictWriter(csvfile, fieldnames=rows[0].keys(), delimiter='|')
    writer.writeheader()
    for row in rows:
        writer.writerow(row)

CSV file 'output.csv' has been created.
