In [None]:
import json
import math
import os
import random
import time
from urllib.parse import parse_qs, urlparse

# Third-party imports (kept in their original relative order).
import yt_dlp
import pronouncing
import mediapipe as mp
import cv2
import boto3
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from moviepy.video.io.VideoFileClip import VideoFileClip
import numpy as np
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [None]:
# Where the YouTube Data API key is stored in AWS Secrets Manager.
region_name = "us-east-1"
secret_name = "youtube"

# Build the Secrets Manager client from an explicit boto3 session.
session = boto3.session.Session()
secretsmanager = session.client("secretsmanager", region_name=region_name)

# The SecretString payload is parsed as JSON; the key sits under "API_KEY".
get_secret_value_response = secretsmanager.get_secret_value(SecretId=secret_name)
secret = get_secret_value_response["SecretString"]
api_key = json.loads(secret)["API_KEY"]

In [None]:
# Shared S3 client and target bucket; create_s3_file_list and upload_to_s3
# read both of these as module-level globals.
s3_client = boto3.client('s3')
bucket = 'slip-ml'

In [None]:
def get_playlist_videos(playlist_url):
    """
    Extract all video URLs from a YouTube playlist using the YouTube Data API.

    The API client is authenticated with the module-level ``api_key``; the key
    is not a parameter of this function.

    Args:
        playlist_url (str): The YouTube playlist URL (e.g., https://www.youtube.com/playlist?list=PL86SiVwkw_odmp-RVzD8yef3wU7Z2uD5a).
            Any URL carrying a ``list`` query parameter works, and a bare
            playlist ID is passed through unchanged.

    Returns:
        list: List of video URLs, in the order the API returned them. If the
            API raises ``HttpError`` the error is printed and an empty list is
            returned (URLs collected before the error are discarded).
    """
    # Read the ID from the parsed query string so that only the parameter
    # literally named "list" matches (a substring search for "list=" also
    # matches e.g. "shortlist=") and any "#fragment" is ignored.
    list_values = parse_qs(urlparse(playlist_url).query).get("list")
    if list_values:
        # Last occurrence wins, matching the previous split-based behaviour.
        playlist_id = list_values[-1]
    else:
        # No usable query parameter (e.g. a bare playlist ID): fall back to
        # the original substring-based extraction.
        playlist_id = playlist_url.split("list=")[-1].split("&")[0]

    # Initialize YouTube API client
    youtube = build('youtube', 'v3', developerKey=api_key)

    video_urls = []
    next_page_token = None

    try:
        while True:
            # Request one page of playlist items
            request = youtube.playlistItems().list(
                part="contentDetails",
                playlistId=playlist_id,
                maxResults=50,  # Max allowed per request
                pageToken=next_page_token
            )
            response = request.execute()

            # Extract video IDs and create URLs
            for item in response['items']:
                video_id = item['contentDetails']['videoId']
                video_url = f"https://www.youtube.com/watch?v={video_id}"
                video_urls.append(video_url)

            # A missing nextPageToken marks the last page
            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

    except HttpError as e:
        print(f"An error occurred: {e}")
        return []

    return video_urls

In [None]:
def create_s3_file_list(prefix):
    """
    List every object key under ``prefix`` in the module-level ``bucket``.

    Uses the module-level ``s3_client`` and follows ``list_objects_v2``
    pagination until the listing is no longer truncated.

    Args:
        prefix (str): Key prefix to list.

    Returns:
        list: Object keys in the order returned by S3; empty when nothing
            matches the prefix. If a follow-up page request fails, the error
            is printed and the keys gathered so far are returned.

    Raises:
        Exception: Whatever the first ``list_objects_v2`` request raises (only
            follow-up pages are best-effort).
    """
    response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
    # "Contents" is absent when no object matches the prefix.
    all_files = [obj["Key"] for obj in response.get("Contents", [])]

    # Only ask for another page when S3 reports a truncated listing and hands
    # back a token; a None ContinuationToken is never sent.
    while response.get("IsTruncated") and response.get("NextContinuationToken"):
        try:
            response = s3_client.list_objects_v2(
                Bucket=bucket,
                Prefix=prefix,
                ContinuationToken=response["NextContinuationToken"],
            )
        except Exception as e:
            print(f"Error listing S3 objects: {e}")
            break
        all_files.extend(obj["Key"] for obj in response.get("Contents", []))

    return all_files

In [None]:
def list_s3_files_and_check_video(video_id, all_s3_files):
    """
    Report whether any listed S3 key mentions the given video ID.

    Args:
        video_id (str): Video ID to look for (plain substring match).
        all_s3_files (list): S3 object keys to search.

    Returns:
        bool: True if ``video_id`` occurs inside at least one key, else False.
    """
    return any(video_id in s3_key for s3_key in all_s3_files)

In [None]:
def upload_to_s3(local_file, s3_folder):
    """
    Upload a local file to the module-level ``bucket`` under ``s3_folder``.

    The object key is ``<s3_folder>/<local_file>``. Upload errors are printed
    rather than raised, so the return value does not indicate success.

    Args:
        local_file (str): Path of the file to upload; also the key suffix.
        s3_folder (str): Key prefix (folder) inside the bucket.

    Returns:
        tuple: ``(local_file, s3_folder)``, exactly as passed in.
    """
    object_key = f"{s3_folder}/{local_file}"

    try:
        s3_client.upload_file(local_file, bucket, object_key)
        print(f"Upload Successful: {local_file} -> {object_key}")
    except FileNotFoundError:
        # The local file does not exist.
        print(f"The file was not found: {local_file}")
    except Exception as upload_error:
        # Any other failure is reported and swallowed.
        print(f"An error occurred: {upload_error}")

    return local_file, s3_folder

In [None]:
def extract_video_id(youtube_url):
    """
    Extracts the video ID from a YouTube URL.

    The ID is read from the ``v`` query parameter of the parsed URL, so other
    parameters whose names merely end in "v" (``nav=``, ``rev=``, ...) and any
    ``#fragment`` do not affect the result. A string with no query component
    that still contains ``v=`` (e.g. ``"v=abc"``) is handled with the original
    substring split.

    Args:
        youtube_url (str): The YouTube video URL.

    Returns:
        str: The extracted video ID, or None if the URL is invalid (a message
            is printed in that case).
    """
    try:
        parsed = urlparse(youtube_url)
        v_values = parse_qs(parsed.query).get("v")
        if v_values:
            # Last occurrence wins, matching the previous split-based behaviour.
            return v_values[-1]

        # Legacy fallback, only for inputs without a query component.
        if not parsed.query and "v=" in youtube_url:
            video_id = youtube_url.split("v=")[-1]
            # Remove any additional parameters after the video ID
            return video_id.split("&")[0]

        print(f"Invalid YouTube URL: {youtube_url}")
        return None
    except Exception as e:
        print(f"An error occurred while extracting video ID: {e}")
        return None

In [None]:
# ECS client (created in the same region as the Secrets Manager client); used
# by kickoff_ecs_job to start tasks and by the polling loop to describe them.
ecs_client = boto3.client("ecs", region_name=region_name)
def kickoff_ecs_job(local_file, s3_folder):
    """
    Start the ECS task that processes one uploaded video.

    Runs the ``preprocess`` task definition on the ``workspace`` cluster with
    the Fargate launch type, overriding the ``vallr-gather-data`` container's
    command so it runs ``gather-data.py`` with the given folder and file.

    Args:
        local_file (str): Value passed to the script's ``--local-file`` option.
        s3_folder (str): Value passed to the script's ``--s3-folder`` option.

    Returns:
        dict | None: The ``run_task`` response, or None if the call raised
            (the error is printed).
    """
    # Command-line override handed to the container.
    script_command = ["python", "gather-data.py", "--s3-folder", s3_folder, "--local-file", local_file]
    gather_data_override = {
        "name": 'vallr-gather-data',
        #"image": '438465160412.dkr.ecr.us-east-1.amazonaws.com/vallr-gather-data',
        "command": script_command,
    }

    # awsvpc networking settings for the task.
    vpc_settings = {
        "subnets": ["subnet-02b0f97a6782f3791"],  # Replace with your subnet IDs
        "securityGroups": ["sg-07d656bd0202dad6b"],  # Replace with your security group IDs
        "assignPublicIp": "ENABLED",
    }

    try:
        return ecs_client.run_task(
            cluster='workspace',
            taskDefinition='preprocess',
            overrides={"containerOverrides": [gather_data_override]},
            launchType="FARGATE",
            networkConfiguration={"awsvpcConfiguration": vpc_settings},
        )
    except Exception as launch_error:
        print(f"An error occurred while starting the ECS task: {launch_error}")
        return None

In [None]:
def download_youtube_video_yt_dlp(url):
    """
    Download one YouTube video with yt-dlp, upload it to S3, then delete it locally.

    The file is written to the current directory as ``<video_id>.<ext>``. The
    real output name is taken from yt-dlp rather than assumed to end in
    ``.mp4``, because the extension depends on the format yt-dlp selects.

    Args:
        url (str): Watch URL containing ``v=<video_id>``.

    Returns:
        tuple | None: ``(local_file, s3_folder)`` as returned by
            ``upload_to_s3`` (folder ``data/ted``), or None if the download
            raised (the error is printed).
    """
    # extract video ID from the URL
    video_id = url.split("v=")[-1]
    if "&" in video_id:
        video_id = video_id.split("&")[0]

    ydl_opts = {
        "outtmpl": f"{video_id}.%(ext)s",  # Output path and filename
        "format": "best",  # Select the best single file (video + audio)
        "merge_output_format": None,  # Avoid merging, stick to single stream
    }

    downloaded_file = None
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # extract_info(download=True) downloads like download([url]) but
            # also returns the metadata needed to resolve the output filename.
            info = ydl.extract_info(url, download=True)
            downloaded_file = ydl.prepare_filename(info)
        print("Download completed successfully!")
        return upload_to_s3(downloaded_file, 'data/ted')
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    finally:
        # Remove the local copy whether or not the upload went through, as
        # before; a cleanup problem must not mask the result.
        if downloaded_file and os.path.exists(downloaded_file):
            try:
                os.remove(downloaded_file)
            except OSError as e:
                print(f"Could not remove {downloaded_file}: {e}")

In [None]:
# Collect the keys that already exist under the train and test face prefixes;
# they are used later to decide which videos have been processed.
all_s3_files = []
for p in ('data/vallr/train/face/', 'data/vallr/test/face/'):
    all_s3_files += create_s3_file_list(p)

In [None]:
# Playlist whose videos are checked against S3 and processed further down.
playlist_url = 'https://www.youtube.com/playlist?list=PLOGi5-fAu8bEsgXDEOxRm73J8FRLtBfxU'
video_urls = get_playlist_videos(playlist_url)

# Remove duplicate video URLs (going through a set does not keep playlist order).
all_video_urls = list(set(video_urls))
print(f"Total videos after removing duplicates: {len(all_video_urls)}")

# Randomly shuffle the video URLs in place; no seed is set in this section, so
# the order differs from run to run.
random.shuffle(all_video_urls)

In [None]:
# Split the playlist into videos that already have output in S3 and videos
# that still need processing. A video counts as processed when its ID occurs
# in at least one of the listed S3 keys.
num_processed = 0
to_be_processed = []
for youtube_url in all_video_urls:
    video_id = extract_video_id(youtube_url)
    if video_id is None:
        # extract_video_id has already printed the offending URL. A None ID
        # cannot be matched against the S3 keys (the substring test would
        # raise TypeError), so the URL is left out of both counts.
        continue
    is_processed = list_s3_files_and_check_video(video_id, all_s3_files)
    if is_processed:
        num_processed += 1
    else:
        to_be_processed.append(youtube_url)

# all_video_urls is empty when the playlist lookup failed or returned nothing;
# guard the percentage so the summary still prints instead of dividing by zero.
total_videos = len(all_video_urls)
percent_processed = num_processed / total_videos * 100 if total_videos else 0.0

print(f"Number of videos already processed: {num_processed}")
print(f"Percentage of videos already processed: {percent_processed:.2f}%")
print(f"Number of videos chunks: {len(all_s3_files)}")
print(f"Number of videos to be processed: {len(to_be_processed)}")

In [None]:
# Upper bound on tracked ECS tasks before new launches are held back (the
# previous condition `len(running_tasks) <= 17` allowed 18 at once).
MAX_RUNNING_TASKS = 18
# Pause between polls while waiting for a slot, and after a failed launch.
WAIT_SECONDS = 60 * 10


def _drop_finished_tasks(task_arns):
    """Return the ARNs from ``task_arns`` whose ECS task has not stopped yet."""
    still_running = []
    for task_arn in task_arns:
        try:
            describe = ecs_client.describe_tasks(
                cluster='workspace',
                tasks=[task_arn],
            )
        except Exception as e:
            # State unknown: keep tracking the task and look again next poll.
            print(f"An error occurred while describing the task: {e}")
            still_running.append(task_arn)
            continue

        described = describe.get('tasks', [])
        if not described:
            print(f"Task {task_arn} not found.")
        elif described[0]['lastStatus'] == 'STOPPED':
            print(f"Task {task_arn} has stopped.")
        else:
            print(f"Task {task_arn} is still running.")
            still_running.append(task_arn)
    return still_running


running_tasks = []
for youtube_url in to_be_processed:
    # Wait for a free slot *before* handling this URL, so that reaching the
    # limit delays the video instead of silently skipping it. The list is
    # rebuilt on every poll rather than mutated while being iterated.
    while len(running_tasks) >= MAX_RUNNING_TASKS:
        running_tasks = _drop_finished_tasks(running_tasks)
        if len(running_tasks) >= MAX_RUNNING_TASKS:
            print("Wait for the task to finish before starting a new one.")
            time.sleep(WAIT_SECONDS)

    print(f"Processing video: {youtube_url}")
    try:
        download_result = download_youtube_video_yt_dlp(youtube_url)
        if download_result is None:
            # The download helper already printed the underlying error.
            print(f"Skipping {youtube_url}: the download did not complete.")
            continue
        local_file, s3_folder = download_result

        response = kickoff_ecs_job(local_file, s3_folder)
        started = (response or {}).get('tasks') or []
        if not started:
            # run_task either raised (response is None) or returned no task;
            # any reasons ECS gave are listed under "failures".
            failures = (response or {}).get('failures')
            print(f"ECS did not start a task for {youtube_url}: {failures}")
            print("Wait for the task to finish before starting a new one.")
            time.sleep(WAIT_SECONDS)
            continue

        running_tasks.append(started[0]['taskArn'])
    except Exception as e:
        # `except Exception` (not a bare except) so Ctrl-C / kernel interrupt
        # still stops the loop; report what actually went wrong.
        print(f"An error occurred while processing {youtube_url}: {e}")