In [18]:
import yt_dlp
import os
from moviepy.video.io.VideoFileClip import VideoFileClip
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import json
import boto3
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

In [19]:
# Name and AWS region of the Secrets Manager entry that is read below.
secret_name = "youtube"
region_name = "us-east-1"

# Create a Secrets Manager client
session = boto3.session.Session()
secretsmanager = session.client(service_name='secretsmanager', region_name=region_name)

# Fetch the secret payload for `secret_name`.
get_secret_value_response = secretsmanager.get_secret_value(SecretId=secret_name)

# The 'SecretString' field is parsed as JSON; its "API_KEY" entry is the key
# that get_playlist_videos() passes to the YouTube API client as `developerKey`.
secret = get_secret_value_response['SecretString']
api_key = json.loads(secret)["API_KEY"]


In [20]:
# Running total, in seconds, of the (tail-trimmed) durations of the videos handed to
# clip_video_into_chunks(), which updates it through a `global` statement.
total_duration = 0

In [21]:
# S3 client and destination bucket name used by upload_to_s3().
s3_client = boto3.client('s3')
bucket = 'slip-ml'

In [22]:
# Pick the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    backend_name = "cuda"
elif torch.backends.mps.is_available():
    backend_name = "mps"
else:
    backend_name = "cpu"

device = torch.device(backend_name)
print(f"Using {device} device")

Using mps device


In [23]:
# Half precision is selected only when CUDA is available; every other backend uses float32.
if torch.cuda.is_available():
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float32
print(f"Using {torch_dtype} dtype")

Using torch.float32 dtype


In [24]:
# Checkpoint identifier shared by the model and its processor.
model_id = "openai/whisper-large-v3-turbo"

# Load the speech-to-text model with the dtype chosen earlier, then move it to `device`.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies the tokenizer and feature extractor handed to the pipeline.
processor = AutoProcessor.from_pretrained(model_id)

In [25]:
# Build the speech-recognition pipeline around the already-loaded model, reusing the
# processor's tokenizer and feature extractor. audio_to_text() calls `pipe` with an
# audio file path and reads the "text" entry of the result.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

Device set to use mps


In [26]:
def get_playlist_videos(playlist_url):
    """
    Extract all video URLs from a YouTube playlist using the YouTube Data API.

    The API key is NOT a parameter: it is read from the module-level ``api_key``
    variable, which must be defined before this function is called.

    Args:
        playlist_url (str): The YouTube playlist URL (e.g., https://www.youtube.com/playlist?list=PL86SiVwkw_odmp-RVzD8yef3wU7Z2uD5a)

    Returns:
        list: Video URLs of the form ``https://www.youtube.com/watch?v=<video_id>``,
        in the order the API returned them. An empty list is returned when any
        request raises ``HttpError``; URLs collected from earlier pages are
        discarded in that case.
    """
    from urllib.parse import parse_qs, urlparse

    # Parse the query string properly so that a trailing "#fragment" or the
    # position of the "list" parameter cannot leak into the playlist ID.
    list_values = parse_qs(urlparse(playlist_url).query).get("list")
    if list_values:
        # Last occurrence wins, matching the previous "text after the last list=" rule.
        playlist_id = list_values[-1]
    else:
        # No parsable "list" parameter: keep the original text-based extraction,
        # which also lets a bare playlist ID pass through unchanged.
        playlist_id = playlist_url.split("list=")[-1].split("&")[0]

    # Initialize YouTube API client
    youtube = build('youtube', 'v3', developerKey=api_key)

    video_urls = []
    next_page_token = None

    try:
        while True:
            # Request one page of playlist items
            request = youtube.playlistItems().list(
                part="contentDetails",
                playlistId=playlist_id,
                maxResults=50,  # Max allowed per request
                pageToken=next_page_token
            )
            response = request.execute()

            # Extract video IDs and create URLs
            for item in response['items']:
                video_id = item['contentDetails']['videoId']
                video_urls.append(f"https://www.youtube.com/watch?v={video_id}")

            # Keep paging until the API stops returning a continuation token
            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

    except HttpError as e:
        print(f"An error occurred: {e}")
        return []

    return video_urls

In [27]:
def download_youtube_video_yt_dlp(url):
    """Download a single YouTube video with yt-dlp.

    The video ID is taken as the text after the last ``v=`` in ``url``, cut at
    the first ``&``; it is used as the stem of the output filename
    (``<video_id>.<ext>``, a relative path).

    Args:
        url (str): Watch URL of the video.

    Returns:
        The video ID (str) when the download finishes without raising;
        ``None`` when any exception occurs (the error is printed).
    """
    # Everything after the last "v=", stopping at the next query parameter.
    video_id = url.split("v=")[-1].split("&")[0]

    downloader_options = {
        "outtmpl": f"{video_id}.%(ext)s",  # Output path and filename
        "format": "best",  # Select the best single file (video + audio)
        "merge_output_format": None,  # Avoid merging, stick to single stream
    }

    try:
        with yt_dlp.YoutubeDL(downloader_options) as downloader:
            downloader.download([url])
        print("Download completed successfully!")
        return video_id
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [28]:
def upload_to_s3(local_file):
    """Upload a local file to the module-level ``bucket`` under a folder derived from its name.

    The object key is ``<folder>/<local_file>``. For names of the form
    ``<video_id>__<kind>__<chunk>.<ext>`` the folder is decided by ``<kind>``
    alone (``text`` or ``video``; anything else goes to ``data``), so a video ID
    that happens to contain "text" or "video" cannot misroute the file.
    Names without that structure fall back to a substring test on the whole name.

    Args:
        local_file (str): Path of the file to upload; also used as the key suffix.
    """
    # Split from the right so a video ID containing "__" stays in one piece.
    name_parts = local_file.rsplit('__', 2)
    if len(name_parts) == 3:
        kind = name_parts[1]
        s3_folder = kind if kind in ('text', 'video') else 'data'
    elif 'text' in local_file:
        s3_folder = 'text'
    elif 'video' in local_file:
        s3_folder = 'video'
    else:
        s3_folder = 'data'

    s3_file = f"{s3_folder}/{local_file}"

    # Upload the file
    s3_client.upload_file(local_file, bucket, s3_file)
    print(f"Upload Successful: {local_file} -> {s3_file}")

In [29]:
def get_video_chunk_names(path):
    """Split a pipeline filename into its video ID and chunk name.

    Filenames are expected to look like ``<video_id>__<kind>__<chunk>.<ext>``
    (for example ``abc__chunk__3.mp4``). The name is split from the right so
    that a video ID which itself contains ``__`` (or ends with ``_``) is
    returned intact.

    Args:
        path (str): Filename to parse.

    Returns:
        tuple: ``(video_id, chunk_name)`` where ``chunk_name`` is the last
        ``__``-separated segment up to its first ``.``.

    Raises:
        IndexError: If ``path`` contains fewer than two ``__`` separators.
    """
    # At most two splits, counted from the right: [video_id, kind, "<chunk>.<ext>"].
    parts = path.rsplit('__', 2)
    video_id = parts[0]
    chunk_name = parts[2].split('.')[0]
    return video_id, chunk_name

In [30]:
def split_audio_video(video_file):
    """Split one chunk file into an audio-only MP3 and a video-only MP4.

    The outputs are named ``<video_id>__audio__<chunk>.mp3`` and
    ``<video_id>__video__<chunk>.mp4``. The silent video is uploaded with
    ``upload_to_s3`` and, once everything succeeded, the input chunk file is
    deleted. Both clips are closed even when a step fails.

    Args:
        video_file (str): Chunk filename in the form understood by
            ``get_video_chunk_names``.

    Returns:
        tuple | None: ``(audio_path, video_path)`` on success; ``None`` when any
        step raised (the error is printed).
    """
    video_chunk = None
    video_only = None
    try:
        try:
            # extract video ID from the filename
            video_id, chunk_name = get_video_chunk_names(video_file)
            print(f"Video ID: {video_id}, Chunk Name: {chunk_name}")

            # import video
            video_chunk = VideoFileClip(video_file)

            # Output filenames (relative paths)
            audio_path = f"{video_id}__audio__{chunk_name}.mp3"
            video_path = f"{video_id}__video__{chunk_name}.mp4"
            print(f"Audio path: {audio_path}, Video path: {video_path}")

            # Write audio to file
            video_chunk.audio.write_audiofile(audio_path)

            # Write video to file
            video_only = video_chunk.without_audio()
            video_only.write_videofile(video_path, codec="libx264", audio_codec="aac")
            upload_to_s3(video_path)
        finally:
            # Always release the clips, including on failure, so a bad chunk
            # does not leave its readers open for the rest of the run.
            if video_chunk is not None:
                video_chunk.close()
            if video_only is not None:
                video_only.close()

        # delete chunk video file (only reached when every step above succeeded)
        os.remove(video_file)

        print("Audio and video split successfully!")
        return audio_path, video_path
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [31]:
def clip_video_into_chunks(video_id):
    """Cut ``<video_id>.mp4`` into at most ten equal-length chunk files.

    The last 30 seconds of the video are left out. Chunks are written as
    ``<video_id>__chunk__<n>.mp4`` and, on success, the source file is deleted.
    The usable (trimmed) duration is added to the module-level
    ``total_duration``. A video whose trimmed duration is not positive yields
    no chunks and does not change ``total_duration``.

    Args:
        video_id (str): Stem of the downloaded ``.mp4`` file.

    Returns:
        list: Chunk filenames in order; ``[]`` when the video is too short or
        when any step raised (the error is printed and the source file is kept).
    """
    global total_duration
    tail_trim_seconds = 30  # seconds dropped from the end of every video
    max_chunks = 10  # upper bound on the number of chunks per video

    input_file = video_id + '.mp4'
    try:
        video = VideoFileClip(input_file)
        try:
            # Usable duration in whole seconds, after dropping the tail.
            duration = int(video.duration) - tail_trim_seconds
            chunks = []

            if duration > 0:
                total_duration += duration

                # +1 rounds up so that range() below never yields more than max_chunks starts.
                chunk_length = (duration // max_chunks) + 1

                for start in range(0, duration, chunk_length):
                    end = min(start + chunk_length, duration)
                    chunk = video.subclipped(start, end)
                    chunk_filename = f"{video_id}__chunk__{start // chunk_length}.mp4"
                    chunk.write_videofile(chunk_filename, codec="libx264")
                    chunks.append(chunk_filename)
            else:
                # A non-positive duration would give range() a zero or negative step.
                print(f"Video {video_id} is too short to split into chunks; skipping.")
        finally:
            # Release the source clip even if writing a chunk failed.
            video.close()

        # delete original video file
        os.remove(input_file)

        return chunks
    except Exception as e:
        print(f"An error occurred while splitting the video: {e}")
        return []

In [32]:
def audio_to_text(audio_file):
    """Transcribe an audio chunk, upload the transcript to S3 and clean up.

    The transcript is written to ``<video_id>__text__<chunk>.json`` with the
    keys ``video_id``, ``chunk_name`` and ``text``, uploaded through
    ``upload_to_s3``, and then both the audio file and the JSON file are removed.

    Args:
        audio_file (str): Audio filename, relative to the current working
            directory, in the form understood by ``get_video_chunk_names``.
    """
    # Run the speech-recognition pipeline on the file in the working directory.
    transcript = pipe(f"{os.getcwd()}/{audio_file}")["text"]

    # extract video ID from the filename
    video_id, chunk_name = get_video_chunk_names(audio_file)

    payload = {"video_id": video_id, "chunk_name": chunk_name, "text": transcript}

    # Save the data to a JSON file
    json_file = f"{video_id}__text__{chunk_name}.json"
    with open(json_file, "w") as out_file:
        json.dump(payload, out_file, indent=4)

    print(f"Text saved to {json_file}")
    upload_to_s3(json_file)

    # Local copies are no longer needed once the transcript is uploaded.
    for leftover in (audio_file, json_file):
        os.remove(leftover)

In [33]:
# YouTube playlists whose videos are collected by get_playlist_videos() in the next cell.
playlist_urls = ["https://www.youtube.com/playlist?list=PL86SiVwkw_odmp-RVzD8yef3wU7Z2uD5a",
                 "https://www.youtube.com/playlist?list=PL86SiVwkw_ocn2nwGFoFWkBN8pFwgUshe",
                 "https://www.youtube.com/playlist?list=PL86SiVwkw_oe-sPwrWqm0k7t8bOK8dFpB",
                 "https://www.youtube.com/playlist?list=PL86SiVwkw_ofCWfjyBWs4PES8w-5AwPbx",
                 "https://www.youtube.com/playlist?list=PL86SiVwkw_oeR6BsaVjOwHunDOyAmDYxc",
                 "https://www.youtube.com/playlist?list=PL86SiVwkw_ocJPhcA3xiqszDcyIiGIt5y",
                 "https://www.youtube.com/playlist?list=PL86SiVwkw_oeLwHETCekBrdfP7M93LOpU",
                 "https://www.youtube.com/playlist?list=PL86SiVwkw_odq_rn2jUfdDAYgQfvijNtp",
                 "https://www.youtube.com/playlist?list=PL86SiVwkw_ofqbtdqZzFgzd--kxXhdRB4"]
                 

In [34]:
# Gather the video URLs of every playlist into one flat list (duplicates are kept here).
all_video_urls = []
for playlist_url in playlist_urls:
    video_urls = get_playlist_videos(playlist_url)
    all_video_urls += video_urls

print(f"Total videos: {len(all_video_urls)}")

Total videos: 1182


In [38]:
# remove duplicate video urls
# dict.fromkeys() keeps the first occurrence of each URL in its original position.
# A set would also de-duplicate, but its iteration order for strings can differ
# between kernel sessions, which makes the index-based resume further down
# (last_video_num) pick up a different subset of videos after every restart.
all_video_urls = list(dict.fromkeys(all_video_urls))
print(f"Total videos after removing duplicates: {len(all_video_urls)}")

Total videos after removing duplicates: 1141


In [None]:
# Find the list index of a specific video; the processing loop slices
# all_video_urls from this index. Stays 0 when the ID is not in the list.
resume_video_id = '74M0hPAeFHs'
last_video_num = 0
for i, video_url in enumerate(all_video_urls):
    if resume_video_id not in video_url:
        continue
    print(f"Skipping video {i+1}/{len(all_video_urls)}: {video_url}")
    last_video_num = i

Skipping video 58/1141: https://www.youtube.com/watch?v=74M0hPAeFHs


In [None]:
# Download, chunk, split and transcribe every video from index `last_video_num` onwards.
# `start=last_video_num+1` makes the printed counter 1-based.
for i, url in enumerate(all_video_urls[last_video_num:], start=last_video_num+1):
    print(f"*******************************************************Processing video {i} of {len(all_video_urls)}********************************************************")
    try:
        video_name = download_youtube_video_yt_dlp(url)
        if video_name is None:
            # The download helper already printed the error; nothing to clip.
            continue

        video_chunks = clip_video_into_chunks(video_name)
        for chunk in video_chunks:
            split_result = split_audio_video(chunk)
            if split_result is None:
                # split_audio_video() reports its own error and returns None.
                # Skip this chunk instead of unpacking None, which used to raise
                # a TypeError and abandon the remaining chunks of the video.
                continue

            audio_path, video_path = split_result
            audio_to_text(audio_path)
            # The silent video was uploaded inside split_audio_video(); drop the local copy.
            os.remove(f"{os.getcwd()}/{video_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
        continue

*******************************************************Processing video 0 of 1182********************************************************
[youtube] Extracting URL: https://www.youtube.com/watch?v=4Prc1UfuokY
[youtube] 4Prc1UfuokY: Downloading webpage
[youtube] 4Prc1UfuokY: Downloading tv client config
[youtube] 4Prc1UfuokY: Downloading player fded239a-main
[youtube] 4Prc1UfuokY: Downloading tv player API JSON
[youtube] 4Prc1UfuokY: Downloading ios player API JSON
[youtube] 4Prc1UfuokY: Downloading m3u8 information


KeyboardInterrupt: 

In [None]:
# total_duration is accumulated in seconds; dividing by 60 twice converts it to hours.
print(f"Total duration: {total_duration/60/60} hours")

Total duration: 5.056111111111111 hours
