In [22]:
### IMPORTS ###
import pandas as pd
import yt_dlp
import json
import os
from pathlib import Path

In [24]:
# Reference notes on the yt-dlp options this notebook relies on.
# For the complete option list, run: help(yt_dlp.YoutubeDL)
"""
OPTIONS NEEDED FOR YOUR REQUIREMENTS:

1. TITLE - Automatically extracted as info_dict['title']
   
2. AUTHOR - Automatically extracted as info_dict['uploader']
   
3. LINK - The URL you provide to the function
   
4. DURATION - Automatically extracted as info_dict['duration'] (in seconds)

5. SUBTITLES WITH TIMESTAMPS (Japanese & English):
   - writesubtitles: True
     Downloads manually created subtitles
   
   - writeautomaticsub: True  
     Downloads auto-generated subtitles (YouTube's automatic captions)
   
   - subtitleslangs: ['ja', 'en']
     Specifies which subtitle languages to download
     Can use 'all' to download all available languages
   
   - subtitlesformat: 'vtt'  (RECOMMENDED)
     Format for subtitles. Options: 'vtt', 'srt', 'ass', 'lrc'
     VTT (WebVTT) includes timestamps in this format:
         00:00:01.000 --> 00:00:03.000
         Subtitle text here
     
     VTT is best for your use case because:
     - Includes precise timestamps
     - Easy to parse programmatically
     - Well-supported format

6. AUDIO FILE:
   - format: 'bestaudio/best'
     Downloads the best available audio quality
   
   - postprocessors: [{
         'key': 'FFmpegExtractAudio',
         'preferredcodec': 'mp3',      # Output format (mp3, wav, m4a, etc.)
         'preferredquality': '192',    # Audio bitrate (128, 192, 256, 320)
     }]
     Converts audio to MP3 format
   
   With the subtitle timestamps, you can later extract specific 
   audio segments using ffmpeg or pydub

OTHER USEFUL OPTIONS:

- writeinfojson: True
  Saves ALL metadata as a .info.json file including:
  - Full description, tags, categories
  - View count, like count, upload date
  - All available formats and quality info
  - Comments (if getcomments: True)
  
- outtmpl: 'path/%(title)s.%(ext)s'
  Controls output filename. Available variables:
  - %(title)s - Video title
  - %(uploader)s - Channel/uploader name
  - %(id)s - Video ID
  - %(upload_date)s - Upload date (YYYYMMDD)
  - %(duration)s - Duration in seconds
  Example: 'downloads/%(uploader)s/%(title)s_%(id)s.%(ext)s'

- writethumbnail: True
  Downloads the video thumbnail image
  
- quiet: False
  Shows download progress (True hides it)

- getcomments: True (with writeinfojson: True)
  Extracts video comments and saves to JSON
"""

"\nOPTIONS NEEDED FOR YOUR REQUIREMENTS:\n\n1. TITLE - Automatically extracted as info_dict['title']\n\n2. AUTHOR - Automatically extracted as info_dict['uploader']\n\n3. LINK - The URL you provide to the function\n\n4. DURATION - Automatically extracted as info_dict['duration'] (in seconds)\n\n5. SUBTITLES WITH TIMESTAMPS (Japanese & English):\n   - writesubtitles: True\n     Downloads manually created subtitles\n\n   - writeautomaticsub: True  \n     Downloads auto-generated subtitles (YouTube's automatic captions)\n\n   - subtitleslangs: ['ja', 'en']\n     Specifies which subtitle languages to download\n     Can use 'all' to download all available languages\n\n   - subtitlesformat: 'vtt'  (RECOMMENDED)\n     Format for subtitles. Options: 'vtt', 'srt', 'ass', 'lrc'\n     VTT (WebVTT) includes timestamps in this format:\n         00:00:01.000 --> 00:00:03.000\n         Subtitle text here\n\n     VTT is best for your use case because:\n     - Includes precise timestamps\n     - Easy t

In [None]:
def download_media_with_subtitles(url, output_dir='data'):
    """
    Download audio as MP3, Japanese subtitles (VTT), and metadata for a URL.

    Metadata is first extracted without downloading so a short summary can be
    printed; the download itself then runs with the same yt-dlp options.

    Args:
        url        (str): URL of the media to download
        output_dir (str): Directory to save downloaded files (created,
                          including parents, if it does not exist)

    Returns:
        dict | None: Summary of the media with the keys 'title', 'duration',
        'uploader', 'upload_date', 'view_count', 'like_count', 'description',
        'subtitles_available', 'auto_captions_available' and 'output_dir'.
        None is returned (after printing the error) if extraction or
        download raises.
    """

    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # '%' starts a template field in yt-dlp output templates, so a literal
    # '%' in the directory name has to be doubled to survive expansion.
    escaped_dir = str(output_dir).replace('%', '%%')

    # Configure yt-dlp options
    ydl_opts = {
        # Audio settings
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],

        # Output settings
        'outtmpl': f'{escaped_dir}/%(title)s.%(ext)s',

        # Subtitle settings - KEY FOR YOUR USE CASE
        'writesubtitles': True,              # Manual subs
        'writeautomaticsub': True,           # Auto-generated subs
        'subtitleslangs': ['ja'],            # Japanese (use translator to convert to en)
        'subtitlesformat': 'vtt',            # VTT format (includes timestamps)

        # Metadata settings
        'writeinfojson': True,               # Save full metadata as JSON
        'writethumbnail': False,             # Set True if you want thumbnails

        # Rate limiting to avoid 429 errors
        'sleep_interval_requests': 2,        # Sleep 2 seconds between API requests
        'sleep_interval_subtitles': 10,      # Sleep 10 seconds between subtitle downloads
        'sleep_interval': 1,                 # Sleep 1 second before each download

        # Progress settings
        'quiet': False,
        'no_warnings': False,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Extract info without downloading first to check what's available
            info = ydl.extract_info(url, download=False)

            # Display what's available
            print(f"\nTitle: {info.get('title')}")
            print(f"Author: {info.get('uploader', 'N/A')}")
            print(f"Duration: {info.get('duration', 'N/A')} seconds")
            print(f"URL: {url}\n")

            # Check subtitles. `or {}` also covers a key that is present but
            # set to None, which a plain .get(key, {}) default would not.
            manual_subs = list((info.get('subtitles') or {}).keys())
            auto_subs = list((info.get('automatic_captions') or {}).keys())
            print(f"Manual subtitles: {manual_subs if manual_subs else 'None'}")
            if len(auto_subs) > 5:
                # Auto-captions usually cover many languages; show a preview only.
                print(f"Auto-captions available for: {', '.join(auto_subs[:5])}... (and {len(auto_subs)-5} more)")
            else:
                print(f"Auto-captions: {auto_subs if auto_subs else 'None'}")

            # Download everything
            print("\n===== Downloading audio, subtitles, and metadata... =====")
            print("(Adding delays between requests to avoid rate limiting...)\n")

            ydl.download([url])

            result = {
                'title': info.get('title'),
                'duration': info.get('duration'),
                'uploader': info.get('uploader'),
                'upload_date': info.get('upload_date'),
                'view_count': info.get('view_count'),
                'like_count': info.get('like_count'),
                'description': info.get('description'),
                'subtitles_available': manual_subs,
                'auto_captions_available': auto_subs,
                'output_dir': output_dir
            }

            print("\nDownload complete!\n")
            return result
    except Exception as e:
        # Deliberate best-effort for interactive use: report and signal
        # failure with None instead of raising into the notebook.
        print(f"Error: {str(e)}\n")
        return None


In [26]:
# Smoke test for the downloader; swap in any URL yt-dlp supports.
url = "https://www.youtube.com/watch?v=1PPLx2N_YTE"

# Fetch audio, subtitles, and metadata into the 'japanese_media' folder.
result = download_media_with_subtitles(url=url, output_dir='japanese_media')

# The helper returns None on failure, so only report when metadata came back.
if result:
    print("DOWNLOADED MEDIA INFORMATION\n")
    # ensure_ascii=False keeps non-ASCII text readable instead of \u escapes.
    print(json.dumps(result, ensure_ascii=False, indent=2))

[youtube] Extracting URL: https://www.youtube.com/watch?v=1PPLx2N_YTE
[youtube] 1PPLx2N_YTE: Downloading webpage




[youtube] Sleeping 2 seconds ...
[youtube] 1PPLx2N_YTE: Downloading android vr player API JSON
[youtube] Sleeping 2 seconds ...
[youtube] 1PPLx2N_YTE: Downloading ios downgraded player API JSON
[youtube] Sleeping 2 seconds ...
[youtube] 1PPLx2N_YTE: Downloading m3u8 information
[info] 1PPLx2N_YTE: Downloading subtitles: ja

Title: „Äê1hour Podcast„ÄëÊú¨Âêç„ÅØÔºüÁµêÂ©ö„Åó„Å¶„ÇãÔºüËã¶Êâã„Å™„Åì„Å®„ÅØÔºüYUYU„Å´„Å§„ÅÑ„Å¶Ë©±„Åó„Åæ„Åó„ÅüÔºÅ (Japanese Radio for Listening practice)
Author: YUYU„ÅÆÊó•Êú¨Ë™ûPodcast
Duration: 4205 seconds
URL: https://www.youtube.com/watch?v=1PPLx2N_YTE

Manual subtitles: None
Auto-captions available for: ab, aa, af, ak, sq... (and 152 more)

===== Downloading audio, subtitles, and metadata... =====
(Adding delays between requests to avoid rate limiting...)

[youtube] Extracting URL: https://www.youtube.com/watch?v=1PPLx2N_YTE
[youtube] Sleeping 2 seconds ...
[youtube] 1PPLx2N_YTE: Downloading webpage




[youtube] Sleeping 2 seconds ...
[youtube] 1PPLx2N_YTE: Downloading android vr player API JSON
[youtube] Sleeping 2 seconds ...
[youtube] 1PPLx2N_YTE: Downloading ios downgraded player API JSON
[youtube] Sleeping 2 seconds ...
[youtube] 1PPLx2N_YTE: Downloading m3u8 information
[info] 1PPLx2N_YTE: Downloading subtitles: ja
[info] 1PPLx2N_YTE: Downloading 1 format(s): 251
Deleting existing file japanese_media/„Äê1hour Podcast„ÄëÊú¨Âêç„ÅØÔºüÁµêÂ©ö„Åó„Å¶„ÇãÔºüËã¶Êâã„Å™„Åì„Å®„ÅØÔºüYUYU„Å´„Å§„ÅÑ„Å¶Ë©±„Åó„Åæ„Åó„ÅüÔºÅ (Japanese Radio for Listening practice).ja.vtt
[info] Writing video subtitles to: japanese_media/„Äê1hour Podcast„ÄëÊú¨Âêç„ÅØÔºüÁµêÂ©ö„Åó„Å¶„ÇãÔºüËã¶Êâã„Å™„Åì„Å®„ÅØÔºüYUYU„Å´„Å§„ÅÑ„Å¶Ë©±„Åó„Åæ„Åó„ÅüÔºÅ (Japanese Radio for Listening practice).ja.vtt
[download] Sleeping 10.00 seconds ...




[download] Destination: japanese_media/„Äê1hour Podcast„ÄëÊú¨Âêç„ÅØÔºüÁµêÂ©ö„Åó„Å¶„ÇãÔºüËã¶Êâã„Å™„Åì„Å®„ÅØÔºüYUYU„Å´„Å§„ÅÑ„Å¶Ë©±„Åó„Åæ„Åó„ÅüÔºÅ (Japanese Radio for Listening practice).ja.vtt
[download] 100% of  651.10KiB in 00:00:00 at 729.68KiB/s
[info] Writing video metadata as JSON to: japanese_media/„Äê1hour Podcast„ÄëÊú¨Âêç„ÅØÔºüÁµêÂ©ö„Åó„Å¶„ÇãÔºüËã¶Êâã„Å™„Åì„Å®„ÅØÔºüYUYU„Å´„Å§„ÅÑ„Å¶Ë©±„Åó„Åæ„Åó„ÅüÔºÅ (Japanese Radio for Listening practice).info.json
[download] Sleeping 1.00 seconds ...
[download] Destination: japanese_media/„Äê1hour Podcast„ÄëÊú¨Âêç„ÅØÔºüÁµêÂ©ö„Åó„Å¶„ÇãÔºüËã¶Êâã„Å™„Åì„Å®„ÅØÔºüYUYU„Å´„Å§„ÅÑ„Å¶Ë©±„Åó„Åæ„Åó„ÅüÔºÅ (Japanese Radio for Listening practice).webm
[download] 100% of   63.07MiB in 00:00:28 at 2.23MiB/s     
[ExtractAudio] Destination: japanese_media/„Äê1hour Podcast„ÄëÊú¨Âêç„ÅØÔºüÁµêÂ©ö„Åó„Å¶„ÇãÔºüËã¶Êâã„Å™„Åì„Å®„ÅØÔºüYUYU„Å´„Å§„ÅÑ„Å¶Ë©±„Åó„Åæ„Åó„ÅüÔºÅ (Japanese Radio for Listening practice).mp3
Deleting original file japanese_media/„Äê1hour Podcast„ÄëÊú¨Âê