In [38]:
### IMPORTS ###
import pandas as pd
import yt_dlp
import json
import os
from pathlib import Path
import re

import webvtt
from ja_sentence_segmenter.common.pipeline import make_pipeline
from ja_sentence_segmenter.concatenate.simple_concatenator import concatenate_matching
from ja_sentence_segmenter.split.simple_splitter import split_newline, split_punctuation
import functools

In [35]:
# help(yt_dlp.YoutubeDL)
# Detailed explanation of yt-dlp options for your use case
"""
OPTIONS NEEDED FOR YOUR REQUIREMENTS:

1. TITLE - Automatically extracted as info_dict['title']
   
2. AUTHOR - Automatically extracted as info_dict['uploader']
   
3. LINK - The URL you provide to the function
   
4. DURATION - Automatically extracted as info_dict['duration'] (in seconds)

5. SUBTITLES WITH TIMESTAMPS (Japanese & English):
   - writesubtitles: True
     Downloads manually created subtitles
   
   - writeautomaticsub: True  
     Downloads auto-generated subtitles (YouTube's automatic captions)
   
   - subtitleslangs: ['ja', 'en']
     Specifies which subtitle languages to download
     Can use 'all' to download all available languages
   
   - subtitlesformat: 'vtt'  (RECOMMENDED)
     Format for subtitles. Options: 'vtt', 'srt', 'ass', 'lrc'
     VTT (WebVTT) includes timestamps in this format:
         00:00:01.000 --> 00:00:03.000
         Subtitle text here
     
     VTT is best for your use case because:
     - Includes precise timestamps
     - Easy to parse programmatically
     - Well-supported format

6. AUDIO FILE:
   - format: 'bestaudio/best'
     Downloads the best available audio quality
   
   - postprocessors: [{
         'key': 'FFmpegExtractAudio',
         'preferredcodec': 'mp3',      # Output format (mp3, wav, m4a, etc.)
         'preferredquality': '192',    # Audio bitrate (128, 192, 256, 320)
     }]
     Converts audio to MP3 format
   
   With the subtitle timestamps, you can later extract specific 
   audio segments using ffmpeg or pydub

OTHER USEFUL OPTIONS:

- writeinfojson: True
  Saves ALL metadata as a .info.json file including:
  - Full description, tags, categories
  - View count, like count, upload date
  - All available formats and quality info
  - Comments (if getcomments: True)
  
- outtmpl: 'path/%(title)s.%(ext)s'
  Controls output filename. Available variables:
  - %(title)s - Video title
  - %(uploader)s - Channel/uploader name
  - %(id)s - Video ID
  - %(upload_date)s - Upload date (YYYYMMDD)
  - %(duration)s - Duration in seconds
  Example: 'downloads/%(uploader)s/%(title)s_%(id)s.%(ext)s'

- writethumbnail: True
  Downloads the video thumbnail image
  
- quiet: False
  Shows download progress (True hides it)

- getcomments: True (with writeinfojson: True)
  Extracts video comments and saves to JSON
"""

"\nOPTIONS NEEDED FOR YOUR REQUIREMENTS:\n\n1. TITLE - Automatically extracted as info_dict['title']\n\n2. AUTHOR - Automatically extracted as info_dict['uploader']\n\n3. LINK - The URL you provide to the function\n\n4. DURATION - Automatically extracted as info_dict['duration'] (in seconds)\n\n5. SUBTITLES WITH TIMESTAMPS (Japanese & English):\n   - writesubtitles: True\n     Downloads manually created subtitles\n\n   - writeautomaticsub: True  \n     Downloads auto-generated subtitles (YouTube's automatic captions)\n\n   - subtitleslangs: ['ja', 'en']\n     Specifies which subtitle languages to download\n     Can use 'all' to download all available languages\n\n   - subtitlesformat: 'vtt'  (RECOMMENDED)\n     Format for subtitles. Options: 'vtt', 'srt', 'ass', 'lrc'\n     VTT (WebVTT) includes timestamps in this format:\n         00:00:01.000 --> 00:00:03.000\n         Subtitle text here\n\n     VTT is best for your use case because:\n     - Includes precise timestamps\n     - Easy t

In [None]:
def download_media_with_subtitles(url, output_dir='data'):
    """
    Download audio as MP3, subtitles, and metadata from a given URL.
    
    Args:
        url        (str): URL of the media to download
        output_dir (str): Directory to save downloaded files
    
    Returns:
        result: Information about downloaded files
    """
    
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)     
    
    # Configure yt-dlp options
    ydl_opts = {
        # Audio settings
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        
        # Output settings
        'outtmpl': f'{output_dir}/%(title)s.%(ext)s',
        
        # Subtitle settings - KEY FOR YOUR USE CASE
        'writesubtitles': True,              # Manual subs
        'writeautomaticsub': True,           # Auto-generated subs
        'subtitleslangs': ['ja'],            # Japanese (use translator to convert to en)
        'subtitlesformat': 'vtt',            # VTT format (includes timestamps)
        
        # Metadata settings
        'writeinfojson': True,               # Save full metadata as JSON
        'writethumbnail': False,             # Set True if you want thumbnails
        
        # Rate limiting to avoid 429 errors
        'sleep_interval_requests': 2,        # Sleep 2 seconds between API requests
        'sleep_interval_subtitles': 10,      # Sleep 10 seconds between subtitle downloads
        'sleep_interval': 1,                 # Sleep 1 second before each download
        
        # Progress settings
        'quiet': False,
        'no_warnings': False,
    }
    
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Extract info without downloading first to check what's available
            info = ydl.extract_info(url, download=False)

            # Display what's available
            print(f"\nTitle: {info.get('title')}")
            print(f"Author: {info.get('uploader', 'N/A')}")
            print(f"Duration: {info.get('duration', 'N/A')} seconds")
            print(f"URL: {url}\n")
            
            # Check subtitles
            manual_subs = list(info.get('subtitles', {}).keys())
            auto_subs = list(info.get('automatic_captions', {}).keys())
            print(f"Manual subtitles: {manual_subs if manual_subs else 'None'}")
            print(f"Auto-captions available for: {', '.join(auto_subs[:5])}... (and {len(auto_subs)-5} more)" if len(auto_subs) > 5 else f"Auto-captions: {auto_subs}\n")
            
            # Download everything
            print("\n===== Downloading audio, subtitles, and metadata... =====")
            print("(Adding delays between requests to avoid rate limiting...)\n")     

            ydl.download([url])
            
            result = {
                'title': info.get('title'),
                'duration': info.get('duration'),
                'uploader': info.get('uploader'),
                'upload_date': info.get('upload_date'),
                'view_count': info.get('view_count'),
                'like_count': info.get('like_count'),
                'description': info.get('description'),
                'subtitles_available': list(info.get('subtitles', {}).keys()),
                'auto_captions_available': list(info.get('automatic_captions', {}).keys()),
                'output_dir': output_dir
            }
            
            print("\nDownload complete!\n")
            return result
    except Exception as e:
        print(f"Error: {str(e)}\n")
        return None


In [50]:
# Test the function with a URL
# Replace with your desired URL
# url = "https://www.youtube.com/watch?v=1PPLx2N_YTE" # YUYU podcast test, auto gen subs
# url = "https://www.youtube.com/watch?v=hkeegM5t-lw" # Jiro podcast test, manual subs
# url = "https://open.spotify.com/track/7DyS11kB1YNrfDzQqtjmTh?si=24a7621a3a24483c" # spotify song
url = "https://www.youtube.com/watch?v=1mz-A--mANU"

# Download audio, subtitles, and metadata
result = download_media_with_subtitles(url, output_dir='japanese_media')

# Display the metadata
if result:
    #print("\\n\" + \"=\"*50")
    print("DOWNLOADED MEDIA INFORMATION\n")
    #print("=\"*50")
    print(json.dumps(result, indent=2, ensure_ascii=False))

[youtube] Extracting URL: https://www.youtube.com/watch?v=1mz-A--mANU
[youtube] 1mz-A--mANU: Downloading webpage




[youtube] Sleeping 2 seconds ...
[youtube] 1mz-A--mANU: Downloading android vr player API JSON
[youtube] Sleeping 2 seconds ...
[youtube] 1mz-A--mANU: Downloading ios downgraded player API JSON
[youtube] Sleeping 2 seconds ...
[youtube] 1mz-A--mANU: Downloading m3u8 information
[info] 1mz-A--mANU: Downloading subtitles: ja

Title: 平井 大 / Slow & Easy（Music Video）
Author: 平井 大 HIRAIDAI
Duration: 222 seconds
URL: https://www.youtube.com/watch?v=1mz-A--mANU

Manual subtitles: ['ja']
Auto-captions available for: ja, ab-ja, aa-ja, af-ja, ak-ja... (and 152 more)

===== Downloading audio, subtitles, and metadata... =====
(Adding delays between requests to avoid rate limiting...)

[youtube] Extracting URL: https://www.youtube.com/watch?v=1mz-A--mANU
[youtube] Sleeping 2 seconds ...
[youtube] 1mz-A--mANU: Downloading webpage




[youtube] Sleeping 2 seconds ...
[youtube] 1mz-A--mANU: Downloading android vr player API JSON
[youtube] Sleeping 2 seconds ...
[youtube] 1mz-A--mANU: Downloading ios downgraded player API JSON
[youtube] Sleeping 2 seconds ...
[youtube] 1mz-A--mANU: Downloading m3u8 information
[info] 1mz-A--mANU: Downloading subtitles: ja
[info] 1mz-A--mANU: Downloading 1 format(s): 251
[info] Writing video subtitles to: japanese_media/平井 大 ⧸ Slow & Easy（Music Video）.ja.vtt
[download] Sleeping 10.00 seconds ...




[download] Destination: japanese_media/平井 大 ⧸ Slow & Easy（Music Video）.ja.vtt
[download] 100% of    3.31KiB in 00:00:00 at 35.56KiB/s
[info] Writing video metadata as JSON to: japanese_media/平井 大 ⧸ Slow & Easy（Music Video）.info.json
[download] Sleeping 1.00 seconds ...
[download] Destination: japanese_media/平井 大 ⧸ Slow & Easy（Music Video）.webm
[download] 100% of    3.58MiB in 00:00:01 at 2.19MiB/s   
[ExtractAudio] Destination: japanese_media/平井 大 ⧸ Slow & Easy（Music Video）.mp3
Deleting original file japanese_media/平井 大 ⧸ Slow & Easy（Music Video）.webm (pass -k to keep)

Download complete!

DOWNLOADED MEDIA INFORMATION

{
  "title": "平井 大 / Slow & Easy（Music Video）",
  "duration": 222,
  "uploader": "平井 大 HIRAIDAI",
  "upload_date": "20181220",
  "view_count": 19151280,
  "like_count": 85267,
  "description": "平井 大の楽曲をチェック！\nhttps://DaiHirai.lnk.to/toppage\n\nFOLLOW \"HIRAIDAI\"\nWebsite: https://hiraidai.com/\nInstagram: https://www.instagram.com/hirai_dai_official/\nTwitter: https://t

In [53]:
### CREATING THE vtt PARSER ###
import re
from pathlib import Path

In [155]:
# 1.) vtt cleaner

# Remove empty lines
# Remove WEBVTT, Kind: captions, Language: ja lines (always first 3)
# Remove all lines with "<c>" or "</c>" since these are bad format and are repeatted in 
# next line anyways

# Output to (cleaned)+path
def vtt_clean(input_file):
    input_path = Path(input_file)

    if not input_path.exists():
        raise FileNotFoundError(F"File not found: {input_path}")

    # Read full file into content buffer
    content = input_path.read_text(encoding='utf-8')

    # Process
    re.sub(r"\n\n\n|\n\n", "\n", content) # Remove extra line breaks
    content = '\n'.join( # Remove all lines that have tags (they are repeats)
        line for line in content.split('\n') 
        if '<c>' not in line 
        and '</c>' not in line
        and 'WEBVTT' not in line
        and 'Kind: captions' not in line
        and 'Language: ja' not in line
    )

    content = content.split("\n") # Create array of all the lines with content
    content = [item for item in content if item not in ('', ' ')]

    window = 1
    cur_sent = ""
    c_len = len(content)
    data = []
    for i in range(0, c_len, window):
        if "-->" not in content[i]:
            continue

        if "-->" in content[i+1]:
            continue

        start_time, end_time = parse_timeframe(content[i])
        if (end_time - start_time) < .5:
            continue
        else:
            cur_sent = content[i+1]
            print(f"Window[0] {content[i]}\nWindow[1] {content[i+1]}\n")
            data.append({
                'start_time': start_time,
                'end_time': end_time,
                'text': content[i+1]
            })
    return data

In [156]:
def parse_timeframe(timeframe):
    # Remove any "-->" or align:start position:0% lines
    timeframe = re.sub(r"align.*", "", timeframe)
    timeframe = re.sub(r" --> ", ":", timeframe)
    timeframe = timeframe.split(":")
    s_hour = int(timeframe[0])
    s_min  = int(timeframe[1])
    s_sec  = float(timeframe[2])
    e_hour = int(timeframe[3])
    e_min  = int(timeframe[4])
    e_sec  = float(timeframe[5])

    start_time = round(((3600 * s_hour) + (60 * s_min) + s_sec), 2)
    end_time   = round(((3600 * e_hour) + (60 * e_min) + e_sec), 2)
    #print(timeframe)
    #print(F"start_time: {start_time}\nend_time: {end_time}")
    return start_time, end_time

In [157]:
s_test, e_test = parse_timeframe("00:00:02.230 --> 00:00:02.240 align:start position:0%")
e_test

2.24

In [161]:
#data = vtt_clean("/home/evanxavierhu/Documents/VSCode_Local_Projects/JL_Project/Japanese_Lang_Project/japanese_media/【1hour Podcast】本名は？結婚してる？苦手なことは？YUYUについて話しました！ (Japanese Radio for Listening practice).ja.vtt")
#data = vtt_clean("/home/evanxavierhu/Documents/VSCode_Local_Projects/JL_Project/Japanese_Lang_Project/japanese_media/【Japanese Podcast】I Got This! - Master 706 Essential Words Without Even Noticing.ja.vtt")
data = vtt_clean("/home/evanxavierhu/Documents/VSCode_Local_Projects/JL_Project/Japanese_Lang_Project/japanese_media/平井 大 ⧸ Slow & Easy（Music Video）.ja.vtt")

Window[0] 00:00:16.720 --> 00:00:21.220
Window[1] Oh my 心地いい風と ほら Sunshine

Window[0] 00:00:21.220 --> 00:00:24.360
Window[1] 浜辺に忘れてきた Smile

Window[0] 00:00:24.360 --> 00:00:26.840
Window[1] 拾い集めにいこう

Window[0] 00:00:27.560 --> 00:00:29.120
Window[1] のんびりいこうよ

Window[0] 00:00:29.120 --> 00:00:33.500
Window[1] I know...でも焦りすぎなんじゃないの？

Window[0] 00:00:33.500 --> 00:00:36.636
Window[1] 回り見渡してごらんよ

Window[0] 00:00:36.640 --> 00:00:39.120
Window[1] 答えはいつだって

Window[0] 00:00:39.800 --> 00:00:41.380
Window[1] 近くにあるよ

Window[0] 00:00:41.380 --> 00:00:43.600
Window[1] 前だけ見てたら

Window[0] 00:00:43.600 --> 00:00:46.679
Window[1] 愛すべき人や風景に

Window[0] 00:00:46.679 --> 00:00:48.948
Window[1] 気づけないから

Window[0] 00:00:50.260 --> 00:00:53.980
Window[1] Let’s take it slow and easy together

Window[0] 00:00:53.980 --> 00:00:57.214
Window[1] キミと眺める海は透き通って

Window[0] 00:00:57.220 --> 00:01:00.300
Window[1] 太陽はいつもより眩しくて

Window[0] 00:01:00.300 --> 00:01:03.380
Window[1] 幸せは「作るもの」じゃなくて

Window[0] 00:01:03.38

In [162]:
data

[{'start_time': 16.72, 'end_time': 21.22, 'text': 'Oh my 心地いい風と ほら Sunshine'},
 {'start_time': 21.22, 'end_time': 24.36, 'text': '浜辺に忘れてきた Smile'},
 {'start_time': 24.36, 'end_time': 26.84, 'text': '拾い集めにいこう'},
 {'start_time': 27.56, 'end_time': 29.12, 'text': 'のんびりいこうよ'},
 {'start_time': 29.12, 'end_time': 33.5, 'text': 'I know...でも焦りすぎなんじゃないの？'},
 {'start_time': 33.5, 'end_time': 36.64, 'text': '回り見渡してごらんよ'},
 {'start_time': 36.64, 'end_time': 39.12, 'text': '答えはいつだって'},
 {'start_time': 39.8, 'end_time': 41.38, 'text': '近くにあるよ'},
 {'start_time': 41.38, 'end_time': 43.6, 'text': '前だけ見てたら'},
 {'start_time': 43.6, 'end_time': 46.68, 'text': '愛すべき人や風景に'},
 {'start_time': 46.68, 'end_time': 48.95, 'text': '気づけないから'},
 {'start_time': 50.26,
  'end_time': 53.98,
  'text': 'Let’s take it slow and easy together'},
 {'start_time': 53.98, 'end_time': 57.21, 'text': 'キミと眺める海は透き通って'},
 {'start_time': 57.22, 'end_time': 60.3, 'text': '太陽はいつもより眩しくて'},
 {'start_time': 60.3, 'end_time': 63.38, 'text'