In [26]:
### IMPORTS ###
import pandas as pd
import yt_dlp
import json
import os
from pathlib import Path
import re

import webvtt
from ja_sentence_segmenter.common.pipeline import make_pipeline
from ja_sentence_segmenter.concatenate.simple_concatenator import concatenate_matching
from ja_sentence_segmenter.split.simple_splitter import split_newline, split_punctuation
import functools

In [27]:
# help(yt_dlp.YoutubeDL)
# Detailed explanation of yt-dlp options for your use case
"""
OPTIONS NEEDED FOR YOUR REQUIREMENTS:

1. TITLE - Automatically extracted as info_dict['title']
   
2. AUTHOR - Automatically extracted as info_dict['uploader']
   
3. LINK - The URL you provide to the function
   
4. DURATION - Automatically extracted as info_dict['duration'] (in seconds)

5. SUBTITLES WITH TIMESTAMPS (Japanese & English):
   - writesubtitles: True
     Downloads manually created subtitles
   
   - writeautomaticsub: True  
     Downloads auto-generated subtitles (YouTube's automatic captions)
   
   - subtitleslangs: ['ja', 'en']
     Specifies which subtitle languages to download
     Can use 'all' to download all available languages
   
   - subtitlesformat: 'vtt'  (RECOMMENDED)
     Format for subtitles. Options: 'vtt', 'srt', 'ass', 'lrc'
     VTT (WebVTT) includes timestamps in this format:
         00:00:01.000 --> 00:00:03.000
         Subtitle text here
     
     VTT is best for your use case because:
     - Includes precise timestamps
     - Easy to parse programmatically
     - Well-supported format

6. AUDIO FILE:
   - format: 'bestaudio/best'
     Downloads the best available audio quality
   
   - postprocessors: [{
         'key': 'FFmpegExtractAudio',
         'preferredcodec': 'mp3',      # Output format (mp3, wav, m4a, etc.)
         'preferredquality': '192',    # Audio bitrate (128, 192, 256, 320)
     }]
     Converts audio to MP3 format
   
   With the subtitle timestamps, you can later extract specific 
   audio segments using ffmpeg or pydub

OTHER USEFUL OPTIONS:

- writeinfojson: True
  Saves ALL metadata as a .info.json file including:
  - Full description, tags, categories
  - View count, like count, upload date
  - All available formats and quality info
  - Comments (if getcomments: True)
  
- outtmpl: 'path/%(title)s.%(ext)s'
  Controls output filename. Available variables:
  - %(title)s - Video title
  - %(uploader)s - Channel/uploader name
  - %(id)s - Video ID
  - %(upload_date)s - Upload date (YYYYMMDD)
  - %(duration)s - Duration in seconds
  Example: 'downloads/%(uploader)s/%(title)s_%(id)s.%(ext)s'

- writethumbnail: True
  Downloads the video thumbnail image
  
- quiet: False
  Shows download progress (True hides it)

- getcomments: True (with writeinfojson: True)
  Extracts video comments and saves to JSON
"""

"\nOPTIONS NEEDED FOR YOUR REQUIREMENTS:\n\n1. TITLE - Automatically extracted as info_dict['title']\n\n2. AUTHOR - Automatically extracted as info_dict['uploader']\n\n3. LINK - The URL you provide to the function\n\n4. DURATION - Automatically extracted as info_dict['duration'] (in seconds)\n\n5. SUBTITLES WITH TIMESTAMPS (Japanese & English):\n   - writesubtitles: True\n     Downloads manually created subtitles\n\n   - writeautomaticsub: True  \n     Downloads auto-generated subtitles (YouTube's automatic captions)\n\n   - subtitleslangs: ['ja', 'en']\n     Specifies which subtitle languages to download\n     Can use 'all' to download all available languages\n\n   - subtitlesformat: 'vtt'  (RECOMMENDED)\n     Format for subtitles. Options: 'vtt', 'srt', 'ass', 'lrc'\n     VTT (WebVTT) includes timestamps in this format:\n         00:00:01.000 --> 00:00:03.000\n         Subtitle text here\n\n     VTT is best for your use case because:\n     - Includes precise timestamps\n     - Easy t

In [28]:
def download_media_with_subtitles(url, output_dir='data'):
    """
    Download audio as MP3, Japanese subtitles (VTT), and metadata from a URL.

    The media info is first extracted without downloading so that the title,
    uploader, duration and available subtitle languages can be printed, then
    the actual download is started with the same options.

    Args:
        url        (str): URL of the media to download
        output_dir (str): Directory to save downloaded files (created if missing)

    Returns:
        dict | None: Summary of the media with the keys 'title', 'duration',
        'uploader', 'upload_date', 'view_count', 'like_count', 'description',
        'subtitles_available', 'auto_captions_available' and 'output_dir'.
        None is returned (after printing the error) if any step fails.
    """

    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Configure yt-dlp options
    ydl_opts = {
        # Audio settings
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],

        # Output settings
        'outtmpl': f'{output_dir}/%(title)s.%(ext)s',

        # Subtitle settings
        'writesubtitles': True,              # Manual subs
        'writeautomaticsub': True,           # Auto-generated subs
        'subtitleslangs': ['ja'],            # Japanese (use translator to convert to en)
        'subtitlesformat': 'vtt',            # VTT format (includes timestamps)

        # Metadata settings
        'writeinfojson': True,               # Save full metadata as JSON
        'writethumbnail': False,             # Set True if you want thumbnails

        # Rate limiting to avoid 429 errors
        'sleep_interval_requests': 2,        # Sleep 2 seconds between API requests
        'sleep_interval_subtitles': 10,      # Sleep 10 seconds between subtitle downloads
        'sleep_interval': 1,                 # Sleep 1 second before each download

        # Progress settings
        'quiet': False,
        'no_warnings': False,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Extract info without downloading first to check what's available
            info = ydl.extract_info(url, download=False)

            # `or {}` tolerates both a missing key and an explicit None value,
            # either of which would otherwise break the `.keys()` call.
            manual_subs = list((info.get('subtitles') or {}).keys())
            auto_subs = list((info.get('automatic_captions') or {}).keys())

            # Display what's available
            print(f"\nTitle: {info.get('title')}")
            print(f"Author: {info.get('uploader', 'N/A')}")
            print(f"Duration: {info.get('duration', 'N/A')} seconds")
            print(f"URL: {url}\n")

            print(f"Manual subtitles: {manual_subs if manual_subs else 'None'}")
            if len(auto_subs) > 5:
                # Auto-captions can cover well over a hundred languages; show a sample only
                print(f"Auto-captions available for: {', '.join(auto_subs[:5])}... (and {len(auto_subs)-5} more)")
            else:
                print(f"Auto-captions: {auto_subs}\n")

            # Download everything
            print("\n===== Downloading audio, subtitles, and metadata... =====")
            print("(Adding delays between requests to avoid rate limiting...)\n")

            ydl.download([url])

            result = {
                'title': info.get('title'),
                'duration': info.get('duration'),
                'uploader': info.get('uploader'),
                'upload_date': info.get('upload_date'),
                'view_count': info.get('view_count'),
                'like_count': info.get('like_count'),
                'description': info.get('description'),
                'subtitles_available': manual_subs,
                'auto_captions_available': auto_subs,
                'output_dir': output_dir
            }

            print("\nDownload complete!\n")
            return result
    except Exception as e:
        # Best-effort helper for interactive use: report the problem and signal
        # failure with None instead of raising.
        print(f"Error: {str(e)}\n")
        return None


In [5]:
# Try the downloader on a single URL.
# Alternative test inputs (uncomment one to use it instead of the active `url`):
# url = "https://www.youtube.com/watch?v=1PPLx2N_YTE" # YUYU podcast test, auto gen subs
# url = "https://www.youtube.com/watch?v=hkeegM5t-lw" # Jiro podcast test, manual subs
# url = "https://open.spotify.com/track/7DyS11kB1YNrfDzQqtjmTh?si=24a7621a3a24483c" # spotify song
url = "https://www.youtube.com/watch?v=1mz-A--mANU"

# Download audio, subtitles, and metadata into ./japanese_media
result = download_media_with_subtitles(url, output_dir='japanese_media')

# download_media_with_subtitles returns None on failure, so only print on success
if result:
    print("DOWNLOADED MEDIA INFORMATION\n")
    # ensure_ascii=False keeps Japanese text readable instead of \uXXXX escapes
    print(json.dumps(result, indent=2, ensure_ascii=False))

[youtube] Extracting URL: https://www.youtube.com/watch?v=1mz-A--mANU
[youtube] 1mz-A--mANU: Downloading webpage




[youtube] Sleeping 2 seconds ...
[youtube] 1mz-A--mANU: Downloading android vr player API JSON
[youtube] Sleeping 2 seconds ...
[youtube] 1mz-A--mANU: Downloading ios downgraded player API JSON
[youtube] Sleeping 2 seconds ...
[youtube] 1mz-A--mANU: Downloading m3u8 information
[info] 1mz-A--mANU: Downloading subtitles: ja

Title: 平井 大 / Slow & Easy（Music Video）
Author: 平井 大 HIRAIDAI
Duration: 222 seconds
URL: https://www.youtube.com/watch?v=1mz-A--mANU

Manual subtitles: ['ja']
Auto-captions available for: ja, ab-ja, aa-ja, af-ja, ak-ja... (and 152 more)

===== Downloading audio, subtitles, and metadata... =====
(Adding delays between requests to avoid rate limiting...)

[youtube] Extracting URL: https://www.youtube.com/watch?v=1mz-A--mANU
[youtube] Sleeping 2 seconds ...
[youtube] 1mz-A--mANU: Downloading webpage




[youtube] Sleeping 2 seconds ...
[youtube] 1mz-A--mANU: Downloading android vr player API JSON
[youtube] Sleeping 2 seconds ...
[youtube] 1mz-A--mANU: Downloading ios downgraded player API JSON
[youtube] Sleeping 2 seconds ...
[youtube] 1mz-A--mANU: Downloading m3u8 information
[info] 1mz-A--mANU: Downloading subtitles: ja
[info] 1mz-A--mANU: Downloading 1 format(s): 251
Deleting existing file japanese_media/平井 大 ⧸ Slow & Easy（Music Video）.ja.vtt
[info] Writing video subtitles to: japanese_media/平井 大 ⧸ Slow & Easy（Music Video）.ja.vtt
[download] Sleeping 10.00 seconds ...




[download] Destination: japanese_media/平井 大 ⧸ Slow & Easy（Music Video）.ja.vtt
[download] 100% of    3.31KiB in 00:00:00 at 34.86KiB/s
[info] Writing video metadata as JSON to: japanese_media/平井 大 ⧸ Slow & Easy（Music Video）.info.json
[download] Sleeping 1.00 seconds ...
[download] Destination: japanese_media/平井 大 ⧸ Slow & Easy（Music Video）.webm
[download] 100% of    3.58MiB in 00:00:02 at 1.58MiB/s   
[ExtractAudio] Destination: japanese_media/平井 大 ⧸ Slow & Easy（Music Video）.mp3
Deleting original file japanese_media/平井 大 ⧸ Slow & Easy（Music Video）.webm (pass -k to keep)

Download complete!

DOWNLOADED MEDIA INFORMATION

{
  "title": "平井 大 / Slow & Easy（Music Video）",
  "duration": 222,
  "uploader": "平井 大 HIRAIDAI",
  "upload_date": "20181220",
  "view_count": 19170623,
  "like_count": 85333,
  "description": "平井 大の楽曲をチェック！\nhttps://DaiHirai.lnk.to/toppage\n\nFOLLOW \"HIRAIDAI\"\nWebsite: https://hiraidai.com/\nInstagram: https://www.instagram.com/hirai_dai_official/\nTwitter: https://t

In [29]:
### CREATING THE vtt PARSER ###
import re
from pathlib import Path

In [30]:
# 1.) vtt cleaner

# Remove empty lines
# Remove WEBVTT, Kind: captions, Language: ja lines (always first 3)
# Remove all lines containing "<c>" or "</c>": these carry inline timing tags and
# their text is repeated, untagged, in a following cue anyway

# Output to (cleaned)+path
def vtt_clean(input_file, min_duration=0.5):
    """
    Parse a WebVTT subtitle file into a list of timed text segments.

    Lines are dropped when they are blank/whitespace-only, contain the header
    markers ('WEBVTT', 'Kind: captions', 'Language: ja'), or contain inline
    '<c>' / '</c>' tags. For every remaining cue-timing line (one containing
    '-->'), the line directly after it is taken as the cue text; only that
    first text line is kept.

    Args:
        input_file (str | Path): Path to the .vtt file.
        min_duration (float): Cues shorter than this many seconds are skipped.
            Defaults to 0.5.

    Returns:
        list[dict]: One dict per kept cue with the keys 'start_time' and
        'end_time' (seconds, as returned by parse_timeframe) and 'text'.

    Raises:
        FileNotFoundError: If input_file does not exist.
    """
    input_path = Path(input_file)

    if not input_path.exists():
        raise FileNotFoundError(F"File not found: {input_path}")

    raw_lines = input_path.read_text(encoding='utf-8').split('\n')

    # Substrings that mark a line as header metadata or as a tagged duplicate
    unwanted_markers = ('<c>', '</c>', 'WEBVTT', 'Kind: captions', 'Language: ja')
    lines = [
        line for line in raw_lines
        if line.strip()
        and not any(marker in line for marker in unwanted_markers)
    ]

    data = []
    for i, line in enumerate(lines):
        if "-->" not in line:
            continue

        # A cue has no usable text when its timing line is the last line left
        # in the file or is immediately followed by the next cue's timing line.
        if i + 1 >= len(lines) or "-->" in lines[i + 1]:
            continue

        start_time, end_time = parse_timeframe(line)
        if (end_time - start_time) < min_duration:
            continue

        data.append({
            'start_time': start_time,
            'end_time': end_time,
            'text': lines[i + 1]
        })
    return data

In [None]:
def parse_timeframe(timeframe):
    """
    Convert a WebVTT cue-timing line into start and end times in seconds.

    Accepts lines such as
    "00:00:02.230 --> 00:00:02.240 align:start position:0%". Anything after
    the end timestamp (cue settings like align/position/line/size) is ignored,
    and timestamps may omit the hours field ("mm:ss.ttt").

    Args:
        timeframe (str): The cue-timing line containing "-->".

    Returns:
        tuple[float, float]: (start_time, end_time) in seconds, each rounded
        to 2 decimal places.

    Raises:
        ValueError: If the line has no "-->" separator or a timestamp is malformed.
    """
    start_raw, arrow, remainder = timeframe.partition("-->")
    if not arrow:
        raise ValueError(f"Not a cue timing line: {timeframe!r}")

    # The end timestamp is the first whitespace-delimited token after the arrow;
    # every later token is a cue setting and is irrelevant for timing.
    end_tokens = remainder.split()
    if not end_tokens:
        raise ValueError(f"Missing end timestamp: {timeframe!r}")

    start_time = _timestamp_to_seconds(start_raw.strip())
    end_time = _timestamp_to_seconds(end_tokens[0])
    return start_time, end_time


def _timestamp_to_seconds(stamp):
    """Convert "hh:mm:ss.ttt" or "mm:ss.ttt" into seconds rounded to 2 decimals."""
    fields = stamp.split(":")
    if len(fields) == 3:
        hours, minutes, seconds = fields
    elif len(fields) == 2:
        hours = 0
        minutes, seconds = fields
    else:
        raise ValueError(f"Malformed timestamp: {stamp!r}")
    return round((3600 * int(hours)) + (60 * int(minutes)) + float(seconds), 2)

In [9]:
# Sanity check on a 10 ms cue with trailing cue settings: the end time should be 2.24 s
s_test, e_test = parse_timeframe("00:00:02.230 --> 00:00:02.240 align:start position:0%")
e_test

2.24

In [13]:
# Parse three downloaded Japanese subtitle files into timed segments.
# Paths are relative to the notebook's working directory instead of an absolute
# home-directory path, so the notebook also runs on other machines. This is the
# same 'japanese_media' folder that download_media_with_subtitles is pointed at.
MEDIA_DIR = Path("japanese_media")

data1 = vtt_clean(MEDIA_DIR / "【1hour Podcast】本名は？結婚してる？苦手なことは？YUYUについて話しました！ (Japanese Radio for Listening practice).ja.vtt")
data2 = vtt_clean(MEDIA_DIR / "【Japanese Podcast】I Got This! - Master 706 Essential Words Without Even Noticing.ja.vtt")
data3 = vtt_clean(MEDIA_DIR / "平井 大 ⧸ Slow & Easy（Music Video）.ja.vtt")

In [33]:
data1

[{'start_time': 2.24, 'end_time': 4.47, 'text': 'はい、皆さんこんにちは。ゆの日本語'},
 {'start_time': 4.48, 'end_time': 7.07, 'text': 'ポッドキャストのお時間です。皆さん元気'},
 {'start_time': 7.08, 'end_time': 9.83, 'text': 'にしていますでしょうか?今日は久しぶり'},
 {'start_time': 9.84, 'end_time': 14.15, 'text': 'に1時間ポッドキャストにチャレンジし'},
 {'start_time': 14.16, 'end_time': 17.11, 'text': 'たいと思います。テーマはゆについて、あ'},
 {'start_time': 17.12, 'end_time': 19.95, 'text': '、ま、つまり僕についてですね。ま、'},
 {'start_time': 19.96, 'end_time': 23.31, 'text': 'おかげ様でYouTubeの登録者数が'},
 {'start_time': 23.32, 'end_time': 26.59, 'text': 'もうすぐ50万人を迎えてそれだけじゃ'},
 {'start_time': 26.6, 'end_time': 29.71, 'text': 'なくてですね、チャンネル運営が5年目を'},
 {'start_time': 29.72, 'end_time': 32.23, 'text': '迎える、ま、そういった年でもあるんです'},
 {'start_time': 32.24, 'end_time': 35.11, 'text': 'よ。まあ、つまり簡単に言うと5年前に'},
 {'start_time': 35.12, 'end_time': 38.91, 'text': 'ゆuの日本語ポッドキャストを始めてで、'},
 {'start_time': 38.92, 'end_time': 42.31, 'text': 'もう5年が経ったということで、ま、その'},
 {'start_time': 42.32, 'end_time': 44.19, 'text': '始めた

In [14]:
# Wrap each parsed cue list in a DataFrame (one row per cue)
data1_df, data2_df, data3_df = (
    pd.DataFrame(cues) for cues in (data1, data2, data3)
)

In [25]:
data1_df

Unnamed: 0,start_time,end_time,text
0,2.24,4.47,はい、皆さんこんにちは。ゆの日本語
1,4.48,7.07,ポッドキャストのお時間です。皆さん元気
2,7.08,9.83,にしていますでしょうか?今日は久しぶり
3,9.84,14.15,に1時間ポッドキャストにチャレンジし
4,14.16,17.11,たいと思います。テーマはゆについて、あ
...,...,...,...
1347,4192.44,4194.15,ポッドキャストを聞き終わった後、え、別
1348,4194.16,4196.19,のお気に入りのポッドキャストだったりと
1349,4196.20,4199.03,か最近のエピソードを聞いてくれると
1350,4199.04,4201.19,すごく嬉しいです。はい、ということで皆


In [None]:
# Further Data Cleaning
# Taking segmented subtitle data and cleaning it to make sentences complete

# Pseudocode:
# For normal videos: Punctuations are included ("。", "、", "～", "？", "！")
# 1.) Need to use a sliding window, but it has to be variable.  (based on sent)
# 2.) Use the ja_sentence_segmenter to split the text into sentences
# Case 1: Block is a complete sentence. (rare)
#      No need to parse the text --> just put in to data format
#      No need to change the times
# Case 2: Block was split into multiple sentences
#  2a: First sentence is not complete 
#      (WE KNOW IF THE FIRST SENTENCE IS NOT COMPLETE BASED ON THE PREVIOUS BLOCK
#       IF THE PREVIOUS BLOCK HAD AN INCOMPLETE SENT THEN WE ASSUME THE CURRENT FIRST
#       BLOCK IS THE RUNON FROM THE PREVIOUS, ELSE IT IS NOT A RUNON AND IS COMPLETE)
#      Remove it
#  2b: First sentence --> Last sentence as all complete: 
#      All sentences --> data format
#      Put the same time frame for all
#  2c: Last sentence is not complete:
#      incomplete_sent = last_sent
#      Then take the first sentence from the next segment and
#      (incomplete_sent + cur_sent) --> data format
#      (Prev_time_frame.start_time, cur_time_frame.end_time)
#
#      Else, if the next segment does not contain a full sentence:
#      incomplete_sent = incomplete_sent + cur_sent --> continue without inputting
#

In [47]:
# Probe the sentence-splitting regex on a fragment that has no sentence-final
# punctuation (。？！): nothing matches, so the text comes back as a single piece.
re.split(r'(?<=[。？！])', "たいと思いますテーマはゆについて、あ")

['たいと思いますテーマはゆについて、あ']

In [51]:
# Map to data helper function
def sent_to_data(sent_id, text, start_time, end_time):
    """
    Build the record for one sentence.

    The id, timing and text come from the arguments; the annotation fields
    ('jlpt_level', 'grammar', 'vocabulary') start out as empty strings.
    """
    record = {
        'sentence_id': sent_id,
        'start_time': start_time,
        'end_time': end_time,
        'text': text,
    }
    # Annotation placeholders, appended in a fixed order after the core fields
    for blank_field in ('jlpt_level', 'grammar', 'vocabulary'):
        record[blank_field] = ''
    return record

In [None]:
# Input format: [{'start_time': float, 'end_time': float, 'text': str}, ...] (times in seconds, as produced by vtt_clean)
def segs_to_sents(data):
    """
    Re-join subtitle segments into complete sentences.

    Subtitle segments are cut by display time, so a sentence may start in one
    segment and finish one or more segments later. Each segment's text is split
    after sentence-final punctuation (。？！ and ASCII ?!), and the pieces are
    accumulated until a piece ending in such punctuation closes the sentence.

    Timing: a sentence gets the start_time of the segment in which its first
    piece appeared and the end_time of the segment in which it was closed, so
    sentences sharing a segment share (part of) that segment's time range.

    Text left over after the last segment (no closing punctuation) is emitted
    as a final record rather than dropped. Note that input without any
    sentence-final punctuation therefore collapses into one single record.

    Args:
        data (list[dict]): Segments with the keys 'start_time', 'end_time'
            and 'text', in playback order.

    Returns:
        list[dict]: Records built by sent_to_data, with sentence_id counting
        up from 0 in output order.
    """
    terminators = "。？！?!"
    # Split right after a terminator, but not between consecutive terminators,
    # so that e.g. "！？" stays attached to a single sentence.
    boundary = re.compile(r'(?<=[。？！?!])(?![。？！?!])')

    result = []
    pending_text = ""     # Accumulator for a sentence that is not yet closed
    pending_start = None  # start_time of the segment where pending_text began
    last_end = None       # end_time of the segment that last contributed text

    for seg in data:
        # re.split yields a trailing '' when the text ends on a boundary; drop it
        pieces = [piece for piece in boundary.split(seg['text']) if piece]

        for piece in pieces:
            if not pending_text:
                pending_start = seg['start_time']
            pending_text += piece
            last_end = seg['end_time']

            if piece[-1] in terminators:
                result.append(
                    sent_to_data(len(result), pending_text, pending_start, last_end)
                )
                pending_text = ""

    # Keep trailing text that never reached a terminator
    if pending_text:
        result.append(sent_to_data(len(result), pending_text, pending_start, last_end))

    return result



In [52]:
# Sanity check: joining a carried-over fragment with the next segment's text
# gives the complete sentence.
incomplete_sent = "ゆの日本語"
test = "ポッドキャストのお時間です。"

incomplete_sent + test

'ゆの日本語ポッドキャストのお時間です。'

In [None]:
# segs_to_sents(data1)

In [None]:
# 0   2.24	4.47	はい、皆さんこんにちは。ゆの日本語
# 1	  4.48	7.07	ポッドキャストのお時間です。皆さん元気
# 2	  7.08	9.83	にしていますでしょうか?今日は久しぶり
# 3	  9.84	14.15	に1時間ポッドキャストにチャレンジし
# 4	 14.16	17.11	たいと思います。テーマはゆについて、あ

In [None]:
# ===== Data Format (EXAMPLE) ===== #
{
  'sentence_id': 0,
  'text': 'はい、皆さんこんにちは。',
  'time_range': (2.24, 4.47),  # For audio playback
  'grammar_patterns': ['ます form', 'greeting'],
  'jlpt_level': 'N5',
  'difficulty': 2,
  'pos_tags': [...],  # From MeCab
  'vocabulary': ['皆さん', 'こんにちは']
}

# # Example
# text = "はい、皆さんこんにちは。ゆの日本語"
# sentences = split_japanese_sentences(text)
# # ['はい、皆さんこんにちは。', 'ゆの日本語']

In [None]:
# Observations
# For podcast and normal videos:
# Sentences get cut off and sometimes end on the next line.
# Easy to tell because periods are present
# We could clean this by searching for segments that don't end in a period,
# then merging the following blocks until a period is hit.
# How do I know if I should merge into the first or second segment?
# Or should I always merge into the first segment?
# Then how should I update the start and end times? I don't think I should.
# But I should remove all the text I merged with the first block

# There are cases where sentences may be complete in 2, 3, or 4 lines
# This is when we start a sentence and now a period isn't seen for
# a couple of lines. In this case we should delete the block and merge
# the times.

# For music videos:
# There are no periods and only occasionally question marks
# It may be a lot more difficult to split up sentences by context
# It may be easier to keep this as is

# Maybe add a feature to the app where a user can select blocks 
# or manually type in their sentence and have the app assign grammar tags
# and allow them to add their sentences to their decks of choice