In [1]:
### IMPORTS ###
import pandas as pd
import numpy as np
import yt_dlp
import json
import os
from pathlib import Path
import re


import functools

In [2]:
# help(yt_dlp.YoutubeDL)
# Detailed explanation of yt-dlp options for your use case
"""
OPTIONS NEEDED FOR YOUR REQUIREMENTS:

1. TITLE - Automatically extracted as info_dict['title']
   
2. AUTHOR - Automatically extracted as info_dict['uploader']
   
3. LINK - The URL you provide to the function
   
4. DURATION - Automatically extracted as info_dict['duration'] (in seconds)

5. SUBTITLES WITH TIMESTAMPS (Japanese & English):
   - writesubtitles: True
     Downloads manually created subtitles
   
   - writeautomaticsub: True  
     Downloads auto-generated subtitles (YouTube's automatic captions)
   
   - subtitleslangs: ['ja', 'en']
     Specifies which subtitle languages to download
     Can use 'all' to download all available languages
   
   - subtitlesformat: 'vtt'  (RECOMMENDED)
     Format for subtitles. Options: 'vtt', 'srt', 'ass', 'lrc'
     VTT (WebVTT) includes timestamps in this format:
         00:00:01.000 --> 00:00:03.000
         Subtitle text here
     
     VTT is best for your use case because:
     - Includes precise timestamps
     - Easy to parse programmatically
     - Well-supported format

6. AUDIO FILE:
   - format: 'bestaudio/best'
     Downloads the best available audio quality
   
   - postprocessors: [{
         'key': 'FFmpegExtractAudio',
         'preferredcodec': 'mp3',      # Output format (mp3, wav, m4a, etc.)
         'preferredquality': '192',    # Audio bitrate (128, 192, 256, 320)
     }]
     Converts audio to MP3 format
   
   With the subtitle timestamps, you can later extract specific 
   audio segments using ffmpeg or pydub

OTHER USEFUL OPTIONS:

- writeinfojson: True
  Saves ALL metadata as a .info.json file including:
  - Full description, tags, categories
  - View count, like count, upload date
  - All available formats and quality info
  - Comments (if getcomments: True)
  
- outtmpl: 'path/%(title)s.%(ext)s'
  Controls output filename. Available variables:
  - %(title)s - Video title
  - %(uploader)s - Channel/uploader name
  - %(id)s - Video ID
  - %(upload_date)s - Upload date (YYYYMMDD)
  - %(duration)s - Duration in seconds
  Example: 'downloads/%(uploader)s/%(title)s_%(id)s.%(ext)s'

- writethumbnail: True
  Downloads the video thumbnail image
  
- quiet: False
  Shows download progress (True hides it)

- getcomments: True (with writeinfojson: True)
  Extracts video comments and saves to JSON
"""

"\nOPTIONS NEEDED FOR YOUR REQUIREMENTS:\n\n1. TITLE - Automatically extracted as info_dict['title']\n\n2. AUTHOR - Automatically extracted as info_dict['uploader']\n\n3. LINK - The URL you provide to the function\n\n4. DURATION - Automatically extracted as info_dict['duration'] (in seconds)\n\n5. SUBTITLES WITH TIMESTAMPS (Japanese & English):\n   - writesubtitles: True\n     Downloads manually created subtitles\n\n   - writeautomaticsub: True  \n     Downloads auto-generated subtitles (YouTube's automatic captions)\n\n   - subtitleslangs: ['ja', 'en']\n     Specifies which subtitle languages to download\n     Can use 'all' to download all available languages\n\n   - subtitlesformat: 'vtt'  (RECOMMENDED)\n     Format for subtitles. Options: 'vtt', 'srt', 'ass', 'lrc'\n     VTT (WebVTT) includes timestamps in this format:\n         00:00:01.000 --> 00:00:03.000\n         Subtitle text here\n\n     VTT is best for your use case because:\n     - Includes precise timestamps\n     - Easy t

In [3]:
def download_media_with_subtitles(url, output_dir='data'):
    """
    Download audio as MP3, subtitles, and metadata from a given URL.
    
    Args:
        url        (str): URL of the media to download
        output_dir (str): Directory to save downloaded files
    
    Returns:
        result: Information about downloaded files
    """
    
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)     
    
    # Configure yt-dlp options
    ydl_opts = {
        # Audio settings
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        
        # Output settings
        'outtmpl': f'{output_dir}/%(title)s.%(ext)s',
        
        # Subtitle settings - KEY FOR YOUR USE CASE
        'writesubtitles': True,              # Manual subs
        'writeautomaticsub': True,           # Auto-generated subs
        'subtitleslangs': ['ja'],            # Japanese (use translator to convert to en)
        'subtitlesformat': 'vtt',            # VTT format (includes timestamps)
        
        # Metadata settings
        'writeinfojson': True,               # Save full metadata as JSON
        'writethumbnail': False,             # Set True if you want thumbnails
        
        # Rate limiting to avoid 429 errors
        'sleep_interval_requests': 2,        # Sleep 2 seconds between API requests
        'sleep_interval_subtitles': 10,      # Sleep 10 seconds between subtitle downloads
        'sleep_interval': 1,                 # Sleep 1 second before each download
        
        # Progress settings
        'quiet': False,
        'no_warnings': False,
    }
    
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Extract info without downloading first to check what's available
            info = ydl.extract_info(url, download=False)

            # Display what's available
            print(f"\nTitle: {info.get('title')}")
            print(f"Author: {info.get('uploader', 'N/A')}")
            print(f"Duration: {info.get('duration', 'N/A')} seconds")
            print(f"URL: {url}\n")
            
            # Check subtitles
            manual_subs = list(info.get('subtitles', {}).keys())
            auto_subs = list(info.get('automatic_captions', {}).keys())
            print(f"Manual subtitles: {manual_subs if manual_subs else 'None'}")
            print(f"Auto-captions available for: {', '.join(auto_subs[:5])}... (and {len(auto_subs)-5} more)" if len(auto_subs) > 5 else f"Auto-captions: {auto_subs}\n")
            
            # Download everything
            print("\n===== Downloading audio, subtitles, and metadata... =====")
            print("(Adding delays between requests to avoid rate limiting...)\n")     

            ydl.download([url])
            
            result = {
                'title': info.get('title'),
                'duration': info.get('duration'),
                'uploader': info.get('uploader'),
                'upload_date': info.get('upload_date'),
                'view_count': info.get('view_count'),
                'like_count': info.get('like_count'),
                'description': info.get('description'),
                'subtitles_available': list(info.get('subtitles', {}).keys()),
                'auto_captions_available': list(info.get('automatic_captions', {}).keys()),
                'output_dir': output_dir
            }
            
            print("\nDownload complete!\n")
            return result
    except Exception as e:
        print(f"Error: {str(e)}\n")
        return None


In [58]:
# Test the function with a URL
# Replace with your desired URL
# url = "https://www.youtube.com/watch?v=1PPLx2N_YTE" # YUYU podcast test, auto gen subs
# url = "https://www.youtube.com/watch?v=hkeegM5t-lw" # Jiro podcast test, manual subs
# url = "https://open.spotify.com/track/7DyS11kB1YNrfDzQqtjmTh?si=24a7621a3a24483c" # spotify song
# url = "https://www.youtube.com/watch?v=1mz-A--mANU"
url = "https://www.youtube.com/watch?v=fgZxDPUmoL8"

# Download audio, subtitles, and metadata
result = download_media_with_subtitles(url, output_dir='japanese_media')

# Display the metadata
if result:
    #print("\\n\" + \"=\"*50")
    print("DOWNLOADED MEDIA INFORMATION\n")
    #print("=\"*50")
    print(json.dumps(result, indent=2, ensure_ascii=False))

[youtube] Extracting URL: https://www.youtube.com/watch?v=fgZxDPUmoL8
[youtube] fgZxDPUmoL8: Downloading webpage




[youtube] Sleeping 2 seconds ...
[youtube] fgZxDPUmoL8: Downloading android vr player API JSON
[youtube] Sleeping 2 seconds ...
[youtube] fgZxDPUmoL8: Downloading ios downgraded player API JSON
[youtube] Sleeping 2 seconds ...
[youtube] fgZxDPUmoL8: Downloading m3u8 information
[info] fgZxDPUmoL8: Downloading subtitles: ja

Title: カレーに100種類の食べ物を隠し味にブチ込んで食ってみた
Author: はじめしゃちょー（hajime）
Duration: 665 seconds
URL: https://www.youtube.com/watch?v=fgZxDPUmoL8

Manual subtitles: None
Auto-captions available for: ab, aa, af, ak, sq... (and 152 more)

===== Downloading audio, subtitles, and metadata... =====
(Adding delays between requests to avoid rate limiting...)

[youtube] Extracting URL: https://www.youtube.com/watch?v=fgZxDPUmoL8
[youtube] Sleeping 2 seconds ...
[youtube] fgZxDPUmoL8: Downloading webpage




[youtube] Sleeping 2 seconds ...
[youtube] fgZxDPUmoL8: Downloading android vr player API JSON
[youtube] Sleeping 2 seconds ...
[youtube] fgZxDPUmoL8: Downloading ios downgraded player API JSON
[youtube] Sleeping 2 seconds ...
[youtube] fgZxDPUmoL8: Downloading m3u8 information
[info] fgZxDPUmoL8: Downloading subtitles: ja
[info] fgZxDPUmoL8: Downloading 1 format(s): 251
[info] Writing video subtitles to: japanese_media/カレーに100種類の食べ物を隠し味にブチ込んで食ってみた.ja.vtt
[download] Sleeping 10.00 seconds ...




[download] Destination: japanese_media/カレーに100種類の食べ物を隠し味にブチ込んで食ってみた.ja.vtt
[download] 100% of  110.51KiB in 00:00:00 at 300.62KiB/s
[info] Writing video metadata as JSON to: japanese_media/カレーに100種類の食べ物を隠し味にブチ込んで食ってみた.info.json
[download] Sleeping 1.00 seconds ...
[download] Destination: japanese_media/カレーに100種類の食べ物を隠し味にブチ込んで食ってみた.webm
[download] 100% of    9.60MiB in 00:00:03 at 3.10MiB/s   
[ExtractAudio] Destination: japanese_media/カレーに100種類の食べ物を隠し味にブチ込んで食ってみた.mp3
Deleting original file japanese_media/カレーに100種類の食べ物を隠し味にブチ込んで食ってみた.webm (pass -k to keep)

Download complete!

DOWNLOADED MEDIA INFORMATION

{
  "title": "カレーに100種類の食べ物を隠し味にブチ込んで食ってみた",
  "duration": 665,
  "uploader": "はじめしゃちょー（hajime）",
  "upload_date": "20200522",
  "view_count": 6995987,
  "like_count": 83778,
  "description": "☆おすすめ動画☆\n・かたっぱしから青汁を混ぜてスーパー栄養補給を狙う\n　https://youtu.be/jR44vOqbnSI\n・【実験】百味ビーンズ全部混ぜて１つにしたら何味？\n　https://youtu.be/1iwVHvRUcrQ\n\n\nチャンネル登録よろしくおねがいします !\nMy name is Hajime! \n\nファンサイトが出来ました！！！\n▽「

In [29]:
### CREATING THE vtt PARSER ###
import re
from pathlib import Path

In [28]:
def parse_timeframe(timeframe):
    # Remove any "-->" or align:start position:0% lines
    timeframe = re.sub(r"align.*", "", timeframe)
    timeframe = re.sub(r" --> ", ":", timeframe)
    timeframe = timeframe.split(":")
    s_hour = int(timeframe[0])
    s_min  = int(timeframe[1])
    s_sec  = float(timeframe[2])
    e_hour = int(timeframe[3])
    e_min  = int(timeframe[4])
    e_sec  = float(timeframe[5])

    start_time = round(((3600 * s_hour) + (60 * s_min) + s_sec), 2)
    end_time   = round(((3600 * e_hour) + (60 * e_min) + e_sec), 2)
    #print(timeframe)
    #print(F"start_time: {start_time}\nend_time: {end_time}")
    return start_time, end_time

In [26]:
# 1.) vtt cleaner

# Remove empty lines
# Remove WEBVTT, Kind: captions, Language: ja lines (always first 3)
# Remove all lines with "<c>" or "</c>" since these are bad format and are repeatted in 
# next line anyways

# Output to (cleaned)+path
def vtt_clean(input_file):
    input_path = Path(input_file)

    if not input_path.exists():
        raise FileNotFoundError(F"File not found: {input_path}")

    # Read full file into content buffer
    content = input_path.read_text(encoding='utf-8')

    # Process
    re.sub(r"\n\n\n|\n\n", "\n", content) # Remove extra line breaks
    content = '\n'.join( # Remove all lines that have tags (they are repeats)
        line for line in content.split('\n') 
        if '<c>' not in line 
        and '</c>' not in line
        and 'WEBVTT' not in line
        and 'Kind: captions' not in line
        and 'Language: ja' not in line
    )

    content = content.split("\n") # Create array of all the lines with content
    content = [item for item in content if item not in ('', ' ')]

    cur_sent = ""
    c_len = len(content) - 1
    data = []
    for i in range(0, c_len):
        if "-->" not in content[i]:
            continue

        if "-->" in content[i+1]:
            continue

        start_time, end_time = parse_timeframe(content[i])
        if (end_time - start_time) < .5:
            continue
        else:
            cur_sent = content[i+1]
            data.append({
                'start_time': start_time,
                'end_time': end_time,
                'text': content[i+1]
            })
    return data

In [30]:
s_test, e_test = parse_timeframe("00:00:02.230 --> 00:00:02.240 align:start position:0%")
e_test

2.24

In [31]:
data1 = vtt_clean("/home/evanxavierhu/Documents/VSCode_Local_Projects/JL_Project/Japanese_Lang_Project/data_exploration/japanese_media/【1hour Podcast】本名は？結婚してる？苦手なことは？YUYUについて話しました！ (Japanese Radio for Listening practice).ja.vtt")
data2 = vtt_clean("/home/evanxavierhu/Documents/VSCode_Local_Projects/JL_Project/Japanese_Lang_Project/data_exploration/japanese_media/【Japanese Podcast】I Got This! - Master 706 Essential Words Without Even Noticing.ja.vtt")
data3 = vtt_clean("/home/evanxavierhu/Documents/VSCode_Local_Projects/JL_Project/Japanese_Lang_Project/data_exploration/japanese_media/平井 大 ⧸ Slow & Easy（Music Video）.ja.vtt")
data4 = vtt_clean("/home/evanxavierhu/Documents/VSCode_Local_Projects/JL_Project/Japanese_Lang_Project/data_exploration/japanese_media/カレーに100種類の食べ物を隠し味にブチ込んで食ってみた.ja.vtt")

In [32]:
data4

[{'start_time': 9.15,
  'end_time': 14.3,
  'text': 'あー本日は純社長です皆さんカーネロライフ作ったことありますでしょうか'},
 {'start_time': 14.31,
  'end_time': 18.14,
  'text': '僕はないですべ取るとか8かを作ったことなんですけどもちゃんとあの硬いあのこれ'},
 {'start_time': 18.15,
  'end_time': 20.07,
  'text': 'このタイプのやつをまずどうやって作るかされ文字通りよくわかんないですよねただ'},
 {'start_time': 20.08, 'end_time': 23.34, 'text': 'レトルトカレーじゃない華麗て'},
 {'start_time': 23.35,
  'end_time': 27.93,
  'text': 'ないとかなんか人参とか玉ねぎとか入ってないんですよ作ったことないからわかんない'},
 {'start_time': 27.94,
  'end_time': 31.98,
  'text': 'けど俺ねこれくらいのクッキーみたいですねって言えば彼がし作っていくわけなんだ'},
 {'start_time': 31.99,
  'end_time': 35.49,
  'text': 'けどもねえって何か無限だと思うんですよ具材各節いろいろあるじゃないですか基本的'},
 {'start_time': 35.5,
  'end_time': 39.1,
  'text': 'には肉たまねぎじゃがいもにんじんは入れると思うんですけどもそれだけじゃなくて'},
 {'start_time': 39.11,
  'end_time': 43.48,
  'text': 'らっしゃらないかでとまぁこいつと言ったらトッピングあるじゃんつまり'},
 {'start_time': 43.49,
  'end_time': 47.44,
  'text': 'なんでもいいんですあれの中身ってあるでコンペたくさんの拍手や具材準備しました'},
 {'start_time': 47.45,
  'end_time': 50.92,
  'text': 'なんだそれが思いますカレー月

In [33]:
data1_df = pd.DataFrame(data1)
data2_df = pd.DataFrame(data2)
data3_df = pd.DataFrame(data3)
data4_df = pd.DataFrame(data4)

In [34]:
data4_df

Unnamed: 0,start_time,end_time,text
0,9.15,14.30,あー本日は純社長です皆さんカーネロライフ作ったことありますでしょうか
1,14.31,18.14,僕はないですべ取るとか8かを作ったことなんですけどもちゃんとあの硬いあのこれ
2,18.15,20.07,このタイプのやつをまずどうやって作るかされ文字通りよくわかんないですよねただ
3,20.08,23.34,レトルトカレーじゃない華麗て
4,23.35,27.93,ないとかなんか人参とか玉ねぎとか入ってないんですよ作ったことないからわかんない
...,...,...,...
161,644.24,650.05,がめちゃくちゃ何か甘いん
162,650.06,650.92,あーんだろくれないかじゃないでね全然食えないことはないですけど色々あってる時で
163,650.93,655.03,よくわかんないや
164,655.04,657.22,全然まずいとかはないんですけどもまぁあんまりお勧めしないですね普通に食べた方が


In [35]:
# Map to data helper function
def sent_to_data(sent_id, text, start_time, end_time):
    return {
        'sentence_id': sent_id,
        'start_time': start_time,
        'end_time': end_time,
        'text': text,
        'jlpt_level': '',
        'grammar': '',
        'vocabulary': ''
    }

In [36]:
# Input is in [{'start_time': int, 'end_time': int, 'text': 'string'}, ...]
def segs_to_sents(data):
    result = [] # New resulting list
    index = 0       # Sequential index for the list based on each append
    
    tot_segs = len(data)
    start = 0 # Start of the sliding window
    end   = 0 # End of the sliding window

    incomplete_sent = "" # Sentence accumulator for incomplete sentences
    while end < tot_segs:
        # Get current segment and split based on ("。", "？", "！")
        sent_split = re.split(r'(?<=[。？！!.?])', data[end]['text'])
        sent_split = [s for s in sent_split if s != ""]
        
        ##### Cases 1,2,3 The block contains exactly one sentence #####
        # Case 1: Block is a complete sentence and there is no incomplete sent
        # EX: sent_split = ["はい、皆さんこんにちは。"]
        if(incomplete_sent == "" and len(sent_split) == 1 and re.search(r'[。？！.?!]', sent_split[0])):
            result.append(sent_to_data(index, sent_split, data[end]['start_time'], data[end]['end_time']))
            index = index + 1
            start = start + 1
            end   = end + 1
            continue
        
        if(incomplete_sent != "" and len(sent_split) == 1):
        # Case 2: Block splits into 1 block WITH a punctuation
        # EX:  sent_split = ['ポッドキャストのお時間です。']
        # incomplete_sent = 'ゆの日本語'
            if re.search(r'[。？！.?!]', sent_split[0]):
                sent = incomplete_sent + sent_split[0]
                result.append(sent_to_data(index, sent, data[start]['start_time'], data[end]['end_time']))

                incomplete_sent = ""
                index = index + 1
                start = end + 1
                end   = end + 1
                continue

        # Case 3: Block splits into 1 block WITHOUT a punctuation
        # EX:  sent_split = ['に1時間ポッドキャストにチャレンジし'] (next seg: "たいと思います。")
        # incomplete_sent = '今日は久しぶり'
            if not re.search(r'[。？！.?!]', sent_split[0]):
                incomplete_sent = incomplete_sent + sent_split[0]
                end = end + 1
                continue

        ##### From here on we know the length of the sentence_split is >1 #####

        if(incomplete_sent == ""):
        # Case 4: Block splits into multiple blocks. First sentence is complete. 
        # Middle blocks are always complete. Last is incomplete or complete
        # EX: sent_split = ['はい、皆さんこんにちは。', 'ゆの日本語']
        # EX: sent_split = ['はい、皆さんこんにちは。', 'ゆの日本語ポッドキャストのお時間です。']
        # EX: sent_split = ['僕の事を思っている。', 'はい。', 'ま、ただね、']
        # Add the first sentence
            first_sent = sent_split[0]
            result.append(sent_to_data(index, first_sent, data[end]['start_time'], data[end]['end_time']))
            index = index + 1
            start = end # Move the start of the window to our current block
        else:
        # Case ５: Block splits into multiple blocks. First sentence is incomplete. 
        # Middle blocks are always complete. Last is incomplete or complete
        # EX: sent_split = ['んこんにちは。', 'ゆの日本語']
        # EX: sent_split = ['んこんにちは。', 'ゆの日本語ポッドキャストのお時間です。']
        # EX: sent_split = ['を思っている。', 'はい。', 'ま、ただね、']
            first_sent = incomplete_sent + sent_split[0]
            result.append(sent_to_data(index, first_sent, data[start]['start_time'], data[end]['end_time']))
            index = index + 1
            start = end
            incomplete_sent = ""

        # Now work on the other blocks
        for i in range(1, len(sent_split)): # Exclude the first block
            block = sent_split[i]
            if i == len(sent_split) - 1: # We are at the last block (all mid's will be sentences)  
                if re.search(r'[。？！.?!]', block): # block is a sentence
                    result.append(sent_to_data(index, block, data[end]['start_time'], data[end]['end_time']))
                    index = index + 1
                    incomplete_sent = ""
                    start = start + 1
                    end = end + 1
                    continue
                else:
                    incomplete_sent = incomplete_sent + block
                    end = end + 1
                    continue
            else: # not at the last block
                result.append(sent_to_data(index, block, data[end]['start_time'], data[end]['end_time']))
                index = index + 1
                incomplete_sent = ""
                
    return result


In [None]:
def segs_to_sents_v2(data):
    result     = []
    num_segs   = len(data)
    start_time = data[0]['start_time']
    end_time   = 0
    index      = 0 # The current index (only updates once per loop)
    sent_index = 0 # The sentence index (updates per each insert)
    acc_sent   = ""

    while(index < num_segs):
        print(F"Current Index: {index}")
        # Set the end time to the current segments end time. 
        # Start time only changes once a sentence is completed or we end the loop with no acc_sent (the next seg starts a sent)
        end_time = data[index]['end_time']

        # Split the current segment
        sent_blocks = re.split(r'(?<=[。？！!.?])', data[index]['text'])
        sent_blocks = [s for s in sent_blocks if s != ""]

        # print(sent_blocks)

        for block in sent_blocks:
            print(F"acc_sent before: \"{acc_sent}\"")
            acc_sent = acc_sent + block
            print(F"acc_sent after: \"{acc_sent}\"")

            # Case 1: Block completed a sentence
            if re.search(r'[。？！!.?]$', acc_sent):
                result.append(sent_to_data(sent_index, acc_sent, start_time, end_time))
                start_time = data[index]['start_time']
                acc_sent = ""
                # Forgot to do sent_index++
            # Case 2: Block did not complete a sentence
            # else:
                # Do nothing
        
        index = index + 1
        # print()
        if acc_sent == "" and index < num_segs: # < num_segs handles edge case for last sent being complete
            start_time = data[index]['start_time']
    return result

In [1]:
ex_data = [{'start_time': 2.24, 'end_time': 4.47, 'text': 'はい、さん皆こんにちはゆの日本語'},
 {'start_time': 4.48, 'end_time': 7.07, 'text': 'ポッドキャストのお時間です皆さん元気'},
 {'start_time': 7.08, 'end_time': 9.83, 'text': 'にしていますでしょうか?今日は久しぶり'},
 {'start_time': 9.84, 'end_time': 14.15, 'text': 'に1時間ポッドキャストにチャレンジし'},
 {'start_time': 14.16, 'end_time': 17.11, 'text': 'たいと思いますテーマはゆについて、あ'},
 {'start_time': 17.12, 'end_time': 19.95, 'text': '、ま、つまり僕についてですねま.'}]

In [2]:
test = "はい。僕はいつも嬉しいだよ！"
sent_blocks = re.split(r'(?<=[。？！!.?])', test)
sent_blocks = [s for s in sent_blocks if s != ""]
sent_blocks

NameError: name 're' is not defined

In [79]:
segs_to_sents_v2(ex_data)

Current Index: 0
acc_sent before: ""
acc_sent after: "はい、さん皆こんにちはゆの日本語"
Current Index: 1
acc_sent before: "はい、さん皆こんにちはゆの日本語"
acc_sent after: "はい、さん皆こんにちはゆの日本語ポッドキャストのお時間です皆さん元気"
Current Index: 2
acc_sent before: "はい、さん皆こんにちはゆの日本語ポッドキャストのお時間です皆さん元気"
acc_sent after: "はい、さん皆こんにちはゆの日本語ポッドキャストのお時間です皆さん元気にしていますでしょうか?"
acc_sent before: ""
acc_sent after: "今日は久しぶり"
Current Index: 3
acc_sent before: "今日は久しぶり"
acc_sent after: "今日は久しぶりに1時間ポッドキャストにチャレンジし"
Current Index: 4
acc_sent before: "今日は久しぶりに1時間ポッドキャストにチャレンジし"
acc_sent after: "今日は久しぶりに1時間ポッドキャストにチャレンジしたいと思いますテーマはゆについて、あ"
Current Index: 5
acc_sent before: "今日は久しぶりに1時間ポッドキャストにチャレンジしたいと思いますテーマはゆについて、あ"
acc_sent after: "今日は久しぶりに1時間ポッドキャストにチャレンジしたいと思いますテーマはゆについて、あ、ま、つまり僕についてですねま."


[{'sentence_id': 0,
  'start_time': 2.24,
  'end_time': 9.83,
  'text': 'はい、さん皆こんにちはゆの日本語ポッドキャストのお時間です皆さん元気にしていますでしょうか?',
  'jlpt_level': '',
  'grammar': '',
  'vocabulary': ''},
 {'sentence_id': 0,
  'start_time': 7.08,
  'end_time': 19.95,
  'text': '今日は久しぶりに1時間ポッドキャストにチャレンジしたいと思いますテーマはゆについて、あ、ま、つまり僕についてですねま.',
  'jlpt_level': '',
  'grammar': '',
  'vocabulary': ''}]

In [None]:
# ===== Data Format (EXAMPLE) ===== #
{
  'sentence_id': 1,
  'start_time': 4.48,
  'end_time': 9.83,
  'text': '皆さん元気にしていますでしょうか?',
  'jlpt_level': '',
  'grammar': '',
  'vocabulary': ''
}

# # Example
# text = "はい、皆さんこんにちは。ゆの日本語"
# sentences = split_japanese_sentences(text)
# # ['はい、皆さんこんにちは。', 'ゆの日本語']

In [None]:
# Observations
# For podcast and normal videos:
# Sentences get cutoff and sometimes end on the next line.
# Easy to tell because periods are present
# We could clean this by searching for segments that dont end in a period
# then merge the next block until a period is hit.
# How do I know if I should merge into the first or second segment?
# Or should I always merge into the first segment?
# Then how should I update the start and end times? I don't think I should.
# But I should remove all the text I merged with the first block

# There are cases where sentences may be complete in 2, 3, or 4 lines
# This is when we start a sentence and now a period isn't seen for
# a couple of lines. In this case we should delete the block and merge
# the times.

# For music videos:
# There are no periods and only occasionally question marks
# It may be a lot more difficult to split up sentences by context
# It may be easier to keep this as is

# Maybe add a feature to the app where a user can select blocks 
# or manually type in their sentence and have the app assign grammar tags
# and allow them to add their sentences to their decks of choice

In [None]:
# ===== Web Scraping / Building Grammar Dataset ===== #

In [9]:
#!pip install requests beautifulsoup4

In [3]:
from dataclasses import dataclass, asdict
from typing import List, Optional
import json
import requests

In [4]:
@dataclass
class GrammarPoint:
    grammar: str
    meaning: str
    jlpt_level: str
    example_jp: Optional[str] = None
    example_en: Optional[str] = None

In [5]:
def fetch_html(url: str) -> str:
    """Fetch HTML content from URL"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    
    print(f"Fetching {url}...")
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    response.encoding = 'utf-8'
    
    print(f"Successfully fetched {len(response.text)} characters")
    return response.text

In [6]:
test_html = fetch_html("https://jlptgrammarlist.neocities.org/")

Fetching https://jlptgrammarlist.neocities.org/...
Successfully fetched 146527 characters


In [16]:
def parse_grammar_from_html(html: str) -> List[GrammarPoint]:
    """Parse grammar points directly from the HTML DOM structure.
    
    The page has: <div class="grammar-list n5/n4/n3/n2/n1">
      containing <div class="item"> elements, each with:
        - <span class="term">   -> grammar point
        - loose text node       -> English meaning
        - <div class="japanese-sentence"> -> example sentence (JP)
        - <div class="english-meaning">   -> example sentence (EN)
    """
    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup(html, 'html.parser')
    grammar_points = []
    
    level_map = {'n5': 'N5', 'n4': 'N4', 'n3': 'N3', 'n2': 'N2', 'n1': 'N1'}
    
    for level_class, level_name in level_map.items():
        level_div = soup.find('div', class_=level_class)
        if not level_div:
            continue
        
        for item in level_div.find_all('div', class_='item'):
            # Extract grammar term
            term_span = item.find('span', class_='term')
            if not term_span:
                continue
            grammar = term_span.get_text(strip=True)
            if not grammar:
                continue
            
            # Extract English meaning (loose text between <span class="common"> and <div>)
            # Get all direct text nodes that aren't inside child elements
            meaning_parts = []
            for child in item.children:
                if isinstance(child, str) and child.strip():
                    meaning_parts.append(child.strip())
            meaning = ' '.join(meaning_parts)
            
            if not meaning:
                continue
            
            # Extract example Japanese sentence
            jp_div = item.find('div', class_='japanese-sentence')
            example_jp = jp_div.get_text(strip=True) if jp_div else None
            
            # Extract example English meaning
            en_div = item.find('div', class_='english-meaning')
            example_en = en_div.get_text(strip=True) if en_div else None
            
            grammar_points.append(GrammarPoint(
                grammar=grammar,
                meaning=meaning,
                jlpt_level=level_name,
                example_jp=example_jp,
                example_en=example_en
            ))
    
    return grammar_points

In [None]:
# No longer need extract_text_content — parse directly from HTML
# test_content = extract_text_content(test_html)

In [None]:
# test_content

In [None]:
# Old parse_grammar_from_text is replaced by parse_grammar_from_html above

In [17]:
parsed_grammar = parse_grammar_from_html(test_html)

In [19]:
#parsed_grammar

In [None]:
# ===== Language Model Multi-Classification ===== #

In [None]:
#!pip install protobuf
#!pip install unidic-lite
#!pip install fugashi

In [21]:
# ===== Required Imports ===== #
from transformers import (
    pipeline,
    AutoTokenizer, 
    AutoModelForPreTraining,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# A Japanese-capable zero-shot classifier

# Labels dataset:
grammar_labels = [gp.grammar for gp in parsed_grammar]

# https://huggingface.co/akiFQC/bert-base-japanese-v3_nli-jsnli
classifier = pipeline("zero-shot-classification", model='akiFQC/bert-base-japanese-v3_nli-jsnli')

# sent = "Appleは先程、iPhoneの最新機種について発表しました。"
example_sents
# candidate_labels = ["技術", "スポーツ", "政治"]
candidate_labels = grammar_labels
res = classifier(sent, candidate_labels, multi_label=True)
res


Loading weights: 100%|██████████| 201/201 [00:00<00:00, 465.43it/s, Materializing param=classifier.weight]                                      


{'sequence': 'Appleは先程、iPhoneの最新機種について発表しました。',
 'labels': ['わけではない',
  'ないものでもない',
  'に足る / に足りる',
  'というわけではない',
  'いくら○○ても / いくら○○でも',
  'か何か',
  'ものがある',
  'いかんでは / いかんによっては',
  'お○○願う',
  'なくてはいけない / なくてはならない',
  'というか○○というか',
  'なくはない',
  'ないとも限らない',
  'つ○○つ',
  '(よ)うか○○まいか',
  'なければいけない / なければならない',
  'ことは○○が',
  'かと思ったら / かと思うと',
  'た結果 / の結果',
  '(よ)うが○○まいが / (よ)うと○○まいと',
  'てよかった',
  'あらかじめ',
  '極まる / 極まりない',
  'やすい',
  'ないではおかない / ずにはおかない',
  '直ちに',
  'ないではすまない / ずにはすまない',
  'やら○○やら',
  'てしょうがない',
  '以上2',
  'といい○○といい',
  'なり○○なり',
  '始める',
  'たり○○たり',
  '要するに',
  'お○○になる',
  '以上1',
  'かたがた',
  'ふと1',
  'すなわち',
  'であれ○○であれ',
  'ともなると / ともなれば',
  '続ける',
  'ぐるみ',
  'がる',
  'つもり',
  'とか○○とか',
  'お○○ください',
  '極めて',
  'て以来',
  'ことごとく',
  'かけ',
  'たとえ○○ても',
  'た末 / の末',
  '抜く',
  'おそらく',
  'なる',
  'つまり',
  'のみならず',
  'いよいよ',
  'がたい',
  'かれ○○かれ',
  'あるいは',
  'たとえば',
  'やがて',
  'ことができる',
  'られる1',
  'ものを',
  'より',
  'がち',
  'ようがない / ようもない',
  'とりわけ',
  'より○○のほうが○○',
  'のように / のような',

In [None]:
# Few-shot multi-label classifier
