# Generate Transcript from Audio

This notebook takes an audio file, converts it to WAV, splits it into clips, and transcribes each clip with Whisper.cpp. It then cleans the resulting transcript and exports it as SRT subtitles. All settings are configured through interactive widgets.

## Step 1: Setup and Configuration

Configure the input file, output directory, and other settings using interactive widgets.

In [None]:
import os
import ipywidgets as widgets
from IPython.display import display

# Default configuration values. All paths are relative, so they resolve against
# the kernel's current working directory; override them in the widgets below
# rather than editing machine-specific paths into this cell.
DEFAULT_INPUT_FILE = "../data/demo/demo.wav"
DEFAULT_OUTPUT_DIR = "../data/output_clips"
DEFAULT_CLIP_DURATION_SEC = 10  # in seconds
DEFAULT_WHISPER_EXEC = "../whisper.cpp/build/bin/whisper-cli"
DEFAULT_WHISPER_MODEL = "../whisper.cpp/models/ggml-medium.bin"
DEFAULT_LANGUAGE = "zh"
DEFAULT_TRANSCRIPT_FILENAME = "transcription.txt"
DEFAULT_WORKERS = 3

# Presentation settings shared by every configuration widget.
WIDGET_LAYOUT = {'width': '500px'}
# The default label width truncates longer descriptions such as
# 'Clip Duration (sec):'; 'initial' lets each label take the room it needs.
WIDGET_STYLE = {'description_width': 'initial'}

# Widgets for configuration
input_file_widget = widgets.Text(
    value=DEFAULT_INPUT_FILE,
    placeholder='Enter input audio file path',
    description='Input File:',
    layout=WIDGET_LAYOUT,
    style=WIDGET_STYLE
)

output_dir_widget = widgets.Text(
    value=DEFAULT_OUTPUT_DIR,
    placeholder='Enter output directory',
    description='Output Dir:',
    layout=WIDGET_LAYOUT,
    style=WIDGET_STYLE
)

# Clip length in seconds, selectable from 10 s up to 30 min in 5 s steps.
clip_duration_widget = widgets.IntSlider(
    value=DEFAULT_CLIP_DURATION_SEC,
    min=10,
    max=1800,
    step=5,
    description='Clip Duration (sec):',
    layout=WIDGET_LAYOUT,
    style=WIDGET_STYLE
)

whisper_exec_widget = widgets.Text(
    value=DEFAULT_WHISPER_EXEC,
    placeholder='Enter Whisper.cpp executable path',
    description='Whisper Exec:',
    layout=WIDGET_LAYOUT,
    style=WIDGET_STYLE
)

whisper_model_widget = widgets.Text(
    value=DEFAULT_WHISPER_MODEL,
    placeholder='Enter Whisper model path',
    description='Whisper Model:',
    layout=WIDGET_LAYOUT,
    style=WIDGET_STYLE
)

# Options are (label shown to the user, language code passed on) pairs.
language_widget = widgets.Dropdown(
    options=[('Chinese (zh)', 'zh'), ('English (en)', 'en')],
    value=DEFAULT_LANGUAGE,
    description='Language:',
    layout=WIDGET_LAYOUT,
    style=WIDGET_STYLE
)

# Left empty on purpose: the processing cell falls back to
# DEFAULT_TRANSCRIPT_FILENAME when nothing is entered here.
transcript_filename_widget = widgets.Text(
    value='',
    placeholder=f'Enter transcript filename (default: {DEFAULT_TRANSCRIPT_FILENAME})',
    description='Transcript File:',
    layout=WIDGET_LAYOUT,
    style=WIDGET_STYLE
)

workers_widget = widgets.IntSlider(
    value=DEFAULT_WORKERS,
    min=1,
    max=8,
    step=1,
    description='Workers:',
    layout=WIDGET_LAYOUT,
    style=WIDGET_STYLE
)

# Display widgets (one call renders them in the order given)
display(
    input_file_widget,
    output_dir_widget,
    clip_duration_widget,
    whisper_exec_widget,
    whisper_model_widget,
    language_widget,
    transcript_filename_widget,
    workers_widget,
)

Text(value='../data/demo/demo.wav', description='Input File:', layout=Layout(width='500px'), placeholder='Ente…

Text(value='../data/output_clips', description='Output Dir:', layout=Layout(width='500px'), placeholder='Enter…

IntSlider(value=10, description='Clip Duration (sec):', layout=Layout(width='500px'), max=1800, min=10, step=5…

Text(value='../whisper.cpp/build/bin/whisper-cli', description='Whisper Exec:', layout=Layout(width='500px'), …

Text(value='../whisper.cpp/models/ggml-medium.bin', description='Whisper Model:', layout=Layout(width='500px')…

Dropdown(description='Language:', layout=Layout(width='500px'), options=(('Chinese (zh)', 'zh'), ('English (en…

Text(value='', description='Transcript File:', layout=Layout(width='500px'), placeholder='Enter transcript fil…

IntSlider(value=4, description='Workers:', layout=Layout(width='500px'), max=8, min=1)

## Step 2: Import Audio Processing Functions

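Import the audio-processing functions from `voice2transcripts.py` and the transcript-cleaning functions from `time_stamp_cleaner.py`. Both modules are loaded from the `../scripts` directory.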

In [2]:
import os
import sys

# Add the scripts directory to the path so we can import the functions.
# The directory is resolved from the current working directory, so the result
# depends on where the kernel was started.
SCRIPTS_DIR = os.path.abspath(os.path.join(os.getcwd(), '../scripts'))
# Guard the append: re-running this cell would otherwise add the same entry to
# sys.path again on every execution.
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)
from voice2transcripts import clear_output_folder, convert_to_wav, split_audio, transcribe_audio
from time_stamp_cleaner import clean_transcription, save_cleaned_transcription

## Step 3: Process Audio and Generate Transcript

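Run the processing pipeline: convert the input to WAV, split it into clips, and transcribe each clip. The output directory is cleared before processing starts. Cleaning the transcript happens separately in Step 4.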

In [3]:
# Run the convert -> split -> transcribe pipeline with the current widget values.
# The names assigned here (output_dir, transcript_filename, ...) are read again
# by later cells, so they are deliberately left as notebook-level variables.
try:
    # Get values from widgets
    input_file = input_file_widget.value
    output_dir = output_dir_widget.value
    clip_duration_sec = clip_duration_widget.value  # Duration in seconds
    whisper_exec = whisper_exec_widget.value
    whisper_model = whisper_model_widget.value
    language = language_widget.value
    # Use the stripped text itself, not just for the emptiness check, so stray
    # leading/trailing whitespace never ends up in the filename on disk.
    transcript_filename = transcript_filename_widget.value.strip() or DEFAULT_TRANSCRIPT_FILENAME
    workers = workers_widget.value

    # The input must be an existing regular file; a directory would pass a
    # plain existence check and only fail later inside the conversion step.
    if not os.path.isfile(input_file):
        raise FileNotFoundError(f"‚ùå Êâæ‰∏çÂà∞Ëº∏ÂÖ•Èü≥Ë®äÊ™îÊ°àÔºö{input_file}")

    # Clear old files
    clear_output_folder(output_dir)

    # Execute the processing pipeline
    print("üöÄ ÈñãÂßãÈü≥Ë®äËôïÁêÜËàáËΩâÈåÑÊµÅÁ®ã...")
    wav_file = convert_to_wav(input_file, output_dir)
    clip_files = split_audio(wav_file, clip_duration_sec, output_dir)
    transcribe_audio(clip_files, output_dir, whisper_exec, whisper_model, language, transcript_filename=transcript_filename, workers=workers)
    print(f"üéâ ËΩâÈåÑËôïÁêÜÂÆåÊàêÔºÅËΩâÈåÑÁµêÊûúÂ∑≤ÂÑ≤Â≠òËá≥ {os.path.join(output_dir, '../transcripts/' + transcript_filename)}")
except Exception as e:
    print(f"‚ùå ËôïÁêÜÈÅéÁ®ã‰∏≠ÁôºÁîüÈåØË™§Ôºö{e}")
    # Re-raise so the failure is visible (with traceback) and "Run All" stops
    # here instead of letting later cells process a transcript left over from
    # an earlier run.
    raise

üóëÔ∏è Ëº∏Âá∫Ë≥áÊñôÂ§æÂ∑≤Ê∏ÖÈô§ÔºÅ
üöÄ ÈñãÂßãÈü≥Ë®äËôïÁêÜËàáËΩâÈåÑÊµÅÁ®ã...
üîÑ Èü≥Ë®äÂ∑≤ËΩâÊèõÁÇ∫ WAV Ê†ºÂºèÔºö../data/output_clips/converted.wav
‚úÖ Èü≥Ë®äÂàáÂâ≤ÂÆåÊàêÔºåÂÖ± 12 ÂÄãÁâáÊÆµÔºåÊôÇÈñìÊà≥Â∑≤ÂÑ≤Â≠ò
üé§ ËΩâÈåÑÁâáÊÆµ 1/12: clip_001.wav ...
üé§ ËΩâÈåÑÁâáÊÆµ 2/12: clip_002.wav ...
üé§ ËΩâÈåÑÁâáÊÆµ 3/12: clip_003.wav ...
üé§ ËΩâÈåÑÁâáÊÆµ 4/12: clip_004.wav ...
‚ö†Ô∏è Whisper.cpp Ë®äÊÅØ: whisper_init_from_file_with_params_no_state: loading model from '../whisper.cpp/models/ggml-medium.bin'
whisper_init_with_params_no_state: use gpu    = 1
whisper_init_with_params_no_state: flash attn = 0
whisper_init_with_params_no_state: gpu_device = 0
whisper_init_with_params_no_state: dtw        = 0
whisper_init_with_params_no_state: devices    = 3
whisper_init_with_params_no_state: backends   = 3
whisper_model_load: loading model
whisper_model_load: n_vocab       = 51865
whisper_model_load: n_audio_ctx   = 1500
whisper_model_load: n_audio_state = 1024
whisper_model_load: n_audio_head  

## Step 4: Clean Transcription

Clean the transcription by removing per-sentence timestamps and formatting the content.

### Step 4.1: Configure Cleaned Transcript Filename

In [4]:
import ipywidgets as widgets
from IPython.display import display
from pathlib import Path

# Get the stem of the input file (base name without its final extension) for
# default naming
input_file_stem = Path(input_file_widget.value).stem

# Widget for cleaned transcript filename, pre-filled with "<stem>_clean.txt"
cleaned_transcript_filename_widget = widgets.Text(
    value=f"{input_file_stem}_clean.txt",
    placeholder=f'Enter cleaned transcript filename (default: {input_file_stem}_clean.txt)',
    description='Cleaned Transcript:',
    layout={'width': '500px'},
    # The default label width truncates 'Cleaned Transcript:'; 'initial' lets
    # the label take the room it needs.
    style={'description_width': 'initial'}
)
display(cleaned_transcript_filename_widget)

Text(value='demo_clean.txt', description='Cleaned Transcript:', layout=Layout(width='500px'), placeholder='Ent…

### Step 4.2: Clean and Save Transcription

In [5]:
# Read the raw transcript, clean it, and save the cleaned version.
# Reset the shared result first: the SRT cell reads `cleaned_segments`, and it
# must not see segments left over from an earlier run (or an undefined name)
# when this cell fails or finds no transcript.
cleaned_segments = []
try:
    # Get the transcript file path based on the output directory structure:
    # a 'transcripts' folder next to the output directory. normpath drops any
    # trailing separator first; without it dirname("dir/") returns "dir"
    # itself and the path would point inside the output directory.
    transcript_dir = os.path.join(os.path.dirname(os.path.normpath(output_dir)), 'transcripts')
    transcript_path = os.path.join(transcript_dir, transcript_filename)

    # Get the cleaned transcript filename from the widget or use default based
    # on input file stem. The stripped text is used so surrounding whitespace
    # never becomes part of the filename.
    cleaned_transcript_filename = cleaned_transcript_filename_widget.value.strip() or f"{input_file_stem}_clean.txt"
    cleaned_transcript_path = os.path.join(transcript_dir, cleaned_transcript_filename)

    # Read, clean, and save the transcription
    print("üßπ ÈñãÂßãÊ∏ÖÁêÜËΩâÈåÑÁµêÊûú...")
    print(f"ÂòóË©¶ËÆÄÂèñËΩâÈåÑÊ™îÊ°àÔºö{transcript_path}")
    if os.path.exists(transcript_path):
        print("‚úÖ ÊâæÂà∞ËΩâÈåÑÊ™îÊ°àÔºåÈñãÂßãÊ∏ÖÁêÜ...")
        with open(transcript_path, 'r', encoding='utf-8') as f:
            text = f.read()
        # Plain assignment is enough here: cell code already runs at module
        # level, so no `global` declaration is needed.
        cleaned_segments = clean_transcription(text)
        if cleaned_segments:
            save_cleaned_transcription(cleaned_segments, cleaned_transcript_path)
            print(f"üéâ Ê∏ÖÁêÜÂÆåÊàêÔºÅÊ∏ÖÁêÜÂæåÁöÑËΩâÈåÑÁµêÊûúÂ∑≤ÂÑ≤Â≠òËá≥ {cleaned_transcript_path}")
        else:
            print("‚ö†Ô∏è Ê≤íÊúâÊâæÂà∞ÊúâÊïàÁöÑËΩâÈåÑÂÖßÂÆπÔºåÁÑ°Ê≥ïÂÑ≤Â≠òÊ∏ÖÁêÜÂæåÁöÑÊ™îÊ°à„ÄÇ")
    else:
        print(f"‚ùå ËΩâÈåÑÊ™îÊ°à‰∏çÂ≠òÂú®Ôºö{transcript_path}")
except FileNotFoundError as e:
    print(f"‚ùå Êâæ‰∏çÂà∞Ê™îÊ°àÔºö{e}")
except IOError as e:
    print(f"‚ùå ËÆÄÂèñÊ™îÊ°àÊôÇÁôºÁîüÈåØË™§Ôºö{e}")
except Exception as e:
    print(f"‚ùå Ê∏ÖÁêÜËΩâÈåÑÈÅéÁ®ã‰∏≠ÁôºÁîüÊú™Áü•ÈåØË™§Ôºö{e}")

üßπ ÈñãÂßãÊ∏ÖÁêÜËΩâÈåÑÁµêÊûú...
ÂòóË©¶ËÆÄÂèñËΩâÈåÑÊ™îÊ°àÔºö../data/transcripts/transcription.txt
‚úÖ ÊâæÂà∞ËΩâÈåÑÊ™îÊ°àÔºåÈñãÂßãÊ∏ÖÁêÜ...
üéâ Ê∏ÖÁêÜÂÆåÊàêÔºÅÊ∏ÖÁêÜÂæåÁöÑËΩâÈåÑÁµêÊûúÂ∑≤ÂÑ≤Â≠òËá≥ ../data/transcripts/demo_clean.txt


## Step 5: Convert Transcription to SRT Format

Convert the cleaned transcription to SRT subtitle format.

### Step 5.1: Configure SRT Filename

In [6]:
import ipywidgets as widgets
from IPython.display import display
from pathlib import Path

# Base name of the selected input file with its final extension dropped; it
# seeds the default subtitle filename below.
input_file_stem = Path(input_file_widget.value).stem

# Text box pre-filled with "<stem>.srt"; the user may type a different name.
srt_filename_widget = widgets.Text(
    description='SRT File:',
    value=f"{input_file_stem}.srt",
    placeholder=f'Enter SRT filename (default: {input_file_stem}.srt)',
    layout=dict(width='500px'),
)
display(srt_filename_widget)

Text(value='demo.srt', description='SRT File:', layout=Layout(width='500px'), placeholder='Enter SRT filename …

### Step 5.2: Generate SRT File

In [7]:
# Write the cleaned segments out as an SRT subtitle file.
# Relies on names set by earlier cells: `transcript_dir` and `cleaned_segments`
# from the cleaning step, `input_file_stem` and `srt_filename_widget` from the
# SRT filename step.
try:
    from time_stamp_cleaner import convert_to_srt

    # Get the SRT filename from the widget or use default based on input file
    # stem. The stripped text is used so surrounding whitespace never becomes
    # part of the filename on disk.
    srt_filename = srt_filename_widget.value.strip() or f"{input_file_stem}.srt"
    srt_path = os.path.join(transcript_dir, srt_filename)

    # Convert cleaned segments to SRT format
    print("üìù ÈñãÂßãËΩâÊèõÁÇ∫ SRT Ê†ºÂºè...")
    if cleaned_segments:
        convert_to_srt(cleaned_segments, srt_path)
        print(f"üéâ ËΩâÊèõÂÆåÊàêÔºÅSRT Â≠óÂπïÊ™îÂ∑≤ÂÑ≤Â≠òËá≥ {srt_path}")
    else:
        print("‚ö†Ô∏è Ê≤íÊúâÊ∏ÖÁêÜÂæåÁöÑËΩâÈåÑÂÖßÂÆπÂèØ‰æõËΩâÊèõÁÇ∫ SRT Ê†ºÂºè„ÄÇ")
except Exception as e:
    print(f"‚ùå ËΩâÊèõÁÇ∫ SRT Ê†ºÂºèÊôÇÁôºÁîüÈåØË™§Ôºö{e}")

üìù ÈñãÂßãËΩâÊèõÁÇ∫ SRT Ê†ºÂºè...
üéâ ËΩâÊèõÂÆåÊàêÔºÅSRT Â≠óÂπïÊ™îÂ∑≤ÂÑ≤Â≠òËá≥ ../data/transcripts/demo.srt
