# Generate Transcript from Audio

This notebook allows you to process an audio file, split it into clips, and transcribe the content using Whisper.cpp. You can configure the settings using interactive widgets.

## Step 1: Setup and Configuration

Configure the input file, output directory, and other settings using interactive widgets.

In [26]:
import os
import ipywidgets as widgets
from IPython.display import display

# Defaults shown in the widgets below; edit here to change what a fresh run starts with.
DEFAULT_INPUT_FILE = "../data//demo/demo.wav"
DEFAULT_OUTPUT_DIR = "../data/output_clips"
DEFAULT_CLIP_DURATION_MIN = 1  # in minutes
DEFAULT_WHISPER_EXEC = "../whisper.cpp/build/bin/whisper-cli"
DEFAULT_WHISPER_MODEL = "../whisper.cpp/models/ggml-medium.bin"
DEFAULT_LANGUAGE = "zh"
DEFAULT_TRANSCRIPT_FILENAME = "transcription.txt"

# Every control uses the same width so the form lines up.
WIDGET_LAYOUT = {'width': '500px'}

# --- Path inputs -----------------------------------------------------------
input_file_widget = widgets.Text(
    description='Input File:',
    placeholder='Enter input audio file path',
    value=DEFAULT_INPUT_FILE,
    layout=WIDGET_LAYOUT,
)
output_dir_widget = widgets.Text(
    description='Output Dir:',
    placeholder='Enter output directory',
    value=DEFAULT_OUTPUT_DIR,
    layout=WIDGET_LAYOUT,
)

# --- Clip length, chosen in whole minutes (1-30) ----------------------------
clip_duration_widget = widgets.IntSlider(
    description='Clip Duration (min):',
    value=DEFAULT_CLIP_DURATION_MIN,
    min=1,
    max=30,
    step=1,
    layout=WIDGET_LAYOUT,
)

# --- Whisper.cpp binary, model and language ----------------------------------
whisper_exec_widget = widgets.Text(
    description='Whisper Exec:',
    placeholder='Enter Whisper.cpp executable path',
    value=DEFAULT_WHISPER_EXEC,
    layout=WIDGET_LAYOUT,
)
whisper_model_widget = widgets.Text(
    description='Whisper Model:',
    placeholder='Enter Whisper model path',
    value=DEFAULT_WHISPER_MODEL,
    layout=WIDGET_LAYOUT,
)
language_widget = widgets.Dropdown(
    description='Language:',
    options=[('Chinese (zh)', 'zh'), ('English (en)', 'en')],
    value=DEFAULT_LANGUAGE,
    layout=WIDGET_LAYOUT,
)

# --- Transcript filename; starts empty, the placeholder advertises the default
transcript_filename_widget = widgets.Text(
    description='Transcript File:',
    placeholder=f'Enter transcript filename (default: {DEFAULT_TRANSCRIPT_FILENAME})',
    value='',
    layout=WIDGET_LAYOUT,
)

# Render the form in the same top-to-bottom order as the definitions above.
display(
    input_file_widget,
    output_dir_widget,
    clip_duration_widget,
    whisper_exec_widget,
    whisper_model_widget,
    language_widget,
    transcript_filename_widget,
)

Text(value='../data//demo/demo.wav', description='Input File:', layout=Layout(width='500px'), placeholder='Ent…

Text(value='../data/output_clips', description='Output Dir:', layout=Layout(width='500px'), placeholder='Enter…

IntSlider(value=1, description='Clip Duration (min):', layout=Layout(width='500px'), max=30, min=1)

Text(value='../whisper.cpp/build/bin/whisper-cli', description='Whisper Exec:', layout=Layout(width='500px'), …

Text(value='../whisper.cpp/models/ggml-medium.bin', description='Whisper Model:', layout=Layout(width='500px')…

Dropdown(description='Language:', layout=Layout(width='500px'), options=(('Chinese (zh)', 'zh'), ('English (en…

Text(value='', description='Transcript File:', layout=Layout(width='500px'), placeholder='Enter transcript fil…

## Step 2: Import Audio Processing Functions

Import the audio-processing functions from `voice2transcripts.py` and the transcript-cleaning functions from `time_stamp_cleaner.py`.

In [27]:
import os
import sys

# Make ../scripts (resolved against the notebook's working directory) importable
# so the helper modules below can be found.
scripts_dir = os.path.abspath(os.path.join(os.getcwd(), '../scripts'))
# Guard the append so re-running this cell does not pile up duplicate entries.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)
from voice2transcripts import clear_output_folder, convert_to_wav, split_audio, transcribe_audio
from time_stamp_cleaner import clean_transcription, save_cleaned_transcription

## Step 3: Process Audio and Generate Transcript

Run the processing pipeline: clear the output folder, convert the input to WAV, split it into clips, and transcribe the clips. Cleaning the transcript happens in Step 4.

In [28]:
try:
    # Snapshot the widget values into plain variables; later cells reuse
    # output_dir and transcript_filename.
    input_file = input_file_widget.value
    output_dir = output_dir_widget.value
    clip_duration_sec = clip_duration_widget.value * 60  # Convert minutes to seconds
    whisper_exec = whisper_exec_widget.value
    whisper_model = whisper_model_widget.value
    language = language_widget.value
    # Strip so a name typed with stray spaces is not used verbatim; an empty
    # or whitespace-only entry falls back to the default filename.
    transcript_filename = transcript_filename_widget.value.strip() or DEFAULT_TRANSCRIPT_FILENAME

    # isfile rather than exists: a directory path must not be accepted as the
    # audio input.
    if not os.path.isfile(input_file):
        raise FileNotFoundError(f"❌ 找不到輸入音訊檔案：{input_file}")

    # Clear old files
    clear_output_folder(output_dir)

    # Execute the processing pipeline
    print("🚀 開始音訊處理與轉錄流程...")
    wav_file = convert_to_wav(input_file, output_dir)
    clip_files = split_audio(wav_file, clip_duration_sec, output_dir)
    transcribe_audio(clip_files, output_dir, whisper_exec, whisper_model, language, transcript_filename)

    # Report the transcript location as a normalized path (no "dir/../" hop).
    # NOTE(review): assumes transcribe_audio writes to <output_dir>/../transcripts,
    # as the original message implied — confirm against voice2transcripts.py.
    saved_transcript_path = os.path.normpath(
        os.path.join(output_dir, os.pardir, 'transcripts', transcript_filename)
    )
    print(f"🎉 轉錄處理完成！轉錄結果已儲存至 {saved_transcript_path}")
except Exception as e:
    # Best-effort reporting: the error is printed instead of raised, so later
    # cells may still run against stale or missing variables.
    print(f"❌ 處理過程中發生錯誤：{e}")

🗑️ 輸出資料夾已清除！
🚀 開始音訊處理與轉錄流程...
🔄 音訊已轉換為 WAV 格式：../data/output_clips/converted.wav
✅ 音訊切割完成，共 2 個片段，時間戳已儲存
🎤 轉錄片段 1/2: clip_001.wav ...
⚠️ Whisper.cpp 訊息: whisper_init_from_file_with_params_no_state: loading model from '../whisper.cpp/models/ggml-medium.bin'
whisper_init_with_params_no_state: use gpu    = 1
whisper_init_with_params_no_state: flash attn = 0
whisper_init_with_params_no_state: gpu_device = 0
whisper_init_with_params_no_state: dtw        = 0
whisper_init_with_params_no_state: devices    = 3
whisper_init_with_params_no_state: backends   = 3
whisper_model_load: loading model
whisper_model_load: n_vocab       = 51865
whisper_model_load: n_audio_ctx   = 1500
whisper_model_load: n_audio_state = 1024
whisper_model_load: n_audio_head  = 16
whisper_model_load: n_audio_layer = 24
whisper_model_load: n_text_ctx    = 448
whisper_model_load: n_text_state  = 1024
whisper_model_load: n_text_head   = 16
whisper_model_load: n_text_layer  = 24
whisper_model_load: n_mels        = 80
whisper_m

## Step 4: Clean Transcription

Clean the transcription by removing per-sentence timestamps and formatting the content.

### Step 4.1: Configure Cleaned Transcript Filename

In [29]:
import ipywidgets as widgets
from IPython.display import display

# Optional filename for the cleaned transcript. It starts empty; the
# placeholder tells the user which default name applies when left blank.
cleaned_transcript_filename_widget = widgets.Text(
    description='Cleaned Transcript:',
    placeholder='Enter cleaned transcript filename (default: clean_transcription.txt)',
    value='',
    layout={'width': '500px'},
)

display(cleaned_transcript_filename_widget)

Text(value='', description='Cleaned Transcript:', layout=Layout(width='500px'), placeholder='Enter cleaned tra…

### Step 4.2: Clean and Save Transcription

In [30]:
try:
    # The transcripts folder is a sibling of the output directory. Building it
    # as <output_dir>/../transcripts and normalizing keeps the result correct
    # even when output_dir ends with a slash (dirname() would then return the
    # output directory itself instead of its parent).
    transcript_dir = os.path.normpath(os.path.join(output_dir, os.pardir, 'transcripts'))
    transcript_path = os.path.join(transcript_dir, transcript_filename)

    # Use the widget value when given; strip it so stray spaces do not end up
    # in the filename, and fall back to the default when it is blank.
    cleaned_transcript_filename = cleaned_transcript_filename_widget.value.strip() or "clean_transcription.txt"
    cleaned_transcript_path = os.path.join(transcript_dir, cleaned_transcript_filename)

    # Read, clean, and save the transcription
    print("🧹 開始清理轉錄結果...")
    print(f"嘗試讀取轉錄檔案：{transcript_path}")
    if os.path.exists(transcript_path):
        print("✅ 找到轉錄檔案，開始清理...")
        with open(transcript_path, 'r', encoding='utf-8') as f:
            text = f.read()
        cleaned_segments = clean_transcription(text)
        # Only write a file when cleaning produced something.
        if cleaned_segments:
            save_cleaned_transcription(cleaned_segments, cleaned_transcript_path)
            print(f"🎉 清理完成！清理後的轉錄結果已儲存至 {cleaned_transcript_path}")
        else:
            print("⚠️ 沒有找到有效的轉錄內容，無法儲存清理後的檔案。")
    else:
        print(f"❌ 轉錄檔案不存在：{transcript_path}")
except FileNotFoundError as e:
    print(f"❌ 找不到檔案：{e}")
except IOError as e:
    print(f"❌ 讀取檔案時發生錯誤：{e}")
except Exception as e:
    # Also reached when output_dir / transcript_filename are undefined because
    # the Step 3 cell has not been run successfully in this kernel.
    print(f"❌ 清理轉錄過程中發生未知錯誤：{e}")

🧹 開始清理轉錄結果...
嘗試讀取轉錄檔案：../data/transcripts/transcription.txt
✅ 找到轉錄檔案，開始清理...
🎉 清理完成！清理後的轉錄結果已儲存至 ../data/transcripts/clean_transcription.txt


## Step 5: Convert Transcript to SRT Format

Convert the cleaned transcript into a continuous SRT file format for subtitles.

In [31]:
import re
import os

def time_to_srt_format(time_str):
    """Convert an ``MM:SS`` or ``HH:MM:SS`` timestamp to SRT's ``HH:MM:SS,000``.

    The value is normalized rather than just suffixed: every field is
    zero-padded to two digits and overflow is carried upwards, so ``"1:05"``
    becomes ``"00:01:05,000"`` and ``"75:30"`` becomes ``"01:15:30,000"``.

    Args:
        time_str: Timestamp text with two or three colon-separated numeric
            fields. Surrounding whitespace is ignored.

    Returns:
        The timestamp as ``HH:MM:SS,000``. Input with the wrong number of
        fields, or with any non-numeric field, yields ``"00:00:00,000"``.
    """
    fallback = '00:00:00,000'

    fields = time_str.strip().split(':')
    # isdecimal() rejects empty, signed and non-numeric fields, which also
    # guarantees that int() below cannot raise.
    if len(fields) not in (2, 3) or not all(field.isdecimal() for field in fields):
        return fallback

    numbers = [int(field) for field in fields]
    if len(numbers) == 2:  # MM:SS -> assume zero hours
        numbers.insert(0, 0)
    hours, minutes, seconds = numbers

    # Go through total seconds so that e.g. 75 minutes rolls over into hours.
    total_seconds = hours * 3600 + minutes * 60 + seconds
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},000"

def convert_transcript_to_srt(transcript_path, srt_path):
    """Write an SRT file built from the ``[start - end]`` sections of a transcript.

    The transcript is split on header lines of the form ``[MM:SS - MM:SS]``
    or ``[HH:MM:SS - HH:MM:SS]``. Each section with non-empty text becomes one
    numbered SRT cue; sections whose text is blank are skipped and do not
    consume a cue number. Any text before the first header is ignored.

    Args:
        transcript_path: Path of the UTF-8 transcript to read.
        srt_path: Path of the UTF-8 SRT file to write.

    Returns:
        None. Success or failure is reported with ``print``; exceptions are
        caught and printed rather than propagated.
    """
    header_pattern = r'\[(\d{1,3}:\d{2}(?::\d{2})?) - (\d{1,3}:\d{2}(?::\d{2})?)\]\n'
    try:
        with open(transcript_path, 'r', encoding='utf-8') as source:
            pieces = re.split(header_pattern, source.read())

        # pieces[0] is whatever precedes the first header; after that the list
        # repeats as (start, end, body) triples because the pattern has two
        # capture groups. zip() drops any incomplete trailing triple.
        cues = []
        for raw_start, raw_end, raw_body in zip(pieces[1::3], pieces[2::3], pieces[3::3]):
            cue_start = time_to_srt_format(raw_start)
            cue_end = time_to_srt_format(raw_end)
            body = raw_body.strip()
            if not body:
                continue
            # Cue numbers are 1-based and count only the cues actually emitted.
            cues.append(f"{len(cues) + 1}\n{cue_start} --> {cue_end}\n{body}\n")

        # Each cue already ends in a newline, so joining with one more yields
        # a blank line between cues.
        with open(srt_path, 'w', encoding='utf-8') as target:
            target.write('\n'.join(cues))
        print(f"🎬 SRT file created successfully at: {srt_path}")
    except Exception as e:
        print(f"❌ Error converting transcript to SRT: {e}")

# Define paths for transcript and SRT output.
# The transcripts folder is a sibling of the output directory; building it as
# <output_dir>/../transcripts and normalizing keeps the result correct even
# when output_dir ends with a slash (dirname() would then return the output
# directory itself instead of its parent).
transcript_dir = os.path.normpath(os.path.join(output_dir, os.pardir, 'transcripts'))
# NOTE(review): the Step 5 text mentions the cleaned transcript, but this reads
# the Step 3 transcript (transcript_filename) — confirm which one is intended.
transcript_path = os.path.join(transcript_dir, transcript_filename)
srt_path = os.path.join(transcript_dir, 'subtitles.srt')

# Convert transcript to SRT; report instead of failing when the input is missing.
if os.path.exists(transcript_path):
    convert_transcript_to_srt(transcript_path, srt_path)
else:
    print(f"❌ Transcript file does not exist: {transcript_path}")

🎬 SRT file created successfully at: ../data/transcripts/subtitles.srt
