# GUI for Audio Transcription

This notebook provides a graphical user interface (GUI) for processing audio files, generating transcripts, and converting them to SRT format. It uses functions defined in `scripts/gui.py`.

## Step 1: Setup and Configuration

Configure the input file or folder, output directory, and other settings using interactive widgets.

In [None]:
import os
import sys
import ipywidgets as widgets
from IPython.display import display

# ---- Default configuration values ----

# Input and output locations (relative paths)
DEFAULT_INPUT_FOLDER = "../data/demo"
DEFAULT_INPUT_FILE = "../data/demo/demo.wav"
DEFAULT_OUTPUT_DIR = "../data/output_clips"
# Used when the transcript filename field is left blank
DEFAULT_TRANSCRIPT_FILENAME = "transcription.txt"

# whisper.cpp executable, model file and transcription language
DEFAULT_WHISPER_EXEC = "../whisper.cpp/build/bin/whisper-cli"
DEFAULT_WHISPER_MODEL = "../whisper.cpp/models/ggml-medium.bin"
DEFAULT_LANGUAGE = "zh"

# Clip length: seconds in single-file mode, minutes in folder mode
DEFAULT_CLIP_DURATION_SEC = 5
DEFAULT_CLIP_DURATION_MIN = 1

# Parallelism: worker count, and whether to use threads (False = multiprocessing)
DEFAULT_WORKERS = 3
DEFAULT_USE_THREADS = False

# Rest time between transcriptions for folder processing, in seconds
DEFAULT_REST_TIME = 180

# Widgets for configuration
#
# All controls share one width. The description column is sized to its text
# ('initial') because several labels, e.g. 'Clip Duration (sec):', are longer
# than the default description width and would otherwise be truncated.
_WIDGET_LAYOUT = {'width': '500px'}
_WIDGET_STYLE = {'description_width': 'initial'}

# Processing mode: one audio file or every file in a folder
mode_widget = widgets.Dropdown(
    options=[('Single File', 'single'), ('Folder', 'folder')],
    value='single',
    description='Mode:',
    layout=_WIDGET_LAYOUT,
    style=_WIDGET_STYLE,
)

# Input path used in single-file mode
input_file_widget = widgets.Text(
    value=DEFAULT_INPUT_FILE,
    placeholder='Enter input audio file path',
    description='Input File:',
    layout=_WIDGET_LAYOUT,
    style=_WIDGET_STYLE,
)

# Input path used in folder mode
input_folder_widget = widgets.Text(
    value=DEFAULT_INPUT_FOLDER,
    placeholder='Enter input folder path',
    description='Input Folder:',
    layout=_WIDGET_LAYOUT,
    style=_WIDGET_STYLE,
)

output_dir_widget = widgets.Text(
    value=DEFAULT_OUTPUT_DIR,
    placeholder='Enter output directory',
    description='Output Dir:',
    layout=_WIDGET_LAYOUT,
    style=_WIDGET_STYLE,
)

# Clip length in seconds (single-file mode)
clip_duration_sec_widget = widgets.IntSlider(
    value=DEFAULT_CLIP_DURATION_SEC,
    min=1,
    max=1800,
    step=1,
    description='Clip Duration (sec):',
    layout=_WIDGET_LAYOUT,
    style=_WIDGET_STYLE,
)

# Clip length in minutes (folder mode)
clip_duration_min_widget = widgets.IntSlider(
    value=DEFAULT_CLIP_DURATION_MIN,
    min=1,
    max=30,
    step=1,
    description='Clip Duration (min):',
    layout=_WIDGET_LAYOUT,
    style=_WIDGET_STYLE,
)

whisper_exec_widget = widgets.Text(
    value=DEFAULT_WHISPER_EXEC,
    placeholder='Enter Whisper.cpp executable path',
    description='Whisper Exec:',
    layout=_WIDGET_LAYOUT,
    style=_WIDGET_STYLE,
)

whisper_model_widget = widgets.Text(
    value=DEFAULT_WHISPER_MODEL,
    placeholder='Enter Whisper model path',
    description='Whisper Model:',
    layout=_WIDGET_LAYOUT,
    style=_WIDGET_STYLE,
)

language_widget = widgets.Dropdown(
    options=[('Chinese (zh)', 'zh'), ('English (en)', 'en')],
    value=DEFAULT_LANGUAGE,
    description='Language:',
    layout=_WIDGET_LAYOUT,
    style=_WIDGET_STYLE,
)

# Starts empty; the placeholder advertises the default filename
transcript_filename_widget = widgets.Text(
    value='',
    placeholder=f'Enter transcript filename (default: {DEFAULT_TRANSCRIPT_FILENAME})',
    description='Transcript File:',
    layout=_WIDGET_LAYOUT,
    style=_WIDGET_STYLE,
)

workers_widget = widgets.IntSlider(
    value=DEFAULT_WORKERS,
    min=1,
    max=8,
    step=1,
    description='Workers:',
    layout=_WIDGET_LAYOUT,
    style=_WIDGET_STYLE,
)

# True selects multithreading, False selects multiprocessing
use_threads_widget = widgets.Dropdown(
    options=[('Multithreading', True), ('Multiprocessing', False)],
    value=DEFAULT_USE_THREADS,
    description='Parallel Method:',
    layout=_WIDGET_LAYOUT,
    style=_WIDGET_STYLE,
)

# Rest time in seconds (folder mode)
rest_time_widget = widgets.IntSlider(
    value=DEFAULT_REST_TIME,
    min=0,
    max=600,
    step=10,
    description='Rest Time (sec):',
    layout=_WIDGET_LAYOUT,
    style=_WIDGET_STYLE,
)

# Display widgets based on mode
def update_widgets(change):
    """Redraw the configuration form for the mode that was just selected.

    Registered as an observer on ``mode_widget``'s ``value`` trait. Clears the
    current cell output and displays the mode dropdown followed by the
    controls that apply to the selected mode.

    Args:
        change: Change notification passed by the observer mechanism; only
            its ``'new'`` entry (the newly selected mode value) is read.
    """
    # The notebook's import cell only brings in `display`, so `clear_output`
    # was an undefined name here and every mode switch raised NameError.
    from IPython.display import clear_output

    if change['new'] == 'single':
        mode_controls = (
            input_file_widget,
            output_dir_widget,
            clip_duration_sec_widget,
            whisper_exec_widget,
            whisper_model_widget,
            language_widget,
            transcript_filename_widget,
            workers_widget,
            use_threads_widget,
        )
    else:
        mode_controls = (
            input_folder_widget,
            output_dir_widget,
            clip_duration_min_widget,
            whisper_exec_widget,
            whisper_model_widget,
            language_widget,
            workers_widget,
            use_threads_widget,
            rest_time_widget,
        )

    # wait=True delays clearing until new output is available
    clear_output(wait=True)
    display(mode_widget)
    for control in mode_controls:
        display(control)

# Redraw the form whenever the selected mode changes
mode_widget.observe(update_widgets, names='value')

# Initial display: the mode dropdown starts on 'single', so show the
# single-file controls in the same order update_widgets uses for that mode.
for _initial_control in (
    mode_widget,
    input_file_widget,
    output_dir_widget,
    clip_duration_sec_widget,
    whisper_exec_widget,
    whisper_model_widget,
    language_widget,
    transcript_filename_widget,
    workers_widget,
    use_threads_widget,
):
    display(_initial_control)

## Step 2: Import GUI Functions

Import the functions needed for processing audio files from `scripts/gui.py`.

In [None]:
import os
import sys

# Add the scripts directory to the path so we can import the functions.
# The location is resolved relative to the current working directory.
scripts_dir = os.path.abspath(os.path.join(os.getcwd(), '../scripts'))
# Re-running this cell must not keep appending the same entry to sys.path
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

from gui import (
    process_single_audio,
    process_folder_audio,
    clean_transcripts,
    convert_to_srt_files,
    load_hallucinations,
    clean_srt_file,
)

## Step 3: Process Audio and Generate Transcript(s)

Run the processing pipeline to convert, split, transcribe, and clean the audio file(s).

In [None]:
# Read the current widget values and run the pipeline for the selected mode.
# Any failure is reported as a one-line message rather than a traceback.
try:
    # Settings shared by both modes
    mode = mode_widget.value
    output_dir = output_dir_widget.value
    whisper_exec = whisper_exec_widget.value
    whisper_model = whisper_model_widget.value
    language = language_widget.value
    workers = workers_widget.value
    use_threads = use_threads_widget.value

    if mode == 'single':
        input_file = input_file_widget.value
        clip_duration_sec = clip_duration_sec_widget.value
        # Pass on the trimmed name so stray leading/trailing whitespace never
        # ends up in the filename; fall back to the default when blank.
        transcript_filename = transcript_filename_widget.value.strip() or DEFAULT_TRANSCRIPT_FILENAME
        process_single_audio(input_file, output_dir, clip_duration_sec, whisper_exec, whisper_model, language, transcript_filename, workers, use_threads)
    else:
        input_folder = input_folder_widget.value
        clip_duration_sec = clip_duration_min_widget.value * 60  # Convert minutes to seconds
        rest_time = rest_time_widget.value
        process_folder_audio(input_folder, output_dir, clip_duration_sec, whisper_exec, whisper_model, language, workers, use_threads, rest_time)
except Exception as e:
    print(f"❌ Error during processing: {e}")

## Step 4: Clean Transcriptions

Clean the transcriptions by removing per-sentence timestamps and formatting the content.

In [None]:
# Transcripts are read from a 'transcripts' folder under the parent of the
# configured output directory. The result is kept in `cleaned_segments_dict`.
try:
    output_parent = os.path.dirname(output_dir_widget.value)
    transcript_dir = os.path.join(output_parent, 'transcripts')
    cleaned_segments_dict = clean_transcripts(transcript_dir)
except Exception as err:
    print(f"❌ Error during cleaning: {err}")

## Step 5: Convert to SRT Format

Convert the cleaned transcriptions to SRT subtitle format. This step uses the `cleaned_segments_dict` produced in Step 4, so run Step 4 first.

In [None]:
# Hand the transcripts folder and the cleaned segments to the SRT converter.
# `cleaned_segments_dict` must already exist in the notebook namespace.
try:
    output_parent = os.path.dirname(output_dir_widget.value)
    transcript_dir = os.path.join(output_parent, 'transcripts')
    convert_to_srt_files(transcript_dir, cleaned_segments_dict)
except Exception as err:
    print(f"❌ Error during SRT conversion: {err}")

## Step 6: Clean SRT Files

Clean the SRT files by removing hallucinated or blacklisted strings.

In [None]:
# Run clean_srt_file over every SRT file in the transcripts folder, passing
# the phrases returned by load_hallucinations().
try:
    transcript_dir = os.path.join(os.path.dirname(output_dir_widget.value), 'transcripts')
    # Files whose name starts with 'cleaned_' are skipped (presumably output of
    # an earlier cleaning run; confirm against clean_srt_file). Sorted because
    # os.listdir() returns entries in arbitrary order and the progress log
    # should be reproducible between runs.
    srt_files = sorted(
        f for f in os.listdir(transcript_dir)
        if f.endswith('.srt') and not f.startswith('cleaned_')
    )
    blacklisted_phrases = load_hallucinations()

    if not srt_files:
        print(f"❌ No SRT files found in {transcript_dir}")
    else:
        print(f"📝 Found {len(srt_files)} SRT files to clean")
        for idx, srt_file in enumerate(srt_files, 1):
            srt_path = os.path.join(transcript_dir, srt_file)
            print(f"Cleaning SRT file {idx}/{len(srt_files)}: {srt_file}...")
            clean_srt_file(srt_path, blacklisted_phrases)
        print("🎉 All SRT files cleaned!")
except Exception as e:
    print(f"❌ Error during SRT cleaning: {e}")