# Format Transformer

This notebook allows you to transform transcript files into different formats, such as SRT for subtitles.

## Step 1: Configure Transcript and Output Settings

Use the widgets below to specify the transcript directory, the input transcript filename, and the desired output SRT filename.

In [None]:
import os
import ipywidgets as widgets
from IPython.display import display

# Values pre-filled in the configuration form
DEFAULT_TRANSCRIPT_DIR = "../data/transcripts"
DEFAULT_TRANSCRIPT_FILENAME = "transcription.txt"
DEFAULT_SRT_FILENAME = "subtitles.srt"


def _config_text_field(initial, hint, label):
    """Build one 500px-wide text input for the configuration form."""
    return widgets.Text(
        value=initial,
        placeholder=hint,
        description=label,
        layout={'width': '500px'},
    )


# One text field per setting; later cells read their `.value`
transcript_dir_widget = _config_text_field(
    DEFAULT_TRANSCRIPT_DIR,
    'Enter transcript directory path',
    'Transcript Dir:',
)
transcript_filename_widget = _config_text_field(
    DEFAULT_TRANSCRIPT_FILENAME,
    'Enter transcript filename',
    'Transcript File:',
)
srt_filename_widget = _config_text_field(
    DEFAULT_SRT_FILENAME,
    'Enter output SRT filename',
    'SRT Filename:',
)

# Render the fields in form order
for _field in (transcript_dir_widget, transcript_filename_widget, srt_filename_widget):
    display(_field)

## Step 2: Convert Transcript to SRT Format

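Convert the specified transcript file into SRT format for subtitles. The transcript is expected to contain timestamp header lines of the form `[MM:SS - MM:SS]` or `[HH:MM:SS - HH:MM:SS]`, each followed on the next line(s) by the text for that interval. The SRT file is written to the same directory as the transcript. This cell reads the widget values when it runs, so re-run it after changing any setting in Step 1.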

In [None]:
import os
import re
import textwrap

def time_to_srt_format(time_str):
    """Convert an ``MM:SS`` or ``HH:MM:SS`` timestamp to SRT's ``HH:MM:SS,000`` form.

    Each field is parsed as an integer, overflow is carried into the next
    larger unit, and every field is zero-padded to at least two digits, so
    inputs such as ``"5:30"`` or ``"75:30"`` still yield a well-formed SRT
    timestamp (``"00:05:30,000"`` and ``"01:15:30,000"`` respectively).

    Args:
        time_str: Timestamp text with two or three colon-separated numeric
            fields.

    Returns:
        str: The timestamp as ``HH:MM:SS,000``. ``"00:00:00,000"`` is returned
        when the input does not have two or three fields, or when any field is
        not a non-negative integer.
    """
    fallback = "00:00:00,000"

    parts = time_str.strip().split(':')
    if len(parts) not in (2, 3):
        return fallback

    try:
        fields = [int(part) for part in parts]
    except ValueError:
        return fallback
    if any(value < 0 for value in fields):
        return fallback

    if len(fields) == 2:  # MM:SS has no hours field
        fields.insert(0, 0)
    hours, minutes, seconds = fields

    # Re-derive the fields from a total so minutes/seconds >= 60 carry over.
    total_seconds = hours * 3600 + minutes * 60 + seconds
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},000"

def _wrap_caption(text, max_line_length=69):
    """Greedily wrap caption text on whitespace without ever emitting an empty line."""
    return textwrap.wrap(
        " ".join(text.split()),  # collapse all whitespace runs to single spaces
        width=max_line_length,
        break_long_words=False,   # an over-long word gets a line of its own
        break_on_hyphens=False,   # break only between whitespace-separated words
    )


def convert_transcript_to_srt(transcript_path, srt_path, max_line_length=69):
    """Convert a timestamped transcript file to an SRT subtitle file.

    The transcript is split on header lines of the form ``[START - END]``
    followed by a newline, where each time is ``M:SS``/``MM:SS`` or
    ``HH:MM:SS``. The text after each header becomes one numbered SRT cue;
    headers with no text are skipped. Text before the first header is ignored.

    Caption text is wrapped on word boundaries. A word longer than the limit
    is kept whole on its own line, and no empty line is ever produced inside a
    cue (a blank line would terminate the cue in SRT).

    Errors are reported with a printed message rather than raised.

    Args:
        transcript_path: Path of the UTF-8 transcript file to read.
        srt_path: Path of the SRT file to write (UTF-8, overwritten).
        max_line_length: Maximum number of characters per caption line.

    Returns:
        None.
    """
    try:
        with open(transcript_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # With two capture groups, re.split yields
        # [preamble, start, end, text, start, end, text, ...].
        segments = re.split(r'\[(\d{1,3}:\d{2}(?::\d{2})?) - (\d{1,3}:\d{2}(?::\d{2})?)\]\n', content)
        srt_content = []

        for start, end, body in zip(segments[1::3], segments[2::3], segments[3::3]):
            lines = _wrap_caption(body, max_line_length)
            if not lines:  # header with no caption text
                continue
            index = len(srt_content) + 1  # cues are numbered from 1, skipping empty ones
            start_time = time_to_srt_format(start)
            end_time = time_to_srt_format(end)
            srt_content.append(f"{index}\n{start_time} --> {end_time}\n" + "\n".join(lines) + "\n")

        # Each cue already ends with a newline, so joining with "\n" leaves
        # one blank line between cues.
        with open(srt_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(srt_content))
        print(f"🎬 SRT file created successfully at: {srt_path}")
    except Exception as e:
        # Best-effort: report the failure in the notebook output instead of raising.
        print(f"❌ Error converting transcript to SRT: {e}")

# Define paths for transcript and SRT output using widget values
transcript_dir = transcript_dir_widget.value
transcript_path = os.path.join(transcript_dir, transcript_filename_widget.value)
srt_path = os.path.join(transcript_dir, srt_filename_widget.value)

# Convert transcript to SRT
# isfile() rather than exists(): an empty filename field makes transcript_path
# resolve to the directory itself, which exists but cannot be read as a file.
if os.path.isfile(transcript_path):
    convert_transcript_to_srt(transcript_path, srt_path)
else:
    print(f"❌ Transcript file does not exist: {transcript_path}")