# **Translate an MKV subtitle from english to european portuguese**

### Breakdown of the Code:

1. **Environment Setup:**
   - **Loading Environment Variables:** Handles the loading of environment variables from a `.env` file.
   - **OpenAI Client Initialization:** Sets up the OpenAI client with the API key.

2. **Translation Function:**
   - **`translate_text`:** Translates text using the OpenAI GPT-4 model and handles errors.

3. **SRT Parsing and Writing:**
   - **`parse_srt`:** Parses SRT files into a list of subtitle segments.
   - **`write_srt`:** Writes subtitle segments to a new SRT file.

4. **Subtitle Extraction and Track Information:**
   - **`extract_subtitles`:** Extracts subtitles from MKV files using `mkvextract`.
   - **`get_subtitle_track_details`:** Retrieves details about subtitle tracks from an MKV file using `mkvinfo`.

5. **User Interaction:**
   - **`get_user_confirmation`:** Prompts the user for a yes/no confirmation using a Tkinter messagebox.
   - **`select_mkv_file`:** Opens a file dialog to let the user select an MKV file.

6. **Main Function:**
   - **`main`:** Coordinates the overall process, including file selection, subtitle extraction, user prompts, and translation.

In [1]:
#!pip install --upgrade httpx
#!pip install transformers torch pysrt tqdm sacremoses nest_asyncio

# Import necessary libraries

In [2]:
import os
import subprocess
import re
import tkinter as tk
from tkinter import filedialog, messagebox
from dotenv import load_dotenv
from openai import OpenAI
from tqdm.auto import tqdm
from IPython.display import display, Markdown

# --- Environment Setup ---

In [3]:
# Load environment variables from .env file
load_dotenv()

# Retrieve the OpenAI API key from environment variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("API key not found. Set the OPENAI_API_KEY environment variable in your .env file.")

# Initialize OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY)

# --- Translation Function ---

In [4]:
import os
import re
import openai
from tqdm.auto import tqdm
from dotenv import load_dotenv
from tkinter import Tk, filedialog
from termcolor import colored
from IPython.display import display, Markdown

# Load environment variables from .env file
load_dotenv()

# Retrieve the OpenAI API key from environment variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("API key not found. Set the OPENAI_API_KEY environment variable in your .env file.")

# Initialize OpenAI client
openai.api_key = OPENAI_API_KEY

def translate_text(text, source_lang="en", target_lang="pt-PT"):
    """Translate text using GPT-4-turbo."""
    # Display the first 100 characters of the text being translated
    display(Markdown(f"**Translating text:** `{text[:100]}...`"))

    response = openai.ChatCompletion.create(
        model="gpt-4-turbo",  # or the appropriate model for your use case
        messages=[
            {"role": "system", "content": f"You are a translator. Translate from {source_lang} to {target_lang}."},
            {"role": "user", "content": text}
        ],
        temperature=0,  # Adjust temperature if needed
        max_tokens=500
    )
    translation = response.choices[0].message['content'].strip()
    return translation

def parse_srt(file_path):
    """Parse SRT file into a list of tuples (index, timestamp, text)."""
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    subtitles = []
    parts = re.split(r'\n\n', content.strip())
    for part in parts:
        lines = part.split('\n')
        if len(lines) >= 3:
            index = lines[0]
            timestamp = lines[1]
            text = '\n'.join(lines[2:])
            subtitles.append((index, timestamp, text))
    return subtitles

def write_srt(subtitles, file_path):
    """Write translated subtitles to a new SRT file."""
    with open(file_path, 'w', encoding='utf-8') as file:
        for index, timestamp, text in subtitles:
            file.write(f"{index}\n{timestamp}\n{text}\n\n")

def display_translations(original, translated):
    """Display original and translated texts in colored markdown."""
    display(Markdown(f"<span style='color: blue'>**Original:** {original}</span>"))
    display(Markdown(f"<span style='color: green'>**Translated:** {translated}</span>"))
    print(colored(f"Original: {original}", "blue"))
    print(colored(f"Translated: {translated}", "green"))

def main(input_file):
    print(colored(f"Reading file: {input_file}", "cyan"))
    subtitles = parse_srt(input_file)
    
    # Generate the output file name
    base, ext = os.path.splitext(input_file)
    output_file = f"{base}_PT-PT{ext}"
    
    # Translate subtitles with a progress bar
    translated_subtitles = []
    for index, timestamp, text in tqdm(subtitles, desc="Translating Subtitles"):
        # Skip sounds or comments
        if re.search(r'\[.*?\]', text):
            translated_subtitles.append((index, timestamp, text))
            continue
        
        # Translate and correct the text
        translated_text = translate_text(text)
        
        # Ensure text fits in 2 lines
        lines = translated_text.split('\n')
        if len(lines) > 2:
            translated_text = '\n'.join(lines[:2])
        
        # Display original and translated texts
        display_translations(text, translated_text)
        
        translated_subtitles.append((index, timestamp, translated_text))
    
    # Write the translated subtitles to a new SRT file
    write_srt(translated_subtitles, output_file)
    print(colored(f"Translation complete! Translated file saved as {output_file}", "green"))

if __name__ == "__main__":
    # Create a Tkinter root window and hide it
    root = Tk()
    root.withdraw()
    
    # Ask the user to select an SRT file
    input_subtitle_path = filedialog.askopenfilename(
        title="Select the English SRT file",
        filetypes=[("SRT files", "*.srt"), ("All files", "*.*")]
    )
    
    # Check if a file was selected
    if input_subtitle_path:
        main(input_subtitle_path)
    else:
        print(colored("No file selected. Exiting.", "red"))


[36mReading file: /Users/f.nuno/Downloads/Despicable.Me.4.2024.720p.WEBRip.800MB.x264-GalaxyRG[TGx]/Extracted_Subtitles/subtitle_track_3.srt[0m


Translating Subtitles:   0%|          | 0/1859 [00:00<?, ?it/s]

**Translating text:** `Ah. Hmm....`

APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


# --- SRT Parsing and Writing ---

In [None]:
def parse_srt(file_path):
    """Parse SRT file into a list of tuples (index, timestamp, text)."""
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    subtitles = []
    parts = re.split(r'\n\n', content.strip())
    for part in parts:
        lines = part.split('\n')
        if len(lines) >= 3:
            index = lines[0]
            timestamp = lines[1]
            text = '\n'.join(lines[2:])
            # Filter out comments or sound indicators
            if not re.match(r'^\[.*\]$', text.strip()):
                subtitles.append((index, timestamp, text))
    return subtitles

def write_srt(subtitles, file_path):
    """Write subtitles to a new SRT file."""
    with open(file_path, 'w', encoding='utf-8') as file:
        for index, timestamp, text in subtitles:
            file.write(f"{index}\n{timestamp}\n{text}\n\n")

# --- Subtitle Extraction and Track Information ---

In [None]:
def extract_subtitles(file_path, output_dir, track_id):
    """Extract subtitles from MKV file using mkvextract."""
    srt_path = os.path.join(output_dir, f'subtitle_track_{track_id}.srt')
    try:
        subprocess.run(['/usr/local/bin/mkvextract', 'tracks', file_path, f'{track_id}:{srt_path}'], check=True)
        return srt_path
    except subprocess.CalledProcessError as e:
        display(Markdown(f"**Failed to extract subtitles:** {e}"))
        return None

def get_subtitle_track_details(file_path):
    """Get subtitle track details from MKV file."""
    mkvinfo_path = '/usr/local/bin/mkvinfo'
    mkvextract_path = '/usr/local/bin/mkvextract'

    if not os.path.isfile(mkvinfo_path) or not os.access(mkvinfo_path, os.X_OK):
        display(Markdown(f"**Error:** '{mkvinfo_path}' is not found or not executable."))
        return "Unknown Title", [], ""

    if not os.path.isfile(mkvextract_path) or not os.access(mkvextract_path, os.X_OK):
        display(Markdown(f"**Error:** '{mkvextract_path}' is not found or not executable."))
        return "Unknown Title", [], ""

    try:
        result = subprocess.run([mkvinfo_path, file_path], capture_output=True, text=True, check=True)
        lines = result.stdout.splitlines()
        track_details = []
        track_info = {}
        title = "Unknown Title"
        for i, line in enumerate(lines):
            if "Title:" in line:
                title = line.split(":")[1].strip()
            if "Track number" in line:
                track_id = int(line.split(":")[1].split()[0])
                track_info = {"id": track_id}
            if "Track type: subtitles" in line:
                track_info["type"] = "subtitles"
                for j in range(i+1, i+10):
                    if "Language:" in lines[j]:
                        track_info["language"] = lines[j].split(":")[1].strip()
                    if "Codec ID:" in lines[j]:
                        track_info["codec"] = lines[j].split(":")[1].strip()
                track_details.append(track_info)
                track_info = {}
        return title, track_details, result.stdout
    except subprocess.CalledProcessError as e:
        display(Markdown(f"**Failed to get MKV file information:** {e}"))
        return "Unknown Title", [], ""
    except FileNotFoundError as e:
        display(Markdown(f"**Error:** {e}"))
        return "Unknown Title", [], ""

# --- User Interaction ---

In [None]:
def get_user_confirmation(message):
    """Prompt the user for confirmation."""
    root = tk.Tk()
    root.withdraw()  # Hide the root window
    return messagebox.askyesno("Confirmation", message)

def select_mkv_file():
    """Ask the user to select an MKV file."""
    root = tk.Tk()
    root.withdraw()  # Hide the root window
    file_path = filedialog.askopenfilename(
        title="Select MKV file",
        filetypes=[("MKV files", "*.mkv"), ("All files", "*.*")]
    )
    root.destroy()
    return file_path

# --- Main Function ---

In [None]:
def main():
    # Select the MKV file through GUI
    file_path = select_mkv_file()
    if not file_path:
        display(Markdown("**No file selected.**"))
        return

    # Output directory for the extracted subtitles
    output_dir = os.path.join(os.path.dirname(file_path), 'Extracted_Subtitles')
    os.makedirs(output_dir, exist_ok=True)

    # Get detailed information about subtitle tracks and the movie title
    movie_title, subtitle_tracks, mkv_info = get_subtitle_track_details(file_path)
    if not subtitle_tracks:
        display(Markdown(f"**No subtitle tracks found in the MKV file '{movie_title}'.**"))
        return

    # Display the movie title
    display(Markdown(f"**Movie Title:** {movie_title}"))

    # Check for English subtitles and extract them
    english_found = False
    for track in subtitle_tracks:
        if track.get('language') == 'eng':
            english_found = True
            srt_path = extract_subtitles(file_path, output_dir, track['id'])
            if srt_path:
                display(Markdown(f"**English subtitle extracted to:** `{srt_path}`"))
                break

    if not english_found:
        display(Markdown("**English subtitle not found**"))
        return

    # Check for Portuguese subtitles
    portuguese_found = any(track.get('language') == 'por' for track in subtitle_tracks)
    if portuguese_found:
        proceed = get_user_confirmation("Portuguese subtitles found. Do you still want to proceed with translation?")
        if not proceed:
            display(Markdown("**Translation aborted by user.**"))
            return

    # Translate subtitles if English subtitle was found
    translated_srt_path = os.path.join(output_dir, 'subtitle_track_eng_pt-PT.srt')
    subtitles = parse_srt(srt_path)
    display(Markdown(f"**Parsed {len(subtitles)} subtitle segments.**"))

    # Translate subtitles with progress tracking
    translated_subtitles = []
    for index, timestamp, text in tqdm(subtitles, desc="Translating Subtitles"):
        translated_text = translate_text(text)
        if text != translated_text:  # Only display if text is actually translated
            display_translations(text, translated_text)
        translated_subtitles.append((index, timestamp, translated_text))
    
    write_srt(translated_subtitles, translated_srt_path)
    display(Markdown(f"**Translated subtitles written to {translated_srt_path}**"))

# Run the main function
main()