# Load and preprocess

In [1]:
# Settings
import os  # imported here because this cell runs before the notebook's import cell

fpath = "demo.srt" # path of the srt file
batch_size = 20 # number of lines to translate per request
TARGET_LANG = "french" # target language
# Read the key from the environment so it is never saved inside the notebook.
# Falls back to "" (the previous placeholder value) when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "") # openai api key

In [2]:
import os
import re

# Split the path into base name and extension, e.g. "dir/demo.srt" -> ("demo", ".srt").
# splitext only removes the final extension, unlike str.replace which removed every match.
fname, ftype = os.path.splitext(os.path.basename(fpath))

# "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which would
# otherwise become part of the first subtitle id.
with open(fpath, "r", encoding="utf-8-sig") as f:
    contents = f.read()

# Subtitle entries are separated by one or more blank lines. Stripping first
# avoids an empty trailing entry when the file ends with blank lines.
contents = contents.strip()
lines = re.split(r"\n\s*\n", contents) if contents else []

# transform subtitle text into a list of dict
raw_subtitles = []  # [{"id": <entry number as str>, "text": <subtitle text>}, ...]
id_to_time = {}     # entry id -> timing line, used to rebuild the file later

for entry in lines:
    parts = entry.strip().split("\n")
    if len(parts) < 2:
        # Not a complete entry (needs at least an id line and a timing line).
        continue
    sub_id = parts[0].strip()
    raw_subtitles.append({"id": sub_id, "text": "\n".join(parts[2:])})
    id_to_time[sub_id] = parts[1].strip()

# Translate

In [3]:
# System-prompt template for the translation requests.
# It is rendered with str.format(target_lang=...), so `{target_lang}` is the only
# placeholder and the literal braces of the JSON example are doubled (`{{` / `}}`).
PROMPT = """You are a professional translation engine.
Please translate the text into {target_lang} without explanation.

### Instructions
1. The user will provide the original text in the form of a list.
2. Each JSON object in the list will contain 'id' and the original text content 'text'.
3. When translating each text, you must consider all the text within the entire list.
4. Output in JSON format, with a list 'texts' containing each JSON object with the text id and the translated text 'translated'.

### Output Format
{{
    "texts": [
        {{"id": "1", "translated": "Synthetic data is becoming increasingly important in accelerating the development of both large-scale and small-scale language models."}},
        {{"id": "2", "translated": "There are already several successful use cases."}},
        {{"id": "3", "translated": "However, researchers have discovered issues such as model collapse and imitation of other models."}}
    ]
}}
"""

In [4]:
import json
import time

import openai
from openai import OpenAI
from tqdm.auto import trange

# API client built from the key in the settings cell; `translate` issues its
# chat-completion requests through this object.
client = OpenAI(api_key=OPENAI_API_KEY)

In [5]:
def translate(chunk, model="gpt-4o"):
    """Translate one batch of subtitles with a single chat-completion request.

    Args:
        chunk: list of ``{"id": ..., "text": ...}`` dicts to translate together.
        model: name of the chat model to use (defaults to the previous
            hard-coded value).

    Returns:
        The ``"texts"`` list parsed from the model's JSON reply. The prompt asks
        for ``{"id": ..., "translated": ...}`` objects, but the reply is not
        validated here.

    Raises:
        ValueError: if the response contains no choices.
        Any error raised by the API client, ``json.loads`` (malformed reply) or
        the ``"texts"`` lookup (``KeyError``) propagates to the caller.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": PROMPT.format(target_lang=TARGET_LANG)},
            # Send real JSON, as the prompt promises, instead of the Python repr
            # of the list. ensure_ascii=False keeps non-ASCII text readable
            # rather than \u-escaped.
            {"role": "user", "content": json.dumps(chunk, ensure_ascii=False)},
        ],
        response_format={"type": "json_object"},
    )
    if not response.choices:
        # Fail loudly instead of implicitly returning None.
        raise ValueError("translation response contained no choices")
    return json.loads(response.choices[0].message.content)["texts"]

In [6]:
translated = []  # translated entries, in request order
failed = []      # ids of subtitles whose batch could not be translated

# calculate number of requests: ceil(len(raw_subtitles) / batch_size)
iterations = (len(raw_subtitles) + batch_size - 1) // batch_size

for batch_idx in trange(iterations):
    start = batch_idx * batch_size
    batch = raw_subtitles[start:start + batch_size]
    try:
        try:
            result = translate(batch)
        except openai.RateLimitError:
            # Back off once, then retry. The retry sits inside the outer try so
            # a second failure is recorded instead of aborting the whole run.
            time.sleep(60)
            result = translate(batch)
        translated.extend(result)
    except Exception:
        # Best effort: remember the batch and keep going. `Exception` (not a
        # bare except) lets KeyboardInterrupt stop the loop.
        failed.extend(item["id"] for item in batch)

if failed:
    print(f"{len(failed)} subtitle(s) could not be translated: {failed}")

  0%|          | 0/1 [00:00<?, ?it/s]

# Postprocessing

In [7]:
# Rebuild each subtitle entry as "<id>\n<timing line>\n<translated text>".
translated_subtitles = []
for obj in translated:
    # The model may return the id as a JSON number; the lookup table is keyed by str.
    sub_id = str(obj["id"])
    timing = id_to_time.get(sub_id)
    if timing is None:
        # The id does not exist in the source file; skip it rather than abort
        # the whole cell with a KeyError.
        print(f"Skipping unknown subtitle id: {sub_id!r}")
        continue
    translated_subtitles.append(f"{sub_id}\n{timing}\n{obj['translated']}")

In [8]:
# translated srt will be saved in the same directory as this notebook
translated_fname = f"{fname}_translated{ftype}"
# Write UTF-8 explicitly: the platform default encoding may be unable to
# represent the target language's characters.
with open(translated_fname, "w", encoding="utf-8") as f:
    f.write("\n\n".join(translated_subtitles))

In [9]:
# show every translated entry, separated by a blank line
print(*translated_subtitles, sep="\n\n")

1
00:00:01,000 --> 00:00:03,500
Bonjour, bienvenue dans la vidéo.

2
00:00:04,000 --> 00:00:06,000
Aujourd'hui, nous allons discuter des bases des fichiers SRT.

3
00:00:06,500 --> 00:00:09,000
Vous apprendrez comment les créer et les formater.

4
00:00:09,500 --> 00:00:12,000
Commençons !

5
00:00:12,500 --> 00:00:15,000
Tout d'abord, vous avez besoin d'un éditeur de texte pour écrire votre fichier SRT.

6
00:00:15,500 --> 00:00:18,000
Assurez-vous de l'enregistrer avec l'extension .srt.

7
00:00:18,500 --> 00:00:20,500
Chaque entrée de sous-titre commence par un numéro.

8
00:00:21,000 --> 00:00:23,500
Ensuite, vous avez besoin d'un code temporel pour quand le sous-titre doit apparaître.

9
00:00:24,000 --> 00:00:27,000
Le format du code temporel est : heures:minutes:secondes,millisecondes

10
00:00:27,500 --> 00:00:30,000
Enfin, vous écrivez le texte du sous-titre sous le code temporel.
