In [2]:
from translate import (
    ORIGINAL_SRT,
    load_srt,
    make_dict,
    items_per_time,
    translate_text,
    origin_lang,
    target_lang
)
from pprint import pprint
from itertools import islice
from config import headers, proxies, api_base
import requests
import time
import json
import re

# Name of the chat model sent in the "model" field of each request;
# get_keyword also checks it for the substring "qwen" when picking max_tokens.
model_to_use = "gpt-4o-mini"

def dict_chunks(dictionary, n):
    """Iterate over *dictionary* in consecutive sub-dicts of at most *n* items.

    The items view is captured when the function is called; the returned
    iterator then produces one dict per batch, in insertion order, and stops
    as soon as a batch comes back empty.
    """
    pairs = iter(dictionary.items())

    def _batches():
        while True:
            batch = dict(islice(pairs, n))
            # An empty batch means the source is exhausted (or n is 0).
            if batch == {}:
                return
            yield batch

    return _batches()

def find_last_json_list(text):
    """Return the last bracketed list found in *text*, parsed into a Python list.

    The text is scanned for ``[...]`` spans (non-greedy, so the pattern is
    intended for flat lists; a nested list is not captured as a whole) and the
    last span is parsed. Strict JSON is tried first; a Python-literal parse is
    the fallback so replies that use single quotes are still accepted.

    The input is model output and therefore untrusted, so it is never passed
    to ``eval``: only literals can be produced, no code is executed.

    Returns:
        The parsed value of the last span, or ``None`` when *text* contains
        no bracketed span at all.

    Raises:
        SyntaxError: the last span is neither valid JSON nor a valid Python
            literal.
    """
    import ast  # local import: only needed for the literal fallback below

    candidates = re.findall(r'\[.*?\]', text, re.DOTALL)
    if not candidates:
        return None
    last = candidates[-1]

    try:
        return json.loads(last)
    except json.JSONDecodeError:
        pass  # not strict JSON; try Python literal syntax next

    try:
        return ast.literal_eval(last)
    except (ValueError, SyntaxError, TypeError) as err:
        # Keep raising SyntaxError: that is what existing callers catch.
        raise SyntaxError(f"cannot parse list from reply: {last!r}") from err

def is_reply_valid(text, t_text):
    """Check that the model reply *t_text* contains a non-empty list.

    Args:
        text: The original request text. Not used by the check; kept so the
            signature stays compatible with existing callers.
        t_text: The raw reply text returned by the model.

    Returns:
        ``True`` when the last bracketed list in *t_text* parses to a
        non-empty list, ``False`` otherwise (a diagnostic line is printed).
    """
    try:
        trans = find_last_json_list(t_text)
    except (SyntaxError, ValueError, NameError, TypeError):
        # ValueError also covers json.JSONDecodeError.
        print(f"Error: unable to eval the output of AI: {t_text}")
        return False

    # Explicit checks instead of assert: asserts are removed under `python -O`,
    # which would make every reply look valid.
    if not isinstance(trans, list):
        print(f"Error: Not a list! {type(trans)=}")
        return False
    if not trans:
        print("Error: The list is empty!")
        return False
    return True

def get_keyword(text: str) -> str:
    """Ask the chat model to extract keywords from *text*, retrying on failure.

    The request is POSTed to ``{api_base}/chat/completions`` with
    EXTRACTION_PROMPT as the system message and *text* as the user message.
    A reply is accepted only when ``is_reply_valid`` approves it.

    Args:
        text: The user message to send (the caller passes a JSON string).

    Returns:
        The raw reply text of the first valid response. If no attempt out of
        ``max_retries`` succeeds, the original *text* is returned unchanged,
        so callers must be prepared for a value that is not a model reply.
    """
    max_retries = 5

    # The request body does not change between attempts, so build it once.
    max_tokens = 2000 if "qwen" in model_to_use else 3000
    payload = {
        "model": model_to_use,
        "response_format": {"type": "json_object"},
        "messages": [
            {
                "role": "system",
                "content": EXTRACTION_PROMPT,
            },
            {
                "role": "user",
                "content": text,
            },
        ],
        "max_tokens": max_tokens,
    }
    body = json.dumps(payload)

    for attempt in range(1, max_retries + 1):
        # Reset so the error branch never prints a previous attempt's body.
        completion = {}
        try:
            response = requests.post(
                f"{api_base}/chat/completions",
                headers=headers,
                data=body,
                proxies=proxies,
                timeout=60,
            )
            # Decode before checking the status so an error body can be shown.
            completion = response.json()
            # Explicit check rather than assert: asserts vanish under -O and
            # carry no message.
            if response.status_code != 200:
                raise RuntimeError(
                    f"Unexpected HTTP status {response.status_code}"
                )
            t_text = completion["choices"][0]["message"]["content"]

            if is_reply_valid(text, t_text):
                return t_text

            print()
            print(text)
            print(t_text)
            print(f"Invalid translation format. Retrying ({attempt}/{max_retries})")
            time.sleep(3)

        except Exception as e:
            # Deliberately broad: any network, decoding or schema failure
            # simply counts as one failed attempt.
            sleep_time = 10
            print()
            print(completion)
            print(
                e,
                f"will sleep {sleep_time} seconds, Retrying ({attempt}/{max_retries})",
            )
            time.sleep(sleep_time)

    print(
        f"Unable to get a valid translation after {max_retries} retries. Returning the original text."
    )
    return text

# System prompt (in Chinese) for the extraction step: the model acts as a
# translation coordinator, receives subtitles as a JSON dict, and must answer
# with a JSON list of the foreign-language proper nouns / technical terms,
# optionally reasoning step by step before emitting the list.
EXTRACTION_PROMPT = (
    "你是一个专业的翻译统筹。你负责从输入外语字幕中提取出需要在翻译中统一的专有名词、专业术语等。"
    "字幕会以json字典格式给出，请用json列表格式回复其中的所有的外语关键词。你可以先逐步分析思考，确认关键词是否必要，最后再输出json列表。"
)

# Prompt template (in Chinese) for translating the extracted terms. The two
# positional {} placeholders are filled via str.format with the source and
# target language; the model is asked for a JSON dict of original -> translation.
TRANSLATE_PROMPT = (
    "你是一个专业的术语关键词翻译，请将{}的术语翻译为{}，"
    "并且用json字典输出，键和值分别是原文和翻译。"
)

def _extract_keywords(reply):
    """Parse the keyword list out of a model reply; return [] if none parses."""
    try:
        parsed = find_last_json_list(reply)
    except (SyntaxError, ValueError, NameError, TypeError) as err:
        print(f"Warning: could not parse keywords from reply: {err}")
        return []
    if not isinstance(parsed, list):
        # Covers None (no list found in the reply).
        return []
    # Keep only strings: other items cannot be de-duplicated or translated.
    return [kw for kw in parsed if isinstance(kw, str)]


def main():
    """Extract keywords from the subtitle file and print their translation.

    Steps:
      1. Load the subtitles and turn them into a dict via ``make_dict``.
      2. Feed the dict to ``get_keyword`` in chunks of ``items_per_time``
         entries and collect every keyword list that comes back.
      3. De-duplicate the keywords and run them through ``get_keyword`` once
         more as a consolidation pass.
      4. Send the final list to ``translate_text`` and print the result.
    """
    subs = load_srt(ORIGINAL_SRT)
    feed_dict = make_dict(subs)
    input("Ready. Press enter to continue. . .")

    keywords = []
    for chunk in dict_chunks(feed_dict, items_per_time):
        # Only element [1] of each value is sent to the model.
        # NOTE(review): the variable name suggests the other element holds
        # the timestamp - confirm against make_dict.
        no_timestamp = {k: v[1] for k, v in chunk.items()}
        reply = get_keyword(json.dumps(no_timestamp).lower())
        print(f"Reply: {reply}")
        # get_keyword falls back to returning its input when every retry
        # fails, so the reply may contain no parsable list at all.
        keywords += _extract_keywords(reply)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic between runs (unlike set()).
    keywords = list(dict.fromkeys(keywords))
    if not keywords:
        print("No keywords were extracted; nothing to translate.")
        return

    merged = _extract_keywords(get_keyword(json.dumps(keywords)))
    # If the consolidation pass yields nothing usable, keep the raw list.
    all_kws = merged or keywords

    pprint(all_kws)

    source_text = json.dumps(all_kws)
    translation = translate_text(source_text, TRANSLATE_PROMPT.format(origin_lang, target_lang), False)
    print(translation)


# Guarded so that importing this code does not start the interactive run;
# when executed as a script or notebook cell __name__ is "__main__".
if __name__ == "__main__":
    main()

In [1]:
# Sample glossary: source term -> Chinese rendering.
x = {
    "hardware 4": "硬件4",
    "three point turn": "三点掉头",
    "roundabout": "环岛",
    "costco": "好市多",
    "smart summon": "智能召唤",
}
# Render every entry as "translation(source)" and separate entries with ", ".
r = ", ".join(f"{translated}({source})" for source, translated in x.items())
print(r)

硬件4(hardware 4), 三点掉头(three point turn), 环岛(roundabout), 好市多(costco), 智能召唤(smart summon)
