Update translation block: 1) Support using the translate function independently.

2) Support uploading a local ASS or SRT file for translation and exporting the translated file in ASS or SRT format. 3) Support translating to English. 4) Use the pysubs2 library to simplify the code and improve efficiency. 5) Show a translation progress bar.
Ayanaminn · Apr 1, 2023 · 8e1d796 · 8e1d796
1 parent ef2e1b5
commit 8e1d796
Showing 1 changed file with 110 additions and 200 deletions.
diff --git a/N46Whisper.ipynb b/N46Whisper.ipynb
@@ -434,223 +434,127 @@
         "# @markdown **</br>**<font size=\"2\"> This feature allows users to translate previously transcribed subtitle text line by line using AI translation.\n",
         "# @markdown **</br>**It then generates bilingual subtitle files in the same subtitle style. Read the documentation to learn more.</font>\n",
         "\n",
+        "# @markdown **</br><font size=\"3\">Select subtitle file source</br>\n",
+        "# @markdown <font size=\"3\">选择字幕文件(使用上一步的转录-use_transcribed/新上传-upload_new）</br>**\n",
+        "# @markdown <font size=\"2\">支持SRT与ASS文件\n",
+        "sub_source = \"upload_new\"  # @param [\"use_transcribed\",\"upload_new\"]\n",
+        "\n",
         "# @markdown **chatGPT:**\n",
-        "# @markdown **</br>**<font size=\"2\"> 要使用chatGPT翻译，请填入你自己的OpenAI API Key，然后执行单元格。</font>\n",
+        "# @markdown **</br>**<font size=\"2\"> 要使用chatGPT翻译，请填入你自己的OpenAI API Key，目标语言，输出类型，然后执行单元格。</font>\n",
         "# @markdown **</br>**<font size=\"2\"> Please input your own OpenAI API Key, target language, and output format, then execute this cell.</font>\n",
         "# @markdown **</br>**<font size=\"2\">【注意】 免费的API对速度有所限制，需要较长时间，用户可以自行考虑付费方案。</font>\n",
         "# @markdown **</br>**<font size=\"2\">【Note】The free API has usage limitations; consider a paid plan to speed things up.</font>\n",
+        "openai_key = '' # @param {type:\"string\"}\n",
+        "target_language = 'zh-hans'# @param [\"zh-hans\",\"english\"]\n",
+        "output_format = \"ass\"  # @param [\"ass\",\"srt\"]\n",
         "\n",
-        "!pip install openai\n",
         "import sys\n",
         "import os\n",
         "import re\n",
         "import codecs\n",
         "import regex as re\n",
+        "from pathlib import Path\n",
+        "from tqdm import tqdm\n",
+        "from google.colab import files\n",
+        "from IPython.display import clear_output \n",
+        "\n",
+        "clear_output()\n",
+        "\n",
+        "if sub_source == 'upload_new':\n",
+        "  uploaded = files.upload()\n",
+        "  sub_name = list(uploaded.keys())[0]\n",
+        "  sub_basename = Path(sub_name).stem\n",
+        "elif sub_source == 'use_transcribed':\n",
+        "  sub_name = file_basename +'.ass'\n",
+        "  sub_basename = file_basename\n",
+        "\n",
+        "!pip install openai\n",
+        "!pip install pysubs2\n",
         "import openai\n",
-        "from srt2ass import STYLE_DICT\n",
-        "\n",
-        "# test for code obfuscation\n",
-        "class ChatGPTAPI ():#line:12\n",
-        "    def __init__ (OO000OOOO0OOOOOOO ,OO00O0000O0O0O0O0 ,O00O00OO0OO0OO0OO ):#line:13\n",
-        "        OO000OOOO0OOOOOOO .key =OO00O0000O0O0O0O0 #line:14\n",
-        "        OO000OOOO0OOOOOOO .language =O00O00OO0OO0OO0OO #line:16\n",
-        "        OO000OOOO0OOOOOOO .key_len =len (OO00O0000O0O0O0O0 .split (\",\"))#line:17\n",
-        "    def translate (OOO0OO000O0OOO0OO ,OO00O0OOOO00O0O00 ):#line:23\n",
-        "        # print (OO00O0OOOO00O0O00 )#line:24\n",
-        "        openai .api_key =OOO0OO000O0OOO0OO .key #line:26\n",
-        "        try :#line:27\n",
-        "            OO00OO000O0O000O0 =openai .ChatCompletion .create (model =\"gpt-3.5-turbo\",messages =[{\"role\":\"user\",\"content\":f\"Please help me to translate,`{OO00O0OOOO00O0O00}` to {OOO0OO000O0OOO0OO.language}, please return only translated content not include the origin text\",}],)#line:37\n",
-        "            O00O000000000O00O =(OO00OO000O0O000O0 [\"choices\"][0 ].get (\"message\").get (\"content\").encode (\"utf8\").decode ())#line:44\n",
-        "        except Exception as OO0O000000OO00000 :#line:45\n",
-        "            O00O00O00OOOO00O0 =int (60 /OOO0OO000O0OOO0OO .key_len )#line:47\n",
-        "            time .sleep (O00O00O00OOOO00O0 )#line:48\n",
-        "            print (OO0O000000OO00000 ,f\"will sleep  {O00O00O00OOOO00O0} seconds\")#line:49\n",
-        "            openai .api_key =OOO0OO000O0OOO0OO .key #line:51\n",
-        "            OO00OO000O0O000O0 =openai .ChatCompletion .create (model =\"gpt-3.5-turbo\",messages =[{\"role\":\"user\",\"content\":f\"Please help me to translate,`{OO00O0OOOO00O0O00}` to {OOO0OO000O0OOO0OO.language}, please return only translated content not include the origin text\",}],)#line:60\n",
-        "            O00O000000000O00O =(OO00OO000O0O000O0 [\"choices\"][0 ].get (\"message\").get (\"content\").encode (\"utf8\").decode ())#line:67\n",
-        "        return O00O000000000O00O #line:69\n",
+        "import pysubs2\n",
+        "\n",
+        "clear_output()\n",
         "\n",
         "# original code\n",
-        "# class ChatGPTAPI():\n",
-        "#     def __init__(self, key, language):\n",
-        "#         self.key = key\n",
-        "#         # self.keys = itertools.cycle(key.split(\",\"))\n",
-        "#         self.language = language\n",
-        "#         self.key_len = len(key.split(\",\"))\n",
-        "\n",
-        "\n",
-        "#     # def rotate_key(self):\n",
-        "#     #     openai.api_key = next(self.keys)\n",
-        "\n",
-        "#     def translate(self, text):\n",
-        "#         print(text)\n",
-        "#         # self.rotate_key()\n",
-        "#         openai.api_key = self.key\n",
-        "#         try:\n",
-        "#             completion = openai.ChatCompletion.create(\n",
-        "#                 model=\"gpt-3.5-turbo\",\n",
-        "#                 messages=[\n",
-        "#                     {\n",
-        "#                         \"role\": \"user\",\n",
-        "#                         # english prompt here to save tokens\n",
-        "#                         \"content\": f\"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text\",\n",
-        "#                     }\n",
-        "#                 ],\n",
-        "#             )\n",
-        "#             t_text = (\n",
-        "#                 completion[\"choices\"][0]\n",
-        "#                 .get(\"message\")\n",
-        "#                 .get(\"content\")\n",
-        "#                 .encode(\"utf8\")\n",
-        "#                 .decode()\n",
-        "#             )\n",
-        "#         except Exception as e:\n",
-        "#             # TIME LIMIT for open api , pay to reduce the waiting time\n",
-        "#             sleep_time = int(60 / self.key_len)\n",
-        "#             time.sleep(sleep_time)\n",
-        "#             print(e, f\"will sleep  {sleep_time} seconds\")\n",
-        "#             # self.rotate_key()\n",
-        "#             openai.api_key = self.key\n",
-        "#             completion = openai.ChatCompletion.create(\n",
-        "#                 model=\"gpt-3.5-turbo\",\n",
-        "#                 messages=[\n",
-        "#                     {\n",
-        "#                         \"role\": \"user\",\n",
-        "#                         \"content\": f\"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text\",\n",
-        "#                     }\n",
-        "#                 ],\n",
-        "#             )\n",
-        "#             t_text = (\n",
-        "#                 completion[\"choices\"][0]\n",
-        "#                 .get(\"message\")\n",
-        "#                 .get(\"content\")\n",
-        "#                 .encode(\"utf8\")\n",
-        "#                 .decode()\n",
-        "#             )\n",
-        "#         # print(t_text)\n",
-        "#         return t_text\n",
+        "class ChatGPTAPI():\n",
+        "    def __init__(self, key, language):\n",
+        "        self.key = key\n",
+        "        # self.keys = itertools.cycle(key.split(\",\"))\n",
+        "        self.language = language\n",
+        "        self.key_len = len(key.split(\",\"))\n",
         "\n",
-        "class SubtitleTranslator():\n",
-        "    def __init__(self, srt_src, model, key, language, sub_style):\n",
-        "        self.srt_src = srt_src\n",
-        "        self.translate_model = model(key, language)\n",
-        "        self.sub_style = sub_style\n",
-        "\n",
-        "\n",
-        "    def read_srt(self, srt_src):\n",
-        "        # use correct codec to encode the input file\n",
-        "        encodings = [\"utf-32\", \"utf-16\", \"utf-8\", \"cp1252\", \"gb2312\", \"gbk\", \"big5\"]\n",
-        "        tmp = ''\n",
-        "        for enc in encodings:\n",
-        "            try:\n",
-        "                with codecs.open(srt_src, mode=\"r\", encoding=enc) as fd:\n",
-        "                    # return an instance of StreamReaderWriter\n",
-        "                    tmp = fd.read()\n",
-        "                    break\n",
-        "            except:\n",
-        "                # print enc + ' failed'\n",
-        "                continue\n",
-        "        return [tmp, enc]\n",
         "\n",
-        "    def extract_srt(self):\n",
-        "        src = self.read_srt(self.srt_src)\n",
-        "        content = src[0]\n",
-        "        # encoding = src[1] # Will not encode so do not need to pass codec para\n",
-        "        src = ''\n",
-        "        utf8bom = ''\n",
+        "    # def rotate_key(self):\n",
+        "    #     openai.api_key = next(self.keys)\n",
         "\n",
-        "        if u'\\ufeff' in content:\n",
-        "            content = content.replace(u'\\ufeff', '')\n",
-        "            utf8bom = u'\\ufeff'\n",
+        "    def translate(self, text):\n",
+        "        # print(text)\n",
+        "        # self.rotate_key()\n",
+        "        openai.api_key = self.key\n",
+        "        try:\n",
+        "            completion = openai.ChatCompletion.create(\n",
+        "                model=\"gpt-3.5-turbo\",\n",
+        "                messages=[\n",
+        "                    {\n",
+        "                        \"role\": \"user\",\n",
+        "                        # english prompt here to save tokens\n",
+        "                        \"content\": f\"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text\",\n",
+        "                    }\n",
+        "                ],\n",
+        "            )\n",
+        "            t_text = (\n",
+        "                completion[\"choices\"][0]\n",
+        "                .get(\"message\")\n",
+        "                .get(\"content\")\n",
+        "                .encode(\"utf8\")\n",
+        "                .decode()\n",
+        "            )\n",
+        "        except Exception as e:\n",
+        "            # TIME LIMIT for open api , pay to reduce the waiting time\n",
+        "            sleep_time = int(60 / self.key_len)\n",
+        "            time.sleep(sleep_time)\n",
+        "            print(e, f\"will sleep  {sleep_time} seconds\")\n",
+        "            # self.rotate_key()\n",
+        "            openai.api_key = self.key\n",
+        "            completion = openai.ChatCompletion.create(\n",
+        "                model=\"gpt-3.5-turbo\",\n",
+        "                messages=[\n",
+        "                    {\n",
+        "                        \"role\": \"user\",\n",
+        "                        \"content\": f\"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text\",\n",
+        "                    }\n",
+        "                ],\n",
+        "            )\n",
+        "            t_text = (\n",
+        "                completion[\"choices\"][0]\n",
+        "                .get(\"message\")\n",
+        "                .get(\"content\")\n",
+        "                .encode(\"utf8\")\n",
+        "                .decode()\n",
+        "            )\n",
+        "        # print(t_text)\n",
+        "        return t_text\n",
         "\n",
-        "        content = content.replace(\"\\r\", \"\")\n",
-        "        sub_lines = [x.strip() for x in content.split(\"\\n\") if x.strip()]\n",
-        "        return sub_lines\n",
+        "class SubtitleTranslator():\n",
+        "\n",
+        "    def __init__(self, sub_src, model, key, language):\n",
+        "        self.sub_src = sub_src\n",
+        "        self.translate_model = model(key, language)\n",
         "\n",
         "    def translate_by_line(self):\n",
-        "        utf8bom = ''\n",
-        "        subLines = ''\n",
-        "        dlgLines = ''\n",
-        "        lineCount = 0\n",
-        "        sub_lines = self.extract_srt()\n",
-        "        output_file = '.'.join(self.srt_src.split('.')[:-1])\n",
-        "        output_file += '_translate.ass'\n",
-        "\n",
-        "        for ln in range(len(sub_lines)):\n",
-        "            line = sub_lines[ln]\n",
-        "            # if line index element\n",
-        "            if line.isdigit() and re.match('-?\\d\\d:\\d\\d:\\d\\d', sub_lines[(ln + 1)]):\n",
-        "                # for each index, create an empty dialogue line for construct ass line\n",
-        "                if dlgLines:\n",
-        "                    subLines += dlgLines + \"\\n\"\n",
-        "                dlgLines = ''\n",
-        "                lineCount = 0\n",
-        "                continue\n",
-        "            else:\n",
-        "                # if time stamp element, construct the time stamp part for the dialogue line\n",
-        "                if re.match('-?\\d\\d:\\d\\d:\\d\\d', line):\n",
-        "                    line = line.replace('-0', '0')\n",
-        "                    if self.sub_style == 'default':\n",
-        "                        dlgLines += 'Dialogue: 0,' + line + ',default,,0,0,0,,'\n",
-        "                    elif self.sub_style == 'ikedaCN':\n",
-        "                        dlgLines += 'Dialogue: 0,' + line + ',池田字幕1080p,,0,0,0,,'\n",
-        "                    elif self.sub_style == 'sugawaraCN':\n",
-        "                        dlgLines += 'Dialogue: 0,' + line + ',中字 1080P,,0,0,0,,'\n",
-        "                    elif self.sub_style == 'kaedeCN':\n",
-        "                        dlgLines += 'Dialogue: 0,' + line + ',den SR红色,,0,0,0,,'\n",
-        "                    elif self.sub_style == 'taniguchiCN':\n",
-        "                        dlgLines += 'Dialogue: 0,' + line + ',正文_1080P,,0,0,0,,'\n",
-        "                # if text element, construct(append) the text part for the dialogue line\n",
-        "                else:\n",
-        "                    if lineCount < 2:\n",
-        "                        t_line = self.translate_model.translate(line)\n",
-        "                        dlgLines += line + (r'\\N' + t_line.strip())\n",
-        "\n",
-        "                        print(line + (r'\\N' + t_line.strip()))\n",
-        "                    else:\n",
-        "                        t_line = self.translate_model.translate(line)\n",
-        "                        dlgLines += \"\\n\" + line + (r'\\N' + t_line.strip())\n",
-        "\n",
-        "                        print(line + (r'\\N' + t_line.strip()))\n",
-        "                lineCount += 1\n",
-        "            ln += 1\n",
-        "\n",
-        "        subLines += dlgLines + \"\\n\"\n",
-        "\n",
-        "        subLines = re.sub(r'\\d(\\d:\\d{2}:\\d{2}),(\\d{2})\\d', '\\\\1.\\\\2', subLines)\n",
-        "        subLines = re.sub(r'\\s+-->\\s+', ',', subLines)\n",
-        "\n",
-        "        if self.sub_style == 'default':\n",
-        "            head_name = 'head_str_default'\n",
-        "        elif self.sub_style == 'ikedaCN':\n",
-        "            head_name = 'head_str_ikeda'\n",
-        "        elif self.sub_style == 'sugawaraCN':\n",
-        "            head_name = 'head_str_sugawara'\n",
-        "        elif self.sub_style == 'kaedeCN':\n",
-        "            head_name = 'head_str_kaede'\n",
-        "        elif self.sub_style == \"taniguchiCN\":\n",
-        "            head_name = 'head_str_taniguchi'\n",
-        "\n",
-        "        head_str = STYLE_DICT.get(head_name)\n",
-        "        output_str = utf8bom + head_str + '\\n' + subLines\n",
-        "        # encode again for head string\n",
-        "        output_str = output_str.encode('utf8')\n",
-        "\n",
-        "        with open(output_file, 'wb') as output:\n",
-        "            output.write(output_str)\n",
-        "\n",
-        "        output_file = output_file.replace('\\\\', '\\\\\\\\')\n",
-        "        output_file = output_file.replace('/', '//')\n",
-        "        return output_file\n",
+        "        sub_trans = pysubs2.load(self.sub_src)\n",
+        "        total_lines = len(sub_trans)\n",
+        "        for line in tqdm(sub_trans,total = total_lines):\n",
+        "            line_trans = self.translate_model.translate(line.text)\n",
+        "            line.text += (r'\\N'+ line_trans)\n",
+        "            print(line_trans)\n",
         "\n",
+        "        return sub_trans\n",
         "\n",
         "\n",
         "clear_output()\n",
         "\n",
         "translate_model = ChatGPTAPI\n",
-        "openai_key = '' # @param {type:\"string\"}\n",
-        "target_language = 'zh-hans'\n",
-        "srt_file = file_basename + \".srt\"\n",
         "\n",
         "assert translate_model is not None, \"unsupported model\"\n",
         "OPENAI_API_KEY = openai_key\n",
@@ -663,23 +567,29 @@
         "#     OPENAI_API_KEY = openai_key\n",
         "\n",
         "t = SubtitleTranslator(\n",
-        "    srt_src=srt_file,\n",
+        "    sub_src=sub_name,\n",
         "    model= translate_model,\n",
         "    key = OPENAI_API_KEY,\n",
-        "    language=target_language,\n",
-        "    sub_style = sub_style)\n",
+        "    language=target_language)\n",
         "\n",
         "translation = t.translate_by_line()\n",
         "\n",
-        "#Anonymous usage data for stats\n",
-        "#Comment out this block if you do not want send your data\n",
+        "#Download ass file\n",
+        "\n",
+        "if output_format == 'ass':\n",
+        "  translation.save(sub_basename + '_translation.ass')\n",
+        "  files.download(sub_basename + '_translation.ass')\n",
+        "elif output_format == 'srt':\n",
+        "  translation.save(sub_basename + '_translation.srt')\n",
+        "  files.download(sub_basename + '_translation.srt')\n",
+        "\n",
+        "# #Anonymous usage data for stats\n",
+        "# #Comment out this block if you do not want send your data\n",
         "try: \n",
-        "  requests.get(f'https://api.callmebot.com/whatsapp.php?phone=61402628080&text={file_basename}+OpenAI&apikey=8080872')\n",
+        "  requests.get(f'https://api.callmebot.com/whatsapp.php?phone=61402628080&text={sub_basename}+OpenAI&apikey=8080872')\n",
         "except Exception as e:\n",
         "  pass\n",
         "\n",
-        "#Download ass file\n",
-        "files.download(translation)\n",
         "print('双语字幕生成完毕 All done!')\n",
         "\n",
         "# @markdown **</br>**<font size='4'>**实验功能的开发亦是为了尝试帮助大家更有效率的制作字幕。但是只有在用户实际使用体验反馈的基础上，此应用才能不断完善，如果您有任何想法，都欢迎以任何方式联系我，提出[issue](https://github.com/Ayanaminn/N46Whisper/issues)或者分享在[讨论区](https://github.com/Ayanaminn/N46Whisper/discussions)。**\n",