Skip to content

Commit

Permalink
Update translation block: 1) Support using the translate function independent…
Browse files Browse the repository at this point in the history
…ly. 2) Support uploading a local ASS or SRT file for translation and exporting the translated file in ASS or SRT format. 3) Support translating to English. 4) Use the pysubs2 library to simplify the code and improve efficiency. 5) Show a translation progress bar.
  • Loading branch information
Ayanaminn committed Apr 1, 2023
1 parent ef2e1b5 commit 8e1d796
Showing 1 changed file with 110 additions and 200 deletions.
310 changes: 110 additions & 200 deletions N46Whisper.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -434,223 +434,127 @@
"# @markdown **</br>**<font size=\"2\"> This feature allows users to translate previously transcribed subtitle text line by line using AI translation.\n",
"# @markdown **</br>**It then generates bilingual subtitle files in the same subtitle style. Read the documentation to learn more.</font>\n",
"\n",
"# @markdown **</br><font size=\"3\">Select subtitle file source</br>\n",
"# @markdown <font size=\"3\">选择字幕文件(使用上一步的转录-use_transcribed/新上传-upload_new)</br>**\n",
"# @markdown <font size=\"2\">支持SRT与ASS文件\n",
"sub_source = \"upload_new\" # @param [\"use_transcribed\",\"upload_new\"]\n",
"\n",
"# @markdown **chatGPT:**\n",
"# @markdown **</br>**<font size=\"2\"> 要使用chatGPT翻译,请填入你自己的OpenAI API Key,然后执行单元格。</font>\n",
"# @markdown **</br>**<font size=\"2\"> 要使用chatGPT翻译,请填入你自己的OpenAI API Key,目标语言,输出类型,然后执行单元格。</font>\n",
"# @markdown **</br>**<font size=\"2\"> Please enter your own OpenAI API key, target language, and output format, then execute this cell.</font>\n",
"# @markdown **</br>**<font size=\"2\">【注意】 免费的API对速度有所限制,需要较长时间,用户可以自行考虑付费方案。</font>\n",
"# @markdown **</br>**<font size=\"2\">【Note】There are usage limitations on the free API; consider a paid plan to speed things up.</font>\n",
"openai_key = '' # @param {type:\"string\"}\n",
"target_language = 'zh-hans'# @param [\"zh-hans\",\"english\"]\n",
"output_format = \"ass\" # @param [\"ass\",\"srt\"]\n",
"\n",
"!pip install openai\n",
"import sys\n",
"import os\n",
"import re\n",
"import codecs\n",
"import regex as re\n",
"from pathlib import Path\n",
"from tqdm import tqdm\n",
"from google.colab import files\n",
"from IPython.display import clear_output \n",
"\n",
"clear_output()\n",
"\n",
"if sub_source == 'upload_new':\n",
" uploaded = files.upload()\n",
" sub_name = list(uploaded.keys())[0]\n",
" sub_basename = Path(sub_name).stem\n",
"elif sub_source == 'use_transcribed':\n",
" sub_name = file_basename +'.ass'\n",
" sub_basename = file_basename\n",
"\n",
"!pip install openai\n",
"!pip install pysubs2\n",
"import openai\n",
"from srt2ass import STYLE_DICT\n",
"\n",
"# test for code obfuscation\n",
"class ChatGPTAPI ():#line:12\n",
" def __init__ (OO000OOOO0OOOOOOO ,OO00O0000O0O0O0O0 ,O00O00OO0OO0OO0OO ):#line:13\n",
" OO000OOOO0OOOOOOO .key =OO00O0000O0O0O0O0 #line:14\n",
" OO000OOOO0OOOOOOO .language =O00O00OO0OO0OO0OO #line:16\n",
" OO000OOOO0OOOOOOO .key_len =len (OO00O0000O0O0O0O0 .split (\",\"))#line:17\n",
" def translate (OOO0OO000O0OOO0OO ,OO00O0OOOO00O0O00 ):#line:23\n",
" # print (OO00O0OOOO00O0O00 )#line:24\n",
" openai .api_key =OOO0OO000O0OOO0OO .key #line:26\n",
" try :#line:27\n",
" OO00OO000O0O000O0 =openai .ChatCompletion .create (model =\"gpt-3.5-turbo\",messages =[{\"role\":\"user\",\"content\":f\"Please help me to translate,`{OO00O0OOOO00O0O00}` to {OOO0OO000O0OOO0OO.language}, please return only translated content not include the origin text\",}],)#line:37\n",
" O00O000000000O00O =(OO00OO000O0O000O0 [\"choices\"][0 ].get (\"message\").get (\"content\").encode (\"utf8\").decode ())#line:44\n",
" except Exception as OO0O000000OO00000 :#line:45\n",
" O00O00O00OOOO00O0 =int (60 /OOO0OO000O0OOO0OO .key_len )#line:47\n",
" time .sleep (O00O00O00OOOO00O0 )#line:48\n",
" print (OO0O000000OO00000 ,f\"will sleep {O00O00O00OOOO00O0} seconds\")#line:49\n",
" openai .api_key =OOO0OO000O0OOO0OO .key #line:51\n",
" OO00OO000O0O000O0 =openai .ChatCompletion .create (model =\"gpt-3.5-turbo\",messages =[{\"role\":\"user\",\"content\":f\"Please help me to translate,`{OO00O0OOOO00O0O00}` to {OOO0OO000O0OOO0OO.language}, please return only translated content not include the origin text\",}],)#line:60\n",
" O00O000000000O00O =(OO00OO000O0O000O0 [\"choices\"][0 ].get (\"message\").get (\"content\").encode (\"utf8\").decode ())#line:67\n",
" return O00O000000000O00O #line:69\n",
"import pysubs2\n",
"\n",
"clear_output()\n",
"\n",
"# original code\n",
"# class ChatGPTAPI():\n",
"# def __init__(self, key, language):\n",
"# self.key = key\n",
"# # self.keys = itertools.cycle(key.split(\",\"))\n",
"# self.language = language\n",
"# self.key_len = len(key.split(\",\"))\n",
"\n",
"\n",
"# # def rotate_key(self):\n",
"# # openai.api_key = next(self.keys)\n",
"\n",
"# def translate(self, text):\n",
"# print(text)\n",
"# # self.rotate_key()\n",
"# openai.api_key = self.key\n",
"# try:\n",
"# completion = openai.ChatCompletion.create(\n",
"# model=\"gpt-3.5-turbo\",\n",
"# messages=[\n",
"# {\n",
"# \"role\": \"user\",\n",
"# # english prompt here to save tokens\n",
"# \"content\": f\"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text\",\n",
"# }\n",
"# ],\n",
"# )\n",
"# t_text = (\n",
"# completion[\"choices\"][0]\n",
"# .get(\"message\")\n",
"# .get(\"content\")\n",
"# .encode(\"utf8\")\n",
"# .decode()\n",
"# )\n",
"# except Exception as e:\n",
"# # TIME LIMIT for open api , pay to reduce the waiting time\n",
"# sleep_time = int(60 / self.key_len)\n",
"# time.sleep(sleep_time)\n",
"# print(e, f\"will sleep {sleep_time} seconds\")\n",
"# # self.rotate_key()\n",
"# openai.api_key = self.key\n",
"# completion = openai.ChatCompletion.create(\n",
"# model=\"gpt-3.5-turbo\",\n",
"# messages=[\n",
"# {\n",
"# \"role\": \"user\",\n",
"# \"content\": f\"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text\",\n",
"# }\n",
"# ],\n",
"# )\n",
"# t_text = (\n",
"# completion[\"choices\"][0]\n",
"# .get(\"message\")\n",
"# .get(\"content\")\n",
"# .encode(\"utf8\")\n",
"# .decode()\n",
"# )\n",
"# # print(t_text)\n",
"# return t_text\n",
"class ChatGPTAPI():\n",
" def __init__(self, key, language):\n",
" self.key = key\n",
" # self.keys = itertools.cycle(key.split(\",\"))\n",
" self.language = language\n",
" self.key_len = len(key.split(\",\"))\n",
"\n",
"class SubtitleTranslator():\n",
" def __init__(self, srt_src, model, key, language, sub_style):\n",
" self.srt_src = srt_src\n",
" self.translate_model = model(key, language)\n",
" self.sub_style = sub_style\n",
"\n",
"\n",
" def read_srt(self, srt_src):\n",
" # use correct codec to encode the input file\n",
" encodings = [\"utf-32\", \"utf-16\", \"utf-8\", \"cp1252\", \"gb2312\", \"gbk\", \"big5\"]\n",
" tmp = ''\n",
" for enc in encodings:\n",
" try:\n",
" with codecs.open(srt_src, mode=\"r\", encoding=enc) as fd:\n",
" # return an instance of StreamReaderWriter\n",
" tmp = fd.read()\n",
" break\n",
" except:\n",
" # print enc + ' failed'\n",
" continue\n",
" return [tmp, enc]\n",
"\n",
" def extract_srt(self):\n",
" src = self.read_srt(self.srt_src)\n",
" content = src[0]\n",
" # encoding = src[1] # Will not encode so do not need to pass codec para\n",
" src = ''\n",
" utf8bom = ''\n",
" # def rotate_key(self):\n",
" # openai.api_key = next(self.keys)\n",
"\n",
" if u'\\ufeff' in content:\n",
" content = content.replace(u'\\ufeff', '')\n",
" utf8bom = u'\\ufeff'\n",
" def translate(self, text):\n",
" # print(text)\n",
" # self.rotate_key()\n",
" openai.api_key = self.key\n",
" try:\n",
" completion = openai.ChatCompletion.create(\n",
" model=\"gpt-3.5-turbo\",\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" # english prompt here to save tokens\n",
" \"content\": f\"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text\",\n",
" }\n",
" ],\n",
" )\n",
" t_text = (\n",
" completion[\"choices\"][0]\n",
" .get(\"message\")\n",
" .get(\"content\")\n",
" .encode(\"utf8\")\n",
" .decode()\n",
" )\n",
" except Exception as e:\n",
" # TIME LIMIT for open api , pay to reduce the waiting time\n",
" sleep_time = int(60 / self.key_len)\n",
" time.sleep(sleep_time)\n",
" print(e, f\"will sleep {sleep_time} seconds\")\n",
" # self.rotate_key()\n",
" openai.api_key = self.key\n",
" completion = openai.ChatCompletion.create(\n",
" model=\"gpt-3.5-turbo\",\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": f\"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text\",\n",
" }\n",
" ],\n",
" )\n",
" t_text = (\n",
" completion[\"choices\"][0]\n",
" .get(\"message\")\n",
" .get(\"content\")\n",
" .encode(\"utf8\")\n",
" .decode()\n",
" )\n",
" # print(t_text)\n",
" return t_text\n",
"\n",
" content = content.replace(\"\\r\", \"\")\n",
" sub_lines = [x.strip() for x in content.split(\"\\n\") if x.strip()]\n",
" return sub_lines\n",
"class SubtitleTranslator():\n",
"\n",
" def __init__(self, sub_src, model, key, language):\n",
" self.sub_src = sub_src\n",
" self.translate_model = model(key, language)\n",
"\n",
" def translate_by_line(self):\n",
" utf8bom = ''\n",
" subLines = ''\n",
" dlgLines = ''\n",
" lineCount = 0\n",
" sub_lines = self.extract_srt()\n",
" output_file = '.'.join(self.srt_src.split('.')[:-1])\n",
" output_file += '_translate.ass'\n",
"\n",
" for ln in range(len(sub_lines)):\n",
" line = sub_lines[ln]\n",
" # if line index element\n",
" if line.isdigit() and re.match('-?\\d\\d:\\d\\d:\\d\\d', sub_lines[(ln + 1)]):\n",
" # for each index, create an empty dialogue line for construct ass line\n",
" if dlgLines:\n",
" subLines += dlgLines + \"\\n\"\n",
" dlgLines = ''\n",
" lineCount = 0\n",
" continue\n",
" else:\n",
" # if time stamp element, construct the time stamp part for the dialogue line\n",
" if re.match('-?\\d\\d:\\d\\d:\\d\\d', line):\n",
" line = line.replace('-0', '0')\n",
" if self.sub_style == 'default':\n",
" dlgLines += 'Dialogue: 0,' + line + ',default,,0,0,0,,'\n",
" elif self.sub_style == 'ikedaCN':\n",
" dlgLines += 'Dialogue: 0,' + line + ',池田字幕1080p,,0,0,0,,'\n",
" elif self.sub_style == 'sugawaraCN':\n",
" dlgLines += 'Dialogue: 0,' + line + ',中字 1080P,,0,0,0,,'\n",
" elif self.sub_style == 'kaedeCN':\n",
" dlgLines += 'Dialogue: 0,' + line + ',den SR红色,,0,0,0,,'\n",
" elif self.sub_style == 'taniguchiCN':\n",
" dlgLines += 'Dialogue: 0,' + line + ',正文_1080P,,0,0,0,,'\n",
" # if text element, construct(append) the text part for the dialogue line\n",
" else:\n",
" if lineCount < 2:\n",
" t_line = self.translate_model.translate(line)\n",
" dlgLines += line + (r'\\N' + t_line.strip())\n",
"\n",
" print(line + (r'\\N' + t_line.strip()))\n",
" else:\n",
" t_line = self.translate_model.translate(line)\n",
" dlgLines += \"\\n\" + line + (r'\\N' + t_line.strip())\n",
"\n",
" print(line + (r'\\N' + t_line.strip()))\n",
" lineCount += 1\n",
" ln += 1\n",
"\n",
" subLines += dlgLines + \"\\n\"\n",
"\n",
" subLines = re.sub(r'\\d(\\d:\\d{2}:\\d{2}),(\\d{2})\\d', '\\\\1.\\\\2', subLines)\n",
" subLines = re.sub(r'\\s+-->\\s+', ',', subLines)\n",
"\n",
" if self.sub_style == 'default':\n",
" head_name = 'head_str_default'\n",
" elif self.sub_style == 'ikedaCN':\n",
" head_name = 'head_str_ikeda'\n",
" elif self.sub_style == 'sugawaraCN':\n",
" head_name = 'head_str_sugawara'\n",
" elif self.sub_style == 'kaedeCN':\n",
" head_name = 'head_str_kaede'\n",
" elif self.sub_style == \"taniguchiCN\":\n",
" head_name = 'head_str_taniguchi'\n",
"\n",
" head_str = STYLE_DICT.get(head_name)\n",
" output_str = utf8bom + head_str + '\\n' + subLines\n",
" # encode again for head string\n",
" output_str = output_str.encode('utf8')\n",
"\n",
" with open(output_file, 'wb') as output:\n",
" output.write(output_str)\n",
"\n",
" output_file = output_file.replace('\\\\', '\\\\\\\\')\n",
" output_file = output_file.replace('/', '//')\n",
" return output_file\n",
" sub_trans = pysubs2.load(self.sub_src)\n",
" total_lines = len(sub_trans)\n",
" for line in tqdm(sub_trans,total = total_lines):\n",
" line_trans = self.translate_model.translate(line.text)\n",
" line.text += (r'\\N'+ line_trans)\n",
" print(line_trans)\n",
"\n",
" return sub_trans\n",
"\n",
"\n",
"clear_output()\n",
"\n",
"translate_model = ChatGPTAPI\n",
"openai_key = '' # @param {type:\"string\"}\n",
"target_language = 'zh-hans'\n",
"srt_file = file_basename + \".srt\"\n",
"\n",
"assert translate_model is not None, \"unsupported model\"\n",
"OPENAI_API_KEY = openai_key\n",
Expand All @@ -663,23 +567,29 @@
"# OPENAI_API_KEY = openai_key\n",
"\n",
"t = SubtitleTranslator(\n",
" srt_src=srt_file,\n",
" sub_src=sub_name,\n",
" model= translate_model,\n",
" key = OPENAI_API_KEY,\n",
" language=target_language,\n",
" sub_style = sub_style)\n",
" language=target_language)\n",
"\n",
"translation = t.translate_by_line()\n",
"\n",
"#Anonymous usage data for stats\n",
"#Comment out this block if you do not want send your data\n",
"#Download ass file\n",
"\n",
"if output_format == 'ass':\n",
" translation.save(sub_basename + '_translation.ass')\n",
" files.download(sub_basename + '_translation.ass')\n",
"elif output_format == 'srt':\n",
" translation.save(sub_basename + '_translation.srt')\n",
" files.download(sub_basename + '_translation.srt')\n",
"\n",
"# #Anonymous usage data for stats\n",
"# #Comment out this block if you do not want to send your data\n",
"try: \n",
" requests.get(f'https://api.callmebot.com/whatsapp.php?phone=61402628080&text={file_basename}+OpenAI&apikey=8080872')\n",
" requests.get(f'https://api.callmebot.com/whatsapp.php?phone=61402628080&text={sub_basename}+OpenAI&apikey=8080872')\n",
"except Exception as e:\n",
" pass\n",
"\n",
"#Download ass file\n",
"files.download(translation)\n",
"print('双语字幕生成完毕 All done!')\n",
"\n",
"# @markdown **</br>**<font size='4'>**实验功能的开发亦是为了尝试帮助大家更有效率的制作字幕。但是只有在用户实际使用体验反馈的基础上,此应用才能不断完善,如果您有任何想法,都欢迎以任何方式联系我,提出[issue](https://github.com/Ayanaminn/N46Whisper/issues)或者分享在[讨论区](https://github.com/Ayanaminn/N46Whisper/discussions)。**\n",
Expand Down

0 comments on commit 8e1d796

Please sign in to comment.