In [1]:
import google.generativeai as genai
from bs4 import BeautifulSoup
import re
import time
import sys
import time
import os
from openai import OpenAI
from config import Config
# Backend selector read by translate_japanese_to_chinese: 'gemini' or 'chatgpt'.
mode = 'gemini'
# Configure the Gemini API with the key taken from the project Config object.
genai.configure(api_key=Config.GEMINI_API_KEY)
model = genai.GenerativeModel('gemini-2.5-flash')
# Alternative model that was tried: 'gemini-2.5-flash-lite-preview-06-17'
# OpenAI-compatible client; base_url points it at the DeepSeek endpoint.
# NOTE(review): api_key is the literal placeholder "key" - a real key must be
# supplied before the 'chatgpt' mode can work.
client = OpenAI(api_key="key", base_url="https://api.deepseek.com")
# Settings for the 'chatgpt' (OpenAI-compatible) backend.
CONFIG = {
    # Model name sent with each chat completion request.
    "model": "deepseek-chat",
    # System prompt (in Chinese) sent with every chat completion request.
    "translate_prompt": """请严格遵循以下要求处理文本：
1. 仅翻译日文句子和短语为简体中文
2. 保留所有ASCII艺术图形字符
3. 保持HTML标签（<...>）完整不变
4. 保留数字、专有名词和拟声词
5. 若无非日文内容，返回原始文本
输出格式要求：仅返回处理后的文本，不要添加任何说明""",
    "temperature": 1,
    # "batch_size" and "max_workers" are not referenced by the visible code.
    "batch_size": 10,
    "max_workers": 5,
    "max_tokens": 8190
}
# Seconds to sleep before every Gemini request (simple rate limiting).
API_COOLDOWN = 5

def translate_japanese_to_chinese(text):
    """Translate the Japanese found in *text* into Simplified Chinese.

    The backend is chosen by the module-level ``mode``:

    * ``'gemini'``  - sleeps ``API_COOLDOWN`` seconds, then asks the Gemini
      ``model`` to output only the translated text.
    * ``'chatgpt'`` - sends ``CONFIG["translate_prompt"]`` as the system
      message and *text* as the user message through ``client``.

    Args:
        text: The text (possibly containing HTML) to translate.

    Returns:
        The backend's reply as a string. An empty string is returned when
        Gemini yields no candidates/parts or when the chat backend returns
        no message content, so callers can always concatenate the result.

    Raises:
        ValueError: If ``mode`` is not one of the supported backends.
    """
    if mode == 'gemini':
        # Throttle: wait before every request to stay under the rate limit.
        time.sleep(API_COOLDOWN)
        response = model.generate_content(f"识别出以下文本中所有的日文句子或短语并翻译成简体中文，仅输出翻译部分的文本内容。不要产出任何其余与翻译无关的回复文本：{text}")
        # Only read .text when the response actually carries content.
        if not response.candidates or not response.parts:
            return ""
        return response.text

    if mode == 'chatgpt':
        response = client.chat.completions.create(
            model=CONFIG["model"],
            messages=[
                {"role": "system", "content": CONFIG["translate_prompt"]},
                {"role": "user", "content": text},
            ],
            temperature=CONFIG["temperature"],
            max_tokens=CONFIG["max_tokens"],
        )
        # The message content may be None; normalise to "" for the callers.
        return response.choices[0].message.content or ""

    # Previously this case only printed a message and then failed on an
    # unbound local variable; fail with an explicit error instead.
    raise ValueError(f"Unknown mode: {mode!r}")
def extract_dt_content(input_files):
    """Collect the article content blocks from a saved HTML file.

    Despite the function name, the elements selected are ``<div>`` tags
    looked up with ``class_='content entry grid_content p_area'``.

    Args:
        input_files: Path of a single HTML file (read as UTF-8).

    Returns:
        A list with the matching tags in document order. The list is empty
        when the file does not exist (a warning is printed) or when nothing
        matches.
    """
    if not os.path.exists(input_files):
        print(f"警告：文件 {input_files} 不存在，已跳过")
        # Actually skip the file: opening it below would raise
        # FileNotFoundError right after announcing that it was skipped.
        return []

    with open(input_files, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')
        # The tags are returned as-is (not copied), so they still belong to
        # this parsed document until a caller moves them elsewhere.
        return list(soup.find_all('div', class_='content entry grid_content p_area'))
def split_posts(html_text):
    """Split *html_text* at every post header, keeping the headers.

    A header is a run of digits, optional whitespace and a full-width colon.
    The trailing lazy quantifiers in the pattern match nothing, so only that
    prefix is captured.

    Returns:
        A list of the form ``[before, header1, body1, header2, body2, ...]``;
        because the pattern has a capturing group, the headers are kept as
        separate items between the text pieces.
    """
    post_header = re.compile(r'(\d+\s*：.*?.*?)')
    return post_header.split(html_text)
def string_to_tag(html_str):
    """Parse an HTML string and return the first node inside ``<body>``.

    The string is wrapped in ``<html>...</html>`` and parsed with the lxml
    parser; the element that directly follows the ``<body>`` tag in the
    parsed tree is returned.
    """
    wrapped = "<html>{}</html>".format(html_str)
    document = BeautifulSoup(wrapped, 'lxml')
    body = document.html.body
    return body.next_element
# Four consecutive line breaks separate the paragraphs inside one post.
_PARAGRAPH_BREAK = '<br/><br/><br/><br/>'
# Characters dropped once a separator is found. This is three of the four
# "<br/>" (5 characters each), so one "<br/>" stays in front of the rest.
_PARAGRAPH_SKIP = 15


def _translate_post_body(text):
    """Return *text* with a translation inserted after each paragraph."""
    pieces = []
    cut_pos = text.find(_PARAGRAPH_BREAK)
    while cut_pos != -1:
        paragraph = text[:cut_pos]
        pieces.append(paragraph)
        pieces.append('<br/><br/>')
        if paragraph:
            pieces.append(translate_japanese_to_chinese(paragraph))
        text = text[_PARAGRAPH_SKIP + cut_pos:]
        cut_pos = text.find(_PARAGRAPH_BREAK)
    # Whatever follows the last separator is the final paragraph.
    pieces.append(text)
    if text:
        # Do not spend an API call on an empty remainder.
        pieces.append(translate_japanese_to_chinese(text))
    pieces.append('<br/><br/>')
    return "".join(pieces)


def _translate_posts(entry_html):
    """Return *entry_html* with every post body followed by its translation."""
    posts = split_posts(entry_html)
    # posts is [prefix, header1, body1, header2, body2, ...].
    total = (len(posts) - 1) // 2
    pieces = [posts[0]]
    for number in range(1, total + 1):
        # Progress indicator, rewritten in place on one console line.
        sys.stdout.write(f"\r{number} / {total}")
        sys.stdout.flush()
        # Every (header, body) pair is processed. The earlier code tested an
        # unrelated list element and could silently drop a whole post.
        pieces.append(posts[2 * number - 1])
        pieces.append(_translate_post_body(posts[2 * number]))
    return "".join(pieces)


def _translate_title(soup):
    """Insert a translation of the ``h2.entry_header`` title, if present."""
    target_cell = soup.find('h2', class_='entry_header')
    if not target_cell:
        return
    text = str(target_cell)
    insert_pos = text.find('</h2>')
    # The translation goes on a new line just before the closing tag.
    modified_html = "".join([
        text[:insert_pos],
        '<br/>',
        translate_japanese_to_chinese(text),
        text[insert_pos:],
    ])
    target_cell.replace_with(string_to_tag(modified_html))


def append_dt_to_file(output_file, dt_contents):
    """Build a translated page from *dt_contents* and write it to a file.

    Despite the name, *output_file* is overwritten, not appended to. The
    steps are:

    1. Put the extracted tags into a fixed HTML skeleton.
    2. Remove the ``div.fc2button-clap`` element and every tag after it.
    3. Add a translation to the ``h2.entry_header`` title.
    4. Remove the first ``<ul>``.
    5. Add a translation after every paragraph of every post found in
       ``div.entry_body``.
    6. Write the resulting document as UTF-8.

    Elements that are missing from the page are skipped. The file is only
    opened once the whole document has been built, so a failure during
    translation no longer leaves an empty output file behind.

    Args:
        output_file: Path of the HTML file to (over)write.
        dt_contents: Iterable of parsed tags; they are moved into the new
            document.
    """
    html_content = """
        <html><div style="display: none;"><link rel="stylesheet" type="text/css" href="https://transtemple.github.io/aaFont/aaFont.css"></div><meta http-equiv="Content-Type" content="text/html; charset=utf-8"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"><title>疫病.html</title></head><body class="AA_Text"><div>
        <div style="display: none;">&nbsp;</div>
        </div>
        """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Container for the extracted content, preceded by a separator line.
    container = soup.new_tag('div', id='appended-dt-content')
    container.append(soup.new_tag('hr'))

    # NOTE(review): the content is nested inside a second <hr> element, as in
    # the earlier version; a <div>/<dl> wrapper was probably intended.
    dl_tag = soup.new_tag('hr')
    for dt in dt_contents:
        dl_tag.append(dt)
    container.append(dl_tag)

    # The skeleton above always contains a <body>, so append there directly.
    soup.body.append(container)

    # Drop the "clap" button together with every tag that follows it.
    dls = soup.find('div', class_='fc2button-clap')
    if dls is not None:
        for element in dls.find_all_next():
            element.decompose()
        dls.decompose()

    _translate_title(soup)

    li = soup.find('ul')
    if li is not None:
        li.decompose()

    coll = soup.find('div', class_='entry_body')
    if coll is None:
        print("警告：未找到 entry_body 区块，已跳过正文翻译")
    else:
        new_text = _translate_posts(str(coll))
        coll.replace_with(string_to_tag(new_text))

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(str(soup))

if __name__ == "__main__":
    # Saved source page and the destination of the translated page.
    input_files = r"C:\Users\ningn\Downloads\やる夫たちがカタストロフィを生き延びるようです　Ep.33 「反逆する牙」 - だっておｗｗｗキャンセル.html"
    output_file = r"C:\Users\ningn\Desktop\やる夫たちが黒い玉の部屋に行くようです\「黑玉」二部第33话.html"

    # Extract the content blocks; build and write the page only if any exist.
    dt_elements = extract_dt_content(input_files)
    if not dt_elements:
        print("未找到目标标签内容")
    else:
        append_dt_to_file(output_file, dt_elements)
        print(f"成功追加 {len(dt_elements)} 个标签到 {output_file} 末尾")

  from .autonotebook import tqdm as notebook_tqdm


113 / 113成功追加 1 个标签到 C:\Users\ningn\Desktop\やる夫たちが黒い玉の部屋に行くようです\「黑玉」二部第33话.html 末尾


In [7]:
# Ad-hoc check of the translator on a single Japanese sentence; this sends a
# real request to the backend selected by `mode`.
text="言われなくともそうするさ 元よりおまえたちが「勝手に」ついてきただけだからな"
translate_japanese_to_chinese(text)

'就算你不说，我也会那么做的。本来就是你们擅自跟来的而已嘛。'