In [1]:
import yaml

class ConfigLoader:
    def __init__(self, config_path):
        self.config_path = config_path

    def load_config(self):
        with open(self.config_path, "r") as f:
            config = yaml.safe_load(f)
        return config

import argparse

class ArgumentParser:
    def __init__(self):
        self.parser = argparse.ArgumentParser(description='Translate English PDF book to Chinese.')
        self.parser.add_argument('--config', type=str, default='config.yaml', help='Configuration file with model and API settings.')
        self.parser.add_argument('--book', type=str, help='PDF file to translate.')
        self.parser.add_argument('--model_url', type=str, help='The URL of the translation model API.')
        self.parser.add_argument('--timeout', type=int, help='Timeout for the API request in seconds.')

    def parse_arguments(self):
        args = self.parser.parse_args()
        return args

class Environment:
    @staticmethod
    def is_jupyter():
        try:
            get_ipython()
            return True
        except NameError:
            return False



In [2]:
from enum import Enum, auto

class ModelType(Enum):
    ChatGLM = auto()
    GPT_3 = auto()
    GPT_3_5 = auto()

class Model:
    def __init__(self, model_url, timeout, model_type=ModelType.ChatGLM):
        self.model_url = model_url
        self.timeout = timeout
        self.model_type = model_type


In [3]:
class TranslatorException:
    def __init__(self, e):
        self.exception = e

    def handle_exception(self):
        if isinstance(self.exception, requests.exceptions.RequestException):
            return f"请求异常：{self.exception}"
        elif isinstance(self.exception, requests.exceptions.Timeout):
            return f"请求超时：{self.exception}"
        elif isinstance(self.exception, simplejson.errors.JSONDecodeError):
            return "Error: response is not valid JSON format."
        else:
            return f"发生了未知错误：{self.exception}"
        
        
class PageOutOfRangeException(Exception):
    def __init__(self, book_pages, requested_pages):
        self.book_pages = book_pages
        self.requested_pages = requested_pages
        super().__init__(f"Page out of range: Book has {book_pages} pages, but {requested_pages} pages were requested.")


In [9]:
from dataclasses import dataclass
from enum import Enum, auto

class ContentType(Enum):
    TEXT = auto()
    TABLE = auto()
    IMAGE = auto()

@dataclass
class Content:
    content_type: ContentType
    original: any
    translation: any = None

    def set_translation(s
                        elf, translation):
        self.translation = translation

class Page:
    def __init__(self):
        self.contents = []

    def add_content(self, content: Content):
        self.contents.append(content)

class Book:
    def __init__(self):
        self.pages = []

    def add_page(self, page: Page):
        self.pages.append(page)

In [8]:
import pdfplumber
import requests
import os
import sys

class PDFTranslator:
    def __init__(self, model):
        self.model = model
        self.book = Book()
        self.translated_book = Book()
        self.translate_status = []
        self.success_rate = 0
        self.headers = {
            'Content-Type': 'application/json;charset=utf-8'
        }

    def parse_pdf(self, book, pages=None):
        self.book = Book()
        with pdfplumber.open(book) as pdf:
            if pages is not None:
                page_numbers = range(min(pages, len(pdf.pages)))
            else:
                page_numbers = range(len(pdf.pages))

            for i in page_numbers:
                page = pdf.pages[i]
                page_contents = Page()

                # 提取文本
                text = page.extract_text()
                if text:
                    paragraphs = text.split("\n")
                    for paragraph in paragraphs:
                        page_contents.add_text(paragraph)

                # 提取表格
                tables = page.extract_tables()
                if tables:
                    for table in tables:
                        page_contents.add_table(table)

                # 提取图片
                images = page.images
                if images:
                    for img_obj in images:
                        img = Image.open(img_obj)
                        page_contents.add_image(img)

                self.book.add_page(page_contents)

    # def handle_translation_response(self, response):
    #     try:
    #         response.raise_for_status()
    #         response_dict = response.json()
    #         translation = response_dict["response"]
    #         self.translated_book.append(translation)
    #         self.translate_status.append(1)
    #     except Exception as e:
    #         exception_handler = TranslatorException(e)
    #         error_message = exception_handler.handle_exception()
    #         print(error_message, file=sys.stderr)
    #         self.translated_book.append("[翻译失败]")
    #         self.translate_status.append(0)

    def handle_translation_response(self, response, content):
        try:
            response.raise_for_status()
            response_dict = response.json()
            translation = response_dict["response"]
            content.translation = translation
            self.translate_status.append(1)
        except Exception as e:
            exception_handler = TranslatorException(e)
            error_message = exception_handler.handle_exception()
            print(error_message, file=sys.stderr)
            content.translation = "[翻译失败]"
            self.translate_status.append(0)
            

    def translate_contents(self, book):
        successful_requests = 0
        total_requests = len(self.book)

        with open(f"{book[:-4]}_analyzed.txt", "w", encoding="utf-8") as f:
            for i, text in enumerate(self.book):
                prompt = f"翻译为中文：{text}"
                data = {
                    "prompt": prompt,
                    "history": []
                }
                response = requests.post(self.model.model_url, json=data, timeout=self.model.timeout)
                self.handle_translation_response(response)
                f.write(self.translated_book[-1])

    def calculate_success_rate(self):
        successful_requests = sum(self.translate_status)
        total_requests = len(self.translate_status)
        self.success_rate = successful_requests / total_requests * 100
    
    def translate_pdf(self, book, pages=None):
        self.parse_pdf(book, pages)
        self.translate_contents(book)
        self.calculate_success_rate()

In [14]:
# 使用示例
environment = Environment()

args = None
if not environment.is_jupyter():
    argument_parser = ArgumentParser()
    args = argument_parser.parse_arguments()
    config_loader = ConfigLoader(args.config)
else:
    config_loader = ConfigLoader('config.yaml')

config = config_loader.load_config()
book = args.book if hasattr(args, 'book') and args.book else config['book']
model_url = args.model_url if hasattr(args, 'model_url') and args.model_url else config['model_url']
timeout = args.timeout if hasattr(args, 'timeout') and args.timeout else config['timeout']

model = Model(model_url=model_url, timeout=timeout)

# 实例化 PDFTranslator 类，并调用 translate_pdf() 方法
translator = PDFTranslator(model)
translator.translate_pdf(book, pages=5)

print("翻译完成！")
print("翻译成功率：", round(translator.success_rate, 4))
print("翻译状态：", translator.translate_status)

NameError: name 'Image' is not defined

In [13]:
translator.book_contents[3]

'LanguageModelsareUnsupervisedMultitaskLearners\nlappingtrainingdatawithtestevaluationtasks.\nParameters Layers d\nmodel\n2.2.InputRepresentation 117M 12 768\n345M 24 1024\nAgenerallanguagemodel(LM)shouldbeabletocompute\n762M 36 1280\ntheprobabilityof(andalsogenerate)anystring. Current 1542M 48 1600\nlargescaleLMsincludepre-processingstepssuchaslower-\ncasing,tokenization,andout-of-vocabularytokenswhich Table2.Architecturehyperparametersforthe4modelsizes.\nrestrictthespaceofmodel-ablestrings. Whileprocessing\nUnicodestringsasasequenceofUTF-8byteselegantlyful-\nfillsthisrequirementasexemplifiedinworksuchasGillick few modifications. Layer normalization (Ba et al., 2016)\net al. (2015), current byte-level LMs are not competitive was moved to the input of each sub-block, similar to a\nwith word-level LMs on large scale datasets such as the pre-activation residual network (He et al., 2016) and an\nOneBillionWordBenchmark(Al-Rfouetal.,2018). We additionallayernormalizationwasaddedafterthefin

SyntaxError: EOL while scanning string literal (2568351515.py, line 1)