In [1]:
import requests
import json
import ast
from paragraph import extract_paragraphs_from_docx
import os

def get_response_gpt4(content):
    """ 获取gpt-4模型的回复

    Args:
        content (_type_): 给gpt-4的问题

    Returns:
        _type_: 模型的回答
    """
    url = "https://gateway.ai.cloudflare.com/v1/05c43e30f91a115d8153715954fd70ee/lingyue-ai/openai/chat/completions"
    headers = {
        "Authorization": "Bearer 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",
        "Content-Type": "application/json"
    }
    data = {
        "model": "gpt-4-0613",
        
        "messages": [
            {
            "role": "user",
            "content": f"{content}"
            }
        ],
        }
        

    response = requests.post(url, json=data, headers=headers)

    # 假设 response.text 是一个字符串，内容是有效的JSON
    json_string = response.text
    # 将JSON字符串转换为字典
    data_json = json.loads(json_string)
    return data_json["choices"][0]["message"]["content"]


In [2]:
def get_entity_ques_list(paragraphs):
    """输入段落返回提取的实体、相关句子和提问问题

    Args:
        paragraphs (_type_): 段落

    Returns:
        _type_: 实体、相关句子和提问问题组成的json
    """
    json_string = get_response_gpt4(f"{paragraphs} \n 请根据我给出的上述段落，提取出其中的实体（识别和分类文本中的组织、个人、行业、地点、时间、数值、货币、交易、证券、法律等关键信息）和提到实体的相关语句,实体的提取尽量细致一些，实体尽可能多一些，假设只知道段落主题而并不知道段落内容，而是通过问题引导来完成段落的写作，给出得到相关语句可能会问到的问题，使得问题能够引导出相关句子，问题不要包含具体时间，年份，问题不应出现倾向和定性说法，问题可以包含具体地点和行业，主要体现作者的思路和逻辑，并以json的形式返回[{{{{{{'entity': '','sentence': '', 'question': ''}}}}, {{{{'entity': '','sentence': '', 'question': ''}}}}}}],除了这个json信息其余的不要返回。")
    actual_list = ast.literal_eval(json_string)
    return actual_list


In [3]:

def update_questions(data_dict, question_list):
    """整合entity层级和对应question的函数

    Args:
        data_dict (_type_): entity层级数据
        question_list (_type_): entity对应question的json数据
    """
    # 如果存在 'subentity'，临时移除
    subentities = data_dict.pop('subentity', None)
    # 通过entity值在question_list中查找匹配的question，并更新data_dict
    if 'entity' in data_dict:
        entity = data_dict['entity']
        # 在列表中查找对应的entity，并更新question
        for item in question_list:
            if item['entity'] == entity:
                data_dict['question'] = item['question']
                data_dict['sentence'] = item['sentence']
                break  # 找到匹配后即退出循环

    # 如果有 'subentity' 属性，递归处理每个子字典
    if 'subentity' in data_dict:
        for sub in data_dict['subentity']:
            update_questions(sub, question_list)
    # 在其他更新完成后，如果存在，将 'subentity' 添加回data_dict
    if subentities:
        data_dict['subentity'] = subentities
        # 递归处理每个子字典
        for sub in data_dict['subentity']:
            update_questions(sub, question_list)
    else:
        data_dict['subentity'] = []


In [8]:
def get_entity_list_from_docx(input_file,output_json=None):
    extract_paragraphs_from_docx(input_file)
    # 获取文件名和文件夹路径
    file_name = os.path.basename(input_file)
    output_folder = os.path.dirname(input_file)
    # 构建输出文件路径
    output_file = os.path.join(output_folder, os.path.splitext(file_name)[0] + ".txt")
    with open(output_file,"r",encoding="utf-8") as file:
        text = file.read()
    paragraphs = text.split("段落：")
    paragraphs = [item for item in paragraphs if item != ""]
    result = []
    max_attempts = 3
    for index,paragraph in enumerate(paragraphs):
        attempts = 0
        while attempts < max_attempts:
            try :
                sub_result = {"paragraph":paragraph,"entities":[]}
                actual_list = get_entity_ques_list(paragraph)
                entity_list = [item["entity"] for item in actual_list]
                # 获取实体层级
                entity_str = get_response_gpt4(f"{entity_list} \n 根据以上我给的实体列表根据其含义和逻辑上的层级，给出一个树状的数据结构[{{entity:'',subentity:[]}}],只返回这个json数据，不要返回别的信息")
                # 使用literal_eval来转换字符串到列表
                sub_result["entities"] = ast.literal_eval(entity_str)
                for data_dict in sub_result["entities"]:
                    update_questions(data_dict,actual_list)
                result.append(sub_result)
                break  # 成功后退出循环
            except Exception as e:
                attempts += 1
                # 记录错误信息和发生错误时的i值
                print(f"段落{index}发生错误：{e}")
                continue  # 自动跳到下一个循环迭代
    if output_json == None:
        with open(os.path.join(output_folder, os.path.splitext(file_name)[0] + ".json"), 'w',encoding="utf-8") as file:
            json.dump(result, file,ensure_ascii=False, indent=4)
    else:
        with open(output_json, 'w',encoding="utf-8") as file:
            json.dump(result, file,ensure_ascii=False, indent=4)
get_entity_list_from_docx("/home/luzhenye/PythonProject/gpt/（可公开）次高端白酒行业深度报告：势能向上，成长可期.docx")
erro = [6,12,22,36]


正在创建 Document 对象...
Document 对象创建成功
正在提取段落...
文件已成功转换并保存至: /home/luzhenye/PythonProject/gpt/（可公开）次高端白酒行业深度报告：势能向上，成长可期.txt
总共提取出 52 个段落
段落3发生错误：invalid character '，' (U+FF0C) (<unknown>, line 20)
段落24发生错误：'str' object has no attribute 'pop'
段落30发生错误：'str' object has no attribute 'pop'
段落39发生错误：'str' object has no attribute 'pop'
段落43发生错误：invalid character '，' (U+FF0C) (<unknown>, line 24)
段落48发生错误：invalid character '：' (U+FF1A) (<unknown>, line 1)
段落49发生错误：'str' object has no attribute 'pop'
