In [1]:
import concurrent.futures
import json
import os
import re
import time

from zhipuai import ZhipuAI

# Initialise the ZhipuAI client. The API key is taken from the ZHIPUAI_API_KEY
# environment variable instead of being embedded in the notebook; any key that
# was ever stored in source should be treated as leaked and rotated.
_zhipuai_api_key = os.environ.get("ZHIPUAI_API_KEY")
if not _zhipuai_api_key:
    raise RuntimeError(
        "Set the ZHIPUAI_API_KEY environment variable before running this notebook."
    )
client = ZhipuAI(api_key=_zhipuai_api_key)

# Prompt template for the keyword-extraction task. `{abstract}` is substituted
# with str.format, which is why the literal braces of the example output are
# doubled (`{{` / `}}`).
task_description_template = """
请根据以下论文摘要提取三个关键词，这些关键词应该能够概括论文的主要内容。

论文摘要: {abstract}

输出格式：
{{
    "keyword": ["1", "2", "3"]
}}
注意，你的回应需要是英文关键词，不要翻译为中文，不要包含多余的信息。输出一个类似json的文件就可以了
"""

# Load the paper records.
def load_papers(input_file):
    """Read the UTF-8 encoded JSON file at ``input_file`` and return its decoded content."""
    with open(input_file, mode='r', encoding='utf-8') as source:
        raw_text = source.read()
    return json.loads(raw_text)

# Persist the processed paper records.
def save_papers(papers, output_file):
    """Write ``papers`` to ``output_file`` as UTF-8 JSON, indented by four spaces
    and with non-ASCII characters kept verbatim (no ``\\uXXXX`` escapes)."""
    dump_options = {"ensure_ascii": False, "indent": 4}
    with open(output_file, mode='w', encoding='utf-8') as sink:
        json.dump(papers, sink, **dump_options)

# Send one request to the model and parse its reply.
def process_paper(paper, idx):
    """Ask the model for keywords describing one paper's abstract.

    Args:
        paper: Mapping describing a paper; only its ``'abstract'`` entry is read
            (a missing entry is treated as an empty abstract).
        idx: Label identifying the paper in console output and in the log files.

    Returns:
        ``{"keywords": value}`` where ``value`` is the ``"keyword"`` member of the
        first ``{...}`` span found in the reply (``[]`` if that member is absent),
        or ``{"keywords": []}`` when the request or the parsing fails.

    Side effects:
        Appends the raw reply to ``model_response_log_keywords.txt``; on failure
        appends the error and whatever reply text was received to
        ``error_log_keywords.txt``. Also prints progress to stdout.
    """
    abstract = paper.get('abstract', "")
    task_description = task_description_template.format(abstract=abstract)

    # Bound before the ``try`` so the error handler can always reference it:
    # if the API call itself raises, no reply text exists yet and the handler
    # would otherwise fail with UnboundLocalError while writing the error log.
    response_text = ""

    try:
        # Call the chat-completion API.
        response = client.chat.completions.create(
            model="glm-4-flash",  # use the glm-4-flash model
            messages=[
                {"role": "system", "content": "你是一个关键词提取助手。"},
                {"role": "user", "content": task_description}
            ],
            temperature=0.1
        )

        # Extract the reply text.
        response_text = response.choices[0].message.content.strip()

        # Keep a record of every raw reply.
        with open("model_response_log_keywords.txt", "a", encoding="utf-8") as log_file:
            log_file.write(f"Paper-{idx} Response:\n{response_text}\n\n")

        print(f"大模型返回内容 (paper-{idx}): {response_text}")

        # Locate the first brace-delimited span (non-greedy, may span lines).
        match = re.search(r"({.*?})", response_text, re.DOTALL)
        if match is None:
            raise ValueError("未找到 JSON 格式的内容")

        # Decode the span as JSON.
        json_content = json.loads(match.group(1))
        return {
            "keywords": json_content.get("keyword", [])
        }

    except Exception as e:
        print(f"解析失败: {e} for paper-{idx}")

        # Record the failure together with the reply text received so far
        # (empty when the request failed before a reply arrived).
        with open("error_log_keywords.txt", "a", encoding="utf-8") as error_file:
            error_file.write(f"Paper-{idx} Error:\n{e}\nResponse:\n{response_text}\n\n")

        return {
            "keywords": []
        }

# Process the papers in parallel.
def process_papers_concurrently(papers, max_workers=5):
    """Run ``process_paper`` over ``papers`` on a thread pool.

    Args:
        papers: Iterable of paper records; each one is passed to
            ``process_paper`` together with its 1-based position.
        max_workers: Number of worker threads in the pool.

    Returns:
        A list with exactly one result per paper, in the same order as
        ``papers``. A task that raises contributes ``{"keywords": []}``.
    """
    papers = list(papers)
    # Slots are pre-allocated and filled by position. ``as_completed`` yields
    # futures in completion order, so appending results as they arrive would
    # break the positional pairing between a paper and its result.
    results = [None] * len(papers)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_idx = {
            executor.submit(process_paper, paper, idx): idx
            for idx, paper in enumerate(papers, 1)
        }
        for future in concurrent.futures.as_completed(future_to_idx):
            idx = future_to_idx[future]
            try:
                results[idx - 1] = future.result()
            except Exception as e:
                print(f"任务 paper-{idx} 失败: {e}")
                results[idx - 1] = {"keywords": []}
    return results

# Entry point of the keyword-extraction run.
def main(input_json, output_json, max_workers=5):
    """Load papers, extract keywords for them, and save the updated records.

    Args:
        input_json: Path of the JSON file holding the paper records.
        output_json: Path the updated records are written to.
        max_workers: Thread-pool size forwarded to ``process_papers_concurrently``.

    Only records whose ``'venue'`` equals ``'TMLR 2024'`` and whose
    ``'keywords'`` entry is missing or empty receive the extracted keywords.
    Results are paired with papers positionally (``zip``), so
    ``process_papers_concurrently`` is expected to return them in input order.
    """
    print("加载论文数据...")
    papers = load_papers(input_json)
    print(f"共加载 {len(papers)} 篇论文。")

    print("并行处理论文数据...")
    extracted_results = process_papers_concurrently(papers, max_workers=max_workers)

    print("更新论文数据...")
    for paper, result in zip(papers, extracted_results):
        # ``.get`` instead of indexing: a record lacking either field must not
        # abort the run at this point, which would discard every extracted
        # result before anything has been saved.
        if paper.get('venue') == 'TMLR 2024' and not paper.get('keywords'):
            paper['keywords'] = result.get("keywords", [])

    print("保存处理后的数据...")
    save_papers(papers, output_json)
    print(f"处理完成，已保存到 {output_json}")

if __name__ == "__main__":
    input_json_file = 'paper_metadata_1212_10k.json'  # path of the input JSON file
    output_json_file = 'papers_with_keywords.json'  # path of the output JSON file
    max_workers = 10  # maximum number of worker threads

    main(input_json_file, output_json_file, max_workers=max_workers)




加载论文数据...
共加载 10526 篇论文。
并行处理论文数据...
大模型返回内容 (paper-4): {
    "keyword": ["Neural PDE Solvers", "Pretraining", "Factorized Fourier Neural Operator"]
}
大模型返回内容 (paper-10): {
    "keyword": ["calibration attacks", "adversarial defense", "model trustworthiness"]
}
大模型返回内容 (paper-11): {
    "keyword": ["Text-to-SQL", "LLMs", "Instruction Fine-tuning"]
}
大模型返回内容 (paper-12): {
    "keyword": ["α-Stable ReLU-NNs", "large-width asymptotics", "training dynamics"]
}
大模型返回内容 (paper-3): {
    "keyword": ["Data-Centric Approach", "Model Inversion Attack", "Privacy Protection"]
}
大模型返回内容 (paper-1): {
    "keyword": ["language models", "contamination detection", "benchmark evaluation"]
}
大模型返回内容 (paper-15): {
    "keyword": ["Federated Learning", "Adversarial Attacks", "Security Risk"]
}
大模型返回内容 (paper-14): {
    "keyword": ["Harmonic Indel Distance", "Distance Metric", "Biomedical Sequence Data"]
}
大模型返回内容 (paper-13): {
    "keyword": ["Graph Neural Networks", "Uncertainty Quantification", "Graph Le

In [2]:
import json

# Load the paper records (same definition as in the first cell).
def load_papers(input_file):
    """Return the decoded content of the UTF-8 JSON file at ``input_file``."""
    with open(input_file, mode='r', encoding='utf-8') as handle:
        payload = handle.read()
    return json.loads(payload)

# Persist the processed paper records (same definition as in the first cell).
def save_papers(papers, output_file):
    """Dump ``papers`` into ``output_file`` as UTF-8 JSON with a four-space
    indent, leaving non-ASCII characters unescaped."""
    formatting = dict(ensure_ascii=False, indent=4)
    with open(output_file, mode='w', encoding='utf-8') as handle:
        json.dump(papers, handle, **formatting)

# Parse the error log into a dictionary.
def parse_error_log(error_log_file):
    """Recover keyword lists from a log file, keyed by paper label.

    The file is read as a sequence of entries:

    * a line starting with ``Paper-`` opens an entry; its first
      whitespace-separated token (e.g. ``Paper-2080``) becomes the key;
    * lines starting with ``Response:`` are markers and are skipped;
    * every other line belongs to the body of the current entry.

    The value of an entry is the ``"keyword"`` member of the first JSON object
    in its body that has one. The object may sit on a single line or span
    several lines. Entries without such an object map to ``[]``. When a label
    occurs more than once, the last occurrence wins.

    Args:
        error_log_file: Path of the UTF-8 encoded log file.

    Returns:
        dict mapping each paper label to the recovered ``"keyword"`` value.
    """
    decoder = json.JSONDecoder()

    def extract_keywords(body_lines):
        # Try to decode a JSON object at every '{' of the joined body; decoding
        # the whole body (not one line at a time) is what makes multi-line
        # objects readable.
        text = "\n".join(body_lines)
        start = text.find("{")
        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict) and "keyword" in candidate:
                return candidate["keyword"]
            start = text.find("{", start + 1)
        return []

    error_data = {}
    current_paper = None
    body_lines = []
    with open(error_log_file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if line.startswith("Paper-"):
                # A new header closes the previous entry.
                if current_paper is not None:
                    error_data[current_paper] = extract_keywords(body_lines)
                current_paper = line.split()[0]  # e.g., "Paper-2080"
                error_data[current_paper] = []
                body_lines = []
            elif line.startswith('Response:'):
                continue  # Skip the "Response:" marker line
            elif current_paper is not None:
                body_lines.append(line)

    # Close the last entry of the file.
    if current_paper is not None:
        error_data[current_paper] = extract_keywords(body_lines)
    return error_data

# Fill in keywords recovered from the error log.
def update_papers_with_error_data(papers, error_data):
    """Assign recovered keywords to the papers that still lack them.

    Papers are labelled ``Paper-<n>`` by their 1-based position. A paper whose
    ``'venue'`` is ``'TMLR 2024'`` and whose ``'keywords'`` value is falsy gets
    ``error_data[label]``, or ``[]`` when the label is absent. The records are
    modified in place and the same ``papers`` object is returned.
    """
    position = 0
    for entry in papers:
        position += 1
        needs_keywords = entry['venue'] == 'TMLR 2024' and not entry['keywords']
        if not needs_keywords:
            continue
        label = f"Paper-{position}"
        entry['keywords'] = error_data.get(label, [])
    return papers

# Entry point of the error-log recovery run.
def main(paper_json_file, error_log_file, output_json_file):
    """Merge keywords recovered from an error log into the paper records.

    Loads the records from ``paper_json_file``, parses ``error_log_file``,
    applies the recovered keywords, and writes the result to
    ``output_json_file``, printing a progress line before and after each stage.
    """
    print("加载论文数据...")
    records = load_papers(paper_json_file)
    record_count = len(records)
    print(f"共加载 {record_count} 篇论文。")

    print("解析错误日志...")
    recovered = parse_error_log(error_log_file)
    recovered_count = len(recovered)
    print(f"从错误日志中解析了 {recovered_count} 条数据。")

    print("更新论文数据...")
    merged = update_papers_with_error_data(records, recovered)

    print("保存更新后的论文数据...")
    save_papers(merged, output_json_file)
    print(f"处理完成，已保存到 {output_json_file}")

if __name__ == "__main__":
    paper_json_file = 'papers_with_keywords.json'  # input file with the paper records
    error_log_file = 'error_log_keywords.txt'      # error log file
    output_json_file = 'updated_papers.json'       # output file for the updated paper records

    main(paper_json_file, error_log_file, output_json_file)


加载论文数据...
共加载 10526 篇论文。
解析错误日志...
从错误日志中解析了 8 条数据。
更新论文数据...
保存更新后的论文数据...
处理完成，已保存到 updated_papers.json
