In [None]:
import concurrent.futures
import json
import os
import re
import time

from zhipuai import ZhipuAI

# Initialize the ZhipuAI client.
# The API key is read from the ZHIPUAI_API_KEY environment variable instead of
# being hard-coded, so the credential never lives in the source file. A missing
# variable fails fast with a KeyError that names it.
# NOTE(review): the key that used to be committed on this line is exposed and
# should be revoked.
client = ZhipuAI(api_key=os.environ["ZHIPUAI_API_KEY"])

# Prompt template for the code-link extraction task.
# It is filled in with str.format(abstract=...), so the literal braces of the
# JSON example are written as "{{" / "}}"; {abstract} is the only placeholder.
task_description_template_code = """
请根据以下论文摘要提取所有提到的代码链接（如 GitHub 链接）。

论文摘要: {abstract}

输出格式：
{{
    "Code_src": ["link1", "link2", ...]
}}
如果没有找到代码链接，请返回一个空列表。
注意，不要包含多余的信息，输出一个类似json的文件即可。
"""

# Load the paper data
def load_papers(input_file):
    """Parse the JSON file at ``input_file`` (read as UTF-8) and return its content."""
    with open(input_file, mode='r', encoding='utf-8') as source:
        papers = json.load(source)
    return papers

# Save the processed paper data
def save_papers(papers, output_file):
    """Serialize ``papers`` as JSON into ``output_file``.

    The file is written as UTF-8 with a 4-space indent, and non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(output_file, mode='w', encoding='utf-8') as sink:
        json.dump(obj=papers, fp=sink, indent=4, ensure_ascii=False)

# Pull the "Code_src" list out of the raw model reply
def _extract_code_links(response_text):
    """Return the code links contained in the first JSON object of the reply.

    The reply is scanned from each "{" with ``JSONDecoder.raw_decode`` so that
    surrounding text (markdown fences, explanations) is ignored and nested
    braces or braces inside strings do not truncate the object.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        A list of non-empty, whitespace-stripped link strings. A single string
        value is wrapped into a list; any other non-list value yields ``[]``.

    Raises:
        ValueError: If the reply contains no decodable JSON object.
    """
    decoder = json.JSONDecoder()
    start = response_text.find("{")
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(response_text, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            links = parsed.get("Code_src", [])
            if isinstance(links, str):
                links = [links]
            elif not isinstance(links, list):
                links = []
            return [
                link.strip()
                for link in links
                if isinstance(link, str) and link.strip()
            ]
        # Not a JSON object at this position: retry from the next "{".
        start = response_text.find("{", start + 1)
    raise ValueError("未找到 JSON 格式的内容")


# Send the request and parse the code links
def process_paper_code(paper, idx):
    """Ask the model for the code links mentioned in one paper's abstract.

    Args:
        paper: Mapping describing a paper; its ``'abstract'`` entry is used.
        idx: Identifier of the paper, used only in log and error messages.

    Returns:
        ``{"Code_src": [...]}`` with the extracted links. The list is empty when
        the abstract is missing, blank or not a string (no request is sent in
        that case), or when the request or the parsing of the reply fails.

    Side effects:
        Appends the raw reply to ``model_response_log_code.txt``; on failure
        prints the error and appends it to ``error_log_code.txt``.
    """
    abstract = paper.get('abstract', "")
    # Nothing to extract from: skip the API call entirely.
    if not isinstance(abstract, str) or not abstract.strip():
        return {"Code_src": []}

    task_description = task_description_template_code.format(abstract=abstract)

    try:
        # Call the LLM API
        response = client.chat.completions.create(
            model="glm-4-flash",  # use the glm-4-flash model
            messages=[
                {"role": "system", "content": "你是一个代码链接提取助手。"},
                {"role": "user", "content": task_description}
            ],
            temperature=0.1
        )

        # A reply without content is treated as empty text so that it is
        # reported as "no JSON found" instead of an AttributeError.
        response_text = (response.choices[0].message.content or "").strip()

        # Keep a trace of what the model returned
        with open("model_response_log_code.txt", "a", encoding="utf-8") as log_file:
            log_file.write(f"Paper-{idx} Response:\n{response_text}\n\n")

        return {
            "Code_src": _extract_code_links(response_text)
        }

    except Exception as e:
        # Best effort: one failing paper must not abort the whole batch.
        print(f"解析失败: {e} for paper-{idx}")
        # Record the error
        with open("error_log_code.txt", "a", encoding="utf-8") as error_file:
            error_file.write(f"Paper-{idx} Error:\n{e}\n\n")
        return {
            "Code_src": []
        }

# Process the papers in parallel
def process_papers_concurrently(papers, max_workers=5, worker=None):
    """Run the extraction for every paper on a thread pool.

    Args:
        papers: Iterable of paper records.
        max_workers: Size of the thread pool.
        worker: Callable invoked as ``worker(paper, idx)`` with a 1-based
            ``idx``; defaults to ``process_paper_code``.

    Returns:
        A list with one result per paper, in the same order as ``papers``.
        A paper whose task raised gets the placeholder ``{"Code_src": []}``.
    """
    if worker is None:
        worker = process_paper_code

    papers = list(papers)
    # Tasks finish in arbitrary order, so each result is stored at its paper's
    # position; callers pair the returned list with ``papers`` by position.
    results_code = [None] * len(papers)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures_code = {
            executor.submit(worker, paper, idx): idx
            for idx, paper in enumerate(papers, 1)
        }

        for future in concurrent.futures.as_completed(futures_code):
            idx = futures_code[future]
            try:
                results_code[idx - 1] = future.result()
            except Exception as e:
                print(f"代码链接任务失败 paper-{idx}: {e}")
                results_code[idx - 1] = {"Code_src": []}

    return results_code

# Entry point
def main(input_json, output_json, max_workers=5):
    """Load the papers, extract their code links and save the enriched records.

    Args:
        input_json: Path of the JSON file holding the paper records.
        output_json: Path the updated records are written to.
        max_workers: Thread-pool size forwarded to ``process_papers_concurrently``.

    Each extraction result is paired positionally with a paper and its
    ``"Code_src"`` value is stored under the paper's ``'Code_src'`` key.
    """
    print("加载论文数据...")
    papers = load_papers(input_json)
    paper_count = len(papers)
    print(f"共加载 {paper_count} 篇论文。")

    print("并行处理论文数据...")
    extracted_code = process_papers_concurrently(papers, max_workers=max_workers)

    print("更新论文数据...")
    for entry, extraction in zip(papers, extracted_code):
        links = extraction.get("Code_src", [])
        entry['Code_src'] = links

    print("保存处理后的数据...")
    save_papers(papers, output_json)
    print(f"处理完成，已保存到 {output_json}")

if __name__ == "__main__":
    # Run configuration for a direct script/notebook execution.
    input_json_file = 'paper_metadata_1212_10k.json'  # path of the input JSON file
    output_json_file = 'papers_with_code.json'  # path of the output JSON file
    max_workers = 10  # maximum number of worker threads

    main(
        input_json=input_json_file,
        output_json=output_json_file,
        max_workers=max_workers,
    )




加载论文数据...
共加载 10526 篇论文。
并行处理论文数据...
论文归类任务失败 paper-4727: '\n    "Classes"'
论文归类任务失败 paper-7756: '\n    "Classes"'
论文归类任务失败 paper-7416: '\n    "Classes"'
论文归类任务失败 paper-2146: '\n    "Classes"'
论文归类任务失败 paper-1806: '\n    "Classes"'
论文归类任务失败 paper-5027: '\n    "Classes"'
论文归类任务失败 paper-4854: '\n    "Classes"'
论文归类任务失败 paper-9059: '\n    "Classes"'
论文归类任务失败 paper-6424: '\n    "Classes"'
论文归类任务失败 paper-3449: '\n    "Classes"'
论文归类任务失败 paper-8067: '\n    "Classes"'
论文归类任务失败 paper-1352: '\n    "Classes"'
论文归类任务失败 paper-5509: '\n    "Classes"'
论文归类任务失败 paper-7755: '\n    "Classes"'
论文归类任务失败 paper-7415: '\n    "Classes"'
论文归类任务失败 paper-2145: '\n    "Classes"'
论文归类任务失败 paper-1805: '\n    "Classes"'
论文归类任务失败 paper-9058: '\n    "Classes"'
论文归类任务失败 paper-6423: '\n    "Classes"'
论文归类任务失败 paper-3448: '\n    "Classes"'
论文归类任务失败 paper-8066: '\n    "Classes"'
论文归类任务失败 paper-1351: '\n    "Classes"'
论文归类任务失败 paper-5475: '\n    "Classes"'
论文归类任务失败 paper-4726: '\n    "Classes"'
论文归类任务失败 paper-7754: '\n   