In [1]:
import json
import re
import time
import concurrent.futures
from zhipuai import ZhipuAI

# 初始化 ZhipuAI 客户端
client = ZhipuAI(api_key="ad670367ab3640d78468ddc62b7ca3f2.GhKvhDzmSOe3QN9O")

# 定义“Introduction”扩展任务描述模板
task_description_template_intro = """
请根据以下论文摘要，扩展出一个简短的“Introduction”部分，介绍论文的背景、研究问题、方法和主要贡献。

论文摘要: {abstract}

输出格式：
{{
    "Introduction": "扩展出的Introduction内容"
}}
请确保生成的Introduction简洁明了，能够准确概括论文的背景、研究问题、方法和贡献。
你回答的Introduction必须是英文，而且你的回答中不应该包含除可解析的json外的注释等任何内容
"""

# 加载论文数据
def load_papers(input_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        return json.load(f)

# 保存处理后的论文数据
def save_papers(papers, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(papers, f, ensure_ascii=False, indent=4)

# 记录模型的回复日志
def log_model_response(idx, response_text):
    with open("model_response_log_intro.txt", "a", encoding="utf-8") as log_file:
        log_file.write(f"Paper-{idx} Response: {response_text}\n\n")

# 记录错误日志
def log_error(idx, error_message, model_response=None):
    with open("error_log_intro.txt", "a", encoding="utf-8") as error_file:
        error_file.write(f"Error for Paper-{idx}: {error_message}\n")
        if model_response:
            error_file.write(f"Model Response: {model_response}\n\n")

# 清洗模型返回的内容，提取最外层大括号中的内容
def extract_outer_braces_content(response_text):
    # 使用正则提取最外层的括号内的内容
    match = re.search(r"^\{(.*)\}$", response_text.strip(), re.DOTALL)
    if match:
        return match.group(1).strip()  # 返回大括号内的内容
    else:
        return response_text.strip()  # 如果无法匹配，返回原始内容

# 发送请求并解析“Introduction”扩展内容
def process_paper_intro(paper, idx):
    abstract = paper.get('abstract', "")
    task_description = task_description_template_intro.format(abstract=abstract)

    try:
        # 调用大模型 API
        response = client.chat.completions.create(
            model="glm-4-flash",  # 使用 glm-4-flash 模型
            messages=[
                {"role": "system", "content": "你是一个生成论文Introduction部分的助手。"},
                {"role": "user", "content": task_description}
            ],
            temperature=0.1
        )

        # 提取回复内容
        response_text = response.choices[0].message.content.strip()

        # 记录模型回复日志
        log_model_response(idx, response_text)

        # 提取最外层大括号中的内容
        extracted_content = extract_outer_braces_content(response_text)

        # 返回处理后的内容
        return {
            "Introduction": extracted_content
        }

    except Exception as e:
        # 记录错误日志，且记录模型原始返回内容
        log_error(idx, str(e), model_response=response.choices[0].message.content.strip())
        print(f"解析失败: {e} for paper-{idx}")

        # 返回大模型返回的原始内容
        return {
            "Introduction": f"模型返回原始内容: {response.choices[0].message.content.strip()}"
        }

# 并行处理论文数据
def process_papers_concurrently(papers, max_workers=5):
    results_intro = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures_intro = {executor.submit(process_paper_intro, paper, idx): idx for idx, paper in enumerate(papers, 1)}

        for future in concurrent.futures.as_completed(futures_intro):
            idx = futures_intro[future]
            try:
                result = future.result()
                results_intro.append(result)
            except Exception as e:
                print(f"Introduction扩展任务失败 paper-{idx}: {e}")
                results_intro.append({"Introduction": ""})

    return results_intro

# 主函数
def main(input_json, output_json, max_workers=5):
    print("加载论文数据...")
    papers = load_papers(input_json)
    print(f"共加载 {len(papers)} 篇论文。")

    print("并行处理论文数据...")
    extended_intros = process_papers_concurrently(papers, max_workers=max_workers)

    print("更新论文数据...")
    for paper, result_intro in zip(papers, extended_intros):
        paper['Introduction'] = result_intro.get("Introduction", "")

    print("保存处理后的数据...")
    save_papers(papers, output_json)
    print(f"处理完成，已保存到 {output_json}")

if __name__ == "__main__":
    input_json_file = 'paper_metadata_1212_10k.json'  # 输入的JSON文件路径
    output_json_file = 'papers_with_intro.json'  # 输出的JSON文件路径
    max_workers = 10  # 最大并行线程数

    main(input_json_file, output_json_file, max_workers=max_workers)





加载论文数据...
共加载 10526 篇论文。
并行处理论文数据...
Introduction扩展任务失败 paper-710: cannot access local variable 'response' where it is not associated with a value
Introduction扩展任务失败 paper-4089: cannot access local variable 'response' where it is not associated with a value
更新论文数据...
保存处理后的数据...
处理完成，已保存到 papers_with_intro.json


In [4]:
import json
import re

# 清洗“Introduction”字段中的前后多余部分
def clean_introduction(intro_text):
    # 提取最外层两个大括号内部的内容
    match = re.search(r'^"Introduction":\s*"(.*?)"$', intro_text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return intro_text  # 如果没有匹配到，返回原始内容

# 处理并清洗所有论文的Introduction字段
def clean_papers(papers):
    for paper in papers:
        if 'Introduction' in paper:
            paper['Introduction'] = clean_introduction(paper['Introduction'])
    return papers

# 加载 JSON 数据
def load_json(input_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        return json.load(f)

# 保存清洗后的 JSON 数据
def save_json(papers, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(papers, f, ensure_ascii=False, indent=4)

# 主函数
def main(input_json_file, output_json_file):
    # 加载论文数据
    papers = load_json(input_json_file)
    
    # 清洗 Introduction 字段
    cleaned_papers = clean_papers(papers)
    
    # 保存清洗后的数据
    save_json(cleaned_papers, output_json_file)
    print(f"清洗完成，已保存到 {output_json_file}")

if __name__ == "__main__":
    input_json_file = 'papers_with_intro.json'  # 输入的JSON文件路径
    output_json_file = 'cleaned_papers_with_intro.json'  # 输出的JSON文件路径

    main(input_json_file, output_json_file)


清洗完成，已保存到 cleaned_papers_with_intro.json


In [9]:
import json

# 定义“Introduction”扩展任务描述模板
task_description_template_intro = """
请根据以下论文摘要，扩展出一个简短的“Introduction”部分，介绍论文的背景、研究问题、方法和主要贡献。

论文摘要: {abstract}

输出格式：
{{
    "Introduction": "扩展出的Introduction内容"
}}
请确保生成的Introduction简洁明了，能够准确概括论文的背景、研究问题、方法和贡献。
"""

# 定义直接生成的Introduction内容
def get_introduction_for_paper_710():
    return {
        "Introduction": """
Large language models (LLMs) are increasingly trained on vast and diverse internet data, which raises concerns about their ability to memorize benchmark datasets, leading to contamination of test sets. This type of contamination is particularly challenging to detect as proprietary models' pretraining data is often not publicly available. In this paper, we propose a novel procedure to detect test set contamination in language models without needing access to the pretraining data or model weights. Our method is based on the principle that without contamination, all orderings of an exchangeable benchmark dataset should have an equal probability. In contrast, a contaminated model tends to memorize the order of examples, making certain canonical orderings significantly more likely. We demonstrate the effectiveness of our approach in detecting contamination, even in challenging scenarios with smaller models, limited test sets, and rare datasets. Our results are consistent with existing evaluations and offer a reliable way to identify test set contamination in language models, including realistic evaluations using the LLaMA-2 model.
        """
    }

def get_introduction_for_paper_4089():
    return {
        "Introduction": """
Continual learning for generative models faces the challenge of learning new target modes with limited samples while retaining previously learned ones. This paper introduces a novel continual learning approach for generative modeling, tailored for conditional generative adversarial networks. The method involves generating samples of existing modes for replay, using a discriminator to compute mode similarity, and generating labels for the target mode based on a weighted average of similar existing modes. The model is extended by training on target data with the newly-generated labels, while employing memory replay to prevent catastrophic forgetting. Our experimental results on benchmark datasets showcase the effectiveness of our approach, demonstrating superior performance over state-of-the-art methods with fewer training samples.
        """
    }

# 加载论文数据
def load_papers(input_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        return json.load(f)

# 保存处理后的论文数据
def save_papers(papers, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(papers, f, ensure_ascii=False, indent=4)

# 主函数
def main(input_json, output_json):
    print("加载论文数据...")
    papers = load_papers(input_json)
    print(f"共加载 {len(papers)} 篇论文。")

    # 提取指定索引的论文
    target_indices = [710, 4089]  # 索引（从1开始）
    for idx in target_indices:
        if 1 <= idx <= len(papers):
            paper = papers[idx - 1]  # 转为从0开始的索引
            if idx == 710:
                result = get_introduction_for_paper_710()
            elif idx == 4089:
                result = get_introduction_for_paper_4089()

            paper['Introduction'] = result['Introduction']
            print(f"Paper-{idx} 结果:\n{result}\n")
        else:
            print(f"索引 {idx} 超出范围，无法处理。")

    print("保存处理后的数据...")
    save_papers(papers, output_json)
    print(f"处理完成，已保存到 {output_json}")

if __name__ == "__main__":
    input_json_file = 'cleaned_papers_with_intro.json'  # 输入的JSON文件路径
    output_json_file = 'cleaned_papers_with_introductions.json'  # 输出的JSON文件路径
    main(input_json_file, output_json_file)



加载论文数据...
共加载 10526 篇论文。
Paper-710 结果:
{'Introduction': "\nLarge language models (LLMs) are increasingly trained on vast and diverse internet data, which raises concerns about their ability to memorize benchmark datasets, leading to contamination of test sets. This type of contamination is particularly challenging to detect as proprietary models' pretraining data is often not publicly available. In this paper, we propose a novel procedure to detect test set contamination in language models without needing access to the pretraining data or model weights. Our method is based on the principle that without contamination, all orderings of an exchangeable benchmark dataset should have an equal probability. In contrast, a contaminated model tends to memorize the order of examples, making certain canonical orderings significantly more likely. We demonstrate the effectiveness of our approach in detecting contamination, even in challenging scenarios with smaller models, limited test sets, and ra

Paper-4089:Continual learning for generative models faces the challenge of learning new target modes with limited samples while retaining previously learned ones. This paper introduces a novel continual learning approach for generative modeling, tailored for conditional generative adversarial networks. The method involves generating samples of existing modes for replay, using a discriminator to compute mode similarity, and generating labels for the target mode based on a weighted average of similar existing modes. The model is extended by training on target data with the newly-generated labels, while employing memory replay to prevent catastrophic forgetting. Our experimental results on benchmark datasets showcase the effectiveness of our approach, demonstrating superior performance over state-of-the-art methods with fewer training samples.

Paper-710:Large language models (LLMs) are increasingly trained on vast and diverse internet data, which raises concerns about their ability to memorize benchmark datasets, leading to contamination of test sets. This type of contamination is particularly challenging to detect as proprietary models' pretraining data is often not publicly available. In this paper, we propose a novel procedure to detect test set contamination in language models without needing access to the pretraining data or model weights. Our method is based on the principle that without contamination, all orderings of an exchangeable benchmark dataset should have an equal probability. In contrast, a contaminated model tends to memorize the order of examples, making certain canonical orderings significantly more likely. We demonstrate the effectiveness of our approach in detecting contamination, even in challenging scenarios with smaller models, limited test sets, and rare datasets. Our results are consistent with existing evaluations and offer a reliable way to identify test set contamination in language models, including realistic evaluations using the LLaMA-2 model.
