In [1]:
import json
from zhipuai import ZhipuAI

# Initialise the ZhipuAI client.
# The API key is read from the environment so that no secret is stored in the
# notebook, its outputs, or its version history.
# NOTE(review): the key that used to be hard-coded on this line has been
# exposed and should be revoked.
import os

_zhipuai_api_key = os.environ.get("ZHIPUAI_API_KEY")
if not _zhipuai_api_key:
    raise RuntimeError(
        "Set the ZHIPUAI_API_KEY environment variable before running this cell."
    )
client = ZhipuAI(api_key=_zhipuai_api_key)

# Load the paper records
def load_papers(input_file):
    """Read ``input_file`` as UTF-8 text and return the parsed JSON value."""
    with open(input_file, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Collect the keywords of every paper
def extract_keywords(papers):
    """Return one ``{'paper_id', 'keywords'}`` dict per paper, in input order.

    ``paper_id`` is the paper's ``'title'`` entry (``None`` when absent) and
    ``keywords`` is its ``'keywords'`` entry (an empty list when absent).
    """
    return [
        {
            'paper_id': entry.get('title'),
            'keywords': entry.get('keywords', []),
        }
        for entry in papers
    ]

# Ask the LLM API to cluster the papers
def cluster_keywords(keywords_list):
    """Send the keyword data to the chat model and return its stripped reply.

    The keyword list is embedded in the prompt as JSON. Any exception raised
    while calling the API or reading the reply is printed, and ``None`` is
    returned instead of a reply.
    """
    # Prompt text with the keyword data serialised as JSON
    task_description = f"""
请根据以下论文的关键词对论文进行聚类分析，将其按研究主题分为若干，并为每篇论文分配一个主题标签，生成“Topic”字段，标明该论文属于哪个研究主题。
注意，你只能将他们聚为任意多类，可以尽量详细一点
论文关键词数据：{json.dumps(keywords_list, ensure_ascii=False)}

输出格式你只需要罗列类名称就行，不需要写其他的，用英文
"""

    system_message = {"role": "system", "content": "你是一个专家，帮助进行论文聚类分析。"}
    user_message = {"role": "user", "content": task_description}

    try:
        # Single chat-completion request against the glm-4-plus model
        response = client.chat.completions.create(
            model="glm-4-plus",
            messages=[system_message, user_message],
            temperature=0.3,
        )
        first_choice = response.choices[0]
        return first_choice.message.content.strip()
    except Exception as e:
        print(f"聚类失败: {e}")
        return None

# Write the clustering result to a text file
def save_clustering_result(result, output_file):
    """Write ``result`` verbatim to ``output_file`` (UTF-8) and print a confirmation.

    Any exception is caught and reported with a printed message rather than
    propagated to the caller.
    """
    try:
        with open(output_file, mode='w', encoding='utf-8') as sink:
            sink.write(result)
        print(f"聚类结果已保存至 {output_file}")
    except Exception as err:
        print(f"保存失败: {err}")

# Entry point of the clustering pipeline
def main(input_json_file, output_txt_file):
    """Load papers, extract their keywords, cluster them via the LLM and save the reply.

    Progress messages are printed at every stage. When the clustering call
    yields no result, a failure message is printed and nothing is written.
    """
    print("加载论文数据...")
    papers = load_papers(input_json_file)
    print(f"共加载 {len(papers)} 篇论文。")

    # Extract the keyword records and echo them for inspection
    keywords_list = extract_keywords(papers)
    pretty_keywords = json.dumps(keywords_list, ensure_ascii=False, indent=2)
    print(f"提取的关键词数据：{pretty_keywords}")

    # Run the LLM clustering
    print("正在进行聚类分析...")
    clustering_result = cluster_keywords(keywords_list)

    if not clustering_result:
        print("聚类分析失败。")
        return

    print("聚类分析结果已生成，正在保存...")
    save_clustering_result(clustering_result, output_txt_file)

# Run the pipeline when this code is executed directly
if __name__ == "__main__":
    # Input JSON path and output text path for the clustering run
    input_json_file, output_txt_file = 'papers_with_keywords.json', 'clustered_papers.txt'
    main(input_json_file, output_txt_file)


In [27]:
import json

# Category table: maps each category label ("AI", "DL", "ML") to the list of
# tags assigned to it. classify_article looks tags up here with an exact
# `tag in category_keywords` membership test, so spelling and letter case of
# the entries below must match the tags in the input data.
categories = {
  "AI": [
    "AI Alignment",
    "AI Safety",
    "Application in Healthcare",
    "Automated Reasoning", 
    "Bioinformatics",
    "Biomedical Knowledge Graph Construction",
    "Brain-Computer Interfaces",
    "Chess",
    "Clinical Decision-Making",
    "Clinical Notes",
    "Code Generation",
    "Communication",
    "Computer Graphics",
    "Computer Vision",
    "Control Theory",
    "Drug Discovery",
    "Earth System Modeling", 
    "Edge Computing",
    "EEG",
    "EHR",
    "Electronic Health Records",
    "Explainable AI",
    "Human-Agent Collaboration",
    "Human Feedback",
    "Human Perception",
    "Human Vision",
    "Language Model",
    "Large Language Models",
    "Machine Translation",
    "Material Science",
    "Mathematical Reasoning",
    "Computational Linguistics",
    "Computational Neuroscience",
    "Computational Physics",
    "Dynamic Scene Understanding",
    "Learning from Demonstration",
    "Learning from Textual Feedback",
    "Emergent Communication",
    "Game Theory",
    "Information Theory",
    "Chemistry",
    "Chemical Reaction",
    "Deformation",
    "Dehazing",
    "Detection",
    "Differentiable Physics",
    "Elasticity",
    "Electricity",
    "Event Camera",
    "Fluid Dynamics",
    "Human Activity Recognition",
    "Imitation Learning",
    "Interpretability",
    "Instance-Level Explanation",
    "Inpainting",
    "Input-Output Maps"
  ],
  "DL": [
    "Attention Mechanism",
    "Autoencoder",
    "CLIP",
    "Concept Bottleneck Models", 
    "Deep Learning",
    "Deep Reinforcement Learning",
    "Deformable Convolution",
    "Diagnosing Transformers",
    "Diffusion Models",
    "Efficient Deep Learning",
    "Efficient Inference",
    "Efficient Training",
    "Embedding Learning",
    "Equivariant Neural Networks",
    "Foundation Models",
    "Generative Pretraining",
    "Geometric Deep Learning",
    "Graph Neural Networks",
    "Implicit Neural Representations",
    "Memory-Augmented Neural Networks",
    "Neural Architecture Search",
    "Audio Processing",
    "Generative Models",
    "High-Resolution Image Synthesis",
    "In-Context Learning",
    "In-Context Pretraining",
    "Latent Dynamics",
    "Heterogeneous Graph Learning",
    "Feature Learning",
    "Discrete Representation Learning",
    "Conditional Generative Modeling",
    "Contrastive Learning",
    "Cross-Modal Learning",
    "Catastrophic Forgetting",
    "Chain-of-Thought",
    "Collaborative Filtering",
    "Compositional Generalization",
    "Curriculum Learning",
    "Data Augmentation",
    "Data Distillation",
    "Dataset Distillation",
    "Denoising",
    "Depth Completion",
    "Depth Estimation",
    "Disentanglement",
    "Few-shot Learning",
    "Fine-tuning",
    "Information Bottleneck",
    "Long-tailed Learning"
  ],
  "ML": [
    "Active Learning",
    "Adversarial Attacks",
    "Adversarial Defense",
    "Adversarial Training",
    "Anomaly Detection",
    "Average Reward Markov Decision Processes",
    "Bayesian Deep Learning",
    "Bayesian Inference",
    "Benchmark",
    "Biased Learning",
    "Bilevel Optimization",
    "Bird Migration",
    "Black-box Attack",
    "Black-box Defense",
    "Brownian Dynamics",
    "Causal Discovery",
    "Causal Inference",
    "Causal Representation Learning",
    "Circuit Complexity",
    "Classification",
    "Clustering",
    "Compressed Sensing",
    "Compressive Learning",
    "Computational Complexity",
    "Computational Efficiency",
    "Computational Geometry",
    "Continual Learning",
    "Cooperative Multi-Agent Reinforcement Learning",
    "Counterfactual Generation",
    "Covariate Shift",
    "Credit Assignment",
    "Data Curation",
    "Data Efficiency",
    "Data Heterogeneity",
    "Data Poisoning",
    "Data Privacy",
    "Density Estimation",
    "Dimension Reduction",
    "Distribution Shift",
    "Domain Adaptation",
    "Energy Efficiency",
    "Episodic Memory",
    "Error Analysis",
    "Evolutionary Algorithms",
    "Fairness",
    "Fitness Landscape Analysis",
    "Foveated Vision",
    "Function Approximation",
    "Generalization",
    "Geometry",
    "Group Robustness",
    "Heterogeneous Learning",
    "Heterophily",
    "High-Dimensional Data",
    "Hyperbolic Space",
    "In-distribution Generalization",
    "Incremental Learning",
    "Kernel Methods",
    "Koopman Theory",
    "Learning Theory",
    "Lipschitz Continuity",
    "Manifold Learning",
    "Markov Decision Processes",
    "Matrix Completion",
    "Meta Learning"
  ]
}


def classify_article(article, category_map=None):
    """Return the category of the first tag of ``article`` found in a category table.

    Args:
        article: Mapping with an optional ``"tags"`` list.
        category_map: Optional mapping of category name -> collection of tags.
            When omitted, the module-level ``categories`` table is used, so
            existing one-argument calls behave exactly as before. Passing the
            table explicitly removes the dependency on notebook-global state.

    Returns:
        The name of the category that contains the earliest matching tag
        (tags are tried in list order, categories in the table's iteration
        order), or ``"Unclassified"`` when the article has no tags or none of
        them appears in the table.
    """
    article_tags = article.get("tags", [])

    # No tags at all: nothing to look up
    if not article_tags:
        return "Unclassified"

    # Resolve the table only when a lookup is actually needed
    if category_map is None:
        category_map = categories

    # Tag order takes priority over category order
    for tag in article_tags:
        for category, category_keywords in category_map.items():
            if tag in category_keywords:
                return category

    # None of the tags is known to the table
    return "Unclassified"

def classify_articles_from_json(json_data):
    """Classify every article and print the tags of those left unclassified.

    Each article dict is modified in place: its ``"category"`` key is set to
    the value returned by ``classify_article``. The returned list holds the
    same dict objects, in input order. If any unclassified article carries
    tags, the sorted union of those tags is printed.
    """
    classified_articles = []
    unclassified_tags = set()  # union of tags seen on unclassified articles

    for article in json_data:
        label = classify_article(article)
        article["category"] = label
        classified_articles.append(article)

        if label != "Unclassified":
            continue
        # Remember which tags the category table failed to cover
        unclassified_tags.update(article.get("tags", []))

    # Report the uncovered tags, if any
    if unclassified_tags:
        print("\n未分类文章的所有tags:")
        print(sorted(unclassified_tags))

    return classified_articles

def count_categories(classified_articles):
    """Count how many articles fall into each category.

    Args:
        classified_articles: Iterable of mappings; the ``"category"`` entry is
            read from each, defaulting to ``"Unclassified"`` when absent.

    Returns:
        Dict of category -> count. The keys ``"ML"``, ``"AI"``, ``"DL"`` and
        ``"Unclassified"`` are always present (possibly 0); any other category
        encountered is added after them instead of raising ``KeyError``.
    """
    category_counts = {"ML": 0, "AI": 0, "DL": 0, "Unclassified": 0}

    for article in classified_articles:
        category = article.get("category", "Unclassified")
        # .get() keeps the tally working for labels outside the preset keys
        category_counts[category] = category_counts.get(category, 0) + 1

    return category_counts

# Step 1: read a JSON file
def load_json_file(filename):
    """Open ``filename`` as UTF-8 text and return the parsed JSON value."""
    with open(filename, mode='r', encoding='utf-8') as source:
        content = source.read()
    return json.loads(content)

# Step 2: save the classified JSON data
def save_json_file(filename, data):
    """Write ``data`` to ``filename`` as UTF-8 JSON (4-space indent, non-ASCII kept).

    The data is serialised before the file is opened, so a value that cannot
    be encoded raises without truncating an existing file at ``filename``.

    Raises:
        TypeError / ValueError: if ``data`` is not JSON-serialisable.
        OSError: if the file cannot be written.
    """
    serialized = json.dumps(data, indent=4, ensure_ascii=False)
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(serialized)

# Main flow of the tag-based classification
if __name__ == "__main__":
    # Input: paper records read from papers_with_tags.json;
    # output: the same records with a "category" key added.
    input_filename = "papers_with_tags.json"  # input file name
    output_filename = "classified_papers.json"  # output file name

    # Step 1: load the paper records
    try:
        papers_data = load_json_file(input_filename)
        print(f"成功加载文件: {input_filename}")
    except FileNotFoundError:
        print(f"文件 {input_filename} 不存在!")
        # `exit` is an interactive-shell helper and is not meant for program
        # code; raising SystemExit stops with status 1 without relying on it.
        raise SystemExit(1)

    # Step 2: classify every paper
    classified_papers = classify_articles_from_json(papers_data)

    # Step 3: count the papers per category
    category_counts = count_categories(classified_papers)
    print("各类别文章数量统计：", category_counts)

    # Step 4: save the classified records
    save_json_file(output_filename, classified_papers)
    print(f"分类后的文件已保存为: {output_filename}")




成功加载文件: papers_with_tags.json

未分类文章的所有tags:
['1-Wasserstein Metric', '3D Detection', '3D Segmentation', 'AI for Climate Science', 'AI for Education', 'AIops', 'Abstraction', 'Action Model Learning', 'Action Pruning', 'Action Recognition', 'Activation Function', 'AdaGrad', 'Adaptation', 'Adaptive Asynchronous Updates', 'Adaptive Deferral Policy', 'Adaptive Learning', 'Adaptive Method', 'Adaptive Methods', 'Adaptive Sparse Approximation', 'Adaptive Stepsize', 'Aggregation', 'Algorithm', 'Algorithm Design', 'Algorithm Discovery', 'Algorithmic Inductive Bias', 'Algorithmic Interventions', 'Algorithmic Recourse', 'Algorithmic Regularization', 'Ambiguity', 'Anonymization', 'Approximate Search', 'Approximation Theory', 'Approximation-Estimation Decomposition', 'Architecture Search', 'Artificial Intelligence', 'Artificial Neural Network', 'Artificial Neural Networks', 'Asset Pricing', 'Asymmetrical Learning', 'Asynchronous Learning', 'AutoML', 'Automatic Differentiation', 'Automatic Speech Re

In [2]:
import json
import logging
from concurrent.futures import ThreadPoolExecutor
from zhipuai import ZhipuAI

# Initialise the ZhipuAI client.
# The API key is read from the environment so that no secret is stored in the
# notebook, its outputs, or its version history.
# NOTE(review): the key that used to be hard-coded on this line has been
# exposed and should be revoked.
import os

_zhipuai_api_key = os.environ.get("ZHIPUAI_API_KEY")
if not _zhipuai_api_key:
    raise RuntimeError(
        "Set the ZHIPUAI_API_KEY environment variable before running this cell."
    )
client = ZhipuAI(api_key=_zhipuai_api_key)

# Configure logging
def setup_logger(log_file='logs111.txt'):
    """Return the ``paper_classification`` logger writing INFO+ records to ``log_file``.

    The function is safe to call repeatedly (for example when a notebook cell
    is re-run): a file handler for ``log_file`` is attached only if the logger
    does not already have one for that path, so records are not duplicated.
    The log file is written as UTF-8 so non-ASCII model replies are preserved.

    Args:
        log_file: Path of the log file; defaults to ``'logs111.txt'``.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    import os  # local import: only needed to normalise the handler path

    logger = logging.getLogger('paper_classification')
    logger.setLevel(logging.INFO)

    # FileHandler stores its target as an absolute path in `baseFilename`
    target_path = os.path.abspath(log_file)
    already_attached = any(
        isinstance(existing, logging.FileHandler)
        and existing.baseFilename == target_path
        for existing in logger.handlers
    )

    if not already_attached:
        handler = logging.FileHandler(log_file, encoding='utf-8')
        handler.setLevel(logging.INFO)
        handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
        logger.addHandler(handler)

    return logger

# Load the paper records
def load_papers(input_file):
    """Parse the UTF-8 JSON file at ``input_file`` and return its content."""
    stream = open(input_file, 'r', encoding='utf-8')
    try:
        return json.load(stream)
    finally:
        stream.close()

# Save the processed paper records
def save_papers(papers, output_file):
    """Write ``papers`` to ``output_file`` as UTF-8 JSON (4-space indent, non-ASCII kept).

    Serialisation happens before the file is opened, so a value that cannot be
    encoded raises without truncating an existing file at ``output_file``.

    Raises:
        TypeError / ValueError: if ``papers`` is not JSON-serialisable.
        OSError: if the file cannot be written.
    """
    serialized = json.dumps(papers, ensure_ascii=False, indent=4)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(serialized)

# Record the model's reply
def log_model_response(idx, category):
    """Append one line with the paper index and the model's reply to ``model_response_log.txt``."""
    entry = f"Paper-{idx} categorized as: {category}\n"
    with open("model_response_log.txt", "a", encoding="utf-8") as log_file:
        log_file.write(entry)

# Record an error
def log_error(idx, error_message, model_response=None):
    """Append an error entry for paper ``idx`` to ``error_log.txt``.

    The entry always contains the error message; when ``model_response`` is
    truthy, a second line with the reply and a blank separator line follow.
    """
    lines = [f"Error for Paper-{idx}: {error_message}\n"]
    if model_response:
        lines.append(f"Model Response: {model_response}\n\n")
    with open("error_log.txt", "a", encoding="utf-8") as error_file:
        error_file.writelines(lines)

# Classify one paper with the LLM API
def classify_paper(keywords, idx, logger):
    """Ask the chat model to label one paper as AI, DL or ML.

    The raw reply is always recorded through ``log_model_response`` and the
    logger. A reply that is not exactly one of the three labels is additionally
    written to the error log together with the reply text.

    Args:
        keywords: The paper's keywords; embedded in the prompt as JSON.
        idx: Index of the paper, used in all log entries.
        logger: Logger that receives info/error records.

    Returns:
        ``"AI"``, ``"DL"`` or ``"ML"`` when the model answered with exactly
        one of those labels; ``None`` when the request failed or the reply was
        anything else.
    """
    task_description = f"""
请根据以下论文的关键词将论文分类为 AI (人工智能)、DL (深度学习)、ML (机器学习) 其中之一。

论文关键词：{json.dumps(keywords, ensure_ascii=False)}

输出格式：AI, DL, ML,注意，只能聚为一类，不能多聚，而且输出不能加括号，只能是AI, DL, ML这样的缩写
"""
    try:
        response = client.chat.completions.create(
            model="glm-4-flash",  # this cell uses the glm-4-flash model
            messages=[{
                "role": "system",
                "content": "你是一个专家，帮助对论文进行分类。"
            }, {
                "role": "user",
                "content": task_description
            }],
            temperature=0.1,
        )

        # Extract and record the reply; a missing content is treated as empty
        category = (response.choices[0].message.content or "").strip()
        log_model_response(idx, category)
        logger.info(f"Paper-{idx} classified as: {category}")

    except Exception as e:
        # Record the failure and report "no label" to the caller
        log_error(idx, str(e))
        logger.error(f"Error classifying Paper-{idx}: {str(e)}")
        return None

    # The prompt asks for a bare label; anything else is not a usable category
    if category not in ("AI", "DL", "ML"):
        log_error(idx, "Unexpected category label", category)
        logger.error(f"Unexpected category for Paper-{idx}: {category}")
        return None

    return category

# Classify all papers concurrently
def classify_papers_in_parallel(papers, logger, max_workers=10):
    """Classify every paper on a thread pool and store the labels on the papers.

    One ``classify_paper`` task is submitted per paper, using the paper's
    ``'keywords'`` entry (empty list when absent) and its position as index.
    After a task finishes, a non-``None`` result is written to the paper's
    ``'category'`` key in place; papers whose classification failed are left
    untouched.

    Args:
        papers: Iterable of paper dicts; modified in place.
        logger: Logger passed through to ``classify_paper``.
        max_workers: Size of the thread pool.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        pending = []

        for idx, paper in enumerate(papers):
            keywords = paper.get('keywords', [])
            future = executor.submit(classify_paper, keywords, idx, logger)
            # Keep the paper next to its future so the label can be stored later
            pending.append((paper, future))

        # Block until every task has finished and collect the labels
        for paper, future in pending:
            category = future.result()
            if category is not None:
                paper['category'] = category

if __name__ == "__main__":
    # File locations for this run
    # NOTE(review): "classified_papers.json" is also the output name used by
    # the tag-based classification script in this file — confirm that
    # overwriting it here is intended.
    input_file = "papers_with_keywords.json"
    output_file = "classified_papers.json"

    # Configure logging, then load the paper records
    logger = setup_logger()
    papers = load_papers(input_file)

    # Classify the papers
    classify_papers_in_parallel(papers, logger)

    # Save the processed records (optional step)
    save_papers(papers, output_file)

