#### 文献关键词探究

实现的功能：
- 输入待探究的关键词，输出与该关键词共现频率最高的 10 个关键词（仅保留共现次数不少于 2 次的关键词）
- 输出包含该关键词的论文中，全部年份被引次数最高的 5 篇，以及最近 3 年被引次数最高的 5 篇
- 输出发表该关键词相关论文最多的机构（前 10 名）


In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from collections import Counter
import re
from wordcloud import WordCloud
import matplotlib.colors as mcolors
from matplotlib.font_manager import FontProperties
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Configure matplotlib for Chinese text: use the SimHei font for sans-serif
# text and disable the Unicode minus glyph.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

In [3]:
# Download the NLTK corpora (needed the first time this notebook is run).
# The value of the last call is what the notebook shows as the cell output.
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
def load_data(file_path):
    """Read a UTF-8 encoded CSV file and print a short summary of it.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    frame = pd.read_csv(file_path, encoding='utf-8')
    frame_shape, column_names = frame.shape, frame.columns.tolist()
    print(f"数据集大小: {frame_shape}")
    print(f"数据集列名: {column_names}")
    return frame

In [5]:
# Keywords to analyse; one report is generated per entry.
insight_keywords = [
    'continuous fibers',
    'path planning method',
    'robot programming',
    'optimization',
    'topology optimisation',
]

In [6]:
# Input CSV to analyse and the root directory under which reports are written.
file_path = './results/CFpathPlanning101_20250510_17/CFpathPlanning101_replaced_synonyms.csv'
save_path = './results'
# Load the dataset; load_data also prints its shape and column names.
df = load_data(file_path)

数据集大小: (101, 26)
数据集列名: ['作者', 'Author full names', '作者 ID', '文献标题', '年份', '来源出版物名称', '卷', '期', '论文编号', '起始页码', '结束页码', '页码计数', '施引文献', 'DOI', '链接', '归属机构', '带归属机构的作者', '摘要', '作者关键字', '索引关键字', '通讯地址', '文献类型', '出版阶段', '开放获取', '来源出版物', 'EID']


In [7]:
# Build a per-run output directory named after the input file plus a
# timestamp with hour resolution (runs within the same hour share a directory).
base_filename = os.path.basename(file_path)
file_name_without_ext = os.path.splitext(base_filename)[0]
timestamp = datetime.now().strftime("%Y%m%d_%H")
output_dir = f"{save_path}/{file_name_without_ext}_{timestamp}"

# exist_ok=True removes the check-then-create race and fails immediately if
# the path exists but is not a directory, instead of failing later when the
# first report is opened for writing.
output_dir_is_new = not os.path.exists(output_dir)
os.makedirs(output_dir, exist_ok=True)
if output_dir_is_new:
    print(f"创建输出目录: {output_dir}")


In [8]:
def clean_keywords(keywords_str):
    """Split a raw keyword cell into a list of normalised keywords.

    The cell is split on semicolons and commas; every piece is stripped of
    surrounding whitespace and lower-cased, and empty pieces are dropped.

    Args:
        keywords_str: Raw cell value. A missing value (as judged by
            ``pd.isna``) or an empty string yields an empty list; any other
            value is converted with ``str`` before splitting.

    Returns:
        The keywords as a list of strings, in their original order.
    """
    if pd.isna(keywords_str) or keywords_str == '':
        return []

    cleaned = []
    for piece in re.split(r'[;,]', str(keywords_str)):
        token = piece.strip()
        if token:
            cleaned.append(token.lower())
    return cleaned

In [9]:
def extract_all_keywords(df):
    """Collect every keyword from the author and index keyword columns.

    Args:
        df: DataFrame with the columns '作者关键字' and '索引关键字'.

    Returns:
        A flat list holding all cleaned author keywords followed by all
        cleaned index keywords; duplicates are kept.
    """
    collected = []
    # Author keywords first, then index keywords, so the order of the
    # concatenated result is preserved.
    for column in ('作者关键字', '索引关键字'):
        for cell in df[column].dropna():
            collected += clean_keywords(cell)
    return collected

In [10]:
def find_cooccurring_keywords(df, target_keyword, top_n=10):
    """Return the keywords that most often co-occur with ``target_keyword``.

    A paper is relevant when the target keyword (compared case-insensitively)
    appears among its author or index keywords. Every other keyword of a
    relevant paper is counted once per paper, and only keywords found in at
    least two relevant papers are reported.

    Side effect: the column ``df['all_keywords']`` is (re)created on the
    caller's DataFrame; it holds the cleaned author and index keywords of
    each row.

    Args:
        df: DataFrame with the columns '作者关键字' and '索引关键字'.
        target_keyword: Keyword to look for.
        top_n: Maximum number of co-occurring keywords to return.

    Returns:
        A list of ``(keyword, count)`` tuples sorted by descending count, or
        an empty list when no paper or no co-occurring keyword qualifies.
    """
    # Pass the raw cell values: wrapping them in str() would turn a missing
    # cell into the literal keyword 'nan' and bypass clean_keywords' NaN check.
    df['all_keywords'] = df.apply(
        lambda row: clean_keywords(row['作者关键字']) + clean_keywords(row['索引关键字']),
        axis=1
    )

    # clean_keywords already lower-cases, so only the target needs folding.
    target_keyword_lower = target_keyword.lower()
    relevant_papers = df[df['all_keywords'].apply(lambda keywords: target_keyword_lower in keywords)]

    print(f"包含关键词 '{target_keyword}' 的论文数量: {len(relevant_papers)}")

    if len(relevant_papers) == 0:
        return []

    keyword_counts = Counter()
    for keywords in relevant_papers['all_keywords']:
        # dict.fromkeys de-duplicates while keeping order, so a keyword listed
        # in both the author and the index column counts once for this paper.
        keyword_counts.update(
            k for k in dict.fromkeys(keywords) if k != target_keyword_lower
        )

    # Keep only keywords that co-occur in at least two papers.
    filtered_keyword_counts = Counter(
        {k: v for k, v in keyword_counts.items() if v >= 2}
    )

    # most_common returns an empty list when nothing passed the filter.
    return filtered_keyword_counts.most_common(top_n)


In [11]:
def find_highly_cited_papers(df, target_keyword, top_n=5):
    """Return the most cited papers that carry ``target_keyword``.

    Side effect: if ``df`` has no 'all_keywords' column yet, it is created
    from the author and index keyword columns. An existing column is reused
    as is.

    Args:
        df: DataFrame with the columns '施引文献' (citation count) and '年份'
            (publication year), plus either 'all_keywords' or the two raw
            keyword columns '作者关键字' and '索引关键字'.
        target_keyword: Keyword to look for (case-insensitive).
        top_n: Maximum number of papers per result.

    Returns:
        A tuple ``(all_time_top_papers, recent_top_papers)`` of DataFrames
        sorted by descending 'citation_count'. The second one only contains
        papers whose year is at least the current year minus 3. Two empty
        DataFrames are returned when no paper carries the keyword.
    """
    if 'all_keywords' not in df.columns:
        # Pass the raw cell values: str() would turn a missing cell into the
        # literal keyword 'nan'.
        df['all_keywords'] = df.apply(
            lambda row: clean_keywords(row['作者关键字']) + clean_keywords(row['索引关键字']),
            axis=1
        )

    target_keyword_lower = target_keyword.lower()
    is_relevant = df['all_keywords'].apply(
        lambda keywords: any(k.lower() == target_keyword_lower for k in keywords)
    )
    # Work on an explicit copy so that adding 'citation_count' below neither
    # touches the caller's DataFrame nor triggers SettingWithCopyWarning.
    relevant_papers = df[is_relevant].copy()

    if relevant_papers.empty:
        return pd.DataFrame(), pd.DataFrame()

    # Non-numeric or missing citation counts are treated as 0.
    relevant_papers['citation_count'] = pd.to_numeric(
        relevant_papers['施引文献'], errors='coerce'
    ).fillna(0)

    all_time_top_papers = relevant_papers.sort_values('citation_count', ascending=False).head(top_n)

    current_year = datetime.now().year

    # Coerce the year so that values stored as text still compare correctly;
    # unparseable years become NaN and are excluded from the recent set.
    publication_years = pd.to_numeric(relevant_papers['年份'], errors='coerce')
    recent_papers = relevant_papers[publication_years >= current_year - 3]
    recent_top_papers = recent_papers.sort_values('citation_count', ascending=False).head(top_n)

    return all_time_top_papers, recent_top_papers

In [12]:
def find_top_institutions(df, target_keyword, top_n=10):
    """Return the institutions with the most papers carrying ``target_keyword``.

    Side effect: if ``df`` has no 'all_keywords' column yet, it is created
    from the author and index keyword columns. An existing column is reused
    as is.

    Args:
        df: DataFrame with the column '归属机构' (institutions separated by
            ';'), plus either 'all_keywords' or the two raw keyword columns
            '作者关键字' and '索引关键字'.
        target_keyword: Keyword to look for (case-insensitive).
        top_n: Maximum number of institutions to return.

    Returns:
        A list of ``(institution, paper_count)`` tuples sorted by descending
        count, or an empty list when no paper carries the keyword.
    """
    if 'all_keywords' not in df.columns:
        # Pass the raw cell values: str() would turn a missing cell into the
        # literal keyword 'nan'.
        df['all_keywords'] = df.apply(
            lambda row: clean_keywords(row['作者关键字']) + clean_keywords(row['索引关键字']),
            axis=1
        )

    target_keyword_lower = target_keyword.lower()
    is_relevant = df['all_keywords'].apply(
        lambda keywords: any(k.lower() == target_keyword_lower for k in keywords)
    )
    relevant_papers = df[is_relevant]

    if len(relevant_papers) == 0:
        return []

    institution_counts = Counter()
    for affiliation in relevant_papers['归属机构'].dropna():
        names = (inst.strip() for inst in str(affiliation).split(';'))
        # De-duplicate within one paper (order preserved) so that the result
        # is a paper count even if an institution is listed more than once.
        institution_counts.update(list(dict.fromkeys(n for n in names if n)))

    return institution_counts.most_common(top_n)

In [13]:
def _write_paper_section(report, heading, papers, empty_message):
    """Write one numbered paper list (title, authors, year, citations, DOI)."""
    report.write(heading)
    report.write("-" * 40 + "\n")
    if papers.empty:
        report.write(empty_message)
    else:
        for i, (_, paper) in enumerate(papers.iterrows(), 1):
            report.write(f"{i}. Title: {paper['文献标题']}\n")
            report.write(f"   Authors: {paper['作者'] if not pd.isna(paper['作者']) else 'N/A'}\n")
            report.write(f"   Year: {paper['年份']}\n")
            report.write(f"   Citations: {int(paper['citation_count'])}\n")
            report.write(f"   DOI: {paper['DOI'] if not pd.isna(paper['DOI']) else 'N/A'}\n")
    # Blank line after the section, matching the co-occurring keyword section.
    report.write("\n")


def generate_keyword_report(df, keyword, output_dir):
    """Write a plain-text analysis report for one keyword.

    The report has four sections: co-occurring keywords, the most cited
    papers of all time, the most cited recent papers, and the institutions
    with the most papers.

    Args:
        df: The literature DataFrame passed on to the ``find_*`` helpers.
        keyword: Keyword to analyse.
        output_dir: Directory in which the report is written; it is created
            if it does not exist.

    Returns:
        The path of the generated report file.
    """
    # Replace spaces and characters that are invalid in file names (notably
    # on Windows) so that any keyword yields a usable file name.
    safe_keyword = re.sub(r'[ \\/:*?"<>|]', '_', keyword)
    report_filename = f"{output_dir}/keyword_analysis_{safe_keyword}.txt"
    os.makedirs(output_dir, exist_ok=True)

    cooccurring_keywords = find_cooccurring_keywords(df, keyword)
    all_time_top_papers, recent_top_papers = find_highly_cited_papers(df, keyword)
    top_institutions = find_top_institutions(df, keyword)

    with open(report_filename, 'w', encoding='utf-8') as f:
        f.write(f"Keyword Analysis Report: '{keyword}'\n")
        f.write("=" * 60 + "\n\n")

        # 1. Co-occurring keywords
        f.write("# 1. Top 10 Co-occurring Keywords:\n")
        f.write("-" * 40 + "\n")
        if cooccurring_keywords:
            for i, (kw, count) in enumerate(cooccurring_keywords, 1):
                f.write(f"{i}. {kw} (Count: {count})\n")
        else:
            f.write("No co-occurring keywords found.\n")
        f.write("\n")

        # 2. Most cited papers over all years
        _write_paper_section(
            f,
            "# 2. Top 5 Highly Cited Papers (All Time):\n",
            all_time_top_papers,
            "No papers found.\n",
        )

        # 3. Most cited papers of the recent years
        current_year = datetime.now().year
        _write_paper_section(
            f,
            f"# 3. Top 5 Highly Cited Papers (Last 3 Years, {current_year-3}-{current_year}):\n",
            recent_top_papers,
            "No recent papers found.\n",
        )

        # 4. Institutions with the most papers
        f.write("# 4. Top Institutions Publishing Papers on this Keyword:\n")
        f.write("-" * 40 + "\n")
        if top_institutions:
            for i, (inst, count) in enumerate(top_institutions, 1):
                f.write(f"{i}. {inst} (Paper Count: {count})\n")
        else:
            f.write("No institution data available.\n")

    print(f"报告已生成: {report_filename}")

    return report_filename

In [14]:

def analyze_multiple_keywords(df, keywords, output_dir):
    """Generate one report per keyword.

    Args:
        df: The literature DataFrame handed to ``generate_keyword_report``.
        keywords: Iterable of keywords to analyse, processed in order.
        output_dir: Directory in which the reports are written.

    Returns:
        A list of ``(keyword, report_path)`` tuples in input order.
    """
    def _analyze(current_keyword):
        # Announce the keyword before its report is generated.
        print(f"\n正在分析关键词: {current_keyword}")
        return current_keyword, generate_keyword_report(df, current_keyword, output_dir)

    return [_analyze(current_keyword) for current_keyword in keywords]

In [15]:
# Generate one report per keyword; results holds (keyword, report_path) tuples.
results = analyze_multiple_keywords(df, insight_keywords, output_dir)


正在分析关键词: continuous fibers
包含关键词 'continuous fibers' 的论文数量: 10
报告已生成: ./results/CFpathPlanning101_replaced_synonyms_20250512_10/keyword_analysis_continuous_fibers.txt

正在分析关键词: path planning method
包含关键词 'path planning method' 的论文数量: 16
报告已生成: ./results/CFpathPlanning101_replaced_synonyms_20250512_10/keyword_analysis_path_planning_method.txt

正在分析关键词: robot programming
包含关键词 'robot programming' 的论文数量: 15
报告已生成: ./results/CFpathPlanning101_replaced_synonyms_20250512_10/keyword_analysis_robot_programming.txt

正在分析关键词: optimization
包含关键词 'optimization' 的论文数量: 6
报告已生成: ./results/CFpathPlanning101_replaced_synonyms_20250512_10/keyword_analysis_optimization.txt

正在分析关键词: topology optimisation
包含关键词 'topology optimisation' 的论文数量: 3
报告已生成: ./results/CFpathPlanning101_replaced_synonyms_20250512_10/keyword_analysis_topology_optimisation.txt


In [16]:
    # List every generated report, then collect the report paths in order.
    print("\n分析完成！生成的报告:")
    for keyword, report_path in results:
        print(f"- {keyword}: {report_path}")
    html_source_files = [path for _, path in results]


分析完成！生成的报告:
- continuous fibers: ./results/CFpathPlanning101_replaced_synonyms_20250512_10/keyword_analysis_continuous_fibers.txt
- path planning method: ./results/CFpathPlanning101_replaced_synonyms_20250512_10/keyword_analysis_path_planning_method.txt
- robot programming: ./results/CFpathPlanning101_replaced_synonyms_20250512_10/keyword_analysis_robot_programming.txt
- optimization: ./results/CFpathPlanning101_replaced_synonyms_20250512_10/keyword_analysis_optimization.txt
- topology optimisation: ./results/CFpathPlanning101_replaced_synonyms_20250512_10/keyword_analysis_topology_optimisation.txt
