#### 文献关键词画图



In [1]:
# 导入必要的库
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from collections import Counter, defaultdict
import re
from wordcloud import WordCloud
import matplotlib.colors as mcolors
from matplotlib.font_manager import FontProperties
from pyecharts import options as opts
from pyecharts.charts import Graph
from IPython.display import display, HTML, IFrame
import json
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Configure matplotlib for Chinese text display
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

In [3]:
# Download the NLTK corpora (needed on first use)
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
def load_data(file_path):
    """Read the UTF-8 CSV at ``file_path``, print its shape and column names, and return it.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    frame = pd.read_csv(file_path, encoding='utf-8')
    shape, column_names = frame.shape, frame.columns.tolist()
    print(f"数据集大小: {shape}")
    print(f"数据集列名: {column_names}")
    return frame

In [5]:
# Preset keywords; the batch cell further down analyzes each of them in turn
insight_keywords = [
    'continuous fibers',
    'path planning method',
    'robot programming',
    'optimization',
    'topology optimisation',
]

In [6]:
# Input CSV to analyze and the root folder under which the output directory is created
file_path = './results/CFpathPlanning101_20250510_17/CFpathPlanning101_replaced_synonyms.csv'
save_path = './results'
df = load_data(file_path)

数据集大小: (101, 26)
数据集列名: ['作者', 'Author full names', '作者 ID', '文献标题', '年份', '来源出版物名称', '卷', '期', '论文编号', '起始页码', '结束页码', '页码计数', '施引文献', 'DOI', '链接', '归属机构', '带归属机构的作者', '摘要', '作者关键字', '索引关键字', '通讯地址', '文献类型', '出版阶段', '开放获取', '来源出版物', 'EID']


In [7]:
# Output folder name: "<input file name without extension>_<YYYYmmdd_HH>" under save_path
base_filename = os.path.basename(file_path)
file_name_without_ext = os.path.splitext(base_filename)[0]
timestamp = datetime.now().strftime("%Y%m%d_%H")
output_dir = f"{save_path}/{file_name_without_ext}_{timestamp}"

# Create the folder (and report it) only when it does not exist yet
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"创建输出目录: {output_dir}")


创建输出目录: ./results/CFpathPlanning101_replaced_synonyms_20250512_13


In [8]:
# Helper that turns a raw keyword cell into a clean keyword list
def clean_keywords(keywords_str):
    """Split a raw keyword cell into a list of normalized keywords.

    Args:
        keywords_str: Cell content such as ``"Path planning; 3D printing"``.
            Keywords may be separated by ASCII or full-width commas and
            semicolons.

    Returns:
        Lower-cased, whitespace-stripped keywords in their original order.
        An empty list is returned for any non-string value (``None``, NaN,
        numbers, ...), for blank strings, and for the literal ``'nan'`` that
        ``str()`` produces from a NaN cell.
    """
    # Non-string cells carry no keyword text; returning early also keeps
    # ``.strip()`` from being called on a number or another non-text object.
    if not isinstance(keywords_str, str):
        return []
    if keywords_str == 'nan' or not keywords_str.strip():
        return []
    parts = (part.strip() for part in re.split(r'[;,，；]', keywords_str))
    return [part.lower() for part in parts if part]

In [9]:
# Look up the most-cited papers for a keyword
def get_top_cited_papers(df, keyword, recent_years=3, top_n=5):
    """Return the most-cited papers tagged with ``keyword``, split by recency.

    Args:
        df: Literature table. The columns '施引文献' (citation count) and '年份'
            (year) are read; '文献标题', '作者', '来源出版物名称' and 'DOI' are
            copied into the result when present. If the frame has no
            'all_keywords' column, one is added to ``df`` in place (built from
            '作者关键字' and '索引关键字' via ``clean_keywords``) so later calls
            can reuse it.
        keyword: Keyword to match, case-insensitively, against each paper's
            keyword list.
        recent_years: Papers whose year is >= (current year - ``recent_years``)
            count as recent; all others with a valid year count as older.
        top_n: Maximum number of papers returned per group.

    Returns:
        ``(recent_papers, older_papers)``: two lists of dicts with the keys
        '标题', '作者', '来源', '年份', '被引频次' and 'DOI', each ordered by
        citation count, highest first. Empty cells are replaced by the
        placeholder defaults (an empty string for 'DOI'). Both lists are
        empty when no paper carries the keyword.
    """
    print(f"\n获取关键词 '{keyword}' 的高引用文章...")

    # Merge author keywords and index keywords once and keep them on the frame
    if 'all_keywords' not in df.columns:
        df['all_keywords'] = df.apply(
            lambda row: clean_keywords(str(row.get('作者关键字', ''))) + clean_keywords(str(row.get('索引关键字', ''))),
            axis=1
        )

    # Select the papers whose keyword list contains the target keyword
    target = keyword.lower()
    has_keyword = df['all_keywords'].apply(
        lambda keywords: any(k.lower() == target for k in keywords)
    ).astype(bool)
    # Work on a copy so the helper column added below is never written
    # through a slice of the caller's frame.
    relevant_papers = df[has_keyword].copy()
    if relevant_papers.empty:
        print(f"未找到与'{keyword}'相关的文章")
        return [], []

    # Non-numeric citation counts are treated as 0
    relevant_papers['citation_count'] = pd.to_numeric(relevant_papers['施引文献'], errors='coerce').fillna(0)
    # Coerce years so a text-typed column cannot break the comparisons;
    # rows without a usable year fall into neither group.
    years = pd.to_numeric(relevant_papers['年份'], errors='coerce')
    recent_cutoff_year = datetime.now().year - recent_years

    recent_top_papers = (
        relevant_papers[years >= recent_cutoff_year]
        .sort_values('citation_count', ascending=False)
        .head(top_n)
    )
    older_top_papers = (
        relevant_papers[years < recent_cutoff_year]
        .sort_values('citation_count', ascending=False)
        .head(top_n)
    )

    recent_papers = _papers_to_records(recent_top_papers)
    older_papers = _papers_to_records(older_top_papers)
    print(f"找到与'{keyword}'相关的最近{recent_years}年高引用文章: {len(recent_papers)}")
    print(f"找到与'{keyword}'相关的其他年份高引用文章: {len(older_papers)}")

    return recent_papers, older_papers


def _paper_field(paper, column, default):
    """Return ``paper[column]``, or ``default`` when the column is absent or the cell is None/NaN."""
    value = paper.get(column, default)
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return default
    return value


def _papers_to_records(papers):
    """Convert every row of ``papers`` into the plain dict consumed by the HTML report."""
    records = []
    for _, paper in papers.iterrows():
        year = _paper_field(paper, '年份', '未知年份')
        # A float year with no fractional part is shown as 2023 rather than 2023.0
        if isinstance(year, float) and year.is_integer():
            year = int(year)
        records.append({
            '标题': _paper_field(paper, '文献标题', '无标题'),
            '作者': _paper_field(paper, '作者', '未知作者'),
            '来源': _paper_field(paper, '来源出版物名称', '未知来源'),
            '年份': year,
            '被引频次': int(paper['citation_count']),
            'DOI': _paper_field(paper, 'DOI', ''),
        })
    return records

In [10]:
# Keyword relation graph plus highly cited papers, written out as one HTML report
def create_keyword_graph(df, target_keyword, output_dir, top_related=30, min_co_occurrence=2):
    """Build the co-occurrence graph of ``target_keyword`` and write an HTML report.

    Only papers whose keywords (columns '作者关键字' and '索引关键字') include the
    target keyword are considered. The report has two tabs: an ECharts force
    graph of the keywords co-occurring with the target, and the tables of
    highly cited papers produced by ``get_top_cited_papers`` and
    ``create_cited_papers_html``.

    Args:
        df: DataFrame holding the literature records.
        target_keyword: Keyword to analyze (matched case-insensitively).
        output_dir: Directory the report is written to; created if missing.
        top_related: Maximum number of related keywords shown in the graph.
        min_co_occurrence: Minimum co-occurrence count for a keyword, or for
            a link between two related keywords, to be included.

    Returns:
        An ``IFrame`` pointing at the saved report, or ``None`` when no paper
        pairs the target keyword with at least one other keyword.
    """
    # Function-scope import: not part of the notebook's import cell
    import html

    print(f"\n开始为关键词 '{target_keyword}' 创建关系图...")
    target = target_keyword.lower()

    # Co-occurrence counts, restricted to papers that contain the target keyword
    keyword_relations = _count_cooccurrences(df, target)
    if target not in keyword_relations:
        print(f"在数据集中未找到关键词 '{target_keyword}'")
        return None

    related_keywords = keyword_relations[target]
    print(f"找到与 '{target_keyword}' 共现的关键词数量: {len(related_keywords)}")
    # Drop keywords below the co-occurrence threshold
    filtered_related = {k: v for k, v in related_keywords.items() if v >= min_co_occurrence}
    print(f"共现次数 >= {min_co_occurrence} 的关键词数量: {len(filtered_related)}")
    # Keep the top-N most frequent related keywords (the target itself is the hub)
    ranked = sorted(filtered_related.items(), key=lambda item: item[1], reverse=True)[:top_related]
    related_top = [kw for kw, _ in ranked if kw != target]

    nodes, links = _build_graph_elements(
        target_keyword, related_keywords, filtered_related, related_top,
        keyword_relations, min_co_occurrence,
    )
    categories = [
        {"name": "中心关键词"},
        {"name": "相关关键词"}
    ]

    # Highly cited papers and their HTML tables
    recent_papers, older_papers = get_top_cited_papers(df, target_keyword)
    papers_html = create_cited_papers_html(recent_papers, older_papers)

    # Spaces become underscores; characters that are invalid in file names
    # (path separators, wildcards, quotes, ...) are replaced as well so any
    # keyword yields a writable path.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', target_keyword.replace(' ', '_'))
    output_file = os.path.join(output_dir, f"{safe_name}_report.html")

    # The keyword comes from data / user input: escape it for HTML, and emit
    # the chart title as a JSON string literal so quotes in the text cannot
    # break the generated JavaScript.
    keyword_html = html.escape(target_keyword)
    chart_title_js = _js_literal(f"关键词 '{target_keyword}' 关系图")

    # HTML template of the report
    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="UTF-8">
        <title>关键词 '{keyword_html}' 分析报告</title>
        <script src="https://cdn.jsdelivr.net/npm/echarts@5.4.3/dist/echarts.min.js"></script>
        <style>
            body {{
                font-family: Arial, sans-serif;
                margin: 0;
                padding: 0;
                background-color: #f5f5f5;
            }}
            .container {{
                max-width: 1200px;
                margin: 0 auto;
                padding: 20px;
                background-color: white;
                box-shadow: 0 0 10px rgba(0,0,0,0.1);
            }}
            .header {{
                text-align: center;
                padding: 20px 0;
                border-bottom: 1px solid #eee;
                margin-bottom: 30px;
            }}
            .tab-container {{
                width: 100%;
            }}
            .tab-buttons {{
                display: flex;
                margin-bottom: 20px;
                border-bottom: 1px solid #ddd;
            }}
            .tab-button {{
                padding: 15px 20px;
                background-color: #f1f1f1;
                border: none;
                cursor: pointer;
                font-size: 16px;
                margin-right: 5px;
                transition: background-color 0.3s;
                border-radius: 5px 5px 0 0;
            }}
            .tab-button:hover {{
                background-color: #ddd;
            }}
            .tab-button.active {{
                background-color: #4CAF50;
                color: white;
            }}
            .tab-content {{
                display: none;
                padding: 20px;
                border: 1px solid #ddd;
                border-top: none;
                animation: fadeIn 0.5s;
            }}
            .tab-content.active {{
                display: block;
            }}
            .chart-container {{
                width: 100%;
                height: 700px;
                margin-top: 20px;
            }}
            @keyframes fadeIn {{
                from {{ opacity: 0; }}
                to {{ opacity: 1; }}
            }}
            .paper-table {{
                width: 100%;
                border-collapse: collapse;
                margin-bottom: 30px;
            }}
            .paper-table th {{
                background-color: #f2f2f2;
                border: 1px solid #ddd;
                padding: 12px;
                text-align: left;
            }}
            .paper-table td {{
                border: 1px solid #ddd;
                padding: 12px;
                text-align: left;
            }}
            .paper-section {{
                margin-bottom: 40px;
            }}
            .paper-section h3 {{
                margin-bottom: 15px;
                border-bottom: 2px solid #4CAF50;
                padding-bottom: 8px;
                color: #333;
            }}
        </style>
    </head>
    <body>
        <div class="container">
            <div class="header">
                <h1>关键词 '{keyword_html}' 分析报告</h1>
            </div>

            <div class="tab-container">
                <div class="tab-buttons">
                    <button class="tab-button active" onclick="openTab(event, 'tab1')">关键词关系图</button>
                    <button class="tab-button" onclick="openTab(event, 'tab2')">高引用文章</button>
                </div>

                <div id="tab1" class="tab-content active">
                    <div id="relationGraph" class="chart-container"></div>
                </div>

                <div id="tab2" class="tab-content">
                    {papers_html}
                </div>
            </div>
        </div>

        <script>
            // 关系图数据
            const graphData = {{
                nodes: {_js_literal(nodes)},
                links: {_js_literal(links)},
                categories: {_js_literal(categories)}
            }};

            // 初始化图表
            function initCharts() {{
                const relationGraph = echarts.init(document.getElementById('relationGraph'));

                const option = {{
                    title: {{
                        text: {chart_title_js}
                    }},
                    tooltip: {{
                        trigger: 'item',
                        formatter: '{{a}} <br/>{{b}} : {{c}}'
                    }},
                    legend: {{
                        orient: 'vertical',
                        left: '2%',
                        top: '20%',
                        data: graphData.categories.map(a => a.name)
                    }},
                    series: [
                        {{
                            type: 'graph',
                            layout: 'force',
                            data: graphData.nodes,
                            links: graphData.links,
                            categories: graphData.categories,
                            roam: true,
                            draggable: true,
                            label: {{
                                show: true,
                                position: 'right',
                                fontSize: 12
                            }},
                            lineStyle: {{
                                width: 1.5,
                                curveness: 0.3,
                                opacity: 0.8
                            }},
                            force: {{
                                repulsion: 800,
                                gravity: 0.2,
                                edgeLength: 120
                            }},
                            edgeSymbol: ['none', 'arrow']
                        }}
                    ]
                }};

                relationGraph.setOption(option);

                // 窗口大小改变时调整图表大小
                window.addEventListener('resize', function() {{
                    relationGraph.resize();
                }});
            }}

            // 初始化图表
            initCharts();

            // 切换选项卡
            function openTab(evt, tabName) {{
                var i, tabcontent, tablinks;

                // 隐藏所有选项卡内容
                tabcontent = document.getElementsByClassName("tab-content");
                for (i = 0; i < tabcontent.length; i++) {{
                    tabcontent[i].className = tabcontent[i].className.replace(" active", "");
                }}

                // 移除所有选项卡按钮的活动状态
                tablinks = document.getElementsByClassName("tab-button");
                for (i = 0; i < tablinks.length; i++) {{
                    tablinks[i].className = tablinks[i].className.replace(" active", "");
                }}

                // 显示当前选项卡并添加活动状态
                document.getElementById(tabName).className += " active";
                evt.currentTarget.className += " active";
            }}
        </script>
    </body>
    </html>
    """

    # Save the report; make sure the target folder exists first
    os.makedirs(output_dir, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_content)

    print(f"分析报告已保存至: {output_file}")

    # Show the report inline in Jupyter
    return IFrame(output_file, width=1000, height=800)


def _count_cooccurrences(df, target):
    """Count keyword pairs within every paper whose keyword set contains ``target`` (lower-case)."""
    relations = defaultdict(Counter)
    keyword_columns = [col for col in ('作者关键字', '索引关键字') if col in df.columns]
    for _, row in df.iterrows():
        keywords = set()
        for col in keyword_columns:
            cell = row[col]
            # Empty cells are read as NaN (a float); only text holds keywords
            if isinstance(cell, str):
                keywords.update(clean_keywords(cell))
        if target not in keywords:
            continue
        # Sorted iteration keeps the order of equally frequent keywords
        # reproducible between runs (set order depends on string hashing).
        ordered = sorted(keywords)
        for kw1 in ordered:
            for kw2 in ordered:
                if kw1 != kw2:
                    relations[kw1][kw2] += 1
    return relations


def _build_graph_elements(target_keyword, related_keywords, filtered_related, related_top,
                          keyword_relations, min_co_occurrence):
    """Build the ECharts node and link lists: a hub for the target plus one node per related keyword."""
    target = target_keyword.lower()
    # Hub node for the target keyword
    nodes = [{
        "id": target,
        "name": target_keyword,
        "symbolSize": 50,
        "value": sum(related_keywords.values()),  # total co-occurrence count
        "category": 0,
        "itemStyle": {"color": "#FF0000"}
    }]
    links = []
    for kw in related_top:
        count = filtered_related[kw]
        nodes.append({
            "id": kw,
            "name": kw,
            # Size grows with the co-occurrence count, clamped to 20..40
            "symbolSize": max(20, min(40, 15 + count * 2)),
            "value": count,
            "category": 1,
        })
        links.append({"source": target, "target": kw, "value": count})
    # Links between related keywords; each unordered pair is visited once
    for position, kw1 in enumerate(related_top):
        for kw2 in related_top[position + 1:]:
            co_occurrence = keyword_relations[kw1][kw2]
            if co_occurrence >= min_co_occurrence:
                links.append({"source": kw1, "target": kw2, "value": co_occurrence})
    return nodes, links


def _js_literal(value):
    """Serialize ``value`` as JSON that is safe to embed inside an inline ``<script>`` element."""
    # "</" would let data such as "</script>" terminate the script element early
    return json.dumps(value).replace('</', '<\\/')

In [11]:
def create_cited_papers_html(recent_papers, older_papers, recent_years=3):
    """Render the highly cited papers as two HTML tables.

    Args:
        recent_papers: Paper dicts (keys '标题', '作者', '来源', '年份',
            '被引频次', 'DOI') for the recent period.
        older_papers: Paper dicts of the same shape for all earlier years.
        recent_years: Number of years named in the two section headings.

    Returns:
        An HTML fragment with one ``paper-section`` block per group. Every
        cell value is HTML-escaped; a paper without a DOI gets an empty DOI
        cell instead of a link.
    """
    recent_rows = _render_paper_rows(recent_papers)
    older_rows = _render_paper_rows(older_papers)

    papers_html = f"""
    <div class="paper-section">
        <h3>最近{recent_years}年的高引用文章</h3>
        <table class="paper-table">
            <thead>
                <tr>
                    <th>标题</th>
                    <th>作者</th>
                    <th>来源</th>
                    <th>年份</th>
                    <th>被引频次</th>
                    <th>DOI</th>
                </tr>
            </thead>
            <tbody>
                {recent_rows}
            </tbody>
        </table>
    </div>

    <div class="paper-section">
        <h3>所有年份的高引用文章（除最近{recent_years}年外）</h3>
        <table class="paper-table">
            <thead>
                <tr>
                    <th>标题</th>
                    <th>作者</th>
                    <th>来源</th>
                    <th>年份</th>
                    <th>被引频次</th>
                    <th>DOI</th>
                </tr>
            </thead>
            <tbody>
                {older_rows}
            </tbody>
        </table>
    </div>
    """

    return papers_html


def _render_paper_rows(papers):
    """Render one ``<tr>`` per paper dict, HTML-escaping every cell."""
    # Function-scope import: not part of the notebook's import cell
    from html import escape

    def cell(value):
        # None and float NaN stand for an empty cell; render them as blank
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return escape(str(value))

    rows = []
    for paper in papers:
        doi = cell(paper['DOI'])
        doi_link = f'<a href="https://doi.org/{doi}" target="_blank">{doi}</a>' if doi else ''
        rows.append(f"""
        <tr>
            <td>{cell(paper['标题'])}</td>
            <td>{cell(paper['作者'])}</td>
            <td>{cell(paper['来源'])}</td>
            <td style="text-align:center">{cell(paper['年份'])}</td>
            <td style="text-align:center">{cell(paper['被引频次'])}</td>
            <td>{doi_link}</td>
        </tr>
        """)
    return "".join(rows)


In [12]:
# Analyze a single keyword
def analyze_keyword(df, keyword, output_dir):
    """Print a banner for ``keyword``, build its report and return it.

    Returns whatever ``create_keyword_graph`` produced, or ``None`` when that
    result is empty.
    """
    banner = '=' * 50
    print(f"\n{banner}\n开始分析关键词: {keyword}\n{banner}")
    # The report holds the relation graph and the highly cited papers
    report = create_keyword_graph(df, keyword, output_dir)
    return report or None

In [13]:
# Analyze a list of keywords
def analyze_keywords(df, keywords_list, output_dir):
    """Analyze each keyword in order and return a dict mapping keyword -> result."""
    return {
        keyword: analyze_keyword(df, keyword, output_dir)
        for keyword in keywords_list
    }

In [14]:
# Interactive analysis of one keyword
def run_analysis_for_keyword():
    """Prompt for a keyword and analyze it with the module-level ``df`` and ``output_dir``.

    Returns the analysis result, or ``None`` when the input is blank.
    """
    keyword = input("请输入要分析的关键词: ").strip()
    if not keyword:
        print("未输入关键词")
        return None
    return analyze_keyword(df, keyword, output_dir)

In [15]:
# Run the interactive single-keyword analysis
result = run_analysis_for_keyword()

KeyboardInterrupt: Interrupted by user

In [16]:
# Build and display a report for every keyword in insight_keywords
for keyword in insight_keywords:
    print(f"\n正在分析关键词: {keyword}")
    result = analyze_keyword(df, keyword, output_dir)
    display(result)  # show the result in Jupyter


正在分析关键词: continuous fibers

开始分析关键词: continuous fibers

开始为关键词 'continuous fibers' 创建关系图...
找到与 'continuous fibers' 共现的关键词数量: 122
共现次数 >= 2 的关键词数量: 20

获取关键词 'continuous fibers' 的高引用文章...
找到与'continuous fibers'相关的最近3年高引用文章: 5
找到与'continuous fibers'相关的其他年份高引用文章: 0
分析报告已保存至: ./results/CFpathPlanning101_replaced_synonyms_20250512_13\continuous_fibers_report.html



正在分析关键词: path planning method

开始分析关键词: path planning method

开始为关键词 'path planning method' 创建关系图...
找到与 'path planning method' 共现的关键词数量: 185
共现次数 >= 2 的关键词数量: 27

获取关键词 'path planning method' 的高引用文章...
找到与'path planning method'相关的最近3年高引用文章: 5
找到与'path planning method'相关的其他年份高引用文章: 5
分析报告已保存至: ./results/CFpathPlanning101_replaced_synonyms_20250512_13\path_planning_method_report.html



正在分析关键词: robot programming

开始分析关键词: robot programming

开始为关键词 'robot programming' 创建关系图...
找到与 'robot programming' 共现的关键词数量: 235
共现次数 >= 2 的关键词数量: 27

获取关键词 'robot programming' 的高引用文章...
找到与'robot programming'相关的最近3年高引用文章: 5
找到与'robot programming'相关的其他年份高引用文章: 5
分析报告已保存至: ./results/CFpathPlanning101_replaced_synonyms_20250512_13\robot_programming_report.html



正在分析关键词: optimization

开始分析关键词: optimization

开始为关键词 'optimization' 创建关系图...
找到与 'optimization' 共现的关键词数量: 73
共现次数 >= 2 的关键词数量: 9

获取关键词 'optimization' 的高引用文章...
找到与'optimization'相关的最近3年高引用文章: 5
找到与'optimization'相关的其他年份高引用文章: 1
分析报告已保存至: ./results/CFpathPlanning101_replaced_synonyms_20250512_13\optimization_report.html



正在分析关键词: topology optimisation

开始分析关键词: topology optimisation

开始为关键词 'topology optimisation' 创建关系图...
找到与 'topology optimisation' 共现的关键词数量: 41
共现次数 >= 2 的关键词数量: 7

获取关键词 'topology optimisation' 的高引用文章...
找到与'topology optimisation'相关的最近3年高引用文章: 3
找到与'topology optimisation'相关的其他年份高引用文章: 0
分析报告已保存至: ./results/CFpathPlanning101_replaced_synonyms_20250512_13\topology_optimisation_report.html
