#### 文献关键词画图



In [14]:
# 导入必要的库
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
from collections import defaultdict, Counter
import re
from IPython.display import IFrame, display, HTML
from pyecharts import options as opts
from pyecharts.charts import Graph, Bar, Map, Page, Tab
from pyecharts.commons.utils import JsCode
import webbrowser
import pycountry
import json
import json
from pyecharts.commons.utils import JsCode
import webbrowser

In [15]:
def load_data(file_path):
    """加载CSV数据文件"""
    df = pd.read_csv(file_path, encoding='utf-8')
    print(f"数据集大小: {df.shape}")
    print(f"数据集列名: {df.columns.tolist()}")
    return df

In [16]:
insight_keywords = ['continuous fibers', 'path planning method', 'robot programming', 'optimization', 'topology optimisation']

In [17]:
file_path = './results/CFpathPlanning101_20250510_17/CFpathPlanning101_replaced_synonyms.csv'
save_path = './results'
df = load_data(file_path)

数据集大小: (101, 26)
数据集列名: ['作者', 'Author full names', '作者 ID', '文献标题', '年份', '来源出版物名称', '卷', '期', '论文编号', '起始页码', '结束页码', '页码计数', '施引文献', 'DOI', '链接', '归属机构', '带归属机构的作者', '摘要', '作者关键字', '索引关键字', '通讯地址', '文献类型', '出版阶段', '开放获取', '来源出版物', 'EID']


In [18]:
base_filename = os.path.basename(file_path)
file_name_without_ext = os.path.splitext(base_filename)[0]
timestamp = datetime.now().strftime("%Y%m%d_%H")
output_dir = f"{save_path}/{file_name_without_ext}_{timestamp}"

if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"创建输出目录: {output_dir}")


In [19]:
# 1. 找到关键词相关的高引用文章
def get_top_cited_papers(df, keyword, recent_years=3, top_recent=5, top_all=5):
    """
    获取与特定关键词相关的高引用文章

    参数:
    - df: 包含文献数据的DataFrame
    - keyword: 要分析的目标关键词
    - recent_years: 最近几年的定义
    - top_recent: 最近几年要返回的高引用文章数
    - top_all: 所有年份要返回的高引用文章数（排除最近几年的文章）

    返回:
    - recent_papers: 最近几年的高引用文章
    - all_time_papers: 所有年份的高引用文章（排除最近几年）
    """
    # 确保数据包含必要的列
    required_columns = ['出版年', '被引频次', '作者关键字', '索引关键字', '标题', '作者', '来源出版物', 'DOI']
    for col in required_columns:
        if col not in df.columns:
            print(f"警告: 数据集中缺少'{col}'列")

    # 当前年份
    current_year = datetime.now().year

    # 筛选包含关键词的文章
    keyword_papers = []

    for _, row in df.iterrows():
        keywords = []

        # 从作者关键字和索引关键字中提取
        for col in ['作者关键字', '索引关键字']:
            if col in df.columns and isinstance(row[col], str) and row[col].strip():
                kws = [k.strip().lower() for k in re.split(r'[;,，；]', row[col]) if k.strip()]
                keywords.extend(kws)

        # 如果文章包含目标关键词
        if keyword.lower() in [k.lower() for k in keywords]:
            # 提取被引频次并转换为数字
            cited_freq = 0
            if '被引频次' in df.columns and pd.notna(row['被引频次']):
                try:
                    cited_freq = int(row['被引频次'])
                except (ValueError, TypeError):
                    # 如果转换失败，使用0
                    pass

            # 提取出版年
            pub_year = None
            if '出版年' in df.columns and pd.notna(row['出版年']):
                try:
                    pub_year = int(row['出版年'])
                except (ValueError, TypeError):
                    # 如果转换失败，使用None
                    pass

            # 只有当出版年有效时才添加文章
            if pub_year is not None:
                paper_info = {
                    '标题': row.get('标题', '无标题'),
                    '作者': row.get('作者', '未知作者'),
                    '来源': row.get('来源出版物', '未知来源'),
                    '年份': pub_year,
                    '被引频次': cited_freq,
                    'DOI': row.get('DOI', '')
                }
                keyword_papers.append(paper_info)

    # 区分最近几年的文章和所有年份的文章
    recent_cutoff_year = current_year - recent_years
    recent_papers = [p for p in keyword_papers if p['年份'] >= recent_cutoff_year]
    older_papers = [p for p in keyword_papers if p['年份'] < recent_cutoff_year]

    # 按被引频次排序
    recent_papers = sorted(recent_papers, key=lambda x: x['被引频次'], reverse=True)
    older_papers = sorted(older_papers, key=lambda x: x['被引频次'], reverse=True)

    # 获取前N个高引用文章
    top_recent_papers = recent_papers[:top_recent]
    top_older_papers = older_papers[:top_all]

    print(f"找到与'{keyword}'相关的最近{recent_years}年高引用文章: {len(top_recent_papers)}")
    print(f"找到与'{keyword}'相关的其他年份高引用文章: {len(top_older_papers)}")

    return top_recent_papers, top_older_papers

In [20]:
# 2. 找到发表论文最多的机构
def get_top_institutions(df, keyword, top_n=20):
    """
    获取与特定关键词相关的发表论文最多的机构

    参数:
    - df: 包含文献数据的DataFrame
    - keyword: 要分析的目标关键词
    - top_n: 返回的顶级机构数量

    返回:
    - institutions_data: 按论文数量排序的机构列表，包含机构名称、国家/地区和论文数量
    """
    # 检查必要的列
    if '通讯作者地址' not in df.columns and '作者地址' not in df.columns:
        print("警告: 数据集中缺少机构相关列")
        return []

    # 机构计数器
    institution_counter = Counter()
    institution_countries = {}

    # 筛选包含关键词的文章
    for _, row in df.iterrows():
        keywords = []

        # 从作者关键字和索引关键字中提取
        for col in ['作者关键字', '索引关键字']:
            if col in df.columns and isinstance(row[col], str) and row[col].strip():
                kws = [k.strip().lower() for k in re.split(r'[;,，；]', row[col]) if k.strip()]
                keywords.extend(kws)

        # 如果文章包含目标关键词
        if keyword.lower() in [k.lower() for k in keywords]:
            # 提取机构信息
            addr_cols = ['通讯作者地址', '作者地址']

            for col in addr_cols:
                if col in df.columns and isinstance(row[col], str) and row[col].strip():
                    # 分割地址信息（多个地址可能用分号分隔）
                    addresses = row[col].split(';')

                    for addr in addresses:
                        if addr.strip():
                            # 提取机构名称和国家/地区
                            parts = addr.strip().split(',')
                            if len(parts) >= 1:
                                # 第一部分通常是机构名称
                                institution = parts[0].strip()

                                # 最后一部分通常是国家/地区
                                country = parts[-1].strip() if len(parts) > 1 else "未知"

                                # 更新计数器
                                institution_counter[institution] += 1
                                institution_countries[institution] = country

    # 获取发表论文最多的机构
    top_institutions = institution_counter.most_common(top_n)

    # 格式化结果
    institutions_data = [
        {
            "name": inst,
            "country": institution_countries.get(inst, "未知"),
            "count": count
        }
        for inst, count in top_institutions
    ]

    print(f"找到与'{keyword}'相关的顶级机构: {len(institutions_data)}")

    return institutions_data


In [21]:
# 3. 地图数据处理函数
def process_geo_data(institutions_data):
    """处理机构数据，准备地图可视化"""
    # 国家计数
    country_counter = Counter()

    # 国家名称标准化映射（常见国家的简写或别名）
    country_mapping = {
        "USA": "United States",
        "UK": "United Kingdom",
        "U.K.": "United Kingdom",
        "U.S.A.": "United States",
        "P.R.China": "China",
        "P. R. China": "China",
        "PR China": "China",
        "Republic of China": "China",
        "People's Republic of China": "China",
        "Korea": "South Korea",
        "Republic of Korea": "South Korea",
    }

    # 统计各国家/地区的机构数量
    for inst in institutions_data:
        country = inst["country"].strip()

        # 应用国家名称映射
        if country in country_mapping:
            country = country_mapping[country]

        country_counter[country] += inst["count"]

    # 将国家名称映射到pyecharts支持的名称
    geo_data = []
    for country, count in country_counter.items():
        # 尝试找到标准国家名称
        try:
            # 尝试直接查找
            country_obj = pycountry.countries.get(name=country)

            if not country_obj:
                # 尝试模糊查找
                country_obj = pycountry.countries.search_fuzzy(country)[0]

            std_country = country_obj.name

            # 特殊处理某些国家名称，以匹配pyecharts地图
            if std_country == "United States":
                std_country = "United States of America"
            elif std_country == "Russian Federation":
                std_country = "Russia"

            geo_data.append((std_country, count))
        except (AttributeError, IndexError, LookupError):
            # 如果找不到匹配，保留原始名称
            geo_data.append((country, count))

    return geo_data

In [22]:
# 现在使用修改后的create_keyword_graph函数
def create_keyword_graph(df, target_keyword, output_dir=None, top_related=30, min_co_occurrence=2):
    """
    为目标关键词创建关系图

    参数:
    - df: 包含文献数据的DataFrame
    - target_keyword: 要分析的目标关键词
    - output_dir: 输出文件保存目录
    - top_related: 要包含的相关关键词数量
    - min_co_occurrence: 最小共现次数阈值

    返回:
    - 图表对象和数据
    """
    print(f"\n开始为关键词 '{target_keyword}' 创建关系图...")

    # 获取关键词与其他关键词的共现关系
    keyword_relations = defaultdict(Counter)

    # 分析每篇文献中关键词的共现情况
    for _, row in df.iterrows():
        # 获取文献的关键词列表
        keywords = []

        # 从作者关键字和索引关键字中提取
        for col in ['作者关键字', '索引关键字']:
            if col in df.columns and isinstance(row[col], str) and row[col].strip():
                # 分割关键词
                kws = [k.strip().lower() for k in re.split(r'[;,，；]', row[col]) if k.strip()]
                keywords.extend(kws)

        # 去重
        keywords = list(set(keywords))

        # 如果关键词列表中包含目标关键词
        if target_keyword.lower() in [k.lower() for k in keywords]:
            # 更新共现计数
            for i, kw1 in enumerate(keywords):
                for j, kw2 in enumerate(keywords):
                    if i != j:  # 避免自身与自身的关系
                        keyword_relations[kw1.lower()][kw2.lower()] += 1

    # 检查目标关键词是否被找到
    if target_keyword.lower() not in keyword_relations:
        print(f"在数据集中未找到关键词 '{target_keyword}'")
        return None, None

    # 获取与目标关键词共现最多的关键词
    related_keywords = keyword_relations[target_keyword.lower()]
    print(f"找到与 '{target_keyword}' 共现的关键词数量: {len(related_keywords)}")

    # 过滤掉共现次数低于阈值的关键词
    filtered_related = {k: v for k, v in related_keywords.items() if v >= min_co_occurrence}
    print(f"共现次数 >= {min_co_occurrence} 的关键词数量: {len(filtered_related)}")

    # 获取前N个最相关的关键词
    top_keywords = [k for k, _ in sorted(filtered_related.items(), key=lambda x: x[1], reverse=True)[:top_related]]

    # 添加目标关键词
    top_keywords = [target_keyword.lower()] + [k for k in top_keywords if k != target_keyword.lower()]

    # 构建节点和边
    nodes = []
    links = []

    # 添加目标关键词作为中心节点
    nodes.append({
        "id": target_keyword.lower(),
        "name": target_keyword,
        "symbolSize": 50,  # 较大的节点大小
        "value": sum(related_keywords.values()),  # 值为总共现次数
        "category": 0,  # 中心节点类别
        "itemStyle": {"color": "#FF0000"}  # 红色
    })

    # 添加相关关键词节点
    for i, kw in enumerate(top_keywords[1:], 1):  # 跳过中心节点
        # 计算节点大小（基于共现次数）
        size = max(20, min(40, 15 + filtered_related[kw] * 2))

        nodes.append({
            "id": kw,
            "name": kw,
            "symbolSize": size,
            "value": filtered_related[kw],
            "category": 1,  # 关联节点类别
        })

        # 添加与中心节点的连接
        links.append({
            "source": target_keyword.lower(),
            "target": kw,
            "value": filtered_related[kw]
        })

    # 添加关键词之间的连接
    for i, kw1 in enumerate(top_keywords[1:], 1):
        for j, kw2 in enumerate(top_keywords[1:], 1):
            if i < j:  # 避免重复连接
                co_occurrence = keyword_relations[kw1][kw2]
                if co_occurrence >= min_co_occurrence:
                    links.append({
                        "source": kw1,
                        "target": kw2,
                        "value": co_occurrence
                    })

    # 创建类别
    categories = [
        {"name": "中心关键词"},
        {"name": "相关关键词"}
    ]

    # 创建ECharts图表
    graph = (
        Graph(init_opts=opts.InitOpts(width="900px", height="700px"))
        .add(
            "",
            nodes=nodes,
            links=links,
            categories=categories,
            layout="force",
            is_roam=True,
            is_focusnode=True,
            is_rotate_label=True,
            is_draggable=True,
            linestyle_opts=opts.LineStyleOpts(width=1.5, curve=0.3, opacity=0.8),
            label_opts=opts.LabelOpts(is_show=True, position="right", font_size=12),
            repulsion=800,
            edge_symbol=["none", "arrow"],
            gravity=0.2,
            edge_length=120,
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title=f"关键词 '{target_keyword}' 关系图"),
            tooltip_opts=opts.TooltipOpts(
                trigger="item",
                formatter="{a} <br/>{b} : {c}"
            ),
            legend_opts=opts.LegendOpts(
                orient="vertical",
                pos_left="2%",
                pos_top="20%",
                is_show=True
            )
        )
    )

    # 返回图表对象和数据
    graph_data = {
        "nodes": nodes,
        "links": links,
        "categories": categories
    }

    return graph, graph_data


In [23]:
# 4. 创建高引用文章表格
def create_cited_papers_chart(recent_papers, older_papers, keyword):
    """创建高引用文章的表格图表"""
    # 准备表格数据
    table_html = f"""
    <div style="padding: 20px;">
        <h2>关键词 '{keyword}' 的高引用文章</h2>

        <h3>最近3年的高引用文章</h3>
        <table style="width:100%; border-collapse: collapse; margin-bottom: 30px;">
            <thead>
                <tr style="background-color: #f2f2f2;">
                    <th style="border: 1px solid #ddd; padding: 12px; text-align: left;">标题</th>
                    <th style="border: 1px solid #ddd; padding: 12px; text-align: left;">作者</th>
                    <th style="border: 1px solid #ddd; padding: 12px; text-align: left;">来源</th>
                    <th style="border: 1px solid #ddd; padding: 12px; text-align: center;">年份</th>
                    <th style="border: 1px solid #ddd; padding: 12px; text-align: center;">被引频次</th>
                    <th style="border: 1px solid #ddd; padding: 12px; text-align: center;">DOI</th>
                </tr>
            </thead>
            <tbody>
    """

    # 添加最近文章
    for paper in recent_papers:
        doi_link = f'<a href="https://doi.org/{paper["DOI"]}" target="_blank">{paper["DOI"]}</a>' if paper["DOI"] else ""
        table_html += f"""
                <tr>
                    <td style="border: 1px solid #ddd; padding: 12px; text-align: left;">{paper["标题"]}</td>
                    <td style="border: 1px solid #ddd; padding: 12px; text-align: left;">{paper["作者"]}</td>
                    <td style="border: 1px solid #ddd; padding: 12px; text-align: left;">{paper["来源"]}</td>
                    <td style="border: 1px solid #ddd; padding: 12px; text-align: center;">{paper["年份"]}</td>
                    <td style="border: 1px solid #ddd; padding: 12px; text-align: center;">{paper["被引频次"]}</td>
                    <td style="border: 1px solid #ddd; padding: 12px; text-align: center;">{doi_link}</td>
                </tr>
        """

    # 添加所有年份文章标题
    table_html += """
            </tbody>
        </table>

        <h3>所有年份的高引用文章（除最近3年外）</h3>
        <table style="width:100%; border-collapse: collapse;">
            <thead>
                <tr style="background-color: #f2f2f2;">
                    <th style="border: 1px solid #ddd; padding: 12px; text-align: left;">标题</th>
                    <th style="border: 1px solid #ddd; padding: 12px; text-align: left;">作者</th>
                    <th style="border: 1px solid #ddd; padding: 12px; text-align: left;">来源</th>
                    <th style="border: 1px solid #ddd; padding: 12px; text-align: center;">年份</th>
                    <th style="border: 1px solid #ddd; padding: 12px; text-align: center;">被引频次</th>
                    <th style="border: 1px solid #ddd; padding: 12px; text-align: center;">DOI</th>
                </tr>
            </thead>
            <tbody>
    """

    # 添加旧文章
    for paper in older_papers:
        doi_link = f'<a href="https://doi.org/{paper["DOI"]}" target="_blank">{paper["DOI"]}</a>' if paper["DOI"] else ""
        table_html += f"""
                <tr>
                    <td style="border: 1px solid #ddd; padding: 12px; text-align: left;">{paper["标题"]}</td>
                    <td style="border: 1px solid #ddd; padding: 12px; text-align: left;">{paper["作者"]}</td>
                    <td style="border: 1px solid #ddd; padding: 12px; text-align: left;">{paper["来源"]}</td>
                    <td style="border: 1px solid #ddd; padding: 12px; text-align: center;">{paper["年份"]}</td>
                    <td style="border: 1px solid #ddd; padding: 12px; text-align: center;">{paper["被引频次"]}</td>
                    <td style="border: 1px solid #ddd; padding: 12px; text-align: center;">{doi_link}</td>
                </tr>
        """

    table_html += """
            </tbody>
        </table>
    </div>
    """

    return table_html


In [24]:
# 5. 创建机构分布地图和柱状图
def create_institutions_chart(institutions_data, keyword):
    """创建机构分布地图和柱状图"""
    # 处理地理数据
    geo_data = process_geo_data(institutions_data)

    # 准备Bar图数据
    inst_names = []
    inst_counts = []
    inst_tooltips = []

    # 截取机构名称，保留完整信息作为tooltip
    for inst in institutions_data[:20]:  # 只显示前20个机构
        # 截取机构名称（如果太长）
        full_name = inst["name"]
        short_name = full_name[:20] + "..." if len(full_name) > 20 else full_name

        inst_names.append(short_name)
        inst_counts.append(inst["count"])
        inst_tooltips.append(full_name)

    # 创建Bar图
    bar = (
        Bar(init_opts=opts.InitOpts(width="900px", height="500px"))
        .add_xaxis(inst_names)
        .add_yaxis("论文数量", inst_counts)
        .set_global_opts(
            title_opts=opts.TitleOpts(title=f"关键词 '{keyword}' 的顶级机构"),
            tooltip_opts=opts.TooltipOpts(
                trigger="axis",
                formatter=JsCode(
                    """
                    function(params) {
                        var tooltips = {tooltips};
                        var index = params[0].dataIndex;
                        var fullName = tooltips[index];
                        return fullName + '<br/>论文数量：' + params[0].value;
                    }
                    """.replace("{tooltips}", json.dumps(inst_tooltips))
                )
            ),
            xaxis_opts=opts.AxisOpts(
                axislabel_opts=opts.LabelOpts(rotate=45, font_size=10)
            ),
            datazoom_opts=[
                opts.DataZoomOpts(type_="slider", range_start=0, range_end=100)
            ],
        )
        .set_series_opts(
            label_opts=opts.LabelOpts(is_show=False),
            # 添加点击事件复制全名
            itemstyle_opts=opts.ItemStyleOpts(
                color=JsCode(
                    """
                    function(params) {
                        var colorList = ['#c23531','#2f4554','#61a0a8','#d48265','#91c7ae',
                            '#749f83','#ca8622','#bda29a','#6e7074','#546570','#c4ccd3'];
                        return colorList[params.dataIndex % colorList.length];
                    }
                    """
                )
            ),
        )
    )

    # 尝试创建地图
    if geo_data:
        try:
            # 只取前15个国家，避免地图过于拥挤
            top_geo_data = sorted(geo_data, key=lambda x: x[1], reverse=True)[:15]

            world_map = (
                Map(init_opts=opts.InitOpts(width="900px", height="500px"))
                .add(
                    "论文数量",
                    top_geo_data,
                    "world",
                    is_map_symbol_show=False,
                )
                .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
                .set_global_opts(
                    title_opts=opts.TitleOpts(title=f"关键词 '{keyword}' 的国家分布"),
                    visualmap_opts=opts.VisualMapOpts(
                        min_=min([d[1] for d in top_geo_data]),
                        max_=max([d[1] for d in top_geo_data]),
                        is_piecewise=True,
                        range_color=["#E0F7FA", "#80DEEA", "#26C6DA", "#00ACC1", "#00838F", "#006064"],
                    ),
                    tooltip_opts=opts.TooltipOpts(
                        trigger="item",
                        formatter="{b}: {c}"
                    ),
                )
            )

            # 将地图和柱状图组合在一起
            page = Page(layout=Page.SimplePageLayout)
            page.add(world_map, bar)

            return page
        except Exception as e:
            print(f"创建地图时出错: {e}")
            return bar

    return bar

In [25]:
# 6. 创建完整的HTML报告
def create_keyword_report(df, keyword, output_dir):
    """
    创建关键词的完整HTML报告，包含关系图、高引用文章和机构分布

    参数:
    - df: 包含文献数据的DataFrame
    - keyword: 要分析的目标关键词
    - output_dir: 输出文件保存目录

    返回:
    - output_file: 生成的HTML报告文件路径
    """
    print(f"\n开始为关键词 '{keyword}' 创建综合报告...")

    # 1. 生成关键词关系图
    graph, graph_data = create_keyword_graph(df, keyword)

    # 2. 获取高引用文章
    recent_papers, older_papers = get_top_cited_papers(df, keyword)

    # 3. 获取顶级机构
    institutions_data = get_top_institutions(df, keyword)

    # 如果没有找到相关数据，返回None
    if not graph or not graph_data:
        print(f"警告: 未能为关键词 '{keyword}' 生成关系图")
        return None

    # 创建高引用文章HTML
    papers_html = create_cited_papers_chart(recent_papers, older_papers, keyword)

    # 创建机构图表
    institutions_chart = create_institutions_chart(institutions_data, keyword)

    # 准备HTML文件
    file_name = keyword.replace(" ", "_").replace("/", "_")
    output_file = os.path.join(output_dir, f"{file_name}_report.html")

    # 把图表转换为JSON数据，以便在HTML中使用
    graph_json = json.dumps(graph_data)

    # 机构数据
    inst_data = []
    for inst in institutions_data[:20]:  # 只显示前20个机构
        inst_data.append({
            "name": inst["name"],
            "country": inst["country"],
            "count": inst["count"]
        })

    inst_json = json.dumps(inst_data)

    # 文章数据
    recent_papers_json = json.dumps(recent_papers)
    older_papers_json = json.dumps(older_papers)


    # 创建HTML内容
    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="UTF-8">
        <title>关键词 '{keyword}' 分析报告</title>
        <script src="https://cdn.jsdelivr.net/npm/echarts@5.4.3/dist/echarts.min.js"></script>
        <style>
            body {{
                font-family: Arial, sans-serif;
                margin: 0;
                padding: 0;
                background-color: #f5f5f5;
            }}
            .container {{
                max-width: 1200px;
                margin: 0 auto;
                padding: 20px;
                background-color: white;
                box-shadow: 0 0 10px rgba(0,0,0,0.1);
            }}
            .header {{
                text-align: center;
                padding: 20px 0;
                border-bottom: 1px solid #eee;
                margin-bottom: 30px;
            }}
            .tab-container {{
                width: 100%;
            }}
            .tab-buttons {{
                display: flex;
                margin-bottom: 20px;
                border-bottom: 1px solid #ddd;
            }}
            .tab-button {{
                padding: 15px 20px;
                background-color: #f1f1f1;
                border: none;
                cursor: pointer;
                font-size: 16px;
                margin-right: 5px;
                transition: background-color 0.3s;
                border-radius: 5px 5px 0 0;
            }}
            .tab-button:hover {{
                background-color: #ddd;
            }}
            .tab-button.active {{
                background-color: #4CAF50;
                color: white;
            }}
            .tab-content {{
                display: none;
                padding: 20px;
                border: 1px solid #ddd;
                border-top: none;
                animation: fadeIn 0.5s;
            }}
            .tab-content.active {{
                display: block;
            }}
            .chart-container {{
                width: 100%;
                height: 700px;
                margin-top: 20px;
            }}
            @keyframes fadeIn {{
                from {{ opacity: 0; }}
                to {{ opacity: 1; }}
            }}
            .tooltip {{
                position: relative;
                display: inline-block;
                cursor: pointer;
            }}
            .institution-name {{
                cursor: pointer;
                text-decoration: underline dotted;
            }}
            #copyMsg {{
                display: none;
                position: fixed;
                top: 50%;
                left: 50%;
                transform: translate(-50%, -50%);
                background-color: rgba(0,0,0,0.7);
                color: white;
                padding: 10px 20px;
                border-radius: 5px;
                z-index: 1000;
            }}
        </style>
    </head>
    <body>
        <div class="container">
            <div class="header">
                <h1>关键词 '{keyword}' 分析报告</h1>
            </div>

            <div class="tab-container">
                <div class="tab-buttons">
                    <button class="tab-button active" onclick="openTab(event, 'tab1')">关键词关系图</button>
                    <button class="tab-button" onclick="openTab(event, 'tab2')">高引用文章</button>
                    <button class="tab-button" onclick="openTab(event, 'tab3')">机构分布</button>
                </div>

                <div id="tab1" class="tab-content active">
                    <div id="relationGraph" class="chart-container"></div>
                </div>

                <div id="tab2" class="tab-content">
                    {papers_html}
                </div>

                <div id="tab3" class="tab-content">
                    <div class="chart-container">
                        <div id="mapChart" style="width: 100%; height: 45%;"></div>
                        <div id="barChart" style="width: 100%; height: 45%; margin-top: 20px;"></div>
                    </div>

                    <div style="margin-top: 30px;">
                        <h3>机构列表</h3>
                        <table style="width:100%; border-collapse: collapse;">
                            <thead>
                                <tr style="background-color: #f2f2f2;">
                                    <th style="border: 1px solid #ddd; padding: 12px; text-align: left;">机构</th>
                                    <th style="border: 1px solid #ddd; padding: 12px; text-align: left;">国家/地区</th>
                                    <th style="border: 1px solid #ddd; padding: 12px; text-align: center;">论文数量</th>
                                </tr>
                            </thead>
                            <tbody id="institutionsTable"></tbody>
                        </table>
                    </div>
                </div>
            </div>
        </div>

        <div id="copyMsg">已复制到剪贴板</div>

        <script>
            // 存储数据
            const graphData = {graph_json};
            const instData = {inst_json};
            const recentPapers = {recent_papers_json};
            const olderPapers = {older_papers_json};

            // 初始化图表
            function initCharts() {{
                // 1. 关系图
                const relationGraph = echarts.init(document.getElementById('relationGraph'));

                const option = {{
                    title: {{
                        text: '关键词 \'{keyword}\' 关系图'
                    }},
                    tooltip: {{
                        trigger: 'item',
                        formatter: '{{a}} <br/>{{b}} : {{c}}'
                    }},
                    legend: {{
                        orient: 'vertical',
                        left: '2%',
                        top: '20%',
                        data: graphData.categories.map(a => a.name)
                    }},
                    series: [
                        {{
                            type: 'graph',
                            layout: 'force',
                            data: graphData.nodes,
                            links: graphData.links,
                            categories: graphData.categories,
                            roam: true,
                            draggable: true,
                            label: {{
                                show: true,
                                position: 'right',
                                fontSize: 12
                            }},
                            lineStyle: {{
                                width: 1.5,
                                curveness: 0.3,
                                opacity: 0.8
                            }},
                            force: {{
                                repulsion: 800,
                                gravity: 0.2,
                                edgeLength: 120
                            }},
                            edgeSymbol: ['none', 'arrow']
                        }}
                    ]
                }};

                relationGraph.setOption(option);

                // 2. 世界地图和柱状图
                const mapChart = echarts.init(document.getElementById('mapChart'));
                const barChart = echarts.init(document.getElementById('barChart'));

                // 处理国家数据
                const countryData = [];
                const countryCount = {{}};

                instData.forEach(inst => {{
                    if (!countryCount[inst.country]) {{
                        countryCount[inst.country] = 0;
                    }}
                    countryCount[inst.country] += inst.count;
                }});

                for (const [country, count] of Object.entries(countryCount)) {{
                    countryData.push({{
                        name: country,
                        value: count
                    }});
                }}

                // 处理柱状图数据
                const instNames = [];
                const instCounts = [];
                const tooltips = [];

                instData.forEach(inst => {{
                    const shortName = inst.name.length > 20 ? inst.name.substring(0, 20) + '...' : inst.name;
                    instNames.push(shortName);
                    instCounts.push(inst.count);
                    tooltips.push(inst.name);
                }});

                // 排序后的国家数据
                const sortedCountryData = [...countryData].sort((a, b) => b.value - a.value).slice(0, 15);

                // 世界地图
                mapChart.setOption({{
                    title: {{
                        text: '关键词 \'{keyword}\' 的国家分布',
                        left: 'center'
                    }},
                    tooltip: {{
                        trigger: 'item',
                        formatter: '{{b}}: {{c}}'
                    }},
                    visualMap: {{
                        type: 'piecewise',
                        left: 'right',
                        min: Math.min(...sortedCountryData.map(d => d.value)),
                        max: Math.max(...sortedCountryData.map(d => d.value)),
                        inRange: {{
                            color: ['#E0F7FA', '#80DEEA', '#26C6DA', '#00ACC1', '#00838F', '#006064']
                        }}
                    }},
                    series: [
                        {{
                            name: '论文数量',
                            type: 'map',
                            map: 'world',
                            roam: true,
                            emphasis: {{
                                label: {{
                                    show: true
                                }}
                            }},
                            data: sortedCountryData
                        }}
                    ]
                }});

                // 机构柱状图
                barChart.setOption({{
                    title: {{
                        text: '关键词 \'{keyword}\' 的顶级机构',
                        left: 'center'
                    }},
                    tooltip: {{
                        trigger: 'axis',
                        formatter: function(params) {{
                            const index = params[0].dataIndex;
                            return tooltips[index] + '<br/>论文数量：' + params[0].value;
                        }}
                    }},
                    grid: {{
                        left: '5%',
                        right: '5%',
                        bottom: '15%',
                        containLabel: true
                    }},
                    xAxis: {{
                        type: 'category',
                        data: instNames,
                        axisLabel: {{
                            rotate: 45,
                            fontSize: 10
                        }}
                    }},
                    yAxis: {{
                        type: 'value'
                    }},
                    dataZoom: [
                        {{
                            type: 'slider',
                            start: 0,
                            end: 100
                        }}
                    ],
                    series: [
                        {{
                            name: '论文数量',
                            type: 'bar',
                            data: instCounts,
                            itemStyle: {{
                                color: function(params) {{
                                    const colorList = ['#c23531','#2f4554','#61a0a8','#d48265','#91c7ae',
                                        '#749f83','#ca8622','#bda29a','#6e7074','#546570','#c4ccd3'];
                                    return colorList[params.dataIndex % colorList.length];
                                }}
                            }}
                        }}
                    ]
                }});

                // 窗口大小改变时调整图表大小
                window.addEventListener('resize', function() {{
                    relationGraph.resize();
                    mapChart.resize();
                    barChart.resize();
                }});

                // 填充机构表格
                const tableBody = document.getElementById('institutionsTable');
                instData.forEach(inst => {{
                    const row = document.createElement('tr');

                    const nameCell = document.createElement('td');
                    nameCell.style.border = '1px solid #ddd';
                    nameCell.style.padding = '12px';
                    nameCell.style.textAlign = 'left';
                    nameCell.innerHTML = `<span class="institution-name" onclick="copyText('${{inst.name}}')">${{inst.name}}</span>`;

                    const countryCell = document.createElement('td');
                    countryCell.style.border = '1px solid #ddd';
                    countryCell.style.padding = '12px';
                    countryCell.style.textAlign = 'left';
                    countryCell.textContent = inst.country;

                    const countCell = document.createElement('td');
                    countCell.style.border = '1px solid #ddd';
                    countCell.style.padding = '12px';
                    countCell.style.textAlign = 'center';
                    countCell.textContent = inst.count;

                    row.appendChild(nameCell);
                    row.appendChild(countryCell);
                    row.appendChild(countCell);

                    tableBody.appendChild(row);
                }});
            }}

            // 切换选项卡
            function openTab(evt, tabName) {{
                const tabContents = document.getElementsByClassName("tab-content");
                for (let i = 0; i < tabContents.length; i++) {{
                    tabContents[i].classList.remove("active");
                }}

                const tabButtons = document.getElementsByClassName("tab-button");
                for (let i = 0; i < tabButtons.length; i++) {{
                    tabButtons[i].classList.remove("active");
                }}

                document.getElementById(tabName).classList.add("active");
                evt.currentTarget.classList.add("active");

                // 确保图表正确显示
                window.dispatchEvent(new Event('resize'));
            }}

            // 复制文本到剪贴板
            function copyText(text) {{
                navigator.clipboard.writeText(text).then(() => {{
                    const copyMsg = document.getElementById('copyMsg');
                    copyMsg.style.display = 'block';
                    setTimeout(() => {{
                        copyMsg.style.display = 'none';
                    }}, 2000);
                }});
            }}

            // 页面加载后初始化
            window.onload = function() {{
                initCharts();
            }};
        </script>
    </body>
    </html>
    """

    # 保存HTML文件
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html_content)

    print(f"综合报告已保存至: {output_file}")

    return output_file

In [26]:
# 主函数：分析多个关键词并生成报告
def analyze_keywords(df, keywords, output_dir):
    """
    分析多个关键词并为每个关键词生成HTML报告

    参数:
    - df: 包含文献数据的DataFrame
    - keywords: 关键词列表
    - output_dir: 输出目录

    返回:
    - report_files: 生成的报告文件路径列表
    """
    report_files = []

    for keyword in keywords:
        print(f"\n\n{'='*60}\n开始分析关键词: {keyword}\n{'='*60}")

        # 创建报告
        report_file = create_keyword_report(df, keyword, output_dir)

        if report_file:
            report_files.append(report_file)
            print(f"关键词 '{keyword}' 的报告已完成!")

            # 尝试自动打开生成的HTML文件
            try:
                webbrowser.open('file://' + os.path.abspath(report_file))
            except:
                print("无法自动打开报告文件，请手动打开。")
        else:
            print(f"无法为关键词 '{keyword}' 创建报告。")

    return report_files

In [27]:

# 封装数据加载功能
def load_data(file_path):
    """加载CSV数据文件"""
    try:
        df = pd.read_csv(file_path, encoding='utf-8')
        print(f"成功加载数据: {file_path}")
        print(f"数据集包含 {len(df)} 条记录和 {len(df.columns)} 个字段")
        return df
    except Exception as e:
        print(f"加载数据失败: {e}")
        return None

In [28]:
report_files = analyze_keywords(df, insight_keywords, output_dir)



开始分析关键词: continuous fibers

开始为关键词 'continuous fibers' 创建综合报告...

开始为关键词 'continuous fibers' 创建关系图...
找到与 'continuous fibers' 共现的关键词数量: 122
共现次数 >= 2 的关键词数量: 20
警告: 数据集中缺少'出版年'列
警告: 数据集中缺少'被引频次'列
警告: 数据集中缺少'标题'列
找到与'continuous fibers'相关的最近3年高引用文章: 0
找到与'continuous fibers'相关的其他年份高引用文章: 0
警告: 数据集中缺少机构相关列
综合报告已保存至: ./results/CFpathPlanning101_replaced_synonyms_20250512_11\continuous_fibers_report.html
关键词 'continuous fibers' 的报告已完成!


开始分析关键词: path planning method

开始为关键词 'path planning method' 创建综合报告...

开始为关键词 'path planning method' 创建关系图...
找到与 'path planning method' 共现的关键词数量: 185
共现次数 >= 2 的关键词数量: 27
警告: 数据集中缺少'出版年'列
警告: 数据集中缺少'被引频次'列
警告: 数据集中缺少'标题'列
找到与'path planning method'相关的最近3年高引用文章: 0
找到与'path planning method'相关的其他年份高引用文章: 0
警告: 数据集中缺少机构相关列
综合报告已保存至: ./results/CFpathPlanning101_replaced_synonyms_20250512_11\path_planning_method_report.html
关键词 'path planning method' 的报告已完成!


开始分析关键词: robot programming

开始为关键词 'robot programming' 创建综合报告...

开始为关键词 'robot programming' 创建关系图...
