#### 文献关键词画图

- 输入为keywords_extract处理后的_replaced_synonyms.csv
- 并且指定关键词
- 绘制指定关键词相关的关键词图，并且查询top cite论文

In [25]:
# 导入必要的库
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from collections import Counter, defaultdict
import re
from wordcloud import WordCloud
import matplotlib.colors as mcolors
from matplotlib.font_manager import FontProperties
from pyecharts import options as opts
from pyecharts.charts import Graph
from pyecharts.globals import ThemeType
from IPython.display import display, HTML, IFrame
import json
import warnings

warnings.filterwarnings('ignore')

In [26]:
def load_data(file_path):
    """Read the UTF-8 CSV at ``file_path`` into a DataFrame.

    The frame's shape and column names are printed before it is returned.
    """
    frame = pd.read_csv(file_path, encoding='utf-8')
    shape, columns = frame.shape, frame.columns.tolist()
    print(f"数据集大小: {shape}")
    print(f"数据集列名: {columns}")
    return frame

In [27]:
# Keywords to analyse: each entry is fed to the relation-graph loop and to the
# top-cited-papers report further down in this notebook.
insight_keywords = ['machine learning models']

In [28]:
# Input CSV: the synonym-replaced keyword export for this dataset.
file_path = './results/4DPrinting3147/4DPrinting3147_replaced_synonyms.csv'
# NOTE(review): save_path is not referenced anywhere else in the visible cells.
save_path = './results'
# Load once; the resulting DataFrame is shared by every analysis cell below.
df = load_data(file_path)

数据集大小: (3147, 26)
数据集列名: ['作者', 'Author full names', '作者 ID', '文献标题', '年份', '来源出版物名称', '卷', '期', '论文编号', '起始页码', '结束页码', '页码计数', '施引文献', 'DOI', '链接', '归属机构', '带归属机构的作者', '摘要', '作者关键字', '索引关键字', '通讯地址', '文献类型', '出版阶段', '开放获取', '来源出版物', 'EID']


In [29]:
# Output folders live next to the input CSV. Fall back to the current
# directory so that a bare file name (empty dirname) does not turn the
# f-strings below into the absolute path "/Specific".
directory_path = os.path.dirname(file_path) or '.'
output_dir = f"{directory_path}/Specific"

relation_graph_dir = f"{output_dir}/Relation_Graph"
topcite_dir = f"{output_dir}/TopCite"

# exist_ok=True removes the check-then-create race and keeps re-running the
# cell idempotent; the loop variable is named so it does not shadow the
# builtin ``dir``.
for target_dir in (output_dir, relation_graph_dir, topcite_dir):
    os.makedirs(target_dir, exist_ok=True)


In [30]:
# Build and save the co-occurrence relation graph for one target keyword
def create_keyword_graph(df, target_keyword, output_dir, top_related=30, min_co_occurrence=2):
    """
    Render a force-layout co-occurrence graph centred on ``target_keyword``.

    Keywords are read from the '作者关键字' and '索引关键字' columns (whichever
    exist), split on ASCII/full-width commas and semicolons, stripped and
    lower-cased. Only documents that list the target keyword together with at
    least one other keyword contribute to the co-occurrence counts.

    Parameters:
    - df: DataFrame holding the bibliographic records
    - target_keyword: keyword placed at the centre of the graph (matched
      case-insensitively, surrounding whitespace ignored)
    - output_dir: directory the HTML file is written to
    - top_related: maximum number of related keywords to include
    - min_co_occurrence: minimum co-occurrence count for a related keyword or
      for an edge between two related keywords

    Returns:
    - path of the rendered HTML file, or None when the target keyword never
      co-occurs with another keyword in the data
    """
    print(f"\n开始为关键词 '{target_keyword}' 创建关系图...")

    # Normalise once instead of on every comparison
    target = target_keyword.strip().lower()
    keyword_cols = [col for col in ('作者关键字', '索引关键字') if col in df.columns]

    # keyword -> Counter of the keywords it co-occurs with
    keyword_relations = defaultdict(Counter)

    for _, row in df.iterrows():
        keywords = []
        for col in keyword_cols:
            cell = row[col]
            if isinstance(cell, str) and cell.strip():
                keywords.extend(
                    k.strip().lower() for k in re.split(r'[;,，；]', cell) if k.strip()
                )

        # Order-preserving de-duplication: unlike set(), this keeps the
        # counting order (and therefore the result) identical between runs.
        keywords = list(dict.fromkeys(keywords))

        # A document that lists only the target keyword has no pairs to count
        if target in keywords and len(keywords) > 1:
            for kw in keywords:
                keyword_relations[kw].update(other for other in keywords if other != kw)

    # Check whether the target keyword was found at all
    if target not in keyword_relations:
        print(f"在数据集中未找到关键词 '{target_keyword}'")
        return None

    related_keywords = keyword_relations[target]
    print(f"找到与 '{target_keyword}' 共现的关键词数量: {len(related_keywords)}")

    # Drop keywords below the co-occurrence threshold
    filtered_related = {k: v for k, v in related_keywords.items() if v >= min_co_occurrence}
    print(f"共现次数 >= {min_co_occurrence} 的关键词数量: {len(filtered_related)}")

    # Highest count first; ties are broken alphabetically so the top-N cut is
    # deterministic.
    ranked = sorted(filtered_related.items(), key=lambda item: (-item[1], item[0]))
    top_keywords = [kw for kw, _ in ranked[:top_related]]

    # Centre node: the target keyword itself
    nodes = [{
        "id": target,
        "name": target_keyword,
        "symbolSize": 50,  # larger than any related node
        "value": sum(related_keywords.values()),  # total co-occurrence count
        "category": 0,  # centre-node category
        "itemStyle": {"color": "#FF0000"}  # red
    }]
    links = []

    for kw in top_keywords:
        count = filtered_related[kw]
        nodes.append({
            "id": kw,
            "name": kw,
            # Node size grows with the count, clamped to the range 20..40
            "symbolSize": max(20, min(40, 15 + count * 2)),
            "value": count,
            "category": 1,  # related-node category
        })
        # Edge from the centre node to the related keyword
        links.append({
            "source": target,
            "target": kw,
            "value": count
        })

    # Edges between related keywords, each unordered pair visited once
    for i, kw1 in enumerate(top_keywords):
        for kw2 in top_keywords[i + 1:]:
            co_occurrence = keyword_relations[kw1][kw2]
            if co_occurrence >= min_co_occurrence:
                links.append({
                    "source": kw1,
                    "target": kw2,
                    "value": co_occurrence
                })

    categories = [
        {"name": "中心关键词"},
        {"name": "相关关键词"}
    ]

    # Build the ECharts graph
    c = (
        Graph(init_opts=opts.InitOpts(width="900px", height="700px", theme=ThemeType.LIGHT))
        .add(
            "",
            nodes=nodes,
            links=links,
            categories=categories,
            layout="force",
            is_roam=True,
            is_focusnode=True,
            is_rotate_label=True,
            is_draggable=True,
            linestyle_opts=opts.LineStyleOpts(width=1.5, curve=0.3, opacity=0.8),
            label_opts=opts.LabelOpts(is_show=True, position="right", font_size=12),
            repulsion=800,
            edge_symbol=["none", "arrow"],
            gravity=0.2,
            edge_length=120,
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title=f"关键词 '{target_keyword}' 关系图"),
            tooltip_opts=opts.TooltipOpts(
                trigger="item",
                formatter="{a} <br/>{b} : {c}"
            ),
            legend_opts=opts.LegendOpts(
                orient="vertical",
                pos_left="2%",
                pos_top="20%",
                is_show=True
            )
        )
    )

    # Keywords such as "3d/4d printing" contain characters that are not valid
    # in a file name; replace them so the output path stays inside output_dir.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', target_keyword.replace(' ', '_'))
    output_file = os.path.join(output_dir, f"{safe_name}_relation_graph.html")
    c.render(output_file)
    print(f"关系图已保存至: {output_file}")

    # The caller is responsible for displaying the file: a bare IFrame(...)
    # expression inside a function is never rendered by Jupyter.
    return output_file

In [31]:
# Analyse a single keyword
def analyze_keyword(df, keyword, output_dir):
    """Print a banner, build the relation graph for ``keyword`` and wrap it for display.

    Returns an ``IFrame`` pointing at the rendered HTML file, or ``None`` when
    ``create_keyword_graph`` produced no file.
    """
    banner = '=' * 50
    print(f"\n{banner}\n开始分析关键词: {keyword}\n{banner}")

    html_path = create_keyword_graph(df, keyword, output_dir)
    if not html_path:
        return None
    return IFrame(html_path, width=1000, height=800)

In [32]:
# Analyse a list of keywords
def analyze_keywords(df, keywords_list, output_dir):
    """Run ``analyze_keyword`` for each keyword in order.

    Returns a dict mapping every keyword to whatever ``analyze_keyword``
    returned for it.
    """
    return {kw: analyze_keyword(df, kw, output_dir) for kw in keywords_list}

In [33]:
# Interactive entry point for a single keyword
def run_analysis_for_keyword():
    """Prompt for one keyword and analyse it.

    Uses the module-level ``df`` and ``output_dir``. Returns the result of
    ``analyze_keyword``, or ``None`` when the input is blank.
    """
    entered = input("请输入要分析的关键词: ").strip()
    if not entered:
        print("未输入关键词")
        return None
    return analyze_keyword(df, entered, output_dir)

In [34]:
# Build one relation graph per keyword and show each as soon as it is ready.
# Displaying inside the loop shows every keyword's graph (not only the last
# one) and avoids touching an undefined name when the list is empty.
for keyword in insight_keywords:
    print(f"\n正在分析关键词: {keyword}")
    result = analyze_keyword(df, keyword, relation_graph_dir)
    if result is not None:
        display(result)  # render the graph in Jupyter


正在分析关键词: machine learning models

开始分析关键词: machine learning models

开始为关键词 'machine learning models' 创建关系图...
找到与 'machine learning models' 共现的关键词数量: 768
共现次数 >= 2 的关键词数量: 171
关系图已保存至: ./results/4DPrinting3147/Specific/Relation_Graph\machine_learning_models_relation_graph.html


In [35]:
def analyze_top_cited_papers(df, keywords, output_dir, current_year=2025):
    """
    Print the most-cited papers for each keyword.

    A paper matches a keyword when the keyword occurs (case-insensitively, as a
    substring) in its abstract, title, author keywords or index keywords;
    columns that are missing from ``df`` are skipped. Matches are split into
    the three years ending at ``current_year`` and everything else, and the
    five most-cited papers of each group are printed.

    Parameters:
    - df: DataFrame holding the bibliographic records; it is not modified
    - keywords: keywords to analyse
    - output_dir: accepted for signature compatibility; nothing is written
    - current_year: last year of the "recent" three-year window

    Returns:
    - dict mapping each keyword that had matches to
      {'recent': DataFrame, 'older': DataFrame}
    """
    # Work on a copy so the caller's DataFrame keeps its original columns
    papers = df.copy()
    papers['施引文献'] = pd.to_numeric(papers['施引文献'], errors='coerce').fillna(0).astype(int)
    papers['年份'] = pd.to_numeric(papers['年份'], errors='coerce')

    search_cols = [c for c in ('摘要', '文献标题', '作者关键字', '索引关键字') if c in papers.columns]
    recent_years = list(range(current_year - 2, current_year + 1))

    results = {}

    for keyword in keywords:
        print(f"\n{'-'*70}")
        print(f"分析关键词 '{keyword}' 的高引用文章:")
        print(f"{'-'*70}")

        # Row-wise OR over every searchable text column
        needle = keyword.lower()
        matches = np.zeros(len(papers), dtype=bool)
        for col in search_cols:
            matches |= np.array(
                [isinstance(v, str) and needle in v.lower() for v in papers[col]],
                dtype=bool,
            )

        keyword_df = papers[matches]
        if keyword_df.empty:
            print(f"没有找到包含关键词 '{keyword}' 的文章")
            continue

        print(f"找到包含关键词 '{keyword}' 的文章数量: {len(keyword_df)}")

        # Rows with a missing year are not "recent", so they fall into "older"
        is_recent = keyword_df['年份'].isin(recent_years)
        top_recent = keyword_df[is_recent].sort_values('施引文献', ascending=False, kind='stable').head(5)
        top_older = keyword_df[~is_recent].sort_values('施引文献', ascending=False, kind='stable').head(5)

        print(f"\n最近3年 ({', '.join(map(str, recent_years))}) 引用最高的5篇文章:")
        _print_top_cited(top_recent, "没有最近3年的相关文章")

        print(f"\n所有年份(除去最近3年)引用最高的5篇文章:")
        _print_top_cited(top_older, "没有较早年份的相关文章")

        results[keyword] = {
            'recent': top_recent,
            'older': top_older
        }

    return results


def _print_top_cited(top_papers, empty_message):
    """Print a ranked listing of ``top_papers``, or ``empty_message`` if there are none."""
    if len(top_papers) == 0:
        print(empty_message)
        return

    for rank, (_, paper) in enumerate(top_papers.iterrows(), 1):
        year = paper['年份']
        # The year column becomes float once any value is missing; show it as
        # an integer, or N/A when this paper has no usable year.
        year_text = int(year) if pd.notna(year) else 'N/A'
        print(f"{rank}. {paper['文献标题']} ({year_text})")
        print(f"   作者: {paper['作者']}")
        print(f"   引用数: {paper['施引文献']}")
        print(f"   DOI: {paper['DOI']}")
        print()

In [36]:
def analyze_top_cited_papers_simple(df, keywords, output_dir, current_year=2025):
    """
    Write an HTML and a plain-text report of the most-cited papers per keyword.

    A paper matches a keyword when the keyword occurs (case-insensitively, as a
    substring) in its abstract, title, author keywords or index keywords;
    columns that are missing from ``df`` are skipped. Matches are split into
    the three years ending at ``current_year`` and everything else, and the
    five most-cited papers of each group are reported. The HTML report of the
    first keyword in ``keywords`` is also displayed inline.

    Parameters:
    - df: DataFrame holding the bibliographic records; it is not modified
    - keywords: keywords to analyse
    - output_dir: existing directory the report files are written to
    - current_year: last year of the "recent" three-year window

    Returns:
    - dict mapping each keyword that had matches to
      {'recent': DataFrame, 'older': DataFrame, 'html_file': str, 'txt_file': str}
    """
    from html import escape

    # Work on a copy so the caller's DataFrame keeps its original columns
    papers = df.copy()
    papers['施引文献'] = pd.to_numeric(papers['施引文献'], errors='coerce').fillna(0).astype(int)
    papers['年份'] = pd.to_numeric(papers['年份'], errors='coerce')

    search_cols = [c for c in ('摘要', '文献标题', '作者关键字', '索引关键字') if c in papers.columns]
    recent_years = list(range(current_year - 2, current_year + 1))
    years_label = ', '.join(map(str, recent_years))

    results = {}

    for keyword in keywords:
        print(f"\n{'-'*70}")
        print(f"分析关键词 '{keyword}' 的高引用文章:")
        print(f"{'-'*70}")

        # Row-wise OR over every searchable text column
        needle = keyword.lower()
        matches = np.zeros(len(papers), dtype=bool)
        for col in search_cols:
            matches |= np.array(
                [isinstance(v, str) and needle in v.lower() for v in papers[col]],
                dtype=bool,
            )

        keyword_df = papers[matches]
        if keyword_df.empty:
            print(f"没有找到包含关键词 '{keyword}' 的文章")
            continue

        print(f"找到包含关键词 '{keyword}' 的文章数量: {len(keyword_df)}")

        # Rows with a missing year are not "recent", so they fall into "older"
        is_recent = keyword_df['年份'].isin(recent_years)
        top_recent = keyword_df[is_recent].sort_values('施引文献', ascending=False, kind='stable').head(5)
        top_older = keyword_df[~is_recent].sort_values('施引文献', ascending=False, kind='stable').head(5)

        # The keyword is escaped because it ends up inside HTML markup
        safe_keyword = escape(keyword)
        html_content = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <meta charset="utf-8">
            <title>关键词 '{safe_keyword}' 高引用文章分析</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 20px; }}
                h1 {{ color: #333; }}
                h2 {{ color: #555; margin-top: 30px; }}
                table {{ border-collapse: collapse; width: 100%; margin-top: 10px; }}
                th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
                th {{ background-color: #f2f2f2; }}
                tr:nth-child(even) {{ background-color: #f9f9f9; }}
                .citations {{ font-weight: bold; color: #e63946; }}
                .year {{ color: #457b9d; }}
            </style>
        </head>
        <body>
            <h1>关键词 '{safe_keyword}' 高引用文章分析</h1>

            <h2>最近3年 ({years_label}) 引用最高的文章</h2>
        """
        html_content += _top_cited_html_table(top_recent, "没有最近3年的相关文章")
        html_content += """
            <h2>所有年份(除去最近3年)引用最高的文章</h2>
        """
        html_content += _top_cited_html_table(top_older, "没有较早年份的相关文章")
        html_content += """
        </body>
        </html>
        """

        # Keywords such as "3d/4d printing" contain characters that are not
        # valid in a file name; replace them so both files land in output_dir.
        file_stem = re.sub(r'[\\/:*?"<>|]', '_', keyword.replace(' ', '_'))

        html_file = os.path.join(output_dir, f"{file_stem}_top_cited_simple.html")
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(html_content)

        print(f"结果已保存到: {html_file}")

        if keyword == keywords[0]:
            # Show the first keyword's report inline in Jupyter
            display(HTML(html_content))

        txt_file = os.path.join(output_dir, f"{file_stem}_top_cited_papers.txt")
        with open(txt_file, 'w', encoding='utf-8') as f:
            f.write(f"关键词 '{keyword}' 的高引用文章分析\n")
            f.write(f"{'='*70}\n\n")

            f.write(f"最近3年 ({years_label}) 引用最高的5篇文章:\n")
            f.write(_top_cited_text(top_recent, "没有最近3年的相关文章\n\n"))

            f.write(f"所有年份(除去最近3年)引用最高的5篇文章:\n")
            f.write(_top_cited_text(top_older, "没有较早年份的相关文章\n"))

        results[keyword] = {
            'recent': top_recent,
            'older': top_older,
            'html_file': html_file,
            'txt_file': txt_file
        }

    return results


def _top_cited_year_label(year):
    """Return the year as an int, or 'N/A' when it is missing."""
    return int(year) if pd.notna(year) else 'N/A'


def _top_cited_html_table(top_papers, empty_message):
    """Return an HTML table for ``top_papers``, or a paragraph with ``empty_message``."""
    from html import escape

    if len(top_papers) == 0:
        return f"<p>{empty_message}</p>"

    parts = ["""
            <table>
                <tr>
                    <th>排名</th>
                    <th>标题</th>
                    <th>作者</th>
                    <th>年份</th>
                    <th>引用数</th>
                    <th>DOI</th>
                </tr>
            """]

    for rank, (_, paper) in enumerate(top_papers.iterrows(), 1):
        doi = paper['DOI']
        if isinstance(doi, str):
            # Escaped for both the attribute value and the link text
            safe_doi = escape(doi, quote=True)
            doi_link = f"<a href='https://doi.org/{safe_doi}' target='_blank'>{safe_doi}</a>"
        else:
            doi_link = "N/A"

        # Titles and author lists come straight from the CSV and may contain
        # '<', '>' or '&', so they are escaped before being placed in markup.
        parts.append(f"""
                <tr>
                    <td>{rank}</td>
                    <td>{escape(str(paper['文献标题']))}</td>
                    <td>{escape(str(paper['作者']))}</td>
                    <td class="year">{_top_cited_year_label(paper['年份'])}</td>
                    <td class="citations">{int(paper['施引文献'])}</td>
                    <td>{doi_link}</td>
                </tr>
                """)

    parts.append("</table>")
    return "".join(parts)


def _top_cited_text(top_papers, empty_message):
    """Return the plain-text listing for ``top_papers``, or ``empty_message`` if there are none."""
    if len(top_papers) == 0:
        return empty_message

    entries = []
    for rank, (_, paper) in enumerate(top_papers.iterrows(), 1):
        entries.append(
            f"{rank}. {paper['文献标题']} ({_top_cited_year_label(paper['年份'])})\n"
            f"   作者: {paper['作者']}\n"
            f"   引用数: {paper['施引文献']}\n"
            f"   DOI: {paper['DOI']}\n\n"
        )
    return "".join(entries)

In [37]:
# Run the simple (HTML table) top-cited analysis for every keyword of interest
top_papers_simple = analyze_top_cited_papers_simple(df, insight_keywords, topcite_dir)


----------------------------------------------------------------------
分析关键词 'machine learning models' 的高引用文章:
----------------------------------------------------------------------
找到包含关键词 'machine learning models' 的文章数量: 52
结果已保存到: ./results/4DPrinting3147/Specific/TopCite\machine_learning_models_top_cited_simple.html


排名,标题,作者,年份,引用数,DOI
1,Recent Advances in the Applications of Additive Manufacturing (3D Printing) in Drug Delivery: A Comprehensive Review,Muhindo D.; Elkanayati R.; Srinivasan P.; Repka M.A.; Ashour E.A.,2023,66,10.1208/s12249-023-02524-9
2,Machine Learning Customized Novel Material for Energy-Efficient 4D Printing,Tan C.; Li Q.; Yao X.; Chen L.; Su J.; Ng F.L.; Liu Y.; Yang T.; Chew Y.; Liu C.T.; DebRoy T.,2023,43,10.1002/advs.202206607
3,Recent Advances in 4D Printing of Advanced Materials and Structures for Functional Applications,Wan X.; Xiao Z.; Tian Y.; Chen M.; Liu F.; Wang D.; Liu Y.; Bartolo P.J.D.S.; Yan C.; Shi Y.; Zhao R.R.; Qi H.J.; Zhou K.,2024,43,10.1002/adma.202312263
4,Gelatin Methacryloyl (GelMA)-Based Biomaterial Inks: Process Science for 3D/4D Printing and Current Status,Das S.; Jegadeesan J.T.; Basu B.,2024,26,10.1021/acs.biomac.3c01271
5,Perspective: Machine Learning in Design for 3D/4D Printing,Sun X.; Zhou K.; Demoly F.; Zhao R.R.; Qi H.J.,2024,21,10.1115/1.4063684

排名,标题,作者,年份,引用数,DOI
1,Closed-loop 4D-printed soft robots,Zolfagharian A.; Kaynak A.; Kouzani A.,2020,177,10.1016/j.matdes.2019.108411
2,Trends in 3D Printing Processes for Biomedical Field: Opportunities and Challenges,Ghilan A.; Chiriac A.P.; Nita L.E.; Rusu A.G.; Neamtu I.; Chiriac V.M.,2020,155,10.1007/s10924-020-01722-x
3,Emerging 3D printing technologies for drug delivery devices: Current status and future perspective,Wang J.; Zhang Y.; Aghda N.H.; Pillai A.R.; Thakkar R.; Nokhodchi A.; Maniruzzaman M.,2021,149,10.1016/j.addr.2021.04.019
4,Machine-learning based design of active composite structures for 4D printing,Hamel C.M.; Roach D.J.; Long K.N.; Demoly F.; Dunn M.L.; Qi H.J.,2019,127,10.1088/1361-665X/ab1439
5,3D printing of tissue engineering scaffolds: a focus on vascular regeneration,Wang P.; Sun Y.; Shi X.; Shen H.; Ning H.; Liu H.,2021,123,10.1007/s42242-020-00109-0
