In [15]:
import os
import json
import numpy as np
import networkx as nx
from sklearn.preprocessing import LabelEncoder
import toml

# Load the notebook configuration from config.toml.
with open('config.toml', 'r') as f:
    config = toml.load(f)

# Suffix selecting the version-specific entries of the configuration.
version = "v2_5"

# Version-specific base directory and file names.
data_path, jsonl_path, content_data, cites_data = (
    config[f'{prefix}_{version}']
    for prefix in ('data_path', 'jsonl_file', 'content', 'cites')
)

# Paths of the input JSONL file and of the two files this cell writes.
jsonl_file, content_file, cites_file = (
    os.path.join(data_path, file_name)
    for file_name in (jsonl_path, content_data, cites_data)
)

# Read the JSONL file and collect one (theme, topic) edge per record/topic pair.
edges = []
themes = set()
topics = set()

# The encoding is explicit because node names are non-ASCII and the platform
# default encoding is not guaranteed to be UTF-8.
with open(jsonl_file, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at EOF)
        data = json.loads(line)
        theme_1 = f"theme_{data['theme_1']}"  # prefix keeps theme names apart from topic names
        topic_list = data['topic_map']

        # A single topic may be stored as a bare string; normalise to a list.
        if isinstance(topic_list, str):
            topic_list = [topic_list]

        for topic in topic_list:
            topic = f"topic_{topic}"  # prefix keeps topic names apart from theme names
            edges.append((theme_1, topic))
            themes.add(theme_1)
            topics.add(topic)

# Build a weighted, undirected theme-topic graph.
G = nx.Graph()

# Every repetition of a (theme, topic) pair raises the weight of its edge by one.
for edge in edges:
    src, dst = edge
    if not G.has_edge(src, dst):
        G.add_edge(src, dst, weight=1)
    else:
        G[src][dst]['weight'] += 1

# Once all edges are in place, compress the raw counts: w = 1 + log(1 + count).
for u, v, d in G.edges(data=True):
    d['weight'] = 1 + np.log1p(d['weight'])

# Then normalise by the largest weight so the heaviest edge weighs 1.
max_weight = max(attrs['weight'] for _, _, attrs in G.edges(data=True))
for u, v, d in G.edges(data=True):
    d['weight'] = d['weight'] / max_weight

# Tag every node with its kind ('theme' or 'topic').
for node_kind, members in (('theme', themes), ('topic', topics)):
    for member in members:
        G.nodes[member]['label'] = node_kind

# Dense adjacency matrix in graph node order.
nodes = list(G.nodes)
adj_matrix = nx.adjacency_matrix(G, nodelist=nodes).todense()

# One-hot feature matrix: one row per node, one column per distinct node name.
label_encoder = LabelEncoder().fit(nodes)
labels = label_encoder.transform(nodes)
num_classes = len(label_encoder.classes_)
feature_matrix = np.identity(num_classes)[labels]

# One line per edge: "<node_a>\t<node_b>\t<weight>", weight with 4 decimals.
cites_lines = [
    f"{node_a}\t{node_b}\t{attrs['weight']:.4f}"
    for node_a, node_b, attrs in G.edges(data=True)
]

# Save the cora.cites-style edge file. UTF-8 is explicit because node names are
# non-ASCII and the cell that maps this file to ids reads it back as UTF-8.
with open(cites_file, "w", encoding="utf-8") as f:
    f.writelines(line + '\n' for line in cites_lines)

# One line per node: "<node_id>\t<0/1 adjacency row>\t<label>".
content_lines = []
node_ids = list(G.nodes)
for node_id in node_ids:
    # G is undirected, so a single has_edge check covers both directions.
    attributes = [int(G.has_edge(node_id, other_id)) for other_id in node_ids]
    label = G.nodes[node_id]['label']
    content_line = f"{node_id}\t{' '.join(map(str, attributes))}\t{label}"
    content_lines.append(content_line)

# Save the cora.content-style node file (same encoding as the edge file).
with open(content_file, "w", encoding="utf-8") as f:
    f.writelines(line + '\n' for line in content_lines)

print("cora.cites 和 cora.content 文件已保存。")


cora.cites 和 cora.content 文件已保存。


  adj_matrix = nx.adjacency_matrix(G, nodelist=nodes).todense()


In [16]:
import torch
import numpy as np
import os
import random
import pandas as pd
import scipy.sparse as sp
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder
from node2vec import Node2Vec
import networkx as nx
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import toml

# Load the notebook configuration from config.toml.
with open('config.toml', 'r') as f:
    config = toml.load(f)

# Suffix selecting the version-specific entries of the configuration.
version = "v2_5"

# Version-specific base directory and file names.
data_path, jsonl_path, content_data, cites_data = (
    config[f'{prefix}_{version}']
    for prefix in ('data_path', 'jsonl_file', 'content', 'cites')
)

# Version-independent output names (joined with data_path where they are used).
model_save_path, png_path, theme_emb_path, topic_emb_path = (
    config[key]
    for key in ('model_save_path', 'png_path', 'theme_emb_path', 'topic_emb_path')
)

# Embedding size handed to node2vec.
dimensions = int(config['dimensions'])

def seed_everything(seed=2023):
    """Seed Python's `random`, NumPy and PyTorch, and set PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)


seed_everything()
def load_cora_data(data_path=data_path):
    """Load the Cora-style content/cites files as a PyG `Data` and a NetworkX graph.

    The content file has one tab-separated line per node
    (`<node_id>\\t<space-separated features>\\t<label>`); the cites file has one
    line per edge (`<node_a>\\t<node_b>\\t<weight>`).

    Args:
        data_path: Directory containing the files named by the module-level
            `content_data` and `cites_data`.

    Returns:
        (data, networkx_data, index): the PyG `Data` object, the weighted
        `nx.Graph` keyed by node name, and the node names in content-file order.

    Raises:
        ValueError: If an edge endpoint does not appear in the content file.
    """
    content_df = pd.read_csv(os.path.join(data_path, content_data), delimiter="\t", header=None)
    content_df.set_index(0, inplace=True)
    index = content_df.index.tolist()

    # Row position of every node name (first occurrence wins, like list.index),
    # so each edge endpoint is resolved in O(1) instead of scanning `index`.
    node_pos = {}
    for pos, name in enumerate(index):
        node_pos.setdefault(name, pos)

    # After set_index the first remaining column holds the space-separated features.
    features = np.array([list(map(float, row[0].split())) for row in content_df.iloc[:, :-1].values])
    features = sp.csr_matrix(features, dtype=np.float32)

    labels = content_df.values[:, -1]
    class_encoder = LabelEncoder()
    labels = class_encoder.fit_transform(labels)

    cites_df = pd.read_csv(os.path.join(data_path, cites_data), delimiter="\t", header=None)
    cites_df[0] = cites_df[0].astype(str)
    cites_df[1] = cites_df[1].astype(str)
    cites_df[2] = cites_df[2].astype(float)

    try:
        edges = [(node_pos[src], node_pos[dst]) for src, dst in zip(cites_df[0], cites_df[1])]
    except KeyError as missing:
        raise ValueError(
            f"Edge endpoint {missing.args[0]!r} from the cites file is missing from the content file."
        ) from None
    weights = cites_df[2].values  # edge weights, one per cites line

    edges = np.array(edges).T
    data = Data(x=torch.from_numpy(np.array(features.todense())),
                edge_index=torch.LongTensor(edges),
                edge_attr=torch.FloatTensor(weights).unsqueeze(1),  # weights as an (E, 1) edge attribute
                y=torch.from_numpy(labels))

    # All three splits cover every node. The size is derived from the data
    # instead of a hard-coded 242, which only matched one particular graph.
    num_nodes = labels.shape[0]
    idx_train = range(num_nodes)
    idx_val = range(num_nodes)
    idx_test = range(num_nodes)

    def index_to_mask(index, size):
        """Return a boolean array of length `size` that is True at `index`."""
        mask = np.zeros(size, dtype=bool)
        mask[index] = True
        return mask

    data.train_mask = index_to_mask(idx_train, size=num_nodes)
    data.val_mask = index_to_mask(idx_val, size=num_nodes)
    data.test_mask = index_to_mask(idx_test, size=num_nodes)

    def to_networkx(data):
        """Rebuild a weighted nx.Graph, keyed by node name, from the PyG edges."""
        edge_index = data.edge_index.to(torch.device('cpu')).numpy()
        edge_weights = data.edge_attr.to(torch.device('cpu')).numpy()
        G = nx.Graph()
        for i, (src, tar) in enumerate(edge_index.T):
            G.add_edge(index[src], index[tar], weight=edge_weights[i][0])
        return G

    networkx_data = to_networkx(data)

    return data, networkx_data, index

pyg_data, networkx_data, node_index = load_cora_data()

def Node2Vec_run(networkx_data, dimensions=128, walk_length=30, num_walks=200,
                 model_save_path="node2vec2.model", p=0.5, q=0.5, workers=4):
    """Train a node2vec model on a graph, save it, and return its embeddings.

    Args:
        networkx_data: Graph handed to `Node2Vec`.
        dimensions: Embedding size.
        walk_length: Number of nodes per random walk.
        num_walks: Number of walks started per node.
        model_save_path: Where the fitted model is saved.
        p: node2vec `p` parameter (previously fixed at 0.5 inside the function).
        q: node2vec `q` parameter (previously fixed at 0.5 inside the function).
        workers: Number of workers used to generate walks (previously fixed at 4).

    Returns:
        (model, nodes, embeddings): the fitted model, the node keys in the
        model's vocabulary order (`model.wv.index_to_key`), and the embedding
        rows in that same order. This order is the model's own; callers that
        need another order should look vectors up by key.
    """
    node2vec = Node2Vec(
        networkx_data,
        dimensions=dimensions,
        walk_length=walk_length,
        num_walks=num_walks,
        workers=workers,
        p=p,
        q=q,
    )
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    nodes = model.wv.index_to_key
    embeddings = model.wv[nodes]

    # Persist the model so other cells can reload it.
    model.save(model_save_path)

    return model, nodes, embeddings

def plot_embeddings(embeddings, labels, output_path):
    """Project embeddings to 2-D with t-SNE and save a class-coloured scatter plot.

    Args:
        embeddings: Array-like of embedding vectors, one row per point.
        labels: Per-point values used to colour the scatter and build the legend.
        output_path: File the figure is saved to.
    """
    points_2d = TSNE(n_components=2).fit_transform(embeddings)
    xs, ys = points_2d[:, 0], points_2d[:, 1]

    plt.figure(figsize=(10, 10))
    scatter = plt.scatter(xs, ys, c=labels, cmap="jet", alpha=0.6)

    # Legend with one entry per distinct colour value.
    handles, legend_labels = scatter.legend_elements()
    class_legend = plt.legend(handles, legend_labels, title="Classes")
    plt.gca().add_artist(class_legend)

    plt.savefig(output_path)
    plt.close()

def plot_embeddings2(embeddings, labels, output_path, node_index):
    """Save a t-SNE scatter plot of the embeddings with a colour bar and a few node names.

    Args:
        embeddings: Array-like of embedding vectors, one row per node.
        labels: Per-node values used to colour the points.
        output_path: File the figure is saved to (written at 300 dpi).
        node_index: Node names; entry i is used to annotate point i.
    """
    reduced_embeddings = TSNE(n_components=2, random_state=42).fit_transform(embeddings)

    plt.figure(figsize=(12, 10))
    scatter = plt.scatter(
        reduced_embeddings[:, 0],
        reduced_embeddings[:, 1],
        c=labels,
        cmap="jet",
        alpha=0.6,
    )

    # Colour bar for the label values.
    plt.colorbar(scatter)

    # Annotate an evenly spaced subset of nodes so the plot stays readable.
    total = len(node_index)
    num_labels = min(20, total)
    step = total // num_labels
    for i in range(0, total, step):
        position = (reduced_embeddings[i, 0], reduced_embeddings[i, 1])
        plt.annotate(node_index[i], position, fontsize=8)

    plt.title("Node2Vec Embeddings Visualization")
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.tight_layout()
    plt.savefig(output_path, dpi=300)
    plt.close()

def save_embeddings(embeddings, node_index):
    """Write theme and topic embeddings to two separate CSV files.

    Each CSV has an 'index' column with the node name, followed by the embedding
    columns. Rows whose name starts with 'theme_' go to `theme_emb_path`, rows
    starting with 'topic_' go to `topic_emb_path` (both under `data_path`).

    Args:
        embeddings: Embedding rows.
        node_index: Node names; entry i names row i of `embeddings`.

    Raises:
        ValueError: If the two arguments differ in length.
    """
    if len(embeddings) != len(node_index):
        raise ValueError("The length of embeddings and node_index must match.")

    embeddings_df = pd.DataFrame(embeddings)
    # Put the node names in front of the embedding columns.
    embeddings_df.insert(0, 'index', node_index)

    # Split by name prefix before anything is written.
    by_prefix = [
        (embeddings_df[embeddings_df['index'].str.startswith(prefix)], file_name)
        for prefix, file_name in (('theme_', theme_emb_path), ('topic_', topic_emb_path))
    ]
    for subset, file_name in by_prefix:
        subset.to_csv(os.path.join(data_path, file_name), index=False)

# Train node2vec and keep the model so vectors can be looked up by node name.
node2vec_model, _, _ = Node2Vec_run(networkx_data, dimensions=dimensions, walk_length=10, num_walks=500, model_save_path=os.path.join(data_path, model_save_path))
# Node2Vec_run returns its embeddings in the model's vocabulary order
# (model.wv.index_to_key), which need not match the content-file order of
# `node_index` and `pyg_data.y`. Look the vectors up by name so that row i of
# the embeddings really belongs to node_index[i] and to label i.
node2vec_embeddings = node2vec_model.wv[[str(name) for name in node_index]]
print("node2vec_embeddings:", np.array(node2vec_embeddings).shape)
# plot_embeddings2 requires node_index as its fourth argument.
plot_embeddings2(node2vec_embeddings, pyg_data.y.numpy(), os.path.join(data_path, png_path), node_index)
save_embeddings(node2vec_embeddings, node_index)


  from .autonotebook import tqdm as notebook_tqdm
Computing transition probabilities: 100%|██████████| 242/242 [00:01<00:00, 130.31it/s]
Generating walks (CPU: 1): 100%|██████████| 125/125 [00:01<00:00, 65.97it/s]
Generating walks (CPU: 2): 100%|██████████| 125/125 [00:01<00:00, 65.68it/s]
Generating walks (CPU: 3): 100%|██████████| 125/125 [00:01<00:00, 65.21it/s]
Generating walks (CPU: 4): 100%|██████████| 125/125 [00:01<00:00, 66.12it/s]


node2vec_embeddings: (242, 128)


# 分别保存theme和topic的相似度矩阵

In [17]:
import os
import pandas as pd
import numpy as np
import torch
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import toml

# Load the notebook configuration from config.toml.
with open('config.toml', 'r') as f:
    config = toml.load(f)

# Suffix selecting the version-specific entries of the configuration.
version = "v2_5"

# Version-specific base directory and file names.
data_path, content_data, cites_data = (
    config[f'{prefix}_{version}'] for prefix in ('data_path', 'content', 'cites')
)

# Version-independent names: the saved node2vec model and the two result files.
model_save_path, theme_sim_path, topic_sim_path = (
    config[key] for key in ('model_save_path', 'theme_sim_path', 'topic_sim_path')
)


class configs:
    """Runtime settings for this cell: data directory, model path and torch device."""

    def __init__(self):
        has_gpu = torch.cuda.is_available()
        self.data_path = data_path
        # Saved node2vec model, located under the data directory.
        self.model_path = os.path.join(data_path, model_save_path)
        # CUDA when available, otherwise CPU.
        self.device = torch.device("cuda" if has_gpu else "cpu")


cfg = configs()

def load_cora_data(data_path=data_path):
    """Read the content file and return its last column plus the node names.

    NOTE: this shares its name with the loader defined in an earlier cell of
    this notebook, but returns a different pair of values.

    Args:
        data_path: Directory containing the file named by `content_data`.

    Returns:
        (original_content, index): the values of the last column as a list, and
        the first-column node ids converted to strings, both in file order.
    """
    content_file_path = os.path.join(data_path, content_data)
    content_df = pd.read_csv(content_file_path, delimiter="\t", header=None).set_index(0)

    # Node ids are always handled as strings downstream.
    index = content_df.index.astype(str).tolist()
    original_content = content_df[content_df.columns[-1]].tolist()

    return original_content, index

def load_model(model_path):
    """Load a saved Word2Vec model and print every key in its vocabulary.

    Args:
        model_path: Path handed to `Word2Vec.load`.

    Returns:
        The loaded model.
    """
    model = Word2Vec.load(model_path)

    # Print a header line followed by one node name per line.
    print("All nodes in the model:", *model.wv.index_to_key, sep="\n")

    return model

def find_similar_vectors(model, target_node, node_index, top_k=5):
    """Find the nodes of the same kind that are most similar to `target_node`.

    Only nodes sharing the target's prefix ('topic_' or 'theme_') are compared,
    using cosine similarity between their model vectors. The target itself is
    never returned, so fewer than `top_k` results come back when there are
    fewer than `top_k` other nodes of that kind.

    Args:
        model: Model exposing `wv.index_to_key` and `wv[key]`.
        target_node: Name of the query node; must start with 'topic_' or 'theme_'.
        node_index: List of node names; positions in it are the returned ids.
        top_k: Maximum number of neighbours to return.

    Returns:
        (top_k_ids, top_k_nodes, top_k_similarities), ordered from most to least
        similar: positions in `node_index`, node names, and similarity values.

    Raises:
        ValueError: If `target_node` has neither prefix, or a neighbour is
            missing from `node_index`.
        KeyError: If `target_node` is not in the model vocabulary.
    """
    # Determine which kind of node the target is.
    for prefix in ('topic_', 'theme_'):
        if target_node.startswith(prefix):
            break
    else:
        raise ValueError(f"Node '{target_node}' does not start with 'topic_' or 'theme_'")

    # Vector of the target node.
    try:
        target_vector = model.wv[target_node]
    except KeyError:
        raise KeyError(f"Key '{target_node}' not present in the model vocabulary") from None

    # Candidates: same kind, excluding the target. Leaving it out (instead of
    # masking its score with -inf) guarantees it cannot appear in the results
    # when there are no more than top_k candidates.
    candidates = [
        node for node in model.wv.index_to_key
        if node.startswith(prefix) and node != target_node
    ]
    if top_k <= 0 or not candidates:
        return [], [], np.array([], dtype=np.float32)

    candidate_vectors = np.array([model.wv[node] for node in candidates])
    similarities = cosine_similarity([target_vector], candidate_vectors)[0]

    # Indices of the top_k largest similarities, most similar first.
    top_k_indices = similarities.argsort()[-top_k:][::-1]
    top_k_similarities = similarities[top_k_indices]
    top_k_nodes = [candidates[i] for i in top_k_indices]

    # Translate node names to their positions in node_index.
    top_k_ids = [node_index.index(node) for node in top_k_nodes]

    return top_k_ids, top_k_nodes, top_k_similarities

if __name__ == '__main__':
    original_content, node_index = load_cora_data(cfg.data_path)

    # Load the trained node2vec model.
    model_save_path = cfg.model_path
    model = load_model(model_save_path)

    # Results are collected separately for topic_ and theme_ nodes.
    topic_results = []
    theme_results = []

    for idx, target_node in enumerate(node_index):
        if not target_node.startswith('topic_') and not target_node.startswith('theme_'):
            continue  # ignore nodes that are neither topics nor themes

        top_k_ids, top_k_nodes, top_k_similarities = find_similar_vectors(model, target_node, node_index)

        # Iterate over what was actually returned rather than a hard-coded 5,
        # so a node with fewer neighbours does not raise IndexError.
        num_found = len(top_k_nodes)
        result = {
            'id': idx+1,  # ids written to the sheet are 1-based
            '节点实际内容': target_node,
            '最相似的前5个节点id及相似度': [(top_k_ids[i]+1, top_k_similarities[i]) for i in range(num_found)]
        }
        for i in range(num_found):
            result[f'最相似的节点 {i+1}'] = top_k_nodes[i]

        if target_node.startswith('topic_'):
            topic_results.append(result)
        elif target_node.startswith('theme_'):
            theme_results.append(result)

    # Export each non-empty result set to its own Excel file.
    if topic_results:
        topic_results_df = pd.DataFrame(topic_results)
        topic_output_file_path = os.path.join(data_path, topic_sim_path)
        topic_results_df.to_excel(topic_output_file_path, index=False)
        print(f"Topic results have been saved to {topic_output_file_path}.")

    if theme_results:
        theme_results_df = pd.DataFrame(theme_results)
        theme_output_file_path = os.path.join(data_path, theme_sim_path)
        theme_results_df.to_excel(theme_output_file_path, index=False)
        print(f"Theme results have been saved to {theme_output_file_path}.")


All nodes in the model:
theme_日常生活
theme_旅游与交通
theme_学习与教育
theme_兴趣与爱好
theme_学校
theme_家庭
theme_文娱与体育
theme_个人信息
theme_购物
theme_计划与安排
theme_人际交往
theme_地点
theme_卫生与健康
theme_节假日活动
theme_工作
theme_媒体与影视
theme_动物
theme_人物描写
theme_物品
theme_饮食
theme_安全与救护
theme_故事
theme_世界与环境
theme_求助
theme_天气
theme_文学与艺术
theme_服务
theme_情感与情绪
theme_服装与颜色
theme_历史与社会
theme_经历
theme_数字与时间
theme_知识与科普
theme_通讯
topic_经历
topic_家庭
topic_活动
topic_爱好
topic_计划
topic_社会
topic_个人
topic_学习
topic_交流
topic_时间
topic_旅游
topic_体育
topic_情感
topic_通讯
topic_购物
topic_分析
topic_建议
topic_态度
topic_教育
topic_地理
topic_生活
topic_互动
topic_工作
topic_美食
topic_交通
topic_社交
topic_物品
topic_医疗
topic_总结
topic_信息
topic_文化
topic_天气
topic_动物
topic_决策
topic_自然
topic_询问
topic_环境
topic_健康
topic_指引
topic_日常
topic_建筑
topic_承诺
topic_案件
topic_习惯
topic_人物
topic_地点
topic_礼物
topic_前景
topic_心理
topic_媒体
topic_选择
topic_行为
topic_语言
topic_友谊
topic_科技
topic_服装
topic_饮食
topic_图书
topic_娱乐
topic_反馈
topic_特殊情况
topic_艺术
topic_电影
topic_技能
topic_电话
topic_节日
topic_剧情
topic_摄影


# 将 theme 按序拼接到 topic_id文件中
## graph 中需要 theme 节点，所以需要将 theme 节点 id 添加到 topic_id 文件中

In [1]:

import os
import json
import csv
import toml

with open('config.toml', 'r') as f:
    config = toml.load(f)

version = "v2_5"

data_path = config[f'data_path_{version}']
id4topics = config['id4topics']
topics_themes_id = config['topics_themes_id']
theme_emb_path = config['theme_emb_path']

# Load the existing topic-name -> id mapping.
topic_id_file = os.path.join(data_path, id4topics)
with open(topic_id_file, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)
# Highest id currently in use; 0 for an empty mapping so the first new id is 1.
max_index = max(data.values(), default=0)

# Collect the theme names from the first column of the theme embedding CSV.
theme_emb_file = os.path.join(data_path, theme_emb_path)
names = []
# newline='' is what the csv module asks for on files handed to csv.reader.
with open(theme_emb_file, 'r', encoding='utf-8', newline='') as theme_emb:
    reader = csv.reader(theme_emb)
    # `row and ...` skips empty rows; rows whose first cell does not start
    # with 'theme_' are dropped by the prefix filter.
    names = [row[0] for row in reader if row and row[0].startswith('theme_')]

# Append the themes to the mapping, continuing after the highest existing id.
for name in names:
    max_index += 1
    data[name] = max_index

# Write the combined topic + theme mapping to its own file.
topics_themes_file = os.path.join(data_path, topics_themes_id)
with open(topics_themes_file, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=4)

print("JSON 文件已成功更新并保存为 topics_themes_id.json")

JSON 文件已成功更新并保存为 topics_themes_id.json


# 将 边文件 cites 中的theme 和 topic转化为id映射

In [3]:
import os
import json
import csv
import toml

with open('config.toml', 'r') as f:
    config = toml.load(f)

version = "v2_5"

data_path = config[f'data_path_{version}']
content_data = config[f'content_{version}']
cites_data = config[f'cites_{version}']
cites_id_path = config['cites_id']
# Resolve the mapping file from the configuration here, so this cell does not
# depend on a variable left behind by a previously executed cell.
topics_themes_file = os.path.join(data_path, config['topics_themes_id'])

# Ids in the mapping file start at 1, but the GNN expects ids starting at 0,
# so 1 is subtracted from every id while mapping the edges.
with open(topics_themes_file, 'r', encoding='utf-8') as json_file:
    name_to_id = json.load(json_file)

# Read the name-based edge file. The handle gets its own name so the path
# variable is not overwritten by a file object.
cities_file = os.path.join(data_path, cites_data)
with open(cities_file, 'r', encoding='utf-8') as cites_handle:
    edges = cites_handle.readlines()

edges_with_id = []
for edge in edges:
    parts = edge.strip().split('\t')
    if len(parts) < 2 or len(parts) > 3:
        print(f"Warning: Edge '{edge}' is in an unexpected format.")
        continue
    node1, node2 = parts[0], parts[1]
    weight = parts[2] if len(parts) == 3 else None  # the weight column is optional

    # Topic names are looked up without their 'topic_' prefix; other names
    # (e.g. 'theme_...') are looked up unchanged.
    if node1.startswith('topic_'):
        node1 = node1[len('topic_'):]
    if node2.startswith('topic_'):
        node2 = node2[len('topic_'):]

    if node1 in name_to_id and node2 in name_to_id:
        node1_id = name_to_id[node1] - 1
        node2_id = name_to_id[node2] - 1
        if weight is not None:
            edges_with_id.append(f"{node1_id}\t{node2_id}\t{weight}\n")
        else:
            edges_with_id.append(f"{node1_id}\t{node2_id}\n")
    else:
        print(f"Warning: One of the nodes '{node1}' or '{node2}' is not found in the JSON mapping.")

# Write the id-based edges to a new file.
cites_id_file = os.path.join(data_path, cites_id_path)
with open(cites_id_file, 'w', encoding='utf-8') as updated_cities_file:
    updated_cities_file.writelines(edges_with_id)
       
    

# 将两个embedding文件的topic/theme名字改为id，按id序保存在一个文件中

In [4]:
# 将两个embedding文件的名字改为id，按序保存
import os
import json
import csv
import pandas as pd
import toml

with open('config.toml', 'r') as f:
    config = toml.load(f)

version = "v2_5"

data_path = config[f'data_path_{version}']
theme_emb_path = config['theme_emb_path']
topic_emb_path = config['topic_emb_path']
topics_themes_id = config['topics_themes_id']
topics_themes_id_emb = config['topics_themes_id_emb']


theme_emb_file = os.path.join(data_path, theme_emb_path)
topic_emb_file = os.path.join(data_path, topic_emb_path)
topics_themes_id_file = os.path.join(data_path, topics_themes_id)
topics_themes_id_emb_file = os.path.join(data_path, topics_themes_id_emb)

# Load the name -> id mapping (ids in the file start at 1).
with open(topics_themes_id_file, 'r', encoding='utf-8') as f:
    name_to_id = json.load(f)

theme_emb = pd.read_csv(theme_emb_file, header=0)
topic_emb = pd.read_csv(topic_emb_file, header=0)

# Replace the name column of each table by its 0-based id. The column is
# replaced as a whole (not written through .iloc) so its dtype can change from
# string to numeric. Names without an id become NaN, as before.
theme_name_col = theme_emb.columns[0]
theme_emb[theme_name_col] = theme_emb[theme_name_col].map(name_to_id) - 1

topic_name_col = topic_emb.columns[0]
# Topics are looked up without their prefix; only a leading 'topic_' is removed.
topic_names = topic_emb[topic_name_col].str.replace(r'^topic_', '', regex=True)
topic_emb[topic_name_col] = topic_names.map(name_to_id) - 1

# Make unmapped names visible instead of silently writing rows with a NaN id.
for kind, frame in (("theme", theme_emb), ("topic", topic_emb)):
    num_unmapped = int(frame.iloc[:, 0].isna().sum())
    if num_unmapped:
        print(f"Warning: {num_unmapped} {kind} name(s) have no id in {topics_themes_id_file}.")

merged_data = pd.concat([theme_emb, topic_emb])
# Sort by id.
sorted_data = merged_data.sort_values(by=merged_data.columns[0])
# Save the combined table to a new file.
sorted_data.to_csv(topics_themes_id_emb_file, index=False)

### 对齐topic id

In [None]:
# %% 将 top5文件中的topic id 与 处理好的数据集对齐
import pandas as pd
import json
import ast
import os

# v2.5
# NOTE(review): absolute, machine-specific path; the similarity-matrix cell
# reads PROC_topic_sim.xlsx from './v2.5' -- confirm both refer to the same data.
root_path = '/mnt/new_pfs/liming_team/auroraX/caoying/Graph-Networks/topic_data/v2.5'  # original location
# root_path = './v2.5'                                                                 # local clone
tid_path = os.path.join(root_path, 'PROC_id4topics.json')     # reference topic-id dictionary
# tid_path = os.path.join(root_path, 'topics_themes_id.json')   # reference topic & theme id dictionary
topk = os.path.join(root_path, 'topic_sim.xlsx')           # top-5 similarity file (topic)
# topk = os.path.join(root_path, 'theme_sim.xlsx')         # top-5 similarity file (theme)
out = os.path.join(root_path, 'PROC_topic_sim.xlsx')       # corrected output file (topic)

# The keys are non-ASCII names, so the encoding is stated explicitly.
with open(tid_path, 'r', encoding='utf-8') as file:
    tid = json.load(file)  # dict: name -> reference id

df = pd.read_excel(topk, sheet_name='Sheet1')

# Strip the leading 'topic_' prefix from every name column.
name_cols = ['节点实际内容'] + [f'最相似的节点 {rank}' for rank in range(1, 6)]
for col in name_cols:
    df[col] = df[col].str.replace(r'^topic_', '', regex=True)

# Sanity check: the names in the first len(tid) rows must be exactly the keys of tid.
ind_col = df.columns.get_loc('节点实际内容')
if set(df.iloc[:len(tid), ind_col].tolist())==set(tid.keys()):
    print('check pass')
else:
    print('Dismatch!')

# name -> id as written in the top-k file
dict_topk = dict(zip(df['节点实际内容'], df['id']))

# Projection: id used in the top-k file -> reference id from tid.
dict_proj = {topk_id: tid[name] for name, topk_id in dict_topk.items()}

# Rewrite the ids of every row (neighbour list and the row's own id).
for i in range(df.shape[0]):
    temp = df['最相似的前5个节点id及相似度'].iloc[i]
    temp = ast.literal_eval(temp)  # str -> list of (id, similarity)
    temp_fixed = [(dict_proj[_id], _val) for _id, _val in temp]
    # Store the corrected list back as a string, then fix the row's own id.
    df.loc[i, '最相似的前5个节点id及相似度'] = str(temp_fixed)
    df.loc[i, 'id'] = dict_proj[df.loc[i, 'id']]

# Save
df.to_excel(out, index=False)

### 生成相似度稀疏矩阵

In [6]:
import pandas as pd
import ast
import numpy as np
import os


# v2.5
# root_path = '/mnt/new_pfs/liming_team/auroraX/caoying/Graph-Networks/topic_data/v2.5'  # original location
root_path = './v2.5'                                                                # local clone

sim_table_path = os.path.join(root_path, 'PROC_topic_sim.xlsx')
print(sim_table_path)
# Read only the two columns needed from the Excel file.
df = pd.read_excel(sim_table_path, usecols=['id','最相似的前5个节点id及相似度'])

# Cells stored as strings are parsed with ast.literal_eval; other values are kept as they are.
ind_cols = df['id'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
info_raws = df['最相似的前5个节点id及相似度'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Plain Python lists of the two columns.
ind_cols = ind_cols.tolist()
info_raws = info_raws.tolist()

# Build the sparse similarity matrix. Ids are used directly as indices, so the
# matrix must reach the largest id that occurs; sizing it only by the number of
# rows fails with IndexError as soon as an id exceeds the row count. When the
# ids are exactly 1..N this is the same (N+1, N+1) shape as before
# (original author's note: +1 because ids start at 1, e.g. (479, 479)).
all_ids = [int(i) for i in ind_cols]
all_ids += [int(ind_raw) for info_raw in info_raws for ind_raw, _ in info_raw]
matrix_size = max(len(ind_cols), max(all_ids, default=0)) + 1
sim_matrix = np.zeros((matrix_size, matrix_size))
for ind_col, info_raw in zip(ind_cols, info_raws):
    for ind_raw, val in info_raw:
        # row = neighbour id, column = id of the node the neighbour list belongs to
        sim_matrix[ind_raw, ind_col] = val

# Save
print(sim_matrix)
print(np.max(sim_matrix), np.min(sim_matrix))
np.save(os.path.join(root_path, 'sim_mat.npy'), sim_matrix)

# Open question from the original author: whether the diagonal should be filled.
# Currently it is not; per the author the diagonal is already handled in the
# regular pipeline.
# Row 0 and column 0 of the saved matrix stay all-zero when ids start at 1.

./v2.5/PROC_topic_sim.xlsx
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
0.80375594 0.0
