### 构建Graph

In [None]:
import json
import numpy as np
import networkx as nx
from sklearn.preprocessing import LabelEncoder

jsonl_file = "/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data/en_title_100w_audio_filter_v2.jsonl"

# 读取jsonl文件并提取theme_1和topic的关系
edges = []
themes = set()
topics = set()

with open(jsonl_file, 'r') as f:
    for line in f:
        data = json.loads(line)
        theme_1 = f"theme_{data['theme_1']}"  # 添加前缀 "theme_"
        topic_list = data['topic_map'] 
        
        # 如果 topic 是字符串而非列表，转换为单元素列表
        if isinstance(topic_list, str):
            topic_list = [topic_list]
        # 如果 topic 是列表，一一对应
        for topic in topic_list:
            topic = f"topic_{topic}"  # 添加前缀 "topic_"
            print(f"Processed: theme_1 = {theme_1}, topic = {topic}")
            edges.append((theme_1, topic))
            themes.add(theme_1)
            topics.add(topic)

G = nx.Graph()
G.add_edges_from(edges)

# 为每个节点添加标签
for theme in themes:
    G.nodes[theme]['label'] = 'theme'  # 给 theme 节点添加标签

for topic in topics:
    G.nodes[topic]['label'] = 'topic'  # 给 topic 节点添加标签

# 构建节点特征矩阵和邻接矩阵
nodes = list(G.nodes)
adj_matrix = nx.adjacency_matrix(G, nodelist=nodes).todense()

label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(nodes)
num_classes = len(label_encoder.classes_)
feature_matrix = np.eye(num_classes)[labels]

cites_lines = []
for edge in G.edges():
    cites_line = f"{edge[0]}\t{edge[1]}"
    cites_lines.append(cites_line)

# 保存cora.cites文件
with open("/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data/cora_819_alldata.cites", "w") as f:
    for line in cites_lines:
        f.write(line + '\n')

content_lines = []
node_ids = list(G.nodes)
for node_id in node_ids:
    attributes = [1 if G.has_edge(node_id, other_id) or G.has_edge(other_id, node_id) else 0 for other_id in node_ids]
    data = G.nodes[node_id]
    label = data['label']
    content_line = f"{node_id}\t{' '.join(map(str, attributes))}\t{label}"
    content_lines.append(content_line)

# 保存cora.content文件
with open("/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data/cora_819_alldata.content", "w") as f:
    for line in content_lines:
        f.write(line + '\n')

print("cora.cites 和 cora.content 文件已保存。")

### 检查构建的Graph

In [None]:
import networkx as nx

# 读取cora.cites文件，构建图
cites_file = "/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data/cora_819_alldata.cites"
edges = []

with open(cites_file, 'r') as f:
    for line in f:
        source, target = line.strip().split()
        edges.append((source, target))

# 使用NetworkX创建图
G = nx.Graph()
G.add_edges_from(edges)

# 读取cora.content文件并统计每个节点的度数
content_file = "/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data/cora_819_alldata.content"
node_degrees = {}

with open(content_file, 'r') as f:
    for line in f:
        node_id = line.strip().split()[0]  # 获取节点ID
        node_degrees[node_id] = G.degree[node_id] if node_id in G else 0  # 获取度数，如果节点不在图中则为0

# 打印每个节点的度数
for node_id, degree in node_degrees.items():
    print(f"节点 {node_id} 有 {degree} 条边")
# 打印图中总的边数
total_edges = G.number_of_edges()
print(f"图中一共有 {total_edges} 条边")

### Deepwalk生成嵌入

In [2]:
import torch
import numpy as np
import os
import random
import pandas as pd
import scipy.sparse as sp
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder
import networkx as nx
from gensim.models import Word2Vec
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from joblib import Parallel, delayed
from tqdm import tqdm 
import toml
with open('config.toml', 'r') as f:
    config = toml.load(f)
version = "v2"

data_path = config[f'data_path_{version}']
content_data = config[f'content_{version}']
cites_data = config[f'cites_{version}']
model_save_path = config['model_save_path']
png_path = config['png_path']
walks_save_path = config['walks_save_path']
dimensions = int(config['dimensions'])
theme_emb_path = config['theme_emb_path']
topic_emb_path = config['topic_emb_path']


def seed_everything(seed=2023):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

seed_everything()

def load_cora_data(data_path=data_path):
    content_df = pd.read_csv(os.path.join(data_path, content_data), delimiter="\t", header=None)
    content_df.set_index(0, inplace=True)
    print(content_df.shape) 
    index = content_df.index.tolist()

    features = np.array([list(map(float, row[0].split())) for row in content_df.iloc[:, :-1].values])
    features = sp.csr_matrix(features, dtype=np.float32)

    labels = content_df.values[:, -1]
    print(len(labels))
    class_encoder = LabelEncoder()
    labels = class_encoder.fit_transform(labels)

    cites_df = pd.read_csv(os.path.join(data_path, cites_data), delimiter="\t", header=None)
    cites_df[0] = cites_df[0].astype(str)
    cites_df[1] = cites_df[1].astype(str)
    cites = [tuple(x) for x in cites_df.values]
    edges = [(index.index(cite[0]), index.index(cite[1])) for cite in cites]
    edges = np.array(edges).T
    
    data = Data(x=torch.from_numpy(np.array(features.todense())),
                edge_index=torch.LongTensor(edges),
                y=torch.from_numpy(labels))
    if version == "v1":
        idx_train = range(400)
        idx_val = range(400, 500)
        idx_test = range(500, 539)
    elif version == "v2":
        idx_train = range(242)
        idx_val = range(242)
        idx_test = range(242)

    def index_to_mask(index, size):
        mask = np.zeros(size, dtype=bool)
        mask[index] = True
        return mask

    data.train_mask = index_to_mask(idx_train, size=labels.shape[0])
    data.val_mask = index_to_mask(idx_val, size=labels.shape[0])
    data.test_mask = index_to_mask(idx_test, size=labels.shape[0])

    def to_networkx(data):
        edge_index = data.edge_index.to(torch.device('cpu')).numpy()
        G = nx.Graph()
        for src, tar in edge_index.T:
            G.add_edge(index[src], index[tar])
        return G

    networkx_data = to_networkx(data)
    print(networkx_data)

    return data, networkx_data, index


pyg_data, networkx_data, node_index = load_cora_data()

def deepwalk(graph, num_walks, walk_length, save_path):
    def random_walk(start_node):
        walk = [start_node]
        current_node = start_node
        for _ in range(walk_length - 1):
            neighbors = list(graph.neighbors(current_node))
            if neighbors:
                current_node = random.choice(neighbors)
                walk.append(current_node)
            else:
                break
        return walk

    walks = []
    nodes = list(graph.nodes())
    
    for node in tqdm(nodes, desc="Generating random walks"):
        results = Parallel(n_jobs=-1)(delayed(random_walk)(node) for _ in range(num_walks))
        walks.extend(results)
    
    with open(save_path, 'w') as f:
        for walk in tqdm(walks, desc="Saving random walks"):
            f.write(" ".join(map(str, walk)) + "\n")
    
    return walks


def DeepWalk_run(networkx_data, dimensions=64, walk_length=10, num_walks=50, window_size=10, model_save_path="deepwalk.model", walks_save_path="random_walks.txt"):
    print('walk_length:', walk_length)
    walks = deepwalk(networkx_data, num_walks=num_walks, walk_length=walk_length, save_path=walks_save_path)
    model = Word2Vec(walks, vector_size=int(dimensions), window=window_size, min_count=1, sg=1, workers=4)
    nodes = model.wv.index_to_key
    embeddings = model.wv[nodes]
    
    # 保存模型
    model.save(model_save_path)
    
    return model, nodes, embeddings

def plot_embeddings(embeddings, labels, output_path):
    tsne = TSNE(n_components=2)
    reduced_embeddings = tsne.fit_transform(embeddings)

    plt.figure(figsize=(10, 10))
    scatter = plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=labels, cmap="jet", alpha=0.6)
    legend1 = plt.legend(*scatter.legend_elements(), title="Classes")
    plt.gca().add_artist(legend1)
    plt.savefig(output_path)
    plt.close()

def save_embeddings(embeddings, node_index):
    if len(embeddings) != len(node_index):
        raise ValueError("The length of embeddings and node_index must match.")

    embeddings_df = pd.DataFrame(embeddings)
    embeddings_df['index'] = node_index
    # 将 index 列移动到第一列
    embeddings_df = embeddings_df[['index'] + [col for col in embeddings_df.columns if col != 'index']]
    # 分离开头是'theme_'的数据和开头是'topic_'的数据
    theme_df = embeddings_df[embeddings_df['index'].str.startswith('theme_')]
    topic_df = embeddings_df[embeddings_df['index'].str.startswith('topic_')]
    theme_df.to_csv(os.path.join(data_path, theme_emb_path), index=False)
    topic_df.to_csv(os.path.join(data_path, topic_emb_path), index=False)

_, _, deepwalk_embeddings = DeepWalk_run(networkx_data, dimensions=dimensions, walk_length=10, num_walks=200, 
                                        model_save_path=os.path.join(data_path, model_save_path),
                                        walks_save_path=os.path.join(data_path, walks_save_path))
print("DeepWalk_embeddings:", np.array(deepwalk_embeddings).shape)
save_embeddings(deepwalk_embeddings, node_index)


# plot_embeddings(deepwalk_embeddings, pyg_data.y.numpy(), png_path)


(242, 2)
242
Graph with 242 nodes and 3585 edges
walk_length: 10


Generating random walks: 100%|██████████| 242/242 [00:48<00:00,  4.99it/s]
Saving random walks: 100%|██████████| 48400/48400 [00:00<00:00, 478240.10it/s]


DeepWalk_embeddings: (242, 128)


### 分别保存theme和topic的相似度矩阵

In [3]:
import os
import pandas as pd
import numpy as np
import torch
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import toml

with open('config.toml', 'r') as f:
    config = toml.load(f)

version = "v2"

data_path = config[f'data_path_{version}']
content_data = config[f'content_{version}']
cites_data = config[f'cites_{version}']
model_save_path = config['model_save_path']
theme_sim_path = config['theme_sim_path']
topic_sim_path = config['topic_sim_path']

class configs():
    def __init__(self):
        self.data_path = data_path
        self.model_path = os.path.join(data_path, model_save_path)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cfg = configs()

def load_cora_data(data_path=data_path):
    content_df = pd.read_csv(os.path.join(data_path, content_data), delimiter="\t", header=None)
    content_df.set_index(0, inplace=True)
    index = content_df.index.astype(str).tolist()  # 使用字符串索引

    original_content = content_df.iloc[:, -1].tolist()

    return original_content, index

def load_model(model_path):
    model = Word2Vec.load(model_path)
    return model

def find_similar_vectors(model, target_node, node_index, top_k=5):
    if target_node.startswith('topic_'):
        relevant_nodes = [node for node in model.wv.index_to_key if node.startswith('topic_')]
    elif target_node.startswith('theme_'):
        relevant_nodes = [node for node in model.wv.index_to_key if node.startswith('theme_')]
    else:
        raise ValueError(f"Node '{target_node}' does not start with 'topic_' or 'theme_'")
    
    # 获取目标节点的向量
    try:
        target_vector = model.wv[target_node]
    except KeyError:
        raise KeyError(f"Key '{target_node}' not present in the model vocabulary")
    relevant_vectors = np.array([model.wv[node] for node in relevant_nodes])
    similarities = cosine_similarity([target_vector], relevant_vectors)[0]
    
    target_index = relevant_nodes.index(target_node)
    similarities[target_index] = -np.inf
    
    # 找到最相似的 top_k 个节点
    top_k_indices = similarities.argsort()[-top_k:][::-1]
    top_k_similarities = similarities[top_k_indices]
    top_k_nodes = [relevant_nodes[i] for i in top_k_indices]
    
    # print("relevant_nodes:")
    # print(relevant_nodes)
    # print("top_k_nodes:")
    # print(top_k_nodes)
    # 将最相似的节点转换为ID
    top_k_ids = [node_index.index(node) for node in top_k_nodes]

    return top_k_ids, top_k_nodes, top_k_similarities

if __name__ == '__main__':
    original_content, node_index = load_cora_data(cfg.data_path)
    # print("node_index:")
    # print(node_index)
    
    model_path = cfg.model_path
    model = load_model(model_path)

    topic_results = []
    theme_results = []

    for idx, target_node in enumerate(node_index):
        if not target_node.startswith('topic_') and not target_node.startswith('theme_'):
            continue  # 忽略非 topic_ 和 theme_ 开头的节点
        
        top_k_ids, top_k_nodes, top_k_similarities = find_similar_vectors(model, target_node, node_index)
        
        result = {
            'id': idx+1,
            '节点实际内容': target_node,
            '最相似的前5个节点id及相似度': [(top_k_ids[i]+1, top_k_similarities[i]) for i in range(5)]
        }
        for i in range(5):
            result[f'最相似的节点 {i+1}'] = top_k_nodes[i]
        
        if target_node.startswith('topic_'):
            topic_results.append(result)
        elif target_node.startswith('theme_'):
            theme_results.append(result)

    # 将结果保存到 DataFrame 并导出为 Excel 文件
    if topic_results:
        topic_results_df = pd.DataFrame(topic_results)
        topic_output_file_path = os.path.join(data_path, topic_sim_path)
        topic_results_df.to_excel(topic_output_file_path, index=False)
        print(f"Topic results have been saved to {topic_output_file_path}.")

    if theme_results:
        theme_results_df = pd.DataFrame(theme_results)
        theme_output_file_path = os.path.join(data_path, theme_sim_path)
        theme_results_df.to_excel(theme_output_file_path, index=False)
        print(f"Theme results have been saved to {theme_output_file_path}.")


Topic results have been saved to /mnt/new_pfs/liming_team/auroraX/caoying/Graph-Networks/topic_data/v2/topic_sim.xlsx.
Theme results have been saved to /mnt/new_pfs/liming_team/auroraX/caoying/Graph-Networks/topic_data/v2/theme_sim.xlsx.


### 将 theme 按序拼接到 topic_id文件中
#### grapth中需要theme节点，所以需要将theme节点id添加到topic_id文件中

In [13]:

import os
import json
import csv
import toml
with open('config.toml', 'r') as f:
    config = toml.load(f)
version = "v2"

data_path = config[f'data_path_{version}']
id4topics = config['id4topics']
topics_themes_id = config['topics_themes_id']
theme_emb_path = config['theme_emb_path']

topic_id_file = os.path.join(data_path, id4topics)
with open(topic_id_file, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)
# 获取当前最大序号
max_index = max(data.values())

# 读取 theme 文件并提取第一列的名字
theme_emb_file = os.path.join(data_path, theme_emb_path)
names = []
with open(theme_emb_file, 'r', encoding='utf-8') as theme_emb:
    reader = csv.reader(theme_emb)
    names = [row[0] for row in reader if row[0].startswith('theme_')]

# 将提取的名字按顺序拼接到 JSON 对象中
for name in names:
    max_index += 1
    data[name] = max_index

# 将更新后的 JSON 对象写回文件
topics_themes_file = os.path.join(data_path, topics_themes_id)
with open(topics_themes_file, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=4)

print("JSON 文件已成功更新并保存为 topics_themes_id.json")

JSON 文件已成功更新并保存为 topics_themes_id.json


### 将边文件 cora_819_alldata.cites 中的theme 和 topic转化为id映射

In [21]:
import os
import json
import csv
import toml

with open('config.toml', 'r') as f:
    config = toml.load(f)
version = "v2"


data_path = config[f'data_path_{version}']
content_data = config[f'content_{version}']
cites_data = config[f'cites_{version}']

# 需要注意的是topics_themes_id中的 id是从 1 开始的，但是 gnn中要求是从 0 开始的，所以cities映射的时候，要将 id-1 处理
# 读取JSON文件并创建名字到ID的映射
with open(topics_themes_file, 'r', encoding='utf-8') as json_file:
    name_to_id = json.load(json_file)
cities_file = os.path.join(data_path, cites_data)
# 读取cities文件并进行名字到ID的映射转换
with open(cities_file, 'r', encoding='utf-8') as cities_file:
    edges = cities_file.readlines()

edges_with_id = []
for edge in edges:
    node1, node2 = edge.strip().split('\t')
    if node1.startswith('topic_') :
        node1 = node1[len('topic_'):]
        
    if node2.startswith('topic_') :
        node2 = node2[len('topic_'):]
    if node1 in name_to_id and node2 in name_to_id:
        node1_id = name_to_id.get(node1) - 1
        node2_id = name_to_id.get(node2) - 1
        edges_with_id.append(f"{node1_id}\t{node2_id}\n")
    else:
        print(f"Warning: One of the nodes '{node1}' or '{node2}' is not found in the JSON mapping.")

# 将转换后的边写入新的文件
cites_id_path = os.getenv('cites_id')
cites_id_file = os.path.join(data_path, cites_id_path)
with open(cites_id_file, 'w', encoding='utf-8') as updated_cities_file:
    updated_cities_file.writelines(edges_with_id)
       
    

### 将两个embedding文件的topic/theme名字改为id，按id序保存在一个文件中

In [15]:
# 将两个embedding文件的名字改为id，按序保存
import os
import json
import csv
import pandas as pd
import toml

with open('config.toml', 'r') as f:
    config = toml.load(f)
version = "v2"

data_path = config[f'data_path_{version}']
theme_emb_path = config[f'theme_emb_path']
topic_emb_path = config[f'topic_emb_path']
topics_themes_id = config[f'topics_themes_id']
topics_themes_id_emb = config[f'topics_themes_id_emb']

theme_emb_file = os.path.join(data_path, theme_emb_path)
topic_emb_file = os.path.join(data_path, topic_emb_path)
topics_themes_id_file = os.path.join(data_path, topics_themes_id)
topics_themes_id_emb_file = os.path.join(data_path, topics_themes_id_emb)

# 读取JSON文件
with open(topics_themes_id_file, 'r', encoding='utf-8') as f:
    name_to_id = json.load(f)

theme_emb = pd.read_csv(theme_emb_file,header=0)
topic_emb = pd.read_csv(topic_emb_file,header=0)

theme_emb.iloc[:, 0] = theme_emb.iloc[:, 0].map(name_to_id).apply(lambda x: x - 1 if pd.notna(x) else x)
topic_emb.iloc[:, 0] = topic_emb.iloc[:, 0].str.replace('topic_', '')
topic_emb.iloc[:, 0] = topic_emb.iloc[:, 0].map(name_to_id).apply(lambda x: x - 1 if pd.notna(x) else x)

merged_data = pd.concat([theme_emb, topic_emb])
# 按照 ID 排序
sorted_data = merged_data.sort_values(by=merged_data.columns[0])
# 保存到一个新的文件中
sorted_data.to_csv(topics_themes_id_emb_file, index=False)

### 可视化：绘制heatmap

In [29]:
import os
import pandas as pd
import numpy as np
import torch
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import font_manager

class configs():
    def __init__(self):
        self.data_path = r'/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data'
        self.model_path = r'/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data/deepwalk819_alldata1.model'
        self.embedding_dim = 128
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.font_path = r'/mnt/new_pfs/liming_team/auroraX/chenlong/testdata/simhei.ttf'  # 字体路径

cfg = configs()

def load_cora_data(data_path='/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data'):
    content_df = pd.read_csv(os.path.join(data_path, "cora_819_alldata.content"), delimiter="\t", header=None)
    content_df.set_index(0, inplace=True)
    index = content_df.index.astype(str).tolist()  # 使用字符串索引

    original_content = content_df.iloc[:, -1].tolist()

    return original_content, index

def load_model(model_path):
    model = Word2Vec.load(model_path)
    return model

def find_similar_vectors(model, target_node, node_index, top_k=5):
    # 获取目标节点的类别（topic_ 或 theme_）
    if target_node.startswith('topic_'):
        relevant_nodes = [node for node in model.wv.index_to_key if node.startswith('topic_')]
    elif target_node.startswith('theme_'):
        relevant_nodes = [node for node in model.wv.index_to_key if node.startswith('theme_')]
    else:
        raise ValueError(f"Node '{target_node}' does not start with 'topic_' or 'theme_'")
    
    # 获取目标节点的向量
    try:
        target_vector = model.wv[target_node]
    except KeyError:
        raise KeyError(f"Key '{target_node}' not present in the model vocabulary")

    # 计算所有相关节点的相似度
    relevant_vectors = np.array([model.wv[node] for node in relevant_nodes])
    similarities = cosine_similarity([target_vector], relevant_vectors)[0]
    
    # 设置自身相似度为负无穷大，确保不会被选中
    target_index = relevant_nodes.index(target_node)
    similarities[target_index] = -np.inf
    
    # 找到最相似的 top_k 个节点
    top_k_indices = similarities.argsort()[-top_k:][::-1]
    top_k_similarities = similarities[top_k_indices]
    top_k_nodes = [relevant_nodes[i] for i in top_k_indices]

    # 将最相似的节点转换为ID
    top_k_ids = [node_index.index(node) for node in top_k_nodes]

    return top_k_ids, top_k_nodes, top_k_similarities

def plot_similarity_heatmap(nodes, model, output_path, font_path):
    # 加载自定义字体
    prop = font_manager.FontProperties(fname=font_path)
    
    # 计算相似度矩阵
    vectors = np.array([model.wv[node] for node in nodes])
    similarity_matrix = cosine_similarity(vectors)

    # 画热力图
    plt.figure(figsize=(20, 16))
    sns.heatmap(similarity_matrix, 
                xticklabels=nodes, 
                yticklabels=nodes, 
                cmap="RdYlBu_r",  # 使用红黄蓝颜色渐变
                annot=False, 
                square=True,  # 确保每个单元格是正方形
                cbar_kws={'label': 'similarity'}, 
                linewidths=.5)  # 增加单元格之间的线条分割
    
    plt.title("theme without weight", fontproperties=prop, fontsize=20, pad=20)
    plt.xticks(rotation=45, ha="right", fontproperties=prop, fontsize=12)
    plt.yticks(rotation=0, fontproperties=prop, fontsize=12)
    
    plt.tight_layout()  # 调整布局以适应标签
    plt.savefig(output_path)
    plt.show()

if __name__ == '__main__':
    original_content, node_index = load_cora_data(cfg.data_path)
    
    # 加载已训练的DeepWalk模型
    model_path = cfg.model_path
    model = load_model(model_path)
    # 打印模型中的所有节点
    all_nodes = model.wv.index_to_key
    print("模型词汇表中的所有节点:")
    print(all_nodes)

    # 分别处理 topic_ 和 theme_ 的节点
    topic_results = []
    theme_nodes = []

    for idx, target_node in enumerate(node_index):
        if target_node.startswith('topic_'):
            top_k_ids, top_k_nodes, top_k_similarities = find_similar_vectors(model, target_node, node_index)
            
            result = {
                'id': idx+1,
                '节点实际内容': target_node,
                '最相似的前5个节点id及相似度': [(top_k_ids[i], top_k_similarities[i]) for i in range(5)]
            }
            for i in range(5):
                result[f'最相似的节点 {i+1}'] = top_k_nodes[i]
            
            topic_results.append(result)
        elif target_node.startswith('theme_'):
            theme_nodes.append(target_node)

    # 保存 topic_ 节点的相似度结果
    topic_results_df = pd.DataFrame(topic_results)
    topic_output_file_path = '/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data/deepwalk_alldata_topic1.xlsx'
    topic_results_df.to_excel(topic_output_file_path, index=False)
    print(f"Topic results have been saved to {topic_output_file_path}.")

    # 画 theme_ 节点的相似度热力图
    if len(theme_nodes) == 61:
        heatmap_output_path = '/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data/theme_similarity_819.png'
        plot_similarity_heatmap(theme_nodes, model, heatmap_output_path, cfg.font_path)
        print(f"Theme similarity heatmap saved to {heatmap_output_path}.")
    else:
        print(f"Error: Expected 61 theme_ nodes, but found {len(theme_nodes)}.")


模型词汇表中的所有节点:
['theme_旅游与交通', 'theme_人际交往', 'theme_兴趣与爱好', 'theme_日常活动', 'theme_工作', 'theme_节假日活动', 'theme_计划与安排', 'theme_安全与救护', 'theme_活动安排', 'theme_学习', 'theme_历史与社会', 'theme_地点', 'theme_购物', 'theme_卫生与健康', 'theme_服务', 'theme_个人信息', 'theme_学校', 'theme_求助', 'theme_亲情', 'theme_文娱与体育', 'theme_人物描写', 'theme_居住情况', 'theme_家庭、朋友与周围的人', 'theme_科普知识与现代技术', 'theme_物品', 'theme_娱乐', 'theme_经历', 'theme_教育', 'theme_通讯', 'theme_文学与艺术', 'theme_自我介绍', 'theme_邀请', 'theme_情感与情绪', 'theme_人物故事', 'theme_时间', 'theme_饮食', 'theme_天气', 'theme_压力', 'theme_影视形象', 'theme_动物', 'theme_世界与环境', 'theme_友情', 'theme_周围的环境', 'topic_个人特征与生活', 'theme_媒体', 'topic_学生与教育', 'topic_日常生活', 'theme_环保', 'topic_家庭与日常事务', 'topic_社交与活动', 'topic_学习与阅读', 'topic_个人与社交', 'topic_交通', 'topic_家庭', 'theme_服装', 'topic_体育与健身', 'topic_餐饮与购物', 'theme_科学幻想', 'theme_自然', 'topic_教育与学生', 'topic_位置与导航', 'topic_公共设施与行为', 'topic_家庭与儿童', 'topic_影视与娱乐', 'topic_教育与校园生活', 'topic_教育及校园生活', 'topic_个人生活', 'topic_休闲与旅游', 'topic_旅行与假期', 'topic_社交沟通', 'topic_人

  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()


Theme similarity heatmap saved to /mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data/theme_similarity_819.png.


In [79]:
import os
import pandas as pd
import numpy as np
import torch
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import font_manager

class configs():
    def __init__(self):
        self.data_path = r'/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data'
        self.model_path = r'/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data/deepwalk819_alldata1.model'
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.font_path = r'/mnt/new_pfs/liming_team/auroraX/chenlong/testdata/simhei.ttf'  # 字体路径
        self.similarity_threshold = 0.5

cfg = configs()

def load_cora_data(data_path='/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data'):
    content_df = pd.read_csv(os.path.join(data_path, "cora_819_alldata.content"), delimiter="\t", header=None)
    content_df.set_index(0, inplace=True)
    index = content_df.index.astype(str).tolist()  # 使用字符串索引

    original_content = content_df.iloc[:, -1].tolist()

    return original_content, index

def load_model(model_path):
    model = Word2Vec.load(model_path)
    return model

def normalize_similarity(similarities, max_value, preserve_diagonal=False):
    min_sim = np.min(similarities)
    max_sim = np.max(similarities)
    # 如果最大值和最小值相等，则所有值都是一样的，直接返回max_value
    if max_sim == min_sim:
        return np.full_like(similarities, max_value)
    normalized_similarities = max_value * (similarities - min_sim) / (max_sim - min_sim)
    
    # 如果要保留对角线的值，恢复对角线元素
    if preserve_diagonal:
        np.fill_diagonal(normalized_similarities, max_value)
    
    return normalized_similarities

def plot_similarity_heatmap(nodes, model, output_path, font_path, similarity_threshold):
    # 加载自定义字体
    prop = font_manager.FontProperties(fname=font_path)
    
    # 计算相似度矩阵
    vectors = np.array([model.wv[node] for node in nodes])
    similarity_matrix = cosine_similarity(vectors)
    print(similarity_matrix)
    
    # 设置对角线元素为相似度阈值
    np.fill_diagonal(similarity_matrix, similarity_threshold)

    # 将相似度矩阵中大于相似度阈值的值设置为相似度阈值
    similarity_matrix[similarity_matrix > similarity_threshold] = similarity_threshold

    # 归一化相似度到0到相似度阈值之间，同时保留对角线元素
    similarity_matrix = normalize_similarity(similarity_matrix, similarity_threshold, preserve_diagonal=True)
    print(similarity_matrix)
    
    # 画热力图
    plt.figure(figsize=(20, 16))
    sns.heatmap(similarity_matrix, 
                xticklabels=nodes, 
                yticklabels=nodes, 
                cmap="RdYlBu_r",  # 使用红黄蓝颜色渐变
                annot=False, 
                square=True,  # 确保每个单元格是正方形
                cbar_kws={'label': 'similarity'}, 
                linewidths=.5)  # 增加单元格之间的线条分割
    
    plt.title("theme without weight", fontproperties=prop, fontsize=20, pad=20)
    plt.xticks(rotation=45, ha="right", fontproperties=prop, fontsize=12)
    plt.yticks(rotation=0, fontproperties=prop, fontsize=12)
    
    plt.tight_layout()  # 调整布局以适应标签
    plt.savefig(output_path)
    plt.show()

if __name__ == '__main__':
    original_content, node_index = load_cora_data(cfg.data_path)
    
    # 加载已训练的DeepWalk模型
    model_path = cfg.model_path
    model = load_model(model_path)
    # 打印模型中的所有节点
    all_nodes = model.wv.index_to_key
    print("模型词汇表中的所有节点:")
    print(all_nodes)

    # 处理 theme_ 的节点
    theme_nodes = []

    for idx, target_node in enumerate(node_index):
        if target_node.startswith('theme_'):
            theme_nodes.append(target_node)

    if len(theme_nodes) == 61:
        heatmap_output_path = '/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data/theme_similarity_819_05.png'
        plot_similarity_heatmap(theme_nodes, model, heatmap_output_path, cfg.font_path, cfg.similarity_threshold)
        print(f"Theme similarity heatmap saved to {heatmap_output_path}.")
    else:
        print(f"Error: Expected 61 theme_ nodes, but found {len(theme_nodes)}.")


模型词汇表中的所有节点:
['theme_旅游与交通', 'theme_人际交往', 'theme_兴趣与爱好', 'theme_日常活动', 'theme_工作', 'theme_节假日活动', 'theme_计划与安排', 'theme_安全与救护', 'theme_活动安排', 'theme_学习', 'theme_历史与社会', 'theme_地点', 'theme_购物', 'theme_卫生与健康', 'theme_服务', 'theme_个人信息', 'theme_学校', 'theme_求助', 'theme_亲情', 'theme_文娱与体育', 'theme_人物描写', 'theme_居住情况', 'theme_家庭、朋友与周围的人', 'theme_科普知识与现代技术', 'theme_物品', 'theme_娱乐', 'theme_经历', 'theme_教育', 'theme_通讯', 'theme_文学与艺术', 'theme_自我介绍', 'theme_邀请', 'theme_情感与情绪', 'theme_人物故事', 'theme_时间', 'theme_饮食', 'theme_天气', 'theme_压力', 'theme_影视形象', 'theme_动物', 'theme_世界与环境', 'theme_友情', 'theme_周围的环境', 'topic_个人特征与生活', 'theme_媒体', 'topic_学生与教育', 'topic_日常生活', 'theme_环保', 'topic_家庭与日常事务', 'topic_社交与活动', 'topic_学习与阅读', 'topic_个人与社交', 'topic_交通', 'topic_家庭', 'theme_服装', 'topic_体育与健身', 'topic_餐饮与购物', 'theme_科学幻想', 'theme_自然', 'topic_教育与学生', 'topic_位置与导航', 'topic_公共设施与行为', 'topic_家庭与儿童', 'topic_影视与娱乐', 'topic_教育与校园生活', 'topic_教育及校园生活', 'topic_个人生活', 'topic_休闲与旅游', 'topic_旅行与假期', 'topic_社交沟通', 'topic_人

  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()


Theme similarity heatmap saved to /mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data/theme_similarity_819_05.png.


### 细节调整

In [72]:
import json
import numpy as np
import networkx as nx
from sklearn.preprocessing import LabelEncoder

# 假设jsonl文件为 "data.jsonl"
jsonl_file = "/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data/en_title_100w_audio_filter_v2.jsonl"

# 读取jsonl文件并提取theme_1和topic的关系
edges = []
themes = set()
topics = set()
edge_weights = {}

with open(jsonl_file, 'r') as f:
    for line in f:
        data = json.loads(line)
        theme_1 = f"theme_{data['theme_1']}"  # 添加前缀 "theme_"
        topic = f"topic_{data['topic']}"  # 添加前缀 "topic_"
        print(f"Processed: theme_1 = {theme_1}, topic = {topic}")

        edge = (theme_1, topic)
        # 如果边已存在，则增加权重
        if edge in edge_weights:
            edge_weights[edge] += 1
        else:
            edge_weights[edge] = 1
        
        themes.add(theme_1)
        topics.add(topic)

# 创建一个双向图
G = nx.Graph()

# 根据加权边添加到图中
for edge, weight in edge_weights.items():
    G.add_edge(edge[0], edge[1], weight=weight)

# 为每个节点添加标签
for theme in themes:
    G.nodes[theme]['label'] = 'theme'  

for topic in topics:
    G.nodes[topic]['label'] = 'topic' 

# 构建节点特征矩阵和邻接矩阵
nodes = list(G.nodes)
adj_matrix = nx.adjacency_matrix(G, nodelist=nodes).todense()

label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(nodes)
num_classes = len(label_encoder.classes_)
feature_matrix = np.eye(num_classes)[labels]

# 保存加权的 cora.cites 文件
cites_lines = []
for edge in G.edges(data=True):
    cites_line = f"{edge[0]}\t{edge[1]}\t{edge[2]['weight']}"
    cites_lines.append(cites_line)

with open("/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data/cora_weight.cites", "w") as f:
    for line in cites_lines:
        f.write(line + '\n')

# 保存 cora.content 文件
content_lines = []
node_ids = list(G.nodes)
for node_id in node_ids:
    attributes = [1 if G.has_edge(node_id, other_id) or G.has_edge(other_id, node_id) else 0 for other_id in node_ids]
    data = G.nodes[node_id]
    label = data['label']
    content_line = f"{node_id}\t{' '.join(map(str, attributes))}\t{label}"
    content_lines.append(content_line)

with open("/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data/cora_weight.content", "w") as f:
    for line in content_lines:
        f.write(line + '\n')

print("cora.cites 和 cora.content 文件已保存。")


Processed: theme_1 = theme_亲情, topic = topic_家庭与关系
Processed: theme_1 = theme_物品, topic = topic_物品与消费
Processed: theme_1 = theme_饮食, topic = topic_饮食与健康
Processed: theme_1 = theme_人物描写, topic = topic_个人特征与生活
Processed: theme_1 = theme_亲情, topic = topic_家庭与关系
Processed: theme_1 = theme_居住情况, topic = topic_个人特征与生活
Processed: theme_1 = theme_日常活动, topic = topic_个人特征与生活
Processed: theme_1 = theme_饮食, topic = topic_饮食与健康
Processed: theme_1 = theme_卫生与健康, topic = topic_饮食与健康
Processed: theme_1 = theme_日常活动, topic = topic_个人特征与生活
Processed: theme_1 = theme_日常活动, topic = topic_家庭与关系
Processed: theme_1 = theme_教育, topic = topic_个人特征与生活
Processed: theme_1 = theme_购物, topic = topic_物品与消费
Processed: theme_1 = theme_旅游与交通, topic = topic_出行与旅游
Processed: theme_1 = theme_求助, topic = topic_物品与消费
Processed: theme_1 = theme_求助, topic = topic_物品与消费
Processed: theme_1 = theme_动物, topic = topic_动物与自然
Processed: theme_1 = theme_求助, topic = topic_物品与消费
Processed: theme_1 = theme_经历, topic = topic_出行与旅游
Proce

  adj_matrix = nx.adjacency_matrix(G, nodelist=nodes).todense()


### 调用

In [None]:
import torch
import numpy as np
import os
import random
import pandas as pd
import scipy.sparse as sp
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder
from node2vec import Node2Vec
import networkx as nx
import matplotlib
matplotlib.use('Agg')

def seed_everything(seed=2023):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

seed_everything()

def load_cora_data(data_path='/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data'):
    content_df = pd.read_csv(os.path.join(data_path, "cora.content"), delimiter="\t", header=None)
    content_df.set_index(0, inplace=True)
    index = content_df.index.tolist()

    features = np.array([list(map(float, row[0].split())) for row in content_df.iloc[:, :-1].values])
    features = sp.csr_matrix(features, dtype=np.float32)

    labels = content_df.values[:, -1]
    class_encoder = LabelEncoder()
    labels = class_encoder.fit_transform(labels)

    cites_df = pd.read_csv(os.path.join(data_path, "cora.cites"), delimiter="\t", header=None)
    cites_df[0] = cites_df[0].astype(str)
    cites_df[1] = cites_df[1].astype(str)
    cites = [tuple(x) for x in cites_df.values]
    edges = [(index.index(cite[0]), index.index(cite[1])) for cite in cites]
    edges = np.array(edges).T

    data = Data(x=torch.from_numpy(np.array(features.todense())),
                edge_index=torch.LongTensor(edges),
                y=torch.from_numpy(labels))

    idx_train = range(400)
    idx_val = range(400, 500)
    idx_test = range(500, 561)

    def index_to_mask(index, size):
        mask = np.zeros(size, dtype=bool)
        mask[index] = True
        return mask

    data.train_mask = index_to_mask(idx_train, size=labels.shape[0])
    data.val_mask = index_to_mask(idx_val, size=labels.shape[0])
    data.test_mask = index_to_mask(idx_test, size=labels.shape[0])

    def to_networkx(data):
        edge_index = data.edge_index.to(torch.device('cpu')).numpy()
        G = nx.DiGraph()
        for src, tar in edge_index.T:
            G.add_edge(src, tar)
        return G

    networkx_data = to_networkx(data)

    return data, networkx_data

pyg_data, networkx_data = load_cora_data()


def Node2Vec_run(networkx_data, dimensions=64, walk_length=30, num_walks=200, model_save_path="node2vec2.model"):
    node2vec = Node2Vec(networkx_data, dimensions=dimensions, walk_length=walk_length, num_walks=num_walks, workers=4)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    nodes = model.wv.index_to_key
    embeddings = model.wv[nodes]
    
    # 保存模型
    model.save(model_save_path)
    
    return model, nodes, embeddings
_, _, node2vec_embeddings = Node2Vec_run(networkx_data, num_walks=200, model_save_path="/mnt/new_pfs/liming_team/auroraX/chenlong/Graph-Networks/topic_data/node2vec2.model")
print("node2vec_embeddings:", np.array(node2vec_embeddings).shape)


