In [11]:
from xmark import XMark_benchmark
import networkx as nx
import numpy as np
import random

In [12]:
def homophily(graph, label, rng=None):
    """Measure label homophily on edges against a random-pairing baseline.

    Parameters
    ----------
    graph : networkx-style graph
        Must support ``graph.edges()``, ``graph.nodes()`` and
        ``graph.nodes[n][label]``.
    label : hashable
        Name of the node attribute whose values are compared.
    rng : object with a ``choice`` method, optional
        Source of randomness for the baseline, e.g. ``random.Random(seed)``
        for a reproducible estimate. Defaults to the global ``random``
        module.

    Returns
    -------
    (obs, exp) : tuple of float
        ``obs`` is the fraction of edges whose two endpoints carry the same
        ``label`` value. ``exp`` is the fraction of nodes that share their
        ``label`` value with one uniformly drawn node (the draw may be the
        node itself); it is a sampled estimate, so it varies between calls
        unless ``rng`` is seeded.

    Raises
    ------
    ZeroDivisionError
        If the graph has no edges (which includes the empty graph).
    """
    if rng is None:
        rng = random

    node_attrs = graph.nodes
    same_on_edges = sum(
        1 for u, v in graph.edges()
        if node_attrs[u][label] == node_attrs[v][label]
    )
    obs = same_on_edges / len(graph.edges())

    # Build the candidate list once: rebuilding it for every draw made the
    # baseline loop quadratic in the number of nodes.
    candidates = list(graph.nodes())
    same_on_draws = 0
    for u in candidates:
        v = rng.choice(candidates)
        if node_attrs[u][label] == node_attrs[v][label]:
            same_on_draws += 1

    exp = same_on_draws / len(candidates)

    return obs, exp

def sse(coms, graph, label):
    """Sum of squared errors of a numeric node attribute within communities.

    For every community the attribute values of its members are compared to
    the community mean (the centroid), and the squared deviations are summed
    over all communities.

    Parameters
    ----------
    coms : iterable of iterables
        Each inner iterable lists the node IDs of one community.
    graph : networkx-style graph
        Must support ``graph.nodes[n][label]``.
    label : hashable
        Name of the node attribute; its values are expected to be real
        numbers.

    Returns
    -------
    float
        Total within-community sum of squared deviations. Empty communities
        contribute nothing, and an empty ``coms`` gives ``0.0``.
    """
    total = 0.0
    for members in coms:
        values = np.asarray(
            [graph.nodes[node][label] for node in members], dtype=float
        )
        if values.size == 0:
            # No members means no deviation; skipping also avoids numpy's
            # "mean of empty slice" warning.
            continue
        deviations = values - values.mean()
        total += float(np.sum(deviations ** 2))
    return total

### Categorical attributes

In [8]:
# Settings handed to XMark_benchmark for the categorical run.
N = 2000
gamma = 3
beta = 2
# One categorical attribute is generated per entry;
# 'auto' means: number of labels equal to number of communities.
m_cat = [2, 5, 'auto']
theta = 0.3      # passed to the generator as `noise`
mu = 0.3
avg_k = 10       # passed as `average_degree`
min_com = 20     # passed as `min_community`

g = XMark_benchmark(
    N, gamma, beta, mu,
    labels=m_cat,
    noise=theta,
    average_degree=avg_k,
    min_community=min_com,
    type_attr="categorical",
)

# For every generated attribute, report the observed and sampled-baseline
# homophily next to networkx's attribute assortativity coefficient.
for i, m in enumerate(m_cat):
    attr_name = f"label_{i}"
    print(attr_name, f"m={m}")
    obs, exp = homophily(g, attr_name)
    print("observed homophily: ", obs)
    print("expected homophily: ", exp)
    r = nx.attribute_assortativity_coefficient(g, attr_name)
    print("categorical newman's coefficient: ", r)
    print()

label_0 m=2
observed homophily:  0.6269156244495332
expected homophily:  0.479
categorical newman's coefficient:  0.2502573989696535

label_1 m=5
observed homophily:  0.43297516293817156
expected homophily:  0.238
categorical newman's coefficient:  0.25948473986330645

label_2 m=auto
observed homophily:  0.307556808173331
expected homophily:  0.0685
categorical newman's coefficient:  0.25554249697182035



In [9]:
# Print the set of distinct values each categorical attribute takes on `g`.
for i, m in enumerate(m_cat):
    dom = set(nx.get_node_attributes(g, f"label_{i}").values())
    print(dom)

{1, 2}
{1, 2, 3, 4, 5}
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22}


### Continuous attributes

In [None]:
# Settings handed to XMark_benchmark for the continuous run.
N = 2000
gamma = 3
beta = 2
# Studied in the paper: number of labels equal to number of communities.
#m_cont = ["auto", "auto"]
# Number of peaks (modes) per continuous attribute: [2, 5] generates two
# continuous attributes, the first with 2 peaks and the second with 5.
# For the second one the generator draws label_1 values from a mixture of
# 5 normal distributions with different means.
m_cont = [2, 5]
# Standard deviation: controls how dispersed (noisy) the attribute values are
# inside each community; the lower the value, the cleaner the attribute.
sigma = 0.5
mu = 0.3       # mixing parameter (controls how hard the topology is)
avg_k = 10     # average degree
min_com = 20   # minimum community size

g = XMark_benchmark(
    N, gamma, beta, mu,
    labels=m_cont,
    std=sigma,
    average_degree=avg_k,
    min_community=min_com,
    type_attr="continuous",
)

# Every node carries the member set of its community; de-duplicating those
# sets recovers the partition, stored as a list of node lists.
coms = [
    list(members)
    for members in {frozenset(g.nodes[v]['community']) for v in g}
]

val_sse1 = sse(coms, g, 'label_0')
val_sse2 = sse(coms, g, 'label_1')
print("SSE 1: ", val_sse1)
print("SSE 2: ", val_sse2)

SSE 1:  511.8815947368363
SSE 2:  484.3988124362103


In [None]:
import pandas as pd
import networkx as nx

# Assumes the continuous generation cell has already run, so the graph `g`
# and the community list `coms` are defined.

print("--- 开始导出网络数据 ---")

# 1. Export the topology as an edge list.
# ----------------------------------
edge_list_path = "xmark_continuous_edge_list.csv"
# data=False writes node IDs only, without edge attributes
# (original note: X-Mark graphs are unweighted by default).
nx.write_edgelist(g, edge_list_path, delimiter=",", data=False)

print(f" 拓扑结构（边列表）已导出到: {edge_list_path}")


# 2. Export the node attributes together with the ground-truth partition.
# ----------------------------------
# The 'community' node attribute holds the whole member set of the node's
# community, not an identifier. Writing it directly would put a set repr in
# the CommunityID column, so each node is mapped to the zero-based index of
# its community within `coms` instead.
node_to_community_id = {
    node_id: community_index
    for community_index, members in enumerate(coms)
    for node_id in members
}

node_data = [
    {
        'NodeID': node_id,
        # Scalar ground-truth community ID.
        'CommunityID': node_to_community_id[node_id],
        # First continuous attribute (the m_cont entry 2).
        'Label_0': data['label_0'],
        # Second continuous attribute (the m_cont entry 5).
        'Label_1': data['label_1'],
    }
    for node_id, data in g.nodes(data=True)
]

# Convert to a DataFrame and write it out as CSV.
df_nodes = pd.DataFrame(node_data)
attributes_path = "xmark_continuous_node_attributes_and_truth.csv"
df_nodes.to_csv(attributes_path, index=False)

print(f" 节点属性和真实划分已导出到: {attributes_path}")

--- 开始导出网络数据 ---
✅ 拓扑结构（边列表）已导出到: xmark_continuous_edge_list.csv
✅ 节点属性和真实划分已导出到: xmark_continuous_node_attributes_and_truth.csv


In [15]:
# Inspect the raw 'community' value of the first 5 nodes.
for i, (node_id, data) in enumerate(g.nodes(data=True)):
    if i >= 5:
        break
    # Print the raw value to see whether it is a single number or a list/set.
    print(f"NodeID: {node_id}, Community Value Type: {type(data['community'])}, Value: {data['community']}")

NodeID: 0, Community Value Type: <class 'set'>, Value: {0, 1036, 146, 150, 1818, 1050, 810, 47, 943, 560, 1200, 57, 190, 1472, 1861, 1096, 456, 1352, 95, 121, 613, 1639, 118, 502, 505}
NodeID: 1, Community Value Type: <class 'set'>, Value: {1536, 1, 1926, 1691, 292, 171, 175, 1202, 1589, 54, 439, 950, 1855, 576, 968, 1865, 587, 1239, 856, 480, 1120, 1634, 101, 108, 1005}
NodeID: 2, Community Value Type: <class 'set'>, Value: {2, 1550, 1058, 34, 36, 40, 41, 1579, 559, 561, 1587, 1076, 51, 1079, 59, 1086, 1090, 591, 1617, 1107, 596, 1108, 1619, 1111, 1620, 601, 1626, 93, 1630, 94, 1128, 106, 618, 620, 1130, 1137, 1139, 117, 1146, 1658, 124, 127, 1153, 1154, 644, 1669, 1159, 136, 1162, 652, 654, 149, 151, 1176, 156, 669, 165, 1709, 179, 697, 701, 711, 1734, 716, 719, 1749, 749, 750, 1786, 1792, 1295, 788, 1815, 1816, 282, 800, 1827, 808, 1834, 318, 1350, 1356, 1358, 342, 867, 1902, 369, 1922, 928, 428, 1003}
NodeID: 3, Community Value Type: <class 'set'>, Value: {1664, 1537, 3, 1799, 1547

In [16]:
# Dump every attribute of the first 5 nodes, looking for an actual community label ID.
for i, (node_id, data) in enumerate(g.nodes(data=True)):
    if i >= 5:
        break
    print(f"NodeID: {node_id}, All Attributes: {data}")

NodeID: 0, All Attributes: {'label_0': 10.648366964335429, 'label_1': 29.0682943442907, 'community': {0, 1036, 146, 150, 1818, 1050, 810, 47, 943, 560, 1200, 57, 190, 1472, 1861, 1096, 456, 1352, 95, 121, 613, 1639, 118, 502, 505}}
NodeID: 1, All Attributes: {'label_0': 1.9667720268725155, 'label_1': 11.395291076597774, 'community': {1536, 1, 1926, 1691, 292, 171, 175, 1202, 1589, 54, 439, 950, 1855, 576, 968, 1865, 587, 1239, 856, 480, 1120, 1634, 101, 108, 1005}}
NodeID: 2, All Attributes: {'label_0': 0.9590790833524321, 'label_1': 11.305724888937258, 'community': {2, 1550, 1058, 34, 36, 40, 41, 1579, 559, 561, 1587, 1076, 51, 1079, 59, 1086, 1090, 591, 1617, 1107, 596, 1108, 1619, 1111, 1620, 601, 1626, 93, 1630, 94, 1128, 106, 618, 620, 1130, 1137, 1139, 117, 1146, 1658, 124, 127, 1153, 1154, 644, 1669, 1159, 136, 1162, 652, 654, 149, 151, 1176, 156, 669, 165, 1709, 179, 697, 701, 711, 1734, 716, 719, 1749, 749, 750, 1786, 1792, 1295, 788, 1815, 1816, 282, 800, 1827, 808, 1834, 318

In [17]:
# Goal: map every node ID to the index (0, 1, 2, ...) of the community it belongs to.
# Each entry of `coms` lists all members of one community (e.g. 0, 1036, 146, ...).
node_to_community_id = {
    node_id: community_index
    for community_index, node_list in enumerate(coms)
    for node_id in node_list
}

print(f"成功识别 {len(coms)} 个社区。")

# node_to_community_id now holds the zero-based ground-truth community ID of every node.

成功识别 32 个社区。


In [None]:
import pandas as pd
import networkx as nx

# Assumes the cells above have run, so `g` and `node_to_community_id` are defined.

# The key correction: CommunityID is a single integer looked up in the
# node-to-community mapping.
node_data = [
    {
        'NodeID': node_id,
        'CommunityID': node_to_community_id[node_id],
        'Label_0': data['label_0'],
        'Label_1': data['label_1'],
    }
    for node_id, data in g.nodes(data=True)
]

# Convert to a DataFrame and write it out as CSV.
attributes_path = "xmark_continuous_node_attributes_and_truth_FIXED.csv"
df_nodes = pd.DataFrame(node_data)
df_nodes.to_csv(attributes_path, index=False)

print(f" 修正后的节点属性和真实划分已导出到: {attributes_path}")

✅ 修正后的节点属性和真实划分已导出到: xmark_continuous_node_attributes_and_truth_FIXED.csv
