# 影响力最大化实验教程

本教程将引导您完成整个影响力最大化实验流程，从网络生成到参数学习再到影响力最大化。

## 目录
1. 环境设置
2. 生成网络和级联数据
3. 训练参数学习模型
4. 运行影响力最大化算法
5. 结果对比分析
6. 可视化

## 1. 环境设置

In [None]:
import sys
import os

# Put the parent of the current working directory first on the import path
# so that the `src` package used by the imports below can be resolved.
sys.path.insert(0, os.path.abspath(os.pardir))

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

from src.data import NetworkGenerator, CascadeGenerator, DataSplitter
from src.models import GraphEmbedding, ParameterLearner
from src.diffusion import DiffusionSimulator
from src.influence_max import LazyGreedyIM, TIM, IMM
from src.influence_max.heuristics import DegreeHeuristic, PageRankHeuristic
from src.utils import plot_training_history, plot_influence_comparison
from src.utils.network_viz import NetworkVisualizer, CascadeAnimator

# Plot styling
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

# The figures in this notebook use Chinese titles and axis labels, which
# Matplotlib's default font cannot render (they show up as empty boxes).
# Put common CJK-capable fonts ahead of the current sans-serif candidates.
# This has to run after sns.set_style(), because the seaborn style also
# writes font.sans-serif.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Heiti TC',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'Arial Unicode MS',
] + list(plt.rcParams['font.sans-serif'])
# Many CJK fonts lack the Unicode minus sign; fall back to the ASCII hyphen.
plt.rcParams['axes.unicode_minus'] = False

# Fix the NumPy random seed so the notebook is reproducible
SEED = 42
np.random.seed(SEED)

print("✓ 环境设置完成！")

## 2. 生成网络和级联数据

我们首先生成一个 BA（Barabási–Albert）无标度网络，为每条边分配传播概率，然后模拟 IC（Independent Cascade，独立级联）模型的级联传播。

In [None]:
# Build the BA graph with a seeded generator
gen = NetworkGenerator(seed=SEED)
G = gen.generate_ba(n=300, m=3)

# Summarise graph size and mean degree
node_degrees = [deg for _, deg in G.degree()]
print("网络统计:")
print(f"  节点数: {G.number_of_nodes()}")
print(f"  边数: {G.number_of_edges()}")
print(f"  平均度: {np.mean(node_degrees):.2f}")

# Assign IC probabilities drawn from prob_range
G = gen.assign_ic_probabilities(G, prob_range=(0.01, 0.1))

print("\n✓ 网络生成完成！")

### 可视化网络

In [None]:
# Visualizer bound to the generated graph; its layout (viz.pos) is reused
# by the seed-node and cascade plots in section 6.
viz = NetworkVisualizer(G, figsize=(10, 8))

# Draw the graph without node labels
viz.plot_network(
    show_labels=False,
    title="BA 无标度网络",
)

# Plot the degree distribution
viz.plot_degree_distribution()

### 生成级联数据

In [None]:
# Simulate 500 cascades on G with the 'ic' model
cascade_gen = CascadeGenerator(G, seed=SEED)
cascades = cascade_gen.generate_cascades('ic', num_cascades=500)

# Size statistics
cascade_sizes = [cascade['cascade_size'] for cascade in cascades]
print("级联统计:")
print(f"  总数: {len(cascades)}")
print(f"  平均大小: {np.mean(cascade_sizes):.2f}")
print(f"  最大大小: {max(cascade_sizes)}")
print(f"  最小大小: {min(cascade_sizes)}")

# Histogram of cascade sizes (object-oriented Matplotlib API)
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(cascade_sizes, bins=30, alpha=0.7, edgecolor='black')
ax.set_xlabel('级联大小', fontsize=12)
ax.set_ylabel('频率', fontsize=12)
ax.set_title('级联大小分布', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
plt.show()

print("\n✓ 级联生成完成！")

## 3. 训练参数学习模型

使用生成的级联数据训练 PyTorch 模型来学习边传播概率。

In [None]:
# Turn the simulated cascades into parallel lists of edges and labels
edges, labels = cascade_gen.cascades_to_training_data(cascades)

print("训练数据:")
sample_count = len(edges)
print(f"  样本数: {sample_count}")
positive_ratio = np.mean(labels)
print(f"  正样本比例: {positive_ratio:.3f}")

In [None]:
# Split the samples into train / validation / test subsets
splitter = DataSplitter(seed=SEED)
(
    train_edges, train_labels,
    val_edges, val_labels,
    test_edges, test_labels,
) = splitter.split_edges(edges, labels)

print("数据划分:")
print(f"  训练集: {len(train_edges)}")
print(f"  验证集: {len(val_edges)}")
print(f"  测试集: {len(test_edges)}")

In [None]:
# Train Node2Vec embeddings on G
print("生成 Node2Vec 嵌入...")
embedding_gen = GraphEmbedding(G, embedding_dim=64, seed=SEED)
embeddings = embedding_gen.train_node2vec(num_walks=10, walk_length=40, workers=4)


def _edge_feature_matrix(edge_list):
    """Stack the feature vector of every edge in *edge_list* into one array."""
    return np.array([embedding_gen.get_edge_features(edge) for edge in edge_list])


# Build the feature matrices, then convert the labels to arrays
print("准备特征...")
train_features = _edge_feature_matrix(train_edges)
val_features = _edge_feature_matrix(val_edges)
test_features = _edge_feature_matrix(test_edges)
train_labels = np.array(train_labels)
val_labels = np.array(val_labels)
test_labels = np.array(test_labels)

print(f"特征维度: {train_features.shape[1]}")
print("\n✓ 特征准备完成！")

In [None]:
# Train the parameter-learning model
print("训练参数学习模型...")

# Model hyper-parameters; the input width follows the feature matrix
model_config = dict(
    input_dim=train_features.shape[1],
    hidden_dims=[128, 64],
    dropout=0.3,
    learning_rate=0.001,
    device='cpu',  # switch to 'cuda' when a GPU is available
    seed=SEED,
)
learner = ParameterLearner(**model_config)

# Options passed to fit() alongside the train and validation data
fit_options = dict(
    epochs=50,
    batch_size=128,
    early_stopping_patience=10,
    verbose=True,
)
history = learner.fit(
    train_features, train_labels,
    val_features, val_labels,
    **fit_options,
)

print("\n✓ 模型训练完成！")

In [None]:
# Plot the training history returned by learner.fit()
plot_training_history(history)

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report

# Score the held-out test features
test_predictions = learner.predict(test_features)

# Binarise: a label counts as positive when > 0, a prediction when > 0.5
binary_labels = (test_labels > 0).astype(int)
binary_preds = (test_predictions > 0.5).astype(int)

print("测试集性能:")
test_auc = roc_auc_score(binary_labels, test_predictions)
print(f"  AUC: {test_auc:.4f}")
test_accuracy = accuracy_score(binary_labels, binary_preds)
print(f"  准确率: {test_accuracy:.4f}")
print("\n分类报告:")
report_text = classification_report(binary_labels, binary_preds)
print(report_text)

### 创建学习参数的图

In [None]:
# Work on a copy so the graph with the true probabilities stays untouched
G_learned = G.copy()
all_edges = list(G_learned.edges())

# Predict one probability per edge from its feature vector
edge_features = np.array(
    [embedding_gen.get_edge_features(edge) for edge in all_edges]
)
learned_probs = learner.predict(edge_features)

# Store each prediction in the 'prob' attribute of its edge
for (u, v), prob in zip(all_edges, learned_probs):
    G_learned[u][v]['prob'] = float(prob)

print("✓ 学习参数的图创建完成！")

## 4. 运行影响力最大化算法

使用真实参数和学习参数分别运行多种影响力最大化算法。为保证可比性，学习参数下选出的种子节点同样在真实参数的图上评估影响力。

In [None]:
# Experiment settings shared by the cells below
K = 5  # number of seed nodes to select
NUM_SIMULATIONS = 500  # number of Monte Carlo simulations

# Collected results: method name -> {'seeds', 'influence', 'time'}
results = {}

### 使用真实参数

In [None]:
print("=" * 60)
print("使用真实参数")
print("=" * 60)

# Simulator on the graph with the true probabilities; every seed set
# selected in this cell is scored with it.
sim_true = DiffusionSimulator(G, model='ic', seed=SEED)


def _record_true(label, seeds, elapsed):
    """Estimate the influence of *seeds* with sim_true, store it under *label* and print it."""
    influence = sim_true.estimate_influence(seeds, NUM_SIMULATIONS)
    results[label] = {'seeds': seeds, 'influence': influence, 'time': elapsed}
    print(f"  种子: {seeds}")
    print(f"  影响力: {influence:.2f}")
    print(f"  时间: {elapsed:.2f}s")
    return influence


# Lazy Greedy
print("\n[1] Lazy Greedy...")
greedy = LazyGreedyIM(G, sim_true, seed=SEED)
seeds_greedy, gains, time_greedy = greedy.select_seeds(K, NUM_SIMULATIONS, verbose=False)
inf_greedy = _record_true('True-LazyGreedy', seeds_greedy, time_greedy)

# TIM
print("\n[2] TIM...")
tim = TIM(G, model='ic', seed=SEED)
seeds_tim, _, time_tim = tim.select_seeds(K, epsilon=0.5, verbose=False)
inf_tim = _record_true('True-TIM', seeds_tim, time_tim)

# Degree heuristic
print("\n[3] Degree Heuristic...")
degree_h = DegreeHeuristic(G, seed=SEED)
seeds_degree, time_degree = degree_h.select_seeds(K)
inf_degree = _record_true('True-Degree', seeds_degree, time_degree)

### 使用学习参数

In [None]:
print("\n" + "=" * 60)
print("使用学习参数")
print("=" * 60)

# Simulator on the graph carrying the learned probabilities; used for seed selection only
sim_learned = DiffusionSimulator(G_learned, model='ic', seed=SEED)


def _record_learned(label, seeds, elapsed):
    """Score *seeds* with sim_true (the true parameters), store the result under *label* and print it."""
    influence = sim_true.estimate_influence(seeds, NUM_SIMULATIONS)
    results[label] = {'seeds': seeds, 'influence': influence, 'time': elapsed}
    print(f"  种子: {seeds}")
    print(f"  影响力(真实): {influence:.2f}")
    print(f"  时间: {elapsed:.2f}s")
    return influence


# Lazy Greedy
print("\n[1] Lazy Greedy...")
greedy_l = LazyGreedyIM(G_learned, sim_learned, seed=SEED)
seeds_greedy_l, gains_l, time_greedy_l = greedy_l.select_seeds(K, NUM_SIMULATIONS, verbose=False)
inf_greedy_l = _record_learned('Learned-LazyGreedy', seeds_greedy_l, time_greedy_l)

# TIM
print("\n[2] TIM...")
tim_l = TIM(G_learned, model='ic', seed=SEED)
seeds_tim_l, _, time_tim_l = tim_l.select_seeds(K, epsilon=0.5, verbose=False)
inf_tim_l = _record_learned('Learned-TIM', seeds_tim_l, time_tim_l)

## 5. 结果对比分析

In [None]:
from src.utils import plot_runtime_comparison

# Influence comparison: one single-element list per method
plot_data = {method: [outcome['influence']] for method, outcome in results.items()}
plot_influence_comparison(plot_data)

# Runtime comparison: one time value per method
runtime_data = {method: outcome['time'] for method, outcome in results.items()}
plot_runtime_comparison(runtime_data)

In [None]:
# Compare the Lazy Greedy seed sets chosen under true vs. learned parameters
true_seeds = set(results['True-LazyGreedy']['seeds'])
learned_seeds = set(results['Learned-LazyGreedy']['seeds'])

shared_seeds = true_seeds & learned_seeds
combined_seeds = true_seeds | learned_seeds

# Jaccard coefficient = |intersection| / |union|
overlap = len(shared_seeds)
jaccard = overlap / len(combined_seeds)

print("种子节点重叠分析 (LazyGreedy):")
print(f"  真实参数种子: {sorted(true_seeds)}")
print(f"  学习参数种子: {sorted(learned_seeds)}")
print(f"  重叠数: {overlap}/{K}")
print(f"  Jaccard 系数: {jaccard:.3f}")

## 6. 可视化

可视化选出的种子节点和传播过程。

In [None]:
# Highlight the Lazy Greedy seeds (true parameters) on the network
viz_seeds = NetworkVisualizer(G, figsize=(10, 8))
# Reuse the layout of the first visualizer so both figures line up
viz_seeds.pos = viz.pos
viz_seeds.plot_network(
    title="选出的种子节点 (Lazy Greedy - 真实参数)",
    highlighted_nodes=results['True-LazyGreedy']['seeds'],
    show_labels=True,
)

In [None]:
# Run a single diffusion from the Lazy Greedy seeds chosen under the true parameters
seeds = results['True-LazyGreedy']['seeds']
# Use a dedicated name for the edges of this simulated cascade. Rebinding the
# notebook-global `edges` here would overwrite the training edge list built in
# section 3, so re-running the data-split cell afterwards would silently
# split the wrong data against the old labels.
activated, activated_edges = sim_true.model.simulate_single(seeds)

print("传播结果:")
print(f"  种子节点: {seeds}")
print(f"  激活节点数: {len(activated)}")
print(f"  激活边数: {len(activated_edges)}")

# Render snapshots of the cascade on the same layout as the network plot
animator = CascadeAnimator(G, pos=viz.pos)
animator.create_cascade_snapshots(
    cascade_edges=activated_edges,
    initial_nodes=seeds,
    num_snapshots=4
)

## 总结

在本教程中，我们完成了：

1. ✅ 生成了一个 BA 无标度网络
2. ✅ 模拟了 IC 模型的级联传播
3. ✅ 使用 PyTorch 训练了参数学习模型
4. ✅ 运行了多种影响力最大化算法
5. ✅ 对比了真实参数和学习参数的效果
6. ✅ 可视化了网络和传播过程

### 关键发现

- 参数学习模型达到了较好的 AUC (通常 > 0.8)
- 学习参数下选出的种子节点与真实参数下选出的种子节点有一定重叠
- Lazy Greedy 和 TIM 算法在影响力上接近，但 TIM 更快
- 启发式方法（Degree）速度快但影响力较低

### 扩展练习

1. 尝试不同的网络类型（ER, WS）
2. 调整参数学习模型的超参数
3. 测试 IMM 算法（已在环境设置中导入，但本教程尚未使用）
4. 在真实数据集上运行实验
5. 使用结构特征增强模型