In [163]:
# Create the target folder; exist_ok=True makes this a no-op if it already exists.
import os
os.makedirs("dataset", exist_ok=True)

# Download the gzipped cit-Patents edge list from SNAP into dataset/.
!wget -P dataset https://snap.stanford.edu/data/cit-Patents.txt.gz

# Decompress in place; -k keeps the original .gz next to the extracted .txt.
!gunzip -k dataset/cit-Patents.txt.gz


--2024-12-14 20:04:52--  https://snap.stanford.edu/data/cit-Patents.txt.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 85139832 (81M) [application/x-gzip]
Saving to: ‘dataset/cit-Patents.txt.gz’


2024-12-14 20:05:33 (2,01 MB/s) - ‘dataset/cit-Patents.txt.gz’ saved [85139832/85139832]



In [164]:
def read_graph(file_path):
    """
    Load a directed graph from a whitespace-separated edge-list text file.

    Each data line holds two integers, ``source target``. Lines that are
    blank or start with ``#`` (after stripping surrounding whitespace) are
    ignored. Repeated edges are stored only once.

    Parameters:
    - file_path: str, path of the edge-list file.

    Returns:
    - graph: dict mapping every node seen in the file to the list of its
      out-neighbours, in order of first appearance. Nodes that only occur
      as targets are present with an empty list.

    Raises:
    - ValueError: if a data line does not consist of exactly two integers.
    """
    # Out-neighbours are first collected as insertion-ordered dict keys so a
    # duplicate edge is detected in O(1) instead of scanning a list; the
    # dicts are turned into plain lists before returning.
    graph = {}
    all_nodes = set()  # every node that appears as a source or a target
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            # Skip blank lines (they would break the two-value unpack below)
            # and comment lines.
            if not line or line.startswith('#'):
                continue

            source, target = map(int, line.split())
            all_nodes.update([source, target])
            graph.setdefault(source, {})[target] = None

    # Replace values in place (keys are unchanged, so iterating is safe);
    # this avoids holding a second full copy of the adjacency structure.
    for source in graph:
        graph[source] = list(graph[source])

    # Nodes without outgoing edges still get an (empty) adjacency list.
    for node in all_nodes:
        if node not in graph:
            graph[node] = []

    return graph

def print_graph_info(graph):
    """
    Print the size of an adjacency-list graph.

    Parameters:
    - graph: dict mapping each node to the list of its out-neighbours.

    Prints two lines: the number of keys in ``graph`` and the total length
    of all adjacency lists (i.e. the number of stored directed edges).
    """
    edge_total = 0
    for out_neighbors in graph.values():
        edge_total += len(out_neighbors)
    node_total = len(graph)

    print(f"Number of nodes: {node_total}")
    print(f"Number of edges: {edge_total}")


# Edge list to analyse. Other SNAP edge lists previously tried here (not
# downloaded by this notebook): dataset/email-Enron.txt,
# dataset/amazon0601.txt, dataset/web-Google.txt.
file_path = 'dataset/cit-Patents.txt'
graph = read_graph(file_path)

# Report the node and edge counts of the loaded graph.
print_graph_info(graph)


Number of nodes: 3774768
Number of edges: 16518948


In [165]:
# def degree_discount_ic(graph, k, p):
#     """
#     使用度折扣启发式算法选择种子节点以最大化影响力传播。

#     参数:
#     - graph: dict，表示图的邻接列表，key 是节点，value 是该节点的邻居列表。
#     - k: int，要选择的种子节点数量。
#     - p: float，传播概率。

#     返回:
#     - seeds: list，包含选出的种子节点。
#     """
#     # 初始化
#     degree = {v: len(neighbors) for v, neighbors in graph.items()}  # 每个节点的度
#     t = {v: 0 for v in graph}  # 每个节点已选为种子的邻居数
#     dd = degree.copy()  # 初始化度折扣值
#     seeds = []  # 存放种子节点

#     for _ in range(k):
#         # 选择度折扣值最大的节点作为新种子
#         u = max(dd, key=dd.get)
#         seeds.append(u)
#         print(f"选出的种子节点: {u}")  # 打印每次选出的种子节点

#         # 更新新种子节点的邻居的度折扣值
#         for v in graph[u]:
#             if v not in seeds:  # 如果邻居节点尚未被选为种子
#                 t[v] += 1  # 增加该节点的已选种子邻居数
#                 dd[v] = degree[v] - 2 * t[v] - (degree[v] - t[v]) * t[v] * p  # 更新度折扣值

#         # 移除已经选为种子的节点的度折扣值
#         dd.pop(u)

#     return seeds

# # 参数
# k = 50  
# p = 0.01  # 传播概率

# seeds = degree_discount_ic(graph, k, p)
# # print("选择的种子节点:", seeds)


In [166]:
import heapq
import time
def degree_discount_ic_heap(graph, k, p):
    """
    Select seed nodes with the degree-discount heuristic, using a heap with
    lazy invalidation to find the node with the largest discounted degree.

    Parameters:
    - graph: dict, adjacency list; key is a node, value is its neighbour
      list. Every neighbour must also be a key of ``graph``. A node's degree
      is taken as ``len(graph[node])``.
    - k: int, number of seed nodes requested.
    - p: float, propagation probability used in the discount formula.

    Returns:
    - seeds: list of distinct selected nodes in selection order. It holds
      fewer than ``k`` nodes when the graph has fewer than ``k`` nodes, and
      is empty when ``k <= 0``.

    Side effects:
    - Prints one line per selected seed.
    """
    degree = {v: len(neighbors) for v, neighbors in graph.items()}
    t = {v: 0 for v in graph}  # number of already-selected seeds pointing at v
    dd = degree.copy()  # current discounted degree of every node
    seeds = []  # result, in selection order
    selected = set()  # same nodes as `seeds`, for O(1) membership tests

    # heapq is a min-heap, so scores are stored negated. Entries are never
    # updated in place: a changed score is pushed as a new entry and the old
    # one is recognised as stale when popped.
    heap = [(-val, v) for v, val in dd.items()]
    heapq.heapify(heap)

    # Stop when enough seeds are found or no candidates remain (k larger
    # than the number of nodes) instead of popping from an empty heap.
    while heap and len(seeds) < k:
        neg_score, u = heapq.heappop(heap)
        # Skip entries of nodes that are already seeds (two updates can
        # produce the same score, leaving an equal duplicate entry behind)
        # and entries whose score is no longer the node's current one.
        if u in selected or neg_score != -dd[u]:
            continue

        seeds.append(u)
        selected.add(u)
        print(f"选出的种子节点: {u}") 

        # Discount every not-yet-selected neighbour of the new seed.
        for v in graph[u]:
            if v in selected:
                continue
            t[v] += 1
            dd[v] = degree[v] - 2 * t[v] - (degree[v] - t[v]) * t[v] * p
            heapq.heappush(heap, (-dd[v], v))

    return seeds
# Parameters for the IC degree-discount seed selection.
k = 50  
p = 0.01  # propagation probability
start_time = time.time()  # wall-clock time just before seed selection
seeds1 = degree_discount_ic_heap(graph, k, p)
end_time = time.time()  # wall-clock time just after; elapsed seconds = end_time - start_time

选出的种子节点: 5795784
选出的种子节点: 5887243
选出的种子节点: 5856194
选出的种子节点: 5855655
选出的种子节点: 5891229
选出的种子节点: 5908495
选出的种子节点: 5858586
选出的种子节点: 6008268
选出的种子节点: 5885337
选出的种子节点: 5837429
选出的种子节点: 5786132
选出的种子节点: 5782963
选出的种子节点: 5709955
选出的种子节点: 5773182
选出的种子节点: 5865471
选出的种子节点: 5733693
选出的种子节点: 5681380
选出的种子节点: 5721287
选出的种子节点: 5901425
选出的种子节点: 5817744
选出的种子节点: 5700850
选出的种子节点: 5618907
选出的种子节点: 5739256
选出的种子节点: 5587105
选出的种子节点: 5602226
选出的种子节点: 5573854
选出的种子节点: 5554769
选出的种子节点: 5969079
选出的种子节点: 5994152
选出的种子节点: 5705574
选出的种子节点: 5683843
选出的种子节点: 5643701
选出的种子节点: 5616443
选出的种子节点: 5747550
选出的种子节点: 5739175
选出的种子节点: 5798015
选出的种子节点: 5811199
选出的种子节点: 5849411
选出的种子节点: 5714566
选出的种子节点: 5756597
选出的种子节点: 5643356
选出的种子节点: 5645964
选出的种子节点: 5610317
选出的种子节点: 5989237
选出的种子节点: 5998220
选出的种子节点: 5830548
选出的种子节点: 5957898
选出的种子节点: 5877028
选出的种子节点: 5738921
选出的种子节点: 5797877


In [167]:
import heapq
import random

def linear_threshold_degree_discount(graph, k, seed=None):
    """
    Degree-discount style seed selection for the Linear Threshold (LT) model.

    Parameters:
    - graph: dict, adjacency list where key is a node and value is a list of
      its neighbors. Every neighbor must also be a key of ``graph``.
    - k: int, number of seed nodes requested.
    - seed: optional seed for the random node thresholds. ``None`` (default)
      draws from the global ``random`` module state; any other value makes
      the thresholds, and therefore the result, reproducible.

    Returns:
    - seeds: list of distinct selected nodes in selection order. It holds
      fewer than ``k`` nodes when the graph has fewer than ``k`` nodes.
    """
    rng = random if seed is None else random.Random(seed)

    # Every node gets an activation threshold drawn uniformly from [0, 1].
    thresholds = {v: rng.uniform(0, 1) for v in graph}

    influence_received = {v: 0 for v in graph}  # accumulated influence from seeds
    degree = {v: len(graph[v]) for v in graph}  # len of each adjacency list
    dd = degree.copy()  # current discounted degree of every node
    seeds = []  # result, in selection order
    selected = set()  # same nodes as `seeds`, for O(1) membership tests

    # heapq is a min-heap, so scores are stored negated; changed scores are
    # pushed as new entries and outdated ones are discarded when popped.
    heap = [(-val, v) for v, val in dd.items()]
    heapq.heapify(heap)

    # Stop when enough seeds are found or the candidates are exhausted,
    # instead of popping from an empty heap.
    while heap and len(seeds) < k:
        neg_score, u = heapq.heappop(heap)
        # Ignore entries of already-selected nodes and stale scores.
        if u in selected or neg_score != -dd[u]:
            continue

        seeds.append(u)
        selected.add(u)

        for neighbor in graph[u]:
            if neighbor in selected:
                continue

            # The new seed spreads a total weight of 1 evenly over its
            # neighbor list (degree[u] >= 1 here, since u has a neighbor).
            influence_received[neighbor] += 1 / degree[u]

            # NOTE(review): a neighbor that reached its threshold is treated
            # as activated and its score is left untouched, so its existing
            # heap entry stays valid and it can still be chosen as a seed
            # later. Confirm whether activated nodes should be excluded.
            if influence_received[neighbor] >= thresholds[neighbor]:
                continue

            dd[neighbor] = degree[neighbor] - influence_received[neighbor]
            heapq.heappush(heap, (-dd[neighbor], neighbor))

    return seeds


    
# Select seeds with the LT-style heuristic; node thresholds are drawn at
# random inside the call.
k = 50  # Number of seeds to select
seeds2 = linear_threshold_degree_discount(graph, k)
print("Selected seed nodes:", seeds2)


Selected seed nodes: [5795784, 5887243, 5856194, 5855655, 5891229, 5908495, 5858586, 6008268, 5885337, 5837429, 5786132, 5782963, 5709955, 5773182, 5865471, 5733693, 5681380, 5721287, 5901425, 5817744, 5700850, 5618907, 5739256, 5587105, 5602226, 5573854, 5554769, 5969079, 5994152, 5705574, 5683843, 5643701, 5616443, 5747550, 5739175, 5798015, 5811199, 5849411, 5714566, 5756597, 5610317, 5643356, 5645964, 5989237, 5998220, 5830548, 5957898, 5877028, 5738921, 5797877]


In [168]:
import random

def influence_spread(graph, seeds, p):
    """
    Run one random cascade from ``seeds`` and report how many nodes end up
    active.

    Parameters:
    - graph: dict, adjacency list; every reachable node must be a key.
    - seeds: list, nodes that are active at the start.
    - p: float, probability that an active node activates a given inactive
      neighbor.

    Returns:
    - int: number of active nodes when the cascade stops (seeds included).
    """
    activated = set(seeds)  # every node activated so far
    frontier = set(seeds)   # nodes activated in the previous round

    while len(frontier) > 0:
        upcoming = set()
        for current in frontier:
            for candidate in graph[current]:
                # Already-active nodes are skipped without drawing a number.
                if candidate in activated:
                    continue
                if random.random() < p:
                    activated.add(candidate)
                    upcoming.add(candidate)
        # Only nodes activated this round get to spread in the next one.
        frontier = upcoming

    return len(activated)

In [169]:
# Propagation probability for the simulation. Note: this rebinds the global
# p, which was 0.01 when seeds1 was selected.
p = 0.2  # propagation probability



# Run a single influence_spread simulation from seeds1 and report its size.
spread = influence_spread(graph, seeds1, p)
print("最终传播影响范围的节点数:", spread)


最终传播影响范围的节点数: 9057


In [170]:
# Propagation probability for the simulation.
p = 0.2  # propagation probability



# Run a single influence_spread simulation from seeds2 and report its size.
spread = influence_spread(graph, seeds2, p)
print("最终传播影响范围的节点数:", spread)

最终传播影响范围的节点数: 9176
