In [1]:
from collections import Counter
import sys
sys.path.append('..')
import config
import os

# Resolve the input and output directories from the project config and make
# sure both exist before anything is read or written.
data_path_prefix = config.data_path
neo4j_output_path = config.neo4j_path
os.makedirs(data_path_prefix, exist_ok=True)
os.makedirs(neo4j_output_path, exist_ok=True)

# Edge list: one tab-separated "node_a<TAB>node_b" pair per line.
edges_file = os.path.join(data_path_prefix, '第0层+第1层+第2层关系边.txt')

# Number of most frequent first-column nodes that are treated as level 0.
LEVEL_0_SIZE = 13

# Read the file once and keep the edges in memory, so the level sets below can
# be derived in separate passes. The encoding is given explicitly to match the
# later cells that read this same file as UTF-8.
edges = []
with open(edges_file, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) carry no edge.
            continue
        node_a, node_b = line.split('\t')
        edges.append((node_a, node_b))

# Count how often every node appears in the first column.
all_nodes = [node_a for node_a, _ in edges]
node_counts = Counter(all_nodes)

# Level 0: the LEVEL_0_SIZE nodes with the highest first-column frequency.
most_common_nodes = node_counts.most_common(LEVEL_0_SIZE)
level_0_nodes = {node for node, _ in most_common_nodes}

# Level 1: second-column nodes of edges whose first column is a level-0 node,
# excluding nodes that are themselves level 0.
level_1_nodes = {node_b for node_a, node_b in edges if node_a in level_0_nodes}
level_1_nodes.difference_update(level_0_nodes)

# Level 2: second-column nodes of edges whose first column is a level-1 node.
# This runs only after level 1 is complete, so the result does not depend on
# the order of lines in the file.
level_2_nodes = {node_b for node_a, node_b in edges if node_a in level_1_nodes}
level_2_nodes.difference_update(level_0_nodes)
level_2_nodes.difference_update(level_1_nodes)

# Report the size of each level.
print('第0层节点：', len(level_0_nodes))
print('第1层节点：', len(level_1_nodes))
print('第2层节点：', len(level_2_nodes))


第0层节点： 13
第1层节点： 46039
第2层节点： 195362


In [6]:
# Down-sample the level sets: keep at most 1000 level-1 nodes and at most
# 5000 level-2 nodes.
# Sort before slicing so the same nodes are selected on every run; slicing
# list(set) directly picks an arbitrary subset because the iteration order of
# a set of strings can change between interpreter sessions.
level_1_nodes = set(sorted(level_1_nodes)[:1000])
level_2_nodes = set(sorted(level_2_nodes)[:5000])

In [7]:
# Read the relation-point file and the repost-point file, merging their lines
# into a single de-duplicated set of node IDs.
import csv
points_file = os.path.join(data_path_prefix, '第0层+第1层+第2层关系点.txt')
repost_points_file = os.path.join(data_path_prefix, '第0层+第1层+第2层关系且转发点.txt')
all_points = set()

for points_path in (points_file, repost_points_file):
    with open(points_path, 'r', encoding='utf-8') as file:
        all_points.update(line.strip() for line in file)

# Map every kept node to its level. Levels are inserted from 2 down to 0 so
# that, should a node appear in more than one set, the lowest level wins
# (the same priority as checking level 0 first, then 1, then 2).
level_of = {}
for level, nodes in ((2, level_2_nodes), (1, level_1_nodes), (0, level_0_nodes)):
    level_of.update(dict.fromkeys(nodes, level))

# Keep only the points that belong to level 0, 1 or 2.
filtered_points = {point for point in all_points if point in level_of}

# Write users.csv. Rows are sorted by ID so the file is identical across runs
# instead of following set iteration order.
with open(os.path.join(neo4j_output_path, 'users.csv'), 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['USER_ID', 'LEVEL'])
    for point in sorted(filtered_points):
        writer.writerow([point, level_of[point]])

print("users.csv 文件已生成")

users.csv 文件已生成


In [8]:
import csv

# File paths: the tab-separated edge list and the node file written earlier.
edges_file = os.path.join(data_path_prefix, '第0层+第1层+第2层关系边.txt')
filtered_points_file = os.path.join(neo4j_output_path, 'users.csv')  # node file generated earlier

# Reload the kept node IDs (first column of users.csv).
# newline='' is what the csv module expects for files handed to csv.reader.
with open(filtered_points_file, 'r', newline='', encoding='utf-8') as infile:
    reader = csv.reader(infile)
    next(reader, None)  # skip the header; tolerate an empty file
    filtered_points = {row[0] for row in reader if row}

# Filter the edges and write relations.csv.
with open(edges_file, 'r', encoding='utf-8') as infile, open(os.path.join(neo4j_output_path, 'relations.csv'), 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['USER_1', 'relation', 'USER_2'])
    for line in infile:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) carry no edge.
            continue
        node_a, node_b = line.split('\t')
        # Keep an edge only when both endpoints are kept nodes.
        if node_a in filtered_points and node_b in filtered_points:
            # Direction matters: the second input column is written as USER_1
            # and the first as USER_2.
            # NOTE(review): presumably the input stores (followee, follower) - confirm.
            writer.writerow([node_b, 'follow', node_a])

print("relations.csv 文件已生成")


relations.csv 文件已生成


In [9]:
import csv

# File paths: the tab-separated repost edge list and the node file written earlier.
repost_edges_file = os.path.join(data_path_prefix, '第0层+第1层+第2层关系且转发边.txt')
filtered_points_file = os.path.join(neo4j_output_path, 'users.csv')  # node file generated earlier

# Reload the kept node IDs (first column of users.csv).
# newline='' is what the csv module expects for files handed to csv.reader.
with open(filtered_points_file, 'r', newline='', encoding='utf-8') as infile:
    reader = csv.reader(infile)
    next(reader, None)  # skip the header; tolerate an empty file
    filtered_points = {row[0] for row in reader if row}

# Filter the repost edges and write repost_relations.csv.
with open(repost_edges_file, 'r', encoding='utf-8') as infile, open(os.path.join(neo4j_output_path, 'repost_relations.csv'), 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['USER_1', 'relation', 'USER_2', 'COUNT'])
    for line in infile:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) carry no edge.
            continue
        # Each line holds three tab-separated fields; the third is passed
        # through unchanged as the COUNT column.
        node_a, node_b, count = line.split('\t')
        # Keep an edge only when both endpoints are kept nodes.
        if node_a in filtered_points and node_b in filtered_points:
            # Direction matters: the second input column is written as USER_1
            # and the first as USER_2, followed by the repost count.
            # NOTE(review): presumably the input stores (followee, follower, count) - confirm.
            writer.writerow([node_b, 'follow_and_post', node_a, count])

print("repost_relations.csv 文件已生成")


repost_relations.csv 文件已生成
