In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
import random
import math

In [7]:
# Load the raw interaction log and bucket its timestamps into six snapshots.
data_name = 'mooc'

# The CSV has no header row; columns are assigned explicitly below.
df = pd.read_csv(f'{data_name}.csv', header=None)
df.columns = ['source', 'target', 'timestamp', 'label']
df = df.sort_values(by='timestamp')
# Replace the raw timestamp with its quantile-bin index (q=6 -> integer codes).
df['timestamp'] = pd.qcut(df['timestamp'], q=6, labels=False)
df = df.reindex(columns=['source', 'target', 'label', 'timestamp'])
# Every row read from disk is flagged 0; the augmentation step flags synthetic edges with 1.
df['is_generated'] = 0

# Sanity-check counts.
# NOTE: `label_0_count` holds the number of rows with label == 1; the name is
# kept unchanged so that any later cell referring to it keeps working.
label_0_count = df[df['label'] == 1].shape[0]
# Number of rows that are NOT generated (is_generated == 0).
is_generated_count = df[df['is_generated'] == 0].shape[0]
print("Number of rows where label is 1: ", label_0_count)
# The message now matches the value that is actually counted (rows with flag 0).
print("Number of rows where is_generated is 0: ", is_generated_count)

# Inspect the label-0 rows and check which snapshots each label appears in.
print(df[df['label'] == 0])
print(df[df['label'] == 0]['timestamp'].unique())
print(df[df['label'] == 1]['timestamp'].unique())

# Count distinct ids across both the source and target columns.
unique_elements = pd.concat([df['source'], df['target']]).unique()
print("Total unique elements:", len(unique_elements))

Number of rows where label is 1:  4066
Number of rows where is_generated is 1:  411748
        source  target  label  timestamp  is_generated
0            0       1      0          0             0
1            0       2      0          0             0
2            0       1      0          0             0
3            0       2      0          0             0
4            0       3      0          0             0
...        ...     ...    ...        ...           ...
411743    7026       8      0          5             0
411744    6842       8      0          5             0
411745    7026       9      0          5             0
411746    6842       5      0          5             0
411747      70      23      0          5             0

[407682 rows x 5 columns]
[0 1 2 3 4 5]
[0 1 2 3 4 5]
Total unique elements: 7047


In [8]:
# Split the original data into train / test sets (80% / 20%).
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Split the training portion again to obtain a validation set (25% of it).
train, valid = train_test_split(train, test_size=0.25, random_state=42)

df_all = pd.concat([train, valid, test])

# Every source node that appears anywhere in the training split.
all_nodes = set(train['source'].tolist())

# All distinct timestamp buckets present in the training split.
timestamps = train['timestamp'].unique()

# Dedicated, seeded RNG: the generated edges become reproducible across
# kernel restarts without touching the global `random` state.
edge_rng = random.Random(42)

# Generated edges are collected per timestamp and concatenated once at the
# end instead of growing the DataFrames inside the loop. This is equivalent:
# edges generated for one timestamp never affect another timestamp's subset.
generated_frames = []
for timestamp_subset in timestamps:
    # Source nodes that are present in this timestamp bucket.
    subset_nodes = set(
        train.loc[train['timestamp'] == timestamp_subset, 'source'].tolist()
    )

    # Nodes known to the training split but missing from this bucket.
    # NOTE(review): an earlier comment said only half of these nodes should be
    # added, but the code always added all of them; that behaviour is kept.
    # Sorted so the row order of generated edges is deterministic.
    nodes_to_add = sorted(all_nodes - subset_nodes)
    if not nodes_to_add:
        continue

    # Build the candidate list once per bucket rather than once per node.
    candidates = sorted(subset_nodes)

    # Connect each missing node to a randomly chosen node of this bucket.
    # Row layout: source, target, label=1, timestamp, is_generated=1.
    new_edges = [
        [node_to_add, edge_rng.choice(candidates), 1, timestamp_subset, 1]
        for node_to_add in nodes_to_add
    ]
    generated_frames.append(
        pd.DataFrame(
            new_edges,
            columns=['source', 'target', 'label', 'timestamp', 'is_generated'],
        )
    )

# Single concat per frame; ignore_index=True resets the index as before,
# even when no edges were generated.
df_all = pd.concat([df_all, *generated_frames], ignore_index=True)
train = pd.concat([train, *generated_frames], ignore_index=True)


In [10]:
def to_txt(df, type_, out_dir=None):
    """Write one pair of edge-list text files per timestamp group of ``df``.

    For every distinct value ``t`` of ``df['timestamp']`` two files are
    written, where ``layer = int(t) + 1``:

    * ``{layer}_{type_}_pos.txt`` - rows whose ``label`` is 0
    * ``{layer}_{type_}_neg.txt`` - rows whose ``label`` is 1

    Each line has the form ``"<source> <target>"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``source``, ``target``, ``label`` and
        ``timestamp``.
    type_ : str
        Split name embedded in the file names (e.g. ``'train'``).
    out_dir : str, optional
        Directory to write into. Defaults to ``./{data_name}``, using the
        module-level ``data_name``. It is created if it does not exist.
    """
    import os

    if out_dir is None:
        out_dir = f"./{data_name}"
    # Previously a missing directory made open() fail with FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)

    for timestamp, sub_df in df.groupby('timestamp'):
        # Derive the layer from the group key itself. Parsing only the last
        # character of a "layer_<timestamp>" name mapped timestamp 10 to
        # layer 1, silently overwriting that layer's files.
        layer = int(timestamp) + 1

        # Label 1 rows go to the "neg" file, label 0 rows to the "pos" file.
        parts = (
            ('neg', sub_df[sub_df['label'] == 1]),
            ('pos', sub_df[sub_df['label'] == 0]),
        )
        for suffix, part in parts:
            path = os.path.join(out_dir, f"{layer}_{type_}_{suffix}.txt")
            with open(path, 'w') as handle:
                # Column-wise iteration avoids the per-row Series construction
                # (and dtype upcasting) of DataFrame.iterrows().
                handle.writelines(
                    f"{source} {target}\n"
                    for source, target in zip(part['source'], part['target'])
                )


In [None]:
# Export each split to per-layer pos/neg edge-list files, in train -> valid -> test order.
for split_name, split_frame in (('train', train), ('valid', valid), ('test', test)):
    to_txt(split_frame, split_name)