In [11]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Dataset to process; the edge list is read from '<data_name>.csv'.
data_name = 'mooc'

# Read the header-less CSV and name its four columns.
data = pd.read_csv(f'{data_name}.csv', header=None)
data.columns = ['source', 'target', 'timestamp', 'label']

# Sort by time, replace every timestamp with the index of its quantile bin
# (q=6, so bins are numbered 0..5), reorder the columns, and append an
# all-zero 'is_generated' column.
data = (
    data.sort_values(by='timestamp')
    .assign(timestamp=lambda frame: pd.qcut(frame['timestamp'], q=6, labels=False))
    .reindex(columns=['source', 'target', 'label', 'timestamp'])
    .assign(is_generated=0)
)

# NOTE(review): despite its name, label_0_count counts the rows whose label is 1.
label_0_count = len(data.loc[data['label'] == 1])
# Count rows by their 'is_generated' flag.
# NOTE(review): the filter is is_generated == 0, yet the message printed below
# says "is 1" -- confirm which of the two is intended.
is_generated_count = len(data.loc[data['is_generated'] == 0])
print("Number of rows where label is 1: ", label_0_count)
print("Number of rows where is_generated is 1: ", is_generated_count)


# Keep only the rows of the last time bin (index 5).
# NOTE(review): the variable is named data_timestamp_9 but holds bin 5.
data_timestamp_9 = data.loc[data['timestamp'] == 5]

# Separate that snapshot by label: label 0 -> pos_data, label 1 -> neg_data.
pos_data = data_timestamp_9.loc[data_timestamp_9['label'] == 0]
neg_data = data_timestamp_9.loc[data_timestamp_9['label'] == 1]


print(pos_data.shape[0])
print(neg_data.shape[0])

Number of rows where label is 1:  4066
Number of rows where is_generated is 1:  411748
67969
656


In [12]:


# Split the dataset
def _split_train_valid_test(frame):
    """Split ``frame`` in two steps with a fixed seed.

    First 40% of the rows go to an intermediate set and the rest to the test
    set; the intermediate set is then halved into train and validation sets.
    Returns ``(temp, train, valid, test)``.
    """
    temp_part, test_part = train_test_split(frame, train_size=0.4, random_state=42)
    train_part, valid_part = train_test_split(temp_part, train_size=0.5, random_state=42)
    return temp_part, train_part, valid_part, test_part


temp_pos, train_pos, valid_pos, test_pos = _split_train_valid_test(pos_data)
temp_neg, train_neg, valid_neg, test_neg = _split_train_valid_test(neg_data)

# Create the folder that will hold the output files (no error if it already exists)
output_folder = f'./{data_name}_5/'
os.makedirs(output_folder, exist_ok=True)

# Write the edge list to a text file
def write_data_to_file(data, filename):
    """Write one ``"<source> <target>"`` line per row of ``data`` to ``filename``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'source' and 'target' columns; any other column is ignored.
    filename : str or path-like
        Destination path. It is opened in text mode ``'w'``, so an existing
        file is overwritten.

    Raises
    ------
    KeyError
        If ``data`` has no 'source' or 'target' column.
    """
    # Read the two columns directly rather than via DataFrame.iterrows():
    # iterrows() builds one Series per row, which upcasts integer ids to float
    # (written as "1.0 2.0") as soon as the frame contains any float column,
    # and it is much slower. Looking the columns up before opening the file
    # also means a missing column does not truncate an existing file.
    pairs = zip(data['source'], data['target'])
    with open(filename, 'w') as f:
        f.writelines(f"{source} {target}\n" for source, target in pairs)

# File-naming scheme: one output file per (split, label) combination.
file_suffixes = ['train', 'valid', 'test']
file_labels = ['pos', 'neg']

# Explicit lookup table for the six split frames. This replaces the dynamic
# globals()[f'{suffix}_{label}'] lookup, which hides the dependency on these
# variables and stops working if the code is moved into a function.
split_frames = {
    ('train', 'pos'): train_pos,
    ('train', 'neg'): train_neg,
    ('valid', 'pos'): valid_pos,
    ('valid', 'neg'): valid_neg,
    ('test', 'pos'): test_pos,
    ('test', 'neg'): test_neg,
}

# Go through every combination and write '<output_folder>/6_<suffix>_<label>.txt'.
# NOTE(review): the '6_' prefix is hard-coded; presumably it is the 1-based
# number of time bin 5 selected earlier -- confirm.
for suffix in file_suffixes:
    for label in file_labels:
        data_to_write = split_frames[(suffix, label)]
        filename = os.path.join(output_folder, f'6_{suffix}_{label}.txt')
        write_data_to_file(data_to_write, filename)

print("文件写入完成！")


文件写入完成！
