In [1]:
import os
import json
import pandas as pd
import random
import json5
from tqdm import tqdm

### 将TreeWeibo中的所有信息整合到一个文件weibo_timedelay.txt中

In [267]:
# 遍历所有时间信息文件输出到weibo_timedelay.txt
# Specify the folder containing the files
folder_path = '/home/ame/rumor/PPA/data/TreeWeibo'

# Specify the output file path
output_path = '/home/ame/rumor/weibo_dataset/weibo_timedelay.txt'

# Iterate through all the files in the folder
with open(output_path, 'w') as out_file:
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)

            # Perform the operation on the file
            with open(file_path, 'r') as in_file:
                lines = in_file.readlines()
                lines = [line for line in lines if line.strip() and len(line.split()) >= 2 and line.split()[1] == '0'] # Added a check for lines with at least two items

                out_file.writelines(lines)
# 3536975887764280	0	3536982095687768	24.68

### 将原数据与评论数据整合 weibo_id_text.txt + weibo_timedelay.txt

### weibo_id_text.txt 所有评论信息的内容 weibo_timedelay.txt 所有评论信息的原数据编号 评论数据编号 评论回复原数据时间

In [47]:
#首先weibo_id_text.txt文件的第二列数据中存在空格，如果不消除空格对后续数据处理有很大有影响
input_file_path = "weibo_id_text.txt"
output_file_path = "weibo_id_text2.txt"
with open(input_file_path, "r", encoding="utf-8") as input_file, open(output_file_path, "w", encoding="utf-8") as output_file:
    for line in input_file:
        columns = line.strip().split("\t")
        if len(columns) >= 2:
            columns[1] = columns[1].replace(" ", "")
            output_file.write("\t".join(columns) + "\n")

# Read weibo_id_text.txt file and create a dictionary
with open('weibo_id_text2.txt', 'r') as file:
    weibo_id_text = {}
    for line in file:
        line_parts = line.strip().split(maxsplit=1)
        if len(line_parts) == 2:
            weibo_id_text[line_parts[0]] = line_parts[1]

# Read weibo_timedelay.txt and replace the response text number
with open('weibo_timedelay.txt', 'r') as file:
    weibo_time_delay = [line.strip().split() for line in file if len(line.strip().split()) == 4]

for i, row in enumerate(weibo_time_delay):
    if row[2] in weibo_id_text:
        weibo_time_delay[i][2] = weibo_id_text[row[2]]

# Write the updated data to a new file
with open('weibo_id_text_merge1.txt', 'w') as file:
    for row in weibo_time_delay:
        file.write(' '.join(row) + '\n')

#这个时候数据会格式会出现错误，只显示一列
with open('weibo_id_text_merge1.txt', 'r') as input_file, open('weibo_id_text_merge2.txt', 'w') as output_file:
    for line in input_file:
        columns = line.split()
        output_file.write('\t'.join(columns) + '\n')

#第二列的0为为无用数据，删除

input_file_path = "weibo_id_text_merge2.txt"
output_file_path = "weibo_id_text_merge3.txt"

# Read the input file and remove the second column
with open(input_file_path, 'r') as input_file, open(output_file_path, 'w') as output_file:
    for line in input_file:
        # Split the line into columns
        columns = line.split()

        # Remove the second column
        del columns[1]

        # Write the modified line to the output file
        output_file.write(' '.join(columns) + '\n')

#同样会出现数据格式错误变为1列，我们将其修复
with open('weibo_id_text_merge3.txt', 'r') as input_file, open('weibo_id_text_merge4.txt', 'w') as output_file:
    for line in input_file:
        columns = line.split()
        output_file.write('\t'.join(columns) + '\n')


### 再整合进原数据内容和信息类别label: weibo.txt

In [56]:
# Read weibo_timedelay_modified1.txt
with open('weibo_id_text_merge4.txt', 'r') as f:
    weibo_timedelay = [line.strip().split() for line in f]

# Create a set of metadata ids from weibo_timedelay
metadata_ids = set(data[0] for data in weibo_timedelay)

# Read weibo.txt and filter by metadata id
with open('weibo.txt', 'r') as f:
    weibo = [line.strip().split('\t') for line in f if line.strip().split('\t')[0] in metadata_ids]

# Create a dictionary mapping metadata id to metadata content and label
metadata_dict = {data[0]: (data[1], 0 if data[2] == 'false' else 1) for data in weibo}

# Add metadata content column and label column to weibo_timedelay
weibo_timedelay_with_metadata = [
    [data[0], metadata_dict.get(data[0], ('', ''))[0], metadata_dict.get(data[0], ('', ''))[1], data[1], data[2]]
    for data in weibo_timedelay
]

# Write weibo_timedelay_with_metadata to a new file
with open('weibo_id_text_merge5.txt', 'w') as f:
    for data in weibo_timedelay_with_metadata:
        f.write('\t'.join(map(str, data)) + '\n')


### 将txt文件转换为PLAN数据格式的json文件


In [72]:

# Read weibo_timedelay_modified2.txt using pandas
df = pd.read_csv('weibo_id_text_merge5.txt', sep='\t', header=None, names=['metadata', 'content_metadata', 'label', 'content_reply', 'time_reply'])

# Remove rows with missing values
df.dropna(subset=['metadata', 'label', 'time_reply'], inplace=True)

# Convert 'metadata' and 'label' columns to integer data type
df['metadata'] = df['metadata'].astype(int)
df['label'] = df['label'].astype(int)

# Group the data by the 'metadata' column and create the desired JSON structure
grouped_data = {}
for index, row in df.iterrows():
    metadata, content_metadata, label, content_reply, time_reply = row
    if metadata not in grouped_data:
        grouped_data[metadata] = {
            "id_": metadata,
            "label": label,
            "tweets": [content_metadata],
            "time_delay": [0],
            "structure": [],
        }
    else:
        grouped_data[metadata]["tweets"].append(content_reply)
        grouped_data[metadata]["time_delay"].append(time_reply)

# Write the JSON data to a file, with each object on a single line
with open('weibo_full.json', 'w') as file:
    for data in grouped_data.values():
        json.dump(data, file, ensure_ascii=False)
        file.write('\n')


### 统计新生成的json文件的行数


In [78]:
json_file_path = 'weibo_full.json'

with open(json_file_path, 'r') as file:
    line_count = sum(1 for _ in file)

print(f'There are {line_count} elements in the JSON file.')

There are 4595 elements in the JSON file.


### 统计txt文件不相同元素的个数


In [61]:
file_name = 'weibo_timedelay_modified2_filtered.txt'

unique_elements = set()
with open(file_name, 'r') as file:
    for line in file:
        first_column = line.strip().split()[0]
        unique_elements.add(first_column)

print(f'The number of different elements in the first column is: {len(unique_elements)}')

#

The number of different elements in the first column is: 4662


### 统计txt文件列数(防止数据混乱)

In [30]:
# define the filename and delimiter
filename = "weibo_id_text_merge4.txt"
delimiter = "\t"  # change to "," for a CSV file

# read the first row of the file and count the number of delimiter characters
with open(filename, "r") as file:
    first_row = file.readline().strip()
    num_columns = first_row.count(delimiter) + 1  # add 1 to account for the last column

# output the result
print(f"The file '{filename}' has {num_columns} columns.")


The file 'weibo_id_text_merge5.txt' has 5 columns.


### 按70% 15% 15%分配数据集

In [2]:
# Load the JSON data from the file
with open('/home/ame/rumor/PLAN/codes/data/weibo/weibo_full_data_repair.json', 'r') as file:
    json_data = [json.loads(line) for line in file]

# Shuffle the data
random.shuffle(json_data)

# Split the data into a training set, a test set 1, and a test set 2 with a 70:15:15 ratio
train_index = int(len(json_data) * 0.7)
test1_index = int(len(json_data) * 0.85)
train_data = json_data[:train_index]
test1_data = json_data[train_index:test1_index]
test2_data = json_data[test1_index:]

# Write the training set to a file
with open('weibo_train_repair.json', 'w') as file:
    for data in train_data:
        json.dump(data, file, ensure_ascii=False)
        file.write('\n')

# Write test set 1 to a file
with open('weibo_test1_repair.json', 'w') as file:
    for data in test1_data:
        json.dump(data, file, ensure_ascii=False)
        file.write('\n')

# Write test set 2 to a file
with open('weibo_test2_repair.json', 'w') as file:
    for data in test2_data:
        json.dump(data, file, ensure_ascii=False)
        file.write('\n')


### 统计所有json数据中的time_delay[]包含的最大元素值和包含最多的元素个数

In [66]:
# Initialize the maximum element count and maximum number
max_element_count = 0
max_number = 0

# Read the JSON file line by line
with open('/home/ame/rumor/PLAN/codes/pheme.json', 'r') as f:
    for line in f:
        # Ignore empty lines
        if not line.strip():
            continue

        item = json.loads(line)
        time_delay = item['time_delay']

        # Update the maximum number if a larger value is found
        current_max_number = max(time_delay)
        max_number = max(max_number, current_max_number)

        # Update the maximum element count
        max_element_count = max(max_element_count, len(time_delay))

print(f'The maximum number in all time_delay arrays is {max_number}.')
print(f'The maximum number of elements in all time_delay arrays is {max_element_count}.')


The maximum number in all time_delay arrays is 98.
The maximum number of elements in all time_delay arrays is 229.


### 更改微博数据集 中的time_dealy[] 最大为100

In [83]:
# Initialize the output list
output_data = []

# Read the JSON file line by line
with open('/home/ame/rumor/PLAN/codes/data/weibo/weibo_full.json', 'r') as f:
    for line in f:
        # Ignore empty lines
        if not line.strip():
            continue

        item = json.loads(line)
        time_delay = item['time_delay']

        # Cap numbers greater than 100 to 100
        capped_time_delay = [min(x, 100) for x in time_delay]

        # Update the time_delay array in the current JSON object
        item['time_delay'] = capped_time_delay

        # Add the modified JSON object to the output list
        output_data.append(item)

# Write the modified JSON objects to a new file
with open('/home/ame/rumor/PLAN/codes/data/weibo/weibo_full_change_time_delay.json', 'w', encoding='utf-8') as f:
    for item in output_data:
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')


### 不再是将大于100的数据变为100而是删除，同时删除对应的tweet[]中的数据

In [81]:
import json

# Initialize the output list
output_data = []

# Read the JSON file line by line
with open('weibo_full.json', 'r') as f:
    for line in f:
        # Ignore empty lines
        if not line.strip():
            continue

        item = json.loads(line)
        time_delay = item['time_delay']

        # Remove items in the time_delay and tweets arrays where time_delay > 100
        cleaned_time_delay = []
        cleaned_tweets = []
        for i, time_val in enumerate(time_delay):
            try:
                time_val_float = float(time_val)  # Attempt to convert time_val to float
                if time_val_float <= 100:
                    cleaned_time_delay.append(time_val_float)
                    cleaned_tweets.append(item['tweets'][i])
            except ValueError:
                # Skip the value if it cannot be converted to float
                continue

        # Update the time_delay and tweets arrays in the current JSON object
        item['time_delay'] = cleaned_time_delay
        item['tweets'] = cleaned_tweets

        # Add the modified JSON object to the output list
        output_data.append(item)

# Write the modified JSON objects to a new file
with open('weibo_full_del.json', 'w', encoding='utf-8') as f:
    for item in output_data:
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')


### 统计weibo数据中的label参数

In [73]:
# Initialize label counters
label_0_count = 0
label_1_count = 1

# Read the JSON file line by line
with open('weibo_full.json', 'r') as f:
    for line in f:
        # Ignore empty lines
        if not line.strip():
            continue

        item = json.loads(line)
        label = item['label']

        # Update the label counters
        if label == 0:
            label_0_count += 1
        elif label == 1:
            label_1_count += 1

# Print the count of each label
print(f"Count of label 0: {label_0_count}")
print(f"Count of label 1: {label_1_count}")


Count of label 0: 2281
Count of label 1: 2315


### 删除微博数据集 中的time_dealy[]的元素


In [75]:
# Read the JSON file line by line and process the objects
with open('/home/ame/rumor/PLAN/codes/data/weibo/weibo_test1.json', 'r', encoding='utf-8') as input_file, \
        open('/home/ame/rumor/PLAN/codes/data/weibo/weibo_test1_no_time_dealy.json', 'w', encoding='utf-8') as output_file:
    for line in input_file:
        # Ignore empty lines
        if not line.strip():
            continue

        item = json.loads(line)
        item['time_delay'] = []

        # Write the modified JSON object to the output file with ensure_ascii=False
        json.dump(item, output_file, ensure_ascii=False)
        output_file.write('\n')

### 拆分json数据


In [83]:
with open('/home/ame/rumor/PLAN/codes/data/weibo/weibo_full_del.json', 'r',encoding='utf-8') as f:
    data = f.read().strip().split('\n')

for i, obj in enumerate(data):
    with open(f'weibo_full_change_time_dealy_{i}.json', 'w') as f_out:
        json.dump(json.loads(obj), f_out, separators=(',', ':'),ensure_ascii=False)
        f_out.write('\n')

### 查找错误json文件

In [None]:
check_file_path = os.path.join('/home/ame/rumor/PLAN/codes', 'weibo_check.py')
%run $check_file_path

### 合并json文件

In [86]:
def clean_line(line):
    return ''.join(c for c in line if not c.isspace() or c.isprintable())

def read_json_lines(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                line = clean_line(line.strip())
                if not line:
                    continue
                yield json5.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping malformed JSON object in {file_path}: {line}")

# Rest of the code remains the same


def write_json_lines(file_path, data):
    with open(file_path, 'w', encoding='utf-8') as file:
        for item in data:
            json_line = json.dumps(item, ensure_ascii=False)
            file.write(f"{json_line}\n")

def merge_json_files(directory_path, output_file):
    merged_data = []

    # Iterate over all files in the specified directory
    for file_name in tqdm(os.listdir(directory_path), desc="Merging files"):
        # Check if the file is a JSON file
        if file_name.endswith('.json'):
            file_path = os.path.join(directory_path, file_name)

            # Read the JSON lines and merge them into the merged_data list
            for json_line in read_json_lines(file_path):
                merged_data.append(json_line)

    # Write the merged data to the output file
    write_json_lines(output_file, merged_data)

if __name__ == '__main__':
    input_directory = '/home/ame/rumor/PLAN/codes/data/weibo/test2'
    output_file = '/home/ame/rumor/PLAN/codes/data/weibo/weibo_full_data_repair.json'
    merge_json_files(input_directory, output_file)


Merging files: 100%|██████████| 4594/4594 [02:13<00:00, 34.30it/s]


### 我们将微博数据集分成5份，选取当中的一份按照训练集：测试集1：测试集2为70：15：15的比例进行测试



In [88]:
def read_json_lines(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = [json.loads(line.strip()) for line in file]
    return data

def write_json_lines(file_path, data):
    with open(file_path, 'w', encoding='utf-8') as file:
        for item in data:
            json_line = json.dumps(item, ensure_ascii=False)
            file.write(f"{json_line}\n")

def split_dataset(dataset, ratios):
    random.shuffle(dataset)
    total_length = len(dataset)
    splits = []
    current_index = 0
    for ratio in ratios:
        split_length = int(total_length * ratio)
        splits.append(dataset[current_index:current_index + split_length])
        current_index += split_length
    return splits

if __name__ == '__main__':
    dataset_file = '/home/ame/rumor/PLAN/codes/data/weibo/weibo_full_data_repair.json'
    output_directory = '/home/ame/rumor/PLAN/codes/data/weibo'

    dataset = read_json_lines(dataset_file)

    num_splits = 5
    equal_parts = [dataset[i::num_splits] for i in range(num_splits)]

    for i, part in enumerate(equal_parts):
        train, test1, test2 = split_dataset(part, [0.7, 0.15, 0.15])
        write_json_lines(os.path.join(output_directory, f'weibo_split_{i}_train.json'), train)
        write_json_lines(os.path.join(output_directory, f'weibo_split_{i}_test1.json'), test1)
        write_json_lines(os.path.join(output_directory, f'weibo_split_{i}_test2.json'), test2)


### 创造weibo数据集，纠正label

In [4]:
with open('/home/ame/rumor/weibo/weibo_source_text.txt', 'r', encoding='utf-8') as input_file, open('weibo_source_text.txt', 'w', encoding='utf-8') as output_file:
    for line in input_file:
        columns = line.strip().split('\t')
        if columns[2] == 'non-rumor':
            columns[2] = 'true'
        output_file.write('\t'.join(columns) + '\n')


## 统计twitter 15/16label

In [9]:
# Initialize label counters
label_counts = {0: 0, 1: 0, 2: 0, 3: 0}

# Read the JSON file line by line
with open('/home/ame/rumor/PLAN/codes/data/twitter15_16/twitter15/split_data/structure_v2/split_0/train_unique_w_structure_v2_modified.json', 'r') as f:
    for line in f:
        # Ignore empty lines
        if not line.strip():
            continue

        item = json.loads(line)
        label = item['label']

        # Update the label counters
        if label in label_counts:
            label_counts[label] += 1

# Print the count of each label
for label, count in label_counts.items():
    print(f"Count of label {label}: {count}")


Count of label 0: 836
Count of label 1: 297
Count of label 2: 0
Count of label 3: 0


### 改变twitter15  的label
#string_to_id = {"false" : 0,
                "true" : 1,
                "unverified" : 2,
                "non-rumor" : 3}

false-rumor, true-rumor, unverified all are rumor, non-rumor is non-rumor

In [12]:
import json

# Open the input JSON file for reading and create a new output JSON file for writing
with open('/home/ame/rumor/PLAN/codes/data/pheme/split_data/structure/split_1/test_w_structure.json', 'r') as infile, open('/home/ame/rumor/PLAN/codes/data/pheme/split_data/structure/split_0/test2_w_structure_modified.json', 'w') as outfile:
    for line in infile:
        # Ignore empty lines
        if not line.strip():
            continue

        item = json.loads(line)
        label = item['label']

        # Update the label values
        if label in [1,2]:
            item['label'] = 0
        elif label == 0:
            item['label'] = 1

        # Write the modified item to the output JSON file
        json.dump(item, outfile)
        outfile.write('\n')
