In [1]:
import json
import re

def preprocess_text(text):
    """Clean a raw text string and return the normalised result.

    Steps, applied in this order:
      1. Collapse every run of whitespace into a single space.
      2. Replace each ``http...`` / ``www...`` substring (running up to the
         next whitespace character) with a single space.
      3. Replace every ``&`` with ``' and '``.
      4. Delete every character that is not an ASCII letter, a digit,
         whitespace, or an ASCII punctuation mark.
      5. Strip leading and trailing whitespace.

    The spaces inserted by steps 2 and 3 are not collapsed again, so the
    result may contain runs of consecutive spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Collapse whitespace runs (spaces, tabs, newlines) into single spaces.
    cleaned = " ".join(text.split())

    # Blank out URL-like substrings; each match ends at the next whitespace.
    for url_pattern in (r"http(\S)+", r"www(\S)+"):
        cleaned = re.sub(url_pattern, ' ', cleaned)

    # Spell out ampersands.
    cleaned = re.sub(r"&", ' and ', cleaned)
    # NOTE(review): this call can never match, because the substitution above
    # has already rewritten every '&' (so '&amp;' ends up as ' and amp;').
    # It is kept so the output stays identical to the existing pipeline.
    cleaned = cleaned.replace('&amp', ' ')

    # Keep only ASCII letters, digits, whitespace and ASCII punctuation.
    cleaned = re.sub(r'[^a-zA-Z0-9\s!"#$%&\'()*+,-./:;<=>?@\[\\\]^_`{|}~]+', '', cleaned)

    return cleaned.strip()

def preprocess_json_file(json_file):
    """Load a JSON file and clean the ``origin_text`` field of its entries.

    Args:
        json_file: Path to a UTF-8 encoded JSON file whose top level is an
            object mapping keys to entries.

    Returns:
        The parsed mapping. Every entry that contains an ``origin_text`` key
        has that value replaced by ``preprocess_text(origin_text)``; entries
        without the key are returned untouched.
    """
    with open(json_file, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)

    # The top-level keys are not needed here; entries are updated in place.
    for entry in entries.values():
        if 'origin_text' not in entry:
            continue
        entry['origin_text'] = preprocess_text(entry['origin_text'])

    return entries

In [4]:
import json
import jsonlines

def merge_jsonl_json(jsonl_file, json_file, output_file):
    """Copy a JSON Lines file, swapping in generated text and labels.

    The JSON file is loaded through ``preprocess_json_file`` and turned into
    a lookup from each entry's cleaned ``origin_text`` to its
    ``generated_text`` / ``generated_label``. Every record of ``jsonl_file``
    is then written to ``output_file``; when a record's ``message`` equals a
    key of that lookup, its ``message`` and ``label`` are overwritten with
    the generated values first. Records without a match are copied unchanged.

    The ``message`` values are compared verbatim; they are NOT passed through
    ``preprocess_text`` here.
    NOTE(review): presumably the JSONL messages were already cleaned the same
    way upstream; confirm, otherwise matches will be missed.

    When finished, prints a row of 20 asterisks followed by the number of
    records that were replaced.

    Args:
        jsonl_file: Path of the input JSON Lines file; each record must have
            a ``message`` key.
        json_file: Path of the JSON file with ``origin_text``,
            ``generated_text`` and ``generated_label`` per entry.
        output_file: Path of the JSON Lines file to write.

    Raises:
        KeyError: If an entry that has ``origin_text`` lacks
            ``generated_text`` or ``generated_label``, or if a JSONL record
            has no ``message`` key.
    """
    json_data = preprocess_json_file(json_file)

    # Build the lookup from cleaned origin_text to its generated replacement.
    # Entries without 'origin_text' are skipped, mirroring the guard in
    # preprocess_json_file (which tolerates them) instead of failing with a
    # KeyError. If two entries share the same cleaned origin_text, the later
    # one wins.
    origin_to_generated = {}
    for value in json_data.values():
        if 'origin_text' not in value:
            continue
        origin_to_generated[value['origin_text']] = {
            'text': value['generated_text'],
            'label': value['generated_label'],
        }

    cnt = 0

    # Stream the JSONL records, patching those whose message has a replacement.
    with jsonlines.open(jsonl_file) as reader, jsonlines.open(output_file, 'w') as writer:
        for obj in reader:
            replacement = origin_to_generated.get(obj['message'])
            if replacement is not None:
                cnt += 1
                obj['message'] = replacement['text']
                obj['label'] = replacement['label']
            writer.write(obj)
    print('*'*20, cnt)

# Example usage
TRAIN_JSONL_PATH = '/home/gx/data/cloudcomputing/Trust_TELLER/data/GOSSIPCOP/train.jsonl'
STYLE_FAKE_JSON_PATH = '/home/gx/data/cloudcomputing/data/gossipcop_v3-1_style_based_fake.json'
# Relative path: the merged file is written to the current working directory.
MERGED_OUTPUT_PATH = 'train_generate.jsonl'

merge_jsonl_json(
    jsonl_file=TRAIN_JSONL_PATH,
    json_file=STYLE_FAKE_JSON_PATH,
    output_file=MERGED_OUTPUT_PATH,
)


******************** 5127
