In [1]:
import json
import pandas as pd
import numpy as np
import random
import os
import sys

In [2]:
# Source table schema: sentence id and the raw (possibly misspelled) text.
df_source=pd.DataFrame(columns=["id","original_text"])

In [3]:
# Load the source sentences. The file is JSON Lines: one object per line
# carrying a "pid" (sentence id) and a "source" (raw text).
source_records = []
with open('cail2022/train_source.json', 'r', encoding='utf-8') as source_file:
    for line in source_file:
        # Strip surrounding whitespace instead of chopping the last character:
        # the final line may lack a trailing newline, and slicing it would cut
        # off the closing brace of the JSON object.
        line = line.strip()
        if not line:  # tolerate blank lines
            continue
        content = json.loads(line)  # parse the JSON text into a dict
        source_records.append({"id": content["pid"], "original_text": content["source"]})

# Build the frame once from the collected records (appending row by row with
# .loc re-allocates on every insert). dtype=object keeps the same column
# dtypes as the row-by-row version so the later merge on "id" is unaffected.
df_source = pd.DataFrame(source_records, columns=["id", "original_text"], dtype=object)

In [4]:
df_source.head()  # preview the first rows of the source table

Unnamed: 0,id,original_text
0,23136,"基于上述证据,确认如下事实:原告于2012年2月17日至被告处工作,每月工资为3,200元"
1,22507,同时，姚建杰自述中建二局未曾与其签订劳动合同，未为其缴纳过社会保险，其就是到工地打工，没有档案
2,20120,公司将于7月25日连同李华生6月26-7月25日及另外10天工资一起打至李华生的账户上
3,21151,诉讼中，原告王伟提供了下列证据：1、请假申请表3份，用以证明其向被告提出了请假一个月的申请，...
4,5214,具体收费标准由市物价、财政、市容园林部门按照国家规定并根据我市实际情况制定。


In [5]:
# Label table schema: sentence id, positions of erroneous characters,
# (ori, edit) replacement pairs, edit start positions, and per-edit length deltas.
df_target=pd.DataFrame(columns=["id","wrong_ids","edits","pos","offset"])

In [6]:
# Load the labels. The file is JSON Lines: one object per line carrying a
# "pid" and a "target" list of corrections, each with "pos", "ori" and "edit".
label_records = []
with open('cail2022/train_label.json', 'r', encoding='utf-8') as label_file:
    for line in label_file:
        # Strip whitespace rather than slicing off the last character, which
        # would corrupt a final line that has no trailing newline.
        line = line.strip()
        if not line:  # tolerate blank lines
            continue
        content = json.loads(line)  # parse the JSON text into a dict

        # Sentences without errors keep an empty wrong_ids list; their
        # edits / pos / offset cells are left missing (NaN).
        record = {"id": content["pid"], "wrong_ids": []}

        if content["target"]:
            wrong_ids = []   # index of every erroneous character
            edit_dic = []    # (ori, edit) replacement pairs
            pos_lst = []     # start position of each edit, used when replacing
            offset_lst = []  # length change caused by each edit: len(edit) - len(ori)

            for x in content["target"]:
                start = int(x["pos"])  # position where the erroneous span begins
                # Mark every character of the erroneous span, not just the
                # first two. An empty "ori" still marks its start position,
                # so rows with edits never end up with an empty wrong_ids.
                span = max(len(x["ori"]), 1)
                wrong_ids.extend(range(start, start + span))
                edit_dic.append((x['ori'], x['edit']))
                pos_lst.append(start)
                offset_lst.append(len(x['edit']) - len(x['ori']))

            record.update(wrong_ids=wrong_ids, edits=edit_dic, pos=pos_lst, offset=offset_lst)

        label_records.append(record)

# Build the frame once; dtype=object keeps list-valued cells and matches the
# dtypes produced by the row-by-row version.
df_target = pd.DataFrame(
    label_records, columns=["id", "wrong_ids", "edits", "pos", "offset"], dtype=object
)

In [7]:
df_target.head()  # preview the first rows of the label table

Unnamed: 0,id,wrong_ids,edits,pos,offset
0,23136,[],,,
1,22507,[],,,
2,20120,[],,,
3,21151,[588],"[(评, 凭)]",[588],[0]
4,5214,[],,,


In [8]:
df = pd.merge(df_source, df_target, how='inner', on="id")  # join each source sentence with its labels on id

In [9]:
df.info()  # inspect column dtypes and non-null counts

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 0 to 2999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             3000 non-null   object
 1   original_text  3000 non-null   object
 2   wrong_ids      3000 non-null   object
 3   edits          1426 non-null   object
 4   pos            1426 non-null   object
 5   offset         1426 non-null   object
dtypes: object(6)
memory usage: 164.1+ KB


In [10]:
df["correct_text"] = df["original_text"]  # start the corrected text as a copy of the original

In [11]:
# Build correct_text by applying every (ori -> edit) replacement to the
# original text.
for i in df.index:
    if len(df.at[i, "wrong_ids"]) >= 1:  # only rows that need correction
        # Always start from the untouched original so that re-running this
        # cell does not apply the edits a second time.
        content = df.at[i, "original_text"]
        pos = df.at[i, "pos"]
        edits = df.at[i, "edits"]

        # Apply the edits from right to left. A replacement only shifts the
        # characters to its right, so every position still to be processed
        # stays valid and no offset bookkeeping is needed. (Adding only the
        # previous edit's offset is wrong as soon as there are three or more
        # edits, or the positions are not sorted.)
        ordered = sorted(zip(pos, edits), key=lambda item: item[0], reverse=True)
        for start, (ori, edit) in ordered:
            content = content[:start] + edit + content[start + len(ori):]

        df.at[i, "correct_text"] = content

In [12]:
# Drop rows whose original and corrected texts differ in length (MacBERT
# needs aligned pairs) and collect the rows that exceed the model limit.
MAX_LEN = 512  # maximum sequence length accepted downstream

longs = []       # index labels of rows longer than MAX_LEN (handled later)
mismatched = []  # index labels of rows whose lengths differ (dropped here)
for i in df.index:  # iterate the real index instead of a hard-coded row count
    original_len = len(df.at[i, "original_text"])
    corrected_len = len(df.at[i, "correct_text"])
    if original_len != corrected_len:
        mismatched.append(i)
    elif original_len > MAX_LEN:  # both lengths are equal in this branch
        longs.append(i)

# Drop once after the scan; this also keeps the cell safe to re-run.
df = df.drop(index=mismatched)

In [13]:
longs  # index labels of the rows whose text is longer than 512 characters

[3, 195, 1122, 1157, 2718, 2748]

In [14]:
# Shorten the over-long rows collected in `longs`.
#  * Rows without any error are dropped.
#  * Otherwise the text is cut to a window of at most KEEP_LEN characters
#    that begins at the start of the sentence containing the first error
#    (right after the nearest preceding delimiter) and always contains
#    that first error.
MAX_LEN = 512           # rows at or below this length need no trimming
KEEP_LEN = MAX_LEN - 1  # a forced cut keeps the last 511 characters
SENTENCE_ENDS = ("。", "；", "）", ":")

rows_to_drop = []
for i in longs:
    if i not in df.index:  # already removed (e.g. the cell was re-run)
        continue

    wrong_ids = df.at[i, "wrong_ids"]
    if len(wrong_ids) == 0:
        rows_to_drop.append(i)  # no error to learn from: filter the row out
        continue

    text = df.at[i, "original_text"]
    n = len(text)
    if n <= MAX_LEN:  # already short enough (e.g. trimmed by an earlier run)
        continue

    first_wrong = min(wrong_ids)
    tail_start = n - KEEP_LEN
    if first_wrong >= tail_start:
        # The first error lies in the last KEEP_LEN characters: keep the tail
        # of the text and never start earlier than tail_start.
        floor = tail_start
    else:
        # The first error lies before the tail window. Cutting only the front
        # could never bring the text under the limit, so the window is
        # anchored around the error and the tail is truncated instead.
        floor = max(0, first_wrong - KEEP_LEN + 1)

    # Look backwards for the nearest delimiter strictly before the first
    # error, so the erroneous character itself is never cut away.
    start = floor
    for k in range(first_wrong - 1, floor - 1, -1):
        if text[k] in SENTENCE_ENDS:
            start = k + 1
            break
    end = min(n, start + KEEP_LEN)

    # Re-base the annotations on the new window and discard anything that
    # falls outside it; pos / edits / offset are filtered together so they
    # stay aligned with each other.
    kept = [
        (p - start, e, o)
        for p, e, o in zip(df.at[i, "pos"], df.at[i, "edits"], df.at[i, "offset"])
        if start <= p < end
    ]
    # .at is used because the assigned cell values are lists.
    df.at[i, "wrong_ids"] = [x - start for x in wrong_ids if start <= x < end]
    df.at[i, "pos"] = [p for p, _, _ in kept]
    df.at[i, "edits"] = [e for _, e, _ in kept]
    df.at[i, "offset"] = [o for _, _, o in kept]
    df.at[i, "original_text"] = text[start:end]
    df.at[i, "correct_text"] = df.at[i, "correct_text"][start:end]

df = df.drop(index=rows_to_drop)

In [15]:
# Renumber the rows 0..n-1 after the drops so later cells can address them by consecutive labels.
df.reset_index(inplace=True, drop=True)

In [16]:
df.info()  # inspect the filtered frame: row count and non-null counts per column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2675 entries, 0 to 2674
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             2675 non-null   object
 1   original_text  2675 non-null   object
 2   wrong_ids      2675 non-null   object
 3   edits          1103 non-null   object
 4   pos            1103 non-null   object
 5   offset         1103 non-null   object
 6   correct_text   2675 non-null   object
dtypes: object(7)
memory usage: 146.4+ KB


In [20]:
## Split the data into train / test / dev sets at a 6:3:1 ratio
SPLIT_SEED = 42  # fixed seed so the split is reproducible across runs
RECORD_FIELDS = ("id", "original_text", "wrong_ids", "correct_text")

# Derive the boundaries from the actual row count (rounding half up) instead
# of hard-coding them; for 2675 rows this yields 1605 and 2408.
n_rows = len(df)
train_end = (n_rows * 6 + 5) // 10
test_end = (n_rows * 9 + 5) // 10

lst = list(df.index)
# A private Random instance keeps the global random state untouched.
random.Random(SPLIT_SEED).shuffle(lst)

# One record per row, in shuffled order, built once instead of in three
# copy-pasted branches.
records = [{field: df.at[k, field] for field in RECORD_FIELDS} for k in lst]

# data[0] = train, data[1] = test, data[2] = dev
data = [records[:train_end], records[train_end:test_end], records[test_end:]]

In [21]:
def save_json(data, json_path, mode='w', encoding='utf-8'):
    """Write ``data`` to ``json_path`` as indented, non-ASCII-escaped JSON.

    The parent directory of ``json_path`` is created when it does not exist
    yet; in that case its absolute path is printed first.

    Args:
        data: JSON-serializable object to store.
        json_path: Destination file path.
        mode: File mode passed to ``open``.
        encoding: Text encoding passed to ``open``.
    """
    parent_dir = os.path.dirname(os.path.abspath(json_path))
    directory_missing = not os.path.exists(parent_dir)
    if directory_missing:
        print(parent_dir)
        os.makedirs(parent_dir)

    with open(json_path, mode=mode, encoding=encoding) as out_file:
        serialized = json.dumps(data, ensure_ascii=False, indent=4)
        out_file.write(serialized)

In [22]:
# Persist each split as cail2022/output/<split>.json; `name` lists the
# splits in the same order as the entries of `data`.
name = ["train", "test", "dev"]
for split_idx, split_name in enumerate(name):
    output_path = "cail2022/output/{}.json".format(split_name)
    save_json(data[split_idx], output_path)