In [None]:
import sentencepiece as sp
import os
import json

In [None]:
# Project root: the absolute path two directory levels above the current working directory.
root_dir = os.path.abspath(
    os.path.join(os.getcwd(), *([os.pardir] * 2))
)


In [None]:
# Path of the SentencePiece model file, resolved against the project root.
model_file = os.path.join(
    root_dir,
    'model/laboro_distilbert/tokenizer/ccc_13g_unigram.model',
)
# Single tokenizer instance shared by the cells below.
tokenizer_sp = sp.SentencePieceProcessor(model_file=model_file)


In [None]:
def ddqa_copy_format(ori_dic):
    """Build an empty output skeleton that mirrors the top-level layout of ``ori_dic``.

    Only ``ori_dic['version']`` and the title of the first entry of
    ``ori_dic['data']`` are copied; the ``paragraphs`` list of the returned
    entry starts out empty so that callers can fill it in afterwards.

    Args:
        ori_dic: Dict with a ``'version'`` key and a non-empty ``'data'`` list
            whose first element has a ``'title'`` key.

    Returns:
        A new dict ``{'version': ..., 'data': [{'title': ..., 'paragraphs': []}]}``.
    """
    return {
        'version': ori_dic['version'],
        'data': [
            {'title': ori_dic['data'][0]['title'], 'paragraphs': []},
        ],
    }

In [None]:
# Locations of the three DDQA RC-QA splits under the project root.
ddqa_path = os.path.join(root_dir,'data/ddqa/RC-QA')
train_path = os.path.join(ddqa_path,'DDQA-1.0_RC-QA_train.json')
val_path = os.path.join(ddqa_path,'DDQA-1.0_RC-QA_dev.json')
test_path = os.path.join(ddqa_path,'DDQA-1.0_RC-QA_test.json')

# Parse each split inside a ``with`` block so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open(train_path, encoding='utf8') as train_file:
    train_ori = json.load(train_file)
with open(val_path, encoding='utf8') as val_file:
    val_ori = json.load(val_file)
with open(test_path, encoding='utf8') as test_file:
    test_ori = json.load(test_file)

# Empty output skeletons (version + title, no paragraphs yet) for each split.
train_output = ddqa_copy_format(train_ori)
val_output = ddqa_copy_format(val_ori)
test_output = ddqa_copy_format(test_ori)

In [None]:
def pre_processing_context(tokenizer_sp, line):
    """Tokenize ``line`` without any length limit and return space-joined pieces.

    Args:
        tokenizer_sp: Object exposing ``encode(text, out_type=int)`` and
            ``id_to_piece(ids)`` (here, the SentencePiece processor).
        line: Text to tokenize.

    Returns:
        The pieces for ``line`` joined by single spaces.
    """
    # Go through ids and back to pieces rather than asking for pieces directly.
    piece_ids = tokenizer_sp.encode(line, out_type=int)
    return ' '.join(tokenizer_sp.id_to_piece(piece_ids))

def pre_processing(tokenizer_sp, line, max_seq_len=512):
    """Tokenize ``line``, keep at most ``max_seq_len - 2`` ids, and return space-joined pieces.

    Args:
        tokenizer_sp: Object exposing ``encode(text, out_type=int)`` and
            ``id_to_piece(ids)`` (here, the SentencePiece processor).
        line: Text to tokenize.
        max_seq_len: Total sequence budget. Two positions are held back
            (presumably for special tokens added downstream -- TODO confirm).

    Returns:
        The (possibly truncated) pieces joined by single spaces; an empty
        string when the budget leaves no room for any piece.
    """
    # Clamp at zero: a negative slice bound would trim tokens from the END of
    # the sequence (e.g. ids[:-1]) instead of producing an empty result.
    budget = max(max_seq_len - 2, 0)
    ids = tokenizer_sp.encode(line, out_type=int)[:budget]
    return ' '.join(tokenizer_sp.id_to_piece(ids))

In [None]:
def read_data(ori_dic):
    """Lazily yield every paragraph of the first entry in ``ori_dic['data']``."""
    yield from ori_dic['data'][0]['paragraphs']

def tokenize_para(ori_para):
    """Return a tokenized copy of one paragraph (context plus its QA pairs).

    The context has spaces and ``…`` replaced by ``.`` and is then tokenized
    without a length limit via ``pre_processing_context``. Each question and
    each answer text is tokenized via ``pre_processing`` (default length
    limit). ``id``, ``is_impossible`` and ``answer_start`` are copied as-is.
    Uses the module-level ``tokenizer_sp``.

    Args:
        ori_para: Dict with ``'context'`` and ``'qas'``; every QA needs
            ``'id'``, ``'is_impossible'``, ``'question'`` and ``'answers'``
            (each answer with ``'answer_start'`` and ``'text'``).

    Returns:
        A new dict ``{'context': str, 'qas': [...]}``; ``ori_para`` is not modified.
    """
    # Normalise spaces and ellipsis characters to '.' before tokenizing.
    cleaned_context = ori_para['context'].replace(" ", ".").replace("…", ".")
    tokenized_context = pre_processing_context(tokenizer_sp, cleaned_context)

    tokenized_qas = []
    for qa in ori_para['qas']:
        qa_id = qa['id']
        impossible = qa['is_impossible']
        tokenized_question = pre_processing(tokenizer_sp, qa['question'])

        tokenized_answers = []
        for ans in qa['answers']:
            # NOTE(review): answer_start is copied unchanged even though the
            # context string above is rewritten; if it is an offset into the
            # original context it will not line up with the tokenized one --
            # confirm against the downstream reader.
            start = ans['answer_start']
            tokenized_answers.append({
                'text': pre_processing(tokenizer_sp, ans['text']),
                'answer_start': start,
            })

        tokenized_qas.append({
            'id': qa_id,
            'question': tokenized_question,
            'answers': tokenized_answers,
            'is_impossible': impossible,
        })

    return {'context': tokenized_context, 'qas': tokenized_qas}


In [None]:
# Assign (rather than append to) the paragraph list so that re-running this
# cell does not duplicate paragraphs in train_output.
train_output['data'][0]['paragraphs'] = [
    tokenize_para(para) for para in read_data(train_ori)
]

train_output_path = os.path.join(ddqa_path,'tokenized_DDQA-1.0_RC-QA_train.json')
# The with-block guarantees the file is flushed and closed once the dump finishes.
with open(train_output_path, 'w', encoding='utf8') as out_file:
    json.dump(train_output, out_file, ensure_ascii=False)
    

In [None]:
# Assign (rather than append to) the paragraph list so that re-running this
# cell does not duplicate paragraphs in val_output.
val_output['data'][0]['paragraphs'] = [
    tokenize_para(para) for para in read_data(val_ori)
]

val_output_path = os.path.join(ddqa_path,'tokenized_DDQA-1.0_RC-QA_dev.json')
# The with-block guarantees the file is flushed and closed once the dump finishes.
with open(val_output_path, 'w', encoding='utf8') as out_file:
    json.dump(val_output, out_file, ensure_ascii=False)


In [None]:
# Assign (rather than append to) the paragraph list so that re-running this
# cell does not duplicate paragraphs in test_output.
test_output['data'][0]['paragraphs'] = [
    tokenize_para(para) for para in read_data(test_ori)
]

test_output_path = os.path.join(ddqa_path,'tokenized_DDQA-1.0_RC-QA_test.json')
# The with-block guarantees the file is flushed and closed once the dump finishes.
with open(test_output_path, 'w', encoding='utf8') as out_file:
    json.dump(test_output, out_file, ensure_ascii=False)
