# 전처리 테스트

In [60]:
import pandas as pd
import re
import json
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from transformers import AutoTokenizer

In [72]:
# def remove_empty_utterance(data:json):
#     """
#     Remove empty utterances from the data
#     """
#     for example in data:
#         example['input']['conversation'] = [cvt for cvt in example['input']['conversation'] if cvt['utterance'] != '']
#     return data
# Empty utterances can simply be skipped with an `if utterance == ''` check at the final stage, so a dedicated function is not needed.

def correct_wrong_output(data:json, path:str):
    """
    Apply hand-written corrections to the 'output' text of specific samples.

    Samples are addressed by list position. The sample IDs below are quoted
    from the original notes and correspond to position + 1
    (assumes the list is stored in ID order -- TODO confirm).

    When 'train' is in `path`:
      1. Fix wrong speaker IDs at positions 400, 401 and 110
         ('train-000401', 'train-000402', 'train-000111').
      2. Append a period to the output at position 129 ('train-000130').
      3. Replace 'SSD' with 'SD' at positions 29 and 192
         ('train-000030', 'train-000193').

    Otherwise, when 'dev' is in `path`:
      4. Replace 'SSD' with 'SD' at position 84 ('dev-000085').
      5. Drop everything up to and including the first period at position 92
         ('dev-000093', a duplicated sentence) and strip the result.

    Any other path leaves `data` untouched. `data` is modified in place and
    also returned.
    """
    if 'train' in path:
        # (position, wrong speaker id, correct speaker id)
        speaker_fixes = (
            (400, 'SD2100504', 'SD2110504'),
            (401, 'SD2110503', 'SD2100503'),
            (110, 'SD20010813', 'SD2001083'),
        )
        for position, wrong_id, right_id in speaker_fixes:
            sample = data[position]
            sample['output'] = sample['output'].replace(wrong_id, right_id)

        # The last sentence of this sample is missing its final period.
        data[129]['output'] += '.'

        # Speaker prefix typed with a doubled 'S'.
        for position in (29, 192):
            sample = data[position]
            sample['output'] = sample['output'].replace('SSD', 'SD')

        return data

    if 'dev' in path:
        sample = data[84]
        sample['output'] = sample['output'].replace('SSD', 'SD')

        # Keep only the text after the first period (removes the duplicate lead sentence).
        sample = data[92]
        sample['output'] = sample['output'].partition('.')[2].strip()

    return data


def add_space_after_period_and_remove_control_characters(data:json, path:str):
    """
    Normalise period spacing in utterances and clean the reference outputs.

    Utterances (every split):
        A space is inserted after any period that is directly followed by a
        non-whitespace character, then the utterance is stripped. Each
        conversation entry is rebuilt with only its 'speaker' and 'utterance'
        keys.

    Outputs (only when 'train' or 'dev' is in `path`):
        Characters in the ranges \\x00-\\x1F and \\x7F-\\x9F are removed, the
        same period spacing is applied, and the result is stripped.

    The original guard was written as `if 'train' or 'dev' in path`, which is
    always truthy, so the utterance cleanup has always run for every split;
    that behaviour is kept. The output cleanup is now gated on a real split
    check, so splits whose examples carry no 'output' key no longer raise
    KeyError.

    `data` is modified in place and also returned.
    """
    period_without_space = re.compile(r'\.(?=\S)')
    control_characters = re.compile(r'[\x00-\x1F\x7F-\x9F]')

    # Input-side cleanup: applied to all splits.
    for example in data:
        example['input']['conversation'] = [
            {
                'speaker': cvt['speaker'],
                'utterance': period_without_space.sub('. ', cvt['utterance']).strip(),
            }
            for cvt in example['input']['conversation']
        ]

    # Output-side cleanup: only for the splits named in the original guard.
    if 'train' in path or 'dev' in path:
        for example in data:
            output = control_characters.sub('', example['output'])
            example['output'] = period_without_space.sub('. ', output).strip()

    return data


def change_weird_output(data:json, path:str):
    """
    Give a handful of samples a standard overall-summary first sentence.

    Samples are addressed by list position; the IDs are quoted from the
    original notes (position + 1, assuming ID order -- TODO confirm).

    When 'train' is in `path`:
      - position 31 ('train-000032'): the text up to the first period is
        replaced by a fixed overall summary.
      - position 417 ('train-000418'): a fixed overall summary is prepended.

    Otherwise, when 'dev' is in `path`:
      - position 73 ('dev-000074'): "두 화자는 " is prepended.
      - position 92 ('dev-000093'): a fixed overall summary is prepended.

    Any other path leaves `data` untouched. `data` is modified in place and
    also returned.
    """
    if 'train' in path:
        # train-000032: replace the overall summary.
        # The replacement already ends with ". ", so leading whitespace left
        # over from the removed sentence is dropped to avoid a double space.
        remainder = data[31]['output'].partition('.')[2].lstrip()
        total_summary = "두 화자는 이 대화에서 진로 관련 고민에 대해 이야기했습니다. "
        data[31]['output'] = total_summary + remainder

        # train-000418: add the missing overall summary.
        total_summary = "두 화자는 이 대화에서 다이어트에 대해 이야기했습니다. "
        data[417]['output'] = total_summary + data[417]['output']

    elif 'dev' in path:
        # dev-000074: prepend "두 화자는 " to the existing first sentence.
        data[73]['output'] = "두 화자는 " + data[73]['output']

        # dev-000093: add the missing overall summary.
        total_summary = "두 화자는 이 대화에서 엔시티와 방탄소년단에 대해 이야기 했습니다. "
        data[92]['output'] = total_summary + data[92]['output']

    return data


def remove_sd_in_total_summary(data:json, path:str):
    """
    Patch the overall summary of two training samples.

    Only acts when 'train' is in `path`; otherwise `data` is returned as is.
    Samples are addressed by list position (IDs quoted from the original
    notes: 'train-000020' and 'train-000176').

      - position 19: the phrase 'SD2000039의 꿈인 ' is removed from the output.
      - position 175: a missing period is inserted between
        '장단점에 대해 말했습니다' and the speaker ID that follows it.

    `data` is modified in place and also returned.
    """
    if 'train' not in path:
        return data

    # train-000020: drop the speaker reference from the overall summary.
    sample = data[19]
    sample['output'] = sample['output'].replace('SD2000039의 꿈인 ','')

    # train-000176: restore the period that should end the overall summary.
    missing_period = re.compile(r'(장단점에 대해 말했습니다)\s+(SD\d{7}(?:은|는))')
    sample = data[175]
    sample['output'] = missing_period.sub(r'\1. \2', sample['output'])

    return data


def total_summary_generalization(data:json, path:str):
    """
    Make the first sentence of every output start with "두 화자는 이 대화에서".

    The first sentence is the text before the first period. It is left alone
    when it already contains the standard prefix. Otherwise everything up to
    and including one of the following subjects is replaced by the prefix:

      - a speaker pair such as "SD1234567와 ... SD7654321은" (checked first), or
      - the first of "두 화자는", "화자들은", "두 사람은", "이 대화에서는" that
        occurs in the sentence.

    Sentences matching none of these are left unchanged.

    Only runs when 'train' or 'dev' is in `path`. The original condition was
    written as `'train' or 'dev' in path`, which is always truthy, so the
    rewrite also ran on every other split.

    `data` is modified in place and also returned.
    """
    standard_prefix = '두 화자는 이 대화에서'
    generic_subjects = ["두 화자는", "화자들은", "두 사람은", "이 대화에서는"]
    speaker_pair = r"SD\d{7}(?:와|과).*SD\d{7}(?:은|는)"

    if not ('train' in path or 'dev' in path):
        return data

    for example in data:
        first_sentence, _, remainder = example['output'].partition('.')

        if standard_prefix in first_sentence:
            continue

        if re.search(speaker_pair, first_sentence):
            subject_pattern = speaker_pair
        else:
            # First generic subject present in the sentence, if any.
            subject_pattern = next(
                (subject for subject in generic_subjects if subject in first_sentence),
                None,
            )
        if subject_pattern is None:
            continue

        # The greedy '(.*)' also swallows any text that precedes the subject.
        rewritten = re.sub(r'(.*)' + subject_pattern, standard_prefix, first_sentence)
        example['output'] = rewritten + '.' + remainder

    return data


def remove_stopwords(text):
    """
    Strip filler tokens and noise from a single utterance.

    Steps, in order:
      1. Delete every match of the custom filler patterns (a word character
         followed by '~', and a fixed set of standalone filler words).
      2. Delete whole words made of Hangul/Latin letters that contain 'x' or 'X'.
      3. Collapse a word repeated back-to-back (separated by whitespace) into one.
      4. Collapse runs of two or more whitespace characters into one space.
      5. Strip leading and trailing whitespace.

    Returns the cleaned string.
    """
    filler_patterns = [r'\w~', r'\b으\b', r'\b그\b', r'\b뭐\b', r'\b어\b',  r'\b인제\b', r'\b이제\b', r'\b막\b', r'\b아\b', r'\b음\b', r'\b읍\b', r'\b오\b', r'\b으\b']

    cleaned = text

    # 1. Custom filler words.
    for filler in filler_patterns:
        cleaned = re.sub(filler, '', cleaned)

    # 2. Words containing x / X.
    cleaned = re.sub(r'\b[가-힣a-zA-Z]*[xX][가-힣a-zA-Z]*\b', '', cleaned)

    # 3. Immediate word repetitions -> a single occurrence.
    cleaned = re.sub(r'\b(\w+)\b(?:\s+\1\b)+', r'\1', cleaned)

    # 4 + 5. Whitespace normalisation.
    return re.sub(r'\s{2,}', ' ', cleaned).strip()

In [73]:
# Preprocess the dataset
def preprocess(path:str):
    """
    Run the full cleaning pipeline on one dataset file and save the result.

    The JSON file at `path` is loaded, the output-correction and
    normalisation steps are applied in a fixed order, every utterance is
    passed through `remove_stopwords`, and the result is written as JSON.

    The output file name is the part of the input file name between its
    first '_' and the following '.', plus '.json' (for example
    '..._train.json' -> 'train.json'). It is written relative to the current
    working directory.

    Both files are opened as UTF-8 explicitly: the data is written with
    ensure_ascii=False, and the platform default encoding is not guaranteed
    to be UTF-8.
    """
    # Load the dataset
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Sample-specific fixes first; they address samples by list position.
    data = correct_wrong_output(data, path)
    data = change_weird_output(data, path)
    data = remove_sd_in_total_summary(data, path)

    # Generic normalisation of period spacing / control characters.
    data = add_space_after_period_and_remove_control_characters(data, path)

    # Standardise the first sentence of each output.
    data = total_summary_generalization(data, path)

    # Clean every utterance of the conversations.
    for example in data:
        for cvt in example['input']['conversation']:
            cvt['utterance'] = remove_stopwords(cvt['utterance'])

    # Save the preprocessed dataset
    output_name = path.split('/')[-1].split('_')[1].split('.')[0]+'.json'
    with open(output_name, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print(f"Preprocessing of {path} is done!")

In [74]:
# Preprocess the training split; preprocess() saves the result as train.json in the working directory.
preprocess('../resource/data/일상대화요약_train.json')

Preprocessing of ../resource/data/일상대화요약_train.json is done!


In [75]:
# Preprocess the dev and test splits; preprocess() saves them as dev.json and test.json in the working directory.
preprocess('../resource/data/일상대화요약_dev.json')
preprocess('../resource/data/일상대화요약_test.json')

Preprocessing of ../resource/data/일상대화요약_dev.json is done!


Preprocessing of ../resource/data/일상대화요약_test.json is done!
