# 전처리 테스트

In [4]:
import pandas as pd
import re
import json
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from transformers import AutoTokenizer

In [20]:
def remove_empty_utterance(data:json):
    """
    Drop conversation turns that carry no content.

    A turn is discarded when its utterance, after stripping surrounding
    whitespace, is either the empty string or a single period. Each
    example's conversation list is replaced in place and the same `data`
    object is returned.
    """
    def _has_content(turn):
        # Keep everything except blank turns and lone "." turns.
        return turn['utterance'].strip() not in ("", ".")

    for sample in data:
        dialogue = sample['input']
        dialogue['conversation'] = list(filter(_has_content, dialogue['conversation']))
    return data
# NOTE: an `utterance == ""` check in the final step would do the same job, so a dedicated function is not strictly necessary.

def correct_wrong_output(data:json, path:str):
    """
    Apply hand-curated fixes to known-bad reference summaries.

    The split is chosen by substring match on `path`; samples are addressed
    by list position (sample number minus one).

    Train ('train' in path):
      - train-000401, train-000402, train-000111: fix mistyped speaker ids.
      - train-000130: append the missing final period.
      - train-000030, train-000193: rename speaker prefix 'SSD' to 'SD'.
    Dev ('dev' in path, checked only when 'train' is absent):
      - dev-000085: rename speaker prefix 'SSD' to 'SD'.
      - dev-000093: drop everything up to and including the first period
        (the duplicated leading sentence), then strip whitespace.

    Mutates `data` in place and returns it.
    """
    if 'train' in path:
        # (list index, mistyped speaker id, intended speaker id)
        speaker_fixes = (
            (400, 'SD2100504', 'SD2110504'),
            (401, 'SD2110503', 'SD2100503'),
            (110, 'SD20010813', 'SD2001083'),
        )
        for idx, wrong_id, right_id in speaker_fixes:
            data[idx]['output'] = data[idx]['output'].replace(wrong_id, right_id)

        # The last sentence of this summary lacks its closing period.
        data[129]['output'] += '.'

        # Speaker ids written with a doubled 'S'.
        for idx in (29, 192):
            data[idx]['output'] = data[idx]['output'].replace('SSD', 'SD')

    elif 'dev' in path:
        data[84]['output'] = data[84]['output'].replace('SSD', 'SD')

        # Keep only the text after the first period (empty if there is none).
        _, _, after_first_sentence = data[92]['output'].partition('.')
        data[92]['output'] = after_first_sentence.strip()

    return data


def add_space_after_period_and_remove_control_characters(data: list[dict], path: str):
    """
    Normalize period spacing in utterances and clean up reference outputs.

    For every example:
      - each conversation turn is rebuilt as a dict holding only 'speaker'
        and 'utterance', where a space is inserted after any period that is
        directly followed by a non-whitespace character and the result is
        stripped;
      - if the example has a string 'output', C0/C1 control characters
        (U+0000-U+001F, U+007F-U+009F, which includes newlines and tabs) are
        removed from it, then the same period spacing and strip are applied.

    Args:
        data: list of examples shaped like
            {'input': {'conversation': [{'speaker': ..., 'utterance': ...}]},
             'output': ...}.
        path: dataset path. Kept for signature parity with the other
            preprocessing steps; this step is applied to every split so that
            train/dev/test inputs are formatted identically.

    Returns:
        The same `data` list, mutated in place.
    """
    period_without_space = re.compile(r'\.(?=\S)')
    control_characters = re.compile(r'[\x00-\x1F\x7F-\x9F]')

    # NOTE: the previous guard `if 'train' or 'dev' in path` was always true
    # (a non-empty string literal is truthy), so every split was processed.
    # That effective behavior is kept here, but stated explicitly.
    for example in data:
        example['input']['conversation'] = [
            {
                'speaker': cvt['speaker'],
                'utterance': period_without_space.sub('. ', cvt['utterance']).strip(),
            }
            for cvt in example['input']['conversation']
        ]

        # Examples without a reference summary (e.g. an unlabeled split)
        # are left alone instead of raising KeyError.
        output = example.get('output')
        if isinstance(output, str):
            output = control_characters.sub('', output)
            example['output'] = period_without_space.sub('. ', output).strip()

    return data


def change_weird_output(data:json, path:str):
    """
    Bring four irregular reference summaries into the common layout.

    Affected samples (addressed by list position = sample number - 1):
      - train-000032: the first sentence (up to the first period) is replaced
        with a fixed overall-summary sentence.
      - train-000418: a fixed overall-summary sentence is prepended.
      - dev-000074: "두 화자는 " is prepended so the summary opens with
        "두 화자는 이 대화에서".
      - dev-000093: a fixed overall-summary sentence is prepended.

    The split is selected by substring match on `path` ('train' first, then
    'dev'). `data` is modified in place and returned.
    """
    if 'train' in path:
        # train-000032: discard the original first sentence, keep the rest.
        _, _, after_first_sentence = data[31]['output'].partition('.')
        data[31]['output'] = "두 화자는 이 대화에서 진로 관련 고민에 대해 이야기했습니다. " + after_first_sentence

        # train-000418: the overall summary was missing entirely.
        data[417]['output'] = "두 화자는 이 대화에서 다이어트에 대해 이야기했습니다. " + data[417]['output']

    elif 'dev' in path:
        # Text to put in front of the existing output, keyed by list index.
        prefixes = {
            73: "두 화자는 ",
            92: "두 화자는 이 대화에서 엔시티와 방탄소년단에 대해 이야기 했습니다. ",
        }
        for idx, prefix in prefixes.items():
            data[idx]['output'] = prefix + data[idx]['output']

    return data


def remove_sd_in_total_summary(data:json, path:str):
    """
    Fix the overall-summary sentence of two train samples.

      - train-000020: remove the phrase 'SD2000039의 꿈인 ' so the overall
        summary no longer names a speaker.
      - train-000176: insert the missing period between
        '장단점에 대해 말했습니다' and the speaker mention that follows it.

    Only applied when 'train' occurs in `path`. Mutates and returns `data`.
    """
    if 'train' not in path:
        return data

    # train-000020
    sample_20 = data[19]
    sample_20['output'] = sample_20['output'].replace('SD2000039의 꿈인 ','')

    # train-000176: "<...말했습니다> <SDnnnnnnn은/는>" -> "<...말했습니다>. <SDnnnnnnn은/는>"
    sample_176 = data[175]
    sample_176['output'] = re.sub(
        r'(장단점에 대해 말했습니다)\s+(SD\d{7}(?:은|는))',
        lambda found: found.group(1) + '. ' + found.group(2),
        sample_176['output'],
    )

    return data


def total_summary_generalization(data: list[dict], path: str):
    """
    Make the first sentence of every output start with "두 화자는 이 대화에서".

    The first sentence is the text before the first period. It is rewritten
    when it does not already contain the canonical opening and either
      1. mentions both speakers as "SDnnnnnnn와/과 ... SDnnnnnnn은/는", or
      2. contains one of the alternative openings "두 화자는", "화자들은",
         "두 사람은", "이 대화에서는" (tried in that order; first hit wins).
    In both cases everything up to and including the matched phrase is
    replaced by the canonical opening. Outputs matching neither rule are left
    untouched.

    Args:
        data: list of examples, each with a string 'output'.
        path: dataset path; only paths containing 'train' or 'dev' are
            processed.

    Returns:
        The same `data` list, mutated in place.
    """
    canonical_opening = "두 화자는 이 대화에서"
    alternative_openings = ["두 화자는", "화자들은", "두 사람은", "이 대화에서는"]
    both_speakers = r"SD\d{7}(?:와|과).*SD\d{7}(?:은|는)"

    # The earlier form `'train' or 'dev' in path` evaluated to the truthy
    # literal 'train' and therefore ran on every split.
    if not ('train' in path or 'dev' in path):
        return data

    for example in data:
        first_sentence, _, remainder = example['output'].partition('.')

        if canonical_opening in first_sentence:
            continue

        if re.search(both_speakers, first_sentence):
            matched_phrase = both_speakers
        else:
            matched_phrase = next(
                (opening for opening in alternative_openings if opening in first_sentence),
                None,
            )
            if matched_phrase is None:
                continue

        # Greedy `(.*)` swallows any text in front of the matched phrase.
        rewritten = re.sub(r'(.*)' + matched_phrase, canonical_opening, first_sentence)
        example['output'] = rewritten + '.' + remainder

    return data


def remove_duplicate_output_words(data:json, path:str):
    """
    Collapse immediately repeated words in the outputs of selected train samples.

    Known offenders (sample number -> repeated word):
      - train-000387: 그리고 그리고
      - train-000383, train-000451, train-000479, train-000495: 대화에서 대화에서
      - train-000268: 좋은 좋은
      - train-000092, train-000231: 화자 화자
      - train-000128: 할머니가 할머니가
      - train-000338: 가도 가도

    In each listed output, any run of the same whitespace-separated word is
    reduced to a single occurrence. Nothing happens unless 'train' is in
    `path`. Mutates and returns `data`.
    """
    if 'train' not in path:
        return data

    # A word followed one or more times by whitespace plus the same word.
    repeated_word = r'\b(\w+)\b(?:\s+\1\b)+'

    sample_numbers = (387, 383, 451, 479, 495, 268, 92, 231, 128, 338)
    for number in sample_numbers:
        sample = data[number - 1]  # sample numbers are 1-based
        sample['output'] = re.sub(repeated_word, r'\1', sample['output'])

    return data

# Special cases that must be handled before the generic stopword removal
def remove_stopwords_exception(data:json, path:str):
    """
    Patch test utterances where a detached "좋" is meaningful.

    In train and dev a stray " 좋 " between words is noise, but in the test
    split it is sometimes the stem of a real word that was split by a space.
    Re-attaching it here keeps it from being handled as a standalone token
    later on.

      - test-000119: "좋 은데"  -> "좋은데"   (last turn)
      - test-000303: "좋 을 것" -> "좋을 것"  (second-to-last turn)
      - test-000348: "좋 다고"  -> "좋다고"   (turn at index 4)

    Only applied when 'test' is in `path`. Mutates and returns `data`.
    """
    if 'test' in path:
        # (example index, turn index) of each utterance to repair
        targets = ((118, -1), (302, -2), (347, 4))
        for example_idx, turn_idx in targets:
            turn = data[example_idx]['input']['conversation'][turn_idx]
            # " 좋 " -> " 좋": drop only the space after the stem
            turn['utterance'] = turn['utterance'].replace(' 좋 ', ' 좋')

    return data


# Strip the word '화자' that directly precedes a speaker id
def remove_hwaja_before_speaker_in_output(data:json, path:str):
    """
    Remove '화자' placed in front of a speaker id in train outputs.

    A speaker id is 'SD' followed by seven digits. '화자', optionally followed
    by whitespace, is deleted when it comes right before such an id, so
    "화자 SD2000001은" becomes "SD2000001은". Only applied when 'train' is in
    `path`. Mutates and returns `data`.
    """
    if 'train' not in path:
        return data

    for sample in data:
        sample['output'] = re.sub(r'화자\s*(SD\d{7})', r'\1', sample['output'])

    return data

# Fix speaker ids in outputs that are not followed by any particle (josa)
def add_josa_after_speaker_in_output(data:json, path:str):
    """
    Repair speaker mentions in four train outputs.

    <Train>
    - train-243 : 'SD2002060 또한' -> 'SD2002060도'
    - train-410 : '또 SD2100516 자신은' -> '또 자신은'
    - train-441 : 'SD2110545 유기견을' -> '또 유기견을', then
                  '또 SD2100546은' -> 'SD2100546은'
    - train-495 : 'SD2100589에도' -> 'SD2100589에게도', then
                  'SD2100589 헬스장' -> 'SD2100589에게 헬스장'

    Only applied when 'train' is in `path`. Mutates and returns `data`.
    """
    if 'train' in path:
        # list index -> ordered (old, new) substitutions for that output
        fixes = (
            (242, (('SD2002060 또한', 'SD2002060도'),)),
            (409, (('또 SD2100516 자신은', '또 자신은'),)),
            (440, (('SD2110545 유기견을', '또 유기견을'), ('또 SD2100546은', 'SD2100546은'))),
            (494, (('SD2100589에도', 'SD2100589에게도'), ('SD2100589 헬스장', 'SD2100589에게 헬스장'))),
        )
        for idx, substitutions in fixes:
            text = data[idx]['output']
            for old, new in substitutions:
                text = text.replace(old, new)
            data[idx]['output'] = text

    return data


# speaker summary generalization
def speaker_summary_generalization(data: list[dict], path: str):
    """
    Record both speaker ids on each example and restructure train/dev outputs.

    A speaker id is 'SD' followed by seven digits.

    Test split ('test' in path):
        `input.speaker_1` is the speaker of the first turn and
        `input.speaker_2` is the first different speaker that appears later
        (None when the conversation has a single speaker). Outputs are not
        touched.

    Train/dev split ('train' or 'dev' in path, and 'test' not in path):
        The output is cut into an overall summary plus one summary per
        speaker and rewritten as three markdown-style sections::

            ## 전반적인 요약
            <overall summary>

            ## <speaker_1> 요약
            <speaker_1 summary>

            ## <speaker_2> 요약
            <speaker_2 summary>

        `speaker_1` is the speaker whose section should come first: if the
        first speaker id found anywhere in the output differs from the
        speaker of the first conversation turn, the two speakers (and their
        summary sections) are swapped. `input.speaker_1` / `input.speaker_2`
        are stored accordingly.

        For the train split, two outputs are first patched so their speaker
        mention carries the topic particle: train-000496 ('SD2100589가' ->
        'SD2100589는') and train-000476 ('SD2100573도' -> 'SD2100573은').

    Args:
        data: list of examples with 'input' -> 'conversation' (list of turns
            with a 'speaker' key) and, for train/dev, a string 'output'.
        path: dataset path used to select the split.

    Returns:
        The same `data` list, mutated in place.

    Raises:
        ValueError: a train/dev output does not mention two distinct speakers
            in a form that can be used as split points.
    """
    topic_mention = re.compile(r'SD\d{7}[은는]{1}')  # speaker id + topic particle
    any_mention = re.compile(r'SD\d{7}\w+')          # speaker id + any trailing word chars

    if 'train' in path:
        # exception handling
        # train-000496 "SD2100589가" -> "SD2100589는"
        # train-000476 "SD2100573도" -> "SD2100573은"
        data[495]['output'] = data[495]['output'].replace('SD2100589가', 'SD2100589는')
        data[475]['output'] = data[475]['output'].replace('SD2100573도', 'SD2100573은')

    def check_first_speaker_and_first_summary_speaker_is_same(example: dict) -> bool:
        """Return True if the first turn's speaker is the first id found in the output."""
        first_speaker = example['input']['conversation'][0]['speaker']
        first_summary_speaker = re.search(r'SD\d{7}', example['output']).group()
        return first_speaker == first_summary_speaker

    def find_split_indexes(text: str) -> list[tuple]:
        """
        Locate where the two per-speaker summaries start.

        Returns [(mention_1, start_1), (mention_2, start_2)], where each
        mention begins with the 9-character speaker id.

        - Exactly two topic-marked mentions: use both as they are.
        - Fewer than two: fall back to any speaker mention and take the first
          one plus the first later mention of a different speaker.
        - More than two: same pairing, restricted to topic-marked mentions.
        """
        topic_matches = list(topic_mention.finditer(text))

        if len(topic_matches) == 2:
            return [(match.group(), match.start()) for match in topic_matches]

        if len(topic_matches) < 2:
            candidates = any_mention.finditer(text)
        else:
            candidates = iter(topic_matches)

        first_match = next(candidates, None)
        if first_match is None:
            raise ValueError(f"no speaker mention found in output: {text!r}")

        for match in candidates:
            # Skip further mentions of the same speaker id.
            if match.group()[:9] != first_match.group()[:9]:
                return [
                    (first_match.group(), first_match.start()),
                    (match.group(), match.start()),
                ]

        raise ValueError(f"no second speaker mention found in output: {text!r}")

    if 'test' in path:
        for example in data:
            conversation = example['input']['conversation']
            speaker_1 = conversation[0]['speaker']
            # Resolved per example so that a single-speaker conversation
            # cannot pick up a value left over from a previous example.
            speaker_2 = next(
                (turn['speaker'] for turn in conversation if turn['speaker'] != speaker_1),
                None,
            )

            example['input']['speaker_1'] = speaker_1
            example['input']['speaker_2'] = speaker_2

    # `'train' or 'dev' in path` would always be truthy; test both substrings.
    elif 'train' in path or 'dev' in path:
        for example in data:
            output = example['output']

            # [(mention_1, start_1), (mention_2, start_2)]
            (mention_1, start_1), (mention_2, start_2) = find_split_indexes(output)
            speaker_1, speaker_2 = mention_1[:9], mention_2[:9]

            total_summary = output[:start_1].strip()
            first_section = output[start_1:start_2].strip()
            second_section = output[start_2:].strip()

            if check_first_speaker_and_first_summary_speaker_is_same(example):
                speaker_1_summary, speaker_2_summary = first_section, second_section
            else:
                # The output opens with the other speaker: swap ids and sections.
                speaker_1, speaker_2 = speaker_2, speaker_1
                speaker_1_summary, speaker_2_summary = second_section, first_section

            example['input']['speaker_1'] = speaker_1
            example['input']['speaker_2'] = speaker_2

            # Standardize the format of the speaker summary
            example['output'] = f'''## 전반적인 요약\n{total_summary}\n\n## {speaker_1} 요약\n{speaker_1_summary}\n\n## {speaker_2} 요약\n{speaker_2_summary}'''

    return data



def remove_stopwords(text):
    """
    Clean a single utterance with an ordered series of regex substitutions.

    Steps, applied in this order:
      1. delete custom filler patterns (a character followed by '~',
         standalone fillers such as 으/그/뭐/어/인제/이제/막/아/음/읍/오/크/스,
         '좋 ', '. .' and a leading '. ');
      2. delete words made of Hangul/Latin letters that contain 'x' or 'X';
      3. collapse a word repeated back-to-back into one occurrence;
      4. squeeze runs of two or more whitespace characters into one space;
      5. strip leading and trailing whitespace.
    """
    filler_patterns = [r'\w~', r'\b으\b', r'\b그\b', r'\b뭐\b', r'\b어\b',  r'\b인제\b', r'\b이제\b', r'\b막\b', r'\b아\b', r'\b음\b', r'\b읍\b', r'\b오\b', r'\b으\b',
                       r'좋 ', r'\b크\b', r'\b스\b', r'\. \.', r'^\s*\.\s{1}']

    # (pattern, replacement) pairs; the order is significant.
    substitutions = [(filler, '') for filler in filler_patterns]
    substitutions.append((r'\b[가-힣a-zA-Z]*[xX][가-힣a-zA-Z]*\b', ''))  # words containing x
    substitutions.append((r'\b(\w+)\b(?:\s+\1\b)+', r'\1'))              # repeated words -> one
    substitutions.append((r'\s{2,}', ' '))                               # repeated whitespace -> one space

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()



In [25]:
# Preprocess the dataset
def preprocess(path:str):
    """
    Run the full preprocessing pipeline on one dataset file and save the result.

    The JSON file at `path` is loaded, the cleanup steps below are applied in
    order, and the result is written as `<split>.json` in the current working
    directory, where `<split>` is the part of the file name between the first
    underscore and the first following dot (e.g. '..._train.json' ->
    'train.json').

    Order of steps:
      1. manual fixes that must precede stopword removal,
      2. manual output corrections and output-format fixes,
      3. period spacing / control-character cleanup,
      4. overall-summary normalization,
      5. duplicate-word and '화자' cleanup in outputs,
      6. stopword removal on every utterance, then removal of empty turns,
      7. speaker-particle fixes and restructuring into per-speaker sections.

    Args:
        path: path to the raw dataset JSON. The file name must contain an
            underscore before the split name, and the split ('train', 'dev'
            or 'test') is detected by substring match inside the steps.
    """
    # Read explicitly as UTF-8: the data is Korean text and the platform's
    # default encoding is not guaranteed to be UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Steps that operate on the raw outputs/utterances, in dependency order.
    data = remove_stopwords_exception(data, path)
    data = correct_wrong_output(data, path)
    data = change_weird_output(data, path)
    data = remove_sd_in_total_summary(data, path)
    data = add_space_after_period_and_remove_control_characters(data, path)
    data = total_summary_generalization(data, path)
    data = remove_duplicate_output_words(data, path)
    data = remove_hwaja_before_speaker_in_output(data, path)

    # Clean every utterance, then drop the turns that became empty.
    for example in data:
        for cvt in example['input']['conversation']:
            cvt['utterance'] = remove_stopwords(cvt['utterance'])
    data = remove_empty_utterance(data)

    # Final output fixes and restructuring into sections.
    data = add_josa_after_speaker_in_output(data, path)
    data = speaker_summary_generalization(data, path)

    # Save the preprocessed dataset; ensure_ascii=False keeps Hangul readable,
    # so the file must be written as UTF-8 as well.
    output_name = path.split('/')[-1].split('_')[1].split('.')[0]+'.json'
    with open(output_name, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print(f"Preprocessing of {path} is done!")

In [26]:
# Preprocess the training split; the result is written to train.json in the working directory.
preprocess('../resource/data/일상대화요약_train.json')

Preprocessing of ../resource/data/일상대화요약_train.json is done!


In [27]:
# Preprocess the dev split; the result is written to dev.json in the working directory.
preprocess('../resource/data/일상대화요약_dev.json')

Preprocessing of ../resource/data/일상대화요약_dev.json is done!


In [28]:
# Preprocess the test split; the result is written to test.json in the working directory.
preprocess('../resource/data/일상대화요약_test.json')

Preprocessing of ../resource/data/일상대화요약_test.json is done!
