# 쓸모없이 끼어들어 있는 단어들

- 잘 그 잘, 나는 납 나는 : 한 단어를 뛰고 반복되는 단어들
- 겨 계절은, 겨 경험이 : 자음 반복
- 너무너무너무 : 공백 없이 붙어서 두 번 이상 반복되는 단어들
- 너무 먹고 싶 너무 먹고 싶어요 : 짤려서 반복

In [90]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from transformers import AutoTokenizer
import re
from collections import defaultdict, OrderedDict
from tqdm import tqdm

def make_dataframe(path: str) -> pd.DataFrame:
    """
    Read a json file and return a pandas DataFrame.

    Parameters:
    path (str): Path to the json file.

    Returns:
    pd.DataFrame: DataFrame of the json file.
    """
    # Read the json file
    with open(path, 'r') as file:
        data = json.load(file)

    # Create a DataFrame
    # columns = ['id', 'conversation', 'subject_keyword', 'output']
    df = pd.DataFrame(data)
    df['conversation'] = df['input'].apply(lambda x: x['conversation'])
    df['subject_keyword'] = df['input'].apply(lambda x: x['subject_keyword'])

    # Drop the 'input' column
    df.drop('input', axis=1, inplace=True)

    # Speakers in the conversation
    def find_ordered_speakers(turns):
        speakers = OrderedDict()
        for turn in turns:
            speakers[turn['speaker']] = 0
        return list(speakers.keys())
    
    df['speakers'] = df['conversation'].apply(find_ordered_speakers)

    # Reorder the columns
    df = df[['id', 'conversation', 'subject_keyword', 'speakers', 'output']]

    return df

train_df = make_dataframe('../resource/data/일상대화요약_train.json')
dev_df = make_dataframe('../resource/data/일상대화요약_dev.json')
test_df = make_dataframe('../resource/data/일상대화요약_test.json')
filtered_train_df = make_dataframe('./train.json')
filtered_dev_df = make_dataframe('./dev.json')
filtered_test_df = make_dataframe('./test.json')

## 공백 없이 붙어서 두 번 이상 반복되는 단어들

In [27]:
import re

text = "너무너무너무 재미있어 재미재미있어 하하하하호호호"

# 동일한 단어가 2번 이상 중복되는 패턴 찾기
pattern = r'(\w+)\1+'
matches = re.findall(pattern, text)

print(matches)

['너무', '재미', '하하', '호']


In [33]:
# 공백 없이 붙어서 두 번 이상 반복되는 단어들

def find_repeated_words(df:pd.DataFrame, test = False):
    """
    Find repeated words in the conversation. and output
    """

    df = deepcopy(df)

    conversation_word_dict = defaultdict(int)
    output_word_dict = defaultdict(int)

    def find_repeated_words_in_conversation(conversation):
        for turn in conversation:
            repeated_words = re.findall(r'(\w+)\1+', turn['utterance'])
            if repeated_words:
                for word in repeated_words:
                    conversation_word_dict[word] += 1
    
    def find_repeated_words_in_output(output):
        repeated_words = re.findall(r'(\w+)\1+', output)
        if repeated_words:
            for word in repeated_words:
                output_word_dict[word] += 1
    
    df['conversation'].apply(find_repeated_words_in_conversation)
    print("The number of repeated words in conversation :", len(conversation_word_dict))

    if not test:
        df['output'].apply(find_repeated_words_in_output)
        print("The number of repeated words in output :", len(output_word_dict))
        output_word_dict = sorted(output_word_dict.items(), key=lambda x: x[1], reverse=True)
    
    conversation_word_dict = sorted(conversation_word_dict.items(), key=lambda x: x[1], reverse=True)

    return conversation_word_dict, output_word_dict

In [34]:
conv, output = find_repeated_words(filtered_train_df)

The number of repeated words in conversation : 375
The number of repeated words in output : 88


In [35]:
conv2, output2 = find_repeated_words(filtered_dev_df)
conv3, _ = find_repeated_words(filtered_test_df, test=True)

The number of repeated words in conversation : 123
The number of repeated words in output : 27
The number of repeated words in conversation : 287


In [36]:
convs1 = [c[0] for c in conv]
convs2 = [c[0] for c in conv2]
convs3 = [c[0] for c in conv3]

In [31]:
filtered_train_df[filtered_train_df['output'].apply(lambda x: '너무' in x)]

Unnamed: 0,id,conversation,subject_keyword,speakers,output
1,nikluge-2024-일상 대화 요약-train-000002,"[{'speaker': 'SD2000002', 'utterance': '여행 다닐 ...","[여행 스타일, 숙소, 음식]","[SD2000002, SD2000001]","두 화자는 이 대화에서 쇼핑하기, 숙소, 음식 등 각자 선호하는 여행 스타일에 대해..."
8,nikluge-2024-일상 대화 요약-train-000009,"[{'speaker': 'SD2000018', 'utterance': '네. 오늘 ...","[계절, 날씨]","[SD2000018, SD2000017]",두 화자는 이 대화에서 추위와 더위 중에 더 많이 타는 것과 선호하는 계절과 날씨가...
15,nikluge-2024-일상 대화 요약-train-000016,"[{'speaker': 'SD2000039', 'utterance': '먹거리가 코...",[음식],"[SD2000039, SD2000038]",두 화자는 이 대화에서 할머니와 엄마가 해주셨던 음식 중에서 지금은 먹을 수 없지만...
26,nikluge-2024-일상 대화 요약-train-000027,"[{'speaker': 'SD2000870', 'utterance': '여자가 결혼...",[자식],"[SD2000870, SD2000869]",두 화자는 이 대화에서 자식을 낳는 것에 대해 말했습니다. SD2000869는 본인...
30,nikluge-2024-일상 대화 요약-train-000031,"[{'speaker': 'SD2000873', 'utterance': '근데 무릎이...",[건강],"[SD2000873, SD2000874]",두 화자는 이 대화에서 배우자의 건강에 대해 말했습니다. SD2000873은 건강 ...
...,...,...,...,...,...
465,nikluge-2024-일상 대화 요약-train-000466,"[{'speaker': 'SD2100568', 'utterance': '산 사람 마...",[명절 문제],"[SD2100568, SD2100567]",두 화자는 이 대화에서 명절 문제에 관한 대화를 했습니다. SD2100568은 제사...
474,nikluge-2024-일상 대화 요약-train-000475,"[{'speaker': 'SD2110574', 'utterance': '이번에 언니...","[휴가, 여행]","[SD2110574, SD2100573]",두 화자는 이 대화에서 여행 휴가지에 관해 이야기했습니다. SD2110574는 SD...
491,nikluge-2024-일상 대화 요약-train-000492,"[{'speaker': 'SD2100585', 'utterance': '그리고 지금...",[결혼],"[SD2100585, SD2100586]",두 화자는 이 대화에서 젊은 세대를 중심으로 일어나고 있는 비혼주의 확산 현상에 대...
493,nikluge-2024-일상 대화 요약-train-000494,"[{'speaker': 'SD2100586', 'utterance': '근데 개념을...",[대안학교],"[SD2100586, SD2100585]",두 화자는 이 대화에서 자녀 양육 방안을 함께 모색했습니다. SD2100586은 안...


## 짤려서 반복

In [19]:
import re

text = "너무 먹고 싶 너무 먹고 싶어요"

# 단어나 구절이 반복되는 패턴 찾기
pattern = r'(\b\w+(?:\s\w+)*)\s\1'
matches = re.findall(pattern, text)

print(matches)

['너무 먹고 싶']


In [150]:
def find_repeated_phrases(df:pd.DataFrame, test = False):
    """
    Find repeated phrases in the conversation. and output
    """

    df = deepcopy(df)

    conversation_phrase_dict = defaultdict(int)
    output_phrase_dict = defaultdict(int)

    def find_repeated_phrases_in_conversation(conversation):
        for turn in conversation:
            repeated_phrases = re.findall(r'(\b\w+(?:\s\w+)*)\s\1', turn['utterance'])
            if repeated_phrases:
                for phrase in repeated_phrases:
                    conversation_phrase_dict[phrase] += 1
    
    def find_repeated_phrases_in_output(output):
        repeated_phrases = re.findall(r'(\b\w+(?:\s\w+)*)\s\1', output)
        if repeated_phrases:
            for phrase in repeated_phrases:
                output_phrase_dict[phrase] += 1

    df['conversation'].apply(find_repeated_phrases_in_conversation)
    print("The number of repeated phrases in conversation :", len(conversation_phrase_dict))
    conversation_phrase_dict = sorted(conversation_phrase_dict.items(), key=lambda x: x[1], reverse=True)
    print("The number of repeated phrases in conversation that repeated num <= 2 :", len([c for c in conversation_phrase_dict if c[1] <= 2]))

    if not test:
        df['output'].apply(find_repeated_phrases_in_output)
        print("The number of repeated phrases in output :", len(output_phrase_dict))
        output_phrase_dict = sorted(output_phrase_dict.items(), key=lambda x: x[1], reverse=True)

    return conversation_phrase_dict, output_phrase_dict

In [151]:
conv, output = find_repeated_phrases(filtered_train_df)
conv2, output2 = find_repeated_phrases(filtered_dev_df)
conv3, _ = find_repeated_phrases(filtered_test_df, test=True)

The number of repeated phrases in conversation : 828
The number of repeated phrases in conversation that repeated num <= 2 : 752
The number of repeated phrases in output : 2
The number of repeated phrases in conversation : 195
The number of repeated phrases in conversation that repeated num <= 2 : 181
The number of repeated phrases in output : 1
The number of repeated phrases in conversation : 639
The number of repeated phrases in conversation that repeated num <= 2 : 574


- 많이 뽑히는 것은 이유가 있는 경우(자주 쓰이는 표현) 일 수 있음
- 그러므로, 이런 경우는 정성적인 분석이 필요하고
- 그렇지 않은 적게 뽑히는 것은 예외일 가능성이 높으므로 모두 1개로 통일 (2개 이하) 
    - train : 752 / dev : 181 / test : 574

In [104]:
text = " 그때 너무 긴장하고 발표 긴장하고 발표 연습"
re.sub(r'긴장하고 발표 긴장하고 발표', '긴장하고 발표', text)

' 그때 너무 긴장하고 발표 연습'

In [105]:
a= '긴장하고 발표'
re.search(rf'{a} {a}', text) 

<re.Match object; span=(7, 22), match='긴장하고 발표 긴장하고 발표'>

In [99]:
conv

[('가', 62),
 ('다', 29),
 ('이', 28),
 ('있', 26),
 ('나', 22),
 ('하', 21),
 ('거', 13),
 ('먹', 12),
 ('내', 10),
 ('없', 9),
 ('사', 9),
 ('너', 9),
 ('해', 8),
 ('저', 8),
 ('유', 7),
 ('대', 7),
 ('시', 7),
 ('요', 7),
 ('일', 7),
 ('그래', 6),
 ('많', 6),
 ('했', 6),
 ('고', 6),
 ('주', 6),
 ('제', 5),
 ('학', 5),
 ('원', 5),
 ('여', 5),
 ('자', 5),
 ('생', 5),
 ('한', 5),
 ('같', 4),
 ('건', 4),
 ('보', 4),
 ('올', 4),
 ('약', 4),
 ('지', 4),
 ('그렇', 4),
 ('우', 4),
 ('입', 4),
 ('얼', 4),
 ('매', 4),
 ('거기', 4),
 ('외', 4),
 ('집', 4),
 ('맞', 4),
 ('않', 4),
 ('되', 4),
 ('우리', 4),
 ('들', 3),
 ('영', 3),
 ('재료', 3),
 ('그냥', 3),
 ('엄마', 3),
 ('재', 3),
 ('몇', 3),
 ('여름', 3),
 ('세', 3),
 ('중', 3),
 ('인', 3),
 ('별', 3),
 ('강', 3),
 ('3', 3),
 ('한 명', 3),
 ('언니', 3),
 ('친', 3),
 ('어디', 3),
 ('신', 3),
 ('마', 3),
 ('쪼금', 3),
 ('시국이', 3),
 ('언', 3),
 ('취', 3),
 ('고양이', 3),
 ('예', 3),
 ('그거', 3),
 ('맛있', 2),
 ('정도', 2),
 ('단', 2),
 ('살', 2),
 ('코로나 때문에', 2),
 ('한국', 2),
 ('구', 2),
 ('또 그러고', 2),
 ('아이', 2),
 ('꽃', 2),
 ('애들', 2),
 

In [24]:
output

[('두루', 1), ('깜짝', 1)]

In [65]:
conv2

[('있', 8),
 ('다', 6),
 ('하', 6),
 ('먹', 5),
 ('기', 5),
 ('이', 5),
 ('언니', 4),
 ('해', 3),
 ('보', 3),
 ('저', 3),
 ('한 살', 3),
 ('사', 3),
 ('생', 3),
 ('1', 3),
 ('말', 2),
 ('자', 2),
 ('산', 2),
 ('가', 2),
 ('설', 2),
 ('인', 2),
 ('최', 2),
 ('입', 2),
 ('약', 2),
 ('매', 2),
 ('반', 2),
 ('주', 2),
 ('그래', 2),
 ('많', 2),
 ('미국', 2),
 ('공', 2),
 ('음식', 2),
 ('잃', 2),
 ('유산', 1),
 ('안 했', 1),
 ('음식을 먹으실 때', 1),
 ('평', 1),
 ('찾게 되', 1),
 ('오랜만에 도가니', 1),
 ('건물 지어가지고', 1),
 ('인테리어', 1),
 ('능이', 1),
 ('자연', 1),
 ('식', 1),
 ('맛', 1),
 ('그런 게', 1),
 ('문', 1),
 ('비서', 1),
 ('챙', 1),
 ('탈', 1),
 ('후', 1),
 ('혹', 1),
 ('우리', 1),
 ('야간 알바', 1),
 ('치', 1),
 ('떡볶이', 1),
 ('그렇게 하면', 1),
 ('거기', 1),
 ('씁', 1),
 ('노', 1),
 ('저 돈', 1),
 ('네다', 1),
 ('라', 1),
 ('뭘 하려면', 1),
 ('유산균이', 1),
 ('건강', 1),
 ('마', 1),
 ('순간', 1),
 ('최근에 종영한', 1),
 ('요즘', 1),
 ('드라', 1),
 ('실망도 하고', 1),
 ('이름', 1),
 ('쓰이', 1),
 ('반려묘', 1),
 ('버', 1),
 ('개', 1),
 ('겨울', 1),
 ('습', 1),
 ('항', 1),
 ('무', 1),
 ('형', 1),
 ('갱년기가 오면서', 1),
 ('둘레

In [113]:
def find_index_of_repeated_phrases(df:pd.DataFrame, sampled_conv):
    """
    Find index of repeated phrases in the conversation.
    """

    df = deepcopy(df)

    repeated_phrases = sampled_conv
    repeated_phrase_indices = defaultdict(list)

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        for phrase in repeated_phrases:
            pattern = rf'{phrase} {phrase}'
            for turn in row['conversation']:
                if re.search(pattern, turn['utterance']):
                    repeated_phrase_indices[idx].append(phrase)

    return repeated_phrase_indices

In [152]:
train_repeated_phrase_indices = find_index_of_repeated_phrases(filtered_train_df, [c[0] for c in conv if c[1]<=2])
dev_repeated_phrase_indices = find_index_of_repeated_phrases(filtered_dev_df, [c[0] for c in conv2 if c[1]<=2])
test_repeated_phrase_indices = find_index_of_repeated_phrases(filtered_test_df, [c[0] for c in conv3 if c[1]<=2])

  0%|          | 0/506 [00:00<?, ?it/s]

100%|██████████| 506/506 [00:09<00:00, 50.80it/s]
100%|██████████| 102/102 [00:00<00:00, 548.47it/s]
100%|██████████| 408/408 [00:05<00:00, 78.87it/s]


In [116]:
train_repeated_phrase_indices

defaultdict(list,
            {1: ['적'],
             2: ['났'],
             5: ['된장찌개'],
             6: ['맛있', '제주', '버터 비어', '가 보', '터키'],
             7: ['꿀', '면'],
             9: ['눈'],
             10: ['정도', '드시'],
             12: ['단', '거인이었', '깊'],
             14: ['살', '두릅', '들깨 칼국수', '뭐가 유명합니다'],
             15: ['양념', '술빵', '알았', '며칠 전에', '너무 먹고 싶'],
             16: ['거래처'],
             17: ['코로나 때문에', '방 하나 부엌 하나', '취미'],
             18: ['첫', '얘'],
             19: ['헌 집'],
             20: ['한국', '바'],
             23: ['코로나 때문에'],
             24: ['치즈타르트'],
             25: ['아쉽'],
             26: ['주님 주시면', '세 명', '너무 싫', '이쁘긴 한데'],
             27: ['바래'],
             28: ['구', '누구'],
             29: ['한 해'],
             30: ['또 그러고', '은'],
             31: ['좋은 학교', '제가 지', '대해'],
             32: ['편집을 하', '대학을 가고 나서'],
             33: ['민', '여기 너무'],
             34: ['어려워', '이외'],
             38: ['거북이', '결혼', '이번에는 지난번보다 더 맛있는 케이크'],
             4

In [117]:
dev_repeated_phrase_indices

defaultdict(list,
            {0: ['자', '유산', '안 했'],
             1: ['가', '고'],
             2: ['음식을 먹으실 때'],
             3: ['말'],
             4: ['자', '평'],
             5: ['찾게 되', '오랜만에 도가니', '건물 지어가지고', '인테리어'],
             6: ['산', '능이', '자연', '식', '맛', '그런 게', '문', '비서'],
             11: ['자', '가', '챙', '탈', '후', '고'],
             12: ['설', '혹', '우리'],
             13: ['가', '가', '고'],
             14: ['가', '야간 알바'],
             15: ['가', '치', '떡볶이', '그렇게 하면', '거기', '고'],
             16: ['씁'],
             17: ['가', '노', '저 돈', '네다', '라', '뭘 하려면'],
             18: ['유산균이', '건강'],
             20: ['마', '순간', '최근에 종영한'],
             21: ['인', '요즘'],
             22: ['최', '드라'],
             23: ['실망도 하고'],
             25: ['이름', '쓰이'],
             26: ['입', '반려묘', '버', '개'],
             27: ['가', '설', '약', '겨울', '습'],
             28: ['가', '항', '무'],
             30: ['가'],
             31: ['최', '매'],
             32: ['형'],
             34: ['산', '가', '고'],
 

In [118]:
test_repeated_phrase_indices

defaultdict(list,
            {0: ['팥'],
             4: ['중간', '고', '생일 날에'],
             5: ['고', '매', '중학교 때는', '어렸을 때'],
             6: ['고', '발음', '간호사'],
             7: ['어색', '에', '남자', '티를 내야지', '아무런 징조도 없이', '저 사람이', '짜'],
             8: ['매점에'],
             10: ['야구를 했', '다르', '하게 됐', '꿈', '우선적으로', '그것'],
             11: ['3 40', '도'],
             13: ['우리 엄마'],
             14: ['1위', '고'],
             15: ['머리', '브'],
             16: ['진', '잠', '품', '인간', '짧', '님', '키웠', '나무틀', '처음에'],
             18: ['달', '고'],
             19: ['유튜버 활', '도'],
             20: ['거 그런'],
             22: ['이성', '이성', '납', '시계'],
             23: ['고'],
             24: ['연애', '시국이', '굉', '통계'],
             25: ['의', '무', '그랬'],
             27: ['미술', '포토', '외국', '인지', '뇌'],
             28: ['에'],
             29: ['의'],
             30: ['서'],
             31: ['어떻게 하지', '층간'],
             32: ['찍', '눈'],
             34: ['신'],
             35: ['봐'],
             36: ['앞', 

In [164]:
# pickle로 저장
import pickle

with open('train_repeated_phrase_indices_0.pkl', 'wb') as f:
    pickle.dump(train_repeated_phrase_indices, f)

with open('dev_repeated_phrase_indices_0.pkl', 'wb') as f:
    pickle.dump(dev_repeated_phrase_indices, f)

with open('test_repeated_phrase_indices_0.pkl', 'wb') as f:
    pickle.dump(test_repeated_phrase_indices, f)

In [165]:
# pickle로 불러오기

with open('train_repeated_phrase_indices_0.pkl', 'rb') as f:
    train_repeated_phrase_indices = pickle.load(f)

with open('dev_repeated_phrase_indices_0.pkl', 'rb') as f:
    dev_repeated_phrase_indices = pickle.load(f)

with open('test_repeated_phrase_indices_0.pkl', 'rb') as f:
    test_repeated_phrase_indices = pickle.load(f)

In [166]:
def make_one_repeated_words(data:json, path:str, iter:int=0):
    """
    Replace the repeated words in the text with one word.

    Parameters:
    data (json): Data to be processed.
    path (str): Path to save the processed data.

    Returns:
    data (json): Processed data.
    """
    
    # Function for removing repeated words
    def removeing_repeated_words(data:json, repeated_phrase_indices:dict, mode:str):
        # repeated_phrase_indices = {key : index, value : repeated phrase}

        for idx in tqdm(repeated_phrase_indices.keys(), total=len(repeated_phrase_indices), desc=f'Removing repeated phrases in {mode} data ... (Phase {iter})'):
            repeated_phrases = repeated_phrase_indices[idx]
            for phrase in repeated_phrases:
                pattern = rf'{phrase} {phrase}'
                for i, turn in enumerate(data[idx]['input']['conversation']):
                    if re.search(pattern, turn['utterance']):
                        data[idx]['input']['conversation'][i]['utterance'] = re.sub(pattern, phrase, turn['utterance'])

    # Remove repeated words in the conversation
    if 'train' in path:
        with open(f'train_repeated_phrase_indices_{iter}.pkl', 'rb') as file:
            repeated_phrase_indices = pickle.load(file)
        
        removeing_repeated_words(data, repeated_phrase_indices, mode='train')

    elif 'dev' in path:
        with open(f'dev_repeated_phrase_indices_{iter}.pkl', 'rb') as file:
            repeated_phrase_indices = pickle.load(file)
        removeing_repeated_words(data, repeated_phrase_indices, mode='dev')

    elif 'test' in path:
        with open(f'test_repeated_phrase_indices_{iter}.pkl', 'rb') as file:
            repeated_phrase_indices = pickle.load(file)
        removeing_repeated_words(data, repeated_phrase_indices, mode='test')

    return data

In [167]:
# Read the json file
with open('./train.json', 'rb') as file:
        train_data = json.load(file)
with open('./dev.json', 'rb') as file:
        dev_data = json.load(file)
with open('./test.json', 'rb') as file:
        test_data = json.load(file)

post_train_data = make_one_repeated_words(train_data, 'train.json', iter=0)
post_dev_data = make_one_repeated_words(dev_data, 'dev.json', iter=0)
post_test_data = make_one_repeated_words(test_data, 'test.json', iter=0)

Removing repeated phrases in train data ... (Phase 0): 100%|██████████| 378/378 [00:00<00:00, 13502.70it/s]
Removing repeated phrases in dev data ... (Phase 0): 100%|██████████| 87/87 [00:00<00:00, 12418.05it/s]
Removing repeated phrases in test data ... (Phase 0): 100%|██████████| 313/313 [00:00<00:00, 16132.33it/s]


In [168]:
with open('./test_train.json', 'w') as file:
    json.dump(post_train_data, file, ensure_ascii=False, indent=4)

with open('./test_dev.json', 'w') as file:
    json.dump(post_dev_data, file, ensure_ascii=False, indent=4)

with open('./test_test.json', 'w') as file:
    json.dump(post_test_data, file, ensure_ascii=False, indent=4)

In [169]:
test_train = make_dataframe('./test_train.json')
test_dev = make_dataframe('./test_dev.json')
test_test = make_dataframe('./test_test.json')

In [170]:
test_train_idx, _ = find_repeated_phrases(test_train)
test_dev_idx, _ = find_repeated_phrases(test_dev)
test_test_idx, _ = find_repeated_phrases(test_test)

The number of repeated phrases in conversation : 89
The number of repeated phrases in conversation that repeated num <= 2 : 13
The number of repeated phrases in output : 2
The number of repeated phrases in conversation : 15
The number of repeated phrases in conversation that repeated num <= 2 : 1
The number of repeated phrases in output : 1
The number of repeated phrases in conversation : 73
The number of repeated phrases in conversation that repeated num <= 2 : 8
The number of repeated phrases in output : 0


In [139]:
test_train_idx

[('가', 62),
 ('다', 29),
 ('이', 28),
 ('있', 26),
 ('나', 22),
 ('하', 21),
 ('거', 13),
 ('먹', 12),
 ('내', 10),
 ('없', 9),
 ('사', 9),
 ('너', 9),
 ('해', 8),
 ('저', 8),
 ('유', 7),
 ('대', 7),
 ('시', 7),
 ('요', 7),
 ('일', 7),
 ('그래', 6),
 ('많', 6),
 ('했', 6),
 ('고', 6),
 ('주', 6),
 ('제', 5),
 ('학', 5),
 ('원', 5),
 ('여', 5),
 ('자', 5),
 ('생', 5),
 ('한', 5),
 ('같', 4),
 ('건', 4),
 ('보', 4),
 ('올', 4),
 ('약', 4),
 ('지', 4),
 ('그렇', 4),
 ('우', 4),
 ('입', 4),
 ('얼', 4),
 ('매', 4),
 ('거기', 4),
 ('외', 4),
 ('집', 4),
 ('맞', 4),
 ('않', 4),
 ('되', 4),
 ('우리', 4),
 ('들', 3),
 ('영', 3),
 ('재료', 3),
 ('그냥', 3),
 ('엄마', 3),
 ('재', 3),
 ('몇', 3),
 ('여름', 3),
 ('세', 3),
 ('중', 3),
 ('인', 3),
 ('별', 3),
 ('강', 3),
 ('3', 3),
 ('한 명', 3),
 ('언니', 3),
 ('친', 3),
 ('어디', 3),
 ('신', 3),
 ('마', 3),
 ('쪼금', 3),
 ('시국이', 3),
 ('언', 3),
 ('취', 3),
 ('고양이', 3),
 ('예', 3),
 ('그거', 3),
 ('났던', 1),
 ('첫날', 1),
 ('중에서 어떤 계절', 1),
 ('습관', 1),
 ('전단지', 1),
 ('아줌마들이 구하', 1),
 ('윗사람이랑', 1),
 ('안 보인다', 1),
 ('학교 나가고 집에 오고', 1),

In [171]:
train_repeated_phrase_indices2 = find_index_of_repeated_phrases(test_train,[c[0] for c in test_train_idx if c[1]<=2])
dev_repeated_phrase_indices2 = find_index_of_repeated_phrases(test_dev,[c[0] for c in test_dev_idx if c[1]<=2])
test_repeated_phrase_indices2 = find_index_of_repeated_phrases(test_test,[c[0] for c in test_test_idx if c[1]<=2])

100%|██████████| 506/506 [00:00<00:00, 4474.53it/s]
100%|██████████| 102/102 [00:00<00:00, 20408.29it/s]
100%|██████████| 408/408 [00:00<00:00, 3916.20it/s]


In [141]:
train_repeated_phrase_indices2

defaultdict(list,
            {2: ['났던'],
             18: ['첫날'],
             94: ['중에서 어떤 계절'],
             155: ['습관'],
             181: ['전단지'],
             182: ['아줌마들이 구하'],
             291: ['윗사람이랑'],
             299: ['안 보인다'],
             303: ['학교 나가고 집에 오고'],
             323: ['맞아 온천도'],
             330: ['화천'],
             335: ['추워지면 바로'],
             338: ['운전에 너무 힘들어서 다음 운전을 위해서']})

In [172]:
with open('train_repeated_phrase_indices_1.pkl', 'wb') as f:
    pickle.dump(train_repeated_phrase_indices2, f)

with open('dev_repeated_phrase_indices_1.pkl', 'wb') as f:
    pickle.dump(dev_repeated_phrase_indices2, f)

with open('test_repeated_phrase_indices_1.pkl', 'wb') as f:
    pickle.dump(test_repeated_phrase_indices2, f)

In [173]:
with open('test_train.json', 'rb') as f:
    test_train = json.load(f)
with open('test_dev.json', 'rb') as f:
    test_dev = json.load(f)
with open('test_test.json', 'rb') as f:
    test_test = json.load(f)

test_train_2 = make_one_repeated_words(test_train, 'test_train.json', iter=1)
test_dev_2 = make_one_repeated_words(test_dev, 'test_dev.json', iter=1)
test_test_2 = make_one_repeated_words(test_test, 'test_test.json', iter=1)

Removing repeated phrases in train data ... (Phase 1): 100%|██████████| 13/13 [00:00<00:00, 25173.57it/s]
Removing repeated phrases in dev data ... (Phase 1): 100%|██████████| 1/1 [00:00<?, ?it/s]
Removing repeated phrases in test data ... (Phase 1): 100%|██████████| 8/8 [00:00<00:00, 7998.67it/s]


In [174]:
with open('./test_train2.json', 'w') as file:
    json.dump(test_train_2, file, ensure_ascii=False, indent=4)

with open('./test_dev2.json', 'w') as file:
    json.dump(test_dev_2, file, ensure_ascii=False, indent=4)

with open('./test_test2.json', 'w') as file:
    json.dump(test_test_2, file, ensure_ascii=False, indent=4)

In [175]:
test_train3 = make_dataframe('./test_train2.json')
test_dev3 = make_dataframe('./test_dev2.json')
test_test3 = make_dataframe('./test_test2.json')

In [176]:
test_train_idx3, _ = find_repeated_phrases(test_train3)
test_dev_idx3, _ = find_repeated_phrases(test_dev3)
test_test_idx3, _ = find_repeated_phrases(test_test3)

The number of repeated phrases in conversation : 76
The number of repeated phrases in conversation that repeated num <= 2 : 0
The number of repeated phrases in output : 2
The number of repeated phrases in conversation : 14
The number of repeated phrases in conversation that repeated num <= 2 : 0
The number of repeated phrases in output : 1
The number of repeated phrases in conversation : 65
The number of repeated phrases in conversation that repeated num <= 2 : 0
The number of repeated phrases in output : 0


In [177]:
# 후처리 이후
t = make_dataframe('./train.json')
d = make_dataframe('./dev.json')
te = make_dataframe('./test.json')

_, _ = find_repeated_phrases(t)
_, _ = find_repeated_phrases(d)
_, _ = find_repeated_phrases(te)

The number of repeated phrases in conversation : 76
The number of repeated phrases in conversation that repeated num <= 2 : 0
The number of repeated phrases in output : 2
The number of repeated phrases in conversation : 14
The number of repeated phrases in conversation that repeated num <= 2 : 0
The number of repeated phrases in output : 1
The number of repeated phrases in conversation : 65
The number of repeated phrases in conversation that repeated num <= 2 : 0
The number of repeated phrases in output : 0


<br/>

<br/>

In [13]:
import re

# 자음과 모음의 범위를 지정합니다.
consonant = "[ㄱ-ㅎ]"
vowel = "[ㅏ-ㅣ]"

# 같은 자음이 두 번 사용된 음절을 포착하는 정규식
pattern = f"({consonant})({vowel}) \1({vowel})"

# 매칭을 테스트합니다.
word = "겨 계절"
match = re.search(pattern, word)

if match:
    print("패턴이 일치합니다:", match.group())
else:
    print("패턴이 일치하지 않습니다.")

패턴이 일치하지 않습니다.


In [10]:
!pip install jamo

Collecting jamo
  Obtaining dependency information for jamo from https://files.pythonhosted.org/packages/ac/cc/49812faae67f9a24be6ddaf58a2cf7e8c3cbfcf5b762d9414f7103d2ea2c/jamo-0.4.1-py3-none-any.whl.metadata
  Downloading jamo-0.4.1-py3-none-any.whl.metadata (2.3 kB)
Downloading jamo-0.4.1-py3-none-any.whl (9.5 kB)
Installing collected packages: jamo
Successfully installed jamo-0.4.1


In [28]:
from jamo import h2j, j2hcj
from unicode import join_jamos
from collections import defaultdict

def find_matching_syllables(word):
    # 한글 음절을 자소(자음, 모음)로 분리합니다.
    decomposed = [j2hcj(h2j(syllable)) for syllable in word if syllable != ' ']
    
    temp = defaultdict(int)
    # 자음이 같은 음절 쌍을 찾습니다.
    for i in range(len(decomposed) - 1):
        first = decomposed[i]
        second = decomposed[i + 1]
        
        # 자음(첫 번째 자모)이 같은지 확인
        if first[0] == second[0]:
            temp[(join_jamos(first)+' '+ join_jamos(second))]+=1
    
    return temp

# 테스트
word = "겨 계절"
find_matching_syllables(word)

defaultdict(int, {'겨 계': 1})