# 반복 단어 1개로 통일

In [3]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import re
from copy import deepcopy
import warnings
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

In [4]:
def make_dataframe(path: str) -> pd.DataFrame:
    """
    Read a json file and return a pandas DataFrame.

    Each record in the file is expected to provide 'id', 'output' and an
    'input' mapping holding 'conversation' (a list of turns, each with a
    'speaker' key) and 'subject_keyword'.

    Parameters:
    path (str): Path to the json file.

    Returns:
    pd.DataFrame: DataFrame with the columns
        ['id', 'conversation', 'subject_keyword', 'speakers', 'output'].
    """
    # Decode explicitly as UTF-8: relying on the platform default encoding
    # breaks on non-UTF-8 locales when the file contains non-ASCII text.
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Flatten the nested 'input' mapping into top-level columns
    df = pd.DataFrame(data)
    df['conversation'] = df['input'].apply(lambda x: x['conversation'])
    df['subject_keyword'] = df['input'].apply(lambda x: x['subject_keyword'])

    # Drop the now-redundant 'input' column
    df = df.drop(columns='input')

    # Distinct speakers of each conversation, in order of first appearance.
    # dict.fromkeys de-duplicates while keeping a deterministic order
    # (a plain set would reorder the speakers from run to run).
    df['speakers'] = df['conversation'].apply(
        lambda turns: list(dict.fromkeys(turn['speaker'] for turn in turns))
    )

    # Reorder the columns
    return df[['id', 'conversation', 'subject_keyword', 'speakers', 'output']]

In [5]:
# Load the train / dev / test splits; unpacking the map evaluates the three
# calls in order, one DataFrame per path.
train_df, dev_df, test_df = map(
    make_dataframe,
    (
        '../resource/data/일상대화요약_train.json',
        '../resource/data/일상대화요약_dev.json',
        '../resource/data/일상대화요약_test.json',
    ),
)

In [32]:
# Collect every match of a repeated-word regex (e.g. r'\b([가-힣a-zA-Z0-9_]+)\s+\1\b') found in the utterances into a set and return it

def find_repeated_words(df: pd.DataFrame, pattern) -> set:
    """
    Find repeated words in the utterances of the DataFrame.

    Every turn of every conversation in the 'conversation' column is scanned
    with ``re.findall``; all matches are collected. Two summary lines are
    printed: the number of distinct matches and the total number of matches.

    Parameters:
    df (pd.DataFrame): DataFrame of the conversations. Each cell of the
        'conversation' column is an iterable of turns with an 'utterance' key.
    pattern: Regular expression (string or compiled pattern) describing a
        repeated word, e.g. r'\\b(\\w+)\\s+\\1\\b'. What ends up in the set is
        whatever ``findall`` yields for it (the group text for a single
        capturing group).

    Returns:
    set: Set of repeated words.
    """
    # Compile once instead of looking the pattern up for every utterance;
    # re.compile passes an already-compiled pattern through unchanged.
    compiled = re.compile(pattern)

    repeated_words = set()
    count = 0

    # A DataFrame without rows has nothing to scan (and may lack the column).
    conversations = df['conversation'] if len(df) else ()

    for conversation in conversations:
        for turn in conversation:
            matches = compiled.findall(turn['utterance'])
            repeated_words.update(matches)
            count += len(matches)

    # The two figures differ in meaning, so they get distinct labels.
    print(f"Number of unique repeated words found: {len(repeated_words)}")
    print(f"Total number of repeated word occurrences found: {count}")

    return repeated_words

In [35]:
# Train split, single-character repeats only: one word character, whitespace,
# then the same character again, delimited by word boundaries.
repeated_words = find_repeated_words(df=train_df, pattern=r'\b(\w)\s+\1\b')

Total number of repeated words found: 85
Total number of repeated words found: 373


In [39]:
# Test split, whole-word repeats: a run of Hangul syllables / ASCII letters /
# digits / underscores, whitespace, then the same run again.
repeated_words = find_repeated_words(
    df=test_df,
    pattern=r'\b([가-힣a-zA-Z0-9_]+)\s+\1\b',
)

Total number of repeated words found: 370
Total number of repeated words found: 676


In [40]:
# Bare expression: shows the set returned by the previous call as the cell output.
repeated_words

{'10000원의',
 '2',
 '30분',
 '4년',
 'name',
 '가',
 '가게',
 '가야',
 '가야지',
 '가장',
 '가족',
 '가족이',
 '각각',
 '각자',
 '간식거리를',
 '간호를',
 '간혹',
 '감동',
 '갔는데',
 '갔다던',
 '갔던',
 '강제로',
 '개',
 '걔',
 '거래',
 '거의',
 '걱정되시는',
 '건',
 '걸어',
 '게다가',
 '경험에',
 '계속',
 '고',
 '고때',
 '골',
 '골이',
 '공무원을',
 '공유',
 '과거에',
 '괜찮다고',
 '교양',
 '국내',
 '굳이',
 '그',
 '그냥',
 '그다음에는',
 '그동안',
 '그때',
 '그래가지고',
 '그래서',
 '그러고',
 '그러니까',
 '그런',
 '그렇게',
 '그렇게는',
 '그리고',
 '근데',
 '글라이더',
 '금사빠',
 '기억에',
 '깜빡',
 '꼬박',
 '꼬시고',
 '꿈을',
 '끝까지',
 '나',
 '나는',
 '나온다',
 '나의',
 '나중에',
 '날씨',
 '남',
 '남구',
 '남는',
 '내',
 '내가',
 '너',
 '너는',
 '너무',
 '너희',
 '네가',
 '놀러',
 '뇌에',
 '누구',
 '누군데',
 '누나는',
 '누에보다리',
 '눈',
 '눈은',
 '다',
 '다른',
 '다리',
 '다시',
 '다양한',
 '당연히',
 '대면',
 '대신',
 '대충',
 '더',
 '더울',
 '도깨비',
 '도시마다',
 '동물',
 '동생',
 '되게',
 '두고',
 '딱',
 '땀이',
 '또',
 '로망을',
 '루',
 '마음이',
 '마찬가지로',
 '막',
 '만',
 '만약에',
 '많아지는',
 '많이',
 '말을',
 '맛이',
 '맛집도',
 '매일',
 '맨날',
 '먹어보니',
 '멈췄고',
 '몸',
 '못',
 '무서워',
 '무슨',
 '물론',
 '뭐',
 '뭐니',
 '뭐야',
 '뭘',
 '미리',
 '바뀌고'