# '.' 뒤에 공백이 존재하는지 확인

In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import re
from copy import deepcopy
import warnings
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
import rouge

def make_dataframe(path: str) -> pd.DataFrame:
    """
    Read a json file and return a pandas DataFrame.

    The file is always decoded as UTF-8, independent of the platform's
    default encoding. The nested 'input' field of every record is split
    into the 'conversation' and 'subject_keyword' columns, and a
    'speakers' column is derived from the conversation turns.

    Parameters:
    path (str): Path to the json file.

    Returns:
    pd.DataFrame: DataFrame with the columns 'id', 'conversation',
        'subject_keyword', 'speakers' and 'output', in that order.
        'speakers' lists each distinct speaker once, in order of first
        appearance in the conversation.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() uses
    # the locale's preferred encoding, which is not UTF-8 on every platform.
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    df = pd.DataFrame(data)

    # Lift the two nested fields of 'input' into their own columns
    df['conversation'] = df['input'].apply(lambda x: x['conversation'])
    df['subject_keyword'] = df['input'].apply(lambda x: x['subject_keyword'])

    # Distinct speakers per conversation. dict.fromkeys() removes duplicates
    # while keeping first-appearance order, so the result is the same on
    # every run (a set of strings has no stable iteration order).
    df['speakers'] = df['conversation'].apply(
        lambda turns: list(dict.fromkeys(turn['speaker'] for turn in turns))
    )

    # Selecting the columns both drops 'input' and fixes the column order
    return df[['id', 'conversation', 'subject_keyword', 'speakers', 'output']]

# Load the train/dev/test JSON files from ../resource/data
train_df = make_dataframe('../resource/data/일상대화요약_train.json')
dev_df = make_dataframe('../resource/data/일상대화요약_dev.json')
test_df = make_dataframe('../resource/data/일상대화요약_test.json')
# Load the train/dev/test JSON files kept in the working directory; these are
# the frames the checks in the following cells are run against.
# NOTE(review): presumably filtered copies of the files above — confirm.
filtered_train_df = make_dataframe('./train.json')
filtered_dev_df = make_dataframe('./dev.json')
filtered_test_df = make_dataframe('./test.json')

## output의 맨 마지막 확인 

In [2]:
# output의 맨 마지막이 '.'으로 끝나는지, '. '으로 끝나는지 확인
def check_last_character(df: pd.DataFrame) -> None:
    """
    Count how the values of the 'output' column end and print the totals.

    Every output is put into exactly one bucket: it ends with '.', it ends
    with '. ', or it ends with anything else (an empty string included).
    Outputs that fall into the last bucket are printed as they are found.

    Parameters:
    df (pd.DataFrame): DataFrame to check. Must have an 'output' column
        holding strings.

    Returns:
    None: The counts are printed, not returned.
    """
    end_dot = 0
    end_dot_space = 0
    another = 0

    for output in df['output']:
        # endswith() rather than output[-1]: an empty output is then counted
        # as "another" instead of raising IndexError.
        if output.endswith('.'):
            end_dot += 1
        elif output.endswith('. '):
            end_dot_space += 1
        else:
            another += 1
            print("another sentence : ",output)

    print(f"Number of outputs ending with '.': {end_dot}")
    print(f"Number of outputs ending with '. ': {end_dot_space}")
    print(f"Number of outputs ending with another character: {another}")

In [None]:
check_last_character(filtered_train_df)

another sentence :  두 화자는 이 대화에서 영화 관람 스타일에 대해 말했습니다. SD2001145는 영화관에 가서 신작을 봤으며 알아서 스스로 누르고 애플리케이션으로 들어가서 영화를 보면 되어서 영화관을 자주 이용한다고 말했습니다. 또 한국 영화 위주로 많이 보는데 요즘엔 외국 영화 중심으로 재개봉을 하고 있어 조금 아쉽다고 말했습니다. SD2001146은 코로나 때문에 영화관을 못 가서 신작 영화를 보진 못했고 넷플릭스나 웨이브에서 시청한다고 말했습니다. 집에서 티브이나 휴대폰으로 볼 땐 소리와 화면의 질이 아쉽다고 말했습니다
Number of outputs ending with '.': 498
Number of outputs ending with '. ': 7
Number of outputs ending with another character: 1


In [None]:
check_last_character(filtered_dev_df)

Number of outputs ending with '.': 100
Number of outputs ending with '. ': 2
Number of outputs ending with another character: 0


## output, utterance 내에 '.' 뒤 공백 여부

In [18]:
# utterance 내 '.' 뒤 공백 여부 확인

import re


def check_dot_space(df: pd.DataFrame) -> None:
    """
    Count the '.' characters that are / are not followed by a space and
    print the totals.

    The counts are taken separately over every turn's 'utterance' in the
    'conversation' column and over the 'output' column.

    Parameters:
    df (pd.DataFrame): DataFrame to check.
    """
    # A period immediately followed by a space character
    spaced = re.compile(r'\. ')
    # A period immediately followed by any non-whitespace character
    unspaced = re.compile(r'\.[^\s]')

    def tally(texts):
        # Sum the non-overlapping matches of both patterns over all texts
        n_spaced = 0
        n_unspaced = 0
        for text in texts:
            n_spaced += len(spaced.findall(text))
            n_unspaced += len(unspaced.findall(text))
        return n_spaced, n_unspaced

    # Flatten every conversation into a single stream of utterance strings
    utterances = (
        turn['utterance']
        for turns in df['conversation']
        for turn in turns
    )
    utterance_spaced, utterance_unspaced = tally(utterances)
    output_spaced, output_unspaced = tally(df['output'])

    print(f"Number of '.' with space in utterance: {utterance_spaced}")
    print(f"Number of '.' without space in utterance: {utterance_unspaced}")
    print(f"Number of '.' with space in output: {output_spaced}")
    print(f"Number of '.' without space in output: {output_unspaced}")

In [19]:
check_dot_space(filtered_train_df)

Number of '.' with space in utterance: 12060
Number of '.' without space in utterance: 14
Number of '.' with space in output: 2748
Number of '.' without space in output: 7


In [21]:
check_dot_space(filtered_dev_df)

Number of '.' with space in utterance: 2578
Number of '.' without space in utterance: 2
Number of '.' with space in output: 532
Number of '.' without space in output: 4


- 문장 내 '.' 뒤에 공백이 없으면 '.'을 '. '으로 바꿔 공백을 추가하고
- 마지막으로 strip()을 적용해 문자열 양 끝의 공백을 제거한다

In [34]:
# Re-read the three working-directory JSON files so the checks below run
# against their current contents.
# NOTE(review): presumably the files were rewritten by the normalization
# described in the notes above; that step is not shown here — confirm.
filtered_train_df = make_dataframe('./train.json')
filtered_dev_df = make_dataframe('./dev.json')
filtered_test_df = make_dataframe('./test.json')

In [35]:
check_last_character(filtered_train_df)

Number of outputs ending with '.': 506
Number of outputs ending with '. ': 0
Number of outputs ending with another character: 0


In [36]:
check_last_character(filtered_dev_df)

Number of outputs ending with '.': 102
Number of outputs ending with '. ': 0
Number of outputs ending with another character: 0


In [37]:
check_dot_space(filtered_train_df)

Number of '.' with space in utterance: 12074
Number of '.' without space in utterance: 0
Number of '.' with space in output: 2749
Number of '.' without space in output: 0


In [38]:
check_dot_space(filtered_dev_df)

Number of '.' with space in utterance: 2580
Number of '.' without space in utterance: 0
Number of '.' with space in output: 535
Number of '.' without space in output: 0
