In [1]:
# 필요한 library import
from googletrans import Translator
import pandas as pd
import numpy as np

import tqdm
import time

In [3]:
# Load the annotation spreadsheet, keeping only the text and label columns.

data_path = r"../data/AnnotationData_TEST(JH).xlsx"
df = pd.read_excel(data_path, index_col=None)[['text', 'label']]

# Mark every original row as not augmented (0).
df['augmented'] = np.zeros((len(df),), dtype=np.int64)

# Split the frame by label value:
#   0 = no suspicion, 1 = institution impersonation,
#   2 = loan pretext,  3 = other
df_label_0, df_label_1, df_label_2, df_label_3 = (
    df[df['label'] == label_value] for label_value in (0, 1, 2, 3)
)

# 1. Back Translation

In [4]:
class BackTranslator(Translator):
    """Korean back-translation helper built on googletrans' ``Translator``.

    Calling an instance translates a Korean sentence into an auxiliary
    language and then back into Korean, returning the round-tripped text.
    The constructor is inherited unchanged from ``Translator``; the original
    code declared an ``__init__`` nested inside ``__call__``, which was a dead
    local function and has been removed.
    """

    # Auxiliary (pivot) languages accepted by __call__.
    SUPPORTED_AUX_LANGS = ("en", "zh-cn", "ja")

    def __call__(self, text, aux_lang = "en"):
        """Back-translate ``text`` (ko -> ``aux_lang`` -> ko).

        Args:
            text: Sentence to back-translate; must be a ``str``.
            aux_lang: Pivot language code, one of ``"en"``, ``"zh-cn"``, ``"ja"``.

        Returns:
            The ``.text`` of the second translation (pivot language back to Korean).

        Raises:
            AssertionError: If ``text`` is not a string or ``aux_lang`` is not
                a supported pivot language.
        """
        # Validate the inputs before calling the translator.
        assert isinstance(text, str), "입력 문장은 string 형식이어야 합니다."
        assert aux_lang in self.SUPPORTED_AUX_LANGS, "경유 언어는 \"en\", \"zh-cn\", \"ja\"중 하나여야 합니다."

        # NOTE(review): this assumes ``translate`` returns its result
        # synchronously; newer googletrans releases reportedly made it a
        # coroutine -- confirm the pinned version.
        aux_text = super().translate(text, src="ko", dest=aux_lang).text
        return_text = super().translate(aux_text, src=aux_lang, dest="ko")

        return return_text.text

In [5]:
# Back-translate each label subset through every auxiliary language and
# write one CSV per (label, language) pair.

bt = BackTranslator()
# Only label 2 is processed here. Per the original note, label 1 has the most
# rows and is meant to be run last.
dfs = [(2, df_label_2)]
failed = []  # original texts whose back-translation raised an error

# (pivot language passed to the translator, tag used in the output file name)
aux_langs = [("en", "en"), ("ja", "ja"), ("zh-cn", "zh")]

# The loop variable is named `label_df` (not `df`) so the full dataset bound
# to the global `df` is not overwritten by this cell.
for label, label_df in tqdm.tqdm(dfs):

    # For stability, each pivot language is a separate pass with its own
    # output file, even though this is less efficient.
    for aux_lang, file_tag in aux_langs:
        temp = []

        for original_text in tqdm.tqdm(label_df.text.values):  # ko -> aux_lang -> ko
            try:
                bt_text = bt(original_text, aux_lang=aux_lang)
            except Exception:
                # Best effort: record the failure and move on. `Exception`
                # (rather than a bare `except`) lets Ctrl-C / kernel interrupt
                # still stop this long-running loop.
                failed.append(original_text)
                continue
            temp.append([bt_text, label, 1])  # 1 = augmented row

        temp_df = pd.DataFrame(temp, columns =['text', 'label', 'augmented'])
        temp_df.to_csv(fr"../data/{label}_{file_tag}_test.csv")

100%|██████████| 50/50 [01:56<00:00,  2.33s/it]
100%|██████████| 50/50 [03:25<00:00,  4.10s/it]
100%|██████████| 50/50 [02:04<00:00,  2.49s/it]
100%|██████████| 1/1 [07:26<00:00, 446.12s/it]


# 2. 결과 병합

In [6]:
import os
import re

In [24]:
# Merge the back-translated & augmented CSV files of every label into one frame.
folder_path = r"../data/"
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the merged frame reproducible across runs and machines.
files = sorted(os.listdir(folder_path))

# Files are assigned to a label by the first character of their name.
label_1 = [file for file in files if file.startswith('1')]
label_2 = [file for file in files if file.startswith('2')]
label_3 = [file for file in files if file.startswith('3')]

labels = [(1, label_1), (2, label_2), (3,label_3)]

dfs = []

for label_idx, label in labels:

    # pd.concat raises ValueError on an empty list, so a label that has no
    # files yet is skipped instead of aborting the whole merge.
    if not label:
        continue

    temp_dfs = []

    for file in label:
        df = pd.read_csv(os.path.join(folder_path, file), index_col = 0)
        # Overwrite the label column with the label implied by the file name.
        df['label'] = label_idx
        temp_dfs.append(df)

    temp_dfs = pd.concat(temp_dfs)
    dfs.append(temp_dfs)

if not dfs:
    raise ValueError(
        f"No files starting with '1', '2' or '3' were found in {folder_path!r}; nothing to merge."
    )

dfs = pd.concat(dfs)


In [25]:
# Drop every row whose text contains a match for the pattern below
# (a word-boundary-delimited run of ASCII letters).
english_words_pattern = re.compile(r'\b[a-zA-Z]+\b')


def _has_english_word(value):
    """Return True when the stringified value matches ``english_words_pattern``.

    Values are passed through ``str()`` first, so a missing value (NaN)
    becomes ``'nan'`` and therefore also matches.
    """
    return english_words_pattern.search(str(value)) is not None


english_mask = dfs['text'].apply(_has_english_word)
dfs_aft = dfs[~english_mask]

In [26]:
# Write the filtered frame (index included) to the data folder.
bt_augmented_csv_path = "../data/dfs_bt_and_augmented.csv"
dfs_aft.to_csv(bt_augmented_csv_path)

In [30]:
# Concatenate every test-set CSV (file name starts with "test") into one frame.

folder_path = r"../data/"
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the concatenated frame reproducible.
files = sorted(file for file in os.listdir(folder_path) if file.startswith('test'))

# pd.concat fails with an opaque "No objects to concatenate" on an empty list;
# raise the same exception type with a message that names the actual problem.
if not files:
    raise ValueError(f"No files starting with 'test' were found in {folder_path!r}.")

dfs = []

for file in files:
    df = pd.read_csv(os.path.join(folder_path, file), index_col = 0)
    dfs.append(df)

dfs = pd.concat(dfs)

In [32]:
# Write the concatenated test frame (index included) to the current working
# directory; the path is relative and has no folder component.
test_concat_csv_path = "test_data_concatenated.csv"
dfs.to_csv(test_concat_csv_path)