In [9]:
# 필요한 library import
from googletrans import Translator
import pandas as pd
import numpy as np

import tqdm
import time

In [2]:
# Load the annotated dataset, keeping only the text and label columns.

data_path = r"../data/annotated_data_final.xlsx"
raw_df = pd.read_excel(data_path, index_col=None)

# Add an "augmented" flag column; every original row is marked 0 (not augmented).
df = raw_df[['text', 'label']].assign(
    augmented=lambda frame: np.zeros([len(frame)], dtype=np.int64)
)

# Split the rows by label.
df_label_0 = df.loc[df['label'] == 0]  # no suspicion
df_label_1 = df.loc[df['label'] == 1]  # agency-impersonation type
df_label_2 = df.loc[df['label'] == 2]  # loan-pretext type
df_label_3 = df.loc[df['label'] == 3]  # other

# 1. Back Translation

In [3]:
class BackTranslator(Translator):
    """Back-translation helper: Korean -> pivot language -> Korean.

    Calling an instance translates the Korean input into ``aux_lang`` and
    then translates that result back into Korean, returning the round-tripped
    text.

    No ``__init__`` is defined here on purpose: construction is inherited
    unchanged from ``Translator``. (An earlier revision declared ``__init__``
    *inside* ``__call__``, where it was only an unused local function.)
    """

    def __call__(self, text, aux_lang = "en"):
        """Return ``text`` back-translated through ``aux_lang``.

        Args:
            text: Korean sentence to augment; must be a ``str``.
            aux_lang: Pivot language code, one of ``"en"``, ``"zh-cn"``, ``"ja"``.

        Returns:
            The ``.text`` of the translation result after the
            ko -> aux_lang -> ko round trip.

        Raises:
            AssertionError: If ``text`` is not a string or ``aux_lang`` is not
                one of the supported pivot languages.
        """
        # Validate the inputs before making any translation request.
        assert isinstance(text, str), "입력 문장은 string 형식이어야 합니다."
        assert aux_lang in ["en", "zh-cn", "ja"], "경유 언어는 \"en\", \"zh-cn\", \"ja\"중 하나여야 합니다."

        # First hop: Korean -> pivot language.
        aux_text = self.translate(text, src="ko", dest=aux_lang).text
        # Second hop: pivot language -> Korean.
        return_text = self.translate(aux_text, src=aux_lang, dest="ko")

        return return_text.text

In [25]:
# Back-translate the texts of each label through every pivot language and
# save one CSV per (label, pivot language) pair.

bt = BackTranslator()
dfs = [(2, df_label_2), (3, df_label_3), (1, df_label_1)] # label 1 is the largest, so it runs last
failed = []

# (pivot language code passed to the translator, suffix used in the file name)
aux_langs = [("en", "en"), ("ja", "ja"), ("zh-cn", "zh")]

# The loop variable is deliberately NOT called `df`, so the dataset frame
# loaded earlier in the notebook is not overwritten by this cell.
for label, label_df in tqdm.tqdm(dfs):

    # For robustness, each pivot language is processed and saved separately
    # even though this is less efficient.
    for aux_lang, suffix in aux_langs: # ko -> aux_lang -> ko
        temp = []

        for original_text in tqdm.tqdm(label_df.text.values):
            try:
                bt_text = bt(original_text, aux_lang=aux_lang)
            except Exception:
                # Best effort: remember the failed sentence and keep going.
                # `Exception` (not a bare except) lets Ctrl-C / kernel
                # interrupt still stop this multi-hour loop.
                failed.append(original_text)
                continue

            # Record the label of the source rows (this used to be hard-coded
            # to 2, which mislabelled label 1 and label 3 augmentations).
            temp.append([bt_text, label, 1])

        temp_df = pd.DataFrame(temp, columns =['text', 'label', 'augmented'])
        temp_df.to_csv(fr"../data/{label}_{suffix}.csv")

100%|██████████| 214/214 [02:24<00:00,  1.49it/s]
100%|██████████| 214/214 [11:53<00:00,  3.34s/it]
100%|██████████| 214/214 [06:51<00:00,  1.93s/it]
100%|██████████| 97/97 [03:15<00:00,  2.02s/it]
100%|██████████| 97/97 [05:24<00:00,  3.35s/it]
100%|██████████| 97/97 [02:58<00:00,  1.84s/it]
100%|██████████| 1277/1277 [42:55<00:00,  2.02s/it]
100%|██████████| 1277/1277 [1:16:40<00:00,  3.60s/it]
100%|██████████| 1277/1277 [47:50<00:00,  2.25s/it]
100%|██████████| 3/3 [3:20:15<00:00, 4005.31s/it]


# 결과 병합

In [26]:
import os
import re

In [36]:
folder_path = r"../data"

# Collect the per-language back-translation CSVs for labels 2 and 3
# (files named "<label>_<lang>.csv"). Matching on the "2_" / "3_" prefix and
# the ".csv" extension avoids picking up unrelated files that merely start
# with the same digit. Sorting makes the merged row order reproducible,
# because os.listdir returns entries in arbitrary order.
# NOTE(review): label 1 outputs are not merged here - presumably intentional; confirm.
files = sorted(
    file_name
    for file_name in os.listdir(folder_path)
    if file_name.startswith(("2_", "3_")) and file_name.endswith(".csv")
)

if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(
        f"No back-translation CSV files for labels 2/3 found in {folder_path!r}"
    )

# Read every file (the first CSV column is the saved index) and stack them.
# Each file keeps its own index, so index values repeat across files.
# A comprehension is used so this cell does not overwrite a notebook-level `df`.
dfs = pd.concat(
    [pd.read_csv(os.path.join(folder_path, file_name), index_col = 0) for file_name in files]
)


In [55]:
# Matches any standalone run of ASCII letters, i.e. an English word left in the text.
english_words_pattern = re.compile(r'\b[a-zA-Z]+\b')

# Flag every row whose text (coerced to str) contains at least one such word.
contains_english = [
    english_words_pattern.search(str(value)) is not None
    for value in dfs['text']
]
english_mask = pd.Series(contains_english, index=dfs.index, name='text', dtype=bool)

# Keep only the rows without English words.
dfs_aft = dfs[~english_mask]

In [56]:
# Write the filtered frame (rows without English words) to CSV in the data folder.
dfs_aft.to_csv("../data/dfs_aft.csv")