In [2]:
import pandas as pd
import numpy as np
import pickle
from ksenticnet_kaist import ksenticnet

# 1. ksenticnet에서 유의어만 추출

In [3]:
# Build the synonym dictionary from ksenticnet.
# An entry is kept only if it is purely alphabetic and neither str.isupper()
# nor str.islower() holds for it.
# NOTE(review): presumably this keeps Korean words and drops single-case
# English tokens - confirm against the ksenticnet data.
word_dic = {}

for headword, entries in ksenticnet.items():
    synonyms = [
        entry
        for entry in entries
        if entry.isalpha() and not (entry.isupper() or entry.islower())
    ]

    # Headwords with no surviving entry are left out of the dictionary.
    if synonyms:
        word_dic[headword] = synonyms

# 2. kwn (Korean WordNet)과의 병합

In [4]:
# NOTE(review): pickle.load can execute arbitrary code - only load a trusted file.
with open(r"../data/wordnet.pickle", "rb") as wordnet_file:
    kwn = pickle.load(wordnet_file)

for headword, kwn_entries in kwn.items():
    known = word_dic.get(headword)

    if known:
        # Headword already present: store the union of both sources (duplicates removed).
        word_dic[headword] = list(set(known) | set(kwn_entries))
    elif len(kwn_entries) > 1:
        # New headword: taken over only when kwn lists more than one entry for it.
        word_dic[headword] = kwn_entries

# 3. EDA

In [13]:
import kss
import random
import tqdm

In [6]:
def synonym_replacement(text, num_replacement):
    """Replace up to ``num_replacement`` noun/verb morphemes of ``text`` with synonyms.

    The text is split with ``kss.split_morphemes`` (spaces kept), morphemes are
    visited in random order, and every morpheme whose POS tag starts with
    ``"N"`` or ``"V"`` and that has an entry in ``word_dic`` is swapped for a
    randomly chosen synonym.

    Args:
        text: Sentence to augment.
        num_replacement: Maximum number of morphemes to replace. Values <= 0
            leave the text unchanged.

    Returns:
        The text re-assembled from the (possibly replaced) morphemes.
    """
    # Assumes split_morphemes yields (morpheme, POS tag) pairs, as the
    # unpacking below requires.
    morphemes = list(kss.split_morphemes(text, drop_space=False))

    # Visit morphemes in random order so replacements are spread over the text.
    candidates = list(enumerate(morphemes))
    random.shuffle(candidates)

    cnt_replacement = 0

    for idx, (morph, pos) in candidates:
        # Check the budget before replacing so that at most `num_replacement`
        # morphemes change (a `>` check after the fact allowed one extra).
        if cnt_replacement >= num_replacement:
            break

        if not pos.startswith(("N", "V")):
            continue

        # Drop the morpheme itself: swapping a word for itself is not an
        # augmentation. Sorting keeps the choice reproducible under random.seed.
        synonyms = sorted({word for word in word_dic.get(morph) or () if word != morph})
        if not synonyms:
            continue

        # random.choice instead of set.pop(): the latter is arbitrary, not random.
        morphemes[idx] = (random.choice(synonyms), pos)
        cnt_replacement += 1

    # Re-assemble the sentence in its original morpheme order.
    return "".join(morph for morph, _ in morphemes)

In [7]:
def random_insertion(text, num_replacement):
    """Insert up to ``num_replacement`` synonyms at random positions of ``text``.

    The text is split with ``kss.split_morphemes`` (spaces kept). Morphemes are
    visited in random order; for each one whose POS tag starts with ``"N"`` or
    ``"V"`` and that has an entry in ``word_dic``, a randomly chosen synonym is
    collected. The collected synonyms are then inserted at uniformly random
    positions; the original morphemes keep their relative order.

    Args:
        text: Sentence to augment.
        num_replacement: Maximum number of synonyms to insert. Values <= 0
            leave the text unchanged.

    Returns:
        The text re-assembled from the original morphemes plus the insertions.
    """
    # Assumes split_morphemes yields (morpheme, POS tag) pairs, as the
    # unpacking below requires.
    morphemes = list(kss.split_morphemes(text, drop_space=False))

    # Look for synonym sources in random order.
    candidates = list(morphemes)
    random.shuffle(candidates)

    picked = []
    for morph, pos in candidates:
        # Check the budget first so that at most `num_replacement` synonyms are
        # inserted (a `>` check after the fact allowed one extra).
        if len(picked) >= num_replacement:
            break

        if not pos.startswith(("N", "V")):
            continue

        # Exclude the morpheme itself so an insertion never merely duplicates it.
        # Sorting keeps the choice reproducible under random.seed.
        synonyms = sorted({word for word in word_dic.get(morph) or () if word != morph})
        if synonyms:
            # random.choice instead of set.pop(): the latter is arbitrary, not random.
            picked.append((random.choice(synonyms), pos))

    # randint is inclusive on both ends, so every slot from "before the first
    # morpheme" to "after the last" is equally likely.
    for item in picked:
        morphemes.insert(random.randint(0, len(morphemes)), item)

    return "".join(morph for morph, _ in morphemes)

In [10]:
# Stopword list: one entry per line of the file, surrounding whitespace stripped.
with open("../data/stopword.txt", "r", encoding="utf-8") as stopword_file:
    stopwords = {line.strip() for line in stopword_file}

def random_deletetion(text, num_del):
    """Delete up to ``num_del`` randomly chosen content morphemes from ``text``.

    The text is split with ``kss.split_morphemes`` (spaces kept). Only morphemes
    that are alphabetic and not in ``stopwords`` may be deleted; everything
    else is always kept.

    Args:
        text: Sentence to augment.
        num_del: Maximum number of morphemes to delete. If fewer deletable
            morphemes exist, all of them are deleted; values <= 0 delete none.

    Returns:
        The text re-assembled from the remaining morphemes in original order.
    """
    # Assumes split_morphemes yields (morpheme, POS tag) pairs, as the
    # unpacking below requires.
    morphemes = list(kss.split_morphemes(text, drop_space=False))

    # Collect deletable positions up front. The previous pop / re-add loop on a
    # set never terminated when only stopwords or non-alphabetic morphemes
    # were left, and set.pop() is arbitrary rather than random.
    deletable = [
        idx
        for idx, (morph, _) in enumerate(morphemes)
        if morph.isalpha() and morph not in stopwords
    ]

    # Clamp to the valid range so random.sample never raises.
    num_to_drop = max(0, min(num_del, len(deletable)))
    dropped = set(random.sample(deletable, num_to_drop))

    return "".join(
        morph for idx, (morph, _) in enumerate(morphemes) if idx not in dropped
    )

In [11]:
def EDA(text):
    """Produce three augmented variants of ``text``.

    The number of operations per variant is 10% of the whitespace-separated
    word count (truncated) plus one.

    Returns:
        A list ``[synonym-replaced, random-inserted, random-deleted]``.
    """
    word_count = len(text.split())
    num_process = int(word_count * 0.1) + 1

    # Apply each augmenter to the same original text, in this fixed order.
    augmenters = (synonym_replacement, random_insertion, random_deletetion)
    return [augment(text, num_process) for augment in augmenters]

# 4. 데이터 증강하기

In [12]:
# Load the annotated dataset, keeping only the text and label columns.
data_path = r"../data/annotated_data_final.xlsx"
df = pd.read_excel(data_path, index_col=None)[['text', 'label']]

# Add the "augmented" flag column; every loaded row gets 0.
df['augmented'] = np.zeros(len(df), dtype=np.int64)

# Split the frame by label value.
df_label_0 = df.loc[df['label'] == 0]  # no suspicion
df_label_1 = df.loc[df['label'] == 1]  # institution impersonation
df_label_2 = df.loc[df['label'] == 2]  # loan pretext
df_label_3 = df.loc[df['label'] == 3]  # other

In [21]:
# Frames to augment, paired with their label value.
# NOTE(review): the original note said label 1 is the largest class and is
# therefore processed last, but label 1 is not in this list - confirm whether
# it is handled elsewhere.
dfs = [(2, df_label_2), (3, df_label_3)]
failed = []  # source texts for which EDA raised an exception

# The loop variable is deliberately not called `df`, so the full dataset
# loaded earlier is not overwritten by the last per-label frame.
for label, label_df in tqdm.tqdm(dfs):
    temp = []

    # Each label is processed and written separately: less efficient, but a
    # failure in one label does not lose the results of another.
    for original_text in tqdm.tqdm(label_df.text.values):
        try:
            texts = EDA(original_text)
        except Exception:
            # Best effort: remember the text and move on. `Exception` (rather
            # than a bare except) lets KeyboardInterrupt/SystemExit propagate.
            failed.append(original_text)
            continue

        # Every augmented variant keeps the source label and is flagged augmented=1.
        temp.extend((text, label, 1) for text in texts)

    temp_df = pd.DataFrame(temp, columns=['text', 'label', 'augmented'])
    temp_df.to_csv(fr"../data/{label}_eda3.csv")


100%|██████████| 214/214 [00:00<00:00, 3965.84it/s]
100%|██████████| 97/97 [00:00<00:00, 5695.19it/s]
100%|██████████| 2/2 [00:00<00:00, 22.71it/s]
