In [1]:
import jsonlines
import pandas as pd
import emoji
from soynlp.normalizer import repeat_normalize

In [2]:
from tqdm import tqdm
import selenium
from selenium import webdriver
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import urllib
import re

# Preprocess

In [4]:
def replace_masks(file_in, file_out):
    """
    &name& -> <name>
    url encoding시 &를 넣으면 제대로 들어가지 않으므로 꺾쇠로 대체
    """
    lines = []
    pattern = re.compile(r'&([a-z]+)&') # &name&, &affiliation&, ...
    count = 0
    with jsonlines.open(file_in) as f:
        for line in f:
            sentence = line['sentence_form']
            if pattern.search(sentence):
                sentence = pattern.sub(r'<\1>',sentence)
                count += 1
            line['sentence_form'] = sentence
            lines.append(line)
    
    with jsonlines.open(file_out, 'w') as f:
        f.write_all(lines)
    print("Substitued: ", count)

In [10]:
# (입력 파일명, 출력 파일명)
replace_masks('./Dataset/nikluge-sa-2022-test.jsonl', './Dataset/test_preprocess.jsonl')

Substitued:  20


In [11]:
def preprocess_sentences(file_in, file_out):
    lines = []
    with jsonlines.open(file_in) as f:
        for line in f:
            sentence = line['sentence_form']
            emojis = [e.chars for e in emoji.analyze(sentence)]

            if '♥' not in emojis: # ♥가 들어간 문장만 남기고 replace (test dataset에 있는 특수문자만 남김)
                sentence = emoji.replace_emoji(sentence, '')

            sentence = repeat_normalize(sentence, num_repeats=2) # 반복 어구 축약

            line['sentence_form'] = sentence
            lines.append(line)

    with jsonlines.open(file_out, 'w') as f:
        f.write_all(lines)

In [12]:
# (입력 파일명, 출력 파일명)
preprocess_sentences('./Dataset/test_preprocess.jsonl', './Dataset/test_preprocess.jsonl')

# Backtranslation

In [3]:
import urllib.parse

def translation(json_data, src_lang, tgt_lang, driver: WebDriver):
    """
    request format: sk={src_lang}&tk={tgt_lang}&st={text}
    """
    
    request_url = 'https://papago.naver.com/?sk={}&tk={}&st={}'
    is_formal = True
    with tqdm(json_data, total=len(json_data), desc=f'{src_lang}-{tgt_lang} Translation') as pbar:
        for line in pbar:
            text = line['sentence_form']
            try:
                text = urllib.parse.quote(text) # 특수문자로 인한 encoding
                driver.get(request_url.format(src_lang, tgt_lang, text))
                driver.implicitly_wait(60)
                time.sleep(1.0)

                if tgt_lang == 'ko' and is_formal: # 한국어로 번역할 경우 높임말 해제 (only for en, ja, cn)
                    driver.find_element(By.XPATH, '//*[@id="root"]/div/div[1]/section/div/div[1]/div[3]/div/div[6]/div/button').click()
                    is_formal = False
                    time.sleep(1.5)
                text = driver.find_element(By.XPATH, '//*[@id="txtTarget"]').text # translation result

            except: # 오류 발생 시 get 방식이 아닌 직접 텍스트를 입력하도록 시도
                driver.get(request_url.format(src_lang, tgt_lang, ''))
                time.sleep(1.5)
                driver.find_element(By.XPATH, '//*[@id="txtSource"]').send_keys(text)
                time.sleep(1.5)
                text = driver.find_element(By.XPATH, '//*[@id="txtTarget"]').text
            line['sentence_form'] = text
            if text == "":
                print("N/A")

    return json_data


In [4]:
# def translation(text_data, src_lang, tgt_lang, driver: WebDriver):
#     """
#     request format:
#     ?sk={src_lang}&tk={tgt_lang}&st={text}
#     """
    
#     request_url = 'https://papago.naver.com/?sk={}&tk={}'
#     driver.get(request_url.format(src_lang, tgt_lang))
#     time.sleep(1.5)
#     # 높임말 해제
#     if tgt_lang == 'ko':
#         driver.find_element(By.XPATH, '//*[@id="root"]/div/div[1]/section/div/div[1]/div[3]/div/div[6]/div/button').click()
#         disable_formal = True
#         time.sleep(1.5)
    
#     trans_list = []
#     with tqdm(text_data, total=len(text_data), desc=f'{src_lang}-{tgt_lang} Translation') as pbar:
#         for text in pbar:
#             driver.find_element(By.XPATH, '//*[@id="txtSource"]').send_keys(text)
#             time.sleep(2.0)
#             trans = driver.find_element(By.XPATH, '//*[@id="txtTarget"]').text
#             trans_list.append(trans)
#             driver.find_element(By.XPATH, '//*[@id="sourceEditArea"]/button').click()
#             time.sleep(0.1)

#     return trans_list


In [5]:
def augmentation_backtranslate(tgt_lang, file_in, file_out):
    json_data = []
    with jsonlines.open(file_in, 'r') as f:
        for line in f:
            json_data.append(line)

    print(len(json_data))

    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    # options.add_argument('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36')
    driver = webdriver.Chrome(options=options)
    forward = translation(json_data, 'ko', tgt_lang, driver)
    backward = translation(forward, tgt_lang, 'ko', driver)

    print(len(backward))

    driver.close()

    # assert len(text_data) == len(backward)

    with jsonlines.open(file_out, 'w') as f:
        f.write_all(backward)

    return forward, backward

In [6]:
# (대상 언어, 입력 파일명, 출력 파일명)
forward, backward = augmentation_backtranslate('en', './Dataset/neg.jsonl', './Dataset/neg_aug.jsonl')

8321


ko-en Translation:   1%|          | 84/8321 [01:28<2:24:36,  1.05s/it]

N/A


ko-en Translation:   1%|          | 86/8321 [01:30<2:24:56,  1.06s/it]

N/A


ko-en Translation:   6%|▌         | 473/8321 [08:19<2:18:31,  1.06s/it]

N/A


ko-en Translation:   6%|▌         | 500/8321 [08:47<2:17:37,  1.06s/it]

N/A


ko-en Translation:   9%|▉         | 786/8321 [13:49<2:13:10,  1.06s/it]

N/A


ko-en Translation:  10%|▉         | 828/8321 [14:34<2:12:47,  1.06s/it]

N/A


ko-en Translation:  13%|█▎        | 1068/8321 [18:47<2:07:07,  1.05s/it]

N/A


ko-en Translation:  14%|█▍        | 1165/8321 [20:29<2:06:30,  1.06s/it]

N/A


ko-en Translation:  26%|██▋       | 2192/8321 [38:31<1:48:09,  1.06s/it]

N/A


ko-en Translation:  26%|██▋       | 2198/8321 [38:38<1:48:29,  1.06s/it]

N/A


ko-en Translation:  27%|██▋       | 2211/8321 [38:51<1:47:50,  1.06s/it]

N/A


ko-en Translation:  27%|██▋       | 2225/8321 [39:06<1:46:58,  1.05s/it]

N/A


ko-en Translation:  30%|██▉       | 2496/8321 [43:52<1:42:42,  1.06s/it]

N/A


ko-en Translation:  42%|████▏     | 3535/8321 [1:02:07<1:24:29,  1.06s/it]

N/A


ko-en Translation:  43%|████▎     | 3562/8321 [1:02:35<1:23:11,  1.05s/it]

N/A


ko-en Translation:  43%|████▎     | 3565/8321 [1:02:39<1:23:37,  1.06s/it]

N/A


ko-en Translation:  45%|████▍     | 3740/8321 [1:05:43<1:21:11,  1.06s/it]

N/A


ko-en Translation:  51%|█████     | 4211/8321 [1:14:00<1:12:17,  1.06s/it]

N/A


ko-en Translation:  52%|█████▏    | 4306/8321 [1:15:40<1:10:46,  1.06s/it]

N/A


ko-en Translation:  54%|█████▍    | 4508/8321 [1:19:13<1:06:41,  1.05s/it]

N/A


ko-en Translation:  58%|█████▊    | 4861/8321 [1:25:25<1:00:37,  1.05s/it]

N/A


ko-en Translation:  59%|█████▉    | 4889/8321 [1:25:54<1:00:10,  1.05s/it]

N/A


ko-en Translation:  61%|██████    | 5072/8321 [1:29:07<56:59,  1.05s/it]  

N/A


ko-en Translation:  61%|██████▏   | 5102/8321 [1:29:39<56:58,  1.06s/it]

N/A


ko-en Translation:  63%|██████▎   | 5283/8321 [1:32:50<53:38,  1.06s/it]

N/A


ko-en Translation:  66%|██████▌   | 5467/8321 [1:36:04<50:17,  1.06s/it]

N/A


ko-en Translation:  66%|██████▌   | 5512/8321 [1:36:52<49:35,  1.06s/it]

N/A


ko-en Translation:  78%|███████▊  | 6495/8321 [1:54:08<32:12,  1.06s/it]

N/A


ko-en Translation:  78%|███████▊  | 6506/8321 [1:54:19<31:57,  1.06s/it]

N/A


ko-en Translation:  81%|████████  | 6710/8321 [1:57:54<28:18,  1.05s/it]

N/A


ko-en Translation:  81%|████████  | 6720/8321 [1:58:05<28:08,  1.05s/it]

N/A


ko-en Translation:  81%|████████  | 6757/8321 [1:58:44<27:31,  1.06s/it]

N/A


ko-en Translation:  84%|████████▍ | 7028/8321 [2:03:29<22:45,  1.06s/it]

N/A


ko-en Translation:  86%|████████▋ | 7191/8321 [2:06:21<19:43,  1.05s/it]

N/A


ko-en Translation:  92%|█████████▏| 7624/8321 [2:13:57<12:20,  1.06s/it]

N/A


ko-en Translation:  98%|█████████▊| 8145/8321 [2:23:06<03:05,  1.05s/it]

N/A


ko-en Translation: 100%|██████████| 8321/8321 [2:26:12<00:00,  1.05s/it]
en-ko Translation:   1%|          | 75/8321 [01:20<2:23:51,  1.05s/it]

N/A


en-ko Translation:   1%|          | 84/8321 [01:29<2:24:20,  1.05s/it]

N/A


en-ko Translation:   1%|          | 86/8321 [01:31<2:25:00,  1.06s/it]

N/A


en-ko Translation:   2%|▏         | 200/8321 [03:31<2:21:40,  1.05s/it]

N/A


en-ko Translation:   3%|▎         | 239/8321 [04:12<2:20:54,  1.05s/it]

N/A


en-ko Translation:   4%|▍         | 359/8321 [06:18<2:19:40,  1.05s/it]

N/A


en-ko Translation:   6%|▌         | 467/8321 [08:12<2:18:04,  1.05s/it]

N/A


en-ko Translation:   6%|▌         | 473/8321 [08:18<2:17:29,  1.05s/it]

N/A


en-ko Translation:   6%|▌         | 500/8321 [08:46<2:16:27,  1.05s/it]

N/A


en-ko Translation:   9%|▉         | 786/8321 [13:47<2:11:37,  1.05s/it]

N/A


en-ko Translation:  10%|▉         | 828/8321 [14:31<2:11:00,  1.05s/it]

N/A


en-ko Translation:  11%|█         | 923/8321 [16:11<2:09:17,  1.05s/it]

N/A


en-ko Translation:  13%|█▎        | 1068/8321 [18:43<2:07:22,  1.05s/it]

N/A


en-ko Translation:  14%|█▍        | 1165/8321 [20:25<2:05:15,  1.05s/it]

N/A


en-ko Translation:  19%|█▉        | 1565/8321 [27:26<1:58:28,  1.05s/it]

N/A


en-ko Translation:  20%|█▉        | 1629/8321 [28:33<1:56:33,  1.04s/it]

N/A


en-ko Translation:  21%|██        | 1761/8321 [30:52<1:54:55,  1.05s/it]

N/A


en-ko Translation:  25%|██▌       | 2091/8321 [36:39<1:49:16,  1.05s/it]

N/A


en-ko Translation:  26%|██▌       | 2164/8321 [37:56<1:48:04,  1.05s/it]

N/A


en-ko Translation:  26%|██▋       | 2192/8321 [38:25<1:47:24,  1.05s/it]

N/A


en-ko Translation:  26%|██▋       | 2194/8321 [38:27<1:47:15,  1.05s/it]

N/A


en-ko Translation:  26%|██▋       | 2198/8321 [38:31<1:47:26,  1.05s/it]

N/A


en-ko Translation:  27%|██▋       | 2211/8321 [38:45<1:47:20,  1.05s/it]

N/A


en-ko Translation:  27%|██▋       | 2225/8321 [39:00<1:46:54,  1.05s/it]

N/A


en-ko Translation:  28%|██▊       | 2312/8321 [40:31<1:45:26,  1.05s/it]

N/A


en-ko Translation:  28%|██▊       | 2317/8321 [40:37<1:45:42,  1.06s/it]

N/A


en-ko Translation:  30%|██▉       | 2496/8321 [43:45<1:41:42,  1.05s/it]

N/A


en-ko Translation:  34%|███▎      | 2797/8321 [49:01<1:36:55,  1.05s/it]

N/A


en-ko Translation:  34%|███▍      | 2858/8321 [50:05<1:35:30,  1.05s/it]

N/A


en-ko Translation:  35%|███▌      | 2950/8321 [51:42<1:34:07,  1.05s/it]

N/A


en-ko Translation:  36%|███▋      | 3018/8321 [52:53<1:33:21,  1.06s/it]

N/A


en-ko Translation:  38%|███▊      | 3137/8321 [54:58<1:30:37,  1.05s/it]

N/A


en-ko Translation:  39%|███▉      | 3226/8321 [56:32<1:29:19,  1.05s/it]

N/A


en-ko Translation:  41%|████      | 3384/8321 [59:18<1:27:04,  1.06s/it]

N/A


en-ko Translation:  42%|████▏     | 3525/8321 [1:01:46<1:23:49,  1.05s/it]

N/A


en-ko Translation:  42%|████▏     | 3535/8321 [1:01:56<1:23:41,  1.05s/it]

N/A


en-ko Translation:  43%|████▎     | 3554/8321 [1:02:16<1:23:25,  1.05s/it]

N/A


en-ko Translation:  43%|████▎     | 3562/8321 [1:02:25<1:23:23,  1.05s/it]

N/A


en-ko Translation:  43%|████▎     | 3565/8321 [1:02:28<1:23:21,  1.05s/it]

N/A


en-ko Translation:  44%|████▍     | 3651/8321 [1:03:58<1:21:54,  1.05s/it]

N/A


en-ko Translation:  45%|████▍     | 3740/8321 [1:05:32<1:19:53,  1.05s/it]

N/A


en-ko Translation:  50%|████▉     | 4153/8321 [1:12:46<1:13:01,  1.05s/it]

N/A


en-ko Translation:  50%|█████     | 4186/8321 [1:13:20<1:12:26,  1.05s/it]

N/A


en-ko Translation:  51%|█████     | 4211/8321 [1:13:47<1:12:02,  1.05s/it]

N/A


en-ko Translation:  52%|█████▏    | 4306/8321 [1:15:27<1:10:04,  1.05s/it]

N/A


en-ko Translation:  54%|█████▍    | 4508/8321 [1:18:59<1:06:45,  1.05s/it]

N/A


en-ko Translation:  55%|█████▍    | 4551/8321 [1:19:44<1:06:09,  1.05s/it]

N/A


en-ko Translation:  58%|█████▊    | 4861/8321 [1:25:10<1:00:49,  1.05s/it]

N/A


en-ko Translation:  59%|█████▉    | 4889/8321 [1:25:39<1:00:02,  1.05s/it]

N/A


en-ko Translation:  59%|█████▉    | 4915/8321 [1:26:07<1:00:11,  1.06s/it]

N/A


en-ko Translation:  60%|█████▉    | 4966/8321 [1:27:00<58:29,  1.05s/it]  

N/A


en-ko Translation:  61%|██████    | 5072/8321 [1:28:52<56:52,  1.05s/it]

N/A


en-ko Translation:  61%|██████▏   | 5102/8321 [1:29:23<56:26,  1.05s/it]

N/A


en-ko Translation:  63%|██████▎   | 5222/8321 [1:31:29<54:17,  1.05s/it]

N/A


en-ko Translation:  63%|██████▎   | 5283/8321 [1:32:34<53:24,  1.05s/it]

N/A


en-ko Translation:  66%|██████▌   | 5467/8321 [1:35:47<50:18,  1.06s/it]

N/A


en-ko Translation:  66%|██████▌   | 5512/8321 [1:36:34<49:03,  1.05s/it]

N/A


en-ko Translation:  67%|██████▋   | 5605/8321 [1:38:12<47:34,  1.05s/it]

N/A


en-ko Translation:  69%|██████▉   | 5768/8321 [1:41:04<44:44,  1.05s/it]

N/A


en-ko Translation:  73%|███████▎  | 6059/8321 [1:46:10<39:23,  1.04s/it]

N/A


en-ko Translation:  78%|███████▊  | 6460/8321 [1:53:11<32:36,  1.05s/it]

N/A


en-ko Translation:  78%|███████▊  | 6495/8321 [1:53:48<31:59,  1.05s/it]

N/A


en-ko Translation:  78%|███████▊  | 6506/8321 [1:54:00<31:57,  1.06s/it]

N/A


en-ko Translation:  78%|███████▊  | 6525/8321 [1:54:20<31:42,  1.06s/it]

N/A


en-ko Translation:  79%|███████▉  | 6564/8321 [1:55:01<30:41,  1.05s/it]

N/A


en-ko Translation:  79%|███████▉  | 6583/8321 [1:55:21<30:29,  1.05s/it]

N/A


en-ko Translation:  81%|████████  | 6710/8321 [1:57:34<28:07,  1.05s/it]

N/A


en-ko Translation:  81%|████████  | 6720/8321 [1:57:45<28:06,  1.05s/it]

N/A


en-ko Translation:  81%|████████  | 6757/8321 [1:58:24<27:19,  1.05s/it]

N/A


en-ko Translation:  84%|████████▍ | 7028/8321 [2:03:09<22:44,  1.06s/it]

N/A


en-ko Translation:  85%|████████▍ | 7061/8321 [2:03:44<22:09,  1.06s/it]

N/A


en-ko Translation:  86%|████████▋ | 7191/8321 [2:06:00<19:43,  1.05s/it]

N/A


en-ko Translation:  92%|█████████▏| 7624/8321 [2:13:35<12:14,  1.05s/it]

N/A


en-ko Translation:  98%|█████████▊| 8116/8321 [2:22:13<03:35,  1.05s/it]

N/A


en-ko Translation:  98%|█████████▊| 8145/8321 [2:22:43<03:05,  1.05s/it]

N/A


en-ko Translation: 100%|██████████| 8321/8321 [2:25:48<00:00,  1.05s/it]


8321


# Postprocess


### Check N/A

In [10]:
count = 0
lst = []
with jsonlines.open('./Dataset/neg_aug.jsonl', 'r') as f:
    for line in f:
        if len(line['sentence_form']) == 0: # 오류로 인해 빈 줄이 들어가진 않았는지 확인
            # print (line['sentence_form'])
            lst.append(line['id'])
            count += 1
          
print(count)

75


In [12]:
count = 0
new_lst = []
with jsonlines.open('./Dataset/neg_aug.jsonl', 'r') as f:
    for line in f:
        if len(line['sentence_form']) > 0: # 오류로 인해 빈 줄이 들어가진 않았는지 확인
            # print (line['sentence_form'])
            new_lst.append(line)
            count += 1
          
with jsonlines.open('./Dataset/neg_aug.jsonl', 'w') as f:
    f.write_all(new_lst)

### Merge multiple augmented files

In [14]:
files = [
    './Dataset/train_preprocess.jsonl',
    './Dataset/train_aug_en.jsonl',
    './Dataset/train_aug_jp.jsonl',
    './Dataset/train_aug_cn.jsonl',
    './Dataset/train_aug_es.jsonl',
]

data = []
for i, file in enumerate(files):
    file_data = []
    with jsonlines.open(file, 'r') as f:
        for line in f:
            file_data.append(line)
    data.append(file_data)

with jsonlines.open('./Dataset/train_aug_merged.jsonl', 'w') as f:
    for i in range(len(data[0])):
        for j in range(len(files)):
            f.write(data[j][i])

In [19]:
files = [
    './Dataset/neg.jsonl',
    './Dataset/neg_aug.jsonl',
]

data = []
for i, file in enumerate(files):
    file_data = []
    with jsonlines.open(file, 'r') as f:
        for line in f:
            file_data.append(line)
    data.append(file_data)

with jsonlines.open('./Dataset/neg_merged.jsonl', 'w') as f:
    for i in range(len(files)):
        for j in range(len(data[i])):
            f.write(data[i][j])

In [23]:
files = [
    './Dataset/neg_merged.jsonl',
]

data = []
for i, file in enumerate(files):
    file_data = []
    with jsonlines.open(file, 'r') as f:
        for line in f:
            data.append(line)

sorted(data, key=lambda x: x['id'])
print(data[:10])
# with jsonlines.open('./Dataset/neg_merged_1.jsonl', 'w') as f:
#     for j in range(len(data)):
#         f.write(data[j])

[{'id': 'sentiment-00003', 'sentence_form': '바다쪽으로 내려 가보려고 내려왔는데.. 손가락만 한 바다 바퀴벌레 진짜 많았음.. 여태 본 바다바퀴벌레는 그냥 애기 수준?', 'annotation': [['', [], 'negative']]}, {'id': 'sentiment-00012', 'sentence_form': '아닌가... 맞나.. 는 모르지만 너무 옛기억 아니 맞다', 'annotation': [['', [], 'negative']]}, {'id': 'sentiment-00022', 'sentence_form': '너무 넓어서 너무 아깝기도 한답니다.', 'annotation': [['', [], 'negative']]}, {'id': 'sentiment-00034', 'sentence_form': '단점이라면 주차가 부족하고 주차비가 정말 비싸다.', 'annotation': [['', [], 'negative']]}, {'id': 'sentiment-00038', 'sentence_form': '하지만 찌개가 거기에 못 따라가는 느낌. 그래도 맛은 개인 취향이니까 슴슴 심심한 맛이 내겐 안 맞을 뿐.', 'annotation': [['', [], 'negative']]}, {'id': 'sentiment-00048', 'sentence_form': '나중에 들어보니 이 구간을 택시 기사님들이 상당히 꺼려 하신대요..', 'annotation': [['', [], 'negative']]}, {'id': 'sentiment-00071', 'sentence_form': '지역적으로 레포츠와 축제 음식 등 서울과 수도권 인구의 휴식처로 자리매김하는 중이지만 아직까지 저평가 받고 있는데요', 'annotation': [['', [], 'negative']]}, {'id': 'sentiment-00078', 'sentence_form': '자판기밖에 없어서 카드로는 안되고 꼭 현금이 있어야해요.', 'annotation':