Experiment 2: Knowledge Transfer

In [1]:
from transformers import MarianMTModel, MarianTokenizer
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer
from tqdm import tqdm
import time
import os
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the knowledge-transfer table; 'utf-8-sig' strips a leading BOM if the file has one.
knowledge_transfer_csv = '/Volumes/T7/mt-hemingway/data/hypothesis/knowledge_transfer.csv'
data = pd.read_csv(knowledge_transfer_csv, encoding='utf-8-sig')
# Disabled step, kept for reference: dropping the columns of earlier experiments.
# data=data.drop(['small', 'big', 'OPUS_small', 'OPUS_big', 'kakao_org', 'M2M100_org', 'M2M100_small', 'kakao_small', 'M2M100_big', 'kakao_big', 'google_org', 'google_small', 'google_big'], axis=1)

In [3]:
# Show which columns are present in the loaded table.
data.columns

Index(['original', 'MarianMT_DE', 'MarianMT_DE_M2M100_KO',
       'MarianMT_DE_MBart_KO', 'M2M100_DE', 'M2M100_DE_MBart_KO'],
      dtype='object')

Experiment 2-1: MarianMT + M2M100 / MarianMT + MBart

In [18]:
# Load the pretrained MarianMT tokenizer and model from the opus-mt-en-de checkpoint.
marian_en_de_checkpoint = "Helsinki-NLP/opus-mt-en-de"
en_de_tokenizer = MarianTokenizer.from_pretrained(marian_en_de_checkpoint)
en_de_model = MarianMTModel.from_pretrained(marian_en_de_checkpoint)

Downloading (…)olve/main/source.spm: 100%|██████████| 768k/768k [00:00<00:00, 820kB/s]
Downloading (…)olve/main/target.spm: 100%|██████████| 797k/797k [00:00<00:00, 3.91MB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.27M/1.27M [00:00<00:00, 6.69MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 42.0/42.0 [00:00<00:00, 16.3kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.33k/1.33k [00:00<00:00, 725kB/s]
Downloading (…)"pytorch_model.bin";: 100%|██████████| 298M/298M [02:49<00:00, 1.75MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 112kB/s]


In [26]:
# Translate every entry of the 'original' column with the MarianMT model and
# collect the decoded text in `translated` (one entry per row, in row order).
translated=[]
for for_translate in tqdm(data['original']):
    try:
        encoded_input=en_de_tokenizer(for_translate, return_tensors='pt')
        output=en_de_model.generate(**encoded_input)
        # The keyword was previously misspelled, so special tokens were never
        # skipped and had to be stripped by hand; with the correct spelling the
        # tokenizer drops them during decoding.
        out_text=en_de_tokenizer.batch_decode(output, skip_special_tokens=True)[0]
        translated.append(out_text.strip())
    except TypeError:
        # Best-effort fallback: keep the untranslated cell value (not the whole
        # row) so the list stays aligned with the frame.
        # NOTE(review): presumably triggered by non-string cells such as NaN — confirm.
        translated.append(for_translate)

768it [12:10,  1.05it/s]


In [27]:
# Store the collected translations in the 'MarianMT_DE' column.
data['MarianMT_DE']=translated

In [30]:
# Load the M2M100 (418M) model and a tokenizer configured for German source / Korean target.
m2m100_checkpoint = "facebook/m2m100_418M"
de_ko_model = M2M100ForConditionalGeneration.from_pretrained(m2m100_checkpoint)
de_ko_tokenizer = M2M100Tokenizer.from_pretrained(
    m2m100_checkpoint,
    src_lang="de",
    tgt_lang="ko",
)

Downloading (…)neration_config.json: 100%|██████████| 233/233 [00:00<00:00, 139kB/s]


In [34]:
# Translate each 'MarianMT_DE' entry to Korean with M2M100; on TypeError the
# source value is kept unchanged so the list stays aligned with the frame.
translated = []
for german_text in tqdm(data['MarianMT_DE']):
    try:
        model_inputs = de_ko_tokenizer(german_text, return_tensors='pt')
        korean_bos_id = de_ko_tokenizer.get_lang_id("ko")
        generated_ids = de_ko_model.generate(
            **model_inputs,
            forced_bos_token_id=korean_bos_id,
            max_length=1024,
        )
        decoded_batch = de_ko_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        translated.append(decoded_batch[0])
    except TypeError:
        translated.append(german_text)

100%|██████████| 768/768 [1:04:22<00:00,  5.03s/it]


In [38]:
# Store the collected translations in the 'MarianMT_DE_M2M100_KO' column.
data['MarianMT_DE_M2M100_KO']=translated

In [6]:
# Load the mBART-50 many-to-many model and its fast tokenizer from the same checkpoint.
mbart_checkpoint = "facebook/mbart-large-50-many-to-many-mmt"
de_ko_model = MBartForConditionalGeneration.from_pretrained(mbart_checkpoint)
de_ko_tokenizer = MBart50TokenizerFast.from_pretrained(mbart_checkpoint)

In [21]:
# Translate each 'MarianMT_DE' entry from German (de_DE) to Korean (ko_KR) with mBART-50.
# The language settings are the same for every row, so they are set once up front.
de_ko_tokenizer.src_lang = "de_DE"
target_lang_id = de_ko_tokenizer.lang_code_to_id["ko_KR"]

translated=[]
for for_translate in tqdm(data['MarianMT_DE']):
    try:
        # Inputs longer than 1024 tokens are truncated.
        encoded_hi=de_ko_tokenizer(for_translate, return_tensors='pt', max_length=1024, truncation=True)
        generated_tokens = de_ko_model.generate(**encoded_hi, forced_bos_token_id=target_lang_id, max_length=1024)
        translated.append(de_ko_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        # Best-effort fallback: keep the untranslated cell value (not the whole
        # row Series) so the list stays aligned with the frame.
        # NOTE(review): presumably triggered by non-string cells such as NaN — confirm.
        translated.append(for_translate)

768it [4:11:13, 19.63s/it] 


In [22]:
# Spot-check the first two translations.
translated[:2]

['그는 홀로 스키프에 낚시를 했습니다. 그는 홀로 스키프에 낚시를 했습니다. 그는 홀로 스키프에 낚시를 했습니다. 그는 홀로 스키프에 낚시를 했습니다. 그는 홀로 스키프에 낚시를 했습니다.',
 '40일 동안에 한 남자가 그와 함께 했습니다. 하지만 40일 동안에 한 남자의 부모님이 그에게 말했습니다. 그 old 남자는 이제 궁하게 사라오라고요. 그건 최악의 불행입니다. 그리고 그 남자는 그녀의 명령에 따라 다른 배로 갔습니다. 그 배는 그 첫 주에 세 마리의 좋은 물고를 잡았습니다.']

In [23]:
# Store the collected translations in the 'MarianMT_DE_MBart_KO' column.
data['MarianMT_DE_MBart_KO']=translated

In [24]:
# Save the frame to the knowledge-transfer CSV (UTF-8 with BOM, index column omitted).
knowledge_transfer_csv = '/Volumes/T7/mt-hemingway/data/hypothesis/knowledge_transfer.csv'
data.to_csv(knowledge_transfer_csv, index=False, encoding='utf-8-sig')

Experiment 2-2: M2M100 + MBart

In [25]:
# Load the M2M100 (418M) model and a tokenizer configured for English source / German target.
m2m100_checkpoint = "facebook/m2m100_418M"
en_de_model = M2M100ForConditionalGeneration.from_pretrained(m2m100_checkpoint)
en_de_tokenizer = M2M100Tokenizer.from_pretrained(
    m2m100_checkpoint,
    src_lang="en",
    tgt_lang="de",
)

In [28]:
# Translate each 'original' entry to German with M2M100; on TypeError the
# source value is kept unchanged so the list stays aligned with the frame.
translated=[]
for source_text in tqdm(data['original']):
    try:
        encoded_hi=en_de_tokenizer(source_text, return_tensors='pt')
        # max_length=1024 matches the other M2M100 generate calls in this
        # notebook; without it, generation is capped by the checkpoint's default
        # length limit, which can cut long passages short.
        generated_tokens = en_de_model.generate(
            **encoded_hi,
            forced_bos_token_id=en_de_tokenizer.get_lang_id("de"),
            max_length=1024,
        )
        translated.append(en_de_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        # NOTE(review): presumably triggered by non-string cells such as NaN — confirm.
        translated.append(source_text)

100%|██████████| 768/768 [1:04:06<00:00,  5.01s/it]


In [30]:
# Store the collected translations in the 'M2M100_DE' column.
data['M2M100_DE']=translated

In [32]:
# Load the mBART-50 many-to-many model and its fast tokenizer from the same checkpoint.
mbart_checkpoint = "facebook/mbart-large-50-many-to-many-mmt"
de_ko_model = MBartForConditionalGeneration.from_pretrained(mbart_checkpoint)
de_ko_tokenizer = MBart50TokenizerFast.from_pretrained(mbart_checkpoint)

In [37]:
# Translate each 'M2M100_DE' entry from German (de_DE) to Korean (ko_KR) with mBART-50.
# The language settings are the same for every row, so they are set once up front.
de_ko_tokenizer.src_lang = "de_DE"
target_lang_id = de_ko_tokenizer.lang_code_to_id["ko_KR"]

translated=[]
for for_translate in tqdm(data['M2M100_DE']):
    try:
        # Inputs longer than 1024 tokens are truncated.
        encoded_hi=de_ko_tokenizer(for_translate, return_tensors='pt', max_length=1024, truncation=True)
        generated_tokens = de_ko_model.generate(**encoded_hi, forced_bos_token_id=target_lang_id, max_length=1024)
        translated.append(de_ko_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        # Best-effort fallback: keep the untranslated cell value (not the whole
        # row Series) so the list stays aligned with the frame.
        # NOTE(review): presumably triggered by non-string cells such as NaN — confirm.
        translated.append(for_translate)

768it [3:02:49, 14.28s/it] 


In [38]:
# Store the collected translations in the 'M2M100_DE_MBart_KO' column.
data['M2M100_DE_MBart_KO']=translated 

In [39]:
# Save the frame to the knowledge-transfer CSV (UTF-8 with BOM, index column omitted).
knowledge_transfer_csv = '/Volumes/T7/mt-hemingway/data/hypothesis/knowledge_transfer.csv'
data.to_csv(knowledge_transfer_csv, index=False, encoding='utf-8-sig')

Experiment 2-3: MBart + M2M100

In [4]:
# Load the mBART-50 many-to-many model and its fast tokenizer from the same checkpoint.
mbart_checkpoint = "facebook/mbart-large-50-many-to-many-mmt"
en_de_model = MBartForConditionalGeneration.from_pretrained(mbart_checkpoint)
en_de_tokenizer = MBart50TokenizerFast.from_pretrained(mbart_checkpoint)

In [9]:
# Translate each 'original' entry from English (en_XX) to German (de_DE) with mBART-50.
# The language settings are the same for every row, so they are set once up front.
en_de_tokenizer.src_lang = "en_XX"
target_lang_id = en_de_tokenizer.lang_code_to_id["de_DE"]

translated=[]
for for_translate in tqdm(data['original']):
    try:
        # Inputs longer than 1024 tokens are truncated.
        encoded_hi=en_de_tokenizer(for_translate, return_tensors='pt', max_length=1024, truncation=True)
        generated_tokens = en_de_model.generate(**encoded_hi, forced_bos_token_id=target_lang_id, max_length=1024)
        translated.append(en_de_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        # Best-effort fallback: keep the untranslated cell value (not the whole
        # row Series) so the list stays aligned with the frame.
        # NOTE(review): presumably triggered by non-string cells such as NaN — confirm.
        translated.append(for_translate)

768it [1:07:05,  5.24s/it]


In [10]:
# Store the collected translations in the 'MBart_DE' column.
data['MBart_DE']=translated

In [11]:
# Load the M2M100 (418M) model and a tokenizer configured for German source / Korean target.
m2m100_checkpoint = "facebook/m2m100_418M"
de_ko_model = M2M100ForConditionalGeneration.from_pretrained(m2m100_checkpoint)
de_ko_tokenizer = M2M100Tokenizer.from_pretrained(
    m2m100_checkpoint,
    src_lang="de",
    tgt_lang="ko",
)

In [14]:
# Translate each 'MBart_DE' entry to Korean with M2M100; on TypeError the
# source value is kept unchanged so the list stays aligned with the frame.
translated = []
for german_text in tqdm(data['MBart_DE']):
    try:
        model_inputs = de_ko_tokenizer(german_text, return_tensors='pt')
        korean_bos_id = de_ko_tokenizer.get_lang_id("ko")
        generated_ids = de_ko_model.generate(
            **model_inputs,
            forced_bos_token_id=korean_bos_id,
            max_length=1024,
        )
        decoded_batch = de_ko_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        translated.append(decoded_batch[0])
    except TypeError:
        translated.append(german_text)

100%|██████████| 768/768 [1:06:00<00:00,  5.16s/it]


In [15]:
# Store the collected translations in the 'MBart_DE_M2M100_KO' column.
data['MBart_DE_M2M100_KO']=translated

In [16]:
# Save the frame to the knowledge-transfer CSV (UTF-8 with BOM, index column omitted).
knowledge_transfer_csv = '/Volumes/T7/mt-hemingway/data/hypothesis/knowledge_transfer.csv'
data.to_csv(knowledge_transfer_csv, index=False, encoding='utf-8-sig')

Experiment 2-4: M2M100 + M2M100

In [17]:
# Load the M2M100 (418M) model and a tokenizer configured for German source / Korean target.
m2m100_checkpoint = "facebook/m2m100_418M"
de_ko_model = M2M100ForConditionalGeneration.from_pretrained(m2m100_checkpoint)
de_ko_tokenizer = M2M100Tokenizer.from_pretrained(
    m2m100_checkpoint,
    src_lang="de",
    tgt_lang="ko",
)

In [18]:
# Translate each 'M2M100_DE' entry to Korean with M2M100; on TypeError the
# source value is kept unchanged so the list stays aligned with the frame.
translated = []
for german_text in tqdm(data['M2M100_DE']):
    try:
        model_inputs = de_ko_tokenizer(german_text, return_tensors='pt')
        korean_bos_id = de_ko_tokenizer.get_lang_id("ko")
        generated_ids = de_ko_model.generate(
            **model_inputs,
            forced_bos_token_id=korean_bos_id,
            max_length=1024,
        )
        decoded_batch = de_ko_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        translated.append(decoded_batch[0])
    except TypeError:
        translated.append(german_text)

100%|██████████| 768/768 [59:51<00:00,  4.68s/it]  


In [19]:
# Store the collected translations in the 'M2M100_DE_M2M100_KO' column.
data['M2M100_DE_M2M100_KO']=translated

In [20]:
# Save the frame to the knowledge-transfer CSV (UTF-8 with BOM, index column omitted).
knowledge_transfer_csv = '/Volumes/T7/mt-hemingway/data/hypothesis/knowledge_transfer.csv'
data.to_csv(knowledge_transfer_csv, index=False, encoding='utf-8-sig')

Experiment 2-5: MBart + MBart

In [21]:
# Load the mBART-50 many-to-many model and its fast tokenizer from the same checkpoint.
mbart_checkpoint = "facebook/mbart-large-50-many-to-many-mmt"
de_ko_model = MBartForConditionalGeneration.from_pretrained(mbart_checkpoint)
de_ko_tokenizer = MBart50TokenizerFast.from_pretrained(mbart_checkpoint)

In [22]:
# Translate each 'MBart_DE' entry from German (de_DE) to Korean (ko_KR) with mBART-50.
# The language settings are the same for every row, so they are set once up front.
de_ko_tokenizer.src_lang = "de_DE"
target_lang_id = de_ko_tokenizer.lang_code_to_id["ko_KR"]

translated=[]
for for_translate in tqdm(data['MBart_DE']):
    try:
        # Inputs longer than 1024 tokens are truncated.
        encoded_hi=de_ko_tokenizer(for_translate, return_tensors='pt', max_length=1024, truncation=True)
        generated_tokens = de_ko_model.generate(**encoded_hi, forced_bos_token_id=target_lang_id, max_length=1024)
        translated.append(de_ko_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        # Best-effort fallback: keep the untranslated cell value (not the whole
        # row Series) so the list stays aligned with the frame.
        # NOTE(review): presumably triggered by non-string cells such as NaN — confirm.
        translated.append(for_translate)

768it [3:59:46, 18.73s/it] 


In [23]:
# Store the collected translations in the 'MBart_DE_MBart_KO' column.
data['MBart_DE_MBart_KO']=translated

In [24]:
# Save the frame to the knowledge-transfer CSV (UTF-8 with BOM, index column omitted).
knowledge_transfer_csv = '/Volumes/T7/mt-hemingway/data/hypothesis/knowledge_transfer.csv'
data.to_csv(knowledge_transfer_csv, index=False, encoding='utf-8-sig')

Experiment 1: Iterative Translation

1-1. MBart first round

In [2]:
# Load the source table for the iterative-translation experiment.
book_final_csv = '/Volumes/T7/mt-hemingway/data/book_final.csv'
exp1 = pd.read_csv(book_final_csv)

In [4]:
# Remove the listed columns so the frame starts from a reduced set for this experiment.
# Note: re-running this cell raises KeyError once the columns are gone.
columns_to_drop = [
    'small', 'big', 'OPUS_small', 'OPUS_big',
    'kakao_org', 'M2M100_org', 'M2M100_small', 'kakao_small',
    'M2M100_big', 'kakao_big', 'google_org', 'google_small', 'google_big',
]
exp1 = exp1.drop(columns=columns_to_drop)

In [5]:
# Load the mBART-50 many-to-many model and its fast tokenizer from the same checkpoint.
mbart_checkpoint = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(mbart_checkpoint)
tokenizer = MBart50TokenizerFast.from_pretrained(mbart_checkpoint)

In [6]:
# Display the frame after the column drop.
exp1

Unnamed: 0,original
0,He was an old man who fished alone in a skiff ...
1,In the first forty days a boy had been with hi...
2,It made the boy sad to see the old man come in...
3,"The sail was patched with flour sacks and, fur..."
4,The old man was thin and gaunt with deep wrink...
...,...
763,What's that? she asked a waiter and pointed to...
764,"Tiburon, the waiter said, ""Eshark."" He was mea..."
765,"I didn't know sharks had such handsome, beauti..."
766,"I didn't either, her male companion said."


In [11]:
# Round 1: translate each 'original' entry from English (en_XX) to Korean (ko_KR) with mBART-50.
# The language settings are the same for every row, so they are set once up front.
tokenizer.src_lang = "en_XX"
target_lang_id = tokenizer.lang_code_to_id["ko_KR"]

translated=[]
for for_translate in tqdm(exp1['original']):
    try:
        # Inputs longer than 1024 tokens are truncated.
        encoded_hi=tokenizer(for_translate, return_tensors='pt', max_length=1024, truncation=True)
        generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=target_lang_id, max_length=1024)
        translated.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        # Best-effort fallback: keep the untranslated cell value (not the whole
        # row Series) so the list stays aligned with the frame.
        # NOTE(review): presumably triggered by non-string cells such as NaN — confirm.
        translated.append(for_translate)

768it [1:24:30,  6.60s/it]


In [12]:
# Spot-check the first two translations.
translated[:2]

['그는 멕시코만에 있는 스키프에서 혼자 낚시를 한 늙은 남자였습니다. 그는 이제 84일 동안 물고기를 잡지 않고 갔습니다.',
 '처음 40일 동안 한 남자아이가 그와 함께 있었습니다. 하지만 40일 동안 물고기가 없었을 때, 그 남자아이의 부모님은 그에게 그 늙은 남자가 이제 분명히, 그리고 마침내 사라오라고 말했습니다. 그건 최악의 불행의 형태입니다. 그리고 그 남자아이는 그들 주문대로 다른 배로 갔습니다. 그 배는 첫 주에 세 마리의 좋은 물고기를 잡았습니다.']

In [13]:
# Store the collected translations in the 'MBart_ko1' column.
exp1['MBart_ko1']=translated

In [14]:
# Round 2: translate each 'MBart_ko1' entry from Korean (ko_KR) back to English (en_XX) with mBART-50.
# The language settings are the same for every row, so they are set once up front.
tokenizer.src_lang = "ko_KR"
target_lang_id = tokenizer.lang_code_to_id["en_XX"]

translated=[]
for for_translate in tqdm(exp1['MBart_ko1']):
    try:
        # Inputs longer than 1024 tokens are truncated.
        encoded_hi=tokenizer(for_translate, return_tensors='pt', max_length=1024, truncation=True)
        generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=target_lang_id, max_length=1024)
        translated.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        # Best-effort fallback: keep the untranslated cell value (not the whole
        # row Series) so the list stays aligned with the frame.
        # NOTE(review): presumably triggered by non-string cells such as NaN — confirm.
        translated.append(for_translate)

768it [1:17:22,  6.04s/it]


In [15]:
# Spot-check the first two translations.
translated[:2]

["He was an old man who fished alone on a skiff in the Gulf of Mexico, and he's gone without fishing for 84 days now.",
 "A boy was with him for the first 40 days, but when there was no fish for the first 40 days, the boy's parents told him that the old man must be gone now, and finally, that's the worst form of misery, and he went on to another boat on their orders, and the boat caught three good fish in the first week."]

In [16]:
# Store the collected translations in the 'MBart_en2' column.
exp1['MBart_en2']=translated

In [17]:
# Round 3: translate each 'MBart_en2' entry from English (en_XX) to Korean (ko_KR) with mBART-50.
# The language settings are the same for every row, so they are set once up front.
tokenizer.src_lang = "en_XX"
target_lang_id = tokenizer.lang_code_to_id["ko_KR"]

translated=[]
for for_translate in tqdm(exp1['MBart_en2']):
    try:
        # Inputs longer than 1024 tokens are truncated.
        encoded_hi=tokenizer(for_translate, return_tensors='pt', max_length=1024, truncation=True)
        generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=target_lang_id, max_length=1024)
        translated.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        # Best-effort fallback: keep the untranslated cell value (not the whole
        # row Series) so the list stays aligned with the frame.
        # NOTE(review): presumably triggered by non-string cells such as NaN — confirm.
        translated.append(for_translate)

768it [1:31:05,  7.12s/it]


In [18]:
# Store the collected translations in the 'MBart_ko3' column.
exp1['MBart_ko3']=translated

In [19]:
# Round 4: translate each 'MBart_ko3' entry from Korean (ko_KR) back to English (en_XX) with mBART-50.
# The language settings are the same for every row, so they are set once up front.
tokenizer.src_lang = "ko_KR"
target_lang_id = tokenizer.lang_code_to_id["en_XX"]

translated=[]
for for_translate in tqdm(exp1['MBart_ko3']):
    try:
        # Inputs longer than 1024 tokens are truncated.
        encoded_hi=tokenizer(for_translate, return_tensors='pt', max_length=1024, truncation=True)
        generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=target_lang_id, max_length=1024)
        translated.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        # Best-effort fallback: keep the untranslated cell value (not the whole
        # row Series) so the list stays aligned with the frame.
        # NOTE(review): presumably triggered by non-string cells such as NaN — confirm.
        translated.append(for_translate)

768it [1:18:27,  6.13s/it]


In [20]:
# Store the collected translations in the 'MBart_en4' column.
exp1['MBart_en4']=translated

In [21]:
# Round 5: translate each 'MBart_en4' entry from English (en_XX) to Korean (ko_KR) with mBART-50.
# The language settings are the same for every row, so they are set once up front.
tokenizer.src_lang = "en_XX"
target_lang_id = tokenizer.lang_code_to_id["ko_KR"]

translated=[]
for for_translate in tqdm(exp1['MBart_en4']):
    try:
        # Inputs longer than 1024 tokens are truncated.
        encoded_hi=tokenizer(for_translate, return_tensors='pt', max_length=1024, truncation=True)
        generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=target_lang_id, max_length=1024)
        translated.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        # Best-effort fallback: keep the untranslated cell value (not the whole
        # row Series) so the list stays aligned with the frame.
        # NOTE(review): presumably triggered by non-string cells such as NaN — confirm.
        translated.append(for_translate)

768it [1:26:34,  6.76s/it]


In [22]:
# Store the collected translations in the 'MBart_ko5' column.
exp1['MBart_ko5']=translated

In [14]:
# Remove literal '<unk>' placeholders from the MarianMT round-trip columns.
# NOTE(review): the frame displayed at the end of this notebook only contains
# MBart_* columns, so these MarianMT_* columns appear to belong to a different
# session — confirm. Missing columns are skipped (and reported) instead of
# aborting the run with a KeyError.
unk_columns = ('MarianMT_ko1', 'MarianMT_EN2', 'MarianMT_ko3', 'MarianMT_EN4')
skipped_columns = [column for column in unk_columns if column not in exp1.columns]
for column in unk_columns:
    if column in exp1.columns:
        # regex=False: '<unk>' is a literal marker, not a pattern.
        exp1[column] = exp1[column].str.replace('<unk>', '', regex=False)
if skipped_columns:
    print(f"Skipped '<unk>' cleanup for missing columns: {skipped_columns}")

In [23]:
# Save the iterative-translation frame to CSV (UTF-8 with BOM, index column omitted).
exp1_mbart_csv = '/Volumes/T7/mt-hemingway/data/hypothesis/exp1_MBart.csv'
exp1.to_csv(exp1_mbart_csv, index=False, encoding='utf-8-sig')

In [24]:
# Display the frame with all translation rounds added.
exp1

Unnamed: 0,original,MBart_ko1,MBart_en2,MBart_ko3,MBart_en4,MBart_ko5
0,He was an old man who fished alone in a skiff ...,그는 멕시코만에 있는 스키프에서 혼자 낚시를 한 늙은 남자였습니다. 그는 이제 84...,He was an old man who fished alone on a skiff ...,그는 멕시코만에 있는 스키프에서 혼자 어획하는 노인이었습니다. 그는 84일 동안 어...,He was an elderly man who fished alone on a sk...,그는 멕시코만에 있는 스키프에서 혼자 어획하는 노인이었습니다. 84일 동안 어획을 ...
1,In the first forty days a boy had been with hi...,처음 40일 동안 한 남자아이가 그와 함께 있었습니다. 하지만 40일 동안 물고기가...,"A boy was with him for the first 40 days, but ...",첫 40일 동안 한 남자아이가 그와 함께 있었습니다. 하지만 첫 40일 동안 물고기...,"A boy was with him for the first 40 days, but ...","한 소년은 처음 40일 동안 그와 함께 있었지만, 처음 40일 동안 물고기가 없었을..."
2,It made the boy sad to see the old man come in...,그 소년은 그 노년이 매일 스키프가 텅 비어 들어오는 것을 보고 슬픔을 느꼈습니다....,The boy was sad to see the old man coming in e...,그 소년은 그 노년이 매일 비어있는 스키프를 입고 오는 것을 보고 슬퍼했습니다. 그...,The boy was sad to see that the old man was co...,그 소년은 그 노년이 매일 비어있는 스키를 타고 오는 것을 보고 슬퍼했습니다. 그는...
3,"The sail was patched with flour sacks and, fur...","배는 빵 봉투로 덮여있었고, 털이 붙어서 영구적인 패배의 깃발처럼 보였습니다.","The ship was covered in a bag of bread, and it...","배는 빵 봉투로 덮여있었고, 장식이 되어서 영구적인 패배 깃발처럼 보였습니다.","The ship was covered in a bag of bread, and it...",배는 빵 가방에 덮여있었고 패배의 영구적인 깃발처럼 장식되어 있었습니다.
4,The old man was thin and gaunt with deep wrink...,늙은 남자는 얇고 엉덩이에 깊은 흉터가 있었습니다.,"The old man had thin, deep scars on his buttocks.",그 노인은 엉덩이에 얇고 깊은 상처가 있었습니다.,"The old man had a thin, profound scar on his b...",그 노인은 엉덩이에 얇고 심각한 상처가 있었습니다.
...,...,...,...,...,...,...
763,What's that? she asked a waiter and pointed to...,그게 뭘까요? 그녀는 웨이터에게 물어봤습니다. 그리고 그 거대한 물고기의 긴 척추를...,"What was that? She asked the waiter, and she p...",그게 무엇일까요? 그녀는 웨이터에게 물었습니다. 그리고 거대한 물고기의 긴 척추를 ...,"What is that? She asked the waiter, and she po...",그게 무엇일까요? 그녀는 웨이터에게 물었습니다. 그리고 그녀는 이 거대한 물고기의 ...
764,"Tiburon, the waiter said, ""Eshark."" He was mea...","웨이터는 ""에셔크""라고 말했습니다. 그는 무슨 일이 일어났는지 설명하고 싶었습니다.","And the waiter said, ""Escherk,"" and he wanted ...","그 웨이터가 ""에셔크""라고 말했습니다. 그는 무슨 일이 일어났는지 설명하고 싶었습니다.","And the waiter said, ""Escherk,"" and he wanted ...","그 웨이터가 ""에셔크""라고 말했습니다. 그는 무슨 일이 일어났는지 설명하고 싶었습니다."
765,"I didn't know sharks had such handsome, beauti...","저는 상어가 이렇게 멋진, 아름답게 형성된 꼬리를 가지고 있다는 것을 몰랐습니다.","I didn't know sharks had this beautiful, beaut...",저는 상어가 이렇게 아름답고 잘 형성된 꼬리를 가지고 있다는 것을 몰랐습니다.,"I didn't know sharks had this beautiful, well-...",저는 상어가 이렇게 아름답고 잘 형성된 꼬리를 가지고 있다는 것을 몰랐습니다.
766,"I didn't either, her male companion said.",저도 몰랐습니다. 그녀의 남자 동료가 말했습니다.,"I didn't know, her male colleagues said.",저는 몰랐습니다. 그녀의 남성 동료들은 말했습니다.,I didn't know. Her male colleagues told me.,저는 몰랐습니다. 그녀의 남성 동료들이 말했습니다.
