Experiment 2: Knowledge Transfer

In [1]:
from transformers import MarianMTModel, MarianTokenizer
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer
from tqdm import tqdm
import time
import os
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data=pd.read_csv('/Volumes/T7/mt-hemingway/data/hypothesis/knowledge_transfer.csv', encoding='utf-8-sig')
# data=data.drop(['small', 'big', 'OPUS_small', 'OPUS_big', 'kakao_org', 'M2M100_org', 'M2M100_small', 'kakao_small', 'M2M100_big', 'kakao_big', 'google_org', 'google_small', 'google_big'], axis=1)

In [3]:
data.columns

Index(['original', 'MarianMT_DE', 'MarianMT_DE_M2M100_KO',
       'MarianMT_DE_MBart_KO', 'M2M100_DE', 'M2M100_DE_MBart_KO'],
      dtype='object')

Experiment 2-1: MarianMT

In [18]:
en_de_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
en_de_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-de")

Downloading (…)olve/main/source.spm: 100%|██████████| 768k/768k [00:00<00:00, 820kB/s]
Downloading (…)olve/main/target.spm: 100%|██████████| 797k/797k [00:00<00:00, 3.91MB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.27M/1.27M [00:00<00:00, 6.69MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 42.0/42.0 [00:00<00:00, 16.3kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.33k/1.33k [00:00<00:00, 725kB/s]
Downloading (…)"pytorch_model.bin";: 100%|██████████| 298M/298M [02:49<00:00, 1.75MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 112kB/s]


In [26]:
translated=[]
for idx, val in tqdm(data.iterrows()):
    for_translate=val['original']
    try:
        encoded_input=en_de_tokenizer(for_translate, return_tensors='pt')
        output=en_de_model.generate(**encoded_input)
        out_text=en_de_tokenizer.batch_decode(output, skip_specual_tokens=True)
        out_text=out_text[0]
        out_text=out_text.replace('<pad>', '').replace('</s>', '')
        out_text=out_text.strip()
        translated.append(out_text)
    except TypeError:
        translated.append(val)
        continue

768it [12:10,  1.05it/s]


In [27]:
data['MarianMT_DE']=translated

In [30]:
de_ko_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
de_ko_tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="de", tgt_lang="ko")

Downloading (…)neration_config.json: 100%|██████████| 233/233 [00:00<00:00, 139kB/s]


In [34]:
translated=[]
for i in tqdm(data['MarianMT_DE']):
    try:
        encoded_hi=de_ko_tokenizer(i, return_tensors='pt')
        generated_tokens = de_ko_model.generate(**encoded_hi, forced_bos_token_id=de_ko_tokenizer.get_lang_id("ko"), max_length=1024)
        translated.append(de_ko_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        translated.append(i)
        continue

100%|██████████| 768/768 [1:04:22<00:00,  5.03s/it]


In [38]:
data['MarianMT_DE_M2M100_KO']=translated

In [6]:
de_ko_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
de_ko_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [21]:
translated=[]
for idx, val in tqdm(data.iterrows()):
    for_translate=val['MarianMT_DE']
    try:
        de_ko_tokenizer.src_lang = "de_DE"
        encoded_hi=de_ko_tokenizer(for_translate, return_tensors='pt', max_length=1024, truncation=True)
        generated_tokens = de_ko_model.generate(**encoded_hi, forced_bos_token_id=de_ko_tokenizer.lang_code_to_id["ko_KR"], max_length=1024)
        translated.append(de_ko_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        translated.append(val)
        continue

768it [4:11:13, 19.63s/it] 


In [22]:
translated[:2]

['그는 홀로 스키프에 낚시를 했습니다. 그는 홀로 스키프에 낚시를 했습니다. 그는 홀로 스키프에 낚시를 했습니다. 그는 홀로 스키프에 낚시를 했습니다. 그는 홀로 스키프에 낚시를 했습니다.',
 '40일 동안에 한 남자가 그와 함께 했습니다. 하지만 40일 동안에 한 남자의 부모님이 그에게 말했습니다. 그 old 남자는 이제 궁하게 사라오라고요. 그건 최악의 불행입니다. 그리고 그 남자는 그녀의 명령에 따라 다른 배로 갔습니다. 그 배는 그 첫 주에 세 마리의 좋은 물고를 잡았습니다.']

In [23]:
data['MarianMT_DE_MBart_KO']=translated

In [24]:
data.to_csv('/Volumes/T7/mt-hemingway/data/hypothesis/knowledge_transfer.csv', encoding='utf-8-sig', index=False)

Experiment 2-2: M2M100

In [25]:
en_de_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
en_de_tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="de")

In [28]:
translated=[]
for i in tqdm(data['original']):
    try:
        encoded_hi=en_de_tokenizer(i, return_tensors='pt')
        generated_tokens = en_de_model.generate(**encoded_hi, forced_bos_token_id=en_de_tokenizer.get_lang_id("de"))
        translated.append(en_de_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        translated.append(i)
        continue

100%|██████████| 768/768 [1:04:06<00:00,  5.01s/it]


In [30]:
data['M2M100_DE']=translated

In [32]:
de_ko_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
de_ko_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [37]:
translated=[]
for idx, val in tqdm(data.iterrows()):
    for_translate=val['M2M100_DE']
    try:
        de_ko_tokenizer.src_lang = "de_DE"
        encoded_hi=de_ko_tokenizer(for_translate, return_tensors='pt', max_length=1024, truncation=True)
        generated_tokens = de_ko_model.generate(**encoded_hi, forced_bos_token_id=de_ko_tokenizer.lang_code_to_id["ko_KR"], max_length=1024)
        translated.append(de_ko_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        translated.append(val)
        continue

768it [3:02:49, 14.28s/it] 


In [38]:
data['M2M100_DE_MBart_KO']=translated 

In [39]:
data.to_csv('/Volumes/T7/mt-hemingway/data/hypothesis/knowledge_transfer.csv', encoding='utf-8-sig', index=False)

Experiment 2-3: MBart

In [4]:
en_de_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
en_de_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [9]:
translated=[]
for idx, val in tqdm(data.iterrows()):
    for_translate=val['original']
    try:
        en_de_tokenizer.src_lang = "en_XX"
        encoded_hi=en_de_tokenizer(for_translate, return_tensors='pt', max_length=1024, truncation=True)
        generated_tokens = en_de_model.generate(**encoded_hi, forced_bos_token_id=en_de_tokenizer.lang_code_to_id["de_DE"], max_length=1024)
        translated.append(en_de_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        translated.append(val)
        continue

768it [1:07:05,  5.24s/it]


In [10]:
data['MBart_DE']=translated

In [11]:
de_ko_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
de_ko_tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="de", tgt_lang="ko")

In [14]:
translated=[]
for i in tqdm(data['MBart_DE']):
    try:
        encoded_hi=de_ko_tokenizer(i, return_tensors='pt')
        generated_tokens = de_ko_model.generate(**encoded_hi, forced_bos_token_id=de_ko_tokenizer.get_lang_id("ko"), max_length=1024)
        translated.append(de_ko_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        translated.append(i)
        continue

100%|██████████| 768/768 [1:06:00<00:00,  5.16s/it]


In [15]:
data['MBart_DE_M2M100_KO']=translated

In [16]:
data.to_csv('/Volumes/T7/mt-hemingway/data/hypothesis/knowledge_transfer.csv', encoding='utf-8-sig', index=False)

Experiment 2-4: M2M100 + M2M100

In [17]:
de_ko_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
de_ko_tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="de", tgt_lang="ko")

In [18]:
translated=[]
for i in tqdm(data['M2M100_DE']):
    try:
        encoded_hi=de_ko_tokenizer(i, return_tensors='pt')
        generated_tokens = de_ko_model.generate(**encoded_hi, forced_bos_token_id=de_ko_tokenizer.get_lang_id("ko"), max_length=1024)
        translated.append(de_ko_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        translated.append(i)
        continue

100%|██████████| 768/768 [59:51<00:00,  4.68s/it]  


In [19]:
data['M2M100_DE_M2M100_KO']=translated

In [20]:
data.to_csv('/Volumes/T7/mt-hemingway/data/hypothesis/knowledge_transfer.csv', encoding='utf-8-sig', index=False)

Experiment 2-5: MBart + MBart

In [21]:
de_ko_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
de_ko_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [22]:
translated=[]
for idx, val in tqdm(data.iterrows()):
    for_translate=val['MBart_DE']
    try:
        de_ko_tokenizer.src_lang = "de_DE"
        encoded_hi=de_ko_tokenizer(for_translate, return_tensors='pt', max_length=1024, truncation=True)
        generated_tokens = de_ko_model.generate(**encoded_hi, forced_bos_token_id=de_ko_tokenizer.lang_code_to_id["ko_KR"], max_length=1024)
        translated.append(de_ko_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    except TypeError:
        translated.append(val)
        continue

768it [3:59:46, 18.73s/it] 


In [23]:
data['MBart_DE_MBart_KO']=translated

In [24]:
data.to_csv('/Volumes/T7/mt-hemingway/data/hypothesis/knowledge_transfer.csv', encoding='utf-8-sig', index=False)

Experiment 1: Iterative Translation

1-1. MarianMT first round

In [2]:
exp1=pd.read_csv('/Volumes/T7/mt-hemingway/data/book_final.csv')

In [3]:
exp1=exp1.drop(['small', 'big', 'OPUS_small', 'OPUS_big', 'kakao_org', 'M2M100_org', 'M2M100_small', 'kakao_small', 'M2M100_big', 'kakao_big', 'google_org', 'google_small', 'google_big'], axis=1)

In [4]:
en_ko_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-ko")
en_ko_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-ko")

In [5]:
translated=[]
for idx, val in tqdm(exp1.iterrows()):
    for_translate=val['original']
    try:
        encoded_input=en_ko_tokenizer(for_translate, return_tensors='pt')
        output=en_ko_model.generate(**encoded_input)
        out_text=en_ko_tokenizer.batch_decode(output, skip_specual_tokens=True)
        out_text=out_text[0]
        out_text=out_text.replace('<pad>', '').replace('</s>', '')
        out_text=out_text.strip()
        translated.append(out_text)
    except TypeError:
        translated.append(val)
        continue

768it [52:31,  4.10s/it]


In [6]:
exp1['MarianMT_ko1']=translated

In [7]:
ko_en_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-ko-en")
ko_en_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-tc-big-ko-en")

In [8]:
translated=[]
for idx, val in tqdm(exp1.iterrows()):
    for_translate=val['MarianMT_ko1']
    try:
        encoded_input=ko_en_tokenizer(for_translate, return_tensors='pt')
        output=ko_en_model.generate(**encoded_input)
        out_text=ko_en_tokenizer.batch_decode(output, skip_specual_tokens=True)
        out_text=out_text[0]
        out_text=out_text.replace('<pad>', '').replace('</s>', '')
        out_text=out_text.strip()
        translated.append(out_text)
    except TypeError:
        translated.append(val)
        continue

539it [1:11:07,  7.62s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (760 > 512). Running this sequence through the model will result in indexing errors
768it [1:48:36,  8.48s/it]


In [9]:
exp1['MarianMT_EN2']=translated

In [11]:
en_ko_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-ko")
en_ko_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-ko")

In [12]:
translated=[]
for idx, val in tqdm(exp1.iterrows()):
    for_translate=val['MarianMT_EN2']
    try:
        encoded_input=en_ko_tokenizer(for_translate, return_tensors='pt')
        output=en_ko_model.generate(**encoded_input)
        out_text=en_ko_tokenizer.batch_decode(output, skip_specual_tokens=True)
        out_text=out_text[0]
        out_text=out_text.replace('<pad>', '').replace('</s>', '')
        out_text=out_text.strip()
        translated.append(out_text)
    except TypeError:
        translated.append(val)
        continue

768it [1:14:46,  5.84s/it]


In [13]:
exp1['MarianMT_ko3']=translated

In [14]:
exp1.to_csv('/Volumes/T7/mt-hemingway/data/hypothesis/exp1.csv', encoding='utf-8-sig', index=False)

In [4]:
exp1=pd.read_csv('/Volumes/T7/mt-hemingway/data/hypothesis/exp1.csv')

In [6]:
ko_en_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-ko-en")
ko_en_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-tc-big-ko-en")

In [7]:
translated=[]
for idx, val in tqdm(exp1.iterrows()):
    for_translate=val['MarianMT_ko3']
    try:
        encoded_input=ko_en_tokenizer(for_translate, return_tensors='pt')
        output=ko_en_model.generate(**encoded_input)
        out_text=ko_en_tokenizer.batch_decode(output, skip_specual_tokens=True)
        out_text=out_text[0]
        out_text=out_text.replace('<pad>', '').replace('</s>', '')
        out_text=out_text.strip()
        translated.append(out_text)
    except TypeError:
        translated.append(val)
        continue

83it [08:05,  3.96s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (761 > 512). Running this sequence through the model will result in indexing errors
768it [1:23:45,  6.54s/it]


In [9]:
exp1['MarianMT_EN4']=translated

In [14]:
exp1['MarianMT_ko1']=exp1['MarianMT_ko1'].str.replace('<unk>', '')
exp1['MarianMT_EN2']=exp1['MarianMT_EN2'].str.replace('<unk>', '')
exp1['MarianMT_ko3']=exp1['MarianMT_ko3'].str.replace('<unk>', '')
exp1['MarianMT_EN4']=exp1['MarianMT_EN4'].str.replace('<unk>', '')

In [16]:
en_ko_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-ko")
en_ko_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-ko")

In [19]:
translated=[]
for idx, val in tqdm(exp1.iterrows()):
    for_translate=val['MarianMT_EN4']
    try:
        encoded_input=en_ko_tokenizer(for_translate, return_tensors='pt')
        output=en_ko_model.generate(**encoded_input)
        out_text=en_ko_tokenizer.batch_decode(output, skip_specual_tokens=True)
        out_text=out_text[0]
        out_text=out_text.replace('<pad>', '').replace('</s>', '')
        out_text=out_text.strip()
        translated.append(out_text)
    except TypeError:
        translated.append(val)
        continue

768it [11:38,  1.10it/s]


In [20]:
exp1['MarianMT_ko5']=translated

In [21]:
exp1

Unnamed: 0,original,MarianMT_ko1,MarianMT_EN2,MarianMT_ko3,MarianMT_EN4,MarianMT_ko5
0,He was an old man who fished alone in a skiff ...,Penther historical Cana Adult portfolio until ...,He was trademarked by the detailed man23 wise ...,기술을 잘 때까지 Penther Adultport lio Sharether Chin...,He was man23 Gal 600 was a ff a ...,펜더 어덜트 포트 리오 공급 업체 인 Chinas Xiaomi Chinas.
1,In the first forty days a boy had been with hi...,Un wellgrad、 China、866 exhibits wiserator. fli...,"There is no In, but first a all the between J...",...,...,이름 *
2,It made the boy sad to see the old man come in...,웨이브 동맥 하트 킨 웰 카나 성인 솔직히 히로 비난 지혜 파업 샤오 미 테크 ...,...,,,이름 *
3,"The sail was patched with flour sacks and, fur...","process、ther。 지혜 、 ING 끝까지, (32, CD、 명예는 잘。).","The was , led, it -.",process、ther、 、 (프로세스、더、더、시작、CD、).,It was a sleight of hand.,wavether Chinasing은 지속 가능성을 제공합니다.
4,The old man was thin and gaunt with deep wrink...,process Cana Adultther endque wisdom ing upon ...,Thet details man was and g author Social s ha...,과정 에서、 Adultther、 endque、 examination、sing 위 요...,"All the efforts were made, all the efforts wer...","1.5 well-HBM 동맥, 도매 well-HBM 동맥, 도매 well-HBM 동..."
...,...,...,...,...,...,...
763,What's that? she asked a waiter and pointed to...,VipTube Hoteling values forUAL、 China、 His end...,philosophic km's? 2015 a er and to but not b...,...,,이름 *
764,"Tiburon, the waiter said, ""Eshark."" He was mea...","다운로드、 중국어, 잘、 그의 모기, 에너지 수집 내장 최고의 Penther、 he...","He was, he was, he was, he was, he was, he was...","펜더, 메이커더, 메이커더, 메이커더, 메이커더, 메이커더, 메이커더",",,,,,,","(,,,,"
765,"I didn't know sharks had such handsome, beauti...","US 호텔 에서 단독으로 866 tentacle, .","I am a sinner, a sinner, a sinner, a sinner in...","미국 심리 China, China, China, China upon Ford.(포드...",.,에스.
766,"I didn't either, her male companion said.","US 호텔 From,리스 모기.",I Hatet Innovate.,US‐From‐Un‐ vehicle‐s에서 제공합니다.,"I'm sorry, but I'm only 9th graders in Japan.","US 호텔 8 월, 수용 US 호텔 8Firstve는 OD에 trust-ically."


In [22]:
exp1.to_csv('/Volumes/T7/mt-hemingway/data/hypothesis/exp1.csv', encoding='utf-8-sig', index=False)