In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModel, MarianMTModel, M2M100ForConditionalGeneration
import tqdm
import torch
import time

# nllb
def _eng2kor(text, model, tokenizer, device):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id["kor_Hang"])
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

# opus-mt
def __eng2kor(text, model, tokenizer, device):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    translated_tokens = model.generate(**inputs)
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

# m2m100
def eng2kor(text, model, tokenizer, device):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.get_lang_id("ko"))
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

def load_model(model_name):
    # model = AutoModelForSeq2SeqLM.from_pretrained(model_name) # nllb
    # model = MarianMTModel.from_pretrained(model_name) # opus-mt
    model = M2M100ForConditionalGeneration.from_pretrained(model_name) # m2m100
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    return model, tokenizer, device

def Generate_KR(txt_file: str, model_name: str, data_type: str) -> None:
    """Generate Korean text from English text using pretrained model from huggingface.

    Args:
        txt_file (str): path to txt file to translate.
        model_name (str): name of pretrained model from huggingface.
        data_type (str): type of data to translate. ['MELD', 'EMORY', 'IEMOCAP', 'DD']
    """
    print("*** Load model start ***")
    model, tokenizer, device = load_model(model_name)
    print(f"*   device : {device}   *")
    print(f"*   model : {model_name}   *")
    
    print("*** Read file start ***")
    print(f"*   data_type : {data_type}   *")
    print(f"*   txt_file : {txt_file}   *")
    f = open(txt_file, 'r')
    dataset = f.readlines()
    f.close()
    print("*   total " + str(len(dataset)) + " lines   *")
    
    print("*** write file start ***")
    base_file = txt_file.split('_')
    ko_txt_file = base_file[0] + '_' + model_name.replace('/', '-') + '_' + base_file[1][:-4] + '_ko.txt' 
    print(f"*   write_file : {ko_txt_file}   *")
    
    with open(ko_txt_file, 'w') as f:
        for i, data in tqdm.tqdm(enumerate(dataset), total=len(dataset)):
            if data == '\n':
                f.write('\n')
            else:
                if data_type == 'MELD':
                    if i <= 1:
                        continue
                    # emotion: 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'
                    # sentiment: {'positive': ["joy"], 'negative': ["anger", "disgust", "fear", "sadness"], 'neutral': ["neutral", "surprise"]}
                    speaker, utt, emo, senti = data.strip().split('\t')
                    emodict = {'anger': "anger", 'disgust': "disgust", 'fear': "fear", 'joy': "joy", 'neutral': "neutral", 'sadness': "sad", 'surprise': 'surprise'}
                    emo = emodict[emo]
                    utt = eng2kor(utt, model, tokenizer, device)[0]
                    out = speaker + '\t' + utt + '\t' + emo + '\t' + senti + '\n'
                    f.write(out)
                elif data_type == 'EMORY':
                    speaker = data.strip().split('\t')[0]
                    utt = ' '.join(data.strip().split('\t')[1:-1])
                    emo = data.strip().split('\t')[-1]
                    emodict = {'Joyful': "joy", 'Mad': "anger", 'Peaceful': "neutral", 'Powerful': "surprise", 'Neutral': "neutral", 'Sad': "sad", 'Scared': 'fear'}
                    emo = emodict[emo]
                    sentidict = {"joy": "positive", "anger": "negative", "disgust": "negative", "fear": "negative", "sad": "negative", "neutral": "neutral", "surprise": "neutral"}
                    senti = sentidict[emo]
                    utt = eng2kor(utt, model, tokenizer, device)[0]
                    out = speaker + '\t' + utt + '\t' + emo + '\t' + senti + '\n'
                    f.write(out)
                elif data_type == 'IEMOCAP':
                    speaker = data.strip().split('\t')[0]
                    utt = ' '.join(data.strip().split('\t')[1:-1])
                    emo = data.strip().split('\t')[-1]
                    emodict = {'ang': "anger", 'hap': "joy", 'neu': "neutral", 'sad': "sad", 'exc': "surprise", 'fru': "disgust", 'fea': "fear"}
                    emo = emodict[emo]
                    sentidict = {"joy": "positive", "anger": "negative", "disgust": "negative", "fear": "negative", "sad": "negative", "neutral": "neutral", "surprise": "neutral"}
                    senti = sentidict[emo]
                    utt = eng2kor(utt, model, tokenizer, device)[0]
                    out = speaker + '\t' + utt + '\t' + emo + '\t' + senti + '\n'
                    f.write(out)
                elif data_type == 'DD':
                    speaker, utt, emo = data.strip().split('\t')
                    emodict = {'anger': "anger", 'disgust': "disgust", 'fear': "fear", 'happiness': "joy", 'neutral': "neutral", 'sadness': "sad", 'surprise': "surprise"}
                    emo = emodict[emo]
                    sentidict = {"joy": "positive", "anger": "negative", "disgust": "negative", "fear": "negative", "sad": "negative", "neutral": "neutral", "surprise": "neutral"}
                    senti = sentidict[emo]
                    utt = eng2kor(utt, model, tokenizer, device)[0]
                    out = speaker + '\t' + utt + '\t' + emo + '\t' + senti + '\n'
                    f.write(out)
                else:
                    print("* unknown data_type *")
                    break

# in facebook/nllb-200 some errors in translation
model_name = "facebook/nllb-200-1.3B"
model_name = "facebook/nllb-200-distilled-600M"
model_name = "Helsinki-NLP/opus-mt-tc-big-en-ko"
model_name = "facebook/m2m100_1.2B"

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
Generate_KR('../data/MELD/multi/MELD_dev.txt', model_name, 'MELD')

*** Load model start ***


RuntimeError: CUDA out of memory. Tried to allocate 32.00 MiB (GPU 0; 11.77 GiB total capacity; 10.81 GiB already allocated; 5.50 MiB free; 10.85 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [9]:
Generate_KR('../data/MELD/multi/MELD_test.txt', model_name, 'MELD')

*** Load model start ***
*   device : cuda   *
*** Read file start ***
*   data_type : MELD   *
*   txt_file : ../data/MELD/multi/MELD_test.txt   *
*   total 2891 lines   *
*** write file start ***
*   write_file : ../data/MELD/multi/MELD_facebook-nllb-200-1.3B_test_ko.txt   *


100%|██████████| 2891/2891 [07:36<00:00,  6.34it/s]


In [2]:
Generate_KR('../data/MELD/multi/MELD_train.txt', model_name, 'MELD')

*** Load model start ***
*   device : cuda   *
*   model : facebook/m2m100_1.2B   *
*** Read file start ***
*   data_type : MELD   *
*   txt_file : ../data/MELD/multi/MELD_train.txt   *
*   total 11028 lines   *
*** write file start ***
*   write_file : ../data/MELD/multi/MELD_facebook-m2m100_1.2B_train_ko.txt   *


100%|██████████| 11028/11028 [48:05<00:00,  3.82it/s] 


In [10]:
Generate_KR('../data/EMORY/EMORY_dev.txt', model_name, 'EMORY')

*** Load model start ***
** device : cuda **
*** Read file start ***
** data_type : EMORY **
** txt_file : ../data/EMORY/EMORY_dev.txt **
**  total 1442 lines **
*** write file start ***
** write_file : ../data/EMORY/EMORY_dev_facebook-nllb-200-1.3B_ko.txt **


100%|██████████| 1442/1442 [04:22<00:00,  5.49it/s]


In [11]:
Generate_KR('../data/EMORY/EMORY_test.txt', model_name, 'EMORY')

*** Load model start ***
** device : cuda **
*** Read file start ***
** data_type : EMORY **
** txt_file : ../data/EMORY/EMORY_test.txt **
**  total 1412 lines **
*** write file start ***
** write_file : ../data/EMORY/EMORY_test_facebook-nllb-200-1.3B_ko.txt **


100%|██████████| 1412/1412 [04:48<00:00,  4.89it/s]


In [12]:
Generate_KR('../data/EMORY/EMORY_train.txt', model_name, 'EMORY')

*** Load model start ***
** device : cuda **
*** Read file start ***
** data_type : EMORY **
** txt_file : ../data/EMORY/EMORY_train.txt **
**  total 10646 lines **
*** write file start ***
** write_file : ../data/EMORY/EMORY_train_facebook-nllb-200-1.3B_ko.txt **


100%|██████████| 10646/10646 [34:01<00:00,  5.22it/s] 


In [10]:
Generate_KR('../data/iemocap/iemocap_dev.txt', model_name, 'IEMOCAP')

*** Load model start ***
** device : cuda **
*** Read file start ***
** data_type : IEMOCAP **
** txt_file : ../data/iemocap/iemocap_dev.txt **
**  total 658 lines **
*** write file start ***
** write_file : ../data/iemocap/iemocap_facebook-nllb-200-1.3B_dev_ko.txt **


100%|██████████| 658/658 [01:53<00:00,  5.81it/s]


In [11]:
Generate_KR('../data/iemocap/iemocap_test.txt', model_name, 'IEMOCAP')

*** Load model start ***
** device : cuda **
*** Read file start ***
** data_type : IEMOCAP **
** txt_file : ../data/iemocap/iemocap_test.txt **
**  total 1653 lines **
*** write file start ***
** write_file : ../data/iemocap/iemocap_facebook-nllb-200-1.3B_test_ko.txt **


100%|██████████| 1653/1653 [05:12<00:00,  5.29it/s]


In [12]:
Generate_KR('../data/iemocap/iemocap_train.txt', model_name, 'IEMOCAP')

*** Load model start ***
** device : cuda **
*** Read file start ***
** data_type : IEMOCAP **
** txt_file : ../data/iemocap/iemocap_train.txt **
**  total 5270 lines **
*** write file start ***
** write_file : ../data/iemocap/iemocap_facebook-nllb-200-1.3B_train_ko.txt **


100%|██████████| 5270/5270 [16:24<00:00,  5.35it/s]  


In [13]:
Generate_KR('../data/dailydialog/dailydialog_dev.txt', model_name, 'DD')

*** Load model start ***
** device : cuda **
*** Read file start ***
** data_type : DD **
** txt_file : ../data/dailydialog/dailydialog_dev.txt **
**  total 9068 lines **
*** write file start ***
** write_file : ../data/dailydialog/dailydialog_facebook-nllb-200-1.3B_dev_ko.txt **


100%|██████████| 9068/9068 [24:09<00:00,  6.26it/s]  


In [5]:
Generate_KR('../data/dailydialog/dailydialog_test.txt', model_name, 'DD')

*** Load model start ***
*   device : cuda   *
*** Read file start ***
*   data_type : DD   *
*   txt_file : ../data/dailydialog/dailydialog_test.txt   *
*   total 8739 lines   *
*** write file start ***
*   write_file : ../data/dailydialog/dailydialog_facebook-nllb-200-1.3B_test_ko.txt   *


100%|██████████| 8739/8739 [23:48<00:00,  6.12it/s]  


In [6]:
Generate_KR('../data/dailydialog/dailydialog_train.txt', model_name, 'DD')

*** Load model start ***
*   device : cuda   *
*** Read file start ***
*   data_type : DD   *
*   txt_file : ../data/dailydialog/dailydialog_train.txt   *
*   total 98287 lines   *
*** write file start ***
*   write_file : ../data/dailydialog/dailydialog_facebook-nllb-200-1.3B_train_ko.txt   *


100%|██████████| 98287/98287 [4:25:14<00:00,  6.18it/s]   


In [11]:
import shutil
import glob
from pathlib import Path

def cat_files(path: str, out_file: str) -> None:
    """Concat all files in path to out_file

    Args:
        path (str): path to directory containing files to concatenate
        out_file (str): output file
    """
    print(f"*** Concatenating files ***\n*   in {path}   *\n*   to {out_file}   *")
    # create parent directory if not exists
    out_path = Path(out_file)
    out_dir = out_path.parent
    out_dir.mkdir(parents=True, exist_ok=True)
    
    files = glob.glob(path, recursive=True)
    if out_file in files: files.remove(out_file)
    print(f"*** Found {len(files)} files ***")
    if len(files) <= 30:
        for f in files:
            print(f"*   {f}   *")
            
    with open(out_path, 'w') as outfile:
        for fname in files:
            with open(fname, 'r') as infile:
                shutil.copyfileobj(infile, outfile)
                

In [12]:
cat_files("../data/**/*_facebook-nllb-200-1.3B_dev_ko.txt", "../data/EMOTION/multi/full_facebook-nllb-200-1.3B_dev_ko.txt")

*** Concatenating files ***
*   in ../data/**/*_facebook-nllb-200-1.3B_dev_ko.txt   *
*   to ../data/EMOTION/multi/full_facebook-nllb-200-1.3B_dev_ko.txt   *
*** Found 4 files ***
*   ../data/EMORY/EMORY_facebook-nllb-200-1.3B_dev_ko.txt   *
*   ../data/MELD/multi/MELD_facebook-nllb-200-1.3B_dev_ko.txt   *
*   ../data/iemocap/iemocap_facebook-nllb-200-1.3B_dev_ko.txt   *
*   ../data/dailydialog/dailydialog_facebook-nllb-200-1.3B_dev_ko.txt   *


In [13]:
cat_files("../data/**/*_facebook-nllb-200-1.3B_test_ko.txt", "../data/EMOTION/multi/full_facebook-nllb-200-1.3B_test_ko.txt")

*** Concatenating files ***
*   in ../data/**/*_facebook-nllb-200-1.3B_test_ko.txt   *
*   to ../data/EMOTION/multi/full_facebook-nllb-200-1.3B_test_ko.txt   *
*** Found 4 files ***
*   ../data/EMORY/EMORY_facebook-nllb-200-1.3B_test_ko.txt   *
*   ../data/MELD/multi/MELD_facebook-nllb-200-1.3B_test_ko.txt   *
*   ../data/iemocap/iemocap_facebook-nllb-200-1.3B_test_ko.txt   *
*   ../data/dailydialog/dailydialog_facebook-nllb-200-1.3B_test_ko.txt   *


In [14]:
cat_files("../data/**/*_facebook-nllb-200-1.3B_train_ko.txt", "../data/EMOTION/multi/full_facebook-nllb-200-1.3B_train_ko.txt")

*** Concatenating files ***
*   in ../data/**/*_facebook-nllb-200-1.3B_train_ko.txt   *
*   to ../data/EMOTION/multi/full_facebook-nllb-200-1.3B_train_ko.txt   *
*** Found 4 files ***
*   ../data/EMORY/EMORY_facebook-nllb-200-1.3B_train_ko.txt   *
*   ../data/MELD/multi/MELD_facebook-nllb-200-1.3B_train_ko.txt   *
*   ../data/iemocap/iemocap_facebook-nllb-200-1.3B_train_ko.txt   *
*   ../data/dailydialog/dailydialog_facebook-nllb-200-1.3B_train_ko.txt   *


In [16]:
torch.cuda.empty_cache()