# Fine-tune the opus-mt (fr-en) model

In [None]:
# Record which transformers release this notebook is executed with.
import transformers
print(transformers.__version__)

In [None]:
import pandas as pd
import numpy as np

from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

from datasets import load_dataset, load_metric
from datasets import Dataset
from datasets import DatasetDict

import os
import glob
from tqdm import tqdm
import torch
import pickle
import re

In [None]:
# WMT biomedical 2016: one text file, parsed further down as '|'-separated fields (id, English, French).
path1 = '/export/home/cse200093/wmt_biomed/2016/pubmed_en_fr.txt' # txt file containing corresponding en and fr texts
# WMT biomedical 2019: directory of per-document files. The trailing slash matters
# because file names are appended to this string with '+'.
path2 = '/export/home/cse200093/wmt_biomed/2019/train/fr-en/' # folder

In [None]:
# load metric offline, from a local metric script instead of a hub name
# bleu.py requires bleu1.py to run
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in favour of
# the separate `evaluate` package — confirm the installed version still provides it.
metric = load_metric("/export/home/cse200093/Expe_Translation/bleu.py")
metric

In [None]:
# Load wmt_biomed/2016
# Every line is parsed as '|'-separated fields: id | English | French.
en = []
fr = []
ids = []
# The context manager guarantees the handle is closed even if reading fails.
# The explicit encoding keeps accented French text independent of the machine's
# locale (assumes the corpus is UTF-8 — TODO confirm).
with open(path1, 'r', encoding='utf-8') as file1:
    lines = file1.readlines()
for line in tqdm(lines):
    # Split once instead of re-splitting the same line for each field.
    fields = line.split('|')
    sent_id = fields[0]
    sent_en = fields[1]
    sent_fr = fields[2]
    # Pairs whose English side is the '[Not Available].' placeholder are dropped.
    if sent_en != '[Not Available].':
        # Remove square brackets from the English text.
        en.append(sent_en.replace('[', '').replace(']', ''))
        # The French field is the last one read, so it carries the line break.
        fr.append(sent_fr.rstrip('\n'))
        ids.append(sent_id)

In [None]:
# Tabular view of the 2016 sentence pairs, displayed for visual inspection.
df1 = pd.DataFrame({'id':ids, 'English':en, 'French':fr})
df1

In [None]:
# Load wmt_biomed/2019
# List the English ('*_en.txt') and French ('*_fr.txt') document files.
# NOTE: os.chdir changes the working directory of the whole process, so every
# relative path used after this point is resolved against this folder.
wmt19_dir = r'/export/home/cse200093/wmt_biomed/2019/train/fr-en'
os.chdir(wmt19_dir)

eng_files = glob.glob('*_en.txt')  # eng
fre_files = glob.glob('*_fr.txt')  # fre

In [None]:
# Put both listings in lexicographic order.
# NOTE(review): the two lists are later consumed independently, so this presumably
# relies on every '*_en.txt' having a matching '*_fr.txt' — verify.
eng_files = sorted(eng_files)
fre_files = sorted(fre_files)

In [None]:
def _read_document_body(file_path):
    """Return the content of `file_path` that follows its first two lines.

    The first two lines are skipped (per the original author's note, the
    second line lists the authors). Raises IndexError when the file has
    fewer than three lines.
    """
    # `with` closes the handle even if reading or splitting raises. The explicit
    # encoding avoids depending on the locale (assumes UTF-8 files — TODO confirm).
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read().split("\n", 2)[2]


# One entry per document: the text body of each English / French file.
en1 = [_read_document_body(path2 + file) for file in tqdm(eng_files)]
fr1 = [_read_document_body(path2 + file) for file in tqdm(fre_files)]
# Document ids are not collected for the 2019 data; the list is kept (empty).
ids1 = []

In [None]:
# Concatenate the 2016 sentence pairs with the 2019 documents, per language.
english_texts = en + en1
french_texts = fr + fr1
my_dict = {"en": english_texts, "fr": french_texts}

# Single Hugging Face dataset with one "en" and one "fr" column.
dataset = Dataset.from_dict(my_dict)
dataset

In [None]:
# train test validation split
# 10% of the data is held out for test, then 10% of the remainder for validation.
# A fixed seed makes the shuffle reproducible, so re-running this cell yields the
# same three splits.
split_seed = 42
outer_split = dataset.train_test_split(test_size=0.1, seed=split_seed)
inner_split = outer_split["train"].train_test_split(test_size=0.1, seed=split_seed)

# Select the splits by name rather than relying on the order of `.values()`.
train_dataset = inner_split["train"]
validation_dataset = inner_split["test"]
test_dataset = outer_split["test"]
raw_datasets = DatasetDict({"train":train_dataset,"validation":validation_dataset,"test":test_dataset})
raw_datasets

In [None]:
import pickle
# save raw datasets
# The context manager closes the file even if pickling fails part-way.
with open('/export/home/cse200093/Expe_Translation/raw_datasets_wmt_biomed_2016_2019.pkl', "wb") as open_file:
    pickle.dump(raw_datasets, open_file)

In [None]:
# load raw_datasets
# Uses the same absolute path the datasets are saved to, so loading does not
# depend on the current working directory (which an earlier cell changes with os.chdir).
# NOTE: pickle.load executes arbitrary code from the file; only load files this
# notebook produced itself.
with open('/export/home/cse200093/Expe_Translation/raw_datasets_wmt_biomed_2016_2019.pkl', "rb") as open_file:
    raw_datasets = pickle.load(open_file)
raw_datasets

In [None]:
from transformers import AutoTokenizer
# The model we want to fine-tune
model_checkpoint = '/export/home/cse200093/opus-mt-fr-en'

# Tokenizer loaded from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Maximum lengths passed to the tokenizer as `max_length`; longer inputs are truncated.
max_source_length = 128
max_target_length = 128
# Translation direction: these are also the column names read from each example.
source_lang = "fr"
target_lang = "en"

def batch_tokenize_fn(examples):
    """
    Tokenize a batch of examples and attach the target ids as ``labels``.

    ``examples`` is indexed by language code: the ``source_lang`` entry is
    tokenized as model input and the ``target_lang`` entry as the labels.
    Both sides are truncated to their maximum length. No padding happens
    here; it is left to the data collator, which pads per batch instead of
    to the longest sequence of the whole dataset.

    Returns the tokenizer output for the sources with an extra ``"labels"``
    entry holding the target ``input_ids``.
    """
    src_texts, tgt_texts = examples[source_lang], examples[target_lang]

    encoded = tokenizer(src_texts, max_length=max_source_length, truncation=True)

    # Targets must be tokenized in target mode.
    # NOTE(review): newer transformers releases deprecate as_target_tokenizer()
    # in favour of the `text_target=` argument — check against the installed version.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(tgt_texts, max_length=max_target_length, truncation=True)

    # huggingface expects the target token ids under the "labels" key
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [None]:
# Apply the tokenization function to every split, batch by batch.
tokenized_datasets = raw_datasets.map(batch_tokenize_fn, batched=True)

In [None]:
# solve device problem
class torch_global:
    """Holder for the torch device shared across this notebook.

    `torch_global.device` is the current device. `torch_global.set_device`
    changes it, either as a plain call or as a context manager that restores
    the previous device on exit.
    """

    # Default device: 'cuda:6' when CUDA is available, otherwise the CPU.
    # Whether a GPU with index 6 actually exists is not checked here.
    device = torch.device('cuda:6' if torch.cuda.is_available() else 'cpu')

    class set_device(object):
        """Switch `torch_global.device` to `device`.

        The switch happens when the object is constructed, not in `__enter__`,
        so a bare `torch_global.set_device(...)` call is enough. Used in a
        `with` statement, the device that was active before construction is
        restored on exit.

        Parameters:
            device: a device string such as 'cuda:0' / 'cpu', or a torch.device.
            error: 'ignore' (default) prints a message and keeps the current
                device when `device` is unusable; any other value re-raises
                the underlying exception.
        """

        def __init__(self, device, error='ignore'):
            # Only Exception is caught: a bare `except:` would also swallow
            # KeyboardInterrupt and SystemExit.
            try:
                count = torch.cuda.device_count()
                print(f'Available CUDA devices: {count}')
            except Exception:
                print('No available CUDA devices')
            self.previous = torch_global.device
            try:
                new_device = torch.device(device) if isinstance(device, str) else device
                # Probe the device by moving a tiny tensor onto it.
                torch.as_tensor([0]).to(new_device)
            except Exception:
                msg = f"Device {device} is not available"
                if error == "ignore":
                    print(msg)
                else:
                    raise
            else:
                torch_global.device = new_device
            print(f'Current device: {torch_global.device}')

        def __enter__(self):
            # Return the manager so `with ... as ctx` gives access to `ctx.previous`.
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            # Restore the device that was active before construction;
            # exceptions raised inside the `with` body are not suppressed.
            torch_global.device = self.previous


In [None]:
# Request GPU 6. With the default error='ignore', an unusable device only prints a
# message and the previous device is kept.
torch_global.set_device('cuda:6')
# Keep the resulting device in a plain variable.
device = torch_global.device

In [None]:
# Checkpoint to fine-tune, read from a local directory
# (presumably a copy of the hub model "Helsinki-NLP/opus-mt-fr-en" — verify).
model_checkpoint = '/export/home/cse200093/opus-mt-fr-en'

# Load the seq2seq weights, then move them to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

parameter_count = model.num_parameters()
print('# of parameters: ', parameter_count)

In [None]:
# function to generate translation for a model
def generate_translation(model, tokenizer, example):
    """Translate one tokenized example and print source, target and prediction.

    `example` must provide the raw texts under the `source_lang` and
    `target_lang` keys and the source token ids under 'input_ids'.
    Nothing is returned; the three texts are printed.
    """
    source_text = example[source_lang]
    target_text = example[target_lang]

    # Shape the ids as a batch of one and place them on the model's device.
    batch = torch.LongTensor(example['input_ids']).view(1, -1)
    batch = batch.to(model.device)

    output_ids = model.generate(batch)
    predicted_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    report = (
        ('source: ', source_text),
        ('target: ', target_text),
        ('prediction: ', predicted_text),
    )
    for tag, text in report:
        print(tag, text)

In [None]:
# Sanity check: translate one training example with the model as currently loaded.
example = tokenized_datasets['train'][1]
generate_translation(model, tokenizer, example)

In [None]:
# Training configuration.
batch_size = 16
# Last path component of the checkpoint, used to name the output directory.
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    # Output directory: checkpoints are written below this absolute path.
    f"/export/home/cse200093/Expe_Translation/{model_name}-finetuned-{source_lang}-to-{target_lang}",
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy` — confirm.
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Keep at most 3 checkpoints on disk.
    save_total_limit=3,
    num_train_epochs=15, # to change
    # Run generation during evaluation so predictions are decoded sequences.
    predict_with_generate=True,
    remove_unused_columns=True,
    # Mixed precision; presumably requires a CUDA device — verify on the target machine.
    fp16=True,
    #push_to_hub=True,
)

In [None]:
# Batches are padded here, at collation time, rather than during tokenization.
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [None]:
import numpy as np

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Returns a ``(predictions, references)`` tuple where predictions is a flat
    list of strings and every reference is wrapped in its own one-element
    list (one reference per prediction).
    """
    return (
        [prediction.strip() for prediction in preds],
        [[reference.strip()] for reference in labels],
    )

def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation pass.

    `eval_preds` unpacks into (predictions, labels) of token ids. Returns a
    dict with the keys "bleu" and "gen_len", both rounded to 4 decimals.
    """
    generated_ids, label_ids = eval_preds
    # When predictions arrive as a tuple, the token ids are its first element.
    if isinstance(generated_ids, tuple):
        generated_ids = generated_ids[0]
    pad_id = tokenizer.pad_token_id

    predicted_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # -100 cannot be decoded; substitute the pad token id.
    label_ids = np.where(label_ids != -100, label_ids, pad_id)
    reference_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Some simple post-processing
    predicted_texts, reference_texts = postprocess_text(predicted_texts, reference_texts)

    metric_output = metric.compute(predictions=predicted_texts, references=reference_texts)
    # NOTE(review): assumes the loaded metric reports its value under "score" —
    # verify against the local bleu.py script.
    scores = {"bleu": metric_output["score"]}

    # Generated length = number of non-pad tokens in each prediction.
    lengths = [np.count_nonzero(sequence != pad_id) for sequence in generated_ids]
    scores["gen_len"] = np.mean(lengths)

    return {name: round(value, 4) for name, value in scores.items()}

In [None]:
# Wire the model, training arguments, tokenized train/validation splits and collator together.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    # compute_metrics is disabled, so the BLEU / gen_len function defined in this
    # notebook is not called during evaluation.
    #compute_metrics=compute_metrics
)

In [None]:
# Resume fine-tuning from the checkpoint saved at step 28000.
# The checkpoint path is built from the trainer's own output directory, so it does
# not depend on the current working directory (which an earlier cell changes with
# os.chdir). The argument stays positional, as in the original call.
trainer_output = trainer.train(os.path.join(args.output_dir, "checkpoint-28000"))
trainer_output

In [None]:
# save fine tuned model
ft_model_dir = "/export/home/cse200093/Expe_Translation/opus-mt-fr-en-finetuned-fr-to-en/FT_opus_model"
model.save_pretrained(ft_model_dir)
# Save the tokenizer into the same directory: the model is later reloaded from this
# path alone, and loading a model for translation presumably needs the tokenizer
# files next to the weights — verify against the EasyNMT loader.
tokenizer.save_pretrained(ft_model_dir)

In [None]:
from easynmt import EasyNMT, models
# load the model after FT :
# the fine-tuned checkpoint directory is wrapped in an EasyNMT translator object.
model_fr_en = EasyNMT(translator = models.AutoModel('/export/home/cse200093/Expe_Translation/opus-mt-fr-en-finetuned-fr-to-en/FT_opus_model'))

In [None]:
# use the model after FT to translate
# The input is a multi-line French clinical note, translated from 'fr' to 'en'.
model_fr_en.translate('''lupus diagnostiqué à l’âge de 13 ans avec atteinte pleuro-péricardique et du système nerveux central pris
en charge à Bordeaux : 6 bolus d’ENDOXAN relayés par corticoïdes et PLAQUENIL pendant environ 2 ans.
Pas d’atteinte rénale. Plus de traitement depuis l’âge de 16 ans. Patiente revue en consultation à Paris à
Cochin par le Docteur BOINI en 2011 : pas de signe clinique d’activité lupique.
Sur le plan immunologique : FAN 1/180 sans spécificité, anti-DNA négatif, anticorps anti-ECT négatif,
anticorps anti-cardiolipine positifs à 29 unités mais pas d’anti-bêta 2 GPI et pas d’anti-coagulant circulant.
Sédiment urinaire calme. Pas de traitement spécifique mis en place.''',source_lang = 'fr',target_lang='en')