Alex Jones (alexander.g.jones.23@dartmouth.edu) \\
March 15, 2022 \\
LING 28 (Rolando Coto-Solano), Winter 2022 \\
Final Project


---

This notebook contains code for finetuning and evaluating a [Kalaallisut-English NMT model](https://huggingface.co/Helsinki-NLP/opus-mt-kl-en/tree/112da788d18d56b8ac0699d57c4b087c919d1fe6) from Hugging Face.

In [None]:
!pip install -U transformers[sentencepiece]
!pip install bleu datasets
from bleu import list_bleu
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, MarianMTModel
from tqdm import tqdm
import torch
import pandas as pd
import numpy as np
import time
from datasets import load_metric, DatasetDict, load_dataset
import json
from google.colab import drive
import os
# Disable Weights & Biases logging in the Hugging Face Trainer.
# (The variable name is WANDB_DISABLED; the previous spelling had no effect.)
os.environ['WANDB_DISABLED'] = 'true'

In [None]:
# Mount Google Drive at /content/drive/ so that the corpus and test-set files
# referenced later (under MyDrive/ling28_final_proj/) can be read and written.
drive.mount('/content/drive/')

In [None]:
# Select the torch device used by the rest of the notebook.
# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    # This notebook treats a Tesla K40m as unsupported and stops early.
    # RuntimeError is used because no GPUError class is defined anywhere here
    # (the old `raise GPUError(...)` would itself have failed with NameError).
    if torch.cuda.get_device_name(0) == "Tesla K40m":
        raise RuntimeError("GPU Error: No compatible GPU found")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
# Hugging Face Hub id of the pretrained Kalaallisut->English checkpoint; it is
# passed to `from_pretrained` by the tokenizer and model loading cells below.
MODEL_NAME = 'Helsinki-NLP/opus-mt-kl-en'

In [None]:
# Load tokenizer
!pip install sentencepiece
# The tokenizer is kept as a notebook-level global: `preprocess`,
# `compute_metrics` and the translation loop all read it.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Load model
# Pretrained seq2seq model for MODEL_NAME; this same object is later handed to
# the data collator and to Seq2SeqTrainer for fine-tuning.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [None]:
# Put model on the device selected above (GPU when available, otherwise CPU).
# Using `device` instead of `.cuda()` keeps the CPU fallback from crashing here.
model.to(device)

In [None]:
# Load the parallel corpus from a JSON file on Drive. `field='field'` points the
# JSON loader at the top-level key named "field" that holds the records
# (see the `datasets` JSON loading docs). The result is a DatasetDict with a
# single 'train' split whose only feature is 'translation'.
dataset = load_dataset('json', data_files='/content/drive/MyDrive/ling28_final_proj/corpus.json', field='field')

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 6393
    })
})

In [None]:
# Hold out 10% of the corpus for validation + test. A fixed seed makes the
# split identical on every Restart & Run All.
train_testvalid = dataset['train'].train_test_split(test_size=0.1, seed=42)

In [None]:
# Split the held-out 10% in half: one half for validation, one for test.
# A fixed seed keeps the two halves stable across runs.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

In [None]:
# Gather the three splits under the names used by the rest of the notebook.
# Note: 'valid' is the *train* half of the second split, 'test' the other half.
_splits = {}
_splits['train'] = train_testvalid['train']
_splits['test'] = test_valid['test']
_splits['valid'] = test_valid['train']
data = DatasetDict(_splits)

In [None]:
def preprocess(data,
               prefix,
               max_input_length,
               max_target_length,
               source_lang,
               target_lang):
  '''
  Tokenize a batch of translation pairs for seq2seq fine-tuning.

  Args:
    data: a batch with a "translation" column; each entry maps a language
      code to the sentence in that language.
    prefix: string prepended to every source sentence.
    max_input_length: source sequences are truncated to this many tokens.
    max_target_length: target sequences are truncated to this many tokens.
    source_lang: key of the source sentence inside each translation entry.
    target_lang: key of the target sentence inside each translation entry.

  Returns:
    The tokenizer output for the source sentences, with the target token ids
    added under "labels".

  Uses the notebook-level `tokenizer`.
  '''
  inputs = [prefix + ex[source_lang] for ex in data["translation"]]
  targets = [ex[target_lang] for ex in data["translation"]]

  # Truncate to the requested limits (previously these arguments were ignored).
  # No padding is applied here: the seq2seq data collator pads every batch
  # dynamically and, by default, fills label padding with -100 so that padded
  # positions are ignored by the loss. Padding labels at this stage would put
  # real pad-token ids into the labels instead.
  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)
  model_inputs["labels"] = labels["input_ids"]

  return model_inputs

In [None]:
# Settings handed to `preprocess` when the dataset is tokenized
PREFIX = ''                 # text prepended to each source sentence (none)
MAX_INPUT_LENGTH = 128      # passed as max_input_length
MAX_TARGET_LENGTH = 128     # passed as max_target_length
SRC_LANG, TGT_LANG = 'kl', 'en'   # keys inside each "translation" entry

In [None]:
# Preprocess entire dataset
def custom_preprocess(batch):
    '''Run `preprocess` on one batch using the notebook-level constants.'''
    return preprocess(
        batch,
        PREFIX,
        MAX_INPUT_LENGTH,
        MAX_TARGET_LENGTH,
        SRC_LANG,
        TGT_LANG,
    )


# batched=True: `custom_preprocess` receives batches of rows, not single rows.
tokenized_data = data.map(custom_preprocess, batched=True)

In [None]:
tokenized_data

In [None]:
# Define the finetuning hyperparameters
BATCH_SIZE = 8
# Keep MODEL_NAME (the full Hub id) untouched so the tokenizer/model loading
# cells can still be re-run after this one; only the short repository name is
# needed to build the output directory.
MODEL_SHORT_NAME = MODEL_NAME.split('/')[-1]
EVAL_STRATEGY = 'epoch'
LR = 2e-5 # learning rate
WEIGHT_DECAY = 0.01
SAVE_LIMIT = 3
TRAIN_EPOCHS = 1
PRED_GEN_FLAG = True

# Directory (relative to the working directory) that receives checkpoints.
OUTPUT_DIR = f"{MODEL_SHORT_NAME}-finetuned-{SRC_LANG}-to-{TGT_LANG}"

args = Seq2SeqTrainingArguments(
    OUTPUT_DIR,
    evaluation_strategy=EVAL_STRATEGY,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=WEIGHT_DECAY,
    save_total_limit=SAVE_LIMIT,
    num_train_epochs=TRAIN_EPOCHS,
    # Generate full translations during evaluation so compute_metrics
    # receives token ids it can decode.
    predict_with_generate=PRED_GEN_FLAG
)

In [None]:
# Collator that turns lists of tokenized examples into batches for the trainer.
# It is given both the tokenizer and the model.
# NOTE(review): presumably this is what pads inputs/labels per batch — confirm
# against the DataCollatorForSeq2Seq documentation.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Functions for computing metrics from model predictions
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Returns a `(preds, labels)` pair where `preds` is a flat list of strings
    and each reference is wrapped in its own single-element list.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels
def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for one evaluation pass.

    Args:
        eval_preds: a `(predictions, labels)` pair of token-id arrays; the
            predictions may arrive as a tuple, in which case its first
            element is used.

    Returns:
        dict with "bleu" (the metric's "score") and "gen_len" (mean number of
        non-pad tokens per prediction), both rounded to 4 decimals.

    Uses the notebook-level `tokenizer` and `metric`.
    """
    # `metric` was never created anywhere in this notebook, so evaluation
    # failed with NameError on a fresh kernel. Load it on first use, but keep
    # honouring a `metric` defined by the user in an earlier cell.
    # NOTE(review): sacreBLEU is assumed because the code reads result["score"]
    # and wraps each reference in a list; it needs `pip install sacrebleu`.
    global metric
    if "metric" not in globals():
        metric = load_metric("sacrebleu")

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    # Generated length = number of non-padding token ids in each prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
# Build the trainer: fine-tune on the 'train' split, evaluate on 'valid',
# and score evaluation output with compute_metrics.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["valid"],
)

In [None]:
# Finetune model!
# Training updates `model` in place; evaluation on the 'valid' split is
# scheduled through EVAL_STRATEGY in the training arguments.
# NOTE(review): the evaluation section below loads 'checkpoint-500' from this
# run's output directory — presumably written by the trainer's default save
# schedule; confirm the directory exists once training finishes.
trainer.train()

### Evaluate model

In [None]:
# Path of the saved checkpoint to evaluate. The directory name matches the
# output directory given to Seq2SeqTrainingArguments above, under /content.
# NOTE(review): the step number (500) is hard-coded — verify it matches a
# checkpoint actually produced by the training run.
FINETUNED_MODEL = '/content/opus-mt-kl-en-finetuned-kl-to-en/checkpoint-500'
# Reload the fine-tuned weights from disk as a MarianMTModel.
finetuned_model = MarianMTModel.from_pretrained(FINETUNED_MODEL)

In [None]:
# Read the evaluation file line by line. The context manager closes the file
# handle, and the explicit encoding avoids depending on the platform default.
with open('/content/drive/MyDrive/ling28_final_proj/test_set.txt', 'r', encoding='utf-8') as test_file:
    test_set = test_file.readlines()

In [None]:
# Group the lines of the test file into blocks separated by blank lines.
# Each block is a list of consecutive non-blank lines (newline kept).
test_tuples = []
tup = []
for sent in test_set:
  # A whitespace-only line (including '\r\n') also counts as a separator.
  if not sent.strip():
    test_tuples.append(tup)
    tup = []
  else:
    tup.append(sent)
# Flush the final block: without this, the last example is silently dropped
# whenever the file does not end with a blank line.
if tup:
  test_tuples.append(tup)
  tup = []

In [None]:
# Keep only blocks that contain at least two lines, then take the first line
# of each block as the Kalaallisut source and the second as the English gold.
_complete_blocks = [block for block in test_tuples if len(block) > 1]
kl_sents = [block[0] for block in _complete_blocks]
en_sents_gold = [block[1] for block in _complete_blocks]

In [None]:
len(en_sents_gold)==len(kl_sents)

True

In [None]:
# Put the fine-tuned model on the device selected earlier (GPU when available,
# otherwise CPU) instead of unconditionally calling .cuda().
finetuned_model.to(device)

In [None]:
def translateKLtoEN(sentences,
                    tokenizer,
                    model,
                    device):
    """Translate a batch of sentences with the given tokenizer and model.

    Args:
        sentences: list of source-language strings.
        tokenizer: tokenizer used both to encode inputs and decode outputs.
        model: seq2seq model exposing `generate`; must already be on `device`.
        device: torch device the encoded inputs are moved to.

    Returns:
        List of decoded translations, one per input sentence. An empty input
        yields an empty list without touching the tokenizer or the model.
    """
    # Guard against an empty batch (e.g. a trailing slice past the end of the
    # data), which the tokenizer cannot encode.
    if not sentences:
        return []

    tokenized = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
    tokenized = tokenized.to(device)
    translated = model.generate(**tokenized)
    decoded = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    return decoded

In [None]:
NUM_SENTS = len(kl_sents)
BATCH_SIZE = 8
# Ceiling division: `NUM_SENTS // BATCH_SIZE + 1` produced an extra, empty
# batch whenever NUM_SENTS was an exact multiple of BATCH_SIZE.
NUM_BATCHES = (NUM_SENTS + BATCH_SIZE - 1) // BATCH_SIZE
print(f'We will translate {NUM_BATCHES} batches of size {BATCH_SIZE}')

We will translate 615 batches of size 8


In [None]:
# Translate the test sentences batch by batch and save the predictions.
transl_kl_sents = []
start = time.time()
for i in range(NUM_BATCHES):
    batch = kl_sents[i*BATCH_SIZE : (i+1)*BATCH_SIZE]
    # Skip an empty trailing slice rather than passing it to the tokenizer.
    if batch:
        # Evaluate the checkpoint loaded in the "Evaluate model" section.
        # The previous code passed `model`, leaving `finetuned_model` unused
        # and making the result depend on whatever `model` held in the kernel.
        transl_kl_sents.extend(translateKLtoEN(batch,
                                               tokenizer,
                                               finetuned_model,
                                               device))
    print("Completed batch {:} of {:}".format(i+1, NUM_BATCHES))
end = time.time()
print("Time taken: {:.3f}".format(end-start))

# One prediction per line. `writelines` adds no separators, so the decoded
# sentences (which carry no trailing newline) used to end up on a single line.
with open('/content/drive/MyDrive/ling28_final_proj/en_preds.txt', 'w', encoding='utf-8') as preds_file:
    preds_file.writelines(sent + '\n' for sent in transl_kl_sents)

In [None]:
def getBLEU(s1, s2):
  '''
  Sentence-level BLEU (scaled to 0-100) of hypothesis `s2` against the single
  reference `s1`. Both strings are split on whitespace; NLTK smoothing
  method 4 is applied.
  '''
  reference_tokens = s1.split()
  hypothesis_tokens = s2.split()
  smoother = SmoothingFunction().method4
  score = sentence_bleu([reference_tokens],
                        hypothesis_tokens,
                        smoothing_function=smoother)
  return score*100

In [None]:
# Score every (gold, prediction) pair. Pairs whose BLEU computation fails are
# still skipped (best effort), but they are now counted and reported instead
# of vanishing silently, and KeyboardInterrupt is no longer swallowed.
bleu_scores = []
skipped_pairs = 0
for s1,s2 in zip(en_sents_gold, transl_kl_sents):
  try:
    bleu_scores.append(getBLEU(s1, s2))
  except Exception as err:
    skipped_pairs += 1
    if skipped_pairs == 1:
      # Show the first failure so the cause is visible without flooding output.
      print(f'First skipped pair failed with {type(err).__name__}: {err}')
if skipped_pairs:
  print(f'Skipped {skipped_pairs} sentence pair(s) that could not be scored')

In [None]:
# Mean of the sentence-level BLEU scores; guard against an empty list so the
# cell reports the problem instead of raising ZeroDivisionError.
if bleu_scores:
  print(f'Average BLEU score: {sum(bleu_scores) / len(bleu_scores)}')
else:
  print('Average BLEU score: n/a (no sentence pairs were scored)')

Average BLEU score: 27.753683440550986
