In [None]:
from transformers import (
    DataCollatorForSeq2Seq,
    # HfArgumentParser,
    # M2M100Tokenizer,
    # M2M100Config,
    # M2M100Model,
    # M2M100ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    # default_data_collator,
    # set_seed,
)
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import load_dataset, load_metric
import numpy as np

In [None]:
# Load the pretrained NLLB checkpoint: its config, tokenizer and seq2seq model.
MODEL_NAME = "facebook/nllb-200-distilled-600M"

config = AutoConfig.from_pretrained(MODEL_NAME)
# NLLB identifies languages by FLORES-200 codes ("eng_Latn", "kin_Latn"), not
# ISO 639-1 codes such as "en"/"rw", and the tokenizer keyword for the target
# language is `tgt_lang` (a `trg_lang` keyword is not a tokenizer argument).
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    src_lang="eng_Latn",
    tgt_lang="kin_Latn",
)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [None]:
# --- Language pair (FLORES-200 codes used by NLLB) ---
src_lang = "eng_Latn"
trg_lang = "kin_Latn"
forced_bos_token = trg_lang

# Apply the language pair to the tokenizer explicitly so that source and target
# text are tagged with the codes above regardless of how the tokenizer was
# constructed.
tokenizer.src_lang = src_lang
tokenizer.tgt_lang = trg_lang

# Force generation to start with the target-language token.
# `convert_tokens_to_ids` is used instead of the deprecated
# `tokenizer.lang_code_to_id` mapping.
forced_bos_token_id = tokenizer.convert_tokens_to_ids(forced_bos_token)
model.config.forced_bos_token_id = forced_bos_token_id
# NOTE(review): on transformers versions that expose `model.generation_config`,
# generate() is expected to read from it; mirror the setting there when present.
generation_config = getattr(model, "generation_config", None)
if generation_config is not None:
    generation_config.forced_bos_token_id = forced_bos_token_id

# --- Tokenization settings ---
max_source_length = max_target_length = 128
padding = "max_length"
truncation = True

# --- Training settings ---
epochs = 30
batch_size = 10


In [None]:
# Locations of the tab-separated train / validation / test files.
train_dir = '/home/kabanda/data/train/en_kin_train.tsv'
val_dir = '/home/kabanda/data/val/en_kin_val.tsv'
test_dir = '/home/kabanda/data/test/en_kin_test.tsv'
# Alternative when the files sit next to the notebook:
# train_dir = 'en_kin_train.tsv'
# val_dir = 'en_kin_val.tsv'
# test_dir = 'en_kin_test.tsv'

# Map each split name to its file, then read all three with the CSV loader
# (tab separator, cp1252 text encoding).
file_dict = dict(train=train_dir, validation=val_dir, test=test_dir)
dataset = load_dataset(
    'csv',
    data_files=file_dict,
    sep="\t",
    encoding="cp1252",
)


In [None]:
def preprocess(data):
    """Tokenize one batch of examples into model inputs and labels.

    The column at index 1 of the global ``column_names`` is tokenized as the
    source text and the column at index 0 as the target text (expected order:
    ['kin', 'en']). Both sides use the global ``padding`` / ``truncation``
    settings and their respective maximum lengths.

    Returns the source encoding with an added ``"labels"`` entry holding the
    target token ids, where every pad token id has been replaced by -100.
    """
    # column_names: ['kin','en']
    target_column, source_column = column_names[0], column_names[1]
    shared_kwargs = dict(padding=padding, truncation=truncation)

    model_inputs = tokenizer(
        data[source_column], max_length=max_source_length, **shared_kwargs
    )
    # Target text is encoded in the tokenizer's target mode.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(
            data[target_column], max_length=max_target_length, **shared_kwargs
        )

    # Swap padding positions for -100 in the labels.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in encoded_targets["input_ids"]
    ]
    return model_inputs

In [None]:
# Column names of the raw data; used by `preprocess` and when dropping the raw
# text columns after tokenization.
column_names = dataset['train'].column_names

# Shuffle every split with the same fixed seed.
train_dataset, test_dataset, val_dataset = (
    dataset[split_name].shuffle(seed=10)
    for split_name in ('train', 'test', 'validation')
)

In [None]:
# Tokenize each split in batches and drop the raw text columns, so that only
# the fields returned by `preprocess` remain.
train_dataset, val_dataset, test_dataset = (
    split.map(
        preprocess,
        batched=True,
        remove_columns=column_names,
        desc=f"tokenizer {label} dataset",
    )
    for label, split in (
        ("train", train_dataset),
        ("val", val_dataset),
        ("test", test_dataset),
    )
)

In [None]:
# Batch collator for seq2seq training.
# With padding="max_length" and no explicit `max_length`, the tokenizer pads to
# its own maximum model input length instead of the length used during
# preprocessing, which inflates every batch. Pin it to the preprocessing length
# (max_source_length == max_target_length in this notebook).
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=padding,
    max_length=max_source_length,
    label_pad_token_id=-100,  # same fill value `preprocess` uses for label padding
)

# SacreBLEU metric object used by `metrics_calc`.
metric = load_metric("sacrebleu")

In [None]:
def metrics_calc(data):
    """Compute BLEU and mean generated length for one evaluation pass.

    Note: a later cell defines another ``metrics_calc`` that reports additional
    metrics; whichever definition is executed last is the one in effect.

    Args:
        data: A ``(predictions, labels)`` pair of token-id arrays. Predictions
            may also arrive as a tuple whose first element holds the ids.

    Returns:
        dict with ``"bleu"`` (SacreBLEU score) and ``"gen_len"`` (mean number
        of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, true_labels = data
    # Some model outputs are tuples; the generated ids are the first element.
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a valid token id and cannot be decoded; map it back to the
    # pad id in both predictions and labels before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    true_labels = np.where(true_labels != -100, true_labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(true_labels, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    # The metric takes a list of references per prediction.
    decoded_labels = [[label.strip()] for label in decoded_labels]

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": bleu["score"]}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}



In [None]:
# Metric objects loaded on demand by `_get_metric`, keyed by metric name.
_metric_cache = {}


def _get_metric(name):
    """Load the metric called ``name`` once and reuse it on later calls."""
    if name not in _metric_cache:
        _metric_cache[name] = load_metric(name)
    return _metric_cache[name]


def metrics_calc(data):
    """Compute BLEU, spBLEU, TER, chrF++ and mean generated length.

    Args:
        data: A ``(predictions, labels)`` pair of token-id arrays. Predictions
            may also arrive as a tuple whose first element holds the ids.

    Returns:
        dict with keys ``"bleu"``, ``"spbleu"``, ``"ter"``, ``"chrf++"`` and
        ``"gen_len"`` (mean number of non-pad tokens per prediction), all
        rounded to 4 decimals.
    """
    preds, true_labels = data
    # Some model outputs are tuples; the generated ids are the first element.
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a valid token id and cannot be decoded; map it back to the
    # pad id in both predictions and labels before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    true_labels = np.where(true_labels != -100, true_labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(true_labels, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    # The metrics take a list of references per prediction.
    decoded_labels = [[label.strip()] for label in decoded_labels]

    bleu_result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    # Same SacreBLEU metric, scored with its 'spm' tokenizer option.
    spm_result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, tokenize='spm'
    )
    # The extra metrics are cached so they are not reloaded on every evaluation.
    chrf_result = _get_metric("chrf").compute(
        predictions=decoded_preds, references=decoded_labels, word_order=2
    )
    ter_result = _get_metric("ter").compute(
        predictions=decoded_preds, references=decoded_labels
    )

    result = {
        "bleu": bleu_result["score"],
        "spbleu": spm_result['score'],
        'ter': ter_result['score'],
        'chrf++': chrf_result['score'],
    }
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}


In [None]:
# Training configuration for the Seq2SeqTrainer.
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and outputs are written.
    output_dir='nllb_results_trial1',
    # Which phases are enabled.
    do_train=True,
    do_eval=True,
    do_predict=True,
    # Optimisation schedule and batch sizes (values come from the config cell).
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Step-based evaluation and checkpointing.
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=10000,
    load_best_model_at_end=True,
    # Evaluate on generated sequences so text metrics can be computed.
    predict_with_generate=True,
)

In [None]:
# Build the trainer. The training arguments request step-based evaluation and
# loading the best model at the end, both of which need an evaluation dataset,
# so the tokenized validation split is passed as `eval_dataset`.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=metrics_calc,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Start fine-tuning using the trainer built in the previous cell.
trainer.train()

In [None]:
# Seq2SeqTrainer has no `test()` method; held-out evaluation is done with
# `predict()`, which returns the predictions together with the metrics
# produced by `metrics_calc`.
test_results = trainer.predict(test_dataset, metric_key_prefix="test")
test_results.metrics