In [None]:
# Mount Google Drive into the Colab runtime; the dataset CSV and the training
# output directory used later in this notebook both live under "My Drive".
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!pip install datasets==2.18.0 transformers[torch]==4.38.2 tensorboard



In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset, load_metric
import numpy as np
import torch
from huggingface_hub import notebook_login
import evaluate

## Load Model

In [None]:
model_name = "facebook/nllb-200-distilled-600M"
# lug_Latn (Luganda) already exists as an NLLB language code, so no new token is needed.
# The NLLB tokenizer's keyword for the target language is `tgt_lang`; the previous
# `target_lang=` spelling is not a tokenizer argument, so the target language was never set.
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="eng_Latn", tgt_lang="lug_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Quick sanity check: translate a single test sentence with the pretrained model before any fine-tuning.

In [None]:
# Single English sentence used as a smoke test of the pretrained model.
article = "All refugees were requested to register with the chairman."
# return_tensors="pt" so the encoding can be unpacked straight into model.generate() below.
inputs = tokenizer(article, return_tensors="pt")

In [None]:
# Force the first generated token to be the Luganda language code so the model
# translates into Luganda. The id is looked up with convert_tokens_to_ids because
# tokenizer.lang_code_to_id is deprecated and scheduled for removal.
translated_tokens = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.convert_tokens_to_ids("lug_Latn"),
    max_length=360,
)
# Decode the single generated sequence back to text; the bare expression is the cell output.
tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

the `lang_code_to_id` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder` this attribute will be removed in `transformers` v4.38


"Abanoonyi b'obubudamu bonna basabiddwa okwewandiisa mu ssentebe."

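# Freeze some layers to reduce the number of trainable parameters
- I will experiment with freezing the entire encoder, which is less task-specific than the decoder (in translation). Note that freezing does not shrink the model itself; it only reduces how many weights are updated during training.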

In [None]:
# Freeze every encoder parameter so its weights are not updated during fine-tuning.
model.model.encoder.requires_grad_(False)

# NOTE(review): the encoder's embed_tokens is presumably shared with the decoder
# embedding and lm_head (the checkpoint-reload message after training lists all
# three together), in which case freezing it here freezes them too — confirm
# that this is intended.
encoder_params = list(model.model.encoder.parameters())
assert all(not p.requires_grad for p in encoder_params), "encoder is not fully frozen"

# Print a compact summary instead of one line per parameter tensor.
trainable_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_count = sum(p.numel() for p in model.parameters())
print(f"Encoder parameter tensors frozen: {len(encoder_params)}")
print(f"Trainable parameters: {trainable_count:,} / {total_count:,} ({trainable_count / total_count:.1%})")

embed_tokens.weight is frozen
layers.0.self_attn.k_proj.weight is frozen
layers.0.self_attn.k_proj.bias is frozen
layers.0.self_attn.v_proj.weight is frozen
layers.0.self_attn.v_proj.bias is frozen
layers.0.self_attn.q_proj.weight is frozen
layers.0.self_attn.q_proj.bias is frozen
layers.0.self_attn.out_proj.weight is frozen
layers.0.self_attn.out_proj.bias is frozen
layers.0.self_attn_layer_norm.weight is frozen
layers.0.self_attn_layer_norm.bias is frozen
layers.0.fc1.weight is frozen
layers.0.fc1.bias is frozen
layers.0.fc2.weight is frozen
layers.0.fc2.bias is frozen
layers.0.final_layer_norm.weight is frozen
layers.0.final_layer_norm.bias is frozen
layers.1.self_attn.k_proj.weight is frozen
layers.1.self_attn.k_proj.bias is frozen
layers.1.self_attn.v_proj.weight is frozen
layers.1.self_attn.v_proj.bias is frozen
layers.1.self_attn.q_proj.weight is frozen
layers.1.self_attn.q_proj.bias is frozen
layers.1.self_attn.out_proj.weight is frozen
layers.1.self_attn.out_proj.bias is froze

## Load Data

In [None]:
# Load the English/Luganda sentence pairs from the mounted Drive folder.
# The path is relative, so this assumes the working directory is the one Drive was
# mounted under (/content on Colab) — TODO confirm if run elsewhere.
# NOTE(review): latin-1 decoding accepts any byte sequence, so it would also hide a
# mis-encoded file — confirm the CSV really is latin-1.
df = pd.read_csv("gdrive/My Drive/nllb-finetuning/Luganda.csv", encoding="latin-1")
# Quick look at the size, the column names, and the first few rows.
print(df.shape)
print(df.columns)
df.head()

(16000, 4)
Index(['English', 'Luganda', 'Unnamed: 2', 'Unnamed: 3'], dtype='object')


Unnamed: 0,English,Luganda,Unnamed: 2,Unnamed: 3
0,All refugees were requested to register with t...,Abanoonyiboobubudamu bonna baasabiddwa beewand...,,
1,They called for a refugees' meeting yesterday.,Baayise olukungaana lw'abanoonyiboobubudamu eg...,,
2,Refugees had misunderstandings between thems...,Abanoonyiboobubudamu b'abadde n'obutakkaanya w...,,
3,We were urged to welcome refugees into our com...,Twakubirizibwa okwaniriza abanoonyiboobubudamu...,,
4,More development is achieved when we work toge...,Bwe tukolera awamu enkulaakulana enyingi efuni...,,


In [None]:
# since df has 2 unnamed columns, let's remove them
df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
df.head()

Unnamed: 0,English,Luganda
0,All refugees were requested to register with t...,Abanoonyiboobubudamu bonna baasabiddwa beewand...
1,They called for a refugees' meeting yesterday.,Baayise olukungaana lw'abanoonyiboobubudamu eg...
2,Refugees had misunderstandings between thems...,Abanoonyiboobubudamu b'abadde n'obutakkaanya w...
3,We were urged to welcome refugees into our com...,Twakubirizibwa okwaniriza abanoonyiboobubudamu...
4,More development is achieved when we work toge...,Bwe tukolera awamu enkulaakulana enyingi efuni...


In [None]:
# Count missing values per column before deciding how to clean them.
print(df.isna().sum())

English    978
Luganda    979
dtype: int64


In [None]:
# Drop every sentence pair in which either side is missing; a pair with only one
# language cannot be used for translation training. Reassign rather than mutate in place.
df = df.dropna()
# Confirm that no missing values remain.
print(df.isna().sum())

English    0
Luganda    0
dtype: int64


## Training

In [None]:
def tokenize_and_create_dataset(tokenizer, df, src_lang="eng_Latn", tgt_lang="lug_Latn"):
    """Tokenize the English/Luganda pairs in ``df`` into a ``Dataset`` for seq2seq training.

    Args:
        tokenizer: The tokenizer loaded for the translation model.
        df: DataFrame with an "English" (source) and a "Luganda" (target) column.
        src_lang: Language code to set on the tokenizer for the source side.
            Pass ``None`` to keep whatever the tokenizer is already configured with.
        tgt_lang: Language code to set on the tokenizer for the target side.
            Pass ``None`` to keep whatever the tokenizer is already configured with.

    Returns:
        A ``Dataset`` with "input_ids", "attention_mask" and "labels" columns.
        Sequences are truncated but NOT padded; padding is left to the data
        collator used at training time.
    """
    # Set the language codes explicitly so source and target sequences get their
    # own language-code token. Previously the labels were encoded in source mode
    # and therefore started with the English code.
    if src_lang is not None:
        tokenizer.src_lang = src_lang
    if tgt_lang is not None:
        tokenizer.tgt_lang = tgt_lang

    # Cast to str so a stray numeric cell does not break tokenization.
    source_texts = df["English"].astype(str).tolist()
    target_texts = df["Luganda"].astype(str).tolist()

    # `text_target` makes the tokenizer encode the targets in target mode and return
    # them under "labels". No padding here: padding to the longest sentence in the
    # whole split wastes compute and leaves real pad ids in the labels, so the loss
    # is computed on padding as well. The seq2seq collator pads per batch instead.
    model_inputs = tokenizer(
        source_texts,
        text_target=target_texts,
        truncation=True,
    )

    return Dataset.from_dict({
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": model_inputs["labels"],
    })

In [None]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Each reference is wrapped in its own one-element list, because metrics such as
    BLEU accept several references per prediction.

    Returns:
        A ``(predictions, references)`` tuple: a list of strings and a list of
        single-item lists of strings.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_refs = []
    for reference in labels:
        wrapped_refs.append([reference.strip()])

    return cleaned_preds, wrapped_refs

_BLEU_METRIC = None  # loaded lazily on the first evaluation round, then reused


def _get_bleu_metric():
    """Load the BLEU metric once and reuse it for every evaluation round."""
    global _BLEU_METRIC
    if _BLEU_METRIC is None:
        _BLEU_METRIC = evaluate.load("bleu")
    return _BLEU_METRIC


def compute_metrics(eval_pred):
    """Compute corpus BLEU for one evaluation round.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays. With
            ``predict_with_generate=True`` the predictions are generated token
            ids, not logits.

    Returns:
        ``{"bleu": <score>}``.
    """
    predictions, labels = eval_pred
    # Some models return a tuple of outputs; the generated ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the "ignore" id used for label padding and cannot be decoded, so swap
    # it for the real pad id. Predictions get the same treatment because they may
    # also carry -100 where batches of different lengths were concatenated.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = _get_bleu_metric().compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["bleu"]}


In [None]:
# Hold out 20% of the sentence pairs for evaluation; the fixed random_state keeps
# the split identical across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Tokenize each split separately with the same tokenizer.
train_dataset = tokenize_and_create_dataset(tokenizer, train_df)
test_dataset = tokenize_and_create_dataset(tokenizer, test_df)
print('Data has been tokenized')

Data has been tokenized


In [None]:
# Inspect the dataset schema/row count and one tokenized example as a sanity check.
print(train_dataset)
print(train_dataset[0])

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 12016
})
{'input_ids': [256047, 64049, 69796, 2442, 205680, 108, 8334, 20362, 30330, 202, 17334, 8334, 107024, 462, 5346, 248075, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [256047, 1147, 2395, 8078, 161, 214656, 2651, 26346, 77, 1137, 93273, 211525, 94565, 41002, 133918, 183239, 60798, 23966, 1017, 115, 22978, 107540, 248075, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
# Number of training epochs passed to the trainer.
NUM_EPOCHS = 2
# Evaluation, checkpointing, and logging are all step-based. No explicit step
# interval is given, so the library defaults apply (the training log in this
# notebook shows an evaluation every 500 steps).
# NOTE(review): no metric_for_best_model is set, so "best" for
# load_best_model_at_end is presumably judged on validation loss rather than
# BLEU — confirm this is intended.
model_args = Seq2SeqTrainingArguments(
    output_dir="gdrive/My Drive/nllb-finetuning",  # same Drive folder the dataset CSV is read from
    weight_decay = 0.02,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,  # evaluate on generated sequences so compute_metrics can decode them to text
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    logging_strategy="steps",
    learning_rate=2e-4,
)

In [None]:
# Wire the model, training arguments, datasets, and BLEU metric into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=model_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # the held-out 20% split doubles as the evaluation set
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss,Bleu
500,4.894,4.488097,0.203021
1000,4.5071,4.056671,0.223967
1500,4.1587,3.795963,0.244555
2000,3.8485,3.64652,0.250823
2500,3.7106,3.549755,0.259737
3000,3.6423,3.514868,0.265577


Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=3004, training_loss=4.126187973111352, metrics={'train_runtime': 3962.8028, 'train_samples_per_second': 6.064, 'train_steps_per_second': 0.758, 'total_flos': 2746398751064064.0, 'train_loss': 4.126187973111352, 'epoch': 2.0})