In [1]:
! pip install datasets transformers sacrebleu torch sentencepiece transformers[sentencepiece]

Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m801.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-2.8.2 sacrebleu-2.4.2


In [2]:
import os
os.environ["WANDB_DISABLED"]="true"

In [3]:
import transformers
print(transformers.__version__)

4.39.3


# Fine-tuning a model on a translation

In [4]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-mr"

In [7]:
from datasets import load_dataset, load_metric
raw_datasets = load_dataset("Helsinki-NLP/opus-100", "en-mr")
metric = load_metric("sacrebleu")

Downloading readme:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 128k/128k [00:00<00:00, 439kB/s]
Downloading data: 100%|██████████| 1.58M/1.58M [00:00<00:00, 5.06MB/s]
Downloading data: 100%|██████████| 127k/127k [00:00<00:00, 502kB/s]


Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/27007 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

  metric = load_metric("sacrebleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [8]:
raw_datasets

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 27007
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

In [9]:
raw_datasets["train"][0]

{'translation': {'en': 'Ethiopian', 'mr': 'इथियोपिक@ item Calendar system'}}

In [11]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML
def show_random_elements(dataset, num_examples=5):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))
show_random_elements(raw_datasets["train"])

Unnamed: 0,translation
0,"{'en': 'This shouldn't happen. Please file a bug report at bugzilla.gnome.org describing how you can cause this message to appear.', 'mr': 'असे व्हायला नाही पाहिजे. कृपया bugzilla.gnome.org येथे त्रुटी अहवाल दाखल करा व हा संदेश कसे दर्शवायचे त्याचे वर्णन करा.'}"
1,"{'en': 'My wife isn't beautiful. Yours is.', 'mr': 'माझी पत्नी सुंदर नाहीये. तुमची आहे.'}"
2,"{'en': 'ALeastSignificant (LSB)', 'mr': 'ALeastSignificant (LSB)'}"
3,"{'en': 'Do you want to remove %1 too?', 'mr': 'तुम्हाला% 1 देखिल काढूण टाकायचे?'}"
4,"{'en': '& Enable smartcard support', 'mr': 'स्मार्टकार्ड समर्थन कार्यान्वीत करा (E)'}"


In [12]:
metric

Metric(name: "sacrebleu", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, usage: """
Produces BLEU scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions (`list` of `str`): list of translations to score. Each translation should be tokenized into a list of tokens.
    references (`list` of `list` of `str`): A list of lists of references. The contents of the first sub-list are the references for the first prediction, the contents of the second sub-list are for the second prediction, etc. Note that there must be the same number of references for each prediction (i.e. all sub-lists must be of the same length).
    smooth_method (`str`): The smoothing method to use, defaults to `'exp'`. Possible values are:
        - `'none'`: no smoothing
        - `'floor'`: increment zero counts
        - `'add-k'`: increment num/deno

In [13]:
fake_preds = ["hello there", "general kenobi"]
fake_labels = [["hello there"], ["general kenobi"]]
metric.compute(predictions=fake_preds, references=fake_labels)

{'score': 0.0,
 'counts': [4, 2, 0, 0],
 'totals': [4, 2, 0, 0],
 'precisions': [100.0, 100.0, 0.0, 0.0],
 'bp': 1.0,
 'sys_len': 4,
 'ref_len': 4}

# Preprocessing the data

In [14]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.17M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.31M [00:00<?, ?B/s]



In [15]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [16]:
tokenizer(["Hello, this one sentence!", "This is another sentence."])

{'input_ids': [[50289, 3, 67, 86, 6913, 70, 0], [235, 17, 374, 6913, 2, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [18]:
with tokenizer.as_target_tokenizer():
    print(tokenizer(["Hello, what are you doing?", "तुम्ही काय करत आहात?"]))

{'input_ids': [[2844, 10105, 1475, 3, 84, 43, 23, 75, 71, 9, 0], [94, 97, 213, 695, 9, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}




In [19]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "en"
target_lang = "mr"
def preprocess_function(examples):
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [20]:
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[11906, 0], [9453, 28499, 35953, 4575, 26666, 1475, 0]], 'attention_mask': [[1, 1], [1, 1, 1, 1, 1, 1, 1]], 'labels': [[53463, 4233, 6436, 54, 34525, 1054, 0], [9453, 4560, 30842, 49609, 7448, 206, 0]]}

In [21]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/27007 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

# Fine-tuning the model

In [22]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

2024-04-29 10:17:10.866220: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-29 10:17:10.866325: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-29 10:17:11.128578: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/305M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [23]:
batch_size = 16
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True    
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [24]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [25]:
import numpy as np

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [26]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [27]:
trainer.train()



Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,3.0094,2.396553,16.8914,9.677


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61673]], 'forced_eos_token_id': 0}


TrainOutput(global_step=844, training_loss=2.973104467889144, metrics={'train_runtime': 347.3281, 'train_samples_per_second': 77.756, 'train_steps_per_second': 2.43, 'total_flos': 195011996811264.0, 'train_loss': 2.973104467889144, 'epoch': 1.0})

Files that are created in directory after fine-tuning

In [30]:
import os
for dirname, _, filenames in os.walk('opus-mt-en-mr-finetuned-en-to-mr/checkpoint-500'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

opus-mt-en-mr-finetuned-en-to-mr/checkpoint-500/config.json
opus-mt-en-mr-finetuned-en-to-mr/checkpoint-500/vocab.json
opus-mt-en-mr-finetuned-en-to-mr/checkpoint-500/rng_state.pth
opus-mt-en-mr-finetuned-en-to-mr/checkpoint-500/generation_config.json
opus-mt-en-mr-finetuned-en-to-mr/checkpoint-500/trainer_state.json
opus-mt-en-mr-finetuned-en-to-mr/checkpoint-500/training_args.bin
opus-mt-en-mr-finetuned-en-to-mr/checkpoint-500/tokenizer_config.json
opus-mt-en-mr-finetuned-en-to-mr/checkpoint-500/special_tokens_map.json
opus-mt-en-mr-finetuned-en-to-mr/checkpoint-500/target.spm
opus-mt-en-mr-finetuned-en-to-mr/checkpoint-500/optimizer.pt
opus-mt-en-mr-finetuned-en-to-mr/checkpoint-500/model.safetensors
opus-mt-en-mr-finetuned-en-to-mr/checkpoint-500/source.spm
opus-mt-en-mr-finetuned-en-to-mr/checkpoint-500/scheduler.pt


# Predict sample text

In [41]:
from transformers import MarianMTModel, MarianTokenizer
src_text = ['Where are your daughter?']
model_name = 'opus-mt-en-mr-finetuned-en-to-mr/checkpoint-500'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
[tokenizer.decode(t, skip_special_tokens=True) for t in translated]

['तुझी मुलगी कुठे आहे?']