<a href="https://colab.research.google.com/github/BilalAhmed7072/M2M100FineTuned/blob/main/M2M100FineTuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from dataclasses import dataclass
from typing import Dict, Any, List

import torch
from datasets import load_dataset
from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
from peft import LoraConfig, get_peft_model

In [12]:
!pip install evaluate sacrebleu


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py

In [36]:

MODEL_NAME = "facebook/m2m100_418M"
CSV_PATH   = "/content/en_to_multi_dataset.csv"
SRC_LANG   = "en"
TGT_LANGS  = ["ur", "ar", "zh"]
MAX_LEN    = 128

In [3]:

tokenizer = M2M100Tokenizer.from_pretrained(MODEL_NAME)
model = M2M100ForConditionalGeneration.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

In [4]:

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)
model = get_peft_model(model, lora_config)

In [3]:
def preprocess(batch):
    src_texts = batch["src_text"]
    tgt_texts = batch["tgt_text"]
    tgt_langs = batch["tgt_lang"]

    # Encode sources
    tokenizer.src_lang = "en"
    inputs = tokenizer(
        src_texts,
        max_length=MAX_LEN,
        truncation=True,
        padding=False,
    )


    all_labels = []
    for tgt_text, lang in zip(tgt_texts, tgt_langs):
        tokenizer.tgt_lang = lang
        with tokenizer.as_target_tokenizer():
            label_ids = tokenizer(
                tgt_text,
                max_length=MAX_LEN,
                truncation=True,
                padding=False,
            )["input_ids"]
        all_labels.append(label_ids)

    inputs["labels"] = all_labels
    inputs["tgt_lang"] = tgt_langs
    return inputs


In [17]:
processed_train = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)
processed_dev   = dev.map(preprocess, batched=True, remove_columns=dev.column_names)

Map:   0%|          | 0/600 [00:00<?, ? examples/s]



Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [10]:

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [13]:

import evaluate
sacrebleu = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [p.strip() for p in preds]
    labels = [[l.strip()] for l in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # decode generated tokens
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # replace -100 in the labels as we can't decode them
    labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    bleu = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": bleu["score"]}

Downloading builder script: 0.00B [00:00, ?B/s]

In [15]:

train_args = Seq2SeqTrainingArguments(
    output_dir="m2m100_en2multi_lora",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=100,
    save_steps=500,
    logging_steps=50,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

In [18]:

trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    train_dataset=processed_train,
    eval_dataset=processed_dev,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Seq2SeqTrainer(


In [19]:
if __name__ == "__main__":
    trainer.train()
    # Save LoRA adapter only (small)
    trainer.save_model()
    tokenizer.save_pretrained(train_args.output_dir)


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Bleu
100,0.9361,0.68804,0.049409


In [21]:
!pip install huggingface_hub
from huggingface_hub import login

login()




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
trainer.save_model("./m2m100-finetuned-tech")
tokenizer.save_pretrained("./m2m100-finetuned-tech")


('./m2m100-finetuned-tech/tokenizer_config.json',
 './m2m100-finetuned-tech/special_tokens_map.json',
 'm2m100-finetuned-tech/vocab.json',
 'm2m100-finetuned-tech/sentencepiece.bpe.model',
 './m2m100-finetuned-tech/added_tokens.json')

In [24]:
model.push_to_hub("Bilal7072/m2m100-finetuned-tech")
tokenizer.push_to_hub("Bilal7072/m2m100-finetuned-tech")

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...uned-tech/adapter_model.safetensors:  12%|#1        |  547kB / 4.74MB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...etuned-tech/sentencepiece.bpe.model: 100%|##########| 2.42MB / 2.42MB            

CommitInfo(commit_url='https://huggingface.co/Bilal7072/m2m100-finetuned-tech/commit/1d42969669d47b08a07040ecd55930d4ea46ea8f', commit_message='Upload tokenizer', commit_description='', oid='1d42969669d47b08a07040ecd55930d4ea46ea8f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Bilal7072/m2m100-finetuned-tech', endpoint='https://huggingface.co', repo_type='model', repo_id='Bilal7072/m2m100-finetuned-tech'), pr_revision=None, pr_num=None)

In [31]:
from huggingface_hub import HfApi, HfFolder

In [2]:
repo_id = "Bilal7072/m2m100-finetuned-tech"

model_card = """# M2M100 Fine-tuned for Tech Translation (English → Urdu, Arabic, Chinese)

## Model Description
This model is a fine-tuned version of [facebook/m2m100_418M](https://huggingface.co/facebook/m2m100_418M).
It is optimized for translating IT, AI, and software documentation.

## Evaluation Results
| Step | Training Loss | Validation Loss | BLEU |
|------|---------------|-----------------|------|
| 100  | 0.9361        | 0.6880          | 0.0494 |
"""

with open("README.md", "w", encoding="utf-8") as f:
    f.write(model_card)


In [34]:
api = HfApi()
api.upload_file(
path_or_fileobj="README.md",
path_in_repo="README.md",
repo_id=repo_id,
repo_type="model"
)

- empty or missing yaml metadata in repo card


CommitInfo(commit_url='https://huggingface.co/Bilal7072/m2m100-finetuned-tech/commit/bc285bc73bc87b4d118f18a8e49aa0b3757160ec', commit_message='Upload README.md with huggingface_hub', commit_description='', oid='bc285bc73bc87b4d118f18a8e49aa0b3757160ec', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Bilal7072/m2m100-finetuned-tech', endpoint='https://huggingface.co', repo_type='model', repo_id='Bilal7072/m2m100-finetuned-tech'), pr_revision=None, pr_num=None)