In [None]:
# Fine-Tuning MarianMT on Custom Data

# datasets for loading and preprocessing data

# transformers for the MarianMT model and training

# Trainer API for training loop management





In [None]:
# Install dependencies. %pip installs into the environment of the running
# kernel, whereas !pip can resolve to a different Python.
# evaluate and torch are imported by the training cell, and the Trainer API
# needs accelerate, so they are installed here alongside the original packages.
%pip install --quiet transformers datasets evaluate accelerate sentencepiece sacrebleu torch

In [None]:
# Prepare the dataset: a tiny English -> French parallel corpus. Each record
# holds a "translation" dict keyed by language code, which is the layout the
# training cell reads.
import json

sample_pairs = [
  { "translation": { "en": "Hello, how are you?", "fr": "Bonjour, comment ça va ?" }},
  { "translation": { "en": "Good morning", "fr": "Bon matin" }}
]

# Write the file from code instead of asking the reader to save it by hand, so
# the notebook still works after Restart Kernel -> Run All.
# ensure_ascii=False keeps accented characters (e.g. "ç") readable in the file.
with open("en_fr_dataset.json", "w", encoding="utf-8") as dataset_file:
    json.dump(sample_pairs, dataset_file, ensure_ascii=False, indent=2)

In [None]:

# Save this cell as fine_tune_marianmt.py - it fine-tunes the model for language translation

import inspect

import evaluate
import numpy as np
import torch
from datasets import load_dataset, Dataset
from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# Language pair to fine-tune (source -> target)
SRC_LANG, TGT_LANG = "en", "fr"
# The checkpoint name is derived from the language pair
MODEL_NAME = f"Helsinki-NLP/opus-mt-{SRC_LANG}-{TGT_LANG}"

# Pretrained model and its matching tokenizer
model = MarianMTModel.from_pretrained(MODEL_NAME)
tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)

# Parallel corpus; each record is expected to carry a 'translation' entry
# holding the source and target texts
raw_data = load_dataset(
    "json",
    data_files="en_fr_dataset.json",
    split="train",
)

# Tokenization function
def preprocess_function(example):
    """Tokenize translation pairs for seq2seq training.

    The function is applied through ``Dataset.map(..., batched=True)``, so
    ``example`` is normally a batch in which ``example["translation"]`` is a
    list of ``{language_code: text}`` dicts. A single un-batched record (one
    dict) is accepted as well.

    Returns the tokenizer output for the source texts, with the token ids of
    the target texts stored under ``"labels"``.
    """
    pairs = example["translation"]
    if isinstance(pairs, dict):
        # Un-batched call: one record, plain strings.
        inputs, targets = pairs[SRC_LANG], pairs[TGT_LANG]
    else:
        # Batched call: indexing the list with a language code would raise
        # TypeError, so pull the texts out pair by pair.
        inputs = [pair[SRC_LANG] for pair in pairs]
        targets = [pair[TGT_LANG] for pair in pairs]

    # text_target tokenizes the targets in target-language mode and stores
    # their ids under "labels"; it replaces the deprecated
    # as_target_tokenizer() context manager.
    # No padding here: the DataCollatorForSeq2Seq given to the trainer pads
    # each batch and fills label padding with -100 so the loss ignores it.
    # Padding to max_length at this point would put real pad-token ids into
    # the labels and the model would be trained to predict padding.
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

# Tokenize the whole corpus, handing records to preprocess_function in batches
tokenized_data = raw_data.map(function=preprocess_function, batched=True)

# Collator that assembles seq2seq batches for this tokenizer/model pair
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# SacreBLEU scorer used by compute_metrics
bleu = evaluate.load(path="sacrebleu")

def compute_metrics(eval_pred):
    """Score generated translations against the references with SacreBLEU.

    ``eval_pred`` unpacks into ``(predictions, labels)``, both holding token
    ids. Returns the dict produced by ``bleu.compute``.
    """
    preds, labels = eval_pred
    # Predictions may arrive as a tuple with the generated ids first.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and is not a valid token id, so it has to be
    # replaced by the pad token before decoding. Predictions can contain it as
    # well (e.g. when batches of different lengths are padded together).
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Surrounding whitespace is not part of the translation and should not
    # influence the score. SacreBLEU expects a list of references per prediction.
    predictions = [text.strip() for text in decoded_preds]
    references = [[text.strip()] for text in decoded_labels]
    return bleu.compute(predictions=predictions, references=references)

# Directory that receives the checkpoints and the final model
OUTPUT_DIR = "./marianmt_en_fr_finetuned"


def _accepts(callable_obj, param_name):
    """Return True if `callable_obj` takes a parameter named `param_name`."""
    return param_name in inspect.signature(callable_obj).parameters


# Two keyword arguments used below were renamed in newer transformers releases
# (evaluation_strategy -> eval_strategy, tokenizer -> processing_class) and the
# install cell does not pin a version, so use whichever name this install accepts.
eval_strategy_key = (
    "eval_strategy"
    if _accepts(Seq2SeqTrainingArguments, "eval_strategy")
    else "evaluation_strategy"
)
tokenizer_key = (
    "processing_class"
    if _accepts(Seq2SeqTrainer, "processing_class")
    else "tokenizer"
)

# Training arguments: evaluate and checkpoint once per epoch, keep at most two
# checkpoints, and generate translations during evaluation so compute_metrics
# receives token ids it can decode.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    **{eval_strategy_key: "epoch"},
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    eval_dataset=tokenized_data,  # replace with validation set if available
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_key: tokenizer},
)

# Start training
trainer.train()

# Save final model
trainer.save_model(OUTPUT_DIR)


In [None]:
# Run the script. The leading "!" hands the line to the shell; without it the
# Python kernel tries to parse the line itself and raises a SyntaxError.
!python fine_tune_marianmt.py



In [None]:
# model for inference

from transformers import pipeline

# Load both the weights and the tokenizer from the fine-tuned output directory.
# The training script passes its tokenizer to the trainer, so the tokenizer
# files are saved next to the model. This keeps the cell independent of
# MODEL_NAME, which is undefined here when training ran as a separate script
# or the kernel was restarted.
MODEL_DIR = "./marianmt_en_fr_finetuned"
translator = pipeline("translation", model=MODEL_DIR, tokenizer=MODEL_DIR)
result = translator("How can I help you?", max_length=128)
print(result[0]["translation_text"])


