In [None]:
# Fine-Tuning DistilBART for Lightweight Translation or Summarization



In [None]:
!pip install transformers datasets sentencepiece sacrebleu --quiet

In [None]:
# Prepare your data (en_fr_dataset.json)
# Running this cell writes the example pairs to en_fr_dataset.json, the file
# that the fine-tuning cell loads, so the notebook works with "Run All".
import json

examples = [
  { "source": "Hello, how are you?", "target": "Bonjour, comment ça va ?" },
  { "source": "Good morning", "target": "Bon matin" }
]

# ensure_ascii=False keeps accented characters (e.g. "ç") readable in the file.
with open("en_fr_dataset.json", "w", encoding="utf-8") as dataset_file:
    json.dump(examples, dataset_file, ensure_ascii=False, indent=2)

In [None]:
# Step 3: Fine-tuning script (fine_tune_distilbart.py)
# save as fine_tune_distilbart.py

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
import numpy as np
import evaluate
import torch

# Config
model_checkpoint = "sshleifer/distilbart-cnn-12-6"  # Pretrained DistilBART
source_column = "source"  # dataset field holding the input text
target_column = "target"  # dataset field holding the reference output text

# Load model & tokenizer
# transformers ships no DistilBart* classes: DistilBART checkpoints use the BART
# architecture, so the Auto* factories resolve the right tokenizer and model
# classes from the checkpoint's own configuration.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Load the source/target pairs from the local JSON file
dataset = load_dataset(
    "json",
    data_files="en_fr_dataset.json",
    split="train",
)

# Tokenization
def preprocess(example):
    """Tokenize a batch of source/target text pairs for seq2seq training.

    Args:
        example: A batch of rows (this function is mapped with
            ``batched=True``); the ``source_column`` and ``target_column``
            entries hold the input and reference texts.

    Returns:
        The tokenizer encoding of the source texts with a ``labels`` entry
        holding the token ids of the target texts.
    """
    # No padding here: padding the targets to max_length would leave pad-token
    # ids in `labels`, and the loss would then be computed on padding. Leaving
    # sequences unpadded lets DataCollatorForSeq2Seq pad each batch and fill
    # the label padding with -100, which the loss ignores.
    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager and stores the target ids under "labels".
    model_inputs = tokenizer(
        example[source_column],
        text_target=example[target_column],
        max_length=128,
        truncation=True,
    )
    return model_inputs

# Apply the tokenization function to the dataset, one batch of rows at a time
tokenized_dataset = dataset.map(function=preprocess, batched=True)

# Batch collator for seq2seq training
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Evaluation metric: SacreBLEU
bleu = evaluate.load(path="sacrebleu")

def compute_metrics(eval_pred):
    """Score generated sequences against the references with SacreBLEU.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays.

    Returns:
        The result of ``bleu.compute`` for the decoded predictions and
        references.
    """
    preds, labels = eval_pred
    # Predictions can arrive as a tuple whose first item holds the token ids.
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 marks ignored positions and is not a valid token id; predictions can
    # be padded with it as well as labels, so map it to the pad token in both
    # arrays before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # SacreBLEU expects a list of references for every prediction.
    return bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])

# Training arguments
import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old name is no longer accepted there; pass whichever keyword
# the installed version declares.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./distilbart-finetuned-en-fr",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,  # evaluate on generated sequences
    learning_rate=5e-5,
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA GPU is present
    **{_eval_strategy_arg: "epoch"},
)

# Trainer
import inspect

# Newer transformers releases take the tokenizer through `processing_class`;
# the `tokenizer` keyword is deprecated there. Pass whichever keyword the
# installed version declares.
_tokenizer_arg = (
    "processing_class"
    if "processing_class" in inspect.signature(Seq2SeqTrainer.__init__).parameters
    else "tokenizer"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # NOTE: evaluation runs on the same data used for training, so the reported
    # score does not measure generalization.
    eval_dataset=tokenized_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_arg: tokenizer},
)

# Run the fine-tuning loop
trainer.train()

# Write the fine-tuned model to disk
trainer.save_model(output_dir="./distilbart-finetuned-en-fr")




In [None]:
# Inference example: run one sentence through the saved fine-tuned model

from transformers import pipeline

translator = pipeline(
    task="translation",
    model="./distilbart-finetuned-en-fr",
    tokenizer="sshleifer/distilbart-cnn-12-6",
)
result = translator("How can I help you today?")
first_translation = result[0]
print(first_translation["translation_text"])


In [None]:
# Further enhancements to explore
#
# 1. Use datasets.Dataset.train_test_split() to hold out a validation set instead of evaluating on the training data.
#
# 2. Reduce max_length or the batch size on low-RAM machines.
#
# 3. Replace DistilBART with another distilled encoder-decoder model suited to your language pair or summarization task.