In [1]:
# Install necessary libraries for the project
!pip install -q transformers datasets torch sentencepiece tqdm

In [2]:
# Import all required libraries
import torch
from datasets import load_dataset
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from tqdm.auto import tqdm

# Configure tqdm to show progress bars
tqdm.pandas()

print(" All libraries imported and configured.")

 All libraries imported and configured.


In [3]:
# --- Define a placeholder translation function ---
def pseudo_translate_to_hindi(text_batch):
    """
    This is a dummy translation function that works on batches of text.
    It simulates translation by adding a Hindi prefix.
    """
    return [f"हिन्दी: {text}" for text in text_batch]


# --- Load the dataset from Hugging Face Hub ---

print("Loading a sample of the XSum dataset...")
dataset = load_dataset("xsum", split="train[:200]", trust_remote_code=True)
print(f"Dataset loaded with {len(dataset)} examples.")


# --- Create the cross-lingual column ---
# We use .map() to apply our pseudo-translation to each summary.
# The 'batched=True' argument processes multiple rows at once for speed.
print("\nCreating synthetic Hindi summaries (this will show a progress bar)...")
dataset = dataset.map(
    lambda batch: {'hindi_summary': pseudo_translate_to_hindi(batch['summary'])},
    batched=True,
    batch_size=16  # Process in batches of 16
)

print("\n Step 1 complete. Dataset prepared.")
print("\n--- Let's inspect one example to confirm: ---")
print(f"ENGLISH DOCUMENT: {dataset[0]['document'][:250]}...")
print(f"HINDI SUMMARY: {dataset[0]['hindi_summary']}")

Loading a sample of the XSum dataset...
Dataset loaded with 200 examples.

Creating synthetic Hindi summaries (this will show a progress bar)...

 Step 1 complete. Dataset prepared.

--- Let's inspect one example to confirm: ---
ENGLISH DOCUMENT: The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disru...
HINDI SUMMARY: हिन्दी: Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway after flooding caused by Storm Frank.


In [4]:
print("Loading the mBART model and tokenizer...")

# Define the model name from Hugging Face
model_name = "facebook/mbart-large-50-one-to-many-mmt"

# Load the pre-trained model using safetensors to avoid the error
model = MBartForConditionalGeneration.from_pretrained(
    model_name,
    use_safetensors=True
)

# Load the tokenizer with specified source and target languages
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="en_XX", tgt_lang="hi_IN")

print(" Step 2 complete. Model and tokenizer are ready.")

Loading the mBART model and tokenizer...
 Step 2 complete. Model and tokenizer are ready.


In [5]:
# Define the function to tokenize our data
def preprocess_function(examples):
    # Tokenize the English documents (the inputs to the model)
    model_inputs = tokenizer(examples['document'], max_length=512, truncation=True)

    # Tokenize the Hindi summaries (the labels/targets for the model)
    # The 'as_target_tokenizer' context manager ensures the tokenizer is set up for the target language.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['hindi_summary'], max_length=128, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

print("Applying tokenizer to the entire dataset (this will show a progress bar)...")
# Use .map() to apply the preprocessing function to all examples
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names # Remove old text columns to save memory
)

print("\n Step 3 complete. Data is now tokenized and ready for training.")
print("\n--- Let's inspect the tokenized data: ---")
print(tokenized_dataset)

Applying tokenizer to the entire dataset (this will show a progress bar)...

 Step 3 complete. Data is now tokenized and ready for training.

--- Let's inspect the tokenized data: ---
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 200
})


In [6]:
import transformers
print(transformers.__version__)

4.54.0


In [9]:
# The Data Collator intelligently pads batches of data to the same length.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define the training arguments (move predict_with_generate here)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=20,
    save_total_limit=2,
    predict_with_generate=True,  # <-- Move this here
)

# Initialize the Trainer (remove predict_with_generate from here)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# --- Start the training ---
print("\n Starting model fine-tuning... The Trainer will show a detailed progress bar.")
trainer.train()

print("\n Step 4 complete. Model has been fine-tuned!")

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'predict_with_generate'

In [8]:
# --- Save the final model and tokenizer ---
final_model_path = "./my_finetuned_news_summarizer"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
print(f"Model and tokenizer saved to '{final_model_path}'")


# --- Test the new model with the pipeline ---
from transformers import pipeline

print("\nLoading the fine-tuned model for inference...")
# Load the fine-tuned model using the pipeline for easy inference
summarizer_pipe = pipeline(
    "summarization",
    model=final_model_path,
    tokenizer=final_model_path,
    src_lang="en_XX",
    tgt_lang="hi_IN",
    device=0 if torch.cuda.is_available() else -1 # Use GPU if available
)

# Our example article
english_article_text = """
The Indian Space Research Organisation (ISRO) is set to launch its third lunar mission, Chandrayaan-3, aiming for a soft landing on the moon's surface. A successful landing would make India the fourth country in the world to achieve this feat, marking a major milestone for its space program.
"""

print("\n--- Generating summary with the fine-tuned model: ---")
result = summarizer_pipe(english_article_text, max_length=60)

print(f"\nOriginal Article:\n{english_article_text}")
print(f"\nFine-Tuned Hindi Summary:\n{result[0]['summary_text']}")

print("\n Step 5 complete. Project baseline is now fine-tuned and tested!")

NameError: name 'trainer' is not defined