In [1]:
pip install transformers[torch] datasets sentencepiece rouge_score sacrebleu

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Collecting absl-py (from rouge_score)
  Using cached absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting portalocker (from sacrebleu)
  Using cached portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting tabulate>=0.8.9 (from sacrebleu)
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Downloading accelerate-1.10.1-py3-none-any.whl (374 kB)
Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Using cached absl_py-2.3.1-py3-none-any.whl (135 kB)
Using cached portalocker-3.2.0-py3-none-any.whl (22 kB)
Building wheels for collected 

  DEPRECATION: Building 'rouge_score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge_score'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [1]:
import torch

print(f"PyTorch version: {torch.__version__}")
print("--- GPU Information ---")
if torch.cuda.is_available():
    print("‚úÖ CUDA is available!")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print("‚ùå CUDA is not available. PyTorch is running on CPU.")

PyTorch version: 2.5.1+cu121
--- GPU Information ---
‚úÖ CUDA is available!
Number of GPUs: 1
GPU Name: NVIDIA RTX A5000


In [11]:
from datasets import DatasetDict, Dataset
import pandas as pd

# Load data using pandas and remove any empty rows
df = pd.read_csv('../Dataset/final_cleaned_dataset_CNN.csv').dropna().reset_index(drop=True)
raw_dataset = Dataset.from_pandas(df)

# Define the task prefixes for the model
PREFIX_ENG = "summarize English: "
PREFIX_HIN = "summarize Hindi: "

def format_dataset(batch):
    """Transforms a batch of data for the two summarization tasks."""
    inputs, targets = [], []
    
    # Create alternating examples for English and Hindi summarization
    for article, eng_summary, hin_summary in zip(
        batch['raw_news_article'],
        batch['english_summary'],
        batch['hindi_summary']
    ):
        if isinstance(article, str):
            # English Task
            inputs.append(PREFIX_ENG + article)
            targets.append(eng_summary)
            
            # Hindi Task
            inputs.append(PREFIX_HIN + article)
            targets.append(hin_summary)
            
    return {'inputs': inputs, 'targets': targets}

# Apply the formatting and create train/test splits
processed_dataset = raw_dataset.map(
    format_dataset,
    batched=True,
    remove_columns=raw_dataset.column_names
).flatten()

train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)

final_datasets = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

# Verify the structure and samples
print(final_datasets)

print("\n--- English Task Sample ---")
print("Input:", final_datasets['train'][0]['inputs'])
print("Target:", final_datasets['train'][0]['targets'])

print("\n--- Hindi Task Sample ---")
print("Input:", final_datasets['train'][1]['inputs'])
print("Target:", final_datasets['train'][1]['targets'])

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4919/4919 [00:00<00:00, 15757.47 examples/s]

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 8854
    })
    test: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 984
    })
})

--- English Task Sample ---
Input: summarize Hindi: (CNN)Canadian actor Jonathan Crombie, who co-starred in the "Anne of Green Gables" TV movies, died this week at age 48. Crombie died Wednesday from complications of a brain hemorrhage, "Anne of Green Gables" producer Kevin Sullivan said. "It's a real tragedy to see someone at age 48 go like that," he said. "I will remember him as someone who worked extremely hard to make the roles he played onscreen come to life." Based on Canadian author Lucy Maud Montgomery's children's books, "Anne of Green Gables" debuted in Canada on CBC TV in 1984 and became a cultural touchstone. The plot focused on the adventures of fiery orphan Anne Shirley, played by Megan Follows, who is sent to live on a farm in Prince Edward Island. Crombie played Gilbert Bly




Data Loading and Preparation

In [2]:
from datasets import DatasetDict, Dataset
import pandas as pd

# Load data using pandas and remove any empty rows
df = pd.read_csv('../Dataset/final_cleaned_dataset_CNN.csv').dropna().reset_index(drop=True)
raw_dataset = Dataset.from_pandas(df)

# Define the task prefixes for the model
PREFIX_ENG = "summarize English: "
PREFIX_HIN = "summarize Hindi: "

def format_dataset(batch):
    """Transforms a batch of data for the two summarization tasks."""
    inputs, targets = [], []
    
    # Create alternating examples for English and Hindi summarization
    for article, eng_summary, hin_summary in zip(
        batch['raw_news_article'],
        batch['english_summary'],
        batch['hindi_summary']
    ):
        if isinstance(article, str):
            # English Task
            inputs.append(PREFIX_ENG + article)
            targets.append(eng_summary)
            
            # Hindi Task
            inputs.append(PREFIX_HIN + article)
            targets.append(hin_summary)
            
    return {'inputs': inputs, 'targets': targets}

# Apply the formatting and create train/test splits
processed_dataset = raw_dataset.map(
    format_dataset,
    batched=True,
    remove_columns=raw_dataset.column_names
).flatten()

train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)

final_datasets = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

# --- NEW AND IMPROVED VERIFICATION ---
# Loop through the first 6 samples of the shuffled training set to see the mix of tasks.
print("--- Verifying the first 6 samples of the SHUFFLED training data ---")
for i in range(6):
    print(f"\n--- Sample {i+1} ---")
    print(final_datasets['train'][i]['inputs'])
    print("--------------------")

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4919/4919 [00:00<00:00, 23037.16 examples/s]

--- Verifying the first 6 samples of the SHUFFLED training data ---

--- Sample 1 ---
summarize Hindi: (CNN)Canadian actor Jonathan Crombie, who co-starred in the "Anne of Green Gables" TV movies, died this week at age 48. Crombie died Wednesday from complications of a brain hemorrhage, "Anne of Green Gables" producer Kevin Sullivan said. "It's a real tragedy to see someone at age 48 go like that," he said. "I will remember him as someone who worked extremely hard to make the roles he played onscreen come to life." Based on Canadian author Lucy Maud Montgomery's children's books, "Anne of Green Gables" debuted in Canada on CBC TV in 1984 and became a cultural touchstone. The plot focused on the adventures of fiery orphan Anne Shirley, played by Megan Follows, who is sent to live on a farm in Prince Edward Island. Crombie played Gilbert Blythe, who evolves over time from Anne's pigtail-tugging tormentor to friend to husband. Follows and Crombie reprised the roles in the sequels "Anne of




Tokenization

In [3]:
from transformers import MT5Tokenizer

MODEL_CHECKPOINT = "google/mt5-base"
tokenizer = MT5Tokenizer.from_pretrained(MODEL_CHECKPOINT)

MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 256

def tokenize_function(examples):
    """Converts text inputs and targets into token IDs."""
    model_inputs = tokenizer(
        examples['inputs'], 
        max_length=MAX_INPUT_LENGTH, 
        truncation=True, 
    )
    
    # Ensures labels are tokenized correctly for seq2seq tasks
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['targets'], 
            max_length=MAX_TARGET_LENGTH, 
            truncation=True,
        )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = final_datasets.map(tokenize_function, batched=True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8854/8854 [00:32<00:00, 270.83 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 984/984 [00:03<00:00, 276.40 examples/s]


Model Training

In [4]:
import numpy as np
import evaluate
from transformers import (
    MT5ForConditionalGeneration, 
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

model = MT5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)
rouge_metric = evaluate.load("rouge")

BATCH_SIZE = 4
MODEL_NAME = "mt5-base-cnn-summarizer-en-hi"

training_args = Seq2SeqTrainingArguments(
    output_dir=MODEL_NAME,
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    predict_with_generate=True,
    fp16=False, # <--- THIS IS THE FIX
    load_best_model_at_end=True,
    metric_for_best_model="rouge2",
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

def compute_metrics(eval_pred):
    """Calculates ROUGE scores for evaluation."""
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds = ["\n".join(pred.strip().split()) for pred in decoded_preds]
    decoded_labels = ["\n".join(label.strip().split()) for label in decoded_labels]
    
    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    
    return {k: round(v * 100, 4) for k, v in result.items()}

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()
trainer.save_model(f"{MODEL_NAME}/final_model")

  1%|          | 50/6642 [01:16<2:44:49,  1.50s/it]

{'loss': 18.8898, 'grad_norm': 25843.544921875, 'learning_rate': 5e-06, 'epoch': 0.02}


  2%|‚ñè         | 100/6642 [02:53<5:24:53,  2.98s/it]

{'loss': 17.3691, 'grad_norm': 24710.8671875, 'learning_rate': 1e-05, 'epoch': 0.05}


  2%|‚ñè         | 150/6642 [04:29<2:45:05,  1.53s/it]

{'loss': 15.1896, 'grad_norm': 1253.855712890625, 'learning_rate': 1.5e-05, 'epoch': 0.07}


  3%|‚ñé         | 200/6642 [05:47<2:38:21,  1.47s/it]

{'loss': 11.4705, 'grad_norm': 1275.7470703125, 'learning_rate': 2e-05, 'epoch': 0.09}


  4%|‚ñç         | 250/6642 [06:58<2:38:23,  1.49s/it]

{'loss': 7.257, 'grad_norm': 19.44481086730957, 'learning_rate': 2.5e-05, 'epoch': 0.11}


  5%|‚ñç         | 300/6642 [07:58<2:12:24,  1.25s/it]

{'loss': 4.3326, 'grad_norm': 11.529706954956055, 'learning_rate': 3e-05, 'epoch': 0.14}


  5%|‚ñå         | 350/6642 [09:01<1:42:27,  1.02it/s]

{'loss': 3.6727, 'grad_norm': 3.133930206298828, 'learning_rate': 3.5e-05, 'epoch': 0.16}


  6%|‚ñå         | 400/6642 [10:18<2:50:58,  1.64s/it]

{'loss': 3.3826, 'grad_norm': 3.2587127685546875, 'learning_rate': 4e-05, 'epoch': 0.18}


  7%|‚ñã         | 450/6642 [11:22<2:01:21,  1.18s/it]

{'loss': 3.4258, 'grad_norm': 2.94587779045105, 'learning_rate': 4.5e-05, 'epoch': 0.2}


  8%|‚ñä         | 500/6642 [12:22<2:09:01,  1.26s/it]

{'loss': 3.2653, 'grad_norm': 2.916131019592285, 'learning_rate': 5e-05, 'epoch': 0.23}


  8%|‚ñä         | 550/6642 [13:28<1:49:31,  1.08s/it]

{'loss': 3.1574, 'grad_norm': 2.276667833328247, 'learning_rate': 4.959296646043634e-05, 'epoch': 0.25}


  9%|‚ñâ         | 600/6642 [14:29<1:44:05,  1.03s/it]

{'loss': 3.1425, 'grad_norm': 3.2969565391540527, 'learning_rate': 4.9185932920872686e-05, 'epoch': 0.27}


 10%|‚ñâ         | 650/6642 [15:53<2:37:25,  1.58s/it]

{'loss': 3.0699, 'grad_norm': 2.586760997772217, 'learning_rate': 4.8778899381309024e-05, 'epoch': 0.29}


 11%|‚ñà         | 700/6642 [17:06<1:37:12,  1.02it/s]

{'loss': 3.0012, 'grad_norm': 2.9604198932647705, 'learning_rate': 4.837186584174536e-05, 'epoch': 0.32}


 11%|‚ñà‚ñè        | 750/6642 [17:56<1:43:36,  1.06s/it]

{'loss': 2.9563, 'grad_norm': 2.398463487625122, 'learning_rate': 4.79648323021817e-05, 'epoch': 0.34}


 12%|‚ñà‚ñè        | 800/6642 [18:55<2:05:23,  1.29s/it]

{'loss': 2.9003, 'grad_norm': 2.998619794845581, 'learning_rate': 4.7557798762618045e-05, 'epoch': 0.36}


 13%|‚ñà‚ñé        | 850/6642 [19:57<2:11:00,  1.36s/it]

{'loss': 2.909, 'grad_norm': 2.534437894821167, 'learning_rate': 4.7150765223054384e-05, 'epoch': 0.38}


 14%|‚ñà‚ñé        | 900/6642 [20:47<1:21:09,  1.18it/s]

{'loss': 2.96, 'grad_norm': 2.140286445617676, 'learning_rate': 4.674373168349072e-05, 'epoch': 0.41}


 14%|‚ñà‚ñç        | 950/6642 [21:40<1:24:51,  1.12it/s]

{'loss': 2.8684, 'grad_norm': 2.402892589569092, 'learning_rate': 4.633669814392706e-05, 'epoch': 0.43}


 15%|‚ñà‚ñå        | 1000/6642 [22:33<1:50:25,  1.17s/it]

{'loss': 2.877, 'grad_norm': 2.3655755519866943, 'learning_rate': 4.5929664604363405e-05, 'epoch': 0.45}


 16%|‚ñà‚ñå        | 1050/6642 [23:35<2:05:17,  1.34s/it]

{'loss': 2.7866, 'grad_norm': 3.0120809078216553, 'learning_rate': 4.552263106479974e-05, 'epoch': 0.47}


 17%|‚ñà‚ñã        | 1100/6642 [24:38<1:59:18,  1.29s/it]

{'loss': 2.799, 'grad_norm': 2.3288731575012207, 'learning_rate': 4.511559752523608e-05, 'epoch': 0.5}


 17%|‚ñà‚ñã        | 1150/6642 [25:45<1:56:08,  1.27s/it]

{'loss': 2.761, 'grad_norm': 2.0763607025146484, 'learning_rate': 4.470856398567242e-05, 'epoch': 0.52}


 18%|‚ñà‚ñä        | 1200/6642 [26:41<2:15:02,  1.49s/it]

{'loss': 2.787, 'grad_norm': 2.4499988555908203, 'learning_rate': 4.4301530446108765e-05, 'epoch': 0.54}


 19%|‚ñà‚ñâ        | 1250/6642 [27:35<1:22:43,  1.09it/s]

{'loss': 2.7463, 'grad_norm': 2.4600257873535156, 'learning_rate': 4.38944969065451e-05, 'epoch': 0.56}


 20%|‚ñà‚ñâ        | 1300/6642 [28:34<1:38:50,  1.11s/it]

{'loss': 2.7063, 'grad_norm': 2.149730682373047, 'learning_rate': 4.348746336698144e-05, 'epoch': 0.59}


 20%|‚ñà‚ñà        | 1350/6642 [29:34<1:45:07,  1.19s/it]

{'loss': 2.7094, 'grad_norm': 2.884317636489868, 'learning_rate': 4.308042982741778e-05, 'epoch': 0.61}


 21%|‚ñà‚ñà        | 1400/6642 [30:25<1:17:47,  1.12it/s]

{'loss': 2.707, 'grad_norm': 2.2151288986206055, 'learning_rate': 4.2673396287854124e-05, 'epoch': 0.63}


 22%|‚ñà‚ñà‚ñè       | 1450/6642 [31:24<2:26:38,  1.69s/it]

{'loss': 2.6963, 'grad_norm': 2.1696739196777344, 'learning_rate': 4.226636274829046e-05, 'epoch': 0.65}


 23%|‚ñà‚ñà‚ñé       | 1500/6642 [32:34<1:37:04,  1.13s/it]

{'loss': 2.7001, 'grad_norm': 1.9839057922363281, 'learning_rate': 4.18593292087268e-05, 'epoch': 0.68}


 23%|‚ñà‚ñà‚ñé       | 1550/6642 [33:36<2:00:39,  1.42s/it]

{'loss': 2.6753, 'grad_norm': 2.3683249950408936, 'learning_rate': 4.1452295669163146e-05, 'epoch': 0.7}


 24%|‚ñà‚ñà‚ñç       | 1600/6642 [34:48<1:34:02,  1.12s/it]

{'loss': 2.6208, 'grad_norm': 2.190981388092041, 'learning_rate': 4.1045262129599484e-05, 'epoch': 0.72}


 25%|‚ñà‚ñà‚ñç       | 1650/6642 [35:41<1:39:47,  1.20s/it]

{'loss': 2.6683, 'grad_norm': 2.2084054946899414, 'learning_rate': 4.063822859003582e-05, 'epoch': 0.75}


 26%|‚ñà‚ñà‚ñå       | 1700/6642 [37:04<2:15:07,  1.64s/it]

{'loss': 2.6425, 'grad_norm': 2.8109753131866455, 'learning_rate': 4.023119505047216e-05, 'epoch': 0.77}


 26%|‚ñà‚ñà‚ñã       | 1750/6642 [38:15<2:01:18,  1.49s/it]

{'loss': 2.6371, 'grad_norm': 2.0437192916870117, 'learning_rate': 3.9824161510908506e-05, 'epoch': 0.79}


 27%|‚ñà‚ñà‚ñã       | 1800/6642 [39:34<1:33:57,  1.16s/it]

{'loss': 2.61, 'grad_norm': 2.209165096282959, 'learning_rate': 3.9417127971344844e-05, 'epoch': 0.81}


 28%|‚ñà‚ñà‚ñä       | 1850/6642 [40:32<1:41:25,  1.27s/it]

{'loss': 2.6236, 'grad_norm': 2.6539177894592285, 'learning_rate': 3.901009443178118e-05, 'epoch': 0.84}


 29%|‚ñà‚ñà‚ñä       | 1900/6642 [41:26<1:58:56,  1.51s/it]

{'loss': 2.5873, 'grad_norm': 2.196930408477783, 'learning_rate': 3.860306089221752e-05, 'epoch': 0.86}


 29%|‚ñà‚ñà‚ñâ       | 1950/6642 [42:20<1:11:22,  1.10it/s]

{'loss': 2.575, 'grad_norm': 2.296056032180786, 'learning_rate': 3.8196027352653865e-05, 'epoch': 0.88}


 30%|‚ñà‚ñà‚ñà       | 2000/6642 [43:22<1:29:03,  1.15s/it]

{'loss': 2.5951, 'grad_norm': 2.392451524734497, 'learning_rate': 3.7788993813090204e-05, 'epoch': 0.9}


 31%|‚ñà‚ñà‚ñà       | 2050/6642 [44:13<1:01:55,  1.24it/s]

{'loss': 2.5612, 'grad_norm': 2.19561505317688, 'learning_rate': 3.738196027352654e-05, 'epoch': 0.93}


 32%|‚ñà‚ñà‚ñà‚ñè      | 2100/6642 [45:48<2:25:21,  1.92s/it]

{'loss': 2.5424, 'grad_norm': 2.6735122203826904, 'learning_rate': 3.697492673396288e-05, 'epoch': 0.95}


 32%|‚ñà‚ñà‚ñà‚ñè      | 2150/6642 [47:21<1:31:32,  1.22s/it]

{'loss': 2.5664, 'grad_norm': 2.106947898864746, 'learning_rate': 3.6567893194399225e-05, 'epoch': 0.97}


 33%|‚ñà‚ñà‚ñà‚ñé      | 2200/6642 [49:05<2:28:31,  2.01s/it]

{'loss': 2.5255, 'grad_norm': 2.2351150512695312, 'learning_rate': 3.616085965483556e-05, 'epoch': 0.99}


                                                     
 33%|‚ñà‚ñà‚ñà‚ñé      | 2214/6642 [53:05<1:53:43,  1.54s/it]

{'eval_loss': 1.9671177864074707, 'eval_rouge1': 10.9929, 'eval_rouge2': 4.2532, 'eval_rougeL': 9.6798, 'eval_rougeLsum': 11.0107, 'eval_runtime': 217.3446, 'eval_samples_per_second': 4.527, 'eval_steps_per_second': 1.132, 'epoch': 1.0}


 34%|‚ñà‚ñà‚ñà‚ñç      | 2250/6642 [55:44<1:20:17,  1.10s/it]  

{'loss': 2.5857, 'grad_norm': 2.328514337539673, 'learning_rate': 3.57538261152719e-05, 'epoch': 1.02}


 35%|‚ñà‚ñà‚ñà‚ñç      | 2300/6642 [56:57<1:38:11,  1.36s/it]

{'loss': 2.4735, 'grad_norm': 2.3959734439849854, 'learning_rate': 3.534679257570824e-05, 'epoch': 1.04}


 35%|‚ñà‚ñà‚ñà‚ñå      | 2350/6642 [57:59<1:47:00,  1.50s/it]

{'loss': 2.5136, 'grad_norm': 2.202885866165161, 'learning_rate': 3.4939759036144585e-05, 'epoch': 1.06}


 36%|‚ñà‚ñà‚ñà‚ñå      | 2400/6642 [59:09<1:14:42,  1.06s/it]

{'loss': 2.5112, 'grad_norm': 2.123568296432495, 'learning_rate': 3.453272549658092e-05, 'epoch': 1.08}


 36%|‚ñà‚ñà‚ñà‚ñå      | 2403/6642 [59:12<1:12:46,  1.03s/it]

KeyboardInterrupt: 

In [2]:
import numpy as np
import evaluate
from transformers import (
    MT5ForConditionalGeneration, 
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

model = MT5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)
rouge_metric = evaluate.load("rouge")

BATCH_SIZE = 4
MODEL_NAME = "mt5-base-cnn-summarizer-en-hi-v4"

training_args = Seq2SeqTrainingArguments(
    output_dir=MODEL_NAME,
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    predict_with_generate=True,
    fp16=False, # <--- THIS IS THE FIX
    load_best_model_at_end=True,
    metric_for_best_model="rouge2",
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

def compute_metrics(eval_pred):
    """Calculates ROUGE scores for evaluation."""
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds = ["\n".join(pred.strip().split()) for pred in decoded_preds]
    decoded_labels = ["\n".join(label.strip().split()) for label in decoded_labels]
    
    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    
    return {k: round(v * 100, 4) for k, v in result.items()}

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()
trainer.save_model(f"{MODEL_NAME}/final_model")



  0%|          | 0/6642 [00:00<?, ?it/s]

{'loss': 18.8898, 'grad_norm': 25843.544921875, 'learning_rate': 5e-06, 'epoch': 0.02}
{'loss': 17.3691, 'grad_norm': 24710.8671875, 'learning_rate': 1e-05, 'epoch': 0.05}
{'loss': 15.1896, 'grad_norm': 1253.855712890625, 'learning_rate': 1.5e-05, 'epoch': 0.07}
{'loss': 11.4705, 'grad_norm': 1275.7470703125, 'learning_rate': 2e-05, 'epoch': 0.09}
{'loss': 7.257, 'grad_norm': 19.44481086730957, 'learning_rate': 2.5e-05, 'epoch': 0.11}
{'loss': 4.3326, 'grad_norm': 11.529706954956055, 'learning_rate': 3e-05, 'epoch': 0.14}
{'loss': 3.6727, 'grad_norm': 3.133930206298828, 'learning_rate': 3.5e-05, 'epoch': 0.16}
{'loss': 3.3826, 'grad_norm': 3.2587127685546875, 'learning_rate': 4e-05, 'epoch': 0.18}
{'loss': 3.4258, 'grad_norm': 2.94587779045105, 'learning_rate': 4.5e-05, 'epoch': 0.2}
{'loss': 3.2653, 'grad_norm': 2.916131019592285, 'learning_rate': 5e-05, 'epoch': 0.23}
{'loss': 3.1574, 'grad_norm': 2.276667833328247, 'learning_rate': 4.959296646043634e-05, 'epoch': 0.25}
{'loss': 3.14



  0%|          | 0/246 [00:00<?, ?it/s]

2025-09-22 07:33:32,718 [INFO] - Using default tokenizer.


{'eval_loss': 1.9671177864074707, 'eval_rouge1': 11.0001, 'eval_rouge2': 4.2691, 'eval_rougeL': 9.6964, 'eval_rougeLsum': 10.9729, 'eval_runtime': 211.7028, 'eval_samples_per_second': 4.648, 'eval_steps_per_second': 1.162, 'epoch': 1.0}
{'loss': 2.5857, 'grad_norm': 2.328514337539673, 'learning_rate': 3.57538261152719e-05, 'epoch': 1.02}
{'loss': 2.4735, 'grad_norm': 2.3959734439849854, 'learning_rate': 3.534679257570824e-05, 'epoch': 1.04}
{'loss': 2.5136, 'grad_norm': 2.202885866165161, 'learning_rate': 3.4939759036144585e-05, 'epoch': 1.06}
{'loss': 2.5112, 'grad_norm': 2.123568296432495, 'learning_rate': 3.453272549658092e-05, 'epoch': 1.08}
{'loss': 2.4621, 'grad_norm': 2.071408271789551, 'learning_rate': 3.412569195701726e-05, 'epoch': 1.11}
{'loss': 2.4829, 'grad_norm': 1.851739525794983, 'learning_rate': 3.37186584174536e-05, 'epoch': 1.13}
{'loss': 2.4779, 'grad_norm': 2.233159065246582, 'learning_rate': 3.331162487788994e-05, 'epoch': 1.15}
{'loss': 2.4767, 'grad_norm': 2.140



  0%|          | 0/246 [00:00<?, ?it/s]

2025-09-22 08:32:07,485 [INFO] - Using default tokenizer.


{'eval_loss': 1.8547613620758057, 'eval_rouge1': 10.9926, 'eval_rouge2': 4.1325, 'eval_rougeL': 9.5914, 'eval_rougeLsum': 10.9835, 'eval_runtime': 204.6945, 'eval_samples_per_second': 4.807, 'eval_steps_per_second': 1.202, 'epoch': 2.0}
{'loss': 2.3587, 'grad_norm': 2.2020087242126465, 'learning_rate': 1.784435037447086e-05, 'epoch': 2.01}
{'loss': 2.3383, 'grad_norm': 2.1042938232421875, 'learning_rate': 1.7437316834907197e-05, 'epoch': 2.03}
{'loss': 2.321, 'grad_norm': 2.182854413986206, 'learning_rate': 1.7030283295343538e-05, 'epoch': 2.06}
{'loss': 2.319, 'grad_norm': 2.20234751701355, 'learning_rate': 1.6623249755779876e-05, 'epoch': 2.08}
{'loss': 2.2927, 'grad_norm': 2.4622364044189453, 'learning_rate': 1.6216216216216218e-05, 'epoch': 2.1}
{'loss': 2.3131, 'grad_norm': 2.276627540588379, 'learning_rate': 1.5809182676652556e-05, 'epoch': 2.12}
{'loss': 2.3021, 'grad_norm': 2.378438711166382, 'learning_rate': 1.5402149137088898e-05, 'epoch': 2.15}
{'loss': 2.2758, 'grad_norm': 



  0%|          | 0/246 [00:00<?, ?it/s]

2025-09-22 09:16:36,932 [INFO] - Using default tokenizer.


{'eval_loss': 1.8242502212524414, 'eval_rouge1': 11.8785, 'eval_rouge2': 4.6153, 'eval_rougeL': 10.457, 'eval_rougeLsum': 11.9025, 'eval_runtime': 204.8678, 'eval_samples_per_second': 4.803, 'eval_steps_per_second': 1.201, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


{'train_runtime': 9039.8248, 'train_samples_per_second': 2.938, 'train_steps_per_second': 0.735, 'train_loss': 2.9490119522862033, 'epoch': 3.0}


In [None]:
import logging
import pandas as pd
import numpy as np
import evaluate
from datetime import datetime
from datasets import DatasetDict, Dataset
from transformers import (
    MT5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    MT5Tokenizer,
    pipeline
)

# Configure logger to save to a unique file and print to console
log_filename = f"training_log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()
    ]
)

logging.info("--- Starting New Training Run ---")

try:
    logging.info("Step 2: Starting data loading and preparation.")
    df = pd.read_csv('../Dataset/final_cleaned_dataset_CNN.csv').dropna().reset_index(drop=True)
    raw_dataset = Dataset.from_pandas(df)

    PREFIX_ENG = "summarize English: "
    PREFIX_HIN = "summarize Hindi: "

    def format_dataset(batch):
        inputs, targets = [], []
        for article, eng_summary, hin_summary in zip(
            batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
        ):
            if isinstance(article, str):
                inputs.append(PREFIX_ENG + article)
                targets.append(eng_summary)
                inputs.append(PREFIX_HIN + article)
                targets.append(hin_summary)
        return {'inputs': inputs, 'targets': targets}

    processed_dataset = raw_dataset.map(
        format_dataset, batched=True, remove_columns=raw_dataset.column_names
    ).flatten()

    train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
    final_datasets = DatasetDict({
        'train': train_test_split['train'], 'test': train_test_split['test']
    })
    logging.info(f"Data prepared successfully. Training samples: {len(final_datasets['train'])}, Test samples: {len(final_datasets['test'])}")

    logging.info("Step 3: Starting tokenization.")
    MODEL_CHECKPOINT = "google/mt5-base"
    tokenizer = MT5Tokenizer.from_pretrained(MODEL_CHECKPOINT)
    MAX_INPUT_LENGTH, MAX_TARGET_LENGTH = 1024, 256

    def tokenize_function(examples):
        model_inputs = tokenizer(examples['inputs'], max_length=MAX_INPUT_LENGTH, truncation=True)
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(examples['targets'], max_length=MAX_TARGET_LENGTH, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = final_datasets.map(tokenize_function, batched=True)
    logging.info("Tokenization complete.")

    logging.info("Step 4: Initializing Trainer and preparing for model training.")
    model = MT5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)
    rouge_metric = evaluate.load("rouge")
    BATCH_SIZE = 4
    MODEL_NAME = "mt5-base-cnn-summarizer-en-hi"

    training_args = Seq2SeqTrainingArguments(
        output_dir=MODEL_NAME,
        num_train_epochs=3,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f"{MODEL_NAME}/logs",
        logging_steps=50,
        eval_strategy="epoch",
        save_strategy="epoch",
        predict_with_generate=True,
        fp16=False,
        load_best_model_at_end=True,
        metric_for_best_model="rouge2",
        generation_max_length=256
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        decoded_preds = ["\n".join(pred.strip().split()) for pred in decoded_preds]
        decoded_labels = ["\n".join(label.strip().split()) for label in decoded_labels]
        result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        return {k: round(v * 100, 4) for k, v in result.items()}

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    logging.info("Starting model training...")
    trainer.train()
    logging.info("Training finished successfully.")
    trainer.save_model(f"{MODEL_NAME}/final_model")
    logging.info(f"Model saved to {MODEL_NAME}/final_model")

    logging.info("Step 5: Performing inference with the trained model.")
    model_path = f"{MODEL_NAME}/final_model"
    summarizer = pipeline("summarization", model=model_path, tokenizer=model_path)
    
    article_text = """
    The Indian Space Research Organisation (ISRO) successfully launched its ambitious Mars Orbiter Mission, also known as Mangalyaan, making India the first nation to succeed on its maiden attempt to reach Mars. The low-cost mission, which cost only $74 million, was designed to study the Martian atmosphere and surface. The spacecraft orbited Mars for several years, sending back valuable data and images, far exceeding its expected mission life. The success of Mangalyaan was a major milestone for India's space program, demonstrating its capability to execute complex interplanetary missions and cementing its position as a major player in space exploration.
    """

    english_summary = summarizer(PREFIX_ENG + article_text, max_length=100)
    logging.info(f"--- English Summary ---\n{english_summary[0]['summary_text']}")

    hindi_summary = summarizer(PREFIX_HIN + article_text, max_length=120)
    logging.info(f"--- Hindi Summary ---\n{hindi_summary[0]['summary_text']}")

    logging.info("--- Run Completed Successfully ---")

except Exception as e:
    logging.error(f"An unexpected error occurred during the run: {e}", exc_info=True)
    logging.error("--- Run Failed ---")

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4919/4919 [00:00<00:00, 21255.17 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8854/8854 [00:33<00:00, 263.35 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 984/984 [00:03<00:00, 272.22 examples/s]
                                                     
 36%|‚ñà‚ñà‚ñà‚ñå      | 2403/6642 [1:32:22<1:12:46,  1.03s/it]

{'loss': 18.8898, 'grad_norm': 25843.544921875, 'learning_rate': 5e-06, 'epoch': 0.02}


                                                       
 36%|‚ñà‚ñà‚ñà‚ñå      | 2403/6642 [1:59:13<1:12:46,  1.03s/it]

{'loss': 17.3691, 'grad_norm': 24710.8671875, 'learning_rate': 1e-05, 'epoch': 0.05}


                                                       
 36%|‚ñà‚ñà‚ñà‚ñå      | 2403/6642 [2:30:24<1:12:46,  1.03s/it]

{'loss': 15.1896, 'grad_norm': 1253.855712890625, 'learning_rate': 1.5e-05, 'epoch': 0.07}




In [None]:
import logging
import pandas as pd
import numpy as np
import evaluate
from datetime import datetime
from datasets import DatasetDict, Dataset
from transformers import (
    MT5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    MT5Tokenizer,
    pipeline
)

# Configure logger
log_filename = f"training_log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()
    ]
)

logging.info("--- Starting New Training Run with GPU ---")

try:
    logging.info("Step 2: Starting data loading and preparation.")
    df = pd.read_csv('../Dataset/final_cleaned_dataset_CNN.csv', engine='python', on_bad_lines='skip')
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    raw_dataset = Dataset.from_pandas(df)

    PREFIX_ENG = "summarize English: "
    PREFIX_HIN = "summarize Hindi: "

    def format_dataset(batch):
        inputs, targets = [], []
        for article, eng_summary, hin_summary in zip(
            batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
        ):
            if isinstance(article, str):
                inputs.append(PREFIX_ENG + article)
                targets.append(eng_summary)
                inputs.append(PREFIX_HIN + article)
                targets.append(hin_summary)
        return {'inputs': inputs, 'targets': targets}

    processed_dataset = raw_dataset.map(
        format_dataset, batched=True, remove_columns=raw_dataset.column_names
    ).flatten()

    train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
    final_datasets = DatasetDict({
        'train': train_test_split['train'], 'test': train_test_split['test']
    })
    logging.info(f"Data prepared successfully. Samples: {len(final_datasets['train'])} train, {len(final_datasets['test'])} test.")

    logging.info("Step 3: Starting tokenization.")
    MODEL_CHECKPOINT = "google/mt5-base"
    tokenizer = MT5Tokenizer.from_pretrained(MODEL_CHECKPOINT)
    MAX_INPUT_LENGTH, MAX_TARGET_LENGTH = 1024, 256

    def tokenize_function(examples):
        model_inputs = tokenizer(examples['inputs'], max_length=MAX_INPUT_LENGTH, truncation=True)
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(examples['targets'], max_length=MAX_TARGET_LENGTH, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = final_datasets.map(tokenize_function, batched=True)
    logging.info("Tokenization complete.")

    logging.info("Step 4: Initializing Trainer.")
    model = MT5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)
    rouge_metric = evaluate.load("rouge")
    BATCH_SIZE = 4
    MODEL_NAME = "mt5-base-cnn-summarizer-en-hi"

    training_args = Seq2SeqTrainingArguments(
        output_dir=MODEL_NAME,
        num_train_epochs=3,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f"{MODEL_NAME}/logs",
        logging_steps=50,
        eval_strategy="epoch",
        save_strategy="epoch",
        predict_with_generate=True,
        fp16=True,  # Re-enabled for RTX A5000 performance
        load_best_model_at_end=True,
        metric_for_best_model="rouge2",
        generation_max_length=256
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        decoded_preds = ["\n".join(pred.strip().split()) for pred in decoded_preds]
        decoded_labels = ["\n".join(label.strip().split()) for label in decoded_labels]
        result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        return {k: round(v * 100, 4) for k, v in result.items}

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    logging.info("Starting model training on GPU...")
    trainer.train()
    logging.info("Training finished successfully.")
    trainer.save_model(f"{MODEL_NAME}/final_model")
    logging.info(f"Model saved to {MODEL_NAME}/final_model")

    logging.info("--- Run Completed Successfully ---")

except Exception as e:
    logging.error(f"An unexpected error occurred: {e}", exc_info=True)
    logging.error("--- Run Failed ---")

2025-09-22 05:18:51,366 [INFO] - --- Starting New Training Run with GPU ---
2025-09-22 05:18:51,366 [INFO] - Step 2: Starting data loading and preparation.


Map:   0%|          | 0/4919 [00:00<?, ? examples/s]

2025-09-22 05:18:52,218 [INFO] - Data prepared successfully. Samples: 8854 train, 984 test.
2025-09-22 05:18:52,218 [INFO] - Step 3: Starting tokenization.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/8854 [00:00<?, ? examples/s]



Map:   0%|          | 0/984 [00:00<?, ? examples/s]

2025-09-22 05:19:31,647 [INFO] - Tokenization complete.
2025-09-22 05:19:31,647 [INFO] - Step 4: Initializing Trainer.
2025-09-22 05:19:45,913 [INFO] - Starting model training on GPU...


  0%|          | 0/6642 [00:00<?, ?it/s]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.0, 'epoch': 0.02}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.0, 'epoch': 0.05}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.0, 'epoch': 0.07}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.0, 'epoch': 0.09}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.0, 'epoch': 0.11}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.0, 'epoch': 0.14}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.0, 'epoch': 0.16}


KeyboardInterrupt: 

In [None]:
import logging
import pandas as pd
import numpy as np
import evaluate
from datetime import datetime
from datasets import DatasetDict, Dataset
from transformers import (
    MT5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    MT5Tokenizer,
    pipeline
)

# Configure a logger to save progress and errors to a timestamped file
log_filename = f"training_log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()
    ]
)

logging.info("--- Starting New Training Run with GPU ---")

try:
    logging.info("Starting data loading and preparation.")
    # Robustly read the CSV, skipping any malformed lines
    df = pd.read_csv('../Dataset/final_cleaned_dataset_CNN.csv', engine='python', on_bad_lines='skip')
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    raw_dataset = Dataset.from_pandas(df)

    PREFIX_ENG = "summarize English: "
    PREFIX_HIN = "summarize Hindi: "

    def format_dataset(batch):
        inputs, targets = [], []
        for article, eng_summary, hin_summary in zip(
            batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
        ):
            if isinstance(article, str):
                inputs.append(PREFIX_ENG + article)
                targets.append(eng_summary)
                inputs.append(PREFIX_HIN + article)
                targets.append(hin_summary)
        return {'inputs': inputs, 'targets': targets}

    processed_dataset = raw_dataset.map(
        format_dataset, batched=True, remove_columns=raw_dataset.column_names
    ).flatten()

    train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
    final_datasets = DatasetDict({
        'train': train_test_split['train'], 'test': train_test_split['test']
    })
    logging.info(f"Data prepared. Samples: {len(final_datasets['train'])} train, {len(final_datasets['test'])} test.")

    logging.info("Starting tokenization.")
    MODEL_CHECKPOINT = "google/mt5-base"
    tokenizer = MT5Tokenizer.from_pretrained(MODEL_CHECKPOINT)
    MAX_INPUT_LENGTH, MAX_TARGET_LENGTH = 1024, 256

    def tokenize_function(examples):
        model_inputs = tokenizer(examples['inputs'], max_length=MAX_INPUT_LENGTH, truncation=True)
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(examples['targets'], max_length=MAX_TARGET_LENGTH, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = final_datasets.map(tokenize_function, batched=True)
    logging.info("Tokenization complete.")

    logging.info("Initializing Trainer.")
    model = MT5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)
    rouge_metric = evaluate.load("rouge")
    BATCH_SIZE = 4
    MODEL_NAME = "mt5-base-cnn-summarizer-en-hi_v3"

    training_args = Seq2SeqTrainingArguments(
        output_dir=MODEL_NAME,
        num_train_epochs=3,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f"{MODEL_NAME}/logs",
        logging_steps=50,
        evaluation_strategy="epoch", # Use "eval_strategy" in newer library versions
        save_strategy="epoch",
        predict_with_generate=True,
        fp16=False, # Set to False for maximum stability
        load_best_model_at_end=True,
        metric_for_best_model="rouge2",
        generation_max_length=256,
        adam_epsilon=1e-8 # Added for optimizer stability
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        # Clean up potentially invalid token IDs before decoding
        predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        
        decoded_preds = ["\n".join(pred.strip().split()) for pred in decoded_preds]
        decoded_labels = ["\n".join(label.strip().split()) for label in decoded_labels]
        result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        return {k: round(v * 100, 4) for k, v in result.items()}

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    logging.info("Starting model training on GPU...")
    trainer.train()
    logging.info("Training finished successfully.")
    trainer.save_model(f"{MODEL_NAME}/final_model")
    logging.info(f"Model saved to {MODEL_NAME}/final_model")

    logging.info("Performing inference with the trained model.")
    model_path = f"{MODEL_NAME}/final_model"
    summarizer = pipeline("summarization", model=model_path, tokenizer=model_path)
    
    article_text = """
    The Indian Space Research Organisation (ISRO) successfully launched its ambitious Mars Orbiter Mission, also known as Mangalyaan, making India the first nation to succeed on its maiden attempt to reach Mars. The low-cost mission, which cost only $74 million, was designed to study the Martian atmosphere and surface. The spacecraft orbited Mars for several years, sending back valuable data and images, far exceeding its expected mission life. The success of Mangalyaan was a major milestone for India's space program, demonstrating its capability to execute complex interplanetary missions and cementing its position as a major player in space exploration.
    """

    english_summary = summarizer(PREFIX_ENG + article_text, max_length=100)
    logging.info(f"--- English Summary ---\n{english_summary[0]['summary_text']}")

    hindi_summary = summarizer(PREFIX_HIN + article_text, max_length=120)
    logging.info(f"--- Hindi Summary ---\n{hindi_summary[0]['summary_text']}")

    logging.info("--- Run Completed Successfully ---")

except Exception as e:
    logging.error(f"An unexpected error occurred: {e}", exc_info=True)
    logging.error("--- Run Failed ---")

2025-09-22 13:56:36,347 [INFO] - --- Starting New Training Run with GPU ---
2025-09-22 13:56:36,347 [INFO] - Starting data loading and preparation.


Map:   0%|          | 0/4919 [00:00<?, ? examples/s]

2025-09-22 13:56:37,191 [INFO] - Data prepared. Samples: 8854 train, 984 test.
2025-09-22 13:56:37,191 [INFO] - Starting tokenization.


Map:   0%|          | 0/8854 [00:00<?, ? examples/s]



Map:   0%|          | 0/984 [00:00<?, ? examples/s]

2025-09-22 13:57:16,484 [INFO] - Tokenization complete.
2025-09-22 13:57:16,484 [INFO] - Initializing Trainer.
2025-09-22 13:57:27,517 [INFO] - Starting model training on GPU...


  0%|          | 0/6642 [00:00<?, ?it/s]

{'loss': 18.8898, 'grad_norm': 25843.544921875, 'learning_rate': 5e-06, 'epoch': 0.02}


Inference of the MOdel


In [4]:
from transformers import pipeline

model_path = f"{MODEL_NAME}/final_model"
summarizer = pipeline("summarization", model=model_path, tokenizer=model_path)

article_text = """
‚ÄòInsult To Every Mother‚Äô: PM Modi Tears Into Congress-RJD For Abusing His Late Mother
"It is well-known that someone who has disowned his own sister and stabbed his own brother in the back cannot be expected to respect any woman. If he had education and values, he would never have resorted to such cheap tactics just before the Devi Paksha," he continued.He went on to declare that the women in poll-bound Bihar would "give a fitting reply to Tejashwi, who stood on the stage and got his own workers to insult the late mother of Prime Minister ji.""The people of Bihar are hurt and outraged by the insult to Prime Minister Modi ji‚Äôs late mother," he said.The slugfest over alleged abuses flared up again after Bihar deputy CM Samrat Choudhary shared a video of Tejashwi's rally accusing RJD leader of encouraging party workers to hurl abuses. This came only weeks after a similar row during Rahul Gandhi‚Äôs Darbhanga rally."Tejashwi Yadav once again had Modi's deceased mother abused. He has once again shattered Bihar's culture. The more RJD workers hurled abuses at the rally, the more Tejashwi Yadav encouraged them. The mothers and sisters of Bihar will surely take account of this hooligan mentality and abuse," he said.RJD defended itself saying that the video was "fabricated". ‚ÄúLeader of opposition Tejashwi Prasad Yadav ji is delivering a speech today at Mahua during the Bihar Adhikar Yatra. The speech is available on my Facebook page ‚Äî you can listen to it. In it, no worker or any person has abused or used foul language for the Hon‚Äôble Prime Minister,‚Äù party leader Mukesh Raushan said."""

# Generate English Summary using the English prefix
english_summary = summarizer(PREFIX_ENG + article_text, max_length=100)
print("--- English Summary ---")
print(english_summary[0]['summary_text'])

# Generate Hindi Summary using the Hindi prefix
hindi_summary = summarizer(PREFIX_HIN + article_text, max_length=120)
print("\n--- Hindi Summary ---")
print(hindi_summary[0]['summary_text'])

--- English Summary ---
Bihar PM Modi threatened Congress-RJD for Abusing His Late Mother, asserting that someone who disowned his sister and stabbed his own brother in the back cannot be expected to respect any woman. He asserted that Bihar would "give a fitting reply" to Tejashwi, who stood on stage and encouraged party workers to hurl abuses. Bihar deputy CM Samrat Choudhar

--- Hindi Summary ---
‡§Æ‡•à‡§®‡•á‡§ú‡§∞ PM ‡§Æ‡•ã‡§¶‡•Ä ‡§®‡•á ‡§ï‡§æ‡§Ç‡§ó‡•ç‡§∞‡•á‡§∏-RJD ‡§ï‡•á ‡§Ö‡§ß‡•ç‡§Ø‡§ï‡•ç‡§∑ ‡§ü‡•Ä‡§ú‡§æ‡§∂‡§µ‡•Ä ‡§™‡•Ä‡§∞‡§æ‡§¶‡§æ ‡§Ø‡§æ‡§¶‡§¶‡§æ ‡§ï‡•á ‡§µ‡§ø‡§∞‡•ã‡§ß ‡§Æ‡•á‡§Ç ‡§™‡§æ‡§∞‡•ç‡§ü‡•Ä ‡§ï‡•á ‡§Ö‡§ß‡•ç‡§Ø‡§ï‡•ç‡§∑ ‡§§‡•á‡§ú‡§æ‡§∂‡§µ‡•Ä ‡§™‡•Ä‡§∞‡§æ‡§¶‡§æ ‡§Ø‡§æ‡§¶‡§¶‡§æ ‡§ï‡•á ‡§µ‡§ø‡§∞‡•ã‡§ß ‡§Æ‡•á‡§Ç ‡§µ‡§ø‡§∞‡•ã‡§ß ‡§™‡•ç‡§∞‡§¶‡§∞‡•ç‡§∂‡§® ‡§ï‡§ø‡§Ø‡§æ, ‡§ú‡§ø‡§∏‡§Æ‡•á‡§Ç ‡§µ‡§π ‡§Ö‡§™‡§®‡•á ‡§™‡§ø‡§§‡§æ ‡§î‡§∞ ‡§Ö‡§™‡§®‡•Ä ‡§¨‡•á‡§ü‡•Ä ‡§ï‡•ã ‡§ó‡§æ‡§≤‡§ø‡§Ø‡•ã‡§Ç ‡§∏‡•á ‡§∏‡§Ç‡§¨‡•ã‡§ß‡§ø‡§§ ‡§ï‡§∞‡§®‡•á ‡§ï‡•á ‡§≤‡§ø‡§è ‡§µ‡§ø‡§∞‡•ã‡§ß ‡§™‡•ç‡§∞‡§¶‡§∞‡•ç‡§∂‡§® 