In [4]:
# run_finetuning_v5

import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "mt5-base-cnn-summarizer-en-hi_v3/final_model"
NEW_MODEL_OUTPUT_DIR = "mt5-base-cnn-summarizer-en-hi_v5"
NEW_DATA_PATH = "../Dataset/filtered_articles_CNN.csv"

# Training hyperparameters
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.01

# Generation parameters for evaluation
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256

# Configure logging
log_filename = f"finetuning_log_v5_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

def find_and_save_best_model(output_dir):
    """Finds the best checkpoint based on trainer state and saves it."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        best_checkpoint_path = state.get("best_model_checkpoint")
        if not best_checkpoint_path:
            logging.error("Could not find 'best_model_checkpoint' in trainer_state.json.")
            return

        best_metric = state.get("best_metric", "N/A")
        logging.info(f"Best checkpoint found: {best_checkpoint_path} with metric {best_metric}")
        
        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not find or save the best model due to: {e}", exc_info=True)


def main():
    try:
        logging.info(f"Loading existing model from: {BASE_MODEL_PATH}")
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
        model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_PATH)

        logging.info(f"Loading new data from: {NEW_DATA_PATH}")
        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        df_new.reset_index(drop=True, inplace=True)

        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str):
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        logging.info(f"New data prepared. Samples: {len(final_datasets['train'])} train, {len(final_datasets['test'])} test.")

        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            with tokenizer.as_target_tokenizer():
                labels = tokenizer(examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        logging.info("Tokenization complete.")
        
        train_size = len(tokenized_datasets["train"])
        effective_batch_size = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
        steps_per_epoch = train_size // effective_batch_size
        if steps_per_epoch == 0: steps_per_epoch = 1

        logging.info("Initializing Seq2SeqTrainer with fully compatible settings...")
        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=50,
            
            # --- FINAL COMPATIBILITY FIX ---
            # Removing all modern arguments like 'evaluation_strategy' and 'save_strategy'
            # and using only the older arguments that your library version understands.
            do_eval=True,
            eval_steps=steps_per_epoch,
            save_steps=steps_per_epoch,
            
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False, # Disable the problematic feature
            
            # These are still valid in older versions
            metric_for_best_model="rouge2",
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        rouge_metric = evaluate.load("rouge")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            return {k: round(v * 100, 4) for k, v in result.items()}

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting continued fine-tuning...")
        trainer.train()
        logging.info("Fine-tuning finished successfully.")

        # Manually find and save the best model from all checkpoints
        find_and_save_best_model(NEW_MODEL_OUTPUT_DIR)

    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}", exc_info=True)

if __name__ == "__main__":
    main()

2025-09-23 02:05:09,912 [INFO] - Loading existing model from: mt5-base-cnn-summarizer-en-hi_v3/final_model
2025-09-23 02:05:12,814 [INFO] - Loading new data from: ../Dataset/filtered_articles_CNN.csv


Map:   0%|          | 0/686 [00:00<?, ? examples/s]

2025-09-23 02:05:12,940 [INFO] - New data prepared. Samples: 1234 train, 138 test.


Map:   0%|          | 0/1234 [00:00<?, ? examples/s]



Map:   0%|          | 0/138 [00:00<?, ? examples/s]

2025-09-23 02:05:14,049 [INFO] - Tokenization complete.
2025-09-23 02:05:14,050 [INFO] - Initializing Seq2SeqTrainer with fully compatible settings...
  trainer = Seq2SeqTrainer(
2025-09-23 02:05:23,026 [INFO] - Starting continued fine-tuning...


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0
250,0.0
300,0.0
350,0.0
400,0.0
450,0.0
500,0.0


2025-09-23 02:19:58,275 [INFO] - Fine-tuning finished successfully.
2025-09-23 02:19:58,275 [ERROR] - Could not find or save the best model due to: [Errno 2] No such file or directory: 'mt5-base-cnn-summarizer-en-hi_v5\\trainer_state.json'
Traceback (most recent call last):
  File "C:\Users\admin\AppData\Local\Temp\ipykernel_33624\2965819654.py", line 49, in find_and_save_best_model
    with open(state_path, "r") as f:
  File "c:\Users\admin\anaconda3\envs\summarizer_env\lib\site-packages\IPython\core\interactiveshell.py", line 324, in _modified_open
    return io_open(file, *args, **kwargs)
FileNotFoundError: [Errno 2] No such file or directory: 'mt5-base-cnn-summarizer-en-hi_v5\\trainer_state.json'


In [8]:
# run_finetuning_v5_final_compatible.py

import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "mt5-base-cnn-summarizer-en-hi_v3/final_model"
NEW_MODEL_OUTPUT_DIR = "mt5-base-cnn-summarizer-en-hi_v5"
NEW_DATA_PATH = "../Dataset/filtered_articles_CNN.csv"

# Training hyperparameters
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.01

# Generation parameters for evaluation
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256

# Configure logging
log_filename = f"finetuning_log_v5_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

def find_and_save_best_model(output_dir):
    """Finds the best checkpoint based on trainer state and saves it."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        best_checkpoint_path = state.get("best_model_checkpoint")
        if not best_checkpoint_path:
            logging.error("Could not find 'best_model_checkpoint' in trainer_state.json.")
            return

        best_metric = state.get("best_metric", "N/A")
        logging.info(f"Best checkpoint found: {best_checkpoint_path} with metric {best_metric}")
        
        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not find or save the best model due to: {e}", exc_info=True)


def main():
    try:
        logging.info(f"Loading existing model from: {BASE_MODEL_PATH}")
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
        model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_PATH)

        logging.info(f"Loading new data from: {NEW_DATA_PATH}")
        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        df_new.reset_index(drop=True, inplace=True)

        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str):
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        logging.info(f"New data prepared. Samples: {len(final_datasets['train'])} train, {len(final_datasets['test'])} test.")

        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            with tokenizer.as_target_tokenizer():
                labels = tokenizer(examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        logging.info("Tokenization complete.")
        
        train_size = len(tokenized_datasets["train"])
        effective_batch_size = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
        steps_per_epoch = train_size // effective_batch_size
        if steps_per_epoch == 0: steps_per_epoch = 1

        logging.info("Initializing Seq2SeqTrainer with manual best model selection strategy...")
        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=50,
            
            # --- COMPATIBILITY FIX ---
            # Using older, explicit arguments for evaluation and saving
            do_eval=True,
            eval_steps=steps_per_epoch,
            save_steps=steps_per_epoch,
            
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            
            # --- MANUAL BEST MODEL FIX ---
            load_best_model_at_end=False, # Disable the problematic feature
            
            metric_for_best_model="rouge2",
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        rouge_metric = evaluate.load("rouge")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            return {k: round(v * 100, 4) for k, v in result.items()}

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting continued fine-tuning...")
        trainer.train()
        logging.info("Fine-tuning finished successfully.")

        # Manually find and save the best model from all checkpoints
        find_and_save_best_model(NEW_MODEL_OUTPUT_DIR)

    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}", exc_info=True)

if __name__ == "__main__":
    main()

2025-09-23 02:42:56,163 [INFO] - Loading existing model from: mt5-base-cnn-summarizer-en-hi_v3/final_model
2025-09-23 02:42:58,810 [INFO] - Loading new data from: ../Dataset/filtered_articles_CNN.csv


Map:   0%|          | 0/686 [00:00<?, ? examples/s]

2025-09-23 02:42:58,936 [INFO] - New data prepared. Samples: 1234 train, 138 test.


Map:   0%|          | 0/1234 [00:00<?, ? examples/s]



Map:   0%|          | 0/138 [00:00<?, ? examples/s]

2025-09-23 02:43:00,416 [INFO] - Tokenization complete.
2025-09-23 02:43:00,416 [INFO] - Initializing Seq2SeqTrainer with manual best model selection strategy...
  trainer = Seq2SeqTrainer(
2025-09-23 02:43:06,395 [INFO] - Starting continued fine-tuning...


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0
250,0.0
300,0.0
350,0.0
400,0.0
450,0.0
500,0.0


2025-09-23 03:22:54,662 [INFO] - Fine-tuning finished successfully.
2025-09-23 03:22:54,662 [ERROR] - Could not find or save the best model due to: [Errno 2] No such file or directory: 'mt5-base-cnn-summarizer-en-hi_v5\\trainer_state.json'
Traceback (most recent call last):
  File "C:\Users\admin\AppData\Local\Temp\ipykernel_33624\4155698402.py", line 49, in find_and_save_best_model
    with open(state_path, "r") as f:
  File "c:\Users\admin\anaconda3\envs\summarizer_env\lib\site-packages\IPython\core\interactiveshell.py", line 324, in _modified_open
    return io_open(file, *args, **kwargs)
FileNotFoundError: [Errno 2] No such file or directory: 'mt5-base-cnn-summarizer-en-hi_v5\\trainer_state.json'


In [None]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "mt5-base-cnn-summarizer-en-hi_v3/final_model"
NEW_MODEL_OUTPUT_DIR = "mt5-base-cnn-summarizer-en-hi_v6"
NEW_DATA_PATH = "../Dataset/filtered_articles_CNN.csv"

# Training hyperparameters
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.01

# Generation parameters for evaluation
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256

# Configure logging
log_filename = f"finetuning_log_v5_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

def find_and_save_best_model(output_dir):
    """Finds the best checkpoint based on trainer state and saves it."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        best_checkpoint_path = state.get("best_model_checkpoint")
        if not best_checkpoint_path:
            logging.error("Could not find 'best_model_checkpoint' in trainer_state.json.")
            return

        best_metric = state.get("best_metric", "N/A")
        logging.info(f"Best checkpoint found: {best_checkpoint_path} with metric {best_metric}")
        
        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not find or save the best model due to: {e}", exc_info=True)


def main():
    try:
        logging.info(f"Loading existing model from: {BASE_MODEL_PATH}")
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
        model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_PATH)

        logging.info(f"Loading new data from: {NEW_DATA_PATH}")
        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        df_new.reset_index(drop=True, inplace=True)

        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str):
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        logging.info(f"New data prepared. Samples: {len(final_datasets['train'])} train, {len(final_datasets['test'])} test.")

        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        logging.info("Tokenization complete.")
        
        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=50,
            
            evaluation_strategy="epoch",
            save_strategy="epoch",
            
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            
            load_best_model_at_end=False,
            metric_for_best_model="rouge2",

            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        rouge_metric = evaluate.load("rouge")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            return {k: round(v * 100, 4) for k, v in result.items()}

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting continued fine-tuning...")
        trainer.train()
        logging.info("Fine-tuning finished successfully.")

        find_and_save_best_model(NEW_MODEL_OUTPUT_DIR)

    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}", exc_info=True)

if __name__ == "__main__":
    main()

Inference and Demonstration.


In [9]:
!pip install -q transformers[torch] sentencepiece

In [10]:
import torch
from transformers import AutoTokenizer, MT5ForConditionalGeneration

# --- Configuration ---
MODEL_PATH = "mt5-base-cnn-summarizer-en-hi_v5/final_model"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --- Load Model and Tokenizer ---
print(f"Loading model from: {MODEL_PATH}")
print(f"Using device: {DEVICE}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = MT5ForConditionalGeneration.from_pretrained(MODEL_PATH).to(DEVICE)
model.eval()

print("Model loaded successfully.")

Loading model from: mt5-base-cnn-summarizer-en-hi_v5/final_model
Using device: cuda


OSError: mt5-base-cnn-summarizer-en-hi_v5/final_model is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`

In [None]:
def generate_high_quality_summary(article_text):
    """
    Takes a news article string and prints high-quality English and Hindi summaries.
    """
    PREFIX_ENG = "summarize English: "
    PREFIX_HIN = "summarize Hindi: "

    # --- Generation Hyperparameters ---
    NUM_BEAMS = 8
    LENGTH_PENALTY = 2.0
    NO_REPEAT_NGRAM_SIZE = 3
    MIN_SUMMARY_LENGTH = 50
    MAX_SUMMARY_LENGTH = 256
    
    # --- Print Source Article ---
    print("="*80)
    print("SOURCE ARTICLE:")
    print("="*80)
    print(article_text)

    # --- Generate English Summary ---
    eng_input_text = PREFIX_ENG + article_text
    eng_inputs = tokenizer(eng_input_text, return_tensors="pt", max_length=1024, truncation=True).to(DEVICE)
    
    eng_summary_ids = model.generate(
        eng_inputs.input_ids,
        num_beams=NUM_BEAMS,
        max_length=MAX_SUMMARY_LENGTH,
        min_length=MIN_SUMMARY_LENGTH,
        length_penalty=LENGTH_PENALTY,
        no_repeat_ngram_size=NO_REPEAT_NGRAM_SIZE,
        early_stopping=True
    )
    english_summary = tokenizer.decode(eng_summary_ids[0], skip_special_tokens=True)
    
    print("\n" + "="*80)
    print("GENERATED ENGLISH SUMMARY:")
    print("="*80)
    print(english_summary)

    # --- Generate Hindi Summary ---
    hin_input_text = PREFIX_HIN + article_text
    hin_inputs = tokenizer(hin_input_text, return_tensors="pt", max_length=1024, truncation=True).to(DEVICE)

    hin_summary_ids = model.generate(
        hin_inputs.input_ids,
        num_beams=NUM_BEAMS,
        max_length=MAX_SUMMARY_LENGTH,
        min_length=MIN_SUMMARY_LENGTH,
        length_penalty=LENGTH_PENALTY,
        no_repeat_ngram_size=NO_REPEAT_NGRAM_SIZE,
        early_stopping=True
    )
    hindi_summary = tokenizer.decode(hin_summary_ids[0], skip_special_tokens=True)
    
    print("\n" + "="*80)
    print("GENERATED HINDI SUMMARY:")
    print("="*80)
    print(hindi_summary)
    print("\n" + "="*80)

In [None]:
article_to_test = """
India secured a decisive victory over Australia in the final match of the T20 series, winning by a margin of 35 runs in Bengaluru. Batting first, India posted a competitive total of 198 for 4, thanks to a powerful half-century from captain Suryakumar Yadav, who scored 78 off just 45 balls. In response, Australia's chase faltered early as they lost key wickets to India's fast bowlers.
"""

generate_high_quality_summary(article_to_test)