In [5]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import os
import unicodedata
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    TrainerCallback
)

# --- Configuration ---
BASE_MODEL_PATH = "facebook/mbart-large-50"
NEW_MODEL_OUTPUT_DIR = "mbart-large-50-cnn-summarizer-v14"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 4 
BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 8
WEIGHT_DECAY = 0.3
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256
METRIC_FOR_BEST_MODEL = "bleurt_f1"

# --- Setup Logging ---
log_filename = f"mbart_large_training_log_v14_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

class ZeroLossCallback(TrainerCallback):
    """A callback that stops training if the training loss is zero to prevent wasted resources."""
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None and 'loss' in logs and logs['loss'] == 0.0:
            logging.error("CRITICAL: Training loss is zero. This indicates a data issue. Stopping training.")
            control.should_training_stop = True

def sanitize_text(text):
    if not isinstance(text, str): return ""
    return text.replace('""', '"').strip()

def normalize_text(text):
    if not isinstance(text, str): return ""
    return ' '.join(unicodedata.normalize('NFKC', text).split())

def main():
    try:
        tokenizer = MBart50TokenizerFast.from_pretrained(BASE_MODEL_PATH)
        
        # --- THE DEFINITIVE FIX: Use safetensors to bypass the security check ---
        logging.info("Attempting to load model using safetensors to bypass torch.load vulnerability check.")
        model = MBartForConditionalGeneration.from_pretrained(BASE_MODEL_PATH, use_safetensors=True)
        logging.info("Model loaded successfully using safetensors.")
        # --------------------------------------------------------------------

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        logging.info("--- Starting Text Sanitization & Normalization ---")
        for col in ['raw_news_article', 'english_summary', 'hindi_summary']:
            df_new[col] = df_new[col].apply(sanitize_text).apply(normalize_text)
        logging.info("--- Text Sanitization & Normalization Finished ---")
        
        raw_dataset = Dataset.from_pandas(df_new)

        def format_dataset_mbart(batch):
            inputs, targets, langs = [], [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str) and article:
                    inputs.append(article)
                    targets.append(eng_summary)
                    langs.append("en_XX")
                    inputs.append(article)
                    targets.append(hin_summary)
                    langs.append("hi_IN")
            return {'article': inputs, 'summary': targets, 'target_lang': langs}

        processed_dataset = raw_dataset.map(
            format_dataset_mbart, batched=True, remove_columns=raw_dataset.column_names
        )
        
        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        
        def tokenize_function(examples):
            tokenizer.src_lang = "en_XX"
            model_inputs = tokenizer(examples['article'], max_length=1024, truncation=True)
            
            labels_batch = []
            for i in range(len(examples['summary'])):
                tokenizer.tgt_lang = examples['target_lang'][i]
                labels = tokenizer(
                    text_target=examples['summary'][i], 
                    max_length=MAX_SUMMARY_LENGTH_EVAL, 
                    truncation=True
                )
                labels_batch.append(labels['input_ids'])
            
            model_inputs["labels"] = labels_batch
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['article', 'summary', 'target_lang'])
        
        rouge_metric = evaluate.load("rouge")
        bleurt_metric = evaluate.load("bleurt", "bleurt-20")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
            bleurt_result = bleurt_metric.compute(predictions=decoded_preds, references=decoded_labels)
            
            result = {
                "rouge1": rouge_result["rouge1"], "rouge2": rouge_result["rouge2"],
                "rougeL": rouge_result["rougeL"], "bleurt_f1": np.mean(bleurt_result["scores"])
            }
            return {k: round(v * 100, 4) for k, v in result.items()}

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_strategy="steps",
            logging_steps=50,
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False,
            report_to="tensorboard",
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            callbacks=[ZeroLossCallback()]
        )

        logging.info("Starting final training (v14) from scratch with mBART-LARGE...")
        trainer.train()
        logging.info("Training finished. All checkpoints and logs are saved.")
        
    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()



2025-10-07 20:18:00,424 [INFO] - Attempting to load model using safetensors to bypass torch.load vulnerability check.
2025-10-07 20:18:03,701 [INFO] - Model loaded successfully using safetensors.
2025-10-07 20:18:04,413 [INFO] - --- Starting Text Sanitization & Normalization ---
2025-10-07 20:18:05,972 [INFO] - --- Text Sanitization & Normalization Finished ---


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

Map:   0%|          | 0/16601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1845 [00:00<?, ? examples/s]

INFO:tensorflow:Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20.


2025-10-07 20:18:55,190 [INFO] - Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20.


INFO:tensorflow:Config file found, reading.


2025-10-07 20:18:55,190 [INFO] - Config file found, reading.


INFO:tensorflow:Will load checkpoint BLEURT-20


2025-10-07 20:18:55,190 [INFO] - Will load checkpoint BLEURT-20


INFO:tensorflow:Loads full paths and checks that files exists.


2025-10-07 20:18:55,190 [INFO] - Loads full paths and checks that files exists.


INFO:tensorflow:... name:BLEURT-20


2025-10-07 20:18:55,207 [INFO] - ... name:BLEURT-20


INFO:tensorflow:... bert_config_file:bert_config.json


2025-10-07 20:18:55,207 [INFO] - ... bert_config_file:bert_config.json


INFO:tensorflow:... max_seq_length:512


2025-10-07 20:18:55,213 [INFO] - ... max_seq_length:512


INFO:tensorflow:... vocab_file:None


2025-10-07 20:18:55,214 [INFO] - ... vocab_file:None


INFO:tensorflow:... do_lower_case:None


2025-10-07 20:18:55,217 [INFO] - ... do_lower_case:None


INFO:tensorflow:... sp_model:sent_piece


2025-10-07 20:18:55,219 [INFO] - ... sp_model:sent_piece


INFO:tensorflow:... dynamic_seq_length:True


2025-10-07 20:18:55,221 [INFO] - ... dynamic_seq_length:True


INFO:tensorflow:Creating BLEURT scorer.


2025-10-07 20:18:55,224 [INFO] - Creating BLEURT scorer.


INFO:tensorflow:Creating SentencePiece tokenizer.


2025-10-07 20:18:55,226 [INFO] - Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


2025-10-07 20:18:55,228 [INFO] - Creating SentencePiece tokenizer.


INFO:tensorflow:Will load model: C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20\sent_piece.model.


2025-10-07 20:18:55,230 [INFO] - Will load model: C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20\sent_piece.model.


INFO:tensorflow:SentencePiece tokenizer created.


2025-10-07 20:18:55,728 [INFO] - SentencePiece tokenizer created.


INFO:tensorflow:Creating Eager Mode predictor.


2025-10-07 20:18:55,728 [INFO] - Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


2025-10-07 20:18:55,741 [INFO] - Loading model.
2025-10-07 20:19:01,388 [INFO] - Fingerprint not found. Saved model loading will continue.
2025-10-07 20:19:01,388 [INFO] - path_and_singleprint metric could not be logged. Saved model loading will continue.


INFO:tensorflow:BLEURT initialized.


2025-10-07 20:19:01,391 [INFO] - BLEURT initialized.
  trainer = Seq2SeqTrainer(
2025-10-07 20:19:05,307 [INFO] - Starting final training (v14) from scratch with mBART-LARGE...


Step,Training Loss
50,3.0856
100,2.6166
150,2.4919
200,2.4002
250,2.2965
300,2.2085
350,2.1497
400,2.1239
450,2.0618
500,2.0241


2025-10-07 22:53:59,299 [INFO] - Training finished. All checkpoints and logs are saved.


Selecting best model

In [8]:
import os
import json
import shutil
import logging
from datetime import datetime

# --- Configuration: ---
OUTPUT_DIR = "mbart-large-50-cnn-summarizer-v14" 
METRIC_NAME = "bleurt_f1"
# -------------------------------------------------

# --- Setup Logging ---
log_filename = f"select_best_model_log_v14_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

def find_and_save_best_checkpoint(output_dir, metric_name):
    """
    Finds and saves the best model checkpoint from a training run.
    If no evaluation metrics are found, it defaults to saving the last available checkpoint.
    """
    try:
        logging.info(f"Attempting to find best model in: {output_dir}")

        if not os.path.isdir(output_dir):
            logging.error(f"FATAL: The directory '{output_dir}' does not exist.")
            return
        
        logging.info(f"Contents of '{output_dir}': {os.listdir(output_dir)}")
        
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{metric_name}"
        is_loss = 'loss' in metric_to_check.lower()
        log_history = []
        
        # --- Multi-level Fallback to find logs ---
        # 1. Check top-level state file
        main_state_path = os.path.join(output_dir, "trainer_state.json")
        if os.path.exists(main_state_path):
            with open(main_state_path, "r") as f: state = json.load(f)
            log_history = state.get("log_history", [])

        # 2. Check individual checkpoint state files
        if not log_history:
            checkpoint_dirs = [d for d in os.listdir(output_dir) if d.startswith("checkpoint-")]
            for chkpt_dir in checkpoint_dirs:
                chkpt_state_path = os.path.join(output_dir, chkpt_dir, "trainer_state.json")
                if os.path.exists(chkpt_state_path):
                    with open(chkpt_state_path, "r") as f: chkpt_state = json.load(f)
                    for log in chkpt_state.get("log_history", []):
                        if metric_to_check in log: log_history.append(log)
        
        if log_history:
            logging.info(f"Found log history. Searching for best score using metric: '{metric_to_check}'")
            for log in log_history:
                if metric_to_check in log:
                    metric_value, step = log[metric_to_check], log.get('step')
                    if step is None: continue
                    if best_metric_value is None or \
                       (not is_loss and metric_value > best_metric_value) or \
                       (is_loss and metric_value < best_metric_value):
                        potential_path = os.path.join(output_dir, f"checkpoint-{step}")
                        if os.path.exists(potential_path):
                            best_metric_value, best_checkpoint_path = metric_value, potential_path
                            logging.info(f"New best found -> Step: {step}, {metric_to_check}: {metric_value}")
        else:
            # --- FINAL FALLBACK: No logs found, use the latest checkpoint ---
            logging.warning("Could not find any evaluation metric logs.")
            logging.warning("Defaulting to the LAST saved checkpoint as the best model.")
            checkpoint_dirs = [d for d in os.listdir(output_dir) if d.startswith("checkpoint-")]
            if checkpoint_dirs:
                latest_step = -1
                for chkpt_dir in checkpoint_dirs:
                    try:
                        step = int(chkpt_dir.split('-')[-1])
                        if step > latest_step:
                            latest_step = step
                            best_checkpoint_path = os.path.join(output_dir, chkpt_dir)
                    except ValueError:
                        continue
                if best_checkpoint_path:
                     logging.info(f"Identified last checkpoint: {best_checkpoint_path}")

        if not best_checkpoint_path:
            logging.error("FATAL: Could not find any valid checkpoints to save.")
            return

        logging.info(f"--- Model Identified for Saving ---")
        logging.info(f"Checkpoint: {best_checkpoint_path}")
        if best_metric_value is not None:
            logging.info(f"Metric ({metric_to_check}): {best_metric_value}")

        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Successfully copied best model to: {final_model_path}")

    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}", exc_info=True)

if __name__ == "__main__":
    find_and_save_best_checkpoint(OUTPUT_DIR, METRIC_NAME)



2025-10-07 23:20:41,148 [INFO] - Attempting to find best model in: mbart-large-50-cnn-summarizer-v14
2025-10-07 23:20:41,163 [INFO] - Contents of 'mbart-large-50-cnn-summarizer-v14': ['checkpoint-2076', 'checkpoint-4152', 'checkpoint-6228', 'checkpoint-8304', 'logs']
2025-10-07 23:20:41,164 [INFO] - Identified last checkpoint: mbart-large-50-cnn-summarizer-v14\checkpoint-8304
2025-10-07 23:20:41,164 [INFO] - --- Model Identified for Saving ---
2025-10-07 23:20:41,164 [INFO] - Checkpoint: mbart-large-50-cnn-summarizer-v14\checkpoint-8304
2025-10-07 23:22:01,049 [INFO] - Successfully copied best model to: mbart-large-50-cnn-summarizer-v14\final_model


Post Training Evaluation and Model Saving

In [None]:
import os
import shutil
import logging
from datetime import datetime
import pandas as pd
import numpy as np
import torch
import evaluate
import unicodedata
from datasets import Dataset, DatasetDict
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
OUTPUT_DIR = "mbart-large-50-cnn-summarizer-v14" # trained model path
METRIC_NAME = "bleurt_f1" # metric to find the best model
DATA_PATH = "../Dataset/new_large_CNN_dataset.csv" # Path to original dataset
MAX_SUMMARY_LENGTH_EVAL = 256

# --- Setup Logging ---
log_filename = f"evaluate_checkpoints_log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] - %(message)s", handlers=[logging.FileHandler(log_filename), logging.StreamHandler()])

# --- Data Loading and Processing Functions (from the original training script) ---
def sanitize_text(text):
    if not isinstance(text, str): return ""
    return text.replace('""', '"').strip()

def normalize_text(text):
    if not isinstance(text, str): return ""
    return ' '.join(unicodedata.normalize('NFKC', text).split())

def main():
    try:
        logging.info("--- Starting Post-Training Evaluation of Checkpoints ---")
        
        # 1. --- Load and Prepare the Test Dataset ---
        logging.info("Loading and preparing test data...")
        
        df_new = pd.read_csv(DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        for col in ['raw_news_article', 'english_summary', 'hindi_summary']:
            df_new[col] = df_new[col].apply(sanitize_text).apply(normalize_text)
        
        raw_dataset = Dataset.from_pandas(df_new)

        def format_dataset_mbart(batch):
            inputs, targets, langs = [], [], []
            for article, eng_summary, hin_summary in zip(batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']):
                if isinstance(article, str) and article:
                    inputs.append(article); targets.append(eng_summary); langs.append("en_XX")
                    inputs.append(article); targets.append(hin_summary); langs.append("hi_IN")
            return {'article': inputs, 'summary': targets, 'target_lang': langs}

        processed_dataset = raw_dataset.map(format_dataset_mbart, batched=True, remove_columns=raw_dataset.column_names)
        
        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        test_dataset_untokenized = train_test_split['test']
        logging.info(f"Test data prepared with {len(test_dataset_untokenized)} examples.")

        # 2. --- Find all Checkpoints ---
        checkpoint_dirs = sorted(
            [d for d in os.listdir(OUTPUT_DIR) if d.startswith("checkpoint-")],
            key=lambda x: int(x.split('-')[-1])
        )
        if not checkpoint_dirs:
            logging.error(f"FATAL: No 'checkpoint-*' directories found in '{OUTPUT_DIR}'.")
            return
        logging.info(f"Found {len(checkpoint_dirs)} checkpoints to evaluate: {checkpoint_dirs}")
        
        all_results = []
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{METRIC_NAME}"
        is_loss = 'loss' in metric_to_check.lower()
        
        rouge_metric = evaluate.load("rouge")
        bleurt_metric = evaluate.load("bleurt", "bleurt-20")

        def compute_metrics_wrapper(eval_pred, tokenizer):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
            bleurt_result = bleurt_metric.compute(predictions=decoded_preds, references=decoded_labels)
            result = {"rouge1": rouge_result["rouge1"], "rouge2": rouge_result["rouge2"], "rougeL": rouge_result["rougeL"], "bleurt_f1": np.mean(bleurt_result["scores"])}
            return {f"eval_{k}": round(v * 100, 4) for k, v in result.items()}

        # 3. --- Loop Through and Evaluate Each Checkpoint ---
        for chkpt_dir in checkpoint_dirs:
            chkpt_path = os.path.join(OUTPUT_DIR, chkpt_dir)
            logging.info(f"\n--- Evaluating Checkpoint: {chkpt_path} ---")
            
            model = MBartForConditionalGeneration.from_pretrained(chkpt_path)
            tokenizer = MBart50TokenizerFast.from_pretrained(chkpt_path)
            
            def tokenize_for_eval(examples):
                tokenizer.src_lang = "en_XX"
                model_inputs = tokenizer(examples['article'], max_length=1024, truncation=True)
                labels_batch = []
                for i in range(len(examples['summary'])):
                    tokenizer.tgt_lang = examples['target_lang'][i]
                    labels = tokenizer(text_target=examples['summary'][i], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
                    labels_batch.append(labels['input_ids'])
                model_inputs["labels"] = labels_batch
                return model_inputs

            tokenized_test_dataset = test_dataset_untokenized.map(tokenize_for_eval, batched=True, remove_columns=['article', 'summary', 'target_lang'])

            temp_training_args = Seq2SeqTrainingArguments(
                output_dir=os.path.join(OUTPUT_DIR, "temp_eval"),
                per_device_eval_batch_size=4,
                predict_with_generate=True,
                fp16=torch.cuda.is_available()
            )

            trainer = Seq2SeqTrainer(
                model=model, args=temp_training_args,
                eval_dataset=tokenized_test_dataset, tokenizer=tokenizer,
                data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
                compute_metrics=lambda p: compute_metrics_wrapper(p, tokenizer)
            )
            
            logging.info(f"Running evaluation on {chkpt_dir}...")
            eval_results = trainer.evaluate()
            
            logging.info(f"--- Results for {chkpt_dir} ---")
            for key, value in eval_results.items():
                logging.info(f"  - {key}: {value:.4f}")
            all_results.append({'checkpoint': chkpt_dir, **eval_results})
            
            metric_value = eval_results.get(metric_to_check)
            if metric_value is not None:
                if best_metric_value is None or (not is_loss and metric_value > best_metric_value) or (is_loss and metric_value < best_metric_value):
                    best_metric_value, best_checkpoint_path = metric_value, chkpt_path
                    logging.info(f"*** New best checkpoint found: {chkpt_dir} with {metric_to_check}: {metric_value:.4f} ***")

        # 4. --- Print Final Summary Table and Save the Best ---
        if not all_results:
            logging.error("No checkpoints were successfully evaluated.")
            return

        logging.info("\n" + "="*80)
        logging.info("--- FINAL EVALUATION SUMMARY ---".center(80))
        logging.info("="*80)
        header = f"{'Checkpoint':<20} | {'eval_loss':<12} | {'eval_rouge1':<12} | {'eval_rouge2':<12} | {'eval_rougeL':<12} | {'eval_bleurt_f1':<15}"
        logging.info(header)
        logging.info("-" * len(header))
        for result in all_results:
            row = f"{result['checkpoint']:<20} | {result.get('eval_loss', 'N/A'):<12.4f} | {result.get('eval_eval_rouge1', 'N/A'):<12.4f} | {result.get('eval_eval_rouge2', 'N/A'):<12.4f} | {result.get('eval_eval_rougeL', 'N/A'):<12.4f} | {result.get(metric_to_check, 'N/A'):<15.4f}"
            logging.info(row)
        logging.info("="*80)

        if not best_checkpoint_path:
            logging.error("Could not determine the best checkpoint after evaluation.")
            return

        logging.info(f"\n--- Best Model Identified ---")
        logging.info(f"Checkpoint: {best_checkpoint_path}")
        logging.info(f"Metric ({metric_to_check}): {best_metric_value:.4f}")

        final_model_path = os.path.join(OUTPUT_DIR, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Successfully copied best model to: {final_model_path}")

    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}", exc_info=True)

if __name__ == "__main__":
    main()

import os
import shutil
import logging
from datetime import datetime
import pandas as pd
import numpy as np
import torch
import evaluate
import unicodedata
from datasets import Dataset, DatasetDict
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
OUTPUT_DIR = "mbart-large-50-cnn-summarizer-v14" # trained model path
METRIC_NAME = "bleurt_f1" # metric to find the best model
DATA_PATH = "../Dataset/new_large_CNN_dataset.csv" # Path to original dataset
MAX_SUMMARY_LENGTH_EVAL = 256

# --- Setup Logging ---
log_filename = f"evaluate_checkpoints_log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] - %(message)s", handlers=[logging.FileHandler(log_filename), logging.StreamHandler()])

# --- Data Loading and Processing Functions (from the original training script) ---
def sanitize_text(text):
    if not isinstance(text, str): return ""
    return text.replace('""', '"').strip()

def normalize_text(text):
    if not isinstance(text, str): return ""
    return ' '.join(unicodedata.normalize('NFKC', text).split())

def main():
    try:
        logging.info("--- Starting Post-Training Evaluation of Checkpoints ---")
        
        # 1. --- Load and Prepare the Test Dataset ---
        logging.info("Loading and preparing test data...")
        
        df_new = pd.read_csv(DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        for col in ['raw_news_article', 'english_summary', 'hindi_summary']:
            df_new[col] = df_new[col].apply(sanitize_text).apply(normalize_text)
        
        raw_dataset = Dataset.from_pandas(df_new)

        def format_dataset_mbart(batch):
            inputs, targets, langs = [], [], []
            for article, eng_summary, hin_summary in zip(batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']):
                if isinstance(article, str) and article:
                    inputs.append(article); targets.append(eng_summary); langs.append("en_XX")
                    inputs.append(article); targets.append(hin_summary); langs.append("hi_IN")
            return {'article': inputs, 'summary': targets, 'target_lang': langs}

        processed_dataset = raw_dataset.map(format_dataset_mbart, batched=True, remove_columns=raw_dataset.column_names)
        
        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        test_dataset_untokenized = train_test_split['test']
        logging.info(f"Test data prepared with {len(test_dataset_untokenized)} examples.")

        # 2. --- Find all Checkpoints ---
        checkpoint_dirs = sorted(
            [d for d in os.listdir(OUTPUT_DIR) if d.startswith("checkpoint-")],
            key=lambda x: int(x.split('-')[-1])
        )
        if not checkpoint_dirs:
            logging.error(f"FATAL: No 'checkpoint-*' directories found in '{OUTPUT_DIR}'.")
            return
        logging.info(f"Found {len(checkpoint_dirs)} checkpoints to evaluate: {checkpoint_dirs}")
        
        all_results = []
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{METRIC_NAME}"
        is_loss = 'loss' in metric_to_check.lower()
        
        rouge_metric = evaluate.load("rouge")
        bleurt_metric = evaluate.load("bleurt", "bleurt-20")

        def compute_metrics_wrapper(eval_pred, tokenizer):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
            bleurt_result = bleurt_metric.compute(predictions=decoded_preds, references=decoded_labels)
            result = {"rouge1": rouge_result["rouge1"], "rouge2": rouge_result["rouge2"], "rougeL": rouge_result["rougeL"], "bleurt_f1": np.mean(bleurt_result["scores"])}
            return {f"eval_{k}": round(v * 100, 4) for k, v in result.items()}

        # 3. --- Loop Through and Evaluate Each Checkpoint ---
        for chkpt_dir in checkpoint_dirs:
            chkpt_path = os.path.join(OUTPUT_DIR, chkpt_dir)
            logging.info(f"\n--- Evaluating Checkpoint: {chkpt_path} ---")
            
            model = MBartForConditionalGeneration.from_pretrained(chkpt_path)
            tokenizer = MBart50TokenizerFast.from_pretrained(chkpt_path)
            
            def tokenize_for_eval(examples):
                tokenizer.src_lang = "en_XX"
                model_inputs = tokenizer(examples['article'], max_length=1024, truncation=True)
                labels_batch = []
                for i in range(len(examples['summary'])):
                    tokenizer.tgt_lang = examples['target_lang'][i]
                    labels = tokenizer(text_target=examples['summary'][i], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
                    labels_batch.append(labels['input_ids'])
                model_inputs["labels"] = labels_batch
                return model_inputs

            tokenized_test_dataset = test_dataset_untokenized.map(tokenize_for_eval, batched=True, remove_columns=['article', 'summary', 'target_lang'])

            temp_training_args = Seq2SeqTrainingArguments(
                output_dir=os.path.join(OUTPUT_DIR, "temp_eval"),
                per_device_eval_batch_size=4,
                predict_with_generate=True,
                fp16=torch.cuda.is_available()
            )

            trainer = Seq2SeqTrainer(
                model=model, args=temp_training_args,
                eval_dataset=tokenized_test_dataset, tokenizer=tokenizer,
                data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
                compute_metrics=lambda p: compute_metrics_wrapper(p, tokenizer)
            )
            
            logging.info(f"Running evaluation on {chkpt_dir}...")
            eval_results = trainer.evaluate()
            
            logging.info(f"--- Results for {chkpt_dir} ---")
            for key, value in eval_results.items():
                logging.info(f"  - {key}: {value:.4f}")
            all_results.append({'checkpoint': chkpt_dir, **eval_results})
            
            metric_value = eval_results.get(metric_to_check)
            if metric_value is not None:
                if best_metric_value is None or (not is_loss and metric_value > best_metric_value) or (is_loss and metric_value < best_metric_value):
                    best_metric_value, best_checkpoint_path = metric_value, chkpt_path
                    logging.info(f"*** New best checkpoint found: {chkpt_dir} with {metric_to_check}: {metric_value:.4f} ***")

        # 4. --- Print Final Summary Table and Save the Best ---
        if not all_results:
            logging.error("No checkpoints were successfully evaluated.")
            return

        logging.info("\n" + "="*80)
        logging.info("--- FINAL EVALUATION SUMMARY ---".center(80))
        logging.info("="*80)
        header = f"{'Checkpoint':<20} | {'eval_loss':<12} | {'eval_rouge1':<12} | {'eval_rouge2':<12} | {'eval_rougeL':<12} | {'eval_bleurt_f1':<15}"
        logging.info(header)
        logging.info("-" * len(header))
        for result in all_results:
            row = f"{result['checkpoint']:<20} | {result.get('eval_loss', 'N/A'):<12.4f} | {result.get('eval_eval_rouge1', 'N/A'):<12.4f} | {result.get('eval_eval_rouge2', 'N/A'):<12.4f} | {result.get('eval_eval_rougeL', 'N/A'):<12.4f} | {result.get(metric_to_check, 'N/A'):<15.4f}"
            logging.info(row)
        logging.info("="*80)

        if not best_checkpoint_path:
            logging.error("Could not determine the best checkpoint after evaluation.")
            return

        logging.info(f"\n--- Best Model Identified ---")
        logging.info(f"Checkpoint: {best_checkpoint_path}")
        logging.info(f"Metric ({metric_to_check}): {best_metric_value:.4f}")

        final_model_path = os.path.join(OUTPUT_DIR, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Successfully copied best model to: {final_model_path}")

    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}", exc_info=True)

if __name__ == "__main__":
    main()



2025-10-07 23:31:36,192 [INFO] - --- Starting Post-Training Evaluation of Checkpoints ---
2025-10-07 23:31:36,197 [INFO] - Loading and preparing test data...


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

2025-10-07 23:31:39,400 [INFO] - Test data prepared with 1845 examples.
2025-10-07 23:31:39,400 [INFO] - Found 4 checkpoints to evaluate: ['checkpoint-2076', 'checkpoint-4152', 'checkpoint-6228', 'checkpoint-8304']


INFO:tensorflow:Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20.


2025-10-07 23:31:51,081 [INFO] - Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20.


INFO:tensorflow:Config file found, reading.


2025-10-07 23:31:51,083 [INFO] - Config file found, reading.


INFO:tensorflow:Will load checkpoint BLEURT-20


2025-10-07 23:31:51,089 [INFO] - Will load checkpoint BLEURT-20


INFO:tensorflow:Loads full paths and checks that files exists.


2025-10-07 23:31:51,090 [INFO] - Loads full paths and checks that files exists.


INFO:tensorflow:... name:BLEURT-20


2025-10-07 23:31:51,092 [INFO] - ... name:BLEURT-20


INFO:tensorflow:... bert_config_file:bert_config.json


2025-10-07 23:31:51,093 [INFO] - ... bert_config_file:bert_config.json


INFO:tensorflow:... max_seq_length:512


2025-10-07 23:31:51,096 [INFO] - ... max_seq_length:512


INFO:tensorflow:... vocab_file:None


2025-10-07 23:31:51,098 [INFO] - ... vocab_file:None


INFO:tensorflow:... do_lower_case:None


2025-10-07 23:31:51,099 [INFO] - ... do_lower_case:None


INFO:tensorflow:... sp_model:sent_piece


2025-10-07 23:31:51,102 [INFO] - ... sp_model:sent_piece


INFO:tensorflow:... dynamic_seq_length:True


2025-10-07 23:31:51,104 [INFO] - ... dynamic_seq_length:True


INFO:tensorflow:Creating BLEURT scorer.


2025-10-07 23:31:51,105 [INFO] - Creating BLEURT scorer.


INFO:tensorflow:Creating SentencePiece tokenizer.


2025-10-07 23:31:51,108 [INFO] - Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


2025-10-07 23:31:51,109 [INFO] - Creating SentencePiece tokenizer.


INFO:tensorflow:Will load model: C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20\sent_piece.model.


2025-10-07 23:31:51,111 [INFO] - Will load model: C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20\sent_piece.model.


INFO:tensorflow:SentencePiece tokenizer created.


2025-10-07 23:31:51,601 [INFO] - SentencePiece tokenizer created.


INFO:tensorflow:Creating Eager Mode predictor.


2025-10-07 23:31:51,614 [INFO] - Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


2025-10-07 23:31:51,617 [INFO] - Loading model.
2025-10-07 23:31:57,978 [INFO] - Fingerprint not found. Saved model loading will continue.
2025-10-07 23:31:57,980 [INFO] - path_and_singleprint metric could not be logged. Saved model loading will continue.


INFO:tensorflow:BLEURT initialized.


2025-10-07 23:31:57,986 [INFO] - BLEURT initialized.
2025-10-07 23:31:57,986 [INFO] - 
--- Evaluating Checkpoint: mbart-large-50-cnn-summarizer-v14\checkpoint-2076 ---


Map:   0%|          | 0/1845 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
2025-10-07 23:32:04,261 [INFO] - Running evaluation on checkpoint-2076...


KeyboardInterrupt: 

In [10]:
import os
import shutil
import logging
from datetime import datetime
import pandas as pd
import numpy as np
import torch
import evaluate
import unicodedata
from datasets import Dataset, DatasetDict
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
OUTPUT_DIR = "mbart-large-50-cnn-summarizer-v14" 
METRIC_NAME = "bleurt_f1" 
DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"
MAX_SUMMARY_LENGTH_EVAL = 256
# --- OPTIMIZATION: Increase batch size for faster evaluation ---
EVAL_BATCH_SIZE = 16 
# --- OPTIONAL: Set to a number (e.g., 500) for a quick evaluation on a subset, or None for the full dataset ---
NUM_EVAL_SAMPLES = 500 

# --- Setup Logging ---
log_filename = f"evaluate_checkpoints_log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] - %(message)s", handlers=[logging.FileHandler(log_filename), logging.StreamHandler()])

def sanitize_text(text):
    if not isinstance(text, str): return ""
    return text.replace('""', '"').strip()

def normalize_text(text):
    if not isinstance(text, str): return ""
    return ' '.join(unicodedata.normalize('NFKC', text).split())

def main():
    try:
        logging.info("--- Starting Post-Training Evaluation of Checkpoints (Optimized) ---")
        
        # 1. --- Load and Prepare the Test Dataset ---
        logging.info("Loading and preparing test data...")
        
        df_new = pd.read_csv(DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        for col in ['raw_news_article', 'english_summary', 'hindi_summary']:
            df_new[col] = df_new[col].apply(sanitize_text).apply(normalize_text)
        
        raw_dataset = Dataset.from_pandas(df_new)

        def format_dataset_mbart(batch):
            inputs, targets, langs = [], [], []
            for article, eng_summary, hin_summary in zip(batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']):
                if isinstance(article, str) and article:
                    inputs.append(article); targets.append(eng_summary); langs.append("en_XX")
                    inputs.append(article); targets.append(hin_summary); langs.append("hi_IN")
            return {'article': inputs, 'summary': targets, 'target_lang': langs}

        processed_dataset = raw_dataset.map(format_dataset_mbart, batched=True, remove_columns=raw_dataset.column_names)
        
        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        test_dataset_untokenized = train_test_split['test']
        
        if NUM_EVAL_SAMPLES:
            logging.warning(f"Using a subset of {NUM_EVAL_SAMPLES} examples for quick evaluation.")
            test_dataset_untokenized = test_dataset_untokenized.select(range(NUM_EVAL_SAMPLES))

        logging.info(f"Test data prepared with {len(test_dataset_untokenized)} examples.")

        # 2. --- Find all Checkpoints ---
        checkpoint_dirs = sorted(
            [d for d in os.listdir(OUTPUT_DIR) if d.startswith("checkpoint-")],
            key=lambda x: int(x.split('-')[-1])
        )
        if not checkpoint_dirs:
            logging.error(f"FATAL: No 'checkpoint-*' directories found in '{OUTPUT_DIR}'.")
            return
        logging.info(f"Found {len(checkpoint_dirs)} checkpoints to evaluate: {checkpoint_dirs}")
        
        all_results = []
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{METRIC_NAME}"
        is_loss = 'loss' in metric_to_check.lower()
        
        rouge_metric = evaluate.load("rouge")
        bleurt_metric = evaluate.load("bleurt", "bleurt-20")

        def compute_metrics_wrapper(eval_pred, tokenizer):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
            bleurt_result = bleurt_metric.compute(predictions=decoded_preds, references=decoded_labels)
            result = {"rouge1": rouge_result["rouge1"], "rouge2": rouge_result["rouge2"], "rougeL": rouge_result["rougeL"], "bleurt_f1": np.mean(bleurt_result["scores"])}
            return {f"eval_{k}": v for k, v in result.items()} # Return raw scores for averaging

        # 3. --- Loop Through and Evaluate Each Checkpoint ---
        for chkpt_dir in checkpoint_dirs:
            chkpt_path = os.path.join(OUTPUT_DIR, chkpt_dir)
            logging.info(f"\n--- Evaluating Checkpoint: {chkpt_path} ---")
            
            model = MBartForConditionalGeneration.from_pretrained(chkpt_path)
            tokenizer = MBart50TokenizerFast.from_pretrained(chkpt_path)
            
            def tokenize_for_eval(examples):
                tokenizer.src_lang = "en_XX"
                model_inputs = tokenizer(examples['article'], max_length=1024, truncation=True)
                labels_batch = []
                for i in range(len(examples['summary'])):
                    tokenizer.tgt_lang = examples['target_lang'][i]
                    labels = tokenizer(text_target=examples['summary'][i], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
                    labels_batch.append(labels['input_ids'])
                model_inputs["labels"] = labels_batch
                return model_inputs

            tokenized_test_dataset = test_dataset_untokenized.map(tokenize_for_eval, batched=True, remove_columns=['article', 'summary', 'target_lang'])

            temp_training_args = Seq2SeqTrainingArguments(
                output_dir=os.path.join(OUTPUT_DIR, "temp_eval"),
                per_device_eval_batch_size=EVAL_BATCH_SIZE, # Using the optimized batch size
                predict_with_generate=True,
                fp16=torch.cuda.is_available()
            )

            trainer = Seq2SeqTrainer(
                model=model, args=temp_training_args,
                eval_dataset=tokenized_test_dataset, tokenizer=tokenizer,
                data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
                compute_metrics=lambda p: compute_metrics_wrapper(p, tokenizer)
            )
            
            logging.info(f"Running evaluation on {chkpt_dir}...")
            eval_results = trainer.evaluate()
            
            # Round metrics for logging
            rounded_results = {k: round(v, 4) for k, v in eval_results.items()}
            logging.info(f"--- Results for {chkpt_dir} ---")
            for key, value in rounded_results.items():
                logging.info(f"  - {key}: {value}")
            all_results.append({'checkpoint': chkpt_dir, **rounded_results})
            
            metric_value = eval_results.get(metric_to_check)
            if metric_value is not None:
                if best_metric_value is None or (not is_loss and metric_value > best_metric_value) or (is_loss and metric_value < best_metric_value):
                    best_metric_value, best_checkpoint_path = metric_value, chkpt_path
                    logging.info(f"*** New best checkpoint found: {chkpt_dir} with {metric_to_check}: {metric_value:.4f} ***")

        # 4. --- Print Final Summary Table and Save the Best ---
        if not all_results:
            logging.error("No checkpoints were successfully evaluated.")
            return

        logging.info("\n" + "="*80)
        logging.info("--- FINAL EVALUATION SUMMARY ---".center(80))
        logging.info("="*80)
        header = f"{'Checkpoint':<20} | {'eval_loss':<12} | {'eval_rouge1':<12} | {'eval_rouge2':<12} | {'eval_rougeL':<12} | {'eval_bleurt_f1':<15}"
        logging.info(header)
        logging.info("-" * len(header))
        for result in all_results:
            row = f"{result['checkpoint']:<20} | {result.get('eval_loss', 'N/A'):<12.4f} | {result.get('eval_eval_rouge1', 'N/A'):<12.4f} | {result.get('eval_eval_rouge2', 'N/A'):<12.4f} | {result.get('eval_eval_rougeL', 'N/A'):<12.4f} | {result.get(metric_to_check, 'N/A'):<15.4f}"
            logging.info(row)
        logging.info("="*80)

        if not best_checkpoint_path:
            logging.error("Could not determine the best checkpoint after evaluation.")
            return

        logging.info(f"\n--- Best Model Identified ---")
        logging.info(f"Checkpoint: {best_checkpoint_path}")
        logging.info(f"Metric ({metric_to_check}): {best_metric_value:.4f}")

        final_model_path = os.path.join(OUTPUT_DIR, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Successfully copied best model to: {final_model_path}")

    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}", exc_info=True)

if __name__ == "__main__":
    main()



2025-10-08 00:05:37,655 [INFO] - --- Starting Post-Training Evaluation of Checkpoints (Optimized) ---
2025-10-08 00:05:37,657 [INFO] - Loading and preparing test data...


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

2025-10-08 00:05:40,845 [INFO] - Test data prepared with 500 examples.
2025-10-08 00:05:40,845 [INFO] - Found 4 checkpoints to evaluate: ['checkpoint-2076', 'checkpoint-4152', 'checkpoint-6228', 'checkpoint-8304']


INFO:tensorflow:Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20.


2025-10-08 00:05:53,257 [INFO] - Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20.


INFO:tensorflow:Config file found, reading.


2025-10-08 00:05:53,257 [INFO] - Config file found, reading.


INFO:tensorflow:Will load checkpoint BLEURT-20


2025-10-08 00:05:53,270 [INFO] - Will load checkpoint BLEURT-20


INFO:tensorflow:Loads full paths and checks that files exists.


2025-10-08 00:05:53,270 [INFO] - Loads full paths and checks that files exists.


INFO:tensorflow:... name:BLEURT-20


2025-10-08 00:05:53,273 [INFO] - ... name:BLEURT-20


INFO:tensorflow:... bert_config_file:bert_config.json


2025-10-08 00:05:53,273 [INFO] - ... bert_config_file:bert_config.json


INFO:tensorflow:... max_seq_length:512


2025-10-08 00:05:53,279 [INFO] - ... max_seq_length:512


INFO:tensorflow:... vocab_file:None


2025-10-08 00:05:53,281 [INFO] - ... vocab_file:None


INFO:tensorflow:... do_lower_case:None


2025-10-08 00:05:53,281 [INFO] - ... do_lower_case:None


INFO:tensorflow:... sp_model:sent_piece


2025-10-08 00:05:53,281 [INFO] - ... sp_model:sent_piece


INFO:tensorflow:... dynamic_seq_length:True


2025-10-08 00:05:53,281 [INFO] - ... dynamic_seq_length:True


INFO:tensorflow:Creating BLEURT scorer.


2025-10-08 00:05:53,281 [INFO] - Creating BLEURT scorer.


INFO:tensorflow:Creating SentencePiece tokenizer.


2025-10-08 00:05:53,290 [INFO] - Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


2025-10-08 00:05:53,292 [INFO] - Creating SentencePiece tokenizer.


INFO:tensorflow:Will load model: C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20\sent_piece.model.


2025-10-08 00:05:53,293 [INFO] - Will load model: C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20\sent_piece.model.


INFO:tensorflow:SentencePiece tokenizer created.


2025-10-08 00:05:53,760 [INFO] - SentencePiece tokenizer created.


INFO:tensorflow:Creating Eager Mode predictor.


2025-10-08 00:05:53,761 [INFO] - Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


2025-10-08 00:05:53,761 [INFO] - Loading model.
2025-10-08 00:05:59,845 [INFO] - Fingerprint not found. Saved model loading will continue.
2025-10-08 00:05:59,856 [INFO] - path_and_singleprint metric could not be logged. Saved model loading will continue.


INFO:tensorflow:BLEURT initialized.


2025-10-08 00:05:59,860 [INFO] - BLEURT initialized.
2025-10-08 00:05:59,860 [INFO] - 
--- Evaluating Checkpoint: mbart-large-50-cnn-summarizer-v14\checkpoint-2076 ---


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
2025-10-08 00:06:03,694 [INFO] - Running evaluation on checkpoint-2076...


2025-10-08 00:14:13,634 [INFO] - Using default tokenizer.
2025-10-08 00:25:55,794 [INFO] - --- Results for checkpoint-2076 ---
2025-10-08 00:25:55,796 [INFO] -   - eval_rouge1: 0.0409
2025-10-08 00:25:55,797 [INFO] -   - eval_rouge2: 0.021
2025-10-08 00:25:55,799 [INFO] -   - eval_rougeL: 0.0276
2025-10-08 00:25:55,799 [INFO] -   - eval_bleurt_f1: 0.2583
2025-10-08 00:25:55,800 [INFO] -   - eval_loss: 1.6859
2025-10-08 00:25:55,801 [INFO] -   - eval_model_preparation_time: 0.0048
2025-10-08 00:25:55,801 [INFO] -   - eval_runtime: 1192.0459
2025-10-08 00:25:55,802 [INFO] -   - eval_samples_per_second: 0.419
2025-10-08 00:25:55,804 [INFO] -   - eval_steps_per_second: 0.027
2025-10-08 00:25:55,805 [INFO] - *** New best checkpoint found: checkpoint-2076 with eval_bleurt_f1: 0.2583 ***
2025-10-08 00:25:55,806 [INFO] - 
--- Evaluating Checkpoint: mbart-large-50-cnn-summarizer-v14\checkpoint-4152 ---


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

2025-10-08 00:25:59,911 [INFO] - Running evaluation on checkpoint-4152...


2025-10-08 00:34:06,833 [INFO] - Using default tokenizer.
2025-10-08 00:45:50,066 [INFO] - --- Results for checkpoint-4152 ---
2025-10-08 00:45:50,068 [INFO] -   - eval_rouge1: 0.2696
2025-10-08 00:45:50,068 [INFO] -   - eval_rouge2: 0.106
2025-10-08 00:45:50,068 [INFO] -   - eval_rougeL: 0.2434
2025-10-08 00:45:50,068 [INFO] -   - eval_bleurt_f1: 0.4036
2025-10-08 00:45:50,072 [INFO] -   - eval_loss: 1.5638
2025-10-08 00:45:50,072 [INFO] -   - eval_model_preparation_time: 0.0039
2025-10-08 00:45:50,072 [INFO] -   - eval_runtime: 1189.7429
2025-10-08 00:45:50,072 [INFO] -   - eval_samples_per_second: 0.42
2025-10-08 00:45:50,077 [INFO] -   - eval_steps_per_second: 0.027
2025-10-08 00:45:50,077 [INFO] - *** New best checkpoint found: checkpoint-4152 with eval_bleurt_f1: 0.4036 ***
2025-10-08 00:45:50,080 [INFO] - 
--- Evaluating Checkpoint: mbart-large-50-cnn-summarizer-v14\checkpoint-6228 ---


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

2025-10-08 00:45:54,015 [INFO] - Running evaluation on checkpoint-6228...


KeyboardInterrupt: 

In [2]:
import os
import shutil
import logging
from datetime import datetime
import pandas as pd
import numpy as np
import torch
import evaluate
import unicodedata
from datasets import Dataset, DatasetDict
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
OUTPUT_DIR = "mbart-large-50-cnn-summarizer-v14" 
METRIC_NAME = "bleurt_f1" 
DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"
MAX_SUMMARY_LENGTH_EVAL = 256
# --- OPTIMIZATION: Increase batch size for faster evaluation ---
EVAL_BATCH_SIZE = 16 
# --- OPTIONAL: Set to a number (e.g., 500) for a quick evaluation on a subset, or None for the full dataset ---
NUM_EVAL_SAMPLES = 500 

# --- Setup Logging ---
log_filename = f"evaluate_checkpoints_log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] - %(message)s", handlers=[logging.FileHandler(log_filename), logging.StreamHandler()])

def sanitize_text(text):
    if not isinstance(text, str): return ""
    return text.replace('""', '"').strip()

def normalize_text(text):
    if not isinstance(text, str): return ""
    return ' '.join(unicodedata.normalize('NFKC', text).split())

def main():
    try:
        logging.info("--- Starting Post-Training Evaluation of Checkpoints (Optimized) ---")
        
        # 1. --- Load and Prepare the Test Dataset ---
        logging.info("Loading and preparing test data...")
        
        df_new = pd.read_csv(DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        for col in ['raw_news_article', 'english_summary', 'hindi_summary']:
            df_new[col] = df_new[col].apply(sanitize_text).apply(normalize_text)
        
        raw_dataset = Dataset.from_pandas(df_new)

        def format_dataset_mbart(batch):
            inputs, targets, langs = [], [], []
            for article, eng_summary, hin_summary in zip(batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']):
                if isinstance(article, str) and article:
                    inputs.append(article); targets.append(eng_summary); langs.append("en_XX")
                    inputs.append(article); targets.append(hin_summary); langs.append("hi_IN")
            return {'article': inputs, 'summary': targets, 'target_lang': langs}

        processed_dataset = raw_dataset.map(format_dataset_mbart, batched=True, remove_columns=raw_dataset.column_names)
        
        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        test_dataset_untokenized = train_test_split['test']
        
        if NUM_EVAL_SAMPLES:
            logging.warning(f"Using a subset of {NUM_EVAL_SAMPLES} examples for quick evaluation.")
            test_dataset_untokenized = test_dataset_untokenized.select(range(NUM_EVAL_SAMPLES))

        logging.info(f"Test data prepared with {len(test_dataset_untokenized)} examples.")

        # 2. --- Find all Checkpoints ---
        checkpoint_dirs = sorted(
            [d for d in os.listdir(OUTPUT_DIR) if d.startswith("checkpoint-")],
            key=lambda x: int(x.split('-')[-1])
        )
        if not checkpoint_dirs:
            logging.error(f"FATAL: No 'checkpoint-*' directories found in '{OUTPUT_DIR}'.")
            return
        logging.info(f"Found {len(checkpoint_dirs)} checkpoints to evaluate: {checkpoint_dirs}")
        
        all_results = []
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{METRIC_NAME}"
        is_loss = 'loss' in metric_to_check.lower()
        
        rouge_metric = evaluate.load("rouge")
        bleurt_metric = evaluate.load("bleurt", "bleurt-20")

        def compute_metrics_wrapper(eval_pred, tokenizer):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
            bleurt_result = bleurt_metric.compute(predictions=decoded_preds, references=decoded_labels)
            result = {"rouge1": rouge_result["rouge1"], "rouge2": rouge_result["rouge2"], "rougeL": rouge_result["rougeL"], "bleurt_f1": np.mean(bleurt_result["scores"])}
            return {f"eval_{k}": v for k, v in result.items()} # Return raw scores for averaging

        # 3. --- Loop Through and Evaluate Each Checkpoint ---
        for chkpt_dir in checkpoint_dirs:
            chkpt_path = os.path.join(OUTPUT_DIR, chkpt_dir)
            logging.info(f"\n--- Evaluating Checkpoint: {chkpt_path} ---")
            
            model = MBartForConditionalGeneration.from_pretrained(chkpt_path)
            tokenizer = MBart50TokenizerFast.from_pretrained(chkpt_path)
            
            def tokenize_for_eval(examples):
                tokenizer.src_lang = "en_XX"
                model_inputs = tokenizer(examples['article'], max_length=1024, truncation=True)
                labels_batch = []
                for i in range(len(examples['summary'])):
                    tokenizer.tgt_lang = examples['target_lang'][i]
                    labels = tokenizer(text_target=examples['summary'][i], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
                    labels_batch.append(labels['input_ids'])
                model_inputs["labels"] = labels_batch
                return model_inputs

            tokenized_test_dataset = test_dataset_untokenized.map(tokenize_for_eval, batched=True, remove_columns=['article', 'summary', 'target_lang'])

            temp_training_args = Seq2SeqTrainingArguments(
                output_dir=os.path.join(OUTPUT_DIR, "temp_eval"),
                per_device_eval_batch_size=EVAL_BATCH_SIZE, # Using the optimized batch size
                predict_with_generate=True,
                fp16=torch.cuda.is_available()
            )

            trainer = Seq2SeqTrainer(
                model=model, args=temp_training_args,
                eval_dataset=tokenized_test_dataset, tokenizer=tokenizer,
                data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
                compute_metrics=lambda p: compute_metrics_wrapper(p, tokenizer)
            )
            
            logging.info(f"Running evaluation on {chkpt_dir}...")
            eval_results = trainer.evaluate()
            
            # Round metrics for logging
            rounded_results = {k: round(v, 4) for k, v in eval_results.items()}
            logging.info(f"--- Results for {chkpt_dir} ---")
            for key, value in rounded_results.items():
                logging.info(f"  - {key}: {value}")
            all_results.append({'checkpoint': chkpt_dir, **rounded_results})
            
            metric_value = eval_results.get(metric_to_check)
            if metric_value is not None:
                if best_metric_value is None or (not is_loss and metric_value > best_metric_value) or (is_loss and metric_value < best_metric_value):
                    best_metric_value, best_checkpoint_path = metric_value, chkpt_path
                    logging.info(f"*** New best checkpoint found: {chkpt_dir} with {metric_to_check}: {metric_value:.4f} ***")

        # 4. --- Print Final Summary Table and Save the Best ---
        if not all_results:
            logging.error("No checkpoints were successfully evaluated.")
            return

        logging.info("\n" + "="*80)
        logging.info("--- FINAL EVALUATION SUMMARY ---".center(80))
        logging.info("="*80)
        header = f"{'Checkpoint':<20} | {'eval_loss':<12} | {'eval_rouge1':<12} | {'eval_rouge2':<12} | {'eval_rougeL':<12} | {'eval_bleurt_f1':<15}"
        logging.info(header)
        logging.info("-" * len(header))
        for result in all_results:
            row = f"{result['checkpoint']:<20} | {result.get('eval_loss', 'N/A'):<12.4f} | {result.get('eval_eval_rouge1', 'N/A'):<12.4f} | {result.get('eval_eval_rouge2', 'N/A'):<12.4f} | {result.get('eval_eval_rougeL', 'N/A'):<12.4f} | {result.get(metric_to_check, 'N/A'):<15.4f}"
            logging.info(row)
        logging.info("="*80)

        if not best_checkpoint_path:
            logging.error("Could not determine the best checkpoint after evaluation.")
            return

        logging.info(f"\n--- Best Model Identified ---")
        logging.info(f"Checkpoint: {best_checkpoint_path}")
        logging.info(f"Metric ({metric_to_check}): {best_metric_value:.4f}")

        final_model_path = os.path.join(OUTPUT_DIR, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Successfully copied best model to: {final_model_path}")

    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}", exc_info=True)

if __name__ == "__main__":
    main()



2025-10-08 01:07:05,211 [INFO] - --- Starting Post-Training Evaluation of Checkpoints (Optimized) ---
2025-10-08 01:07:05,212 [INFO] - Loading and preparing test data...


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

2025-10-08 01:07:08,338 [INFO] - Test data prepared with 500 examples.
2025-10-08 01:07:08,338 [INFO] - Found 4 checkpoints to evaluate: ['checkpoint-2076', 'checkpoint-4152', 'checkpoint-6228', 'checkpoint-8304']


INFO:tensorflow:Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20.


2025-10-08 01:07:20,768 [INFO] - Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20.


INFO:tensorflow:Config file found, reading.


2025-10-08 01:07:20,768 [INFO] - Config file found, reading.


INFO:tensorflow:Will load checkpoint BLEURT-20


2025-10-08 01:07:20,768 [INFO] - Will load checkpoint BLEURT-20


INFO:tensorflow:Loads full paths and checks that files exists.


2025-10-08 01:07:20,768 [INFO] - Loads full paths and checks that files exists.


INFO:tensorflow:... name:BLEURT-20


2025-10-08 01:07:20,768 [INFO] - ... name:BLEURT-20


INFO:tensorflow:... bert_config_file:bert_config.json


2025-10-08 01:07:20,768 [INFO] - ... bert_config_file:bert_config.json


INFO:tensorflow:... max_seq_length:512


2025-10-08 01:07:20,768 [INFO] - ... max_seq_length:512


INFO:tensorflow:... vocab_file:None


2025-10-08 01:07:20,768 [INFO] - ... vocab_file:None


INFO:tensorflow:... do_lower_case:None


2025-10-08 01:07:20,768 [INFO] - ... do_lower_case:None


INFO:tensorflow:... sp_model:sent_piece


2025-10-08 01:07:20,784 [INFO] - ... sp_model:sent_piece


INFO:tensorflow:... dynamic_seq_length:True


2025-10-08 01:07:20,785 [INFO] - ... dynamic_seq_length:True


INFO:tensorflow:Creating BLEURT scorer.


2025-10-08 01:07:20,787 [INFO] - Creating BLEURT scorer.


INFO:tensorflow:Creating SentencePiece tokenizer.


2025-10-08 01:07:20,790 [INFO] - Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


2025-10-08 01:07:20,791 [INFO] - Creating SentencePiece tokenizer.


INFO:tensorflow:Will load model: C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20\sent_piece.model.


2025-10-08 01:07:20,791 [INFO] - Will load model: C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20\sent_piece.model.


INFO:tensorflow:SentencePiece tokenizer created.


2025-10-08 01:07:21,268 [INFO] - SentencePiece tokenizer created.


INFO:tensorflow:Creating Eager Mode predictor.


2025-10-08 01:07:21,271 [INFO] - Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


2025-10-08 01:07:21,272 [INFO] - Loading model.
2025-10-08 01:07:27,084 [INFO] - Fingerprint not found. Saved model loading will continue.
2025-10-08 01:07:27,084 [INFO] - path_and_singleprint metric could not be logged. Saved model loading will continue.


INFO:tensorflow:BLEURT initialized.


2025-10-08 01:07:27,094 [INFO] - BLEURT initialized.
2025-10-08 01:07:27,094 [INFO] - 
--- Evaluating Checkpoint: mbart-large-50-cnn-summarizer-v14\checkpoint-2076 ---


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
2025-10-08 01:07:31,553 [INFO] - Running evaluation on checkpoint-2076...


2025-10-08 01:14:46,765 [INFO] - Using default tokenizer.
2025-10-08 01:25:16,994 [INFO] - --- Results for checkpoint-2076 ---
2025-10-08 01:25:16,997 [INFO] -   - eval_rouge1: 0.0409
2025-10-08 01:25:16,998 [INFO] -   - eval_rouge2: 0.0209
2025-10-08 01:25:16,998 [INFO] -   - eval_rougeL: 0.0275
2025-10-08 01:25:16,998 [INFO] -   - eval_bleurt_f1: 0.2583
2025-10-08 01:25:16,998 [INFO] -   - eval_loss: 1.6859
2025-10-08 01:25:16,998 [INFO] -   - eval_model_preparation_time: 0.0
2025-10-08 01:25:17,003 [INFO] -   - eval_runtime: 1064.9994
2025-10-08 01:25:17,004 [INFO] -   - eval_samples_per_second: 0.469
2025-10-08 01:25:17,006 [INFO] -   - eval_steps_per_second: 0.03
2025-10-08 01:25:17,006 [INFO] - *** New best checkpoint found: checkpoint-2076 with eval_bleurt_f1: 0.2583 ***
2025-10-08 01:25:17,006 [INFO] - 
--- Evaluating Checkpoint: mbart-large-50-cnn-summarizer-v14\checkpoint-4152 ---


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

2025-10-08 01:25:20,924 [INFO] - Running evaluation on checkpoint-4152...


2025-10-08 01:32:34,883 [INFO] - Using default tokenizer.
2025-10-08 01:43:15,831 [INFO] - --- Results for checkpoint-4152 ---
2025-10-08 01:43:15,831 [INFO] -   - eval_rouge1: 0.2682
2025-10-08 01:43:15,831 [INFO] -   - eval_rouge2: 0.1067
2025-10-08 01:43:15,831 [INFO] -   - eval_rougeL: 0.2432
2025-10-08 01:43:15,831 [INFO] -   - eval_bleurt_f1: 0.4036
2025-10-08 01:43:15,831 [INFO] -   - eval_loss: 1.5638
2025-10-08 01:43:15,831 [INFO] -   - eval_model_preparation_time: 0.0055
2025-10-08 01:43:15,831 [INFO] -   - eval_runtime: 1074.5817
2025-10-08 01:43:15,839 [INFO] -   - eval_samples_per_second: 0.465
2025-10-08 01:43:15,839 [INFO] -   - eval_steps_per_second: 0.03
2025-10-08 01:43:15,843 [INFO] - *** New best checkpoint found: checkpoint-4152 with eval_bleurt_f1: 0.4036 ***
2025-10-08 01:43:15,844 [INFO] - 
--- Evaluating Checkpoint: mbart-large-50-cnn-summarizer-v14\checkpoint-6228 ---


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

2025-10-08 01:43:19,693 [INFO] - Running evaluation on checkpoint-6228...


2025-10-08 01:50:42,969 [INFO] - Using default tokenizer.
2025-10-08 02:01:50,847 [INFO] - --- Results for checkpoint-6228 ---
2025-10-08 02:01:50,847 [INFO] -   - eval_rouge1: 0.2363
2025-10-08 02:01:50,847 [INFO] -   - eval_rouge2: 0.0982
2025-10-08 02:01:50,863 [INFO] -   - eval_rougeL: 0.2146
2025-10-08 02:01:50,864 [INFO] -   - eval_bleurt_f1: 0.3982
2025-10-08 02:01:50,864 [INFO] -   - eval_loss: 1.5082
2025-10-08 02:01:50,864 [INFO] -   - eval_model_preparation_time: 0.0136
2025-10-08 02:01:50,864 [INFO] -   - eval_runtime: 1110.6297
2025-10-08 02:01:50,864 [INFO] -   - eval_samples_per_second: 0.45
2025-10-08 02:01:50,864 [INFO] -   - eval_steps_per_second: 0.029
2025-10-08 02:01:50,864 [INFO] - 
--- Evaluating Checkpoint: mbart-large-50-cnn-summarizer-v14\checkpoint-8304 ---


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

2025-10-08 02:01:54,763 [INFO] - Running evaluation on checkpoint-8304...


2025-10-08 02:09:18,003 [INFO] - Using default tokenizer.
2025-10-08 02:20:39,800 [INFO] - --- Results for checkpoint-8304 ---
2025-10-08 02:20:39,800 [INFO] -   - eval_rouge1: 0.249
2025-10-08 02:20:39,800 [INFO] -   - eval_rouge2: 0.111
2025-10-08 02:20:39,800 [INFO] -   - eval_rougeL: 0.2186
2025-10-08 02:20:39,816 [INFO] -   - eval_bleurt_f1: 0.3965
2025-10-08 02:20:39,817 [INFO] -   - eval_loss: 1.5014
2025-10-08 02:20:39,818 [INFO] -   - eval_model_preparation_time: 0.0167
2025-10-08 02:20:39,819 [INFO] -   - eval_runtime: 1124.7317
2025-10-08 02:20:39,820 [INFO] -   - eval_samples_per_second: 0.445
2025-10-08 02:20:39,821 [INFO] -   - eval_steps_per_second: 0.028
2025-10-08 02:20:39,822 [INFO] - 
2025-10-08 02:20:39,822 [INFO] -                         --- FINAL EVALUATION SUMMARY ---                        
2025-10-08 02:20:39,828 [INFO] - Checkpoint           | eval_loss    | eval_rouge1  | eval_rouge2  | eval_rougeL  | eval_bleurt_f1 
2025-10-08 02:20:39,829 [INFO] - --------

In [3]:
import os
import shutil
import logging
from datetime import datetime

OUTPUT_DIR = "mbart-large-50-cnn-summarizer-v14" 
# The specific checkpoint folder you identified as the best.
BEST_CHECKPOINT_FOLDER = "checkpoint-4152"
# ----------------------------------------------------------------

# --- Setup Logging ---
log_filename = f"manual_save_log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] - %(message)s", handlers=[logging.FileHandler(log_filename), logging.StreamHandler()])

def save_specific_checkpoint():
    """
    Manually copies a specified checkpoint directory to a 'final_model' directory.
    """
    try:
        source_path = os.path.join(OUTPUT_DIR, BEST_CHECKPOINT_FOLDER)
        destination_path = os.path.join(OUTPUT_DIR, "final_model")

        logging.info(f"--- Starting Manual Model Save ---")
        logging.info(f"Source Checkpoint: {source_path}")
        logging.info(f"Destination: {destination_path}")

        # Check if the source checkpoint directory exists
        if not os.path.isdir(source_path):
            logging.error(f"FATAL: The source checkpoint directory '{source_path}' does not exist.")
            logging.error("Please ensure the OUTPUT_DIR and BEST_CHECKPOINT_FOLDER variables are set correctly.")
            return

        # If a 'final_model' directory already exists, remove it for a clean copy
        if os.path.exists(destination_path):
            logging.warning(f"Removing existing 'final_model' directory: {destination_path}")
            shutil.rmtree(destination_path)

        # Copy the entire checkpoint directory to the 'final_model' destination
        shutil.copytree(source_path, destination_path)

        logging.info(f"\nSUCCESS: Successfully copied '{BEST_CHECKPOINT_FOLDER}' to '{destination_path}'.")
        logging.info("Your final model is now ready to use.")

    except Exception as e:
        logging.error(f"An unexpected error occurred during the copy process: {e}", exc_info=True)

if __name__ == "__main__":
    save_specific_checkpoint()


2025-10-08 02:25:57,558 [INFO] - --- Starting Manual Model Save ---
2025-10-08 02:25:57,558 [INFO] - Source Checkpoint: mbart-large-50-cnn-summarizer-v14\checkpoint-4152
2025-10-08 02:25:57,558 [INFO] - Destination: mbart-large-50-cnn-summarizer-v14\final_model
2025-10-08 02:27:22,891 [INFO] - 
SUCCESS: Successfully copied 'checkpoint-4152' to 'mbart-large-50-cnn-summarizer-v14\final_model'.
2025-10-08 02:27:22,891 [INFO] - Your final model is now ready to use.


In [1]:
import os
import shutil
import logging
from datetime import datetime
import pandas as pd
import numpy as np
import torch
import evaluate
import unicodedata
from datasets import Dataset
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
# Import the bleurt library directly for GPU-acceleration ---
try:
    from bleurt import score as bleurt_scorer
    BLEURT_INSTALLED = True
except ImportError:
    BLEURT_INSTALLED = False

# --- Configuration ---
OUTPUT_DIR = "mbart-large-50-cnn-summarizer-v14" 
METRIC_NAME = "rougeL" 
DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"
MAX_SUMMARY_LENGTH_EVAL = 256
EVAL_BATCH_SIZE = 16 
NUM_EVAL_SAMPLES = 1000 

#Control whether to run BLEURT on the GPU ---
USE_GPU_FOR_BLEURT = True

# --- Setup Logging ---
log_filename = f"evaluate_checkpoints_log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] - %(message)s", handlers=[logging.FileHandler(log_filename), logging.StreamHandler()])

def sanitize_text(text):
    if not isinstance(text, str): return ""
    return text.replace('""', '"').strip()

def normalize_text(text):
    if not isinstance(text, str): return ""
    return ' '.join(unicodedata.normalize('NFKC', text).split())

def main():
    try:
        logging.info("--- Starting Post-Training Evaluation of Checkpoints (GPU-Accelerated) ---")
        
        # 1. --- Load and Prepare the Test Dataset ---
        logging.info("Loading and preparing test data...")
        
        df_new = pd.read_csv(DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        for col in ['raw_news_article', 'english_summary', 'hindi_summary']:
            df_new[col] = df_new[col].apply(sanitize_text).apply(normalize_text)
        
        raw_dataset = Dataset.from_pandas(df_new)

        def format_dataset_mbart(batch):
            inputs, targets, langs = [], [], []
            for article, eng_summary, hin_summary in zip(batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']):
                if isinstance(article, str) and article:
                    inputs.append(article); targets.append(eng_summary); langs.append("en_XX")
                    inputs.append(article); targets.append(hin_summary); langs.append("hi_IN")
            return {'article': inputs, 'summary': targets, 'target_lang': langs}

        processed_dataset = raw_dataset.map(format_dataset_mbart, batched=True, remove_columns=raw_dataset.column_names)
        
        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        test_dataset_untokenized = train_test_split['test']
        
        if NUM_EVAL_SAMPLES:
            logging.warning(f"Using a subset of {NUM_EVAL_SAMPLES} examples for quick evaluation.")
            test_dataset_untokenized = test_dataset_untokenized.select(range(NUM_EVAL_SAMPLES))

        logging.info(f"Test data prepared with {len(test_dataset_untokenized)} examples.")

        # 2. --- Find all Checkpoints ---
        checkpoint_dirs = sorted([d for d in os.listdir(OUTPUT_DIR) if d.startswith("checkpoint-")], key=lambda x: int(x.split('-')[-1]))
        if not checkpoint_dirs:
            logging.error(f"FATAL: No 'checkpoint-*' directories found in '{OUTPUT_DIR}'.")
            return
        logging.info(f"Found {len(checkpoint_dirs)} checkpoints to evaluate: {checkpoint_dirs}")
        
        all_results = []
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{METRIC_NAME}"
        is_loss = 'loss' in metric_to_check.lower()
        
        rouge_metric = evaluate.load("rouge")
        
        bleurt_scorer_gpu = None
        bleurt_metric_cpu = None
        if USE_GPU_FOR_BLEURT:
            if BLEURT_INSTALLED:
                logging.info("Initializing GPU-based BLEURT scorer (will auto-download checkpoint if needed)...")
                # This will automatically download and cache the model the first time it's run
                bleurt_scorer_gpu = bleurt_scorer.BleurtScorer("bleurt-20")
                logging.info("GPU-based BLEURT scorer initialized.")
            else:
                logging.error("The 'bleurt' library is not installed. Please run 'pip install git+https://github.com/google-research/bleurt.git'.")
                logging.warning("Falling back to CPU-based BLEURT calculation.")
                bleurt_metric_cpu = evaluate.load("bleurt", "bleurt-20")
        else:
            bleurt_metric_cpu = evaluate.load("bleurt", "bleurt-20")


        def compute_metrics_wrapper(eval_pred, tokenizer):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            
            logging.info("Calculating ROUGE scores...")
            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
            result = {"rouge1": rouge_result["rouge1"], "rouge2": rouge_result["rouge2"], "rougeL": rouge_result["rougeL"]}
            
            if bleurt_scorer_gpu:
                logging.info("Calculating BLEURT scores on GPU...")
                bleurt_scores = bleurt_scorer_gpu.score(references=decoded_labels, candidates=decoded_preds, batch_size=EVAL_BATCH_SIZE)
                result["bleurt_f1"] = np.mean(bleurt_scores)
            elif bleurt_metric_cpu:
                logging.info("Calculating BLEURT scores on CPU (this will be very slow)...")
                bleurt_result = bleurt_metric_cpu.compute(predictions=decoded_preds, references=decoded_labels)
                result["bleurt_f1"] = np.mean(bleurt_result["scores"])
            
            return {f"eval_{k}": v for k, v in result.items()}

        # 3. --- Loop Through and Evaluate Each Checkpoint ---
        for chkpt_dir in checkpoint_dirs:
            chkpt_path = os.path.join(OUTPUT_DIR, chkpt_dir)
            logging.info(f"\n--- Evaluating Checkpoint: {chkpt_path} ---")
            
            model = MBartForConditionalGeneration.from_pretrained(chkpt_path)
            tokenizer = MBart50TokenizerFast.from_pretrained(chkpt_path)
            
            def tokenize_for_eval(examples):
                tokenizer.src_lang = "en_XX"
                model_inputs = tokenizer(examples['article'], max_length=1024, truncation=True)
                labels_batch = []
                for i in range(len(examples['summary'])):
                    tokenizer.tgt_lang = examples['target_lang'][i]
                    labels = tokenizer(text_target=examples['summary'][i], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
                    labels_batch.append(labels['input_ids'])
                model_inputs["labels"] = labels_batch
                return model_inputs

            tokenized_test_dataset = test_dataset_untokenized.map(tokenize_for_eval, batched=True, remove_columns=['article', 'summary', 'target_lang'])

            temp_training_args = Seq2SeqTrainingArguments(
                output_dir=os.path.join(OUTPUT_DIR, "temp_eval"),
                per_device_eval_batch_size=EVAL_BATCH_SIZE,
                predict_with_generate=True,
                fp16=torch.cuda.is_available()
            )

            trainer = Seq2SeqTrainer(
                model=model, args=temp_training_args,
                eval_dataset=tokenized_test_dataset, tokenizer=tokenizer,
                data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
                compute_metrics=lambda p: compute_metrics_wrapper(p, tokenizer)
            )
            
            logging.info(f"Running evaluation on {chkpt_dir}...")
            eval_results = trainer.evaluate()
            
            rounded_results = {k: round(v, 4) for k, v in eval_results.items()}
            logging.info(f"--- Results for {chkpt_dir} ---")
            for key, value in rounded_results.items():
                logging.info(f"  - {key}: {value}")
            all_results.append({'checkpoint': chkpt_dir, **rounded_results})
            
            metric_value = eval_results.get(metric_to_check)
            if metric_value is not None:
                if best_metric_value is None or (not is_loss and metric_value > best_metric_value) or (is_loss and metric_value < best_metric_value):
                    best_metric_value, best_checkpoint_path = metric_value, chkpt_path
                    logging.info(f"*** New best checkpoint found: {chkpt_dir} with {metric_to_check}: {metric_value:.4f} ***")
        
        # 4. --- Print Final Summary Table and Save the Best ---
        if not all_results:
            logging.error("No checkpoints were successfully evaluated.")
            return

        logging.info("\n" + "="*80)
        logging.info("--- FINAL EVALUATION SUMMARY ---".center(80))
        logging.info("="*80)
        header_cols = ['Checkpoint', 'eval_loss', 'eval_rouge1', 'eval_rouge2', 'eval_rougeL']
        if USE_GPU_FOR_BLEURT or bleurt_metric_cpu: header_cols.append('eval_bleurt_f1')
        header = " | ".join([f"{col:<15}" for col in header_cols])
        logging.info(header)
        logging.info("-" * len(header))
        for result in all_results:
            row_vals = [
                result.get('checkpoint', 'N/A'),
                f"{result.get('eval_loss', 0):.4f}",
                f"{result.get('eval_eval_rouge1', 0):.4f}",
                f"{result.get('eval_eval_rouge2', 0):.4f}",
                f"{result.get('eval_eval_rougeL', 0):.4f}",
            ]
            if USE_GPU_FOR_BLEURT or bleurt_metric_cpu: row_vals.append(f"{result.get('eval_eval_bleurt_f1', 0):.4f}")
            row = " | ".join([f"{val:<15}" for val in row_vals])
            logging.info(row)
        logging.info("="*80)

        if not best_checkpoint_path:
            logging.error("Could not determine the best checkpoint after evaluation.")
            return

        logging.info(f"\n--- Best Model Identified ---")
        logging.info(f"Checkpoint: {best_checkpoint_path}")
        logging.info(f"Metric ({metric_to_check}): {best_metric_value:.4f}")

        final_model_path = os.path.join(OUTPUT_DIR, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Successfully copied best model to: {final_model_path}")

    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}", exc_info=True)

if __name__ == "__main__":
    main()






2025-10-08 01:05:53,936 [INFO] - --- Starting Post-Training Evaluation of Checkpoints (GPU-Accelerated) ---
2025-10-08 01:05:53,951 [INFO] - Loading and preparing test data...


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

2025-10-08 01:05:57,075 [INFO] - Test data prepared with 1000 examples.
2025-10-08 01:05:57,085 [INFO] - Found 4 checkpoints to evaluate: ['checkpoint-2076', 'checkpoint-4152', 'checkpoint-6228', 'checkpoint-8304']
2025-10-08 01:06:02,425 [INFO] - Initializing GPU-based BLEURT scorer (will auto-download checkpoint if needed)...








INFO:tensorflow:Reading checkpoint bleurt-20.


2025-10-08 01:06:02,597 [INFO] - Reading checkpoint bleurt-20.
2025-10-08 01:06:02,599 [ERROR] - An unexpected error occurred: Could not find BLEURT checkpoint bleurt-20
Traceback (most recent call last):
  File "C:\Users\admin\AppData\Local\Temp\ipykernel_22992\636203867.py", line 103, in main
    bleurt_scorer_gpu = bleurt_scorer.BleurtScorer("bleurt-20")
  File "c:\Users\admin\anaconda3\envs\summarizer_env3\lib\site-packages\bleurt\score.py", line 161, in __init__
    self.config = checkpoint_lib.read_bleurt_config(checkpoint)
  File "c:\Users\admin\anaconda3\envs\summarizer_env3\lib\site-packages\bleurt\checkpoint.py", line 84, in read_bleurt_config
    assert tf.io.gfile.exists(path), \
AssertionError: Could not find BLEURT checkpoint bleurt-20
