In [None]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "mt5-base-cnn-summarizer-en-hi_v5/final_model"
NEW_MODEL_OUTPUT_DIR = "mt5-base-cnn-summarizer-en-hi_v6"
NEW_DATA_PATH = "../Dataset/filtered_articles_CNN.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.01
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256

# --- Setup Logging ---
log_filename = f"finetuning_log_v6_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

def pre_run_checks(base_model_path, data_path, output_dir):
    """Performs checks for paths and permissions before starting."""
    logging.info("--- Performing Pre-Run Checks ---")
    all_checks_passed = True

    if not os.path.isdir(base_model_path):
        logging.error(f"Base model path not found: {base_model_path}")
        all_checks_passed = False

    if not os.path.isfile(data_path):
        logging.error(f"Data file not found: {data_path}")
        all_checks_passed = False

    try:
        os.makedirs(output_dir, exist_ok=True)
        test_file_path = os.path.join(output_dir, ".permission_test")
        with open(test_file_path, "w") as f:
            f.write("test")
        os.remove(test_file_path)
    except Exception as e:
        logging.error(f"Output directory '{output_dir}' is not writable. Error: {e}")
        all_checks_passed = False

    if all_checks_passed:
        logging.info("--- All pre-run checks passed. ---")
    else:
        logging.error("--- Pre-run checks failed. Halting execution. ---")

    return all_checks_passed

def find_and_save_best_model(output_dir):
    """Finds the best checkpoint and saves it to a 'final_model' directory."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        best_checkpoint_path = state.get("best_model_checkpoint")
        if not best_checkpoint_path:
            logging.error("Could not find 'best_model_checkpoint' in trainer_state.json.")
            return

        logging.info(f"Best checkpoint found: {best_checkpoint_path}")
        
        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not save the best model due to: {e}", exc_info=True)


def main():
    if not pre_run_checks(BASE_MODEL_PATH, NEW_DATA_PATH, NEW_MODEL_OUTPUT_DIR):
        return

    try:
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
        model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_PATH)

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        df_new.reset_index(drop=True, inplace=True)
        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str):
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })

        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=50,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False,
            metric_for_best_model="rouge2",
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        rouge_metric = evaluate.load("rouge")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            return {k: round(v * 100, 4) for k, v in result.items()}

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting fine-tuning...")
        trainer.train()
        logging.info("Fine-tuning finished successfully.")
        
        find_and_save_best_model(NEW_MODEL_OUTPUT_DIR)

    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()

model fine tuning v7

In [1]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "google/mt5-base"
NEW_MODEL_OUTPUT_DIR = "mt5-base-cnn-summarizer-en-hi_v8"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.2
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256

# --- Setup Logging ---
log_filename = f"scratch_training_log_v8_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

def pre_run_checks(data_path, output_dir):
    """Performs checks for data path and output permissions before starting."""
    logging.info("--- Performing Pre-Run Checks ---")
    all_checks_passed = True

    if not os.path.isfile(data_path):
        logging.error(f"Data file not found: {data_path}")
        all_checks_passed = False

    try:
        os.makedirs(output_dir, exist_ok=True)
        test_file_path = os.path.join(output_dir, ".permission_test")
        with open(test_file_path, "w") as f:
            f.write("test")
        os.remove(test_file_path)
    except Exception as e:
        logging.error(f"Output directory '{output_dir}' is not writable. Error: {e}")
        all_checks_passed = False

    if all_checks_passed:
        logging.info("--- All pre-run checks passed. ---")
    else:
        logging.error("--- Pre-run checks failed. Halting execution. ---")

    return all_checks_passed

def find_and_save_best_model(output_dir):
    """Finds the best checkpoint and saves it to a 'final_model' directory."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        best_checkpoint_path = state.get("best_model_checkpoint")
        if not best_checkpoint_path:
            logging.error("Could not find 'best_model_checkpoint' in trainer_state.json.")
            return

        best_checkpoint_step = int(best_checkpoint_path.split('-')[-1])
        best_eval_log = {}
        for log in state["log_history"]:
            if log.get("step") == best_checkpoint_step and "eval_loss" in log:
                best_eval_log = log
                break
        
        logging.info(f"Best checkpoint found: {best_checkpoint_path}")
        logging.info(f"Metrics for best checkpoint: {best_eval_log}")

        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not save the best model due to: {e}", exc_info=True)


def main():
    if not pre_run_checks(NEW_DATA_PATH, NEW_MODEL_OUTPUT_DIR):
        return

    try:
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
        model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_PATH)

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        df_new.reset_index(drop=True, inplace=True)
        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str):
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })

        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        rouge_metric = evaluate.load("rouge")
        bertscore_metric = evaluate.load("bertscore")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            bert_result = bertscore_metric.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
            
            result = {}
            for key, value in rouge_result.items():
                result[f"{key}"] = round(value * 100, 4)

            result["bertscore_f1"] = round(np.mean(bert_result["f1"]) * 100, 4)

            return result

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=50,
            evaluation_strategy="epoch", # This line is added back
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=True,
            metric_for_best_model="bertscore_f1",
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting training from scratch...")
        trainer.train()
        logging.info("Training finished successfully.")
        
        find_and_save_best_model(NEW_MODEL_OUTPUT_DIR)

    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()





2025-10-06 15:45:30,431 [INFO] - --- Performing Pre-Run Checks ---
2025-10-06 15:45:30,441 [INFO] - --- All pre-run checks passed. ---
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

Map:   0%|          | 0/16601 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Map:   0%|          | 0/1845 [00:00<?, ? examples/s]

2025-10-06 15:46:28,849 [ERROR] - An unexpected error occurred during the main process: Seq2SeqTrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'
Traceback (most recent call last):
  File "C:\Users\admin\AppData\Local\Temp\ipykernel_20984\3107943505.py", line 167, in main
    training_args = Seq2SeqTrainingArguments(
TypeError: Seq2SeqTrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'


In [2]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "google/mt5-base"
NEW_MODEL_OUTPUT_DIR = "mt5-base-cnn-summarizer-en-hi_v8"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.2
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256

# --- Setup Logging ---
log_filename = f"scratch_training_log_v8_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

def pre_run_checks(data_path, output_dir):
    """Performs checks for data path and output permissions before starting."""
    logging.info("--- Performing Pre-Run Checks ---")
    all_checks_passed = True

    if not os.path.isfile(data_path):
        logging.error(f"Data file not found: {data_path}")
        all_checks_passed = False

    try:
        os.makedirs(output_dir, exist_ok=True)
        test_file_path = os.path.join(output_dir, ".permission_test")
        with open(test_file_path, "w") as f:
            f.write("test")
        os.remove(test_file_path)
    except Exception as e:
        logging.error(f"Output directory '{output_dir}' is not writable. Error: {e}")
        all_checks_passed = False

    if all_checks_passed:
        logging.info("--- All pre-run checks passed. ---")
    else:
        logging.error("--- Pre-run checks failed. Halting execution. ---")

    return all_checks_passed

def find_and_save_best_model(output_dir):
    """Finds the best checkpoint and saves it to a 'final_model' directory."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        # Find the evaluation log with the best (lowest) eval_loss or highest metric
        best_metric_value = None
        best_checkpoint_path = None

        # Determine metric for comparison, prioritizing 'eval_bertscore_f1' then 'eval_loss'
        metric_to_check = f"eval_{'bertscore_f1'}" # Assumes metric_for_best_model is bertscore_f1
        is_loss = 'loss' in metric_to_check

        for log in state["log_history"]:
            if metric_to_check in log:
                metric_value = log[metric_to_check]
                
                if best_metric_value is None or \
                   (is_loss and metric_value < best_metric_value) or \
                   (not is_loss and metric_value > best_metric_value):
                    best_metric_value = metric_value
                    step = log['step']
                    # Construct checkpoint path based on step
                    potential_path = os.path.join(output_dir, f"checkpoint-{step}")
                    if os.path.exists(potential_path):
                         best_checkpoint_path = potential_path


        if not best_checkpoint_path:
            logging.error("Could not find a valid best checkpoint path from logs.")
            return

        logging.info(f"Best checkpoint found: {best_checkpoint_path} with {metric_to_check}: {best_metric_value}")

        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not save the best model due to: {e}", exc_info=True)


def main():
    if not pre_run_checks(NEW_DATA_PATH, NEW_MODEL_OUTPUT_DIR):
        return

    try:
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
        model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_PATH)

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        df_new.reset_index(drop=True, inplace=True)
        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str):
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })

        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        rouge_metric = evaluate.load("rouge")
        bertscore_metric = evaluate.load("bertscore")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            bert_result = bertscore_metric.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
            
            result = {}
            for key, value in rouge_result.items():
                result[f"{key}"] = round(value * 100, 4)

            result["bertscore_f1"] = round(np.mean(bert_result["f1"]) * 100, 4)

            return result

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=50,
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False, # Set to False
            # metric_for_best_model is not needed when load_best_model_at_end is False
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting training from scratch...")
        trainer.train()
        logging.info("Training finished successfully.")
        
        find_and_save_best_model(NEW_MODEL_OUTPUT_DIR)

    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()



2025-10-06 15:49:01,116 [INFO] - --- Performing Pre-Run Checks ---
2025-10-06 15:49:01,127 [INFO] - --- All pre-run checks passed. ---


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

Map:   0%|          | 0/16601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1845 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
2025-10-06 15:49:44,597 [INFO] - Starting training from scratch...


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0
250,0.0
300,0.0
350,0.0
400,0.0
450,0.0
500,0.0


KeyboardInterrupt: 

In [3]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "google/mt5-base"
NEW_MODEL_OUTPUT_DIR = "mt5-base-cnn-summarizer-en-hi_v8"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.2
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256

# --- Setup Logging ---
log_filename = f"scratch_training_log_v8_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

def pre_run_checks(data_path, output_dir):
    """Performs checks for data path and output permissions before starting."""
    logging.info("--- Performing Pre-Run Checks ---")
    all_checks_passed = True

    if not os.path.isfile(data_path):
        logging.error(f"Data file not found: {data_path}")
        all_checks_passed = False

    try:
        os.makedirs(output_dir, exist_ok=True)
        test_file_path = os.path.join(output_dir, ".permission_test")
        with open(test_file_path, "w") as f:
            f.write("test")
        os.remove(test_file_path)
    except Exception as e:
        logging.error(f"Output directory '{output_dir}' is not writable. Error: {e}")
        all_checks_passed = False

    if all_checks_passed:
        logging.info("--- All pre-run checks passed. ---")
    else:
        logging.error("--- Pre-run checks failed. Halting execution. ---")

    return all_checks_passed

def find_and_save_best_model(output_dir):
    """Finds the best checkpoint and saves it to a 'final_model' directory."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        best_metric_value = None
        best_checkpoint_path = None

        metric_to_check = f"eval_{'bertscore_f1'}" 
        is_loss = 'loss' in metric_to_check

        for log in state["log_history"]:
            if metric_to_check in log:
                metric_value = log[metric_to_check]
                
                if best_metric_value is None or \
                   (is_loss and metric_value < best_metric_value) or \
                   (not is_loss and metric_value > best_metric_value):
                    best_metric_value = metric_value
                    step = log['step']
                    potential_path = os.path.join(output_dir, f"checkpoint-{step}")
                    if os.path.exists(potential_path):
                         best_checkpoint_path = potential_path

        if not best_checkpoint_path:
            logging.error("Could not find a valid best checkpoint path from logs.")
            return

        logging.info(f"Best checkpoint found: {best_checkpoint_path} with {metric_to_check}: {best_metric_value}")

        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not save the best model due to: {e}", exc_info=True)


def main():
    if not pre_run_checks(NEW_DATA_PATH, NEW_MODEL_OUTPUT_DIR):
        return

    try:
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
        model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_PATH)

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        # More robust cleaning: drop rows where any of the key columns are just whitespace
        df_new = df_new[df_new['raw_news_article'].str.strip().astype(bool)]
        df_new = df_new[df_new['english_summary'].str.strip().astype(bool)]
        df_new = df_new[df_new['hindi_summary'].str.strip().astype(bool)]

        df_new.reset_index(drop=True, inplace=True)
        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str):
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })

        # --- Diagnostic Step: Print a few examples to check for data leakage ---
        logging.info("--- Checking a few examples from the training set for data leakage ---")
        for i in range(3):
            logging.info(f"\n--- Example {i+1} ---")
            logging.info(f"INPUT: {final_datasets['train'][i]['inputs'][:500]}...") # Print first 500 chars
            logging.info(f"TARGET: {final_datasets['train'][i]['targets']}")
        logging.info("\n" + "="*80 + "\n")
        # --- End of Diagnostic Step ---

        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        rouge_metric = evaluate.load("rouge")
        bertscore_metric = evaluate.load("bertscore")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            bert_result = bertscore_metric.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
            
            result = {}
            for key, value in rouge_result.items():
                result[f"{key}"] = round(value * 100, 4)

            result["bertscore_f1"] = round(np.mean(bert_result["f1"]) * 100, 4)

            return result

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=50,
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False,
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting training from scratch...")
        trainer.train()
        logging.info("Training finished successfully.")
        
        find_and_save_best_model(NEW_MODEL_OUTPUT_DIR)

    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()



2025-10-06 16:18:01,928 [INFO] - --- Performing Pre-Run Checks ---
2025-10-06 16:18:02,043 [INFO] - --- All pre-run checks passed. ---


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

2025-10-06 16:18:12,136 [INFO] - --- Checking a few examples from the training set for data leakage ---
2025-10-06 16:18:12,136 [INFO] - 
--- Example 1 ---
2025-10-06 16:18:12,136 [INFO] - INPUT: summarize Hindi: Paris Saint-Germain face Nice on Saturday, hoping to take Ligue 1's top spot from Lyon but do so with a host of key stars missing, including captain Thiago Silva who is recuperating at home from a thigh injury. Zlatan Ibrahimovic, Marco Verratti and Thiago Motta all join Silva on the sidelines for the trip to the Mediterranean coast, while David Luiz is still not fully fit as he recovers from a thigh problem, although he is still set to start. Silva was pictured nursing his prob...
2025-10-06 16:18:12,141 [INFO] - TARGET: पेरिस सेंट-जर्मेन (पीएसजी) शनिवार को नाइस के खिलाफ खेलेगा, जिसका लक्ष्य लियोन से लीग 1 का शीर्ष स्थान हासिल करना है। टीम को ज़्लाटन इब्राहिमोविच, मार्को वेराट्टी, थियागो मोट्टा और कप्तान थियागो सिल्वा सहित महत्वपूर्ण खिलाड़ियों की अनुपस्थिति का सामना करना पड़

Map:   0%|          | 0/16601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1845 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
2025-10-06 16:18:48,002 [INFO] - Starting training from scratch...


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0
250,0.0
300,0.0
350,0.0
400,0.0
450,0.0
500,0.0


KeyboardInterrupt: 

In [1]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    MT5Tokenizer,
    MT5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "google/mt5-base"
NEW_MODEL_OUTPUT_DIR = "mt5-base-cnn-summarizer-en-hi_v8"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.25
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256

# --- Setup Logging ---
log_filename = f"scratch_training_log_v8_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

def pre_run_checks(data_path, output_dir):
    """Performs checks for data path and output permissions before starting."""
    logging.info("--- Performing Pre-Run Checks ---")
    all_checks_passed = True

    if not os.path.isfile(data_path):
        logging.error(f"Data file not found: {data_path}")
        all_checks_passed = False

    try:
        os.makedirs(output_dir, exist_ok=True)
        test_file_path = os.path.join(output_dir, ".permission_test")
        with open(test_file_path, "w") as f:
            f.write("test")
        os.remove(test_file_path)
    except Exception as e:
        logging.error(f"Output directory '{output_dir}' is not writable. Error: {e}")
        all_checks_passed = False

    if all_checks_passed:
        logging.info("--- All pre-run checks passed. ---")
    else:
        logging.error("--- Pre-run checks failed. Halting execution. ---")

    return all_checks_passed

def main():
    if not pre_run_checks(NEW_DATA_PATH, NEW_MODEL_OUTPUT_DIR):
        return

    try:
        # Using explicit MT5Tokenizer for clarity and correctness
        tokenizer = MT5Tokenizer.from_pretrained(BASE_MODEL_PATH)
        model = MT5ForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        df_new = df_new[df_new['raw_news_article'].str.strip().astype(bool)]
        df_new = df_new[df_new['english_summary'].str.strip().astype(bool)]
        df_new = df_new[df_new['hindi_summary'].str.strip().astype(bool)]

        df_new.reset_index(drop=True, inplace=True)
        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str):
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        
        # --- THE CRITICAL FIX IS HERE ---
        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            
            # This context manager is essential for T5-based models.
            # It ensures the labels are tokenized correctly for the decoder.
            with tokenizer.as_target_tokenizer():
                labels = tokenizer(examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
                
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        rouge_metric = evaluate.load("rouge")
        bertscore_metric = evaluate.load("bertscore")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            bert_result = bertscore_metric.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
            
            result = {}
            for key, value in rouge_result.items():
                result[f"rouge_{key}"] = round(value * 100, 4)

            result["bertscore_f1"] = round(np.mean(bert_result["f1"]) * 100, 4)

            return result

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=50,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=2,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=True,
            metric_for_best_model="bertscore_f1",
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting training from scratch...")
        trainer.train()
        logging.info("Training finished successfully.")
        
        final_model_path = os.path.join(NEW_MODEL_OUTPUT_DIR, "final_model")
        trainer.save_model(final_model_path)
        logging.info(f"Best model saved to {final_model_path}")

    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()






2025-10-06 16:48:26,485 [INFO] - --- Performing Pre-Run Checks ---
2025-10-06 16:48:26,751 [INFO] - --- All pre-run checks passed. ---
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

Map:   0%|          | 0/16601 [00:00<?, ? examples/s]



Map:   0%|          | 0/1845 [00:00<?, ? examples/s]

2025-10-06 16:50:05,181 [ERROR] - An unexpected error occurred during the main process: Seq2SeqTrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'
Traceback (most recent call last):
  File "C:\Users\admin\AppData\Local\Temp\ipykernel_21372\912464866.py", line 146, in main
    training_args = Seq2SeqTrainingArguments(
TypeError: Seq2SeqTrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'


In [3]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    MT5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "google/mt5-base"
NEW_MODEL_OUTPUT_DIR = "mt5-base-cnn-summarizer-en-hi_v8"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.25
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256
METRIC_FOR_BEST_MODEL = "bertscore_f1" # Define metric for manual saving

# --- Setup Logging ---
log_filename = f"scratch_training_log_v8_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

def pre_run_checks(data_path, output_dir):
    """Performs checks for data path and output permissions before starting."""
    logging.info("--- Performing Pre-Run Checks ---")
    all_checks_passed = True

    if not os.path.isfile(data_path):
        logging.error(f"Data file not found: {data_path}")
        all_checks_passed = False

    try:
        os.makedirs(output_dir, exist_ok=True)
        test_file_path = os.path.join(output_dir, ".permission_test")
        with open(test_file_path, "w") as f:
            f.write("test")
        os.remove(test_file_path)
    except Exception as e:
        logging.error(f"Output directory '{output_dir}' is not writable. Error: {e}")
        all_checks_passed = False

    if all_checks_passed:
        logging.info("--- All pre-run checks passed. ---")
    else:
        logging.error("--- Pre-run checks failed. Halting execution. ---")

    return all_checks_passed

def find_and_save_best_model(output_dir, metric_name):
    """Finds the best checkpoint from trainer_state.json and saves it."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{metric_name}"
        is_loss = 'loss' in metric_to_check

        for log in state["log_history"]:
            if metric_to_check in log:
                metric_value = log[metric_to_check]
                if best_metric_value is None or \
                   (is_loss and metric_value < best_metric_value) or \
                   (not is_loss and metric_value > best_metric_value):
                    best_metric_value = metric_value
                    step = log.get('step')
                    if step:
                        potential_path = os.path.join(output_dir, f"checkpoint-{step}")
                        if os.path.exists(potential_path):
                            best_checkpoint_path = potential_path

        if not best_checkpoint_path:
            logging.error("Could not find the best checkpoint from the logs.")
            return

        logging.info(f"Best checkpoint found: {best_checkpoint_path} with {metric_to_check}: {best_metric_value}")

        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not save the best model due to: {e}", exc_info=True)


def main():
    if not pre_run_checks(NEW_DATA_PATH, NEW_MODEL_OUTPUT_DIR):
        return

    try:
        # Using T5Tokenizer to match the 'google/mt5-base' checkpoint
        # Setting legacy=False to address potential tokenizer bugs
        tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_PATH, legacy=False)
        model = MT5ForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        df_new = df_new[df_new['raw_news_article'].str.strip().astype(bool)]
        df_new = df_new[df_new['english_summary'].str.strip().astype(bool)]
        df_new = df_new[df_new['hindi_summary'].str.strip().astype(bool)]

        df_new.reset_index(drop=True, inplace=True)
        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str):
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        
        def tokenize_function(examples):
            # Modern way to tokenize inputs and labels separately
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        rouge_metric = evaluate.load("rouge")
        bertscore_metric = evaluate.load("bertscore")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            bert_result = bertscore_metric.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
            
            result = {}
            for key, value in rouge_result.items():
                result[f"rouge_{key}"] = round(value * 100, 4)

            result["bertscore_f1"] = round(np.mean(bert_result["f1"]) * 100, 4)

            return result

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=50,
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS, # Save all checkpoints to find the best one
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False, # Set to False for compatibility
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting training from scratch...")
        trainer.train()
        logging.info("Training finished successfully.")
        
        # Manually find and save the best model from all checkpoints
        logging.info("Finding and saving the best model...")
        find_and_save_best_model(NEW_MODEL_OUTPUT_DIR, METRIC_FOR_BEST_MODEL)

    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()



2025-10-06 16:56:53,044 [INFO] - --- Performing Pre-Run Checks ---
2025-10-06 16:56:53,176 [INFO] - --- All pre-run checks passed. ---


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

Map:   0%|          | 0/16601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1845 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
2025-10-06 16:58:34,019 [INFO] - Starting training from scratch...


Step,Training Loss
50,0.0
100,0.0
150,0.0


KeyboardInterrupt: 

In [5]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    MT5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "google/mt5-base"
NEW_MODEL_OUTPUT_DIR = "mt5-base-cnn-summarizer-en-hi_v8"
# NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv" # Disabled for sanity check

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.25
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256
METRIC_FOR_BEST_MODEL = "rouge_rouge1" # Using ROUGE as it's simpler for this check

# --- Setup Logging ---
log_filename = f"scratch_training_log_v8_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

def find_and_save_best_model(output_dir, metric_name):
    """Finds the best checkpoint from trainer_state.json and saves it."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{metric_name}"
        is_loss = 'loss' in metric_to_check

        for log in state["log_history"]:
            if metric_to_check in log:
                metric_value = log[metric_to_check]
                if best_metric_value is None or \
                   (is_loss and metric_value < best_metric_value) or \
                   (not is_loss and metric_value > best_metric_value):
                    best_metric_value = metric_value
                    step = log.get('step')
                    if step:
                        potential_path = os.path.join(output_dir, f"checkpoint-{step}")
                        if os.path.exists(potential_path):
                            best_checkpoint_path = potential_path

        if not best_checkpoint_path:
            logging.error("Could not find the best checkpoint from the logs.")
            return

        logging.info(f"Best checkpoint found: {best_checkpoint_path} with {metric_to_check}: {best_metric_value}")

        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not save the best model due to: {e}", exc_info=True)


def main():
    try:
        tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_PATH, legacy=False)
        model = MT5ForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)

        # --- SANITY CHECK: Using a built-in, offline dataset ---
        logging.info("--- RUNNING SANITY CHECK WITH BUILT-IN OFFLINE DATASET ---")
        
        # Create a small, clean dataset in memory to bypass network issues
        dummy_data = {
            "train": {
                "dialogue": [
                    "Amanda: I baked cookies. Do you want some?\nJerry: Sure!\nAmanda: I'll bring you some tomorrow.",
                    "Olivia: I'm so tired. I stayed up all night studying.\nLeo: You should get some rest."
                ],
                "summary": [
                    "Amanda baked cookies and will bring Jerry some tomorrow.",
                    "Olivia is tired from studying and Leo suggests she should rest."
                ]
            },
            "test": {
                "dialogue": [
                    "Will: I'm going to the store. Do you need anything?\nJane: Yes, can you get some milk?"
                ],
                "summary": [
                    "Jane needs milk from the store."
                ]
            }
        }
        
        train_dataset = Dataset.from_dict(dummy_data["train"])
        test_dataset = Dataset.from_dict(dummy_data["test"])
        offline_dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
        
        PREFIX = "summarize: "

        def format_offline_dataset(examples):
            inputs = [PREFIX + doc for doc in examples["dialogue"]]
            model_inputs = tokenizer(inputs, max_length=512, truncation=True) # Reduced length for dummy data
            labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = offline_dataset.map(format_offline_dataset, batched=True)
        
        rouge_metric = evaluate.load("rouge")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
            return {f"rouge_{key}": value for key, value in rouge_result.items()}

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=3, # Reduced epochs for a quick test
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=1, # Smaller batch for tiny dataset
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=1,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=1,
            save_strategy="epoch",
            save_total_limit=1,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False,
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting training from scratch on OFFLINE dataset...")
        trainer.train()
        logging.info("Training finished successfully.")
        
        logging.info("Sanity check complete. If loss decreased, the code is working.")
        logging.info("You can now re-enable your local dataset and investigate it for issues.")

    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()



2025-10-06 17:02:38,784 [INFO] - --- RUNNING SANITY CHECK WITH BUILT-IN OFFLINE DATASET ---


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
2025-10-06 17:02:42,479 [INFO] - Starting training from scratch on OFFLINE dataset...


Step,Training Loss
1,0.0
2,14.1908
3,0.0
4,0.0
5,0.0
6,0.0


2025-10-06 17:04:34,222 [INFO] - Training finished successfully.
2025-10-06 17:04:34,224 [INFO] - Sanity check complete. If loss decreased, the code is working.
2025-10-06 17:04:34,224 [INFO] - You can now re-enable your local dataset and investigate it for issues.


In [7]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    MT5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "google/mt5-base"
NEW_MODEL_OUTPUT_DIR = "mt5-base-cnn-summarizer-en-hi_v8"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv" # Re-enabled user's data path

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.25
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256
METRIC_FOR_BEST_MODEL = "bertscore_f1"

# --- Setup Logging ---
log_filename = f"scratch_training_log_v8_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

def data_validation_check(df, num_rows_to_check=100):
    """
    Performs validation checks on the dataframe to find potential data leakage.
    """
    logging.info("--- Starting Data Validation ---")
    is_issue_found = False
    for i, row in df.head(num_rows_to_check).iterrows():
        article = str(row['raw_news_article'])
        eng_summary = str(row['english_summary'])
        hin_summary = str(row['hindi_summary'])

        # Check for summary text within the article text
        if eng_summary in article:
            logging.warning(f"[DATA VALIDATION WARNING] Row {i}: English summary found in article text.")
            is_issue_found = True
        if hin_summary in article:
            logging.warning(f"[DATA VALIDATION WARNING] Row {i}: Hindi summary found in article text.")
            is_issue_found = True
        
        # Check for unusually short content
        if len(article.split()) < 20:
            logging.warning(f"[DATA VALIDATION WARNING] Row {i}: Article text is very short ({len(article.split())} words).")
            is_issue_found = True
        if len(eng_summary.split()) < 5:
            logging.warning(f"[DATA VALIDATION WARNING] Row {i}: English summary is very short ({len(eng_summary.split())} words).")
            is_issue_found = True

    if not is_issue_found:
        logging.info("--- Data Validation Passed. No obvious issues found in the first {num_rows_to_check} rows. ---")
    else:
        logging.error("--- Data Validation Failed. Please review warnings above and clean your CSV file. ---")
    return not is_issue_found


def find_and_save_best_model(output_dir, metric_name):
    """Finds the best checkpoint from trainer_state.json and saves it."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{metric_name}"
        is_loss = 'loss' in metric_to_check

        for log in state["log_history"]:
            if metric_to_check in log:
                metric_value = log[metric_to_check]
                if best_metric_value is None or \
                   (is_loss and metric_value < best_metric_value) or \
                   (not is_loss and metric_value > best_metric_value):
                    best_metric_value = metric_value
                    step = log.get('step')
                    if step:
                        potential_path = os.path.join(output_dir, f"checkpoint-{step}")
                        if os.path.exists(potential_path):
                            best_checkpoint_path = potential_path

        if not best_checkpoint_path:
            logging.error("Could not find the best checkpoint from the logs.")
            return

        logging.info(f"Best checkpoint found: {best_checkpoint_path} with {metric_to_check}: {best_metric_value}")

        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not save the best model due to: {e}", exc_info=True)


def main():
    try:
        tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_PATH, legacy=False)
        model = MT5ForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)

        # Load and validate the user's dataset
        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        if not data_validation_check(df_new):
            logging.error("Halting execution due to data validation issues.")
            return

        df_new.reset_index(drop=True, inplace=True)
        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str):
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        
        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        rouge_metric = evaluate.load("rouge")
        bertscore_metric = evaluate.load("bertscore")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            bert_result = bertscore_metric.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
            
            result = {}
            for key, value in rouge_result.items():
                result[f"rouge_{key}"] = round(value * 100, 4)

            result["bertscore_f1"] = round(np.mean(bert_result["f1"]) * 100, 4)

            return result

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=50,
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False,
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting training from scratch...")
        trainer.train()
        logging.info("Training finished successfully.")
        
        logging.info("Finding and saving the best model...")
        find_and_save_best_model(NEW_MODEL_OUTPUT_DIR, METRIC_FOR_BEST_MODEL)

    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()




2025-10-06 17:07:20,969 [INFO] - --- Starting Data Validation ---
2025-10-06 17:07:20,985 [INFO] - --- Data Validation Passed. No obvious issues found in the first {num_rows_to_check} rows. ---


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

Map:   0%|          | 0/16601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1845 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
2025-10-06 17:08:49,766 [INFO] - Starting training from scratch...


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0


KeyboardInterrupt: 

In [8]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    MT5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "google/mt5-base"
NEW_MODEL_OUTPUT_DIR = "mt5-base-cnn-summarizer-en-hi_v8"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv" 

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.25
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256
METRIC_FOR_BEST_MODEL = "bertscore_f1"

# --- Setup Logging ---
log_filename = f"scratch_training_log_v8_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)


def find_and_save_best_model(output_dir, metric_name):
    """Finds the best checkpoint from trainer_state.json and saves it."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{metric_name}"
        is_loss = 'loss' in metric_to_check

        for log in state["log_history"]:
            if metric_to_check in log:
                metric_value = log[metric_to_check]
                if best_metric_value is None or \
                   (is_loss and metric_value < best_metric_value) or \
                   (not is_loss and metric_value > best_metric_value):
                    best_metric_value = metric_value
                    step = log.get('step')
                    if step:
                        potential_path = os.path.join(output_dir, f"checkpoint-{step}")
                        if os.path.exists(potential_path):
                            best_checkpoint_path = potential_path

        if not best_checkpoint_path:
            logging.error("Could not find the best checkpoint from the logs.")
            return

        logging.info(f"Best checkpoint found: {best_checkpoint_path} with {metric_to_check}: {best_metric_value}")

        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not save the best model due to: {e}", exc_info=True)


def main():
    try:
        tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_PATH, legacy=False)
        model = MT5ForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)

        # Load the user's dataset
        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        # --- NEW: Aggressive Data Cleaning Step ---
        logging.info("--- Starting Aggressive Data Cleaning to Remove Leaks ---")
        cleaned_articles = []
        for i, row in df_new.iterrows():
            article = str(row['raw_news_article'])
            eng_summary = str(row['english_summary'])
            hin_summary = str(row['hindi_summary'])
            
            # Surgically remove summary text from the article text
            article = article.replace(eng_summary, "")
            article = article.replace(hin_summary, "")
            cleaned_articles.append(article)
        
        df_new['raw_news_article'] = cleaned_articles
        logging.info("--- Aggressive Data Cleaning Finished ---")

        df_new.reset_index(drop=True, inplace=True)
        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str):
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        
        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        rouge_metric = evaluate.load("rouge")
        bertscore_metric = evaluate.load("bertscore")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            bert_result = bertscore_metric.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
            
            result = {}
            for key, value in rouge_result.items():
                result[f"rouge_{key}"] = round(value * 100, 4)

            result["bertscore_f1"] = round(np.mean(bert_result["f1"]) * 100, 4)

            return result

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=50,
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False,
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting training from scratch...")
        trainer.train()
        logging.info("Training finished successfully.")
        
        logging.info("Finding and saving the best model...")
        find_and_save_best_model(NEW_MODEL_OUTPUT_DIR, METRIC_FOR_BEST_MODEL)

    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()



2025-10-06 17:12:27,685 [INFO] - --- Starting Aggressive Data Cleaning to Remove Leaks ---
2025-10-06 17:12:28,093 [INFO] - --- Aggressive Data Cleaning Finished ---


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

Map:   0%|          | 0/16601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1845 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
2025-10-06 17:13:57,977 [INFO] - Starting training from scratch...


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0


KeyboardInterrupt: 

In [9]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    MT5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "google/mt5-base"
NEW_MODEL_OUTPUT_DIR = "mt5-base-cnn-summarizer-en-hi_v8"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.25
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256
METRIC_FOR_BEST_MODEL = "bertscore_f1"

# --- Setup Logging ---
log_filename = f"scratch_training_log_v8_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)


def find_and_save_best_model(output_dir, metric_name):
    """Finds the best checkpoint from trainer_state.json and saves it."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{metric_name}"
        is_loss = 'loss' in metric_to_check

        for log in state["log_history"]:
            if metric_to_check in log:
                metric_value = log[metric_to_check]
                if best_metric_value is None or \
                   (is_loss and metric_value < best_metric_value) or \
                   (not is_loss and metric_value > best_metric_value):
                    best_metric_value = metric_value
                    step = log.get('step')
                    if step:
                        potential_path = os.path.join(output_dir, f"checkpoint-{step}")
                        if os.path.exists(potential_path):
                            best_checkpoint_path = potential_path

        if not best_checkpoint_path:
            logging.error("Could not find the best checkpoint from the logs.")
            return

        logging.info(f"Best checkpoint found: {best_checkpoint_path} with {metric_to_check}: {best_metric_value}")

        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not save the best model due to: {e}", exc_info=True)


def main():
    try:
        tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_PATH, legacy=False)
        model = MT5ForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)

        # Load the user's dataset
        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        # --- Aggressive Data Cleaning Step ---
        logging.info("--- Starting Aggressive Data Cleaning to Remove Leaks ---")
        cleaned_articles = []
        for i, row in df_new.iterrows():
            article = str(row['raw_news_article'])
            eng_summary = str(row['english_summary'])
            hin_summary = str(row['hindi_summary'])
            
            # Surgically remove summary text from the article text
            article = article.replace(eng_summary, "")
            article = article.replace(hin_summary, "")
            cleaned_articles.append(article)
        
        df_new['raw_news_article'] = cleaned_articles
        logging.info("--- Aggressive Data Cleaning Finished ---")

        # --- NEW: Ground Truth Diagnostic ---
        logging.info("\n\n==================== GROUND TRUTH DIAGNOSTIC ====================")
        first_row = df_new.iloc[0]
        logging.info(f"--- Cleaned Article (First Row) ---\n{first_row['raw_news_article']}\n")
        logging.info(f"--- English Summary (First Row) ---\n{first_row['english_summary']}\n")
        logging.info(f"--- Hindi Summary (First Row) ---\n{first_row['hindi_summary']}\n")
        logging.info("=================================================================\n\n")


        df_new.reset_index(drop=True, inplace=True)
        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str):
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        
        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        rouge_metric = evaluate.load("rouge")
        bertscore_metric = evaluate.load("bertscore")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            bert_result = bertscore_metric.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
            
            result = {}
            for key, value in rouge_result.items():
                result[f"rouge_{key}"] = round(value * 100, 4)

            result["bertscore_f1"] = round(np.mean(bert_result["f1"]) * 100, 4)

            return result

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=50,
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False,
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting training from scratch...")
        trainer.train()
        logging.info("Training finished successfully.")
        
        logging.info("Finding and saving the best model...")
        find_and_save_best_model(NEW_MODEL_OUTPUT_DIR, METRIC_FOR_BEST_MODEL)

    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()



2025-10-06 17:17:35,116 [INFO] - --- Starting Aggressive Data Cleaning to Remove Leaks ---
2025-10-06 17:17:35,521 [INFO] - --- Aggressive Data Cleaning Finished ---
2025-10-06 17:17:35,532 [INFO] - 

2025-10-06 17:17:35,534 [INFO] - --- Cleaned Article (First Row) ---

2025-10-06 17:17:35,535 [INFO] - --- English Summary (First Row) ---

2025-10-06 17:17:35,536 [INFO] - --- Hindi Summary (First Row) ---
चुनाव से सिर्फ दो हफ्ते पहले, इंस्टीट्यूट फॉर फिस्कल स्टडीज (IFS) के आर्थिक विशेषज्ञों ने चेतावनी दी है कि मतदाता प्रमुख ब्रिटिश राजनीतिक दलों द्वारा नियोजित खर्च कटौती के पैमाने और गहराई के बारे में अनभिज्ञ हैं। IFS की विस्तृत घोषणापत्र अध्ययनों पर आधारित रिपोर्ट ने कंजर्वेटिव्स की £30 बिलियन की घाटा कटौती योजना पर विवरण की कमी की आलोचना की, जो बड़े पैमाने पर अनिर्दिष्ट खर्च कटौती और कर वृद्धि पर निर्भर करती है, जबकि लेबर ने प्रति वर्ष अतिरिक्त £26 बिलियन उधार लेने की इच्छा व्यक्त की है। हालांकि चांसलर जॉर्ज ओसबोर्न ने हाल ही में घोषणा की कि पिछले वित्तीय वर्ष के लिए सार्वजनिक क्षेत्र

Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

Map:   0%|          | 0/16601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1845 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
2025-10-06 17:19:04,583 [INFO] - Starting training from scratch...


Step,Training Loss
50,0.0
100,0.0


KeyboardInterrupt: 

In [11]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
import unicodedata
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    MT5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "google/mt5-base"
NEW_MODEL_OUTPUT_DIR = "mt5-base-cnn-summarizer-en-hi_v8"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.25
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256
METRIC_FOR_BEST_MODEL = "bertscore_f1"

# --- Setup Logging ---
log_filename = f"scratch_training_log_v8_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)


def find_and_save_best_model(output_dir, metric_name):
    """Finds the best checkpoint from trainer_state.json and saves it."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{metric_name}"
        is_loss = 'loss' in metric_to_check

        for log in state["log_history"]:
            if metric_to_check in log:
                metric_value = log[metric_to_check]
                if best_metric_value is None or \
                   (is_loss and metric_value < best_metric_value) or \
                   (not is_loss and metric_value > best_metric_value):
                    best_metric_value = metric_value
                    step = log.get('step')
                    if step:
                        potential_path = os.path.join(output_dir, f"checkpoint-{step}")
                        if os.path.exists(potential_path):
                            best_checkpoint_path = potential_path

        if not best_checkpoint_path:
            logging.error("Could not find the best checkpoint from the logs.")
            return

        logging.info(f"Best checkpoint found: {best_checkpoint_path} with {metric_to_check}: {best_metric_value}")

        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not save the best model due to: {e}", exc_info=True)

def normalize_text(text):
    """
    Cleans and normalizes text to remove inconsistencies and hidden characters.
    """
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize('NFKC', text)
    text = ' '.join(text.split())
    return text.strip()


def main():
    try:
        tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_PATH, legacy=False)
        model = MT5ForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        # --- MODIFIED: Use only English columns for this test ---
        df_new.dropna(subset=['raw_news_article', 'english_summary'], inplace=True)
        
        logging.info("--- Starting Text Normalization (English Only) ---")
        df_new['raw_news_article'] = df_new['raw_news_article'].apply(normalize_text)
        df_new['english_summary'] = df_new['english_summary'].apply(normalize_text)
        logging.info("--- Text Normalization Finished ---")

        # --- MODIFIED: Create dataset with only English columns ---
        df_eng_only = df_new[['raw_news_article', 'english_summary']].copy()
        df_eng_only.reset_index(drop=True, inplace=True)
        raw_dataset = Dataset.from_pandas(df_eng_only)

        PREFIX_ENG = "summarize English: "

        # --- MODIFIED: Format dataset for English only ---
        def format_dataset_eng_only(batch):
            inputs, targets = [], []
            for article, eng_summary in zip(
                batch['raw_news_article'], batch['english_summary']
            ):
                if isinstance(article, str) and article:
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset_eng_only, batched=True, remove_columns=raw_dataset.column_names
        )

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        
        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        rouge_metric = evaluate.load("rouge")
        bertscore_metric = evaluate.load("bertscore")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            bert_result = bertscore_metric.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
            
            result = {}
            for key, value in rouge_result.items():
                result[f"rouge_{key}"] = round(value * 100, 4)

            result["bertscore_f1"] = round(np.mean(bert_result["f1"]) * 100, 4)

            return result

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=50,
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False,
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting training from scratch (English Only)...")
        trainer.train()
        logging.info("Training finished successfully.")
        
        logging.info("Finding and saving the best model...")
        find_and_save_best_model(NEW_MODEL_OUTPUT_DIR, METRIC_FOR_BEST_MODEL)

    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()



2025-10-06 17:43:10,027 [INFO] - --- Starting Text Normalization (English Only) ---
2025-10-06 17:43:10,871 [INFO] - --- Text Normalization Finished ---


Map:   0%|          | 0/9237 [00:00<?, ? examples/s]

Map:   0%|          | 0/8313 [00:00<?, ? examples/s]

Map:   0%|          | 0/924 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
2025-10-06 17:43:58,487 [INFO] - Starting training from scratch (English Only)...


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0
250,0.0
300,0.0
350,0.0
400,0.0
450,0.0
500,0.0


KeyboardInterrupt: 

In [12]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
import unicodedata
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    MT5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "google/mt5-base"
NEW_MODEL_OUTPUT_DIR = "mt5-base-cnn-summarizer-en-hi_v8"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 1 # Use batch size of 1 for single example test
GRADIENT_ACCUMULATION_STEPS = 1
WEIGHT_DECAY = 0.25
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256
METRIC_FOR_BEST_MODEL = "bertscore_f1"

# --- Setup Logging ---
log_filename = f"scratch_training_log_v8_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)


def find_and_save_best_model(output_dir, metric_name):
    """Finds the best checkpoint from trainer_state.json and saves it."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{metric_name}"
        is_loss = 'loss' in metric_to_check

        for log in state["log_history"]:
            if metric_to_check in log:
                metric_value = log[metric_to_check]
                if best_metric_value is None or \
                   (is_loss and metric_value < best_metric_value) or \
                   (not is_loss and metric_value > best_metric_value):
                    best_metric_value = metric_value
                    step = log.get('step')
                    if step:
                        potential_path = os.path.join(output_dir, f"checkpoint-{step}")
                        if os.path.exists(potential_path):
                            best_checkpoint_path = potential_path

        if not best_checkpoint_path:
            logging.error("Could not find the best checkpoint from the logs.")
            return

        logging.info(f"Best checkpoint found: {best_checkpoint_path} with {metric_to_check}: {best_metric_value}")

        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not save the best model due to: {e}", exc_info=True)

def normalize_text(text):
    """
    Cleans and normalizes text to remove inconsistencies and hidden characters.
    """
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize('NFKC', text)
    text = ' '.join(text.split())
    return text.strip()


def main():
    try:
        logging.info("--- STARTING SINGLE EXAMPLE OVERFIT TEST ---")
        tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_PATH, legacy=False)
        model = MT5ForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary'], inplace=True)
        
        logging.info("--- Starting Text Normalization ---")
        df_new['raw_news_article'] = df_new['raw_news_article'].apply(normalize_text)
        df_new['english_summary'] = df_new['english_summary'].apply(normalize_text)
        logging.info("--- Text Normalization Finished ---")

        # --- MODIFIED: Select only the FIRST row for the test ---
        df_single_example = df_new[['raw_news_article', 'english_summary']].head(1).copy()
        logging.info("--- Using single example for overfitting test: ---")
        logging.info(f"ARTICLE: {df_single_example.iloc[0]['raw_news_article'][:500]}...")
        logging.info(f"SUMMARY: {df_single_example.iloc[0]['english_summary']}")

        
        raw_dataset = Dataset.from_pandas(df_single_example)

        PREFIX_ENG = "summarize English: "

        def format_dataset_eng_only(batch):
            inputs, targets = [], []
            for article, eng_summary in zip(
                batch['raw_news_article'], batch['english_summary']
            ):
                if isinstance(article, str) and article:
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset_eng_only, batched=True, remove_columns=raw_dataset.column_names
        )

        # --- MODIFIED: No train/test split needed for a single example ---
        final_datasets = DatasetDict({
            'train': processed_dataset,
            'test': processed_dataset # Use the same example for evaluation
        })
        
        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        # Metrics are not critical for this test, but we keep them for consistency
        rouge_metric = evaluate.load("rouge")
        bertscore_metric = evaluate.load("bertscore")

        def compute_metrics(eval_pred):
            # ... (compute_metrics function remains the same)
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            bert_result = bertscore_metric.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
            
            result = {}
            for key, value in rouge_result.items():
                result[f"rouge_{key}"] = round(value * 100, 4)

            result["bertscore_f1"] = round(np.mean(bert_result["f1"]) * 100, 4)
            return result

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=10, # More epochs to ensure overfitting
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=1, # Log every step
            save_strategy="epoch",
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting training (overfit test on single example)...")
        trainer.train()
        logging.info("Overfit test finished.")
        
    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()



2025-10-06 18:11:20,519 [INFO] - --- STARTING SINGLE EXAMPLE OVERFIT TEST ---
2025-10-06 18:11:28,896 [INFO] - --- Starting Text Normalization ---
2025-10-06 18:11:29,753 [INFO] - --- Text Normalization Finished ---
2025-10-06 18:11:29,753 [INFO] - --- Using single example for overfitting test: ---
2025-10-06 18:11:29,753 [INFO] - ARTICLE: Voters are still 'in the dark' about the scale and depth of spending cuts being planned by all the main parties with just two weeks until polling day, economic experts warned today. Analysts from the Institute for Fiscal Studies said none of the major parties had given 'anything like full details' on how they will tackle the nations' debts after the election. The Tories were accused of giving 'no detail' about their deficit reduction plan, which relies on £30billion of cuts, while Labour has lef...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
2025-10-06 18:11:36,911 [INFO] - Starting training (overfit test on single example)...


Step,Training Loss
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0
10,0.0


2025-10-06 18:17:44,376 [INFO] - Overfit test finished.


In [13]:
import logging
import pandas as pd
import numpy as np
import torch
import unicodedata
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    MT5ForConditionalGeneration
)

# --- Configuration ---
BASE_MODEL_PATH = "google/mt5-base"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Setup Logging ---
log_filename = f"scratch_training_log_v8_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)


def normalize_text(text):
    """
    Cleans and normalizes text to remove inconsistencies and hidden characters.
    """
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize('NFKC', text)
    text = ' '.join(text.split())
    return text.strip()


def main():
    try:
        logging.info("--- FINAL DIAGNOSTIC: INSPECTING TOKEN TENSORS ---")
        tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_PATH, legacy=False)

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary'], inplace=True)
        
        logging.info("--- Starting Text Normalization ---")
        df_new['raw_news_article'] = df_new['raw_news_article'].apply(normalize_text)
        df_new['english_summary'] = df_new['english_summary'].apply(normalize_text)
        logging.info("--- Text Normalization Finished ---")

        # Select only the FIRST row for the test
        df_single_example = df_new[['raw_news_article', 'english_summary']].head(1).copy()
        
        article_text = df_single_example.iloc[0]['raw_news_article']
        summary_text = df_single_example.iloc[0]['english_summary']

        logging.info("--- Using single example for token inspection: ---")
        logging.info(f"ARTICLE: {article_text[:500]}...")
        logging.info(f"SUMMARY: {summary_text}")
        
        PREFIX_ENG = "summarize English: "

        # Tokenize the input and the target separately
        input_encoding = tokenizer(PREFIX_ENG + article_text, max_length=1024, truncation=True, return_tensors="pt")
        target_encoding = tokenizer(text_target=summary_text, max_length=256, truncation=True, return_tensors="pt")

        input_ids = input_encoding.input_ids[0]
        labels = target_encoding.input_ids[0]

        logging.info("\n==================== TOKEN INSPECTION ====================")
        
        logging.info(f"\n--- DECODED INPUT TOKENS ---\n{tokenizer.decode(input_ids, skip_special_tokens=True)}")
        logging.info(f"\n--- DECODED LABEL TOKENS ---\n{tokenizer.decode(labels, skip_special_tokens=True)}")

        logging.info(f"\n--- RAW INPUT IDS ---\n{input_ids.tolist()}")
        logging.info(f"\n--- RAW LABEL IDS ---\n{labels.tolist()}")
        
        logging.info(f"\nNumber of Input Tokens: {len(input_ids)}")
        logging.info(f"Number of Label Tokens: {len(labels)}")
        
        are_tensors_equal = torch.equal(input_ids, labels)
        
        logging.info("\n--- FINAL VERDICT ---")
        if are_tensors_equal:
            logging.error("CRITICAL ERROR: The input_ids and labels tensors are IDENTICAL.")
            logging.error("This is the cause of the zero loss. The tokenizer is producing the same token sequence for the article and the summary.")
        else:
            logging.info("SUCCESS: The input_ids and labels tensors are DIFFERENT.")
            logging.info("This is the expected behavior. If the loss is still zero, the issue is exceptionally unusual.")
            
        logging.info("\n==========================================================")

    except Exception as e:
        logging.error(f"An unexpected error occurred during the diagnostic script: {e}", exc_info=True)

if __name__ == "__main__":
    main()



2025-10-06 18:17:56,244 [INFO] - --- FINAL DIAGNOSTIC: INSPECTING TOKEN TENSORS ---
2025-10-06 18:18:00,226 [INFO] - --- Starting Text Normalization ---
2025-10-06 18:18:01,092 [INFO] - --- Text Normalization Finished ---
2025-10-06 18:18:01,109 [INFO] - --- Using single example for token inspection: ---
2025-10-06 18:18:01,110 [INFO] - ARTICLE: Voters are still 'in the dark' about the scale and depth of spending cuts being planned by all the main parties with just two weeks until polling day, economic experts warned today. Analysts from the Institute for Fiscal Studies said none of the major parties had given 'anything like full details' on how they will tackle the nations' debts after the election. The Tories were accused of giving 'no detail' about their deficit reduction plan, which relies on £30billion of cuts, while Labour has lef...
2025-10-06 18:18:01,113 [INFO] - 
2025-10-06 18:18:01,146 [INFO] - 
--- DECODED INPUT TOKENS ---
summarize English: Voters are still 'in the dark' a

In [14]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
import unicodedata
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration, # Changed from MT5
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
# --- MODIFIED: Switched to t5-base model ---
BASE_MODEL_PATH = "t5-base"
NEW_MODEL_OUTPUT_DIR = "t5-base-cnn-summarizer-en-hi_v9"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.25
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256
METRIC_FOR_BEST_MODEL = "bertscore_f1"

# --- Setup Logging ---
log_filename = f"scratch_training_log_v9_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)


def find_and_save_best_model(output_dir, metric_name):
    """Finds the best checkpoint from trainer_state.json and saves it."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{metric_name}"
        is_loss = 'loss' in metric_to_check

        for log in state["log_history"]:
            if metric_to_check in log:
                metric_value = log[metric_to_check]
                if best_metric_value is None or \
                   (is_loss and metric_value < best_metric_value) or \
                   (not is_loss and metric_value > best_metric_value):
                    best_metric_value = metric_value
                    step = log.get('step')
                    if step:
                        potential_path = os.path.join(output_dir, f"checkpoint-{step}")
                        if os.path.exists(potential_path):
                            best_checkpoint_path = potential_path

        if not best_checkpoint_path:
            logging.error("Could not find the best checkpoint from the logs.")
            return

        logging.info(f"Best checkpoint found: {best_checkpoint_path} with {metric_to_check}: {best_metric_value}")

        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not save the best model due to: {e}", exc_info=True)

def normalize_text(text):
    """
    Cleans and normalizes text to remove inconsistencies and hidden characters.
    """
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize('NFKC', text)
    text = ' '.join(text.split())
    return text.strip()


def main():
    try:
        tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_PATH, legacy=False)
        # --- MODIFIED: Using T5ForConditionalGeneration ---
        model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        logging.info("--- Starting Text Normalization ---")
        df_new['raw_news_article'] = df_new['raw_news_article'].apply(normalize_text)
        df_new['english_summary'] = df_new['english_summary'].apply(normalize_text)
        df_new['hindi_summary'] = df_new['hindi_summary'].apply(normalize_text)
        logging.info("--- Text Normalization Finished ---")
        
        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str) and article:
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        
        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        rouge_metric = evaluate.load("rouge")
        bertscore_metric = evaluate.load("bertscore")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            # Note: BERTScore lang should ideally be dynamic, but 'en' is a safe default.
            bert_result = bertscore_metric.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
            
            result = {}
            for key, value in rouge_result.items():
                result[f"rouge_{key}"] = round(value * 100, 4)

            result["bertscore_f1"] = round(np.mean(bert_result["f1"]) * 100, 4)

            return result

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=50,
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False,
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting training from scratch...")
        trainer.train()
        logging.info("Training finished successfully.")
        
        logging.info("Finding and saving the best model...")
        find_and_save_best_model(NEW_MODEL_OUTPUT_DIR, METRIC_FOR_BEST_MODEL)

    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()



spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

2025-10-06 18:21:01,544 [INFO] - --- Starting Text Normalization ---
2025-10-06 18:21:03,028 [INFO] - --- Text Normalization Finished ---


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

Map:   0%|          | 0/16601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1845 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
2025-10-06 18:22:27,912 [INFO] - Starting training from scratch...


Step,Training Loss
50,3.4109
100,1.4516
150,1.2074
200,1.163
250,1.2898
300,1.0666
350,1.1618
400,1.0861
450,1.0374
500,1.0546


KeyboardInterrupt: 

In [15]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
import unicodedata
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "google/flan-t5-base"
NEW_MODEL_OUTPUT_DIR = "flan-t5-base-cnn-summarizer-en-hi_v10"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.25
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256
METRIC_FOR_BEST_MODEL = "bertscore_f1"

# --- Setup Logging ---
log_filename = f"scratch_training_log_v10_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)


def find_and_save_best_model(output_dir, metric_name):
    """Finds the best checkpoint from trainer_state.json and saves it."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{metric_name}"
        is_loss = 'loss' in metric_to_check

        for log in state["log_history"]:
            if metric_to_check in log:
                metric_value = log[metric_to_check]
                if best_metric_value is None or \
                   (is_loss and metric_value < best_metric_value) or \
                   (not is_loss and metric_value > best_metric_value):
                    best_metric_value = metric_value
                    step = log.get('step')
                    if step:
                        potential_path = os.path.join(output_dir, f"checkpoint-{step}")
                        if os.path.exists(potential_path):
                            best_checkpoint_path = potential_path

        if not best_checkpoint_path:
            logging.error("Could not find the best checkpoint from the logs.")
            return

        logging.info(f"Best checkpoint found: {best_checkpoint_path} with {metric_to_check}: {best_metric_value}")

        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not save the best model due to: {e}", exc_info=True)

def normalize_text(text):
    """
    Cleans and normalizes text to remove inconsistencies and hidden characters.
    """
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize('NFKC', text)
    text = ' '.join(text.split())
    return text.strip()


def main():
    try:
        tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_PATH, legacy=False)
        model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        logging.info("--- Starting Text Normalization ---")
        df_new['raw_news_article'] = df_new['raw_news_article'].apply(normalize_text)
        df_new['english_summary'] = df_new['english_summary'].apply(normalize_text)
        df_new['hindi_summary'] = df_new['hindi_summary'].apply(normalize_text)
        logging.info("--- Text Normalization Finished ---")
        
        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str) and article:
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        
        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        rouge_metric = evaluate.load("rouge")
        bertscore_metric = evaluate.load("bertscore")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            bert_result = bertscore_metric.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
            
            result = {}
            for key, value in rouge_result.items():
                result[f"rouge_{key}"] = round(value * 100, 4)

            result["bertscore_f1"] = round(np.mean(bert_result["f1"]) * 100, 4)

            return result

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=50,
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False,
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting training from scratch...")
        trainer.train()
        logging.info("Training finished successfully.")
        
        logging.info("Finding and saving the best model...")
        find_and_save_best_model(NEW_MODEL_OUTPUT_DIR, METRIC_FOR_BEST_MODEL)

    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()



tokenizer_config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

2025-10-06 18:36:31,131 [INFO] - --- Starting Text Normalization ---
2025-10-06 18:36:32,659 [INFO] - --- Text Normalization Finished ---


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

Map:   0%|          | 0/16601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1845 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
2025-10-06 18:37:57,713 [INFO] - Starting training from scratch...


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0


KeyboardInterrupt: 

In [16]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import shutil
import os
import json
import unicodedata
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
# --- MODIFIED: Switched to a different model architecture (mBART) ---
BASE_MODEL_PATH = "facebook/mbart-large-50"
NEW_MODEL_OUTPUT_DIR = "mbart-large-50-cnn-summarizer-en-hi_v11"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.25
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256
METRIC_FOR_BEST_MODEL = "bertscore_f1"

# --- Setup Logging ---
log_filename = f"scratch_training_log_v11_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)


def find_and_save_best_model(output_dir, metric_name):
    """Finds the best checkpoint from trainer_state.json and saves it."""
    try:
        state_path = os.path.join(output_dir, "trainer_state.json")
        with open(state_path, "r") as f:
            state = json.load(f)
        
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{metric_name}"
        is_loss = 'loss' in metric_to_check

        for log in state["log_history"]:
            if metric_to_check in log:
                metric_value = log[metric_to_check]
                if best_metric_value is None or \
                   (is_loss and metric_value < best_metric_value) or \
                   (not is_loss and metric_value > best_metric_value):
                    best_metric_value = metric_value
                    step = log.get('step')
                    if step:
                        potential_path = os.path.join(output_dir, f"checkpoint-{step}")
                        if os.path.exists(potential_path):
                            best_checkpoint_path = potential_path

        if not best_checkpoint_path:
            logging.error("Could not find the best checkpoint from the logs.")
            return

        logging.info(f"Best checkpoint found: {best_checkpoint_path} with {metric_to_check}: {best_metric_value}")

        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Best model copied to {final_model_path}")

    except Exception as e:
        logging.error(f"Could not save the best model due to: {e}", exc_info=True)

def normalize_text(text):
    """
    Cleans and normalizes text to remove inconsistencies and hidden characters.
    """
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize('NFKC', text)
    text = ' '.join(text.split())
    return text.strip()


def main():
    try:
        # --- MODIFIED: Using mBART specific tokenizer and model ---
        tokenizer = MBart50TokenizerFast.from_pretrained(BASE_MODEL_PATH)
        model = MBartForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        logging.info("--- Starting Text Normalization ---")
        df_new['raw_news_article'] = df_new['raw_news_article'].apply(normalize_text)
        df_new['english_summary'] = df_new['english_summary'].apply(normalize_text)
        df_new['hindi_summary'] = df_new['hindi_summary'].apply(normalize_text)
        logging.info("--- Text Normalization Finished ---")
        
        raw_dataset = Dataset.from_pandas(df_new)

        # --- MODIFIED: Added 'lang' column for mBART's tokenizer ---
        def format_dataset(batch):
            inputs, targets, langs = [], [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str) and article:
                    # English Task
                    inputs.append(article)
                    targets.append(eng_summary)
                    langs.append("en_XX")
                    # Hindi Task
                    inputs.append(article)
                    targets.append(hin_summary)
                    langs.append("hi_IN")
            return {'inputs': inputs, 'targets': targets, 'lang': langs}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        
        # --- MODIFIED: Tokenize function adapted for mBART ---
        def tokenize_function(examples):
            # Set source language for all articles
            tokenizer.src_lang = "en_XX" 
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)

            # Process labels for the whole batch, setting target language for each
            all_labels = []
            for i in range(len(examples['targets'])):
                tokenizer.tgt_lang = examples['lang'][i]
                label_ids = tokenizer(text_target=examples['targets'][i], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True).input_ids
                all_labels.append(label_ids)
            model_inputs['labels'] = all_labels
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets', 'lang'])
        
        rouge_metric = evaluate.load("rouge")
        bertscore_metric = evaluate.load("bertscore")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            # --- MODIFIED: Forcing target language for decoding predictions ---
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
            bert_result = bertscore_metric.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
            
            result = {}
            for key, value in rouge_result.items():
                result[f"rouge_{key}"] = round(value * 100, 4)

            result["bertscore_f1"] = round(np.mean(bert_result["f1"]) * 100, 4)
            return result

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_steps=50,
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting training from scratch with mBART model...")
        trainer.train()
        logging.info("Training finished successfully.")
        
        logging.info("Finding and saving the best model...")
        find_and_save_best_model(NEW_MODEL_OUTPUT_DIR, METRIC_FOR_BEST_MODEL)

    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()



tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

2025-10-06 18:42:03,296 [INFO] - --- Starting Text Normalization ---
2025-10-06 18:42:05,142 [INFO] - --- Text Normalization Finished ---


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

Map:   0%|          | 0/16601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1845 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
2025-10-06 18:42:52,595 [INFO] - Starting training from scratch with mBART model...


Step,Training Loss
50,3.214
100,2.6424
150,2.5207
200,2.4164
250,2.3181
300,2.2747
350,2.1777
400,2.138
450,2.0755
500,2.0379


2025-10-06 21:01:46,637 [INFO] - Training finished successfully.
2025-10-06 21:01:46,637 [INFO] - Finding and saving the best model...
2025-10-06 21:01:46,643 [ERROR] - Could not save the best model due to: [Errno 2] No such file or directory: 'mbart-large-50-cnn-summarizer-en-hi_v11\\trainer_state.json'
Traceback (most recent call last):
  File "C:\Users\admin\AppData\Local\Temp\ipykernel_21372\739669738.py", line 49, in find_and_save_best_model
    with open(state_path, "r") as f:
  File "c:\Users\admin\anaconda3\envs\summarizer_env\lib\site-packages\IPython\core\interactiveshell.py", line 324, in _modified_open
    return io_open(file, *args, **kwargs)
FileNotFoundError: [Errno 2] No such file or directory: 'mbart-large-50-cnn-summarizer-en-hi_v11\\trainer_state.json'


In [21]:
import os
import json
import shutil
import logging
from datetime import datetime

# --- Configuration: MODIFY THESE TWO VARIABLES ---
# Set this to the output directory of your completed (but failed to save) training run.
OUTPUT_DIR = "mbart-large-50-cnn-summarizer-en-hi_v11" 
# Set this to the metric you used to determine the best model.
METRIC_NAME = "bertscore_f1"
# -------------------------------------------------

# --- Setup Logging ---
log_filename = f"save_best_model_log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

def find_and_save_best_checkpoint(output_dir, metric_name):
    """
    Reads training logs to identify the best checkpoint based on a specified metric.
    If no metrics are found, it defaults to saving the last available checkpoint.
    """
    try:
        logging.info(f"Attempting to find best model in: {output_dir}")

        if not os.path.isdir(output_dir):
            logging.error(f"FATAL: The directory '{output_dir}' does not exist.")
            return
        
        logging.info(f"Contents of '{output_dir}': {os.listdir(output_dir)}")
        
        best_metric_value = None
        best_checkpoint_path = None
        metric_to_check = f"eval_{metric_name}"
        is_loss = 'loss' in metric_to_check.lower()
        log_history = []
        
        # Strategies to find evaluation logs
        main_state_path = os.path.join(output_dir, "trainer_state.json")
        if os.path.exists(main_state_path):
            with open(main_state_path, "r") as f:
                state = json.load(f)
            log_history = state["log_history"]
        
        if not log_history:
            checkpoint_dirs = [d for d in os.listdir(output_dir) if d.startswith("checkpoint-")]
            for chkpt_dir in checkpoint_dirs:
                chkpt_state_path = os.path.join(output_dir, chkpt_dir, "trainer_state.json")
                if os.path.exists(chkpt_state_path):
                    with open(chkpt_state_path, "r") as f:
                        chkpt_state = json.load(f)
                    for log in chkpt_state["log_history"]:
                        if metric_to_check in log: log_history.append(log)

        if not log_history:
            checkpoint_dirs = sorted([d for d in os.listdir(output_dir) if d.startswith("checkpoint-")], key=lambda x: int(x.split('-')[-1]))
            for chkpt_dir in checkpoint_dirs:
                eval_results_path = os.path.join(output_dir, chkpt_dir, "eval_results.json")
                if os.path.exists(eval_results_path):
                    with open(eval_results_path, "r") as f:
                        eval_results = json.load(f)
                    if metric_to_check in eval_results:
                        log_history.append({"step": int(chkpt_dir.split('-')[-1]), metric_to_check: eval_results[metric_to_check]})

        if log_history:
            logging.info(f"Searching for best score using metric: '{metric_to_check}'")
            for log in log_history:
                if metric_to_check in log:
                    metric_value, step = log[metric_to_check], log.get('step')
                    if step is None: continue
                    if best_metric_value is None or (is_loss and metric_value < best_metric_value) or (not is_loss and metric_value > best_metric_value):
                        potential_path = os.path.join(output_dir, f"checkpoint-{step}")
                        if os.path.exists(potential_path):
                            best_metric_value, best_checkpoint_path = metric_value, potential_path
                            logging.info(f"New best found -> Step: {step}, {metric_to_check}: {metric_value}")
        else:
            # --- NEW: FINAL FALLBACK ---
            logging.warning("Could not find any evaluation metric logs.")
            logging.warning("Defaulting to the LAST saved checkpoint as the best model.")
            checkpoint_dirs = [d for d in os.listdir(output_dir) if d.startswith("checkpoint-")]
            if checkpoint_dirs:
                latest_step = -1
                for chkpt_dir in checkpoint_dirs:
                    try:
                        step = int(chkpt_dir.split('-')[-1])
                        if step > latest_step:
                            latest_step = step
                            best_checkpoint_path = os.path.join(output_dir, chkpt_dir)
                    except ValueError:
                        continue
                if best_checkpoint_path:
                     logging.info(f"Identified last checkpoint: {best_checkpoint_path}")

        if not best_checkpoint_path:
            logging.error("FATAL: Could not find any valid checkpoints to save.")
            return

        logging.info(f"--- Model Identified for Saving ---")
        logging.info(f"Checkpoint: {best_checkpoint_path}")
        if best_metric_value is not None:
            logging.info(f"Metric ({metric_to_check}): {best_metric_value}")

        final_model_path = os.path.join(output_dir, "final_model")
        if os.path.exists(final_model_path):
            logging.warning(f"Removing existing 'final_model' directory: {final_model_path}")
            shutil.rmtree(final_model_path)
            
        shutil.copytree(best_checkpoint_path, final_model_path)
        logging.info(f"Successfully copied best model to: {final_model_path}")

    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}", exc_info=True)

if __name__ == "__main__":
    find_and_save_best_checkpoint(OUTPUT_DIR, METRIC_NAME)



2025-10-06 22:33:20,551 [INFO] - Attempting to find best model in: mbart-large-50-cnn-summarizer-en-hi_v11
2025-10-06 22:33:20,551 [INFO] - Contents of 'mbart-large-50-cnn-summarizer-en-hi_v11': ['checkpoint-10380', 'checkpoint-2076', 'checkpoint-4152', 'checkpoint-6228', 'checkpoint-8304', 'logs']
2025-10-06 22:33:20,551 [INFO] - Identified last checkpoint: mbart-large-50-cnn-summarizer-en-hi_v11\checkpoint-10380
2025-10-06 22:33:20,551 [INFO] - --- Model Identified for Saving ---
2025-10-06 22:33:20,566 [INFO] - Checkpoint: mbart-large-50-cnn-summarizer-en-hi_v11\checkpoint-10380
2025-10-06 22:35:00,801 [INFO] - Successfully copied best model to: mbart-large-50-cnn-summarizer-en-hi_v11\final_model
