In [1]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import os
import unicodedata
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "google/flan-t5-large"
NEW_MODEL_OUTPUT_DIR = "flan-t5-large-cnn-summarizer-v12"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3 # FLAN-T5 often requires fewer epochs
BATCH_SIZE = 1 # Use a small batch size for the large model to fit in memory
GRADIENT_ACCUMULATION_STEPS = 8 # Increase accumulation to compensate for small batch size
WEIGHT_DECAY = 0.25
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256
METRIC_FOR_BEST_MODEL = "bleurt_f1" # A more robust metric for quality

# --- Setup Logging ---
log_filename = f"flan_t5_large_training_log_v12_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

def sanitize_text(text):
    """
    Cleans text to handle specific formatting issues like escaped quotes from CSVs.
    """
    if not isinstance(text, str):
        return ""
    text = text.replace('""', '"')
    return text.strip()

def normalize_text(text):
    """
    Cleans and normalizes text to remove inconsistencies and hidden characters.
    """
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize('NFKC', text)
    text = ' '.join(text.split())
    return text.strip()

def main():
    try:
        tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_PATH, legacy=False)
        model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        logging.info("--- Starting Text Sanitization & Normalization ---")
        for col in ['raw_news_article', 'english_summary', 'hindi_summary']:
            df_new[col] = df_new[col].apply(sanitize_text).apply(normalize_text)
        logging.info("--- Text Sanitization & Normalization Finished ---")
        
        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str) and article:
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        
        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        # --- NEW: Enhanced Evaluation Metrics ---
        rouge_metric = evaluate.load("rouge")
        bleurt_metric = evaluate.load("bleurt", "bleurt-20")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            # ROUGE scores
            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
            
            # BLEURT score
            bleurt_result = bleurt_metric.compute(predictions=decoded_preds, references=decoded_labels)
            
            result = {
                "rouge1": rouge_result["rouge1"],
                "rouge2": rouge_result["rouge2"],
                "rougeL": rouge_result["rougeL"],
                "bleurt_f1": np.mean(bleurt_result["scores"])
            }
            
            return {k: round(v * 100, 4) for k, v in result.items()}

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_strategy="epoch", # Log at the end of each epoch
            evaluation_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=True,
            metric_for_best_model=METRIC_FOR_BEST_MODEL,
            report_to="tensorboard",
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting final training from scratch with FLAN-T5-LARGE...")
        trainer.train()
        logging.info("Training finished successfully.")
        
        final_model_dir = os.path.join(NEW_MODEL_OUTPUT_DIR, "final_model")
        trainer.save_model(final_model_dir)
        logging.info(f"Best model saved to {final_model_dir}")

    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()





2025-10-07 13:52:25,380 [INFO] - --- Starting Text Sanitization & Normalization ---
2025-10-07 13:52:26,888 [INFO] - --- Text Sanitization & Normalization Finished ---


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

Map:   0%|          | 0/16601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1845 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/2.14G [00:00<?, ?B/s]

Computing checksums: 100%|##########| 1/1 [00:05<00:00,  5.05s/it]







INFO:tensorflow:Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20.


2025-10-07 13:56:10,230 [INFO] - Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20.


INFO:tensorflow:Config file found, reading.


2025-10-07 13:56:10,246 [INFO] - Config file found, reading.


INFO:tensorflow:Will load checkpoint BLEURT-20


2025-10-07 13:56:10,246 [INFO] - Will load checkpoint BLEURT-20


INFO:tensorflow:Loads full paths and checks that files exists.


2025-10-07 13:56:10,246 [INFO] - Loads full paths and checks that files exists.


INFO:tensorflow:... name:BLEURT-20


2025-10-07 13:56:10,246 [INFO] - ... name:BLEURT-20


INFO:tensorflow:... bert_config_file:bert_config.json


2025-10-07 13:56:10,246 [INFO] - ... bert_config_file:bert_config.json


INFO:tensorflow:... max_seq_length:512


2025-10-07 13:56:10,246 [INFO] - ... max_seq_length:512


INFO:tensorflow:... vocab_file:None


2025-10-07 13:56:10,246 [INFO] - ... vocab_file:None


INFO:tensorflow:... do_lower_case:None


2025-10-07 13:56:10,246 [INFO] - ... do_lower_case:None


INFO:tensorflow:... sp_model:sent_piece


2025-10-07 13:56:10,259 [INFO] - ... sp_model:sent_piece


INFO:tensorflow:... dynamic_seq_length:True


2025-10-07 13:56:10,261 [INFO] - ... dynamic_seq_length:True


INFO:tensorflow:Creating BLEURT scorer.


2025-10-07 13:56:10,262 [INFO] - Creating BLEURT scorer.


INFO:tensorflow:Creating SentencePiece tokenizer.


2025-10-07 13:56:10,263 [INFO] - Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


2025-10-07 13:56:10,265 [INFO] - Creating SentencePiece tokenizer.


INFO:tensorflow:Will load model: C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20\sent_piece.model.


2025-10-07 13:56:10,266 [INFO] - Will load model: C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20\sent_piece.model.


INFO:tensorflow:SentencePiece tokenizer created.


2025-10-07 13:56:10,750 [INFO] - SentencePiece tokenizer created.


INFO:tensorflow:Creating Eager Mode predictor.


2025-10-07 13:56:10,750 [INFO] - Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


2025-10-07 13:56:10,750 [INFO] - Loading model.
2025-10-07 13:56:16,705 [INFO] - Fingerprint not found. Saved model loading will continue.
2025-10-07 13:56:16,705 [INFO] - path_and_singleprint metric could not be logged. Saved model loading will continue.


INFO:tensorflow:BLEURT initialized.


2025-10-07 13:56:16,712 [INFO] - BLEURT initialized.
2025-10-07 13:56:16,742 [ERROR] - An unexpected error occurred during the main process: Seq2SeqTrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'
Traceback (most recent call last):
  File "C:\Users\admin\AppData\Local\Temp\ipykernel_15280\3186982528.py", line 134, in main
    training_args = Seq2SeqTrainingArguments(
TypeError: Seq2SeqTrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'


In [11]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import os
import unicodedata
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "google/flan-t5-large"
NEW_MODEL_OUTPUT_DIR = "flan-t5-large-cnn-summarizer-v12"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3 
BATCH_SIZE = 1 
GRADIENT_ACCUMULATION_STEPS = 8
WEIGHT_DECAY = 0.3
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256
METRIC_FOR_BEST_MODEL = "bleurt_f1" 

# --- Setup Logging ---
log_filename = f"flan_t5_large_training_log_v12_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

def sanitize_text(text):
    """
    Cleans text to handle specific formatting issues like escaped quotes from CSVs.
    """
    if not isinstance(text, str):
        return ""
    text = text.replace('""', '"')
    return text.strip()

def normalize_text(text):
    """
    Cleans and normalizes text to remove inconsistencies and hidden characters.
    """
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize('NFKC', text)
    text = ' '.join(text.split())
    return text.strip()

def main():
    try:
        tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_PATH, legacy=False)
        model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        logging.info("--- Starting Text Sanitization & Normalization ---")
        for col in ['raw_news_article', 'english_summary', 'hindi_summary']:
            df_new[col] = df_new[col].apply(sanitize_text).apply(normalize_text)
        logging.info("--- Text Sanitization & Normalization Finished ---")
        
        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str) and article:
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        
        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        rouge_metric = evaluate.load("rouge")
        bleurt_metric = evaluate.load("bleurt", "bleurt-20")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
            bleurt_result = bleurt_metric.compute(predictions=decoded_preds, references=decoded_labels)
            
            result = {
                "rouge1": rouge_result["rouge1"],
                "rouge2": rouge_result["rouge2"],
                "rougeL": rouge_result["rougeL"],
                "bleurt_f1": np.mean(bleurt_result["scores"])
            }
            
            return {k: round(v * 100, 4) for k, v in result.items()}

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            # --- KEY CHANGES to bypass versioning errors ---
            load_best_model_at_end=False,
            # evaluation_strategy is removed as it's not needed/supported in all versions
            # ------------------------------------------------
            report_to="tensorboard",
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting final training from scratch with FLAN-T5-LARGE...")
        trainer.train()
        logging.info("Training finished. All checkpoints and logs are saved.")
        
    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()



2025-10-07 14:55:05,985 [INFO] - --- Starting Text Sanitization & Normalization ---
2025-10-07 14:55:07,535 [INFO] - --- Text Sanitization & Normalization Finished ---


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

Map:   0%|          | 0/16601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1845 [00:00<?, ? examples/s]

INFO:tensorflow:Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20.


2025-10-07 14:56:37,740 [INFO] - Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20.


INFO:tensorflow:Config file found, reading.


2025-10-07 14:56:37,740 [INFO] - Config file found, reading.


INFO:tensorflow:Will load checkpoint BLEURT-20


2025-10-07 14:56:37,740 [INFO] - Will load checkpoint BLEURT-20


INFO:tensorflow:Loads full paths and checks that files exists.


2025-10-07 14:56:37,751 [INFO] - Loads full paths and checks that files exists.


INFO:tensorflow:... name:BLEURT-20


2025-10-07 14:56:37,752 [INFO] - ... name:BLEURT-20


INFO:tensorflow:... bert_config_file:bert_config.json


2025-10-07 14:56:37,753 [INFO] - ... bert_config_file:bert_config.json


INFO:tensorflow:... max_seq_length:512


2025-10-07 14:56:37,753 [INFO] - ... max_seq_length:512


INFO:tensorflow:... vocab_file:None


2025-10-07 14:56:37,753 [INFO] - ... vocab_file:None


INFO:tensorflow:... do_lower_case:None


2025-10-07 14:56:37,753 [INFO] - ... do_lower_case:None


INFO:tensorflow:... sp_model:sent_piece


2025-10-07 14:56:37,760 [INFO] - ... sp_model:sent_piece


INFO:tensorflow:... dynamic_seq_length:True


2025-10-07 14:56:37,761 [INFO] - ... dynamic_seq_length:True


INFO:tensorflow:Creating BLEURT scorer.


2025-10-07 14:56:37,762 [INFO] - Creating BLEURT scorer.


INFO:tensorflow:Creating SentencePiece tokenizer.


2025-10-07 14:56:37,764 [INFO] - Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


2025-10-07 14:56:37,766 [INFO] - Creating SentencePiece tokenizer.


INFO:tensorflow:Will load model: C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20\sent_piece.model.


2025-10-07 14:56:37,766 [INFO] - Will load model: C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20\sent_piece.model.


INFO:tensorflow:SentencePiece tokenizer created.


2025-10-07 14:56:38,217 [INFO] - SentencePiece tokenizer created.


INFO:tensorflow:Creating Eager Mode predictor.


2025-10-07 14:56:38,217 [INFO] - Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


2025-10-07 14:56:38,217 [INFO] - Loading model.
2025-10-07 14:56:43,234 [INFO] - Fingerprint not found. Saved model loading will continue.
2025-10-07 14:56:43,234 [INFO] - path_and_singleprint metric could not be logged. Saved model loading will continue.


INFO:tensorflow:BLEURT initialized.


2025-10-07 14:56:43,234 [INFO] - BLEURT initialized.
  trainer = Seq2SeqTrainer(
2025-10-07 14:56:46,255 [INFO] - Starting final training from scratch with FLAN-T5-LARGE...


Step,Training Loss
2076,0.0


KeyboardInterrupt: 

In [1]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import os
import unicodedata
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    TrainerCallback
)

# --- Configuration ---
BASE_MODEL_PATH = "google/flan-t5-large"
NEW_MODEL_OUTPUT_DIR = "flan-t5-large-cnn-summarizer-v13"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3 
BATCH_SIZE = 1 
GRADIENT_ACCUMULATION_STEPS = 8
WEIGHT_DECAY = 0.3 
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256
METRIC_FOR_BEST_MODEL = "bleurt_f1" 

# --- Setup Logging ---
log_filename = f"flan_t5_large_training_log_v13_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

class ZeroLossCallback(TrainerCallback):
    """A callback that stops training if the training loss is zero."""
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None and 'loss' in logs and logs['loss'] == 0.0:
            logging.error("CRITICAL: Training loss is zero. This indicates a likely data leak.")
            logging.error("Stopping training to prevent wasted resources.")
            control.should_training_stop = True

def sanitize_text(text):
    if not isinstance(text, str): return ""
    return text.replace('""', '"').strip()

def normalize_text(text):
    if not isinstance(text, str): return ""
    return ' '.join(unicodedata.normalize('NFKC', text).split())

def main():
    try:
        tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_PATH, legacy=False)
        model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        logging.info("--- Starting Text Sanitization & Normalization ---")
        for col in ['raw_news_article', 'english_summary', 'hindi_summary']:
            df_new[col] = df_new[col].apply(sanitize_text).apply(normalize_text)
        logging.info("--- Text Sanitization & Normalization Finished ---")
        
        raw_dataset = Dataset.from_pandas(df_new)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str) and article:
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        
        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        rouge_metric = evaluate.load("rouge")
        bleurt_metric = evaluate.load("bleurt", "bleurt-20")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
            bleurt_result = bleurt_metric.compute(predictions=decoded_preds, references=decoded_labels)
            
            result = {
                "rouge1": rouge_result["rouge1"], "rouge2": rouge_result["rouge2"],
                "rougeL": rouge_result["rougeL"], "bleurt_f1": np.mean(bleurt_result["scores"])
            }
            return {k: round(v * 100, 4) for k, v in result.items()}

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_strategy="steps",
            logging_steps=50,
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False, # Disabled to prevent versioning conflicts
            report_to="tensorboard",
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            callbacks=[ZeroLossCallback()]
        )

        logging.info("Starting final training (v13) from scratch with FLAN-T5-LARGE...")
        trainer.train()
        logging.info("Training finished. All checkpoints and logs are saved.")
        
    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()





2025-10-07 16:50:00,646 [INFO] - --- Starting Text Sanitization & Normalization ---
2025-10-07 16:50:02,196 [INFO] - --- Text Sanitization & Normalization Finished ---


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

Map:   0%|          | 0/16601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1845 [00:00<?, ? examples/s]







INFO:tensorflow:Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20.


2025-10-07 16:51:30,275 [INFO] - Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20.


INFO:tensorflow:Config file found, reading.


2025-10-07 16:51:30,578 [INFO] - Config file found, reading.


INFO:tensorflow:Will load checkpoint BLEURT-20


2025-10-07 16:51:30,578 [INFO] - Will load checkpoint BLEURT-20


INFO:tensorflow:Loads full paths and checks that files exists.


2025-10-07 16:51:30,578 [INFO] - Loads full paths and checks that files exists.


INFO:tensorflow:... name:BLEURT-20


2025-10-07 16:51:30,578 [INFO] - ... name:BLEURT-20


INFO:tensorflow:... bert_config_file:bert_config.json


2025-10-07 16:51:30,578 [INFO] - ... bert_config_file:bert_config.json


INFO:tensorflow:... max_seq_length:512


2025-10-07 16:51:30,578 [INFO] - ... max_seq_length:512


INFO:tensorflow:... vocab_file:None


2025-10-07 16:51:30,590 [INFO] - ... vocab_file:None


INFO:tensorflow:... do_lower_case:None


2025-10-07 16:51:30,591 [INFO] - ... do_lower_case:None


INFO:tensorflow:... sp_model:sent_piece


2025-10-07 16:51:30,594 [INFO] - ... sp_model:sent_piece


INFO:tensorflow:... dynamic_seq_length:True


2025-10-07 16:51:30,596 [INFO] - ... dynamic_seq_length:True


INFO:tensorflow:Creating BLEURT scorer.


2025-10-07 16:51:30,597 [INFO] - Creating BLEURT scorer.


INFO:tensorflow:Creating SentencePiece tokenizer.


2025-10-07 16:51:30,597 [INFO] - Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


2025-10-07 16:51:30,597 [INFO] - Creating SentencePiece tokenizer.


INFO:tensorflow:Will load model: C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20\sent_piece.model.


2025-10-07 16:51:30,597 [INFO] - Will load model: C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20\sent_piece.model.


INFO:tensorflow:SentencePiece tokenizer created.


2025-10-07 16:51:31,058 [INFO] - SentencePiece tokenizer created.


INFO:tensorflow:Creating Eager Mode predictor.


2025-10-07 16:51:31,060 [INFO] - Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


2025-10-07 16:51:31,062 [INFO] - Loading model.
2025-10-07 16:51:36,525 [INFO] - Fingerprint not found. Saved model loading will continue.
2025-10-07 16:51:36,525 [INFO] - path_and_singleprint metric could not be logged. Saved model loading will continue.


INFO:tensorflow:BLEURT initialized.


2025-10-07 16:51:36,534 [INFO] - BLEURT initialized.
  trainer = Seq2SeqTrainer(
2025-10-07 16:51:39,258 [INFO] - Starting final training (v13) from scratch with FLAN-T5-LARGE...


Step,Training Loss
50,0.0


2025-10-07 16:53:57,676 [ERROR] - CRITICAL: Training loss is zero. This indicates a likely data leak.
2025-10-07 16:53:57,676 [ERROR] - Stopping training to prevent wasted resources.
2025-10-07 16:54:43,443 [INFO] - Training finished. All checkpoints and logs are saved.


In [2]:
import logging
import pandas as pd
import numpy as np
import torch
import evaluate
import os
import unicodedata
from datetime import datetime
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# --- Configuration ---
BASE_MODEL_PATH = "google/flan-t5-large"
NEW_MODEL_OUTPUT_DIR = "flan-t5-large-cnn-summarizer-v13-final"
NEW_DATA_PATH = "../Dataset/new_large_CNN_dataset.csv"

# --- Hyperparameters ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 8
WEIGHT_DECAY = 0.3
NUM_BEAMS_EVAL = 6
MAX_SUMMARY_LENGTH_EVAL = 256
METRIC_FOR_BEST_MODEL = "bleurt_f1"

# --- Setup Logging ---
log_filename = f"flan_t5_large_training_log_v13_final_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler()]
)

def sanitize_text(text):
    if not isinstance(text, str): return ""
    return text.replace('""', '"').strip()

def normalize_text(text):
    if not isinstance(text, str): return ""
    return ' '.join(unicodedata.normalize('NFKC', text).split())

def main():
    try:
        tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_PATH, legacy=False)
        model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)

        df_new = pd.read_csv(NEW_DATA_PATH, engine='python', on_bad_lines='skip')
        df_new.dropna(subset=['raw_news_article', 'english_summary', 'hindi_summary'], inplace=True)
        
        logging.info("--- Starting Text Sanitization & Normalization ---")
        for col in ['raw_news_article', 'english_summary', 'hindi_summary']:
            df_new[col] = df_new[col].apply(sanitize_text).apply(normalize_text)
        logging.info("--- Text Sanitization & Normalization Finished ---")

        # --- NEW: Aggressive Data Leak Detection and Removal ---
        logging.info("--- Starting Aggressive Data Leak Detection ---")
        leaky_indices = []
        for index, row in df_new.iterrows():
            article = row['raw_news_article']
            eng_summary = row['english_summary']
            hin_summary = row['hindi_summary']
            if (isinstance(article, str) and isinstance(eng_summary, str) and eng_summary in article) or \
               (isinstance(article, str) and isinstance(hin_summary, str) and hin_summary in article):
                leaky_indices.append(index)
                logging.warning(f"Data leak detected and row will be skipped. Index: {index}")
        
        if leaky_indices:
            logging.warning(f"Found and removed {len(leaky_indices)} rows with data leaks.")
            df_clean = df_new.drop(leaky_indices).reset_index(drop=True)
        else:
            logging.info("No data leaks found in the dataset.")
            df_clean = df_new
            
        if df_clean.empty:
            logging.error("CRITICAL: The dataset is empty after removing leaky rows. Cannot proceed with training.")
            return
            
        logging.info(f"Training will proceed with {len(df_clean)} clean rows.")
        # --- End of Data Leak Detection ---

        raw_dataset = Dataset.from_pandas(df_clean)

        PREFIX_ENG = "summarize English: "
        PREFIX_HIN = "summarize Hindi: "

        def format_dataset(batch):
            inputs, targets = [], []
            for article, eng_summary, hin_summary in zip(
                batch['raw_news_article'], batch['english_summary'], batch['hindi_summary']
            ):
                if isinstance(article, str) and article:
                    inputs.append(PREFIX_ENG + article)
                    targets.append(eng_summary)
                    inputs.append(PREFIX_HIN + article)
                    targets.append(hin_summary)
            return {'inputs': inputs, 'targets': targets}

        processed_dataset = raw_dataset.map(
            format_dataset, batched=True, remove_columns=raw_dataset.column_names
        ).flatten()

        train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
        final_datasets = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        
        def tokenize_function(examples):
            model_inputs = tokenizer(examples['inputs'], max_length=1024, truncation=True)
            labels = tokenizer(text_target=examples['targets'], max_length=MAX_SUMMARY_LENGTH_EVAL, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=['inputs', 'targets'])
        
        rouge_metric = evaluate.load("rouge")
        bleurt_metric = evaluate.load("bleurt", "bleurt-20")

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
            bleurt_result = bleurt_metric.compute(predictions=decoded_preds, references=decoded_labels)
            
            result = {
                "rouge1": rouge_result["rouge1"], "rouge2": rouge_result["rouge2"],
                "rougeL": rouge_result["rougeL"], "bleurt_f1": np.mean(bleurt_result["scores"])
            }
            return {k: round(v * 100, 4) for k, v in result.items()}

        training_args = Seq2SeqTrainingArguments(
            output_dir=NEW_MODEL_OUTPUT_DIR,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            weight_decay=WEIGHT_DECAY,
            logging_dir=f"{NEW_MODEL_OUTPUT_DIR}/logs",
            logging_strategy="steps",
            logging_steps=50,
            save_strategy="epoch",
            save_total_limit=NUM_EPOCHS,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=False,
            report_to="tensorboard",
            generation_max_length=MAX_SUMMARY_LENGTH_EVAL,
            generation_num_beams=NUM_BEAMS_EVAL,
        )

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        logging.info("Starting final training (v13-final) from scratch with FLAN-T5-LARGE...")
        trainer.train()
        logging.info("Training finished. All checkpoints and logs are saved.")
        
    except Exception as e:
        logging.error(f"An unexpected error occurred during the main process: {e}", exc_info=True)

if __name__ == "__main__":
    main()


2025-10-07 16:56:35,506 [INFO] - --- Starting Text Sanitization & Normalization ---
2025-10-07 16:56:37,027 [INFO] - --- Text Sanitization & Normalization Finished ---
2025-10-07 16:56:37,027 [INFO] - --- Starting Aggressive Data Leak Detection ---
2025-10-07 16:56:37,444 [INFO] - No data leaks found in the dataset.
2025-10-07 16:56:37,444 [INFO] - Training will proceed with 9223 clean rows.


Map:   0%|          | 0/9223 [00:00<?, ? examples/s]

Map:   0%|          | 0/16601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1845 [00:00<?, ? examples/s]

INFO:tensorflow:Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20.


2025-10-07 16:58:04,181 [INFO] - Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20.


INFO:tensorflow:Config file found, reading.


2025-10-07 16:58:04,181 [INFO] - Config file found, reading.


INFO:tensorflow:Will load checkpoint BLEURT-20


2025-10-07 16:58:04,181 [INFO] - Will load checkpoint BLEURT-20


INFO:tensorflow:Loads full paths and checks that files exists.


2025-10-07 16:58:04,181 [INFO] - Loads full paths and checks that files exists.


INFO:tensorflow:... name:BLEURT-20


2025-10-07 16:58:04,181 [INFO] - ... name:BLEURT-20


INFO:tensorflow:... bert_config_file:bert_config.json


2025-10-07 16:58:04,181 [INFO] - ... bert_config_file:bert_config.json


INFO:tensorflow:... max_seq_length:512


2025-10-07 16:58:04,181 [INFO] - ... max_seq_length:512


INFO:tensorflow:... vocab_file:None


2025-10-07 16:58:04,181 [INFO] - ... vocab_file:None


INFO:tensorflow:... do_lower_case:None


2025-10-07 16:58:04,181 [INFO] - ... do_lower_case:None


INFO:tensorflow:... sp_model:sent_piece


2025-10-07 16:58:04,196 [INFO] - ... sp_model:sent_piece


INFO:tensorflow:... dynamic_seq_length:True


2025-10-07 16:58:04,198 [INFO] - ... dynamic_seq_length:True


INFO:tensorflow:Creating BLEURT scorer.


2025-10-07 16:58:04,199 [INFO] - Creating BLEURT scorer.


INFO:tensorflow:Creating SentencePiece tokenizer.


2025-10-07 16:58:04,200 [INFO] - Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


2025-10-07 16:58:04,201 [INFO] - Creating SentencePiece tokenizer.


INFO:tensorflow:Will load model: C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20\sent_piece.model.


2025-10-07 16:58:04,204 [INFO] - Will load model: C:\Users\admin\.cache\huggingface\metrics\bleurt\bleurt-20\downloads\extracted\8db8856a80394ae84b010e83ab663d4a3ccfa244ce3d0dbe00143f73e65ff123\BLEURT-20\sent_piece.model.


INFO:tensorflow:SentencePiece tokenizer created.


2025-10-07 16:58:04,644 [INFO] - SentencePiece tokenizer created.


INFO:tensorflow:Creating Eager Mode predictor.


2025-10-07 16:58:04,644 [INFO] - Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


2025-10-07 16:58:04,644 [INFO] - Loading model.
2025-10-07 16:58:10,523 [INFO] - Fingerprint not found. Saved model loading will continue.
2025-10-07 16:58:10,523 [INFO] - path_and_singleprint metric could not be logged. Saved model loading will continue.


INFO:tensorflow:BLEURT initialized.


2025-10-07 16:58:10,530 [INFO] - BLEURT initialized.
  trainer = Seq2SeqTrainer(
2025-10-07 16:58:12,801 [INFO] - Starting final training (v13-final) from scratch with FLAN-T5-LARGE...


Step,Training Loss
50,0.0
100,0.0


KeyboardInterrupt: 