In [1]:
# !pip install torch torchvision torchaudio datasets evaluate transformers rouge_score


In [2]:
# Imports
import torch
from datasets import load_dataset
from evaluate import load
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer,
                          Trainer, TrainingArguments, DataCollatorForSeq2Seq,
                          get_linear_schedule_with_warmup)
import numpy as np
import gc


In [3]:
# ROUGE scorer used later by compute_metrics
rouge = load("rouge")


def _has_document(example):
    """Keep only rows whose 'document' field is present and non-empty."""
    return example['document'] and len(example['document']) > 0


# Each CSV is read as a single split, which `load_dataset` exposes under 'train'
train_dataset = load_dataset('csv', data_files='preprocessed_train.csv')['train']
test_dataset = load_dataset('csv', data_files='preprocessed_test.csv')['train']

# Drop rows with a missing/empty document from both splits
train_dataset = train_dataset.filter(_has_document)
test_dataset = test_dataset.filter(_has_document)


In [None]:
# Checkpoint shared by the seq2seq model and its tokenizer
model_name = "facebook/bart-base"

# Load the pretrained weights and the matching tokenizer from the same checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [5]:
# Tokenization Function
def make_tokenize_function(tokenizer, max_input_length=64, max_target_length=32):
    """Build a batched tokenization callback for `Dataset.map`.

    Args:
        tokenizer: Callable tokenizer. It is invoked once with the source
            texts and once with ``text_target=`` for the summaries.
        max_input_length: Length the encoded documents are padded/truncated to.
        max_target_length: Length the encoded summaries are padded/truncated to.

    Returns:
        A function taking a batch dict with 'document' and 'summary' lists and
        returning the tokenizer output for the documents with an extra
        "labels" entry holding the summary token ids.
    """
    def tokenize_function(examples):
        model_inputs = tokenizer(
            examples['document'],
            padding='max_length',
            truncation=True,
            max_length=max_input_length,
        )

        # `text_target=` replaces the deprecated `as_target_tokenizer()` context
        # manager for encoding the decoder-side text.
        targets = tokenizer(
            text_target=examples['summary'],
            padding='max_length',
            truncation=True,
            max_length=max_target_length,
        )

        # NOTE(review): labels are padded with the tokenizer's pad id, not -100,
        # so padded positions presumably still contribute to the loss - confirm
        # whether that is intended before changing it (metric decoding must
        # handle -100 first).
        model_inputs["labels"] = targets["input_ids"]
        return model_inputs

    return tokenize_function

# Bind the loaded tokenizer into the batched callback that Dataset.map will call
tokenization_fn = make_tokenize_function(
    tokenizer,
)


In [6]:
# Encode both splits with identical settings; the raw 'summary' column is
# dropped because its token ids now live under "labels".
map_kwargs = dict(batched=True, remove_columns=['summary'], num_proc=4)
train_dataset = train_dataset.map(tokenization_fn, **map_kwargs)
test_dataset = test_dataset.map(tokenization_fn, **map_kwargs)



In [7]:
# Metric computation
def compute_metrics(eval_pred):
    """Compute ROUGE-1/2/L between decoded predictions and reference labels.

    Args:
        eval_pred: Object exposing `.predictions` and `.label_ids`.
            `.predictions` may be a tuple (its first element is used), a 3-D
            array of logits (reduced with argmax over the last axis), or a 2-D
            array of token ids.

    Returns:
        Dict with "rouge1", "rouge2" and "rougeL" scores.

    Note:
        Positions equal to -100 (the ignore index used for label padding) are
        replaced with the tokenizer's pad id before decoding, since -100 is
        not a valid token id.
    """
    predictions, labels = eval_pred.predictions, eval_pred.label_ids
    if isinstance(predictions, tuple):
        predictions = predictions[0]  # Get the logits

    predictions = np.asarray(predictions)
    if predictions.ndim == 3:
        # Logits over the vocabulary -> most likely token id per position
        predictions = predictions.argmax(axis=-1)

    pad_id = tokenizer.pad_token_id
    pred_ids = np.where(predictions != -100, predictions, pad_id)
    label_ids = np.where(np.asarray(labels) != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(pred_ids.tolist(), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(label_ids.tolist(), skip_special_tokens=True)

    # Calculate ROUGE scores
    rouge_results = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    return {name: rouge_results[name] for name in ("rouge1", "rouge2", "rougeL")}


In [None]:
repo_name = "Chribabc/LLM_Project_Lighthouse"  # Hugging Face username and repo name

# NOTE(review): with push_to_hub=True the Trainer presumably needs Hub
# credentials as soon as it is constructed - confirm a token is available
# before the training cell runs.
training_args = TrainingArguments(
    output_dir='D:\\Documents\\GitHub\\LLM-Project\\My_Model',  # Local path
    push_to_hub=True,
    hub_model_id=repo_name,  # repo name for pushing
    # No evaluation during training: "no" is already the default strategy, so
    # the keyword (renamed from `evaluation_strategy` to `eval_strategy` in
    # newer transformers releases) is simply omitted.
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=4,
    weight_decay=0.02,
    logging_dir='D:\\Documents\\GitHub\\LLM-Project\\Logs',
    logging_steps=500,
    save_steps=1000,
    save_total_limit=2,
    # Mixed precision is only valid on GPU; enabling it unconditionally makes
    # TrainingArguments raise on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=2,
    max_grad_norm=1.0,  # Add gradient clipping
)

In [9]:
# Seq2seq batch collator built from the tokenizer and the model being trained
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
)


In [None]:
# Pick the GPU when one is visible, otherwise stay on the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)

# Release cached CUDA blocks and run a garbage-collection pass
torch.cuda.empty_cache()
gc.collect()


In [11]:
# Where the best-scoring model is written, and the ROUGE-1 score it must beat
best_model_path = "D:\\Documents\\GitHub\\LLM-Project\\best_model"
best_rouge1 = 0


In [None]:
# Define chunk sizes
train_chunk_size = 4000  # Number of training examples per chunk
# Ceiling division so a trailing partial chunk is still trained on
num_train_chunks = (len(train_dataset) + train_chunk_size - 1) // train_chunk_size

# Training loop: the same model is trained on consecutive slices of the
# training set, with a fresh optimizer, scheduler and Trainer per slice.
for chunk_idx in range(num_train_chunks):
    start_idx = chunk_idx * train_chunk_size
    end_idx = min(start_idx + train_chunk_size, len(train_dataset))

    chunk_train_dataset = train_dataset.select(range(start_idx, end_idx))

    print(f"Training chunk {chunk_idx + 1}/{num_train_chunks}...")

    # Define the optimizer for the current chunk. weight_decay is passed
    # explicitly: a hand-built optimizer does not pick it up from
    # TrainingArguments, so the configured value would otherwise be ignored.
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=training_args.learning_rate,
        weight_decay=training_args.weight_decay,
    )

    # The scheduler advances once per optimizer step, not once per batch, so
    # gradient accumulation must be accounted for. Otherwise the schedule is
    # sized for more steps than are ever taken and the learning rate never
    # finishes decaying.
    batches_per_epoch = -(-len(chunk_train_dataset) // training_args.train_batch_size)
    steps_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
    total_steps = int(steps_per_epoch * training_args.num_train_epochs)
    warmup_steps = int(0.1 * total_steps)  # warm up over the first 10% of steps

    # Initialize the scheduler
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # Start training for this chunk
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=chunk_train_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        optimizers=(optimizer, scheduler),  # Pass the optimizer and scheduler
    )

    trainer.train()

    # Clear memory after training the chunk: collect garbage first so the
    # blocks it frees can then be released from the CUDA cache.
    gc.collect()
    torch.cuda.empty_cache()


In [None]:
# Chunked evaluation
eval_chunk_size = 50  # Set smaller chunk size for evaluation
# Ceiling division so a trailing partial chunk is still evaluated
num_eval_chunks = (len(test_dataset) + eval_chunk_size - 1) // eval_chunk_size

# Accumulators for an example-weighted mean of the per-chunk ROUGE-1 scores
rouge1_weighted_sum = 0.0
num_scored_examples = 0

for chunk_idx in range(num_eval_chunks):
    start_idx = chunk_idx * eval_chunk_size
    end_idx = min(start_idx + eval_chunk_size, len(test_dataset))

    chunk_eval_dataset = test_dataset.select(range(start_idx, end_idx))

    # Print current evaluation chunk
    print(f"Evaluating chunk {chunk_idx + 1}/{num_eval_chunks}...")

    # Evaluate on the current chunk
    eval_results = trainer.evaluate(eval_dataset=chunk_eval_dataset)

    # Print the evaluation results to debug
    print("Evaluation Results:", eval_results)  # Check the output structure

    # Trainer.evaluate prefixes metric names with "eval_", so the score is
    # reported as 'eval_rouge1'; the bare key is kept as a fallback.
    chunk_rouge1 = eval_results.get('eval_rouge1', eval_results.get('rouge1'))
    if chunk_rouge1 is not None:
        rouge1_weighted_sum += chunk_rouge1 * len(chunk_eval_dataset)
        num_scored_examples += len(chunk_eval_dataset)

    # Clear memory after evaluation of the chunk
    gc.collect()
    torch.cuda.empty_cache()

# The model does not change between evaluation chunks, so it is judged once on
# the aggregate score rather than on whichever chunk happened to score highest.
if num_scored_examples:
    overall_rouge1 = rouge1_weighted_sum / num_scored_examples
    print(f"Overall ROUGE-1 over {num_scored_examples} examples: {overall_rouge1}")

    # Check if this model is better
    if overall_rouge1 > best_rouge1:
        best_rouge1 = overall_rouge1
        model.save_pretrained(best_model_path)  # Save the best model
        print(f"New best model saved with ROUGE-1: {best_rouge1}")


In [None]:
# Push model to Hugging Face
# Authenticate interactively in the notebook, then upload via the Trainer
# created in the training cell.
# NOTE(review): this login runs after training, while TrainingArguments already
# sets push_to_hub=True - presumably credentials are needed earlier unless a
# token is already cached; confirm.
from huggingface_hub import notebook_login
notebook_login()

trainer.push_to_hub()

In [None]:
# Persist the final model and its tokenizer side by side in one directory
final_model_dir = "D:\\Documents\\GitHub\\LLM-Project\\My_Model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_model_dir)

# Report where the best-scoring model lives and the score it reached
print(f"Best model saved at: {best_model_path} with ROUGE-1: {best_rouge1}")

