**Importing necessary libraries such as os, datasets, and Hugging Face components like AutoModelForSeq2SeqLM, Seq2SeqTrainer, and other utilities needed for model training and evaluation.**

In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from datasets import load_from_disk
from transformers import (
    AutoModelForSeq2SeqLM, AutoTokenizer,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq, GenerationConfig
)

from src.train import get_training_args
from src.evaluate import build_compute_metrics

**Loading the pre-saved train, validation, and test datasets from disk and printing their sizes to verify proper loading.**

In [2]:
print("Loading datasets...")
train_dataset = load_from_disk("../data/train")
val_dataset = load_from_disk("../data/val")
test_dataset = load_from_disk("../data/test")

print(f"Dataset sizes - Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")

Loading datasets...
Dataset sizes - Train: 7000, Val: 1500, Test: 1500


**Loading the facebook/bart-base model and tokenizer using the transformers library, and moving the model to the CPU.**

In [3]:
model_name = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
import torch
device = torch.device("cpu")
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_lay

**Setting generation parameters for the model (e.g., max token length, beam search, repetition penalty) and assigning it to the model’s generation_config.**

In [4]:
gen_config = GenerationConfig(
    max_new_tokens=64,
    early_stopping=True,
    num_beams=4,
    no_repeat_ngram_size=3,
    forced_bos_token_id=0,
    decoder_start_token_id=tokenizer.bos_token_id
)
model.generation_config = gen_config

**Defining training arguments, preparing data collator and metric function, and initializing a Seq2SeqTrainer object with model, datasets, and configurations.**

In [5]:
output_dir = "../outputs/model"
training_args = get_training_args(output_dir)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

compute_metrics = build_compute_metrics(tokenizer)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


**Setting verbosity and starting training the model using trainer.train(), displaying metrics such as loss and ROUGE scores for each epoch.**

In [6]:
from transformers.utils import logging
logging.set_verbosity_info()
print("Starting training with progress bars...")

trainer.train()

***** Running training *****
  Num examples = 7,000
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 10,500
  Number of trainable parameters = 139,420,416


Starting training with progress bars...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.6972,0.54025,37.84,15.7,25.71,34.88
2,0.4856,0.528412,37.74,15.91,26.0,34.88
3,0.4025,0.530186,38.36,16.38,26.34,35.43



***** Running Evaluation *****
  Num examples = 1500
  Batch size = 2
Saving model checkpoint to ../outputs/model/checkpoint-3500
Configuration saved in ../outputs/model/checkpoint-3500/config.json
Configuration saved in ../outputs/model/checkpoint-3500/generation_config.json
Model weights saved in ../outputs/model/checkpoint-3500/model.safetensors
tokenizer config file saved in ../outputs/model/checkpoint-3500/tokenizer_config.json
Special tokens file saved in ../outputs/model/checkpoint-3500/special_tokens_map.json

***** Running Evaluation *****
  Num examples = 1500
  Batch size = 2
Saving model checkpoint to ../outputs/model/checkpoint-7000
Configuration saved in ../outputs/model/checkpoint-7000/config.json
Configuration saved in ../outputs/model/checkpoint-7000/generation_config.json
Model weights saved in ../outputs/model/checkpoint-7000/model.safetensors
tokenizer config file saved in ../outputs/model/checkpoint-7000/tokenizer_config.json
Special tokens file saved in ../output

TrainOutput(global_step=10500, training_loss=0.5284185965401785, metrics={'train_runtime': 2148.9134, 'train_samples_per_second': 9.772, 'train_steps_per_second': 4.886, 'total_flos': 1.280446562304e+16, 'train_loss': 0.5284185965401785, 'epoch': 3.0})

**Saving the trained model and tokenizer to the specified output directory for later use or deployment.**

In [7]:
print("Saving model and tokenizer...")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print("Done.")

Configuration saved in ../outputs/model/config.json
Configuration saved in ../outputs/model/generation_config.json


Saving model and tokenizer...


Model weights saved in ../outputs/model/model.safetensors
tokenizer config file saved in ../outputs/model/tokenizer_config.json
Special tokens file saved in ../outputs/model/special_tokens_map.json


Done.
