In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, EarlyStoppingCallback
import evaluate
import torch
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np

In [5]:
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

True
NVIDIA GeForce RTX 3060 Laptop GPU


In [None]:
root = "dataset"

cnn = load_dataset(
    "parquet",
    data_files={
        "train":      f"{root}/cnn/train/*.parquet",
        "validation": f"{root}/cnn/validation/*.parquet",
        "test":       f"{root}/cnn/test/*.parquet", 
    }
)
train_dataset = cnn["train"]
val_dataset   = cnn["validation"].select(range(2000))
test_dataset  = cnn["test"].select(range(1500))

In [7]:
model_name = "facebook/bart-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [8]:
max_input_length = 768
max_target_length = 256

def preprocess(batch):
    model_inputs = tokenizer(
        batch["article"],
        max_length=max_input_length,
        padding="max_length",
        truncation=True
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["highlights"],
            max_length=max_target_length,
            padding="max_length",
            truncation=True
        )

    labels["input_ids"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in ids]
        for ids in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_tokenized = train_dataset.map(preprocess, batched=True, remove_columns=train_dataset.column_names)
val_tokenized = val_dataset.map(preprocess, batched=True, remove_columns=val_dataset.column_names)

In [9]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    preds, labels = eval_pred

    # ambil preds dari tuple jika perlu
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.clip(preds, a_min=0, a_max=None)

    # ubah -100 ke pad_token_id supaya bisa didecode
    labels = np.where(labels == -100, tokenizer.pad_token_id, labels)

    # decode
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # clean
    decoded_preds = [p.strip() for p in decoded_preds]
    decoded_labels = [l.strip() for l in decoded_labels]

    # hitung ROUGE
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # ubah skala ke persen (opsional)
    result = {k: round(v * 100, 4) for k, v in result.items()}

    return result

In [10]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, use_cache=False)
model.gradient_checkpointing_enable()

output_dir = "./bart_cnn_finetuned"

training_args = Seq2SeqTrainingArguments(
    output_dir="./bart_cnn",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=6,
    weight_decay=0.01,
    predict_with_generate=True,
    generation_max_length=256,
    fp16=True,
    optim="adafactor",
    load_best_model_at_end=True,
    metric_for_best_model="eval_rougeL",
    greater_is_better=True,
    save_total_limit=2
)

In [11]:
import transformers
print(transformers.__version__)

4.57.3


In [12]:
from transformers import Seq2SeqTrainingArguments
import inspect
print(inspect.signature(Seq2SeqTrainingArguments))



In [13]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

  trainer = Seq2SeqTrainer(


In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.0195,1.878123,35.8257,15.5966,25.9,32.9839
2,1.8338,1.848299,35.5676,15.469,25.7611,32.9046
3,1.7044,1.852839,36.4452,15.9841,26.1842,33.7074
4,1.6282,1.843191,37.1053,16.154,26.3779,34.2624
5,1.5449,1.853586,36.8057,15.8408,26.1945,33.9842
6,1.4886,1.860694,36.5948,15.6737,26.0261,33.7816


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=71784, training_loss=1.72105765207202, metrics={'train_runtime': 52027.7841, 'train_samples_per_second': 11.037, 'train_steps_per_second': 1.38, 'total_flos': 2.625967248113664e+17, 'train_loss': 1.72105765207202, 'epoch': 6.0})

In [15]:
preds = []
refs  = []

for sample in test_dataset:
    text   = sample["article"]
    target = sample["highlights"]

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=1024
    ).to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_length=128,
            num_beams=4
        )

    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    preds.append(summary)
    refs.append(target)

scores = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
print(scores)

{'rouge1': 0.35981904833861045, 'rouge2': 0.15595646040273925, 'rougeL': 0.2621508398425323, 'rougeLsum': 0.3320407701281053}


In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

checkpoint_path = "./bart_cnn/checkpoint-71784"
save_path = "./BART_NewsSummarizer"

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("Model saved to:", save_path)

  from .autonotebook import tqdm as notebook_tqdm


Model saved to: ./BART_NewsSummarizer
