In [1]:
from datasets import load_dataset
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, DataCollatorForSeq2Seq, Trainer, TrainingArguments

# Fetch the arXiv summarization corpus from the Hugging Face Hub and show
# which columns the training split exposes.
dataset = load_dataset(path="ccdv/arxiv-summarization")
train_columns = dataset["train"].column_names
print("Train split columns:", train_columns)


Train split columns: ['article', 'abstract']


In [2]:
# Token budgets used when truncating inputs (articles) and targets (abstracts).
max_input_length = 1024  # PEGASUS max input tokens
max_target_length = 256  # cap on abstract length, in tokens

# Checkpoint shared by the tokenizer and the model.
model_name = "google/pegasus-large"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
def preprocess_function(article, abstract, idx=None):
    """Tokenize articles and abstracts into model inputs and labels.

    Sequences are truncated but deliberately NOT padded here. Padding to
    ``max_length`` at this stage would store pad-token ids inside ``labels``;
    ``DataCollatorForSeq2Seq`` only writes its ignore index (-100 by default)
    into positions that it pads itself, so pre-padded label positions would
    be scored by the loss. Leaving sequences at their natural length lets the
    collator pad each batch dynamically and mask label padding correctly.

    Args:
        article: Article text, or a list of texts when called with
            ``batched=True``.
        abstract: Abstract text(s) aligned with ``article``.
        idx: Unused; kept so existing callers that pass it keep working.

    Returns:
        The tokenizer output for ``article`` (truncated to
        ``max_input_length``) with an added ``"labels"`` entry holding the
        abstract token ids (truncated to ``max_target_length``).
    """
    # Tokenize the article (input); padding is deferred to the data collator.
    inputs = tokenizer(
        article,
        truncation=True,
        max_length=max_input_length,
    )
    # Tokenize the abstract (target); no padding, so no pad ids end up in labels.
    targets = tokenizer(
        abstract,
        truncation=True,
        max_length=max_target_length,
    )
    inputs["labels"] = targets["input_ids"]
    return inputs


In [4]:
# Raw text columns that are passed to preprocess_function and dropped from
# the tokenized output.
text_columns = ["article", "abstract"]

# Tokenize every split with the same settings instead of repeating the call.
tokenized_splits = {}
for split_name in ("train", "validation", "test"):
    print(f"🔄 Tokenizing {split_name} split...")
    tokenized_splits[split_name] = dataset[split_name].map(
        preprocess_function,
        batched=True,
        input_columns=text_columns,
        remove_columns=text_columns,
    )

# Keep the per-split names that the later cells refer to.
tokenized_train = tokenized_splits["train"]
tokenized_val = tokenized_splits["validation"]
tokenized_test = tokenized_splits["test"]


🔄 Tokenizing train split...


Map:   0%|          | 0/203037 [00:00<?, ? examples/s]

🔄 Tokenizing validation split...


Map:   0%|          | 0/6436 [00:00<?, ? examples/s]

🔄 Tokenizing test split...


Map:   0%|          | 0/6440 [00:00<?, ? examples/s]

In [5]:
# Write each tokenized split to its own directory under tokenized2/.
for split_dataset, target_dir in (
    (tokenized_train, "tokenized2/tokenized_train"),
    (tokenized_val, "tokenized2/tokenized_val"),
    (tokenized_test, "tokenized2/tokenized_test"),
):
    split_dataset.save_to_disk(target_dir)

print("Tokenized datasets saved locally.")


Saving the dataset (0/3 shards):   0%|          | 0/203037 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6436 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6440 [00:00<?, ? examples/s]

Tokenized datasets saved locally.


In [6]:
# Batch collator for the seq2seq model, built from the tokenizer and model
# loaded above.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)


In [8]:
# NOTE: constructing TrainingArguments for PyTorch requires the `accelerate`
# package (>= 0.26.0 for recent transformers releases); install it in the
# kernel's environment before running this cell.
training_args = TrainingArguments(
    output_dir="pegasus_arxiv_model2",
    eval_strategy="epoch",           # replaces the deprecated `evaluation_strategy`
    save_strategy="epoch",           # checkpoint on the same schedule as evaluation
    logging_strategy="steps",
    logging_steps=500,
    per_device_train_batch_size=1,   # Adjust based on your GPU's memory
    per_device_eval_batch_size=1,
    num_train_epochs=5,              # Increase epochs for better performance
    save_total_limit=2,              # keep only the two most recent checkpoints
    learning_rate=3e-5,
    weight_decay=0.01,
    # The PEGASUS implementation notes state that FP16 is not supported, so
    # train in full precision. On GPUs with bfloat16 support, `bf16=True` is
    # the mixed-precision option to try instead.
    fp16=False,
)


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [None]:
# Number of leading training examples used for this fine-tuning run.
TRAIN_SUBSET_SIZE = 5000

# Take the first TRAIN_SUBSET_SIZE examples, or the whole split if it is
# smaller, so `select` never receives out-of-range indices.
train_subset = tokenized_train.select(
    range(min(TRAIN_SUBSET_SIZE, len(tokenized_train)))
)
print("Subset of train dataset contains:", len(train_subset), "entries")


In [None]:
from transformers import Trainer

# Collect everything the Trainer needs in one mapping, then build it:
# the model, its training configuration, the train/eval datasets, and the
# tokenizer and collator used for batching.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_subset,
    "eval_dataset": tokenized_val,
    "tokenizer": tokenizer,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_kwargs)


In [None]:
# Report the actual number of training examples rather than a hard-coded
# figure, so the message stays correct when the subset size changes.
print(f"🚀 Starting fine-tuning on {len(trainer.train_dataset):,} training examples...")
trainer.train()

# Save the final fine-tuned model
final_model_dir = "pegasus_arxiv_model2/final_subset"
trainer.save_model(final_model_dir)
print(f"✅ Model saved to {final_model_dir}")
