In [1]:
from datasets import load_dataset

ds = load_dataset("abisee/cnn_dailymail", "3.0.0")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")

def preprocess_data(examples):
    inputs = tokenizer(examples['article'], max_length=512, truncation=True, padding='max_length')
    targets = tokenizer(examples['highlights'], max_length=120, truncation=True, padding='max_length')
    return {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask'], 'labels': targets['input_ids']}

train_ds = ds['train'].map(preprocess_data, batched=True)
validation_ds = ds['validation'].map(preprocess_data, batched=True)
test_ds = ds['test'].map(preprocess_data, batched=True)


Map: 100%|██████████| 287113/287113 [02:58<00:00, 1607.62 examples/s]
Map: 100%|██████████| 13368/13368 [00:08<00:00, 1506.96 examples/s]
Map: 100%|██████████| 11490/11490 [00:07<00:00, 1631.79 examples/s]


In [13]:
train_ds = train_ds.remove_columns(['article', 'highlights', 'id'])
validation_ds = validation_ds.remove_columns(['article', 'highlights', 'id'])
test_ds = test_ds.remove_columns(['article', 'highlights', 'id'])


# Then set format to torch
train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
validation_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [14]:
print(train_ds)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 287113
})


In [16]:
train_ds.save_to_disk("brevity_date2/train")
validation_ds.save_to_disk("brevity_date2/validation")
test_ds.save_to_disk("brevity_date2/test")

Saving the dataset (3/3 shards): 100%|██████████| 287113/287113 [00:05<00:00, 49157.97 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 13368/13368 [00:00<00:00, 85172.98 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 11490/11490 [00:00<00:00, 56863.63 examples/s]


In [2]:
from datasets import load_from_disk

train_ds = load_from_disk("brevity_date2/train")
validation_ds = load_from_disk("brevity_date2/validation")
test_ds = load_from_disk("brevity_date2/test")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print(train_ds)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 287113
})


In [None]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")

print("Loaded DistilBART model to runtime")

Loaded DistilBART model to runtime


In [20]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

print(f"Loaded DistilBART model to {device}")

Loaded DistilBART model to cuda


In [21]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./distil_summarizer2",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_eval_batch_size=5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,
    bf16=False,
    fp16=False,
    save_steps=1000,
    eval_steps=1000,
    optim="adafactor",
    gradient_checkpointing=True,
)

print("✅ TrainingArguments set up and ready.")


✅ TrainingArguments set up and ready.


In [1]:
import torch
import gc

gc.collect()              # Garbage collection
torch.cuda.empty_cache()  # Clear GPU cache

In [23]:
from transformers import Trainer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, AutoTokenizer
from datasets import load_from_disk
import torch

tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

train_ds = train_ds.select(range(10000))
validation_ds = validation_ds.select(range(2000))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=validation_ds,
    data_collator=data_collator,
    tokenizer=tokenizer 
)

trainer.train()

  trainer = Trainer(
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss
1,1.2094,0.850927




TrainOutput(global_step=624, training_loss=1.103832917335706, metrics={'train_runtime': 13255.0989, 'train_samples_per_second': 1.509, 'train_steps_per_second': 0.047, 'total_flos': 1.5441970964987904e+16, 'train_loss': 1.103832917335706, 'epoch': 1.9952})

In [24]:
model.save_pretrained("./brevity_small_stage2")

In [25]:
tokenizer.save_pretrained("./brevity_small_stage2")

('./brevity_small_stage2\\tokenizer_config.json',
 './brevity_small_stage2\\special_tokens_map.json',
 './brevity_small_stage2\\vocab.json',
 './brevity_small_stage2\\merges.txt',
 './brevity_small_stage2\\added_tokens.json',
 './brevity_small_stage2\\tokenizer.json')

In [2]:
from transformers import Trainer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, AutoTokenizer, TrainingArguments
from datasets import load_from_disk

model = AutoModelForSeq2SeqLM.from_pretrained("./brevity_small_stage2")
tokenizer = AutoTokenizer.from_pretrained("./brevity_small_stage2")

train_ds = load_from_disk("brevity_date2/train")
validation_ds = load_from_disk("brevity_date2/validation")
train_ds = train_ds.select(range(10000, 20000))
validation_ds = validation_ds.select(range(2000, 4000))

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


training_args = TrainingArguments(
    output_dir="./distil_summarizer2-p2",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_eval_batch_size=5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs_stage3",
    logging_steps=500,
    save_total_limit=2,
    bf16=False,
    fp16=False,
    save_steps=1000,
    eval_steps=1000,
    optim="adafactor",
    gradient_checkpointing=True,
)

from transformers import Trainer, DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=validation_ds,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

  from .autonotebook import tqdm as notebook_tqdm
  trainer = Trainer(
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss
1,0.7483,0.939475




TrainOutput(global_step=624, training_loss=0.7255076139401166, metrics={'train_runtime': 13496.545, 'train_samples_per_second': 1.482, 'train_steps_per_second': 0.046, 'total_flos': 1.5441970964987904e+16, 'train_loss': 0.7255076139401166, 'epoch': 1.9952})

In [3]:
model.save_pretrained("./brevity_small_stage3")
tokenizer.save_pretrained("./brevity_small_stage3")

('./brevity_small_stage3\\tokenizer_config.json',
 './brevity_small_stage3\\special_tokens_map.json',
 './brevity_small_stage3\\vocab.json',
 './brevity_small_stage3\\merges.txt',
 './brevity_small_stage3\\added_tokens.json',
 './brevity_small_stage3\\tokenizer.json')

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model = AutoModelForSeq2SeqLM.from_pretrained("./brevity_small_stage2")
tokenizer = AutoTokenizer.from_pretrained("./brevity_small_stage2")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

  from .autonotebook import tqdm as notebook_tqdm


BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [8]:
with open('sample.txt', 'r', encoding='utf-8') as f:
    text = f.read()

inputs = tokenizer(
    text,
    return_tensors="pt",
    max_length=512,
    truncation=True,
    padding="max_length"
).to(device)

In [9]:
with torch.no_grad():
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        min_length=50,
        num_beams=4,
        early_stopping=True,
        length_penalty=1.2,
        no_repeat_ngram_size=3
    )

summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("\nSummary:\n", summary)



Summary:
 The concept of sustainability has gained increasing importance over the last few decades .
Sustainability involves the responsible use of resources such as water and energy .
It is essential to create policies that support long-term goals, such as sustainability .
