**Text Summarization using BART and HuggingFace Transformers with the CNN/DailyMail Dataset**

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

# ✅ Load model and tokenizer
# Both are created from the same checkpoint name so the tokenizer's vocabulary
# matches the model. The model is loaded through AutoModelForSeq2SeqLM.
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# ✅ Preprocess function
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Reads ``examples["article"]`` as the source text and
    ``examples["highlights"]`` as the target text, using the module-level
    ``tokenizer``. Sources are padded/truncated to 512 tokens and targets to
    128 tokens.

    Returns:
        The source encoding with an added ``"labels"`` entry: the target
        token ids, where every padding token id is replaced by -100 so that
        it is ignored in the loss.
    """
    model_inputs = tokenizer(
        examples["article"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    target_encoding = tokenizer(
        examples["highlights"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    # Build the label rows one sequence at a time, masking padding positions.
    masked_labels = []
    for token_ids in target_encoding["input_ids"]:
        row = []
        for token_id in token_ids:
            if token_id == tokenizer.pad_token_id:
                row.append(-100)
            else:
                row.append(token_id)
        masked_labels.append(row)

    model_inputs["labels"] = masked_labels
    return model_inputs

# ✅ Load dataset and preprocess
# Load configuration "3.0.0" of the "cnn_dailymail" dataset.
dataset = load_dataset("cnn_dailymail", "3.0.0")
# Replace the train split with its first 5 rows (presumably to keep this demo
# run short); the other splits are left untouched.
dataset["train"] = dataset["train"].select(range(5))  # Use only 5 samples
# NOTE(review): map() is called on the whole `dataset` object, not only on the
# 5-sample train split, so any other splits it holds are tokenized as well.
# Only tokenized_dataset["train"] is used below — confirm whether tokenizing
# the remaining splits is intended.
tokenized_dataset = dataset.map(preprocess_function, batched=True)
# Return only the three model-input columns, as torch tensors.
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# ✅ DataLoader
# Shuffled batches of 2 examples drawn from the tokenized train split.
train_loader = DataLoader(tokenized_dataset["train"], batch_size=2, shuffle=True)

# ✅ Training setup
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# ✅ Training loop (1 epoch)
# Put the model in training mode before iterating over the batches.
model.train()
for epoch in range(1):
    total_loss = 0
    for batch in tqdm(train_loader):
        # Move every tensor of the batch to the device the model lives on.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch carries "labels", and the loss is read from the outputs.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Reset gradients after the step so they do not carry into the next batch.
        optimizer.zero_grad()
        total_loss += loss.item()
    # The printed value is the SUM of the per-batch losses, not their mean.
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# ✅ Save model
# Model and tokenizer are written to the same directory, which is also the
# default `model_path` that summarize() loads from.
model.save_pretrained("./my_bart_summary_model")
tokenizer.save_pretrained("./my_bart_summary_model")

# ✅ Inference (summary generation)
def summarize(text, model_path="./my_bart_summary_model"):
    """Generate a summary of ``text`` with a saved seq2seq model.

    The model and tokenizer are loaded from ``model_path`` on every call and
    the model is moved to the module-level ``device``.

    Args:
        text: The article text to summarize. It is truncated to at most
            512 tokens before generation.
        model_path: Directory (or identifier) passed to ``from_pretrained``
            for both the model and the tokenizer.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # NOTE: loading inside the function keeps it self-contained, but repeats
    # the load on each call.
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    inputs = tokenizer(
        [text],
        return_tensors="pt",
        max_length=512,
        padding=True,
        truncation=True,
    ).to(device)

    # Pass the attention mask explicitly so generate() does not have to infer
    # it from the input ids.
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_beams=4,
        max_length=142,
        min_length=56,
        length_penalty=2.0,
        no_repeat_ngram_size=3,
        early_stopping=True,
    )
    # Only one text was encoded, so the first sequence is the summary.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# ✅ Test summarization on a real sample
# Use the first article of the raw (untokenized) test split as the input.
sample_text = dataset["test"][0]["article"]
# The labels below were mis-encoded (mojibake); the intended emoji are restored.
print("\n📝 Original:\n", sample_text[:500], "...")  # Show partial article
print("\n📌 Summary:\n", summarize(sample_text))







Epoch 1, Loss: 7.7926

📝 Original:
 (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, includin ...





📌 Summary:
 The Palestinian Authority becomes the 123rd member of the International Criminal Court on Wednesday. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction.
