# Load Dataset

In [2]:
from datasets import load_dataset

# Load the "sh0416/ag_news" dataset and print a summary of each split
# (column names and row count) as a quick sanity check.
dataset = load_dataset("sh0416/ag_news")
for split_name in ("train", "test"):
    print(dataset[split_name])

Dataset({
    features: ['label', 'title', 'description'],
    num_rows: 120000
})
Dataset({
    features: ['label', 'title', 'description'],
    num_rows: 7600
})


In [3]:
# Per its dataset card, sh0416/ag_news numbers its four classes 1..4, while a
# classifier built with num_labels=4 expects class ids 0..3. Shift every
# label down by one so all four classes stay distinct. (Clamping 4 -> 3
# instead would merge classes 3 and 4 and leave class 0 without any examples.)
dataset = dataset.map(lambda example: {"label": example["label"] - 1}, batched=False)

# Fail fast if the labels are not exactly 0..3, e.g. because this cell was
# run twice without reloading the dataset (which would shift the labels again).
for split_name in ("train", "test"):
    found_labels = sorted(dataset[split_name].unique("label"))
    assert found_labels == [0, 1, 2, 3], (
        f"unexpected label ids in {split_name!r} split: {found_labels}"
    )

# Tokenization & Preprocessing

In [4]:
from transformers import BertTokenizer

In [5]:
# Tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)

In [6]:
def tokenize(batch):
    """Tokenize a batch of news items.

    Each item's title and description are joined with a single space and the
    resulting strings are passed to the module-level ``tokenizer``, padded
    and truncated to exactly 128 tokens.

    Args:
        batch: Mapping with parallel ``"title"`` and ``"description"``
            sequences of strings.

    Returns:
        Whatever ``tokenizer`` returns for the joined texts.
    """
    joined_texts = []
    for title, description in zip(batch["title"], batch["description"]):
        joined_texts.append(title + " " + description)

    return tokenizer(
        joined_texts,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

In [7]:
# Tokenize every split in batches, then rename "label" to "labels".
encoded_dataset = (
    dataset
    .map(tokenize, batched=True)
    .rename_column("label", "labels")
)

# Return only the listed columns, formatted as torch tensors.
encoded_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Model Initialization

In [8]:
from transformers import BertForSequenceClassification

In [9]:
# BERT encoder from the "bert-base-uncased" checkpoint with a 4-way
# classification head on top.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
    num_labels=4,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Model Training

In [10]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

In [11]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: An object that unpacks into ``(logits, labels)``. The
            predicted class of each row is the arg-max of ``logits`` along
            axis 1.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` where ``f1`` is computed with
        ``average="weighted"``.
    """
    scores, references = eval_pred

    # Index of the largest logit in each row is the predicted class id.
    predicted = np.argmax(scores, axis=1)

    accuracy = accuracy_score(references, predicted)
    weighted_f1 = f1_score(references, predicted, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

In [12]:
# Hyper-parameters for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # Where outputs and logs are written.
    output_dir="./news_classifier",
    logging_dir="./logs",
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    # Batch sizes (per device) for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Fine-Tune BERT

In [None]:
import inspect

# Recent transformers releases deprecate Trainer's `tokenizer=` argument in
# favour of `processing_class=`; older releases only know `tokenizer=`.
# Use whichever keyword the installed version accepts so this cell neither
# emits the deprecation warning nor breaks once the old name is removed.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Bundle the model, training arguments, train/test splits and the metric
# callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Run fine-tuning; as the cell's last expression, the returned training
# summary is displayed as the cell output.
trainer.train()

  trainer=Trainer(


Step,Training Loss


# Evaluation

In [None]:
# Evaluate with the trainer (its eval_dataset is the "test" split and its
# metric callback is compute_metrics) and print the returned metrics.
results=trainer.evaluate()
print(results)

# Save Model

In [None]:
# Save the model and the tokenizer into the same directory so both can
# later be loaded from that one path.
model.save_pretrained("./bert_news_model")
tokenizer.save_pretrained("./bert_news_model")