<a href="https://colab.research.google.com/github/BarbodRE/News_classification/blob/main/News_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers datasets evaluate scikit_learn

In [None]:
import os

# Switch off the Weights & Biases integration through its environment
# variable; this runs before the training libraries are imported below.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import evaluate
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import random

In [None]:
# Load the AG News dataset and print its splits
# (the recorded output shows train: 120000 rows, test: 7600 rows).
dataset = load_dataset("ag_news")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


In [None]:
# Train on a 20,000-example subset: shuffle with a fixed seed so the subset is
# the same on every run, then keep the first 20,000 rows.
small_train = dataset["train"].shuffle(seed=42).select(range(20000))
# Evaluate on the complete test split.
small_test = dataset["test"]

In [None]:
# Checkpoint name shared by the tokenizer here and the model loaded further down.
model_name = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize(batch):
  """Tokenize the ``text`` field of ``batch`` with the notebook's tokenizer.

  Sequences are truncated and padded to ``max_length=128``.
  """
  texts = batch["text"]
  encode_options = dict(padding="max_length", truncation=True, max_length=128)
  return tokenizer(texts, **encode_options)

In [None]:
# Apply `tokenize` in batches to both splits; each name is rebound to its
# tokenized version.
small_train = small_train.map(tokenize, batched=True)
small_test = small_test.map(tokenize, batched=True)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [None]:
# Tokenize every split of the full dataset.
# NOTE(review): in the cells visible here the Trainer is given small_train /
# small_test, not tokenized_datasets — confirm this full pass is still needed.
tokenized_datasets = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

In [None]:
# Drop the raw text column, expose the targets under the name "labels", and
# switch the output format to torch.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")

# NOTE(review): no train/test variables are derived from tokenized_datasets
# here — confirm whether a later cell still consumes it.

In [None]:
# Load the checkpoint with a 4-label classification head; per the recorded
# load message, the classifier weights are newly initialized and need training.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

In [None]:
# Load the four evaluation metrics consumed by compute_metrics, in one pass.
accuracy, precision, recall, f1 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

In [None]:
def compute_metrics(eval_pred):
  """Turn a ``(logits, labels)`` pair into a dict of evaluation scores.

  The predicted class is the argmax over the last axis of the logits.
  Returns accuracy plus macro-averaged precision, recall and F1, keyed
  ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
  """
  logits, labels = eval_pred
  preds = np.argmax(logits, axis=-1)

  scores = {
      "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
  }
  # The remaining metrics share the same call shape and use macro averaging.
  for key, metric in (("precision", precision), ("recall", recall), ("f1", f1)):
    scores[key] = metric.compute(predictions=preds, references=labels, average="macro")[key]
  return scores

In [None]:
# Hyper-parameters and run configuration for the Trainer.
training_args = TrainingArguments(
    output_dir = "./results_news",
    # Evaluate and save a checkpoint once per epoch.
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    # Mixed precision needs a GPU; fp16=True is rejected on CPU-only runtimes,
    # so fall back to full precision there.
    fp16 = torch.cuda.is_available(),
    # Explicit replacement for the deprecated WANDB_DISABLED environment
    # variable: do not report to any external logging integration.
    report_to = "none",
    )

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Bundle the model, run configuration, datasets and metric function.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = small_train,
    eval_dataset = small_test,
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`.
    processing_class = tokenizer,
    compute_metrics = compute_metrics
)

  trainer = Trainer(


In [None]:
# Run fine-tuning, then print the returned training summary
# (the printed heading is Persian for "training summary").
train_result = trainer.train()
print("\n === خلاصه آموزش ===")
print(train_result)



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6892,0.65816,0.846447,0.845653,0.846447,0.844273



 === خلاصه آموزش ===
TrainOutput(global_step=1250, training_loss=0.892452896118164, metrics={'train_runtime': 280.3797, 'train_samples_per_second': 71.332, 'train_steps_per_second': 4.458, 'total_flos': 6356398080000.0, 'train_loss': 0.892452896118164, 'epoch': 1.0})


In [None]:
# Evaluate on the Trainer's eval_dataset (the test split) and print every
# returned metric with four decimal places.
metrics = trainer.evaluate()
print("\n=== متریک‌های نهایی (Test) ===")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")




=== متریک‌های نهایی (Test) ===
eval_loss: 0.6582
eval_accuracy: 0.8464
eval_precision: 0.8457
eval_recall: 0.8464
eval_f1: 0.8443
eval_runtime: 31.2503
eval_samples_per_second: 243.1980
eval_steps_per_second: 15.2000
epoch: 1.0000


In [None]:
# Run the model over the test split, then reduce its outputs to class ids.
preds = trainer.predict(small_test)
y_true = preds.label_ids  # gold labels returned alongside the predictions
y_pred = np.argmax(preds.predictions, axis=-1)  # highest-scoring class per example



In [None]:
# Per-class precision/recall/F1 on the test split.
# NOTE(review): target_names assumes label ids 0-3 map to these names in this
# order — confirm against the dataset's label feature.
print("\n=== گزارش طبقه‌بندی ===")
print(classification_report(y_true, y_pred, target_names=["World", "Sports", "Business", "Sci/Tech"]))


=== گزارش طبقه‌بندی ===
              precision    recall  f1-score   support

       World       0.85      0.88      0.86      1900
      Sports       0.91      0.96      0.93      1900
    Business       0.83      0.70      0.76      1900
    Sci/Tech       0.80      0.84      0.82      1900

    accuracy                           0.85      7600
   macro avg       0.85      0.85      0.84      7600
weighted avg       0.85      0.85      0.84      7600



In [None]:
# Confusion matrix of gold labels (first argument) against predictions.
print("\n===ماتریس سردرگمی===")
print(confusion_matrix(y_true, y_pred))


===ماتریس سردرگمی===
[[1672   88   92   48]
 [  45 1826   17   12]
 [ 169   46 1333  352]
 [  83   51  164 1602]]


In [None]:
print("\n===10 نمونه اشتباه از تست===")
# Positions in the test split where the prediction disagrees with the gold label.
wrong_idx = np.where(y_true != y_pred)[0]
# Use a seeded generator so the same examples are shown on every run, and cap
# the sample size: random.sample raises ValueError when asked for more items
# than are available.
sample_rng = random.Random(42)
samples = sample_rng.sample(wrong_idx.tolist(), min(10, len(wrong_idx)))
for i , idx in enumerate(samples):
  # .tolist() already yields plain Python ints, suitable for dataset indexing.
  text = dataset["test"][idx]["text"][:300]  # show at most 300 characters
  gold = dataset["test"][idx]["label"]
  pred = y_pred[idx]
  print(f"\n --- نمونه{i+1}| gold={gold}|pred={pred} --- \n{text}")


===10 نمونه اشتباه از تست===

 --- نمونه1| gold=0|pred=1 --- 
Vilsack, Dean Jockey for Top DNC Post (AP) AP - Iowa Gov. Tom Vilsack told Democratic leaders on Friday he may seek the party's top job as the jockeying to replace chairman Terry McAuliffe intensified.

 --- نمونه2| gold=3|pred=1 --- 
Rough ride won #39;t stop next X Prize shot The rolling experienced by SpaceShipOne on its first Ansari X Prize flight on Wednesday will not jeopardise the team #39;s chances of winning the \$10 million purse, team members said in a post-flight briefing.

 --- نمونه3| gold=1|pred=2 --- 
HONDA LINKED WITH BAR TAKEOVER BAR #39;s engine partner Honda is believed to be interested in purchasing the Brackley team and a deal could be done within the next 12 months.

 --- نمونه4| gold=2|pred=0 --- 
Nobel Economics Prize Awarded Norwegian-born Finn Kydland and Edward Prescott of the United States won the 2004 Nobel\economics prize, the Royal Swedish Academy of Sciences said on Monday.

 --- نمونه5| gol