<a href="https://colab.research.google.com/github/AliKarimNemati/digikala-sentiment-mbert-finetune/blob/main/digikala_sentiment_mbert_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers datasets torch accelerate scikit-learn -U

In [22]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.utils.class_weight import compute_class_weight
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# لود دیتاست
dataset = load_dataset("ParsiAI/digikala-sentiment-analysis")

# Score و اضافه کردن label
def add_label_base_score(db):
    score = db["Score"]
    db["Score"] = int(score)
    db["label"] = 1 if db["Score"] >= 50 else 0
    return db

dataset = dataset.map(add_label_base_score)

model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_func(db):
    return tokenizer(db["Text"], truncation=True, padding=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_func, batched=True)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir="./mbert_digikala_final",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    warmup_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=True,
    seed=42,
    logging_steps=20,
    report_to="none"
)

num_positive = sum(1 for ex in dataset["train"] if ex["Score"] >= 60)
num_negative = len(dataset["train"]) - num_positive
total = len(dataset["train"])

weight_negative = total / (2.0 * num_negative)
weight_positive = total / (2.0 * num_positive)

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):  # ← مهم!
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = nn.CrossEntropyLoss(
            weight=torch.tensor([weight_negative, weight_positive]).to(model.device)
        )
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

#آموزش مدل
trainer.train()

best_checkpoint = trainer.state.best_model_checkpoint
best_model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint)
best_model.eval()
best_model.to(trainer.args.device)

base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
base_model.eval()
base_model.to(trainer.args.device)

def predict(text, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
        pred = torch.argmax(logits, dim=-1).item()
    return "مثبت" if pred == 1 else "منفی"

# تست نهایی
print("\nتست مدل:")
test_sentences = [
    "عالی بود خیلی خوشم اومد",
    "کیفیت پایین و صدا زیاد",
    "بهترین خرید زندگیم",
    "خیلی بد بود",
    "معمولی بود"
]


for sentence in test_sentences:
    pred_base = predict(sentence, base_model)
    pred_ft   = predict(sentence, best_model)
    print(f"{sentence}:\n\t Pred_Base:{pred_base}\n\t Pred:{pred_ft}")



Map:   0%|          | 0/2282 [00:00<?, ? examples/s]

Map:   0%|          | 0/489 [00:00<?, ? examples/s]

Map:   0%|          | 0/490 [00:00<?, ? examples/s]

Map:   0%|          | 0/2282 [00:00<?, ? examples/s]

Map:   0%|          | 0/489 [00:00<?, ? examples/s]

Map:   0%|          | 0/490 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6606,0.601246,0.891616,0.854297
2,0.5467,0.621298,0.883436,0.854555
3,0.4175,0.525145,0.860941,0.86876
4,0.3691,0.6746,0.891616,0.887141


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



تست مدل:
عالی بود خیلی خوشم اومد:
	 Pred_Base:منفی
	 Pred:مثبت
کیفیت پایین و صدا زیاد:
	 Pred_Base:مثبت
	 Pred:منفی
بهترین خرید زندگیم:
	 Pred_Base:منفی
	 Pred:مثبت
خیلی بد بود:
	 Pred_Base:مثبت
	 Pred:مثبت
معمولی بود:
	 Pred_Base:مثبت
	 Pred:منفی
