# Sentiment Analysis with mBERT on Digikala Dataset

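This notebook fine-tunes multilingual BERT (`bert-base-multilingual-cased`) for binary (positive/negative) sentiment classification of Persian product reviews from Digikala, using a class-weighted loss to compensate for label imbalance.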

In [None]:
!pip install -q transformers datasets torch accelerate scikit-learn -U

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.utils.class_weight import compute_class_weight
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings('ignore')

## Load and Prepare Dataset

In [None]:
# Load the Digikala sentiment dataset through `datasets.load_dataset`
DATASET_ID = "ParsiAI/digikala-sentiment-analysis"
dataset = load_dataset(DATASET_ID)

# Print the dataset summary, then one raw training record, before any preprocessing
print("Dataset structure:", dataset, sep="\n")
print("\nFirst training example:", dataset["train"][0], sep="\n")

In [None]:
# Preprocess dataset - derive a binary sentiment label from the review score
def add_label_based_on_score(example):
    """Cast the record's ``Score`` to ``int`` and attach a binary ``label``.

    The record is modified in place and the same object is returned.

    Args:
        example: Mutable mapping holding a ``"Score"`` entry that ``int()`` accepts.

    Returns:
        The same ``example`` object, with ``"Score"`` replaced by its integer
        value and ``"label"`` set to 1 when that value is >= 50, otherwise 0.
    """
    numeric_score = int(example["Score"])
    example["Score"] = numeric_score
    # Scores of 50 and above count as positive (1); anything lower is negative (0)
    example["label"] = int(numeric_score >= 50)
    return example

# Apply the score -> label conversion through `map`
dataset = dataset.map(add_label_based_on_score)

# Check class distribution.
# Read the "label" column directly instead of iterating example by example:
# row iteration materialises every field of each record just to pick one integer.
train_labels = list(dataset["train"]["label"])
num_positive_labels = sum(train_labels)
print("Class distribution in training set:")
print(f"Positive (1): {num_positive_labels} samples")
print(f"Negative (0): {len(train_labels) - num_positive_labels} samples")

## Tokenization and Model Setup

In [None]:
# Initialize tokenizer
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of reviews into fixed-length model inputs.

    Args:
        examples: Batch mapping whose ``"Text"`` entry holds the review strings
            (this function is applied with ``batched=True``).

    Returns:
        The tokenizer output for the batch, truncated and padded to exactly
        128 tokens per example.
    """
    return tokenizer(
        examples["Text"],
        truncation=True,
        # Pad every example to the same fixed length. `padding=True` would pad
        # only to the longest sequence of each map batch, so examples coming
        # from different map batches could end up with different lengths and
        # could not be stacked into one training batch (no padding collator is
        # configured on the trainer).
        padding="max_length",
        max_length=128,
    )

# Tokenize in batches, then expose the target column under the name "labels"
tokenized_datasets = dataset.map(tokenize_function, batched=True).rename_column("label", "labels")

# Return torch tensors, restricted to the model inputs and the target column
MODEL_INPUT_COLUMNS = ["input_ids", "attention_mask", "labels"]
tokenized_datasets.set_format("torch", columns=MODEL_INPUT_COLUMNS)

In [None]:
# Load the pretrained checkpoint with a two-class sequence-classification head
NUM_LABELS = 2
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS)

## Training Setup with Class Weighting

In [None]:
# Calculate class weights for imbalanced data.
# The positives are counted from the "label" column produced during preprocessing
# (label == 1 exactly when Score >= 50), which avoids repeating the threshold here
# and avoids decoding every record row by row.
total = len(dataset["train"])
num_positive = int(sum(dataset["train"]["label"]))
num_negative = total - num_positive

# A missing class would otherwise surface as an opaque ZeroDivisionError below.
if num_positive == 0 or num_negative == 0:
    raise ValueError(
        "Cannot compute class weights: the training split contains only one class "
        f"(positive={num_positive}, negative={num_negative})."
    )

# Balanced weighting: weight_c = total / (n_classes * count_c), with n_classes = 2,
# so the rarer class receives the larger weight.
weight_negative = total / (2.0 * num_negative)
weight_positive = total / (2.0 * num_positive)

print(f"Class weights - Negative: {weight_negative:.4f}, Positive: {weight_positive:.4f}")

In [None]:
# Custom trainer with weighted loss
class WeightedTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    The per-class weights are read from the module-level ``weight_negative``
    (class 0) and ``weight_positive`` (class 1) values, which must be defined
    before the first training or evaluation step.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model to run; it may be a wrapper around the underlying
                model rather than the model itself.
            inputs: Batch mapping containing ``"labels"`` plus the model inputs.
                ``"labels"`` is removed from the mapping before the forward
                pass so the model does not compute its own loss.
            return_outputs: When true, also return the model outputs.
            **kwargs: Extra arguments supplied by the caller; accepted and ignored.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Place the weights on the device of the logits instead of `model.device`:
        # a wrapped model (e.g. nn.DataParallel on multi-GPU machines) does not
        # expose a `.device` attribute, whereas the logits always have a device.
        class_weights = torch.tensor(
            [weight_negative, weight_positive],
            dtype=torch.float32,
            device=logits.device,
        )
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Evaluation metrics
def compute_metrics(eval_pred):
    """Score a set of predictions with accuracy and weighted F1.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row is
            the arg-max of ``logits`` over its last axis.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"`` (F1 uses ``average="weighted"``).
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted, average="weighted")
    return metrics

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./mbert_digikala_final",
    # Evaluate and checkpoint on the same schedule so that the best checkpoint
    # (highest "f1" as reported by compute_metrics) can be restored at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    warmup_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Enable fp16 mixed precision only when a CUDA device is present; requesting
    # it unconditionally makes this cell fail on CPU-only machines.
    fp16=torch.cuda.is_available(),
    seed=42,
    logging_steps=20,
    report_to="none"
)

# Initialize trainer
# NOTE(review): assumes the dataset provides a "validation" split — confirm.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

## Model Training

In [None]:
# Run fine-tuning; the returned training summary is displayed as the cell output
print("Starting training...")
train_result = trainer.train()
train_result

## Model Evaluation and Testing

In [None]:
# Load best model
best_checkpoint = trainer.state.best_model_checkpoint
if best_checkpoint is not None:
    best_model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint)
else:
    # No checkpoint was recorded as best (for example, training stopped before
    # the first evaluation). Use the model the trainer currently holds instead
    # of crashing on `from_pretrained(None)`.
    best_model = trainer.model
best_model.eval()
best_model.to(trainer.args.device)

# Base model for comparison
# NOTE(review): this is the pretrained checkpoint loaded with num_labels=2 and no
# fine-tuning; presumably its classification head is untrained, so its predictions
# are only a "before training" baseline — confirm.
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
base_model.eval()
base_model.to(trainer.args.device)

if best_checkpoint is not None:
    print(f"Best model loaded from: {best_checkpoint}")
else:
    print("No best checkpoint recorded; using the model currently held by the trainer.")

In [None]:
# Prediction function
def predict_sentiment(text, model):
    """Classify one review, or a batch of reviews, as positive or negative.

    Args:
        text: A single string, or an iterable of strings for batch prediction.
        model: Sequence-classification model exposing ``.device`` and returning
            an output with ``.logits``; class index 1 is treated as positive.

    Returns:
        For a single string: ``"مثبت"`` (positive) when the predicted class is 1,
        otherwise ``"منفی"`` (negative). For an iterable of strings: a list of
        those labels in input order (an empty list for an empty iterable).
    """
    is_single = isinstance(text, str)
    batch = [text] if is_single else list(text)
    if not batch:
        # Nothing to tokenize; avoid calling the tokenizer with an empty batch
        return []

    inputs = tokenizer(
        batch,
        return_tensors="pt",
        truncation=True,
        padding=True,  # pads to the longest text in this call
        max_length=128
    )
    # Move every input tensor to the device the model lives on
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
        predictions = torch.argmax(logits, dim=-1).tolist()

    sentiments = ["مثبت" if prediction == 1 else "منفی" for prediction in predictions]
    return sentiments[0] if is_single else sentiments

In [None]:
# Test the model: run hand-written reviews through both the base and the best model
divider = "=" * 50
print("\n" + divider)
print("مدل تحلیل احساسات - تست نهایی")
print(divider)

# Short Persian review snippets used as a qualitative check
test_sentences = [
    "عالی بود خیلی خوشم اومد",
    "کیفیت پایین و صدا زیاد",
    "بهترین خرید زندگیم",
    "خیلی بد بود",
    "معمولی بود",
    "عالی عالی عالی",
    "افتضاح بود",
    "قیمت مناسب و کیفیت خوب",
    "پشتیبانی ضعیف",
    "راضی کننده"
]

print("\nنتایج پیش‌بینی:")
print("-" * 60)
for sentence in test_sentences:
    # Same text through both models (base first), so the outputs read side by side
    base_pred, fine_tuned_pred = (
        predict_sentiment(sentence, candidate) for candidate in (base_model, best_model)
    )
    print(
        f"\nمتن: '{sentence}'",
        f"  مدل پایه: {base_pred}",
        f"  مدل آموزش‌دیده: {fine_tuned_pred}",
        sep="\n",
    )

print("\n" + divider)

## Save the Model

In [None]:
# Write the fine-tuned model and then the tokenizer into the same directory
EXPORT_DIR = "./best_mbert_sentiment_model"
for artifact in (best_model, tokenizer):
    artifact.save_pretrained(EXPORT_DIR)

print("مدل با موفقیت ذخیره شد!")