In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Where each checkpoint is loaded from: a local directory for the fine-tuned
# model, and the identifier of the reference Darija sentiment-analysis model.
ft_model_path = "./fine_tuned_darijaBERT"
pretrained_model_name = "ychafiqui/darija_sentiment_analysis"

# Fine-tuned model: tokenizer and classifier come from the same directory.
ft_tokenizer = AutoTokenizer.from_pretrained(
    ft_model_path
)
ft_model = AutoModelForSequenceClassification.from_pretrained(
    ft_model_path
)

# Reference model: loaded with its own tokenizer so each model sees the
# tokenization it was trained with.
pretrained_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name
)
pretrained_model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name
)

In [None]:
  from datasets import load_dataset

# Read the evaluation examples from the local CSV file; the "test" key names
# the split that is selected right after loading.
test_dataset = load_dataset(
    "csv",
    data_files={"test": "test.csv"},
)["test"]

# Show the first record to check the column names and value types
print(test_dataset[0])  # Example: {'text': 'هذا الفيلم رائع', 'label': 0}

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate_model(model, tokenizer, dataset):
    """Score a sequence-classification model on a labelled dataset.

    Every example is tokenized and classified on its own (no batching). The
    predicted class is the arg-max over the model's logits.

    Args:
        model: Classifier that is called with the tokenizer output as keyword
            arguments and returns an object with a ``.logits`` tensor. It must
            provide ``eval()`` and a ``device`` attribute, as Hugging Face
            ``PreTrainedModel`` instances do.
        tokenizer: Callable that converts a text string into a mapping of
            PyTorch tensors (it is invoked with ``return_tensors="pt"``).
        dataset: Iterable of mappings, each with a ``"text"`` entry to
            classify and a ``"label"`` entry holding the expected class id.

    Returns:
        dict: ``{"accuracy", "precision", "recall", "f1"}`` as Python floats.
        Precision, recall and F1 are computed with ``average="weighted"``.
    """
    # Switch off dropout and similar training-only layers so that the
    # predictions are deterministic even if the model was just trained.
    model.eval()
    # Inputs must live on the same device as the model's weights.
    device = model.device

    all_labels = []
    all_predictions = []

    for example in dataset:
        inputs = tokenizer(example["text"], return_tensors="pt", truncation=True, padding=True, max_length=128)
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        # Gradients are not needed for inference; skipping them saves memory.
        with torch.no_grad():
            logits = model(**inputs).logits

        # One example per forward pass, so row 0 holds its class scores.
        # ``.item()`` yields a plain int and also works for non-CPU tensors.
        predicted_class = logits.argmax(dim=-1)[0].item()

        all_labels.append(example["label"])
        all_predictions.append(predicted_class)

    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average="weighted")
    accuracy = accuracy_score(all_labels, all_predictions)
    return {
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
    }

In [None]:
# NOTE(review): both models are scored against the same integer labels, which
# assumes each model's class-id ordering matches the dataset's — confirm
# against each model's config before comparing the numbers.

# Fine-tuned model, paired with its own tokenizer
ft_metrics = evaluate_model(
    model=ft_model,
    tokenizer=ft_tokenizer,
    dataset=test_dataset,
)
print("Fine-Tuned Model Metrics:", ft_metrics)

# Reference pretrained model on the very same test examples
pretrained_metrics = evaluate_model(
    model=pretrained_model,
    tokenizer=pretrained_tokenizer,
    dataset=test_dataset,
)
print("Pretrained Model Metrics:", pretrained_metrics)