In [None]:
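# Uncomment to install the dependencies (use %pip so packages land in the running kernel's environment):
# %pip install transformers datasets torch tensorflow scikit-learn numpy pandas onnx onnx-tf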

In [None]:
# Import required libraries
import pandas as pd
import torch
from transformers import MobileBertTokenizer, MobileBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import classification_report
import numpy as np
import tensorflow as tf
import onnx
# from onnx_tf.backend import prepare

In [None]:
# Number of target classes: 27 emotions plus "neutral"
NUM_CLASSES = 28

# Fetch the "simplified" configuration of the GoEmotions dataset
dataset = load_dataset("go_emotions", "simplified")

In [4]:
# Names of the 28 classes (27 emotions followed by "neutral").
# NOTE(review): presumably listed in the dataset's label-id order — confirm against the dataset features.
CLASS_NAMES = (
    "admiration amusement anger annoyance approval caring confusion "
    "curiosity desire disappointment disapproval disgust embarrassment "
    "excitement fear gratitude grief joy love nervousness optimism "
    "pride realization relief remorse sadness surprise neutral"
).split()

In [5]:
# Collapse the multi-label annotation into a single class id
def preprocess_labels(example):
    """Add a single-label ``"label"`` field derived from ``example["labels"]``.

    When ``example["labels"]`` is a list, its first element is used; any
    other value is copied through unchanged. The example is modified in
    place and the same object is returned.
    """
    raw_labels = example["labels"]
    if isinstance(raw_labels, list):
        # Keep only the first annotated label
        example["label"] = raw_labels[0]
    else:
        example["label"] = raw_labels
    return example


In [None]:
# Attach the single-label "label" column to every example
dataset = dataset.map(function=preprocess_labels)

In [None]:
# Tokenizer matching the pretrained "google/mobilebert-uncased" checkpoint
tokenizer = MobileBertTokenizer.from_pretrained(
    "google/mobilebert-uncased"
)

In [8]:
# Tokenization helper: fixed-length (128) encodings
def tokenize_function(examples):
    """Tokenize ``examples["text"]`` with the notebook-level ``tokenizer``.

    Inputs are truncated and padded to a fixed length of 128, and the
    tokenizer's output is returned as-is.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=128, padding="max_length")
    return encoded


In [None]:
# Run the tokenizer over the dataset in batches
tokenized_datasets = dataset.map(batched=True, function=tokenize_function)

In [10]:
# Drop the original multi-label 'labels' column and the 'id' column
tokenized_datasets = tokenized_datasets.remove_columns(["labels", "id"])
# Expose only the model inputs and the single-label target, as PyTorch tensors
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

In [11]:
# Pull out the "train", "validation" and "test" splits
train_dataset, eval_dataset, test_dataset = (
    tokenized_datasets[split_name]
    for split_name in ("train", "validation", "test")
)

In [None]:
# Load MobileBERT model for sequence classification with 28 classes.
# The id<->name mappings built from CLASS_NAMES are stored in the model config,
# so the saved model reports emotion names instead of generic "LABEL_n" ids.
# NOTE(review): assumes CLASS_NAMES is ordered by the dataset's label ids — confirm.
model = MobileBertForSequenceClassification.from_pretrained(
    "google/mobilebert-uncased",
    num_labels=NUM_CLASSES,
    id2label=dict(enumerate(CLASS_NAMES)),
    label2id={name: idx for idx, name in enumerate(CLASS_NAMES)},
)

In [16]:
# Training configuration for the Trainer
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./mobilebert_goemotions",
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",  # no external experiment tracker (e.g. W&B)
    # Optimisation schedule
    num_train_epochs=5,
    learning_rate=1e-5,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; model selection is keyed on eval_loss
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

In [17]:
# Compute evaluation metrics
def compute_metrics(pred):
    """Summarise predictions as accuracy, macro F1 and weighted F1.

    Args:
        pred: Object exposing ``label_ids`` (the true class ids) and
            ``predictions`` (per-example scores; the predicted class is the
            argmax over the last axis).

    Returns:
        dict with keys ``"accuracy"``, ``"macro_f1"`` and ``"weighted_f1"``,
        all taken from scikit-learn's classification report.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 scores classes that are never predicted as 0 without
    # emitting an UndefinedMetricWarning on every evaluation pass.
    report = classification_report(labels, preds, output_dict=True, zero_division=0)
    return {
        "accuracy": report["accuracy"],
        "macro_f1": report["macro avg"]["f1-score"],
        "weighted_f1": report["weighted avg"]["f1-score"],
    }

In [18]:
# Wire the model, training configuration, data splits and metrics into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Run fine-tuning with the Trainer configured above
trainer.train()

In [None]:
# Score the model on the held-out test split and show the metrics
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test Results:", test_results)

In [None]:
# Save the fine-tuned model and tokenizer once, into a single export directory.
# (This save previously appeared in two identical consecutive cells.)
FINAL_MODEL_DIR = "./mobilebert_goemotions_final"
model.save_pretrained(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)