In [1]:
import torch
import numpy as np

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from sklearn.metrics import accuracy_score


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("umar178/UrduMultiDomainClassification")

dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'sentiment', 'topic', 'intent', 'binary'],
        num_rows: 34819
    })
})

In [4]:
set(dataset["train"]["sentiment"])


{'negative', 'neutral', 'positive', 'request'}

In [5]:
dataset = dataset.filter(
    lambda x: x["sentiment"] in ["negative", "neutral", "positive"]
)


Filter: 100%|██████████████████████████████████████████████████████████████████████| 34819/34819 [00:00<00:00, 238039.04 examples/s]


In [6]:
set(dataset["train"]["sentiment"])


{'negative', 'neutral', 'positive'}

In [7]:
label2id = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}

id2label = {v: k for k, v in label2id.items()}


In [8]:
def encode_labels(example):
    example["labels"] = label2id[example["sentiment"]]
    return example

dataset = dataset.map(encode_labels)


Map: 100%|██████████████████████████████████████████████████████████████████████████| 34631/34631 [00:01<00:00, 23512.16 examples/s]


In [9]:
# Remove unused columns
dataset = dataset.remove_columns(["sentiment", "topic", "intent", "binary"])


In [10]:
dataset["train"].features


{'text': Value('string'), 'labels': Value('int64')}

In [11]:
set(dataset["train"]["labels"])


{0, 1, 2}

In [13]:
# Load tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")


In [14]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

tokenized = dataset.map(tokenize_function, batched=True)
tokenized.set_format("torch")


Map: 100%|██████████████████████████████████████████████████████████████████████████| 34631/34631 [00:01<00:00, 31167.27 examples/s]


In [15]:
tokenized["train"].features


{'text': Value('string'),
 'labels': Value('int64'),
 'input_ids': List(Value('int32')),
 'attention_mask': List(Value('int8'))}

In [16]:
# Load XLM-RoBERTa-large (3-class classifier)
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-large",
    num_labels=3
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Freeze the encoder
for param in model.roberta.parameters():
    param.requires_grad = False


In [18]:
sum(p.requires_grad for p in model.parameters())


4

In [20]:
# Define metric
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    
    acc = accuracy_score(labels, preds)
    f1  = f1_score(labels, preds, average="macro")
    
    return {
        "accuracy": acc,
        "f1": f1
    }


In [21]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./sst_m_xlmr_zero_shot",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=20,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_steps=50,
    save_strategy="no",
    report_to="none",
)


In [22]:
# Trainer
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["train"],  # zero-shot baseline
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)



  trainer = Trainer(


In [23]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.0558,1.047525,0.410297,0.229382
2,0.9737,0.948994,0.539055,0.478626
3,0.9109,0.859589,0.687246,0.676385
4,0.8855,0.815097,0.676966,0.660458
5,0.8727,0.784045,0.702867,0.691214
6,0.8439,0.762482,0.713378,0.704943
7,0.8265,0.746421,0.719067,0.711848
8,0.8223,0.732958,0.723831,0.715373
9,0.795,0.722578,0.726892,0.718551
10,0.7641,0.712889,0.732234,0.723148


TrainOutput(global_step=21660, training_loss=0.8334422628415038, metrics={'train_runtime': 4419.5599, 'train_samples_per_second': 156.717, 'train_steps_per_second': 4.901, 'total_flos': 1.6136912044752384e+17, 'train_loss': 0.8334422628415038, 'epoch': 20.0})

In [24]:
results = trainer.evaluate()
results


{'eval_loss': 0.6778305768966675,
 'eval_accuracy': 0.7555369466662817,
 'eval_f1': 0.7491834750424698,
 'eval_runtime': 105.761,
 'eval_samples_per_second': 327.446,
 'eval_steps_per_second': 10.24,
 'epoch': 20.0}