In [1]:
!pip install -q transformers datasets accelerate scikit-learn


In [2]:
import torch
import numpy as np
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter


In [3]:
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))


True
Tesla T4


In [4]:
dataset = load_dataset("umar178/UrduMultiDomainClassification")
dataset


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Dataset%20-%20Sheet1.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/34819 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'sentiment', 'topic', 'intent', 'binary'],
        num_rows: 34819
    })
})

In [5]:
Counter(dataset["train"]["sentiment"])


Counter({'positive': 10847,
         'negative': 10086,
         'neutral': 13698,
         'request': 188})

In [7]:
label2id = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}

id2label = {v: k for k, v in label2id.items()}


In [8]:
# Create 16-Shot Dataset (FIRST 16 PER CLASS)
selected_texts = []
selected_labels = []

class_counts = {"negative": 0, "neutral": 0, "positive": 0}

for example in dataset["train"]:
    label = example["sentiment"]
    if label in class_counts and class_counts[label] < 16:
        selected_texts.append(example["text"])
        selected_labels.append(label2id[label])
        class_counts[label] += 1

    if all(count == 16 for count in class_counts.values()):
        break

class_counts


{'negative': 16, 'neutral': 16, 'positive': 16}

In [9]:
sstm_16shot = Dataset.from_dict({
    "text": selected_texts,
    "label": selected_labels
})

sstm_16shot


Dataset({
    features: ['text', 'label'],
    num_rows: 48
})

In [10]:
Counter(sstm_16shot["label"])


Counter({2: 16, 0: 16, 1: 16})

In [11]:
model_name = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [12]:
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )


In [13]:
sstm_16shot_tokenized = sstm_16shot.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

sstm_16shot_tokenized.set_format("torch")
sstm_16shot_tokenized


Map:   0%|          | 0/48 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 48
})

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
model.gradient_checkpointing_enable()


In [16]:
# Define Metrics (Accuracy + Macro F1)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")

    return {
        "accuracy": acc,
        "f1": f1
    }


In [18]:
training_args = TrainingArguments(
    output_dir="./sstm_xlmr_16shot",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=32,   # 🔒 fixed
    per_device_eval_batch_size=16,    # safe for Colab
    num_train_epochs=20,
    weight_decay=0.01,
    warmup_ratio=0.1,
    fp16=True,
    seed=42,
    report_to="none"
)


In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=sstm_16shot_tokenized,
    eval_dataset=sstm_16shot_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [20]:
torch.cuda.empty_cache()


In [21]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.128825,0.333333,0.166667
2,No log,0.963511,0.604167,0.508001
3,No log,0.963511,0.604167,0.508001
4,No log,1.121752,0.395833,0.289171
5,No log,1.062373,0.354167,0.23908
6,No log,1.062373,0.354167,0.23908
7,No log,0.82811,0.791667,0.768965
8,No log,0.812744,0.645833,0.540268
9,No log,0.799049,0.645833,0.540268
10,No log,0.76023,0.645833,0.540268


TrainOutput(global_step=40, training_loss=0.8759775161743164, metrics={'train_runtime': 30.8066, 'train_samples_per_second': 31.162, 'train_steps_per_second': 1.298, 'total_flos': 223664282910720.0, 'train_loss': 0.8759775161743164, 'epoch': 20.0})

In [22]:
results_16 = trainer.evaluate()
results_16


{'eval_loss': 0.6627140045166016,
 'eval_accuracy': 0.6458333333333334,
 'eval_f1': 0.5402677199912223,
 'eval_runtime': 0.344,
 'eval_samples_per_second': 139.524,
 'eval_steps_per_second': 8.72,
 'epoch': 20.0}