In [1]:
import numpy as np
import torch
import pandas as pd

from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

SEED = 42
MODEL_NAME = "xlm-roberta-large"
MAX_LEN = 128

np.random.seed(SEED)
torch.manual_seed(SEED)


  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x1ffe49082b0>

In [2]:
# Load dataset
dataset = load_dataset("sepidmnorozy/Urdu_sentiment")

train_full = dataset["train"]
test_full  = dataset["test"]

print(train_full.shape, test_full.shape)


(685, 2) (294, 2)


In [3]:
# Create 80/20 split
train_pd = train_full.to_pandas()

train_80_df, dev_20_df = train_test_split(
    train_pd,
    test_size=0.2,
    random_state=SEED,
    stratify=train_pd["label"]
)

print(train_80_df.shape, dev_20_df.shape)
print(train_80_df["label"].value_counts(normalize=True))
print(dev_20_df["label"].value_counts(normalize=True))


(548, 2) (137, 2)
label
0    0.507299
1    0.492701
Name: proportion, dtype: float64
label
0    0.510949
1    0.489051
Name: proportion, dtype: float64


In [4]:
# Tokenization
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN
    )


In [5]:
# Convert datasets
train_80_ds = Dataset.from_pandas(train_80_df)
dev_20_ds   = Dataset.from_pandas(dev_20_df)

train_80_ds = train_80_ds.map(tokenize, batched=True, remove_columns=["text"])
dev_20_ds   = dev_20_ds.map(tokenize, batched=True, remove_columns=["text"])

train_80_ds.set_format("torch")
dev_20_ds.set_format("torch")

print(train_80_ds[0])


Map: 100%|██████████████████████████████████████████████████████████████████████████████| 548/548 [00:00<00:00, 17302.35 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 19344.86 examples/s]

{'label': tensor(0), '__index_level_0__': tensor(234), 'input_ids': tensor([     0, 183511,     50,    288,   8253,  87236,    490,  33571,    216,
         17396,   4481, 169009,    490,      6,  20276,   4297,    288,  15035,
         58894,   5588, 197857,  51765,   1568,   7060,   1597,    504,  16430,
          4573,  27469,    639,      2,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,     




In [12]:
# Metric (Accuracy)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")

    return {
        "accuracy": acc,
        "f1": f1
    }


In [8]:
# Load model (FULL fine-tuning)
model_80 = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Training Arguments (STANDARD)
training_args_80 = TrainingArguments(
    output_dir="sst2_xlmr_80_20",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=20,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none",
    seed=SEED
)


In [10]:
# Trainer
trainer_80 = Trainer(
    model=model_80,
    args=training_args_80,
    train_dataset=train_80_ds,
    eval_dataset=dev_20_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer_80 = Trainer(


In [13]:
# Train
trainer_80.train()




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.691231,0.489051,0.328431
2,No log,0.658851,0.715328,0.710328
3,0.702900,0.728312,0.510949,0.338164
4,0.702900,0.601997,0.715328,0.715086
5,0.702900,0.597028,0.744526,0.744035
6,0.602900,0.62492,0.678832,0.671104
7,0.602900,0.653337,0.708029,0.708014
8,0.602900,0.695116,0.678832,0.677991
9,0.418500,0.940469,0.722628,0.720108
10,0.418500,0.943336,0.70073,0.698416




TrainOutput(global_step=360, training_loss=0.3034006910605563, metrics={'train_runtime': 4057.9391, 'train_samples_per_second': 2.701, 'train_steps_per_second': 0.089, 'total_flos': 2681166532294656.0, 'train_loss': 0.3034006910605563, 'epoch': 20.0})

In [14]:
# Final evaluation
results_80 = trainer_80.evaluate()
results_80




{'eval_loss': 0.5970277190208435,
 'eval_accuracy': 0.7445255474452555,
 'eval_f1': 0.7440345913628357,
 'eval_runtime': 8.0632,
 'eval_samples_per_second': 16.991,
 'eval_steps_per_second': 0.62,
 'epoch': 20.0}