In [2]:
from datasets import load_dataset
import warnings
warnings.filterwarnings("ignore")

In [39]:
# load datasets
eng_sarc_dataset = load_dataset(path='raquiba/Sarcasm_News_Headline', split='train')
ar_sarc_dataset = load_dataset(path='ar_sarcasm', split='train', ignore_verifications=True)
kor_sarc_dataset = load_dataset(path='kor_sarcasm', split='train')

In [40]:
### process datasets
eng_sarc_dataset = eng_sarc_dataset.remove_columns(column_names=['article_link'])
eng_sarc_dataset = eng_sarc_dataset.rename_columns({"is_sarcastic":"label"})

ar_sarc_dataset = ar_sarc_dataset.remove_columns(column_names=['dialect', 'sentiment', 'original_sentiment', 'source'])
ar_sarc_dataset = ar_sarc_dataset.rename_columns({"tweet":"headline", "sarcasm":"label"})

kor_sarc_dataset = kor_sarc_dataset.rename_columns({"tokens":"headline"})

In [41]:
model_ckpt = 'distilbert-base-uncased'

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [42]:
# tokenize datasets using distilbert pretrained tokenizer
encoded_eng_dataset = eng_sarc_dataset.map(lambda sample: tokenizer(sample['headline'], max_length=512, padding="max_length", truncation=True), batched=True)
encoded_ar_dataset = ar_sarc_dataset.map(lambda sample: tokenizer(sample['headline'],  max_length=512, padding="max_length", truncation=True), batched=True)
encoded_kor_dataset = kor_sarc_dataset.map(lambda sample: tokenizer(sample['headline'], max_length=512,  padding="max_length", truncation=True), batched=True)

  0%|          | 0/29 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

In [8]:
from transformers import AutoModelForSequenceClassification
import torch 

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt).to(device)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.

In [10]:
from transformers import TrainingArguments, Trainer

In [12]:
model_name = f"{model_ckpt}-multiling-sarcasm"
training_args = TrainingArguments(output_dir=model_name,
                                 num_train_epochs=10,
                                 learning_rate=2e-5,
                                 per_device_train_batch_size=32,
                                 per_device_eval_batch_size=32,
                                 evaluation_strategy='epoch',
                                 weight_decay=0.01,
                                 log_level='error',
                                 push_to_hub=False,
                                 report_to='none')

In [13]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np 

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, predictions)
    accuracy = accuracy_score(labels, predictions)
    return {"f1": f1, "acc": accuracy}

In [12]:
encoded_eng_dataset_rdy = encoded_eng_dataset.train_test_split(test_size=0.25)

In [13]:
trainer = Trainer(model=model,
                 args=training_args,
                 compute_metrics=compute_metrics,
                 train_dataset=encoded_eng_dataset_rdy['train'],
                 eval_dataset=encoded_eng_dataset_rdy['test'])
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Acc
1,No log,0.245721,0.897732,0.89979
2,0.309200,0.215883,0.909383,0.913208
3,0.168300,0.208795,0.917062,0.919357
4,0.168300,0.221639,0.918264,0.920755
5,0.100000,0.251394,0.9149,0.919217
6,0.069700,0.287518,0.910661,0.916143
7,0.069700,0.298254,0.918681,0.922432
8,0.047000,0.31925,0.918691,0.921873
9,0.034200,0.340741,0.915038,0.919776
10,0.034200,0.338145,0.918235,0.922292


TrainOutput(global_step=3360, training_loss=0.11149543438638959, metrics={'train_runtime': 5779.7232, 'train_samples_per_second': 37.137, 'train_steps_per_second': 0.581, 'total_flos': 2.843280244752384e+16, 'train_loss': 0.11149543438638959, 'epoch': 10.0})

In [14]:
encoded_ar_dataset_rdy = encoded_ar_dataset.train_test_split(test_size=0.25)

In [16]:
trainer_ar = Trainer(model=model,
                 args=training_args,
                 compute_metrics=compute_metrics,
                 train_dataset=encoded_ar_dataset_rdy['train'],
                 eval_dataset=encoded_ar_dataset_rdy['test'])
trainer_ar.train()

Epoch,Training Loss,Validation Loss,F1,Acc
1,No log,0.410598,0.0,0.83981
2,No log,0.412799,0.017595,0.841232
3,No log,0.430641,0.023392,0.841706
4,No log,0.394647,0.077135,0.841232
5,No log,0.437436,0.061972,0.84218
6,0.386500,0.409591,0.220779,0.829384
7,0.386500,0.411075,0.139175,0.841706
8,0.386500,0.428214,0.138107,0.840284
9,0.386500,0.427417,0.172662,0.836493
10,0.386500,0.430012,0.181818,0.837915


TrainOutput(global_step=990, training_loss=0.35400498515427714, metrics={'train_runtime': 1714.6107, 'train_samples_per_second': 36.901, 'train_steps_per_second': 0.577, 'total_flos': 8381212312965120.0, 'train_loss': 0.35400498515427714, 'epoch': 10.0})

In [14]:
encoded_kor_dataset_rdy = encoded_kor_dataset.train_test_split(test_size=0.25)

In [16]:
trainer_kor = Trainer(model=model,
                 args=training_args,
                 compute_metrics=compute_metrics,
                 train_dataset=encoded_kor_dataset_rdy['train'],
                 eval_dataset=encoded_kor_dataset_rdy['test'])
trainer_kor.train()

Epoch,Training Loss,Validation Loss,F1,Acc
1,No log,0.51446,0.739606,0.735556
2,No log,0.510045,0.775869,0.750667
3,No log,0.490087,0.755496,0.757778
4,No log,0.490712,0.773504,0.764444
5,0.499000,0.507012,0.754853,0.758667
6,0.499000,0.502476,0.768906,0.756889
7,0.499000,0.50787,0.763823,0.755111
8,0.499000,0.524923,0.765739,0.756889
9,0.499000,0.523665,0.764114,0.760444
10,0.365500,0.532682,0.765591,0.757778


TrainOutput(global_step=1060, training_loss=0.42571446760645454, metrics={'train_runtime': 1919.0889, 'train_samples_per_second': 35.173, 'train_steps_per_second': 0.552, 'total_flos': 8941549409280000.0, 'train_loss': 0.42571446760645454, 'epoch': 10.0})

In [80]:
trainer_kor.state.log_history

[{'eval_loss': 0.5144595503807068,
  'eval_f1': 0.7396061269146608,
  'eval_acc': 0.7355555555555555,
  'eval_runtime': 23.216,
  'eval_samples_per_second': 96.916,
  'eval_steps_per_second': 1.551,
  'epoch': 1.0,
  'step': 106},
 {'eval_loss': 0.5100448131561279,
  'eval_f1': 0.7758689572512985,
  'eval_acc': 0.7506666666666667,
  'eval_runtime': 22.9996,
  'eval_samples_per_second': 97.828,
  'eval_steps_per_second': 1.565,
  'epoch': 2.0,
  'step': 212},
 {'eval_loss': 0.49008703231811523,
  'eval_f1': 0.7554957379991027,
  'eval_acc': 0.7577777777777778,
  'eval_runtime': 23.1709,
  'eval_samples_per_second': 97.105,
  'eval_steps_per_second': 1.554,
  'epoch': 3.0,
  'step': 318},
 {'eval_loss': 0.49071213603019714,
  'eval_f1': 0.7735042735042735,
  'eval_acc': 0.7644444444444445,
  'eval_runtime': 23.1273,
  'eval_samples_per_second': 97.288,
  'eval_steps_per_second': 1.557,
  'epoch': 4.0,
  'step': 424},
 {'loss': 0.499,
  'learning_rate': 1.0566037735849058e-05,
  'epoch': 