In [1]:
from datasets import load_dataset, concatenate_datasets
import warnings
warnings.filterwarnings("ignore")

In [2]:
# load datasets
eng_sarc_dataset = load_dataset(path='raquiba/Sarcasm_News_Headline')
ar_sarc_dataset = load_dataset(path='ar_sarcasm', ignore_verifications=True)
kor_sarc_dataset = load_dataset(path='kor_sarcasm')

Downloading and preparing dataset json/raquiba--Sarcasm_News_Headline to /root/.cache/huggingface/datasets/json/raquiba--Sarcasm_News_Headline-2c0b20e2ac20e61f/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/6.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.59M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/raquiba--Sarcasm_News_Headline-2c0b20e2ac20e61f/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

Downloading and preparing dataset ar_sarcasm/default (download: 733.12 KiB, generated: 2.18 MiB, post-processed: Unknown size, total: 2.90 MiB) to /root/.cache/huggingface/datasets/ar_sarcasm/default/1.0.0/946b5574cab73f8afb77406014d21a41f3d73d0d1922b8a675fa7449190b9753...


Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/8437 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2110 [00:00<?, ? examples/s]

Dataset ar_sarcasm downloaded and prepared to /root/.cache/huggingface/datasets/ar_sarcasm/default/1.0.0/946b5574cab73f8afb77406014d21a41f3d73d0d1922b8a675fa7449190b9753. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/728 [00:00<?, ?B/s]

Downloading and preparing dataset kor_sarcasm/default (download: 985.31 KiB, generated: 1020.03 KiB, post-processed: Unknown size, total: 1.96 MiB) to /root/.cache/huggingface/datasets/kor_sarcasm/default/1.1.0/00d38c200d4d563ed94efb9ff4ca119ded94fe3cdf1e381ed95274de0a9d59f0...


Downloading data:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/301 [00:00<?, ? examples/s]

Dataset kor_sarcasm downloaded and prepared to /root/.cache/huggingface/datasets/kor_sarcasm/default/1.1.0/00d38c200d4d563ed94efb9ff4ca119ded94fe3cdf1e381ed95274de0a9d59f0. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
### process datasets
eng_sarc_dataset = eng_sarc_dataset.remove_columns(column_names=['article_link'])
eng_sarc_dataset = eng_sarc_dataset.rename_columns({"is_sarcastic":"label"})

ar_sarc_dataset = ar_sarc_dataset.remove_columns(column_names=['dialect', 'sentiment', 'original_sentiment', 'source'])
ar_sarc_dataset = ar_sarc_dataset.rename_columns({"tweet":"headline", "sarcasm":"label"})

kor_sarc_dataset = kor_sarc_dataset.rename_columns({"tokens":"headline"})

In [4]:
model_ckpt = 'distilbert-base-uncased'

In [5]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
from transformers import AutoModelForSequenceClassification, DistilBertConfig
import torch 

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = DistilBertConfig(seq_classif_dropout=0.3)

def model_init(config, device):
    model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config).to(device)
    return model

In [14]:
from transformers import TrainingArguments, Trainer

In [15]:
model_name = f"{model_ckpt}-multiling-sarcasm"
training_args = TrainingArguments(output_dir=model_name,
                                 num_train_epochs=2,
                                 learning_rate=2e-5,
                                 per_device_train_batch_size=32,
                                 per_device_eval_batch_size=32,
                                 evaluation_strategy='epoch',
                                 weight_decay=0.01,
                                 log_level='error',
                                 push_to_hub=False,
                                 report_to='none')

In [16]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np 

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, predictions, average='weighted')
    accuracy = accuracy_score(labels, predictions)
    return {"f1": f1, "acc": accuracy}

In [11]:
from datasets import DatasetDict

eng_valid_test = eng_sarc_dataset['train'].train_test_split(test_size=0.2)
ar_valid_test = ar_sarc_dataset['train'].train_test_split(test_size=0.2)
kor_train_valid = kor_sarc_dataset['train'].train_test_split(test_size=0.1)

eng_splits = DatasetDict({
    "train": eng_valid_test['train'],
    "valid": eng_valid_test['test'],
    "test": eng_sarc_dataset['test'],
})

ar_splits = DatasetDict({
    "train": ar_valid_test['train'],
    "valid": ar_valid_test['test'],
    "test": ar_sarc_dataset['test'],
})

kor_splits = DatasetDict({
    "train": kor_train_valid['train'],
    "valid": kor_train_valid['test'],
    "test": kor_sarc_dataset['test']
})

In [12]:
# tokenize datasets using distilbert pretrained tokenizer
encoded_eng_dataset = eng_splits.map(lambda sample: tokenizer(sample['headline'], max_length=512, padding="max_length", truncation=True), batched=True).shuffle()
encoded_ar_dataset = ar_splits.map(lambda sample: tokenizer(sample['headline'],  max_length=512, padding="max_length", truncation=True), batched=True).shuffle()
encoded_kor_dataset = kor_splits.map(lambda sample: tokenizer(sample['headline'], max_length=512,  padding="max_length", truncation=True), batched=True).shuffle()

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/27 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
trainer_eng = Trainer(model=model_init(config, device),
                 args=training_args,
                 compute_metrics=compute_metrics,
                 train_dataset=encoded_eng_dataset['train'],
                 eval_dataset=encoded_eng_dataset['valid'])
trainer_eng.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.we

Epoch,Training Loss,Validation Loss,F1,Acc
1,No log,0.224159,0.911428,0.911426
2,0.270200,0.218806,0.914855,0.91492


TrainOutput(global_step=716, training_loss=0.23958530106358022, metrics={'train_runtime': 1199.9914, 'train_samples_per_second': 38.159, 'train_steps_per_second': 0.597, 'total_flos': 6065682184458240.0, 'train_loss': 0.23958530106358022, 'epoch': 2.0})

In [18]:
trainer_ar = Trainer(model=model_init(config, device),
                 args=training_args,
                 compute_metrics=compute_metrics,
                 train_dataset=encoded_ar_dataset['train'],
                 eval_dataset=encoded_ar_dataset['valid'])
trainer_ar.train()

Epoch,Training Loss,Validation Loss,F1,Acc
1,No log,0.401227,0.775383,0.845972
2,No log,0.388728,0.775383,0.845972


TrainOutput(global_step=212, training_loss=0.40992650445902123, metrics={'train_runtime': 355.4199, 'train_samples_per_second': 37.978, 'train_steps_per_second': 0.596, 'total_flos': 1788044947058688.0, 'train_loss': 0.40992650445902123, 'epoch': 2.0})

In [19]:
trainer_kor = Trainer(model=model_init(config, device),
                 args=training_args,
                 compute_metrics=compute_metrics,
                 train_dataset=encoded_kor_dataset['train'],
                 eval_dataset=encoded_kor_dataset['valid'])
trainer_kor.train()

Epoch,Training Loss,Validation Loss,F1,Acc
1,No log,0.510958,0.747948,0.75
2,No log,0.474486,0.758495,0.758889


TrainOutput(global_step=254, training_loss=0.5539385577825111, metrics={'train_runtime': 402.4714, 'train_samples_per_second': 40.251, 'train_steps_per_second': 0.631, 'total_flos': 2145971858227200.0, 'train_loss': 0.5539385577825111, 'epoch': 2.0})

In [20]:
def concatenate_splits(corpora, cast_features):
    multi_corpus = DatasetDict()
    for split in corpora[0].keys():
        multi_corpus[split] = concatenate_datasets([corpus[split].cast(cast_features) for corpus in corpora]).shuffle()
    return multi_corpus        

In [21]:
corpora = [encoded_eng_dataset, encoded_ar_dataset, encoded_kor_dataset]

In [22]:
multiling_dataset = concatenate_splits(corpora, cast_features=encoded_kor_dataset['train'].features)

Casting the dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [23]:
multiling_dataset

DatasetDict({
    train: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 37744
    })
    valid: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 8312
    })
    test: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 29120
    })
})

In [24]:
trainer = Trainer(model=model_init(config, device),
                 args=training_args,
                 compute_metrics=compute_metrics,
                 train_dataset=multiling_dataset['train'],
                 eval_dataset=multiling_dataset['valid'])
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Acc
1,0.4006,0.287451,0.877098,0.877166
2,0.2876,0.279227,0.885427,0.886429


TrainOutput(global_step=1180, training_loss=0.3327484971385891, metrics={'train_runtime': 1953.894, 'train_samples_per_second': 38.635, 'train_steps_per_second': 0.604, 'total_flos': 9999698989744128.0, 'train_loss': 0.3327484971385891, 'epoch': 2.0})

In [133]:
def get_f1_score(trainer, corpus):
    return trainer.predict(corpus['test']).metrics['test_f1']

In [138]:
langs = ['eng', 'ar', 'kor']

def collect_scores(langs, key, trainer, corpora):
    scores = {}
    for i, lang in enumerate(langs):
        scores[key][lang] = get_f1_score(trainer, corpora[i])
    return scores

In [140]:
# collect_scores(langs, 'eng', trainer_eng, [encoded_ar_dataset,encoded_kor_dataset])