In [1]:
import os

from functools import partial

import evaluate

import numpy as np
import pandas as pd

from arabert.preprocess import ArabertPreprocessor
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
from sklearn.model_selection import train_test_split
from evaluate import evaluator, combine

# Setup Data

In [2]:
# Location of the tab-separated tweet corpus used throughout the notebook.
dataset_path = "https://nlp-slides.vercel.app/clean-tweets.tsv"

# Read the whole TSV into a DataFrame (tab is the field delimiter).
dataset = pd.read_csv(dataset_path, sep="\t")

In [3]:
dataset.head(n=2)

Unnamed: 0,Tweet,Country,Topic,Sentiment,Sentiment_Expression,Sentiment_Target,word_count,char_count,clean_text,clean_stemmed
0,"""أنا أؤمن بأن الانسان ينطفئ جماله عند ابتعاد م...",lebanon,personal,negative,implicit,بريق العيون,23,132,اومن بان الانسان ينطفي جماله ابتعاد يحب بريق ا...,اوم بان انس نطف جمل بعد يحب برق عين خفي صبح ذب...
1,من الذاكره... @3FInQe . عندما اعتقد كريستيانو ...,jordan,sports,positive,explicit,افضل لاعب في العالم,23,141,الذاكره عندما اعتقد كريستيانو انه افضل لاعب ال...,ذكر عند عقد كريستيانو انه فضل لعب علم ككا يسي ...


In [4]:
# Only the raw tweet text and its sentiment annotation are needed for training.
selected_columns = ["Tweet", "Sentiment"]
data = dataset[selected_columns]
data.head(3)

Unnamed: 0,Tweet,Sentiment
0,"""أنا أؤمن بأن الانسان ينطفئ جماله عند ابتعاد م...",negative
1,من الذاكره... @3FInQe . عندما اعتقد كريستيانو ...,positive
2,لا نخلو من ضغوطات الحياة. فنحن نعيش على أرض أع...,neutral


In [5]:
# Switch to the `text` / `label` column names used by the rest of the notebook.
column_names = {"Tweet": "text", "Sentiment": "label"}
data = data.rename(columns=column_names)

In [6]:
# Encode the three sentiment classes as integer ids (negative=0, neutral=1, positive=2).
label_to_id = {"negative": 0, "neutral": 1, "positive": 2}
data["label"] = data["label"].replace(label_to_id)

In [7]:
data

Unnamed: 0,text,label
0,"""أنا أؤمن بأن الانسان ينطفئ جماله عند ابتعاد م...",0
1,من الذاكره... @3FInQe . عندما اعتقد كريستيانو ...,2
2,لا نخلو من ضغوطات الحياة. فنحن نعيش على أرض أع...,1
3,#مصطلحات_لبنانيه_حيرت_البشريه بتوصل عالبيت ، ب...,0
4,نصمت !! لتسير حياتنا على مً يرام فالناّس لم تع...,0
...,...,...
3995,صلاح من لاعب في المقاولون العرب يحلم ان يلعب ل...,2
3996,الملك سلمان بن عبد العزيز: تطبيق الأنظمة بحزم ...,2
3997,@ZahraaIraq9 😂 كل ما ادخل حسابي الكه تغريداتج ...,0
3998,شو هالشعب نحنا اللي عايش بلا مي وكهربا والزبال...,0


In [8]:
data["label"].value_counts()

0    1883
2    1232
1     885
Name: label, dtype: int64

# Setup Model

In [9]:
# Checkpoint identifier shared by the preprocessor, the tokenizer and the model.
model_name = "aubmindlab/bert-base-arabertv02-twitter"

## Preprocess Data for Model

In [10]:
# Build the AraBERT text preprocessor configured for the checkpoint in `model_name`.
# In the sample shown further down it replaces the @mention and the URL with
# placeholder tokens and spaces out punctuation.
arabert_prep = ArabertPreprocessor(model_name=model_name)

In [11]:
# Pick one raw tweet (row label 3997) that contains a mention, an emoji and a URL,
# so the effect of the preprocessor is easy to see.
sample = dataset.at[3997, "Tweet"]
sample

'@ZahraaIraq9 😂 كل ما ادخل حسابي الكه تغريداتج عن حب العراق وانتي هسه اذا ينطوج جنسيه مال غير دوله و يجيج واتب كل را… https://t.co/MqZmtiTcil'

In [12]:
arabert_prep.preprocess(sample)

'[مستخدم] 😂 كل ما ادخل حسابي الكه تغريداتج عن حب العراق وانتي هسه اذا ينطوج جنسيه مال غير دوله و يجيج واتب كل را … [رابط]'

In [13]:
# Run the AraBERT preprocessor over every tweet, overwriting the `text` column.
cleaned_text = data["text"].map(arabert_prep.preprocess)
data["text"] = cleaned_text

In [14]:
data.head(n=2)

Unnamed: 0,text,label
0,""" أنا أؤمن بأن الانسان ينطفئ جماله عند ابتعاد ...",0
1,من الذاكره . . [مستخدم] . عندما اعتقد كريستيان...,2


In [15]:
# Stratified 75/25 train/validation split (3000/1000 rows for the 4000-row corpus).
# `random_state` is fixed so that Restart & Run All reproduces the same split and,
# therefore, comparable metrics; without it every run shuffles differently.
# `test_size=0.25` spells out scikit-learn's default so the split size is visible here.
train_data, validation_data = train_test_split(
    data,
    test_size=0.25,
    stratify=data["label"],
    random_state=42,
)

In [16]:
# Wrap both pandas splits as `datasets.Dataset` objects, dropping the pandas index
# so that only the `text` and `label` columns are carried over.
train_dataset, val_dataset = (
    Dataset.from_pandas(df=frame, preserve_index=False)
    for frame in (train_data, validation_data)
)

In [17]:
train_dataset, val_dataset

(Dataset({
     features: ['text', 'label'],
     num_rows: 3000
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 1000
 }))

## Tokenizer

In [18]:
# Load the tokenizer that ships with the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [19]:
tokenizer

PreTrainedTokenizerFast(name_or_path='aubmindlab/bert-base-arabertv02-twitter', vocab_size=64000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [20]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the notebook-level ``tokenizer``.

    Truncation is enabled; no padding is requested here, so sequences keep
    their individual (truncated) lengths.

    Args:
        examples: Mapping with a ``"text"`` entry. It is called through
            ``Dataset.map(..., batched=True)`` in this notebook.

    Returns:
        Whatever ``tokenizer(...)`` returns for the given texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [21]:
# Tokenize both splits in batches. Note that `train_data` is rebound here: it held
# the pandas training split above and now holds the tokenized `Dataset`.
train_data, val_data = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset)
)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## Model

In [22]:
# Load the pretrained encoder with a fresh 3-way classification head.
# The id <-> name mapping mirrors the label encoding applied to the data
# (negative=0, neutral=1, positive=2). Storing it in the model config makes saved
# checkpoints and inference output report readable class names instead of the
# generic LABEL_0 / LABEL_1 / LABEL_2.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label={0: "negative", 1: "neutral", 2: "positive"},
    label2id={"negative": 0, "neutral": 1, "positive": 2},
)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02-twitter were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmi

In [23]:
# `preprocess_function` tokenizes with truncation but without padding, so padding
# is left to this collator, which pads and assembles examples when batches are built.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metrics

In [24]:
# Load the F1 metric from the `evaluate` library; it is bound into
# `compute_metrics` via `functools.partial` in the next cell.
f1 = evaluate.load("f1")

In [25]:
def compute_metrics(eval_pred: "tuple[np.ndarray, np.ndarray]", metric: "evaluate.Metric"):
    """Turn raw model outputs into a weighted-average score for ``metric``.

    Args:
        eval_pred: A ``(logits, labels)`` pair (anything that unpacks into two
            items works). ``logits`` holds per-class scores along its last axis;
            ``labels`` holds the reference class ids.
        metric: Object exposing ``compute(predictions=..., references=...,
            average=...)``, e.g. the F1 metric loaded with ``evaluate.load``.

    Returns:
        The value returned by ``metric.compute`` with ``average="weighted"``.
    """
    logits, labels = eval_pred
    # Some models return a tuple of outputs with the logits first; argmax over
    # the whole tuple would be meaningless, so keep only the logits.
    if isinstance(logits, tuple):
        logits = logits[0]
    # The predicted class is the index of the highest score on the last axis.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="weighted")

# Bind the metric up front so the resulting callable only needs `eval_pred`;
# this is the callable handed to the Trainer as `compute_metrics`.
compute_metrics_fn = partial(compute_metrics, metric=f1)

# Training

## Training Args

In [26]:
# Checkpoints are written under ./data.
checkpoint_dir = os.path.join(os.curdir, "data")
# Logging, evaluation and checkpointing all share the same step interval.
step_interval = 50

training_args = TrainingArguments(
    # Output location
    output_dir=checkpoint_dir,
    overwrite_output_dir=True,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Step-based logging / evaluation / checkpointing
    logging_strategy="steps",
    logging_steps=step_interval,
    evaluation_strategy="steps",
    eval_steps=step_interval,
    save_steps=step_interval,
    save_total_limit=1,
    # Best-checkpoint selection: higher eval F1 is better
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    load_best_model_at_end=True,
)

## Trainer

In [27]:
# Wire together the model, its training configuration, the tokenized splits,
# the padding collator and the F1 callback.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics_fn,
)

## Train Model

In [28]:
# Fine-tune the model. With the arguments above, the validation set is evaluated
# and a checkpoint is written every 50 steps (940 optimization steps in the
# recorded run), which is why the log output below is long.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3000
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 940
  Number of trainable parameters = 135195651
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,F1
50,0.865,0.680612,0.721337
100,0.615,0.536201,0.791496
150,0.4529,0.533914,0.796555
200,0.379,0.519856,0.812192
250,0.2886,0.562367,0.802992
300,0.2762,0.575765,0.810857
350,0.2234,0.621137,0.806009
400,0.232,0.605137,0.802674
450,0.1991,0.622573,0.802703
500,0.1664,0.667783,0.806869


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
Saving model checkpoint to ./data/checkpoint-50
Configuration saved in ./data/checkpoint-50/config.json
Model weights saved in ./data/checkpoint-50/pytorch_model.bin
tokenizer config file saved in ./data/checkpoint-50/tokenizer_config.json
Special tokens file saved in ./data/checkpoint-50/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
Saving model checkpoint to .

Configuration saved in ./data/checkpoint-600/config.json
Model weights saved in ./data/checkpoint-600/pytorch_model.bin
tokenizer config file saved in ./data/checkpoint-600/tokenizer_config.json
Special tokens file saved in ./data/checkpoint-600/special_tokens_map.json
Deleting older checkpoint [data/checkpoint-550] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
Saving model checkpoint to ./data/checkpoint-650
Configuration saved in ./data/checkpoint-650/config.json
Model weights saved in ./data/checkpoint-650/pytorch_model.bin
tokenizer config file saved in ./data/checkpoint-650/tokenizer_config.json
Special tokens file saved in ./data/checkpoint-650/special_tokens_map.json
Delet

TrainOutput(global_step=940, training_loss=0.2570740765713631, metrics={'train_runtime': 258.412, 'train_samples_per_second': 116.094, 'train_steps_per_second': 3.638, 'total_flos': 719360306362656.0, 'train_loss': 0.2570740765713631, 'epoch': 10.0})

## Evaluate Model

In [30]:
# Score the validation set with the model currently held by the trainer.
# `load_best_model_at_end=True` is set, and in the recorded run the result equals
# the step-200 row of the training table (best eval F1), not the final step.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32


{'eval_loss': 0.519856333732605,
 'eval_f1': 0.8121922288151512,
 'eval_runtime': 1.7249,
 'eval_samples_per_second': 579.752,
 'eval_steps_per_second': 18.552,
 'epoch': 10.0}