In [1]:

import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" 

from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, concatenate_datasets
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')


url = 'faisalq/bert-base-arabic-wordpiece'
# url = 'faisalq/bert-base-arabic-senpiece'
# url = 'faisalq/bert-base-arabic-bbpe'


dataset = load_dataset("arbml/AQMAR")

df = dataset['train'].to_pandas()

sentences = []
labels = []

current_sentence = []
current_labels = []

#convert the df into a format suitable for BERT
for _, row in df.iterrows():
    if row['Word'] != '.':  #check if the word is not a sentence separator (period)
        current_sentence.append(row['Word'])
        current_labels.append(row['label'])
    else:
        sentences.append(current_sentence)
        labels.append(current_labels)        
        current_sentence = []
        current_labels = []


dfx = pd.DataFrame({
    'tokens': sentences,
    'ner_tags': labels
})

# print(dfx)

dataset = Dataset.from_pandas(dfx)

# display(dataset)

unique_classes = set()

for example in dataset:
    unique_classes.update(example['ner_tags'])

num_classes = len(unique_classes)+1  # 27 classes (label 22 is missing??)
# print(f"Number of unique classes: {num_classes}") 

tokenizer = AutoTokenizer.from_pretrained(url)


model = AutoModelForTokenClassification.from_pretrained(url, 
                                num_labels=num_classes)

max_length = 128


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True,
                                 padding="max_length", max_length=max_length)
    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special token
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # Subword token
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs



# dataset = dataset['train']
dataset = dataset.train_test_split(test_size=0.2)

dataset_train = dataset['train']
dataset_validation = dataset['test']

# display(dataset_train)
    
dataset_train = dataset_train.map(tokenize_and_align_labels, batched=True)
dataset_validation = dataset_validation.map(tokenize_and_align_labels, batched=True)





2024-02-04 19:21:54.189195: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-04 19:21:54.213093: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Found cached dataset parquet (/home/ffq/.cache/huggingface/datasets/arbml___parquet/arbml--AQMAR-60f97168b35666a9/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/1 [00:00<?, ?it/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-wordpiece and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1698 [00:00<?, ? examples/s]

Map:   0%|          | 0/425 [00:00<?, ? examples/s]

In [2]:
from transformers import TrainingArguments, Trainer
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

os.environ["TOKENIZERS_PARALLELISM"] = "false"

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_predictions = [tokenizer.convert_ids_to_tokens(tp) for tp in true_predictions]
    true_labels = [tokenizer.convert_ids_to_tokens(tl) for tl in true_labels]

    # results = classification_report(true_labels, true_predictions)
    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        # "report": results,
    }



epochs = 30
save_steps = 10000 #save checkpoint every 10000 steps
batch_size = 256



training_args = TrainingArguments(
    output_dir = 'bert_wp/',
    overwrite_output_dir=True,
    num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    save_steps = save_steps,
    save_total_limit = 2, 
    fp16=True,
    learning_rate = 5e-5,  # 5e-5 is the default
    # weight_decay=0.01,
    logging_steps = 10, #50_000
    evaluation_strategy = 'steps',
    # logging_dir="logs.txt",
    eval_steps = 10
    
)




# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_validation,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()


Step,Training Loss,Validation Loss,Precision,Recall,F1
10,2.21,0.698995,0.0,0.0,0.0
20,0.5056,0.343774,0.435418,0.3042,0.358169
30,0.2812,0.223487,0.603517,0.603859,0.603688
40,0.1758,0.177703,0.65493,0.686152,0.670177
50,0.1225,0.167441,0.67587,0.694665,0.685139
60,0.0845,0.159612,0.710412,0.743473,0.726567
70,0.0626,0.15976,0.733817,0.746311,0.740011
80,0.0467,0.164469,0.740761,0.762202,0.751329
90,0.0365,0.169527,0.753898,0.768445,0.761102
100,0.0292,0.170445,0.737696,0.77412,0.755469


{'eval_loss': 0.19733229279518127,
 'eval_precision': 0.7530186608122942,
 'eval_recall': 0.7786606129398411,
 'eval_f1': 0.7656250000000001,
 'eval_runtime': 0.2905,
 'eval_samples_per_second': 1462.996,
 'eval_steps_per_second': 6.885,
 'epoch': 30.0}

In [3]:
# 942553