In [1]:

import os
os.environ["CUDA_VISIBLE_DEVICES"]="1" 

from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, concatenate_datasets
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# url = 'faisalq/bert-base-arabic-wordpiece'
url = 'faisalq/bert-base-arabic-senpiece'
# url = 'faisalq/bert-base-arabic-bbpe'


dataset = load_dataset("wikiann", 'ar')

# display(dataset)

tokenizer = AutoTokenizer.from_pretrained(url)


model = AutoModelForTokenClassification.from_pretrained(url, 
                                num_labels=dataset["train"].features["ner_tags"].feature.num_classes)

max_length = 128


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True,
                                 padding="max_length", max_length=max_length)
    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special token
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # Subword token
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs



dataset_1 = dataset['train']
dataset_2 = dataset['test']
dataset_train = concatenate_datasets([dataset_1, dataset_2])

dataset_validation = dataset['validation']

# display(dataset_train)
    
dataset_train = dataset_train.map(tokenize_and_align_labels, batched=True)
dataset_validation = dataset_validation.map(tokenize_and_align_labels, batched=True)





2024-02-04 07:11:27.427791: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-04 07:11:27.451119: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Found cached dataset wikiann (/home/ffq/.cache/huggingface/datasets/wikiann/ar/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)


  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-senpiece and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/wikiann/ar/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e/cache-6f8a94ae77d2f78f.arrow
Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/wikiann/ar/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e/cache-01314223572663b6.arrow


In [2]:
from transformers import TrainingArguments, Trainer
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

os.environ["TOKENIZERS_PARALLELISM"] = "false"

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_predictions = [tokenizer.convert_ids_to_tokens(tp) for tp in true_predictions]
    true_labels = [tokenizer.convert_ids_to_tokens(tl) for tl in true_labels]

    # results = classification_report(true_labels, true_predictions)
    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        # "report": results,
    }



epochs = 15
save_steps = 10000 #save checkpoint every 10000 steps
batch_size = 256



training_args = TrainingArguments(
    output_dir = 'bert_wp/',
    overwrite_output_dir=True,
    num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    save_steps = save_steps,
    save_total_limit = 2, 
    fp16=True,
    learning_rate = 5e-5,  # 5e-5 is the default
    # weight_decay=0.01,
    logging_steps = 50, #50_000
    evaluation_strategy = 'steps',
    # logging_dir="logs.txt",
    eval_steps = 50
    
)




# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_validation,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()


Step,Training Loss,Validation Loss,Precision,Recall,F1
50,0.5427,0.213149,0.875124,0.890188,0.882592
100,0.2056,0.176179,0.911984,0.905994,0.908979
150,0.1523,0.1571,0.923569,0.922702,0.923135
200,0.1278,0.150335,0.920137,0.922594,0.921364
250,0.1141,0.147036,0.932273,0.930894,0.931583
300,0.0779,0.146828,0.932504,0.930316,0.931409
350,0.0784,0.144624,0.937427,0.930966,0.934186
400,0.0535,0.149366,0.93451,0.934611,0.93456
450,0.0476,0.149573,0.933972,0.935657,0.934814
500,0.0429,0.16961,0.931716,0.937029,0.934365


{'eval_loss': 0.24043051898479462,
 'eval_precision': 0.9403324079749071,
 'eval_recall': 0.9412146800909387,
 'eval_f1': 0.9407733371807819,
 'eval_runtime': 5.1248,
 'eval_samples_per_second': 1951.292,
 'eval_steps_per_second': 7.805,
 'epoch': 15.0}

In [3]:
# 941537