In [1]:

import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" 

from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, concatenate_datasets
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')


# url = 'faisalq/bert-base-arabic-wordpiece'
# url = 'faisalq/bert-base-arabic-senpiece'
url = 'faisalq/bert-base-arabic-bbpe'


dataset = load_dataset("arbml/WDC")

df = dataset['train'].to_pandas()


# removing incorrect tags
df2 = df[~df['Entity'].str.contains('ORG', regex=True)]
df2 = df2[~df2['Entity'].str.contains('LOC', regex=True)]
df2 = df2[~df2['Entity'].str.contains('PER', regex=True)]
df2 = df2[~df2['Entity'].str.contains('MISC', regex=True)]
df2 = df2[df2['Entity'] != 'O']

v = df2['Entity'].values
v = list(set(v))
df = df[~df['Entity'].isin(v)]

display(set(df['Entity'].values))

classes_num = len(set(df['Entity']))

display(classes_num)

# display(df)
df['Entity'] = df['Entity'].astype('category')
df['label'] = df['Entity'].cat.codes



sentences = []
labels = []

current_sentence = []
current_labels = []

#convert the df into a format suitable for BERT
for _, row in df.iterrows():
    if row['Word'] != '.':  #check if the word is not a sentence separator (period)
        current_sentence.append(row['Word'])
        current_labels.append(row['label'])
    else:
        sentences.append(current_sentence)
        labels.append(current_labels)        
        current_sentence = []
        current_labels = []


dfx = pd.DataFrame({
    'tokens': sentences,
    'ner_tags': labels
})



dataset = Dataset.from_pandas(dfx)



# unique_classes = set()

# for example in dataset:
#     unique_classes.update(example['ner_tags'])



tokenizer = AutoTokenizer.from_pretrained(url, add_prefix_space=True)


model = AutoModelForTokenClassification.from_pretrained(url, 
                                num_labels= classes_num)

max_length = 128


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True,
                                 padding="max_length", max_length=max_length)
    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special token
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # Subword token
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs



# dataset = dataset['train']
dataset = dataset.train_test_split(test_size=0.2)

dataset_train = dataset['train']
dataset_validation = dataset['test']

# display(dataset_train)
    
dataset_train = dataset_train.map(tokenize_and_align_labels, batched=True)
dataset_validation = dataset_validation.map(tokenize_and_align_labels, batched=True)





2024-02-05 09:38:39.773069: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-05 09:38:39.796228: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Found cached dataset parquet (/home/ffq/.cache/huggingface/datasets/arbml___parquet/arbml--WDC-c6c8f367119243ea/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/1 [00:00<?, ?it/s]

{'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'}

9

Some weights of BertForTokenClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-bbpe and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/151082 [00:00<?, ? examples/s]

Map:   0%|          | 0/37771 [00:00<?, ? examples/s]

In [2]:
from transformers import TrainingArguments, Trainer
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

os.environ["TOKENIZERS_PARALLELISM"] = "false"

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_predictions = [tokenizer.convert_ids_to_tokens(tp) for tp in true_predictions]
    true_labels = [tokenizer.convert_ids_to_tokens(tl) for tl in true_labels]

    # results = classification_report(true_labels, true_predictions)
    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        # "report": results,
    }



epochs = 50
save_steps = 10000 #save checkpoint every 10000 steps
batch_size = 256



training_args = TrainingArguments(
    output_dir = 'bert_wp/',
    overwrite_output_dir=True,
    num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    save_steps = save_steps,
    save_total_limit = 2, 
    fp16=True,
    learning_rate = 5e-5,  # 5e-5 is the default
    # weight_decay=0.01,
    logging_steps = 500, #50_000
    evaluation_strategy = 'steps',
    # logging_dir="logs.txt",
    eval_steps = 500
    
)




# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_validation,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()


Step,Training Loss,Validation Loss,Precision,Recall,F1
500,0.2846,0.202143,0.721199,0.702151,0.711548
1000,0.1811,0.174642,0.758749,0.728112,0.743115
1500,0.1495,0.168794,0.768552,0.749487,0.7589
2000,0.136,0.202979,0.764942,0.69987,0.730961
2500,0.1083,0.177822,0.764963,0.768908,0.76693
3000,0.0916,0.183762,0.776838,0.768163,0.772476
3500,0.0755,0.180738,0.782912,0.767363,0.775059
4000,0.0638,0.190979,0.766908,0.779078,0.772945
4500,0.0554,0.204152,0.768845,0.776199,0.772504
5000,0.0479,0.218625,0.764256,0.783878,0.773943


{'eval_loss': 0.40940067172050476,
 'eval_precision': 0.7870340538718016,
 'eval_recall': 0.7896135429116959,
 'eval_f1': 0.7883216882931697,
 'eval_runtime': 25.314,
 'eval_samples_per_second': 1492.1,
 'eval_steps_per_second': 5.847,
 'epoch': 50.0}

In [3]:
# 788369