In [1]:

import os
os.environ["CUDA_VISIBLE_DEVICES"]="1" 

from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, concatenate_datasets
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')


# url = 'faisalq/bert-base-arabic-wordpiece'
url = 'faisalq/bert-base-arabic-senpiece'
# url = 'faisalq/bert-base-arabic-bbpe'

df = pd.read_csv('WikiFANE_2014_Gold_standard/NewsFANE_Gold_2014_170K.txt', sep='\t', encoding='utf-8', 
                 header=None, names=['Word', 'tok'], engine='python', quotechar="\x07")
df.fillna('', inplace=True)
df.loc[df['tok'] == '', 'tok'] = 'O'  # empty lines don't have a token which will acquire a new noise label
# display(df[35:50])

df['tok'] = df['tok'].astype('category')
df['label'] = df['tok'].cat.codes

# x = set(df['tok'].values)
# display(x)

num_classes = len(set(df['label'].values))
display(num_classes)

sentences = []
labels = []

current_sentence = []
current_labels = []

#convert the df into a format suitable for BERT
for _, row in df.iterrows():
    if row['Word'] != '':  #check if the word is not a sentence separator (period)
        current_sentence.append(row['Word'])
        current_labels.append(row['label'])
    else:
        sentences.append(current_sentence)
        labels.append(current_labels)        
        current_sentence = []
        current_labels = []


dfx = pd.DataFrame({
    'tokens': sentences,
    'ner_tags': labels
})

# print(dfx)

dataset = Dataset.from_pandas(dfx)

# display(dataset)



tokenizer = AutoTokenizer.from_pretrained(url)


model = AutoModelForTokenClassification.from_pretrained(url, 
                                num_labels=num_classes)

max_length = 128


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True,
                                 padding="max_length", max_length=max_length)
    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special token
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # Subword token
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs



# dataset = dataset['train']
dataset = dataset.train_test_split(test_size=0.2)

dataset_train = dataset['train']
dataset_validation = dataset['test']

display(len(dataset_train))
display(len(dataset_validation))

# display(dataset_train)
    
dataset_train = dataset_train.map(tokenize_and_align_labels, batched=True)
dataset_validation = dataset_validation.map(tokenize_and_align_labels, batched=True)





2024-02-05 23:13:24.105290: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-05 23:13:24.128974: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


88

Some weights of BertForTokenClassification were not initialized from the model checkpoint at faisalq/bert-base-arabic-senpiece and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3807

952

Map:   0%|          | 0/3807 [00:00<?, ? examples/s]

Map:   0%|          | 0/952 [00:00<?, ? examples/s]

In [2]:
from transformers import TrainingArguments, Trainer
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

os.environ["TOKENIZERS_PARALLELISM"] = "false"

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_predictions = [tokenizer.convert_ids_to_tokens(tp) for tp in true_predictions]
    true_labels = [tokenizer.convert_ids_to_tokens(tl) for tl in true_labels]

    # results = classification_report(true_labels, true_predictions)
    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        # "report": results,
    }



epochs = 70
save_steps = 10000 #save checkpoint every 10000 steps
batch_size = 256



training_args = TrainingArguments(
    output_dir = 'bert_wp/',
    overwrite_output_dir=True,
    num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    save_steps = save_steps,
    save_total_limit = 2, 
    fp16=True,
    learning_rate = 5e-5,  # 5e-5 is the default
    # weight_decay=0.01,
    logging_steps = 50, #50_000
    evaluation_strategy = 'steps',
    # logging_dir="logs.txt",
    eval_steps = 50
    
)




# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_validation,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()


Step,Training Loss,Validation Loss,Precision,Recall,F1
50,0.8443,0.199308,0.770641,0.76868,0.769659
100,0.1021,0.145795,0.822031,0.836407,0.829157
150,0.0414,0.150518,0.835283,0.851351,0.843241
200,0.0208,0.150796,0.842402,0.856598,0.84944
250,0.012,0.166978,0.833153,0.858188,0.845485
300,0.0078,0.161269,0.842784,0.86248,0.852518
350,0.0054,0.16527,0.84601,0.864706,0.855256
400,0.0038,0.172564,0.845292,0.863434,0.854267
450,0.0029,0.177359,0.841558,0.862162,0.851736
500,0.0023,0.181561,0.845686,0.861685,0.853611


{'eval_loss': 0.1926591843366623,
 'eval_precision': 0.8512072750078394,
 'eval_recall': 0.8631160572337043,
 'eval_f1': 0.8571203031259866,
 'eval_runtime': 0.692,
 'eval_samples_per_second': 1375.661,
 'eval_steps_per_second': 5.78,
 'epoch': 70.0}

In [3]:
# 942553