In [29]:
!pip install -U datasets accelerate seqeval transformers




In [30]:
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
import numpy as np
from seqeval.metrics import f1_score, classification_report

# Hub identifier of the corpus used throughout the notebook.
DATASET_ID = "asas-ai/ANERCorp"

# Download (or load from the local cache) every split and display the summary.
dataset = load_dataset(DATASET_ID)
dataset


DatasetDict({
    train: Dataset({
        features: ['word', 'tag'],
        num_rows: 125102
    })
    test: Dataset({
        features: ['word', 'tag'],
        num_rows: 25008
    })
})

In [31]:
# Show the first three rows: each row is a single word with its tag.
for row_idx in range(3):
    print(dataset["train"][row_idx])


{'word': 'فرانكفورت', 'tag': 'B-LOC'}
{'word': '(د', 'tag': 'O'}
{'word': 'ب', 'tag': 'O'}


In [32]:
# Tokens that close a sentence (Latin and Arabic punctuation).
SENTENCE_ENDINGS = {".", "؟", "!", "؛"}

def build_sentences(ds_split, sentence_endings=None):
    """Group a flat stream of (word, tag) rows into sentences.

    Args:
        ds_split: iterable of mappings, each with a ``"word"`` and a ``"tag"``
            entry (one row per token).
        sentence_endings: optional collection of tokens that terminate a
            sentence. Defaults to the module-level ``SENTENCE_ENDINGS``.

    Returns:
        ``(sentences, labels)``: two parallel lists. ``sentences[i]`` is the
        list of tokens of sentence ``i`` and ``labels[i]`` the matching tags.
        The terminating token is kept as the last token of its sentence.
        Tokens left over after the last terminator form a final sentence.

    Rows whose word is ``None`` or whitespace-only are dropped.
    """
    endings = SENTENCE_ENDINGS if sentence_endings is None else set(sentence_endings)

    sentences = []
    labels = []

    curr_tokens = []
    curr_tags = []

    for row in ds_split:
        w = row["word"]
        t = row["tag"]

        if w is None:
            continue

        # Use the same normalized form for the blank check and the
        # sentence-end check, so a terminator with stray whitespace (". ")
        # still closes the sentence. The token itself is stored unchanged.
        normalized = w.strip()
        if normalized == "":
            continue

        curr_tokens.append(w)
        curr_tags.append(t)

        if normalized in endings:
            sentences.append(curr_tokens)
            labels.append(curr_tags)
            curr_tokens = []
            curr_tags = []

    # Flush a trailing sentence that has no terminator.
    if curr_tokens:
        sentences.append(curr_tokens)
        labels.append(curr_tags)

    return sentences, labels

# Rebuild sentences for both splits with the same routine.
(train_texts, train_labels), (test_texts, test_labels) = (
    build_sentences(dataset[split_name]) for split_name in ("train", "test")
)

# Number of sentences recovered per split.
len(train_texts), len(test_texts)


(4279, 976)

In [33]:
# Every distinct tag that occurs in either split.
all_tags = {tag for tag_seq in train_labels + test_labels for tag in tag_seq}

# Sorted so that the tag <-> id mapping is stable from run to run.
label_list = sorted(all_tags)
label2id = dict(zip(label_list, range(len(label_list))))
id2label = dict(enumerate(label_list))

label_list, label2id


(['B-LOC',
  'B-MISC',
  'B-ORG',
  'B-PERS',
  'I-LOC',
  'I-MISC',
  'I-ORG',
  'I-PERS',
  'O'],
 {'B-LOC': 0,
  'B-MISC': 1,
  'B-ORG': 2,
  'B-PERS': 3,
  'I-LOC': 4,
  'I-MISC': 5,
  'I-ORG': 6,
  'I-PERS': 7,
  'O': 8})

In [34]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Pretrained checkpoint shared by the tokenizer and the model.
model_name = "asafaya/bert-base-arabic"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Size and naming of the token-classification head, taken from the tag set.
head_config = dict(
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

model = AutoModelForTokenClassification.from_pretrained(model_name, **head_config)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at asafaya/bert-base-arabic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
def encode_examples(texts, tags):
    """Tokenize pre-split sentences and align their tags to the sub-tokens.

    Args:
        texts: list of sentences, each a list of word strings.
        tags: list of tag sequences parallel to ``texts``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of
        label ids per sentence, one id per produced token position.

    Label rules per token position:
        * no word id (special/padding positions) -> ``-100``
          (``evaluate_model`` skips these positions)
        * first sub-token of a word -> the word's own tag
        * later sub-tokens of the same word -> the word's tag, with a
          leading ``B-`` rewritten to ``I-``
    """
    encodings = tokenizer(
        texts,
        is_split_into_words=True,
        truncation=True,
        padding=True,
        max_length=128,
    )

    def continuation_tag(tag):
        # A word may begin an entity only once; later pieces are "inside".
        return "I-" + tag[2:] if tag.startswith("B-") else tag

    aligned = []
    for row, sentence_tags in enumerate(tags):
        row_labels = []
        prev_word = None

        for word in encodings.word_ids(batch_index=row):
            if word is None:
                row_labels.append(-100)
            else:
                tag = sentence_tags[word]
                if word == prev_word:
                    tag = continuation_tag(tag)
                row_labels.append(label2id[tag])
            prev_word = word

        aligned.append(row_labels)

    encodings["labels"] = aligned
    return encodings

# Tokenize both splits and attach the aligned label ids.
# encode_examples passes padding=True and max_length=128 to the tokenizer for
# each call, so the two splits are tokenized (and padded) independently.
train_encodings = encode_examples(train_texts, train_labels)
test_encodings  = encode_examples(test_texts, test_labels)


In [36]:
from datasets import Dataset

# Wrap the encoded splits as `datasets.Dataset` objects.
train_dataset = Dataset.from_dict(train_encodings)
eval_dataset = Dataset.from_dict(test_encodings)

# Make both datasets hand out torch tensors when indexed.
for split_ds in (train_dataset, eval_dataset):
    split_ds.set_format(type="torch")


In [37]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Mini-batch iterators: only the training data is shuffled.
BATCH_SIZE = 8
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
eval_loader = DataLoader(eval_dataset, shuffle=False, batch_size=BATCH_SIZE)

LEARNING_RATE = 5e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)


In [38]:
def evaluate_model(model, data_loader, label_list):
    """Run ``model`` over ``data_loader`` and report seqeval metrics.

    Args:
        model: a module whose call returns an object with a ``logits``
            attribute; the tag dimension is the last axis.
        data_loader: iterable of batches, each a dict of tensors that
            includes a ``"labels"`` entry. Every entry is forwarded to the
            model as a keyword argument.
        label_list: sequence mapping a label id to its tag string.

    Returns:
        The F1 score computed by ``f1_score`` (also printed, together with
        the classification report), so callers can compare epochs.

    Side effects:
        Puts ``model`` in evaluation mode and prints the metrics.
    """
    model.eval()

    # Move batches to the device the evaluated model lives on, instead of
    # relying on a notebook-level `device` variable being in sync with it.
    model_device = next(model.parameters()).device

    all_true = []
    all_pred = []

    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(model_device) for k, v in batch.items()}
            logits = model(**batch).logits

            pred_rows = torch.argmax(logits, -1).cpu().tolist()
            gold_rows = batch["labels"].cpu().tolist()

            for pred_row, gold_row in zip(pred_rows, gold_rows):
                # Positions labelled -100 carry no gold tag and are skipped.
                kept = [(g, p) for p, g in zip(pred_row, gold_row) if g != -100]
                all_true.append([label_list[g] for g, _ in kept])
                all_pred.append([label_list[p] for _, p in kept])

    score = f1_score(all_true, all_pred)
    print("F1:", score)
    print(classification_report(all_true, all_pred))
    return score


In [39]:
num_epochs = 3

# Fine-tune for a fixed number of epochs, evaluating after each one.
for epoch in range(num_epochs):
    model.train()
    step_losses = []

    for batch in train_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # The batch includes "labels", so the model output carries the loss.
        loss = model(**batch).loss
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        step_losses.append(loss.item())

    # Mean training loss over the batches of this epoch.
    total_loss = sum(step_losses)
    print(f"Epoch {epoch+1} - Loss: {total_loss/len(train_loader):.4f}")
    print("Validation:")
    evaluate_model(model, eval_loader, label_list)
    print("-" * 60)


Epoch 1 - Loss: 0.1431
Validation:
F1: 0.7554300608166812
              precision    recall  f1-score   support

         LOC       0.81      0.90      0.85       675
        MISC       0.60      0.46      0.52       243
         ORG       0.64      0.63      0.64       455
        PERS       0.78      0.81      0.79       905

   micro avg       0.75      0.76      0.76      2278
   macro avg       0.71      0.70      0.70      2278
weighted avg       0.74      0.76      0.75      2278

------------------------------------------------------------
Epoch 2 - Loss: 0.0487
Validation:
F1: 0.7342851054762412
              precision    recall  f1-score   support

         LOC       0.82      0.91      0.86       675
        MISC       0.54      0.61      0.57       243
         ORG       0.58      0.72      0.64       455
        PERS       0.76      0.70      0.73       905

   micro avg       0.71      0.76      0.73      2278
   macro avg       0.68      0.73      0.70      2278
weighted

In [43]:
def ner_predict(tokens):
    """Tag one pre-split sentence with the fine-tuned model.

    Args:
        tokens: sequence of word strings forming a single sentence.

    Returns:
        A list of ``(word, tag)`` pairs, one per input word that survives
        tokenization. Each word takes the tag predicted for its first
        sub-token. An empty input yields an empty list. Because the
        tokenizer is called with ``truncation=True``, words cut off by
        truncation do not appear in the result.

    Side effects:
        Puts the notebook-level ``model`` in evaluation mode.
    """
    if len(tokens) == 0:
        return []

    # Inference must not depend on the mode left behind by an earlier cell:
    # in training mode dropout would make predictions random.
    model.eval()

    encoded = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(device)

    with torch.no_grad():
        outputs = model(**encoded)
        # Single sentence -> take row 0; plain ints make clean dict keys.
        preds = outputs.logits.argmax(-1)[0].cpu().tolist()

    # Maps every token position to the index of its source word (or None).
    word_ids = encoded.word_ids()

    results = []
    last_word = None

    for i, w in enumerate(word_ids):
        # Skip positions with no word id and non-initial sub-tokens.
        if w is None or w == last_word:
            continue

        results.append((tokens[w], id2label[preds[i]]))
        last_word = w

    return results


# Smoke test on a hand-written token list that contains a person name, a
# location, and a sentence-final "." followed by one more token.
ner_predict(["محمد", "يسكن", "في", "دمشق", ".", "الآن"])


[('محمد', 'B-PERS'),
 ('يسكن', 'O'),
 ('في', 'O'),
 ('دمشق', 'B-LOC'),
 ('.', 'O'),
 ('الآن', 'O')]