## Downloading Data

In [1]:
!apt-get install git

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.15).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [2]:
!git clone https://github.com/SinaLab/ArabicNER.git

Cloning into 'ArabicNER'...
remote: Enumerating objects: 624, done.[K
remote: Counting objects: 100% (56/56), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 624 (delta 29), reused 19 (delta 10), pack-reused 568 (from 1)[K
Receiving objects: 100% (624/624), 295.74 KiB | 4.77 MiB/s, done.
Resolving deltas: 100% (370/370), done.


In [3]:
!cd ArabicNER && cd data && ls

test.txt  train.txt  val.txt


In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
import shutil
shutil.move("/content/ArabicNER/data/", "/content/drive/MyDrive/Wojood_NER/")

'/content/drive/MyDrive/Wojood_NER/data'

## Data Preprocessing

In [2]:
!pip install -qU datasets

In [2]:
import json
import pandas as pd
from pydantic import BaseModel, Field
from typing import List, Literal
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset, DatasetDict, load_from_disk

In [8]:
def load_ner_data(file_path):

    sentences, labels = [], []

    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read().strip()
        raw_sentences = content.split("\n\n")

        for raw_sentence in raw_sentences:
            words, sentence_labels = [], []
            for line in raw_sentence.split("\n"):
                parts = line.split()
                if len(parts) >= 2:
                    word = parts[0]
                    tag_list = parts[1:]
                    words.append(word)
                    sentence_labels.append(tag_list)

            if words and sentence_labels:
                sentences.append(words)
                labels.append(sentence_labels)

    return sentences, labels

In [9]:
train_sentences, train_labels = load_ner_data("/content/drive/MyDrive/Wojood_NER/data/train.txt")
test_sentences, test_labels = load_ner_data("/content/drive/MyDrive/Wojood_NER/data/test.txt")
val_sentences, val_labels = load_ner_data("/content/drive/MyDrive/Wojood_NER/data/val.txt")

In [10]:
train_sentences[69]

['مراسلة',
 'بلدية',
 'مدينة',
 'البيرة',
 'بخصوص',
 'مدينة',
 'البيرة',
 'ونظام',
 'التقسيمات',
 'الإدارية',
 'بتاريخ',
 '(',
 '25',
 '/',
 '1',
 '/',
 '1966',
 ')',
 '.']

In [11]:
train_labels[69]

[['B-OCC'],
 ['I-OCC', 'B-ORG'],
 ['I-OCC', 'I-ORG', 'B-GPE'],
 ['I-OCC', 'I-ORG', 'I-GPE'],
 ['O'],
 ['B-GPE'],
 ['I-GPE'],
 ['O'],
 ['O'],
 ['O'],
 ['B-DATE'],
 ['I-DATE'],
 ['I-DATE'],
 ['I-DATE'],
 ['I-DATE'],
 ['I-DATE'],
 ['I-DATE'],
 ['I-DATE'],
 ['O']]

In [12]:
def show_junk_sentences(sentences, labels):
    """
    Prints sentences that are junk, e.g. just '.'
    or ['"', '$#$'] with O labels.
    """
    for i, (words, tags) in enumerate(zip(sentences, labels)):
        if len(words) == 1 and words[0] == ".":
            print(f"Sentence {i}: {words} | Labels: {tags}")

        elif words == ['"', '$#$'] and all(tag == ["O"] for tag in tags):
            print(f"Sentence {i}: {words} | Labels: {tags}")

In [13]:
show_junk_sentences(train_sentences, train_labels)

Sentence 22: ['.'] | Labels: [['O']]
Sentence 81: ['.'] | Labels: [['O']]
Sentence 82: ['.'] | Labels: [['O']]
Sentence 100: ['.'] | Labels: [['O']]
Sentence 171: ['.'] | Labels: [['O']]
Sentence 187: ['.'] | Labels: [['O']]
Sentence 249: ['.'] | Labels: [['O']]
Sentence 300: ['.'] | Labels: [['O']]
Sentence 367: ['.'] | Labels: [['O']]
Sentence 371: ['.'] | Labels: [['O']]
Sentence 422: ['.'] | Labels: [['O']]
Sentence 436: ['.'] | Labels: [['O']]
Sentence 458: ['"', '$#$'] | Labels: [['O'], ['O']]
Sentence 495: ['.'] | Labels: [['O']]
Sentence 505: ['"', '$#$'] | Labels: [['O'], ['O']]
Sentence 573: ['.'] | Labels: [['O']]
Sentence 597: ['.'] | Labels: [['O']]
Sentence 628: ['.'] | Labels: [['O']]
Sentence 632: ['.'] | Labels: [['O']]
Sentence 637: ['.'] | Labels: [['O']]
Sentence 721: ['.'] | Labels: [['O']]
Sentence 816: ['.'] | Labels: [['O']]
Sentence 841: ['.'] | Labels: [['O']]
Sentence 877: ['.'] | Labels: [['O']]
Sentence 921: ['"', '$#$'] | Labels: [['O'], ['O']]
Sentence 92

In [14]:
show_junk_sentences(test_sentences, test_labels)

Sentence 30: ['.'] | Labels: [['O']]
Sentence 97: ['.'] | Labels: [['O']]
Sentence 196: ['.'] | Labels: [['O']]
Sentence 241: ['.'] | Labels: [['O']]
Sentence 249: ['.'] | Labels: [['O']]
Sentence 256: ['"', '$#$'] | Labels: [['O'], ['O']]
Sentence 303: ['.'] | Labels: [['O']]
Sentence 312: ['.'] | Labels: [['O']]
Sentence 336: ['"', '$#$'] | Labels: [['O'], ['O']]
Sentence 347: ['.'] | Labels: [['O']]
Sentence 351: ['.'] | Labels: [['O']]


In [15]:
show_junk_sentences(val_sentences, val_labels)

Sentence 53: ['"', '$#$'] | Labels: [['O'], ['O']]
Sentence 60: ['.'] | Labels: [['O']]
Sentence 117: ['.'] | Labels: [['O']]
Sentence 124: ['"', '$#$'] | Labels: [['O'], ['O']]


In [16]:
def remove_junk(sentences, labels):
    """
    Removes junk sentences:
    - Single '.'
    - ['"', '$#$'] with all 'O' labels
    Returns cleaned sentences and labels.
    """
    clean_sentences, clean_labels = [], []

    for words, tags in zip(sentences, labels):
        # Case 1: sentence is just "."
        if len(words) == 1 and words[0] == ".":
            continue

        # Case 2: sentence is ['"', '$#$'] with all 'O' labels
        if words == ['"', '$#$'] and all(tag == ["O"] for tag in tags):
            continue

        # Keep everything else
        clean_sentences.append(words)
        clean_labels.append(tags)

    return clean_sentences, clean_labels

In [17]:
train_sentences, train_labels = remove_junk(train_sentences, train_labels)
test_sentences, test_labels = remove_junk(test_sentences, test_labels)
val_sentences, val_labels = remove_junk(val_sentences, val_labels)

In [18]:
def show_sentences_with_dollar(sentences, labels):
    """
    Prints all sentences end with '$#$'.
    """
    for i, (words, tags) in enumerate(zip(sentences, labels)):
        if words and (words[-1] == "$#$"):
            print(f"Sentence {i}: {words} | Labels: {tags}")

In [155]:
# show_sentences_with_dollar(train_sentences, train_labels)

In [19]:
def remove_trailing_dollars(sentences, labels):
    """
    Removes one or more '$#$' tokens from the end of each sentence.
    """
    clean_sentences, clean_labels = [], []

    for words, tags in zip(sentences, labels):
        # Remove trailing $#$ (could be one or many)
        while words and words[-1] == "$#$":
            words = words[:-1]
            tags = tags[:-1]

        # Only keep if something remains
        if words:
            clean_sentences.append(words)
            clean_labels.append(tags)

    return clean_sentences, clean_labels

In [20]:
train_sentences, train_labels = remove_trailing_dollars(train_sentences, train_labels)
test_sentences, test_labels = remove_trailing_dollars(test_sentences, test_labels)
val_sentences, val_labels = remove_trailing_dollars(val_sentences, val_labels)

In [21]:
def show_single_word_sentences(sentences, labels):
    """
    Prints all sentences that contain only one word/token.
    """
    for i, (words, tags) in enumerate(zip(sentences, labels)):
        if len(words) == 1:  # just one token
            print(f"Sentence {i}: {words} | Labels: {tags}")

In [22]:
show_single_word_sentences(train_sentences, train_labels)

Sentence 131: ['shopify'] | Labels: [['O']]
Sentence 311: ['الخسرانة'] | Labels: [['O']]
Sentence 474: ['المدير'] | Labels: [['B-OCC']]
Sentence 750: ['نكشات'] | Labels: [['O']]
Sentence 1077: ['"'] | Labels: [['O']]


In [23]:
show_single_word_sentences(test_sentences, test_labels)

Sentence 92: ['الأختام'] | Labels: [['O']]
Sentence 135: ['المجانية'] | Labels: [['O']]
Sentence 162: ['google'] | Labels: [['B-WEBSITE']]
Sentence 251: ['animations'] | Labels: [['O']]


In [24]:
show_single_word_sentences(val_sentences, val_labels)

Sentence 147: ['أصحابه'] | Labels: [['O']]


In [25]:
def remove_junk_single_word(sentences, labels):
    clean_sentences, clean_labels = [], []

    for words, tags in zip(sentences, labels):
        # Remove if it's exactly ['"']
        if len(words) == 1 and words[0] == '"':
            continue

        clean_sentences.append(words)
        clean_labels.append(tags)

    return clean_sentences, clean_labels

In [26]:
train_sentences, train_labels = remove_junk_single_word(train_sentences, train_labels)

In [27]:
show_single_word_sentences(train_sentences, train_labels)

Sentence 131: ['shopify'] | Labels: [['O']]
Sentence 311: ['الخسرانة'] | Labels: [['O']]
Sentence 474: ['المدير'] | Labels: [['B-OCC']]
Sentence 750: ['نكشات'] | Labels: [['O']]


In [28]:
def remove_duplicates(sentences, labels, split_name=""):
    """
    Removes duplicate (sentence, labels) pairs.
    Keeps the first occurrence.
    Prints how many were removed.
    """
    seen = set()
    new_sentences, new_labels = [], []
    removed = 0

    for words, tags in zip(sentences, labels):
        key = (tuple(words), tuple(tuple(t) for t in tags))  # make hashable
        if key not in seen:
            seen.add(key)
            new_sentences.append(words)
            new_labels.append(tags)
        else:
            removed += 1

    print(f"[{split_name}] Removed {removed} duplicates (kept {len(new_sentences)})")

    return new_sentences, new_labels

In [29]:
train_sentences, train_labels = remove_duplicates(train_sentences, train_labels, "Train")
test_sentences, test_labels = remove_duplicates(test_sentences, test_labels, "Test")
val_sentences, val_labels = remove_duplicates(val_sentences, val_labels, "Val")

[Train] Removed 0 duplicates (kept 1081)
[Test] Removed 0 duplicates (kept 346)
[Val] Removed 0 duplicates (kept 154)


In [30]:
def find_empty_entity_sentences(sentences, labels, return_examples=False, max_examples=10):
    """
    Finds sentences that have no entities (all tags are just 'O').

    sentences: list of list of tokens
    labels: list of list of list of tags (e.g. [['B-ORG'], ['O'], ...])
    """
    empty_indices = []
    examples = []

    for i, (words, tag_lists) in enumerate(zip(sentences, labels)):
        has_entity = False
        for tag_list in tag_lists:
            # tag_list is a list like ['I-OCC', 'B-ORG'] or ['O']
            if any(tag != "O" for tag in tag_list):
                has_entity = True
                break
        if not has_entity:
            empty_indices.append(i)
            if return_examples and len(examples) < max_examples:
                examples.append((i, words, tag_lists))

    print(f"Total sentences: {len(sentences)}")
    print(f"Empty sentences (no entities): {len(empty_indices)}")
    print(f"Percentage: {len(empty_indices)/len(sentences)*100:.2f}%")

    if return_examples:
        return empty_indices, examples
    return empty_indices

In [31]:
empty_idx = find_empty_entity_sentences(train_sentences, train_labels, return_examples=False)

Total sentences: 1081
Empty sentences (no entities): 233
Percentage: 21.55%


In [32]:
empty_idx = find_empty_entity_sentences(test_sentences, test_labels, return_examples=False)

Total sentences: 346
Empty sentences (no entities): 72
Percentage: 20.81%


In [33]:
empty_idx = find_empty_entity_sentences(val_sentences, val_labels, return_examples=False)

Total sentences: 154
Empty sentences (no entities): 23
Percentage: 14.94%


In [34]:
import random

def remove_empty_sentences(sentences, labels, num_to_remove, seed=42):
    """
    Removes a specified number of sentences that have no entities.

    Args:
        sentences (list[list[str]]): List of tokenized sentences.
        labels (list[list[list[str]]]): Corresponding labels.
        num_to_remove (int): Number of empty sentences to remove.
        seed (int): Random seed for reproducibility.

    Returns:
        (clean_sentences, clean_labels)
    """
    # Step 1: find indices of empty sentences
    empty_indices = []
    for i, tags in enumerate(labels):
        has_entity = any(tag != "O" for token_tags in tags for tag in token_tags)
        if not has_entity:
            empty_indices.append(i)

    # Step 2: choose which to remove
    random.seed(seed)
    to_remove = set(random.sample(empty_indices, min(num_to_remove, len(empty_indices))))

    # Step 3: build cleaned lists
    clean_sentences, clean_labels = [], []
    removed_count = 0
    for i, (words, tags) in enumerate(zip(sentences, labels)):
        if i in to_remove:
            removed_count += 1
            continue
        clean_sentences.append(words)
        clean_labels.append(tags)

    print(f"Requested removal: {num_to_remove}")
    print(f"Actually removed: {removed_count} (out of {len(empty_indices)} empty sentences)")

    return clean_sentences, clean_labels

In [35]:
# train_sentences, train_labels = remove_empty_sentences(train_sentences, train_labels, num_to_remove=150)

In [36]:
def format_ner_data(sentences, labels):

    formatted_data = []

    for words, tag_lists in zip(sentences, labels):
        sentence_text = " ".join(words)
        entities = []

        # We’ll track open entities for each entity type separately
        open_entities = {}

        for word, tag_list in zip(words, tag_lists):
            if not tag_list:  # safety check
                continue

            for tag in tag_list:
                if tag == "O":
                    # Close any open entities of all types
                    to_close = []
                    for ent_type, tokens in open_entities.items():
                        if tokens:
                            entities.append({
                                "entity_value": " ".join(tokens),
                                "entity_type": ent_type
                            })
                            to_close.append(ent_type)
                    for ent_type in to_close:
                        open_entities[ent_type] = []
                    continue

                prefix, entity_type = tag.split("-", 1)

                if prefix == "B":
                    # Close previous entity of the same type
                    if entity_type in open_entities and open_entities[entity_type]:
                        entities.append({
                            "entity_value": " ".join(open_entities[entity_type]),
                            "entity_type": entity_type
                        })
                    # Start new entity
                    open_entities[entity_type] = [word]

                elif prefix == "I":
                    if entity_type in open_entities and open_entities[entity_type]:
                        open_entities[entity_type].append(word)
                    else:
                        # Malformed case: start fresh
                        open_entities[entity_type] = [word]

        # Flush any remaining entities
        for ent_type, tokens in open_entities.items():
            if tokens:
                entities.append({
                    "entity_value": " ".join(tokens),
                    "entity_type": ent_type
                })

        formatted_data.append({
            "text": sentence_text,
            "entities": json.dumps({"story_entities": entities}, ensure_ascii=False)
            })

    return pd.DataFrame(formatted_data)

In [37]:
train_df = format_ner_data(train_sentences, train_labels)
test_df = format_ner_data(test_sentences, test_labels)
val_df = format_ner_data(val_sentences, val_labels)

In [38]:
train_df["text"][69]

'ونحث الدول بصفة خاصة على بذل كل جهد ممكن لمباشرة تحقيقات في الهجمات الموجهة ضد العاملين في مجال الرعاية الصحية والمرافق الطبية ووسائل النقل الطبي وإدانتها كافة بصفتها انتهاكات للقانون الدولي ، ولا سيما القانون الدولي الإنساني .'

In [39]:
train_df["entities"][69]

'{"story_entities": [{"entity_value": "العاملين في مجال الرعاية الصحية", "entity_type": "NORP"}, {"entity_value": "للقانون الدولي", "entity_type": "LAW"}, {"entity_value": "القانون الدولي الإنساني", "entity_type": "LAW"}]}'

In [40]:
print(train_df.shape)
print(test_df.shape)
print(val_df.shape)

(1081, 2)
(346, 2)
(154, 2)


In [41]:
def count_total_entities(df):
    total_entities = 0

    for entity_json in df["entities"]:
        entity_data = json.loads(entity_json)
        total_entities += len(entity_data["story_entities"])

    return total_entities

In [42]:
train_entity_count = count_total_entities(train_df)
test_entity_count = count_total_entities(test_df)
val_entity_count = count_total_entities(val_df)

print(train_entity_count)
print(test_entity_count)
print(val_entity_count)

2883
870
470


In [43]:
from collections import Counter
import json

def count_entities(df):
    """
    Count how many entities of each type exist in a DataFrame.
    Assumes df['entities'] is JSON string or dict with {"story_entities": [...]}
    """
    counter = Counter()

    for entity_data in df["entities"]:
        # handle JSON string vs dict
        if isinstance(entity_data, str):
            entity_data = json.loads(entity_data)
        for ent in entity_data["story_entities"]:
            counter[ent["entity_type"]] += 1

    return counter

In [44]:
count_entities(train_df)

Counter({'ORG': 594,
         'PERS': 214,
         'OCC': 174,
         'GPE': 677,
         'CARDINAL': 65,
         'DATE': 553,
         'ORDINAL': 181,
         'LAW': 20,
         'NORP': 168,
         'EVENT': 107,
         'TIME': 10,
         'WEBSITE': 21,
         'LOC': 33,
         'FAC': 30,
         'PERCENT': 1,
         'MONEY': 8,
         'CURR': 10,
         'LANGUAGE': 8,
         'PRODUCT': 3,
         'UNIT': 4,
         'QUANTITY': 2})

In [45]:
count_entities(test_df)

Counter({'PERS': 75,
         'GPE': 219,
         'EVENT': 21,
         'ORG': 194,
         'NORP': 37,
         'WEBSITE': 11,
         'DATE': 156,
         'TIME': 3,
         'MONEY': 3,
         'CURR': 3,
         'CARDINAL': 22,
         'OCC': 64,
         'FAC': 15,
         'ORDINAL': 38,
         'LAW': 4,
         'LOC': 4,
         'PERCENT': 1})

In [46]:
count_entities(val_df)

Counter({'ORG': 87,
         'OCC': 33,
         'GPE': 117,
         'PERS': 42,
         'DATE': 84,
         'UNIT': 1,
         'QUANTITY': 1,
         'PERCENT': 1,
         'WEBSITE': 3,
         'CARDINAL': 20,
         'NORP': 25,
         'TIME': 1,
         'FAC': 6,
         'ORDINAL': 25,
         'MONEY': 2,
         'CURR': 3,
         'EVENT': 12,
         'LOC': 7})

> The dataset is highly imbalanced

In [47]:
train_path = "/content/drive/MyDrive/Wojood_NER/csv_files/train.csv"
test_path = "/content/drive/MyDrive/Wojood_NER/csv_files/test.csv"
val_path = "/content/drive/MyDrive/Wojood_NER/csv_files/val.csv"

train_df.to_csv(train_path, index=False, encoding="utf-8-sig")
test_df.to_csv(test_path, index=False, encoding="utf-8-sig")
val_df.to_csv(val_path, index=False, encoding="utf-8-sig")

In [48]:
print(train_df.shape)
print(test_df.shape)
print(val_df.shape)

(1081, 2)
(346, 2)
(154, 2)


In [3]:
EntityType = Literal[
    "PERS", "NORP", "OCC", "ORG", "GPE", "LOC", "FAC", "EVENT",
    "DATE", "TIME", "CARDINAL", "ORDINAL", "PERCENT", "LANGUAGE",
    "QUANTITY", "WEBSITE", "UNIT", "LAW", "MONEY", "PRODUCT", "CURR"
]

class NEREntity(BaseModel):
    entity_value: str = Field(..., description="The actual named entity found in the text.")
    entity_type: EntityType = Field(..., description="The entity type")

class NERData(BaseModel):
    story_entities: List[NEREntity] = Field(..., description="A list of entities found in the text.")

In [50]:
def convert_to_pydantic_format(entities_str):
    entities_dict = json.loads(entities_str)
    story_entities = [NEREntity(**entity) for entity in entities_dict["story_entities"]]
    return NERData(story_entities=story_entities).model_dump_json()

train_df["entities"] = train_df["entities"].apply(convert_to_pydantic_format)
test_df["entities"] = test_df["entities"].apply(convert_to_pydantic_format)
val_df["entities"] = val_df["entities"].apply(convert_to_pydantic_format)

In [51]:
train_df['text'][69]

'ونحث الدول بصفة خاصة على بذل كل جهد ممكن لمباشرة تحقيقات في الهجمات الموجهة ضد العاملين في مجال الرعاية الصحية والمرافق الطبية ووسائل النقل الطبي وإدانتها كافة بصفتها انتهاكات للقانون الدولي ، ولا سيما القانون الدولي الإنساني .'

In [52]:
json.loads(train_df['entities'][69])

{'story_entities': [{'entity_value': 'العاملين في مجال الرعاية الصحية',
   'entity_type': 'NORP'},
  {'entity_value': 'للقانون الدولي', 'entity_type': 'LAW'},
  {'entity_value': 'القانون الدولي الإنساني', 'entity_type': 'LAW'}]}

## Finetuning Data

In [53]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [54]:
def data_format(row):
    formatting_prompt = [
        {
            "role": "system",
            "content": "\n".join([
                "You are an advanced NLP entity extraction assistant.",
                "Your task is to extract named entities from Arabic text according to a given Pydantic schema.",
                "Ensure that the extracted entities exactly match how they appear in the text, without modifications.",
                "Follow the schema strictly, maintaining the correct entity types and structure.",
                "Output the extracted entities in JSON format, structured according to the provided Pydantic schema.",
                "Do not add explanations, introductions, or extra text, Only return the formatted JSON output."
            ])
        },
        {
            "role": "user",
            "content": "\n".join([
                "## Text:",
                row['text'].strip(),
                "",
                "## Pydantic Schema:",
                json.dumps(
                    NERData.model_json_schema(), ensure_ascii=False, indent=2
                ),
                "",
                "## Text Entities:",
                "```json"
            ])
        },
        {
            "role": "assistant",
            "content": row["entities"]
        }
    ]

    text = tokenizer.apply_chat_template(
        formatting_prompt,
        tokenize=False,
        add_generation_prompt=True
    )

    return text

In [55]:
train_df["text"] = train_df.apply(data_format, axis=1)
test_df["text"] = test_df.apply(data_format, axis=1)
val_df["text"] = val_df.apply(data_format, axis=1)

In [56]:
train_final = train_df[["text"]]
test_final = test_df[["text"]]
val_final = val_df[["text"]]

In [58]:
train_final['text'][0]

'<|im_start|>system\nYou are an advanced NLP entity extraction assistant.\nYour task is to extract named entities from Arabic text according to a given Pydantic schema.\nEnsure that the extracted entities exactly match how they appear in the text, without modifications.\nFollow the schema strictly, maintaining the correct entity types and structure.\nOutput the extracted entities in JSON format, structured according to the provided Pydantic schema.\nDo not add explanations, introductions, or extra text, Only return the formatted JSON output.<|im_end|>\n<|im_start|>user\n## Text:\nفقد حرصت روسيا على تعطيل مشاريع قرارات مختلفة في مجلس الأمن ضد الأسد وتدخلت عسكرياً لقلب الكفة لصالحه ،\n\n## Pydantic Schema:\n{\n  "$defs": {\n    "NEREntity": {\n      "properties": {\n        "entity_value": {\n          "description": "The actual named entity found in the text.",\n          "title": "Entity Value",\n          "type": "string"\n        },\n        "entity_type": {\n          "description":

In [59]:
train_dataset = Dataset.from_pandas(train_final)
test_dataset = Dataset.from_pandas(test_final)
val_dataset = Dataset.from_pandas(val_final)

In [60]:
dataset = DatasetDict({"train": train_dataset, "test": test_dataset, "validation": val_dataset})
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1081
    })
    test: Dataset({
        features: ['text'],
        num_rows: 346
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 154
    })
})

In [61]:
train_dataset.save_to_disk("/content/drive/MyDrive/Wojood_NER/datasets/train_dataset")
test_dataset.save_to_disk("/content/drive/MyDrive/Wojood_NER/datasets/test_dataset")
val_dataset.save_to_disk("/content/drive/MyDrive/Wojood_NER/datasets/val_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/1081 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/346 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/154 [00:00<?, ? examples/s]

In [62]:
dataset.save_to_disk("/content/drive/MyDrive/Wojood_NER/dataset/")

Saving the dataset (0/1 shards):   0%|          | 0/1081 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/346 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/154 [00:00<?, ? examples/s]

## Model Evaluation

In [63]:
base_model_id = "Qwen/Qwen2.5-0.5B-Instruct"

In [64]:
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto"
)

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [65]:
def extract_entities_from_story(story, model, tokenizer):
    """
    Extract named entities from Arabic text using the fine-tuned model.

    Args:
        story (str): The Arabic text to extract entities from
        model: The fine-tuned model
        tokenizer: The tokenizer

    Returns:
        str: The model's response containing extracted entities in JSON format
    """
    # Create the messages for entity extraction
    entities_extraction_messages = [
        {
            "role": "system",
            "content": "\n".join([
                "You are an advanced NLP entity extraction assistant.",
                "Your task is to extract named entities from Arabic text according to a given Pydantic schema.",
                "Ensure that the extracted entities exactly match how they appear in the text, without modifications.",
                "Follow the schema strictly, maintaining the correct entity types and structure.",
                "Output the extracted entities in JSON format, structured according to the provided Pydantic schema.",
                "Do not add explanations, introductions, or extra text, Only return the formatted JSON output."
            ])
        },
        {
            "role": "user",
            "content": "\n".join([
                "## Text:",
                story.strip(),
                "",
                "## Pydantic Schema:",
                json.dumps(
                    NERData.model_json_schema(), ensure_ascii=False, indent=2
                ),
                "",
                "## Text Entities:",
                "```json"
            ])
        }
    ]

    # Apply chat template
    text = tokenizer.apply_chat_template(
        entities_extraction_messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize and generate
    model_inputs = tokenizer([text], return_tensors="pt").to("cuda")
    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=1024,
        do_sample=False,
        top_k=None,
        temperature=None,
        top_p=None,
    )

    # Decode response
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return response

In [66]:
story = """
المحكمة العليا الإسرائيلية تقرر استمرار عمل الكسّارات الإسرائيلية في الضفة الغربية بتاريخ ( 29 / 12 / 2011 ) .
"""

response = extract_entities_from_story(story, model, tokenizer)
response

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'```json\n{\n  "entity_value": "المحكمة العليا الإسرائيلية",\n  "entity_type": "ORG"\n}\n```'

In [67]:
json.loads(response)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [68]:
story = """
مضابط بلدية نابلس عام ( 1308 ) هجري مضبط رقم 435 .
"""

response = extract_entities_from_story(story, model, tokenizer)
response

'```json\n{\n  "entity_value": "بلدية نابلس عام (1308) هجري مضبط رقم 435",\n  "entity_type": "LOCATION"\n}\n```json'

In [69]:
json.loads(response)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [70]:
story = """
تتجه أنظار "وول ستريت" إلى إنفيديا، سهم شركة الرقائق الرائدة الذي كان رمزاً لطفرة الذكاء الاصطناعي والمفضل لدى المستثمرين الأفراد، قبل تقرير أرباح الشركة، بعد الجرس يوم الأربعاء.
"""

response = extract_entities_from_story(story, model, tokenizer)
response

'```json\n{\n  "entity_value": "وول ستريت",\n  "entity_type": "ORG"\n}\n```'

In [71]:
json.loads(response)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

## Unlsoth Finetuning

In [5]:
!pip install protobuf==5.29.5



In [12]:
!pip install -U bitsandbytes

In [7]:
!pip install -qU unsloth

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m544.8/544.8 kB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m205.4/205.4 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.2/117.2 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.7/131.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.6/213.6 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
train_dataset = load_from_disk("/content/drive/MyDrive/Wojood_NER/datasets/train_dataset")
val_dataset = load_from_disk("/content/drive/MyDrive/Wojood_NER/datasets/val_dataset")

In [5]:
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [6]:
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)

==((====))==  Unsloth 2025.9.2: Fast Qwen2 patching. Transformers: 4.56.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [7]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = False,
    random_state = 3407,
    max_seq_length = 2048,
    use_rslora = True,
    loftq_config = None,
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.9.2 patched 24 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [8]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,

    train_dataset = train_dataset,
    eval_dataset = val_dataset,

    dataset_text_field = "text",
    max_seq_length = 1024,

    args = TrainingArguments(
        per_device_train_batch_size = 4,
        per_device_eval_batch_size = 8,
        gradient_accumulation_steps = 4,

        num_train_epochs = 1,
        warmup_steps = 10,

        learning_rate=7e-4,
        lr_scheduler_type = "cosine",

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),

        weight_decay=0.05,
        max_grad_norm = 1.0,

        save_strategy = "epoch",
        save_total_limit = 1,
        eval_strategy = "steps",
        eval_steps = 10,
        logging_steps = 10,

        output_dir = "/content/drive/MyDrive/Wojood_NER/model_checkpoints",
        optim = "adamw_8bit",
        seed = 3407,
        report_to="tensorboard",
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/1081 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/154 [00:00<?, ? examples/s]

In [9]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,081 | Num Epochs = 1 | Total steps = 68
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 8,798,208 of 502,830,976 (1.75% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
10,0.7976,0.274811
20,0.2592,0.31629
30,0.2745,0.238532
40,0.2428,0.225294
50,0.2637,0.217651
60,0.2143,0.210324


Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


TrainOutput(global_step=68, training_loss=0.3262525937136482, metrics={'train_runtime': 295.0887, 'train_samples_per_second': 3.663, 'train_steps_per_second': 0.23, 'total_flos': 1396413322621440.0, 'train_loss': 0.3262525937136482, 'epoch': 1.0})

In [10]:
model.save_pretrained("/content/drive/MyDrive/Wojood_NER/model")
tokenizer.save_pretrained("/content/drive/MyDrive/Wojood_NER/model")

('/content/drive/MyDrive/Wojood_NER/model/tokenizer_config.json',
 '/content/drive/MyDrive/Wojood_NER/model/special_tokens_map.json',
 '/content/drive/MyDrive/Wojood_NER/model/chat_template.jinja',
 '/content/drive/MyDrive/Wojood_NER/model/vocab.json',
 '/content/drive/MyDrive/Wojood_NER/model/merges.txt',
 '/content/drive/MyDrive/Wojood_NER/model/added_tokens.json',
 '/content/drive/MyDrive/Wojood_NER/model/tokenizer.json')

In [11]:
from google.colab import userdata
HUGGINGFACE_API_KEY = userdata.get('HUGGINGFACE_API_KEY')

model.push_to_hub("AhmedNabil1/arabic_ner_qwen_model", token=HUGGINGFACE_API_KEY)
tokenizer.push_to_hub("AhmedNabil1/arabic_ner_qwen_model", token=HUGGINGFACE_API_KEY)

README.md:   0%|          | 0.00/616 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/35.2M [00:00<?, ?B/s]

Saved model to https://huggingface.co/AhmedNabil1/arabic_ner_qwen_model


## Inference

In [12]:
model_path = "/content/drive/MyDrive/Wojood_NER/model"
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [13]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_path,
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)

model = FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2025.9.2: Fast Qwen2 patching. Transformers: 4.56.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [14]:
def extract_entities_from_story(story, model, tokenizer):
    """
    Extract named entities from Arabic text using the fine-tuned model.

    Args:
        story (str): The Arabic text to extract entities from
        model: The fine-tuned model
        tokenizer: The tokenizer

    Returns:
        str: The model's response containing extracted entities in JSON format
    """
    # Create the messages for entity extraction
    entities_extraction_messages = [
        {
            "role": "system",
            "content": "\n".join([
                "You are an advanced NLP entity extraction assistant.",
                "Your task is to extract named entities from Arabic text according to a given Pydantic schema.",
                "Ensure that the extracted entities exactly match how they appear in the text, without modifications.",
                "Follow the schema strictly, maintaining the correct entity types and structure.",
                "Output the extracted entities in JSON format, structured according to the provided Pydantic schema.",
                "Do not add explanations, introductions, or extra text, Only return the formatted JSON output."
            ])
        },
        {
            "role": "user",
            "content": "\n".join([
                "## Text:",
                story.strip(),
                "",
                "## Pydantic Schema:",
                json.dumps(
                    NERData.model_json_schema(), ensure_ascii=False, indent=2
                ),
                "",
                "## Text Entities:",
                "```json"
            ])
        }
    ]

    # Apply chat template
    text = tokenizer.apply_chat_template(
        entities_extraction_messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize and generate
    model_inputs = tokenizer([text], return_tensors="pt").to("cuda")
    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=1024,
        do_sample=False,
        top_k=None,
        temperature=None,
        top_p=None,
    )

    # Decode response
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return response

In [15]:
story = """
المحكمة العليا الإسرائيلية تقرر استمرار عمل الكسّارات الإسرائيلية في الضفة الغربية بتاريخ ( 29 / 12 / 2011 ) .
"""

response = extract_entities_from_story(story, model, tokenizer)
response

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'{"story_entities":[{"entity_value":"المحكمة العليا الإسرائيلية","entity_type":"ORG"},{"entity_value":"فلسطيني","entity_type":"NORP"},{"entity_value":"بتاريخ ( 29 / 12 / 2011 )","entity_type":"DATE"}]}'

In [16]:
json.loads(response)

{'story_entities': [{'entity_value': 'المحكمة العليا الإسرائيلية',
   'entity_type': 'ORG'},
  {'entity_value': 'فلسطيني', 'entity_type': 'NORP'},
  {'entity_value': 'بتاريخ ( 29 / 12 / 2011 )', 'entity_type': 'DATE'}]}

In [17]:
story = """
مضابط بلدية نابلس عام ( 1308 ) هجري مضبط رقم 435 .
"""

response = extract_entities_from_story(story, model, tokenizer)
response

'{"story_entities":[{"entity_value":"بلدية نابلس","entity_type":"ORG"},{"entity_value":"نابلس","entity_type":"GPE"},{"entity_value":"عام ( 1308 ) هجري","entity_type":"DATE"},{"entity_value":"435","entity_type":"ORDINAL"}]}'

In [18]:
json.loads(response)

{'story_entities': [{'entity_value': 'بلدية نابلس', 'entity_type': 'ORG'},
  {'entity_value': 'نابلس', 'entity_type': 'GPE'},
  {'entity_value': 'عام ( 1308 ) هجري', 'entity_type': 'DATE'},
  {'entity_value': '435', 'entity_type': 'ORDINAL'}]}

In [19]:
story = """
تتجه أنظار "وول ستريت" إلى إنفيديا، سهم شركة الرقائق الرائدة الذي كان رمزاً لطفرة الذكاء الاصطناعي والمفضل لدى المستثمرين الأفراد، قبل تقرير أرباح الشركة، بعد الجرس يوم الأربعاء.
"""

response = extract_entities_from_story(story, model, tokenizer)
response

'{"story_entities":[{"entity_value":"وول ستريت","entity_type":"ORG"},{"entity_value":"إنفيديا","entity_type":"ORG"},{"entity_value":"الذين يُعرفون باسم ذوي الأقلية","entity_type":"NORP"}]}'

In [20]:
json.loads(response)

{'story_entities': [{'entity_value': 'وول ستريت', 'entity_type': 'ORG'},
  {'entity_value': 'إنفيديا', 'entity_type': 'ORG'},
  {'entity_value': 'الذين يُعرفون باسم ذوي الأقلية', 'entity_type': 'NORP'}]}