In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
!pip install transformers torch -q


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "CAMeL-Lab/bert-base-arabic-camelbert-ca-ner"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

id2label = model.config.id2label   # مثلاً {0: 'O', 1: 'B-PER', ...}
label2id = model.config.label2id

print(id2label)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/979 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-ca-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{0: 'B-LOC', 1: 'B-MISC', 2: 'B-ORG', 3: 'B-PERS', 4: 'I-LOC', 5: 'I-MISC', 6: 'I-ORG', 7: 'I-PERS', 8: 'O'}


In [5]:
def ner_predict(sentence, model, tokenizer):
    """
    ترجع list of (word, NER-tag) لجملة عربية.
    """
    words = sentence.split()

    tokens = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=128,
    )
    word_ids = tokens.word_ids()

    model.eval()
    with torch.no_grad():
        inputs = {k: v for k, v in tokens.items() if k != "token_type_ids"}
        outputs = model(**inputs)
        preds = outputs.logits.argmax(-1).squeeze(0).tolist()

    # هل مفاتيح id2label أعداد ولا سترنغ؟
    example_key = next(iter(id2label.keys()))
    keys_are_str = isinstance(example_key, str)

    word_labels = []
    for w_idx in range(len(words)):
        first_subword_index = None
        for pos, wid in enumerate(word_ids):
            if wid == w_idx:
                first_subword_index = pos
                break

        if first_subword_index is None:
            word_labels.append("O")
        else:
            idx = preds[first_subword_index]
            label = id2label[str(idx)] if keys_are_str else id2label[idx]
            word_labels.append(label)

    return list(zip(words, word_labels))


In [6]:
def pretty_print_ner(sentence, model, tokenizer):
    tagged = ner_predict(sentence, model, tokenizer)

    print("Sentence:")
    for w, tag in tagged:
        print(f"{w:<20} {tag}")
    print("*" * 60)


In [7]:
script_lines = [
    "أحمد: متى سنصل إلى إسطنبول؟",
    "ليلى: سمعت أن الشركة الجديدة في شارع الاستقلال.",
    "في المساء اجتمع الأصدقاء في مقهى صغير قرب الميناء.",
]

for i, line in enumerate(script_lines, start=1):
    print(f"\nLine {i}: {line}")
    tagged = ner_predict(line, model, tokenizer)
    for w, tag in tagged:
        if tag != "O":
            print(f"  {w:<15} -> {tag}")
    print("-" * 40)



Line 1: أحمد: متى سنصل إلى إسطنبول؟


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

  أحمد:           -> B-PERS
  إسطنبول؟        -> B-LOC
----------------------------------------

Line 2: ليلى: سمعت أن الشركة الجديدة في شارع الاستقلال.
  ليلى:           -> B-PERS
  الاستقلال.      -> B-LOC
----------------------------------------

Line 3: في المساء اجتمع الأصدقاء في مقهى صغير قرب الميناء.
----------------------------------------


In [8]:
from collections import Counter

def summarize_entities(lines, model, tokenizer):
    per_counter = Counter()
    loc_counter = Counter()
    org_counter = Counter()

    for line in lines:
        tagged = ner_predict(line, model, tokenizer)
        current_entity = []
        current_label = None

        for w, tag in tagged:
            if tag.startswith("B-"):
                # لو كان فيه كيان شغال، نسجّله أول
                if current_entity and current_label:
                    text = " ".join(current_entity)
                    if current_label == "PER":
                        per_counter[text] += 1
                    elif current_label == "LOC":
                        loc_counter[text] += 1
                    elif current_label == "ORG":
                        org_counter[text] += 1
                # نبدأ كيان جديد
                current_label = tag.split("-")[1]
                current_entity = [w]
            elif tag.startswith("I-") and current_label == tag.split("-")[1]:
                current_entity.append(w)
            else:
                if current_entity and current_label:
                    text = " ".join(current_entity)
                    if current_label == "PER":
                        per_counter[text] += 1
                    elif current_label == "LOC":
                        loc_counter[text] += 1
                    elif current_label == "ORG":
                        org_counter[text] += 1
                current_entity = []
                current_label = None

        # نهاية السطر
        if current_entity and current_label:
            text = " ".join(current_entity)
            if current_label == "PER":
                per_counter[text] += 1
            elif current_label == "LOC":
                loc_counter[text] += 1
            elif current_label == "ORG":
                org_counter[text] += 1

    return per_counter, loc_counter, org_counter


In [9]:
per_counter, loc_counter, org_counter = summarize_entities(script_lines, model, tokenizer)

print("Top PERSON entities:")
for ent, c in per_counter.most_common(10):
    print(f"{ent:<20} {c}")

print("\nTop LOCATION entities:")
for ent, c in loc_counter.most_common(10):
    print(f"{ent:<20} {c}")

print("\nTop ORGANIZATION entities:")
for ent, c in org_counter.most_common(10):
    print(f"{ent:<20} {c}")


Top PERSON entities:

Top LOCATION entities:
إسطنبول؟             1
الاستقلال.           1

Top ORGANIZATION entities:
