## Downloading Data

In [1]:
!apt-get install git

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.12).
0 upgraded, 0 newly installed, 0 to remove and 22 not upgraded.


In [2]:
!git clone https://github.com/SinaLab/ArabicNER.git

Cloning into 'ArabicNER'...
remote: Enumerating objects: 610, done.[K
remote: Counting objects: 100% (42/42), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 610 (delta 24), reused 11 (delta 7), pack-reused 568 (from 1)[K
Receiving objects: 100% (610/610), 288.97 KiB | 7.81 MiB/s, done.
Resolving deltas: 100% (365/365), done.


In [3]:
!cd ArabicNER && cd data && ls

test.txt  train.txt  val.txt


In [11]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [7]:
import shutil
shutil.move("/content/ArabicNER/data/", "/content/drive/MyDrive/Wojood_NER/")

'/content/drive/MyDrive/Wojood_NER/data'

## Data Preprocessing

In [8]:
!pip install -qU datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/485.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import json
import pandas as pd
from pydantic import BaseModel, Field
from typing import List, Literal
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset, DatasetDict, load_from_disk

In [10]:
def load_ner_data(file_path):

    sentences, labels = [], []

    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read().strip()
        raw_sentences = content.split("\n\n")

        for raw_sentence in raw_sentences:
            words, sentence_labels = [], []
            for line in raw_sentence.split("\n"):
                parts = line.split()
                if len(parts) >= 2:
                    word = parts[0]
                    tag_list = parts[1:]
                    words.append(word)
                    sentence_labels.append(tag_list)

            if words and sentence_labels:
                sentences.append(words)
                labels.append(sentence_labels)

    return sentences, labels

In [11]:
train_sentences, train_labels = load_ner_data("/content/drive/MyDrive/Wojood_NER/data/train.txt")
test_sentences, test_labels = load_ner_data("/content/drive/MyDrive/Wojood_NER/data/test.txt")
val_sentences, val_labels = load_ner_data("/content/drive/MyDrive/Wojood_NER/data/val.txt")

In [12]:
train_sentences[69]

['مراسلة',
 'بلدية',
 'مدينة',
 'البيرة',
 'بخصوص',
 'مدينة',
 'البيرة',
 'ونظام',
 'التقسيمات',
 'الإدارية',
 'بتاريخ',
 '(',
 '25',
 '/',
 '1',
 '/',
 '1966',
 ')',
 '.']

In [13]:
train_labels[69]

[['B-OCC'],
 ['I-OCC', 'B-ORG'],
 ['I-OCC', 'I-ORG', 'B-GPE'],
 ['I-OCC', 'I-ORG', 'I-GPE'],
 ['O'],
 ['B-GPE'],
 ['I-GPE'],
 ['O'],
 ['O'],
 ['O'],
 ['B-DATE'],
 ['I-DATE'],
 ['I-DATE'],
 ['I-DATE'],
 ['I-DATE'],
 ['I-DATE'],
 ['I-DATE'],
 ['I-DATE'],
 ['O']]

In [14]:
def format_ner_data(sentences, labels):
    formatted_data = []

    for words, tag_lists in zip(sentences, labels):
        sentence_text = " ".join(words)
        entities = []

        for word, tag_list in zip(words, tag_lists):
            for tag in tag_list:
                if tag != "O":
                    entities.append({"entity_value": word, "entity_type": tag})

        formatted_data.append({
            "text": sentence_text,
            "entities": json.dumps({"story_entities": entities}, ensure_ascii=False)
        })

    return pd.DataFrame(formatted_data)

In [15]:
train_df = format_ner_data(train_sentences, train_labels)
test_df = format_ner_data(test_sentences, test_labels)
val_df = format_ner_data(val_sentences, val_labels)

In [16]:
train_df["text"][69]

'مراسلة بلدية مدينة البيرة بخصوص مدينة البيرة ونظام التقسيمات الإدارية بتاريخ ( 25 / 1 / 1966 ) .'

In [17]:
train_df["entities"][69]

'{"story_entities": [{"entity_value": "مراسلة", "entity_type": "B-OCC"}, {"entity_value": "بلدية", "entity_type": "I-OCC"}, {"entity_value": "بلدية", "entity_type": "B-ORG"}, {"entity_value": "مدينة", "entity_type": "I-OCC"}, {"entity_value": "مدينة", "entity_type": "I-ORG"}, {"entity_value": "مدينة", "entity_type": "B-GPE"}, {"entity_value": "البيرة", "entity_type": "I-OCC"}, {"entity_value": "البيرة", "entity_type": "I-ORG"}, {"entity_value": "البيرة", "entity_type": "I-GPE"}, {"entity_value": "مدينة", "entity_type": "B-GPE"}, {"entity_value": "البيرة", "entity_type": "I-GPE"}, {"entity_value": "بتاريخ", "entity_type": "B-DATE"}, {"entity_value": "(", "entity_type": "I-DATE"}, {"entity_value": "25", "entity_type": "I-DATE"}, {"entity_value": "/", "entity_type": "I-DATE"}, {"entity_value": "1", "entity_type": "I-DATE"}, {"entity_value": "/", "entity_type": "I-DATE"}, {"entity_value": "1966", "entity_type": "I-DATE"}, {"entity_value": ")", "entity_type": "I-DATE"}]}'

In [18]:
print(train_df.shape)
print(test_df.shape)
print(val_df.shape)

(1114, 2)
(357, 2)
(158, 2)


In [19]:
def count_total_entities(df):
    total_entities = 0

    for entity_json in df["entities"]:
        entity_data = json.loads(entity_json)
        total_entities += len(entity_data["story_entities"])

    return total_entities

In [20]:
train_entity_count = count_total_entities(train_df)
test_entity_count = count_total_entities(test_df)
val_entity_count = count_total_entities(val_df)

print(train_entity_count)
print(test_entity_count)
print(val_entity_count)

7261
2172
1140


In [21]:
train_df.head()

Unnamed: 0,text,entities
0,فقد حرصت روسيا على تعطيل مشاريع قرارات مختلفة ...,"{""story_entities"": [{""entity_value"": ""روسيا"", ..."
1,مراسلة سكرتير اللجنة القومية العربية في نابلس ...,"{""story_entities"": [{""entity_value"": ""سكرتير"",..."
2,2 - يعاقب كل من ارتكب أي من الأفعال المنصوص عل...,"{""story_entities"": [{""entity_value"": ""2"", ""ent..."
3,18 - اتخاذ كافة الإجراءات اللازمة لأجراء الاست...,"{""story_entities"": [{""entity_value"": ""18"", ""en..."
4,رسالة من وزير الداخلية الأردني إلى وزير الصحة ...,"{""story_entities"": [{""entity_value"": ""وزير"", ""..."


In [22]:
train_path = "/content/drive/MyDrive/Wojood_NER/csv_files/train.csv"
test_path = "/content/drive/MyDrive/Wojood_NER/csv_files/test.csv"
val_path = "/content/drive/MyDrive/Wojood_NER/csv_files/val.csv"

train_df.to_csv(train_path, index=False, encoding="utf-8-sig")
test_df.to_csv(test_path, index=False, encoding="utf-8-sig")
val_df.to_csv(val_path, index=False, encoding="utf-8-sig")

In [23]:
train_df = pd.concat([train_df, test_df], ignore_index=True)

In [24]:
print(train_df.shape)
print(val_df.shape)

(1471, 2)
(158, 2)


In [26]:
EntityType = Literal[
    "B-PERS", "I-PERS", "B-NORP", "I-NORP", "B-OCC", "I-OCC", "B-ORG", "I-ORG",
    "B-GPE", "I-GPE", "B-LOC", "I-LOC", "B-FAC", "I-FAC", "B-EVENT", "I-EVENT",
    "B-DATE", "I-DATE", "B-TIME", "I-TIME", "B-CARDINAL", "I-CARDINAL",
    "B-ORDINAL", "I-ORDINAL", "B-PERCENT", "I-PERCENT", "B-LANGUAGE", "I-LANGUAGE",
    "B-QUANTITY", "I-QUANTITY", "B-WEBSITE", "I-WEBSITE", "B-UNIT", "I-UNIT",
    "B-LAW", "I-LAW", "B-MONEY", "I-MONEY", "B-PRODUCT", "I-PRODUCT", "B-CURR", "I-CURR"
]

class NEREntity(BaseModel):
    entity_value: str = Field(..., description="The actual named entity found in the text.")
    entity_type: EntityType = Field(..., description="The entity type")

class NERData(BaseModel):
    story_entities: List[NEREntity] = Field(..., description="A list of entities found in the text.")

In [26]:
def convert_to_pydantic_format(entities_str):
    entities_dict = json.loads(entities_str)
    story_entities = [NEREntity(**entity) for entity in entities_dict["story_entities"]]
    return NERData(story_entities=story_entities).model_dump_json()

train_df["entities"] = train_df["entities"].apply(convert_to_pydantic_format)
val_df["entities"] = val_df["entities"].apply(convert_to_pydantic_format)

## Finetuning Data

In [27]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [28]:
def data_format(row):
    formatting_prompt = [
        {
            "role": "system",
            "content": "\n".join([
                "You are an advanced NLP entity extraction assistant.",
                "Your task is to extract named entities from Arabic text according to a given Pydantic schema.",
                "Ensure that the extracted entities exactly match how they appear in the text, without modifications.",
                "Follow the schema strictly, maintaining the correct entity types and structure.",
                "Output the extracted entities in JSON format, structured according to the provided Pydantic schema.",
                "Do not add explanations, introductions, or extra text, Only return the formatted JSON output."
            ])
        },
        {
            "role": "user",
            "content": "\n".join([
                "## Text:",
                row['text'].strip(),
                "",
                "## Pydantic Schema:",
                json.dumps(
                    NERData.model_json_schema(), ensure_ascii=False, indent=2
                ),
                "",
                "## Text Entities:",
                "```json"
            ])
        },
        {
            "role": "assistant",
            "content": row["entities"]
        }
    ]

    text = tokenizer.apply_chat_template(
        formatting_prompt,
        tokenize=False,
        add_generation_prompt=True
    )

    return text

In [29]:
train_df["text"] = train_df.apply(data_format, axis=1)
val_df["text"] = val_df.apply(data_format, axis=1)

In [30]:
train_final = train_df[["text"]]
val_final = val_df[["text"]]

In [31]:
train_final

Unnamed: 0,text
0,<|im_start|>system\nYou are an advanced NLP en...
1,<|im_start|>system\nYou are an advanced NLP en...
2,<|im_start|>system\nYou are an advanced NLP en...
3,<|im_start|>system\nYou are an advanced NLP en...
4,<|im_start|>system\nYou are an advanced NLP en...
...,...
1466,<|im_start|>system\nYou are an advanced NLP en...
1467,<|im_start|>system\nYou are an advanced NLP en...
1468,<|im_start|>system\nYou are an advanced NLP en...
1469,<|im_start|>system\nYou are an advanced NLP en...


In [32]:
train_dataset = Dataset.from_pandas(train_final)
val_dataset = Dataset.from_pandas(val_final)

In [33]:
dataset = DatasetDict({"train": train_dataset, "validation": val_dataset})
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1471
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 158
    })
})

In [34]:
train_dataset.save_to_disk("/content/drive/MyDrive/Wojood_NER/datasets/train_dataset")
val_dataset.save_to_disk("/content/drive/MyDrive/Wojood_NER/datasets/val_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/1471 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/158 [00:00<?, ? examples/s]

In [35]:
dataset.save_to_disk("/content/drive/MyDrive/Wojood_NER/dataset/")

Saving the dataset (0/1 shards):   0%|          | 0/1471 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/158 [00:00<?, ? examples/s]

## Model Evaluation

In [24]:
story = """
تتجه أنظار "وول ستريت" إلى إنفيديا، سهم شركة الرقائق الرائدة الذي كان رمزاً لطفرة الذكاء الاصطناعي والمفضل لدى المستثمرين الأفراد، قبل تقرير أرباح الشركة، بعد الجرس يوم الأربعاء.
"""

In [27]:
entities_extraction_messages = [
    {
        "role": "system",
        "content": "\n".join([
            "You are an advanced NLP entity extraction assistant.",
            "Your task is to extract named entities from Arabic text according to a given Pydantic schema.",
            "Ensure that the extracted entities exactly match how they appear in the text, without modifications.",
            "Follow the schema strictly, maintaining the correct entity types and structure.",
            "Output the extracted entities in JSON format, structured according to the provided Pydantic schema.",
            "Do not add explanations, introductions, or extra text, Only return the formatted JSON output."
        ])
    },
    {
        "role": "user",
        "content": "\n".join([
            "## Text:",
            story.strip(),
            "",
            "## Pydantic Schema:",
            json.dumps(
                NERData.model_json_schema(), ensure_ascii=False, indent=2
            ),
            "",
            "## Text Entities:",
            "```json"
        ])
    }
]

In [58]:
base_model_id = "Qwen/Qwen2.5-1.5B-Instruct"

In [41]:
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto"
)

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [67]:
text = tokenizer.apply_chat_template(
    entities_extraction_messages,
    tokenize=False,
    add_generation_prompt=True
)

model_inputs = tokenizer([text], return_tensors="pt").to("cuda")

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=1024,
    do_sample=False, top_k=None, temperature=None, top_p=None,
)

generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [68]:
response

'[\n  {\n    "entity_value": "إنفيديا",\n    "entity_type": "B-PERSON"\n  },\n  {\n    "entity_value": "رقم السهم",\n    "entity_type": "I-PERSON"\n  },\n  {\n    "entity_value": "شركة الرقائق",\n    "entity_type": "B-ORGANIZATION"\n  },\n  {\n    "entity_value": "ذكاء اصطناعي",\n    "entity_type": "I-PRODUCT"\n  },\n  {\n    "entity_value": "المستثمرين الأفراد",\n    "entity_type": "I-PERSON"\n  },\n  {\n    "entity_value": "أرباح الشركة",\n    "entity_type": "I-PRODUCT"\n  },\n  {\n    "entity_value": "الجرس",\n    "entity_type": "I-PLACE"\n  },\n  {\n    "entity_value": "الأربعاء",\n    "entity_type": "I-DAY"\n  }\n]'

In [None]:
json.loads(response)

## Unlsoth Finetuning

In [13]:
train_dataset = load_from_disk("/content/drive/MyDrive/Wojood_NER/datasets/train_dataset")
val_dataset = load_from_disk("/content/drive/MyDrive/Wojood_NER/datasets/val_dataset")

In [14]:
!pip install -qU protobuf unsloth

In [15]:
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset

In [16]:
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True
)

==((====))==  Unsloth 2025.2.15: Fast Qwen2 patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [17]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    max_seq_length = 2048,
    use_rslora = True,
    loftq_config = None,
)

In [18]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,

    train_dataset = train_dataset,
    eval_dataset = val_dataset,

    dataset_text_field = "text",
    max_seq_length = 2048,

    args = TrainingArguments(
        per_device_train_batch_size = 4,
        per_device_eval_batch_size = 4,
        gradient_accumulation_steps = 4,

        warmup_steps = 10,
        max_steps = 80,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),

        save_strategy = "steps",
        save_steps = 10,
        eval_strategy = "steps",
        eval_steps = 10,
        save_total_limit = 2,
        logging_steps = 10,

        output_dir = "/content/drive/MyDrive/Wojood_NER/model_checkpoints",
        optim = "adamw_8bit",
        seed = 3407,
        report_to="tensorboard"
    ),
)

In [19]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,471 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 80
 "-____-"     Number of trainable parameters = 18,464,768


Step,Training Loss,Validation Loss
10,1.1002,0.764631
20,0.5361,0.294163
30,0.2076,0.185448
40,0.1653,0.174022
50,0.176,0.165562
60,0.1624,0.161061
70,0.1629,0.158358
80,0.1556,0.157493


TrainOutput(global_step=80, training_loss=0.3332588583230972, metrics={'train_runtime': 1192.9913, 'train_samples_per_second': 1.073, 'train_steps_per_second': 0.067, 'total_flos': 8600102806573056.0, 'train_loss': 0.3332588583230972})

In [20]:
model.save_pretrained("/content/drive/MyDrive/Wojood_NER/model")
tokenizer.save_pretrained("/content/drive/MyDrive/Wojood_NER/model")

('/content/drive/MyDrive/Wojood_NER/model/tokenizer_config.json',
 '/content/drive/MyDrive/Wojood_NER/model/special_tokens_map.json',
 '/content/drive/MyDrive/Wojood_NER/model/vocab.json',
 '/content/drive/MyDrive/Wojood_NER/model/merges.txt',
 '/content/drive/MyDrive/Wojood_NER/model/added_tokens.json',
 '/content/drive/MyDrive/Wojood_NER/model/tokenizer.json')

In [33]:
from google.colab import userdata
HUGGINGFACE_API_KEY = userdata.get('HUGGINGFACE_API_KEY')

model.push_to_hub("AhmedNabil1/arabic_ner_qwen_model", token=HUGGINGFACE_API_KEY)
tokenizer.push_to_hub("AhmedNabil1/arabic_ner_qwen_model", token=HUGGINGFACE_API_KEY)

README.md:   0%|          | 0.00/616 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/73.9M [00:00<?, ?B/s]

Saved model to https://huggingface.co/AhmedNabil1/arabic_ner_qwen_model


tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

## Inference

In [22]:
model_path = "/content/drive/MyDrive/Wojood_NER/model"
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [23]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_path,
    max_seq_length = 2048,
    dtype = torch.float16 if torch.cuda.is_available() else None,
    load_in_4bit = True
)

model = FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2025.2.15: Fast Qwen2 patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [28]:
text = tokenizer.apply_chat_template(
    entities_extraction_messages,
    tokenize=False,
    add_generation_prompt=True
)

model_inputs = tokenizer([text], return_tensors="pt").to("cuda")

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=1024,
    do_sample=False, top_k=None, temperature=None, top_p=None,
)

generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [29]:
response

'{"story_entities":[{"entity_value":"إنفيديا","entity_type":"B-ORG"},{"entity_value":"شركة","entity_type":"B-ORG"},{"entity_value":"الرقائق","entity_type":"B-ORG"},{"entity_value":"ذكاء","entity_type":"B-PRODUCT"},{"entity_value":"الاصطناعي","entity_type":"I-PRODUCT"},{"entity_value":"المفضل","entity_type":"B-PRODUCT"},{"entity_value":"مستثمرين","entity_type":"B-PERS"},{"entity_value":"الأفراد","entity_type":"I-PERS"}]}'

In [31]:
json.loads(response)

{'story_entities': [{'entity_value': 'إنفيديا', 'entity_type': 'B-ORG'},
  {'entity_value': 'شركة', 'entity_type': 'B-ORG'},
  {'entity_value': 'الرقائق', 'entity_type': 'B-ORG'},
  {'entity_value': 'ذكاء', 'entity_type': 'B-PRODUCT'},
  {'entity_value': 'الاصطناعي', 'entity_type': 'I-PRODUCT'},
  {'entity_value': 'المفضل', 'entity_type': 'B-PRODUCT'},
  {'entity_value': 'مستثمرين', 'entity_type': 'B-PERS'},
  {'entity_value': 'الأفراد', 'entity_type': 'I-PERS'}]}