# How to build a **NER** model ?
1. dataset
2. framework that supports building ner models -> spacy
3. what is the shape of the dataset i need to make it work with spacy
4. build training pipeline
5. evaluate

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (1

In [2]:
from datasets import load_dataset

dataset = load_dataset("eriktks/conll2003")
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for eriktks/conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/eriktks/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
dataset["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [3]:
!python -m spacy init config config.cfg --lang en --pipeline ner

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [6]:
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")

# Convert dataset to spacy formate
def convert_conll_to_spacy_format(example):
    tokens = example['tokens']
    tags = example['ner_tags']
    labels = dataset['train'].features['ner_tags'].feature.names

    entities = []
    start = 0
    text = ""
    for token, tag in zip(tokens, tags):
        if text:
            text += " "
            start += 1
        token_start = start
        token_end = start + len(token)
        text += token
        if labels[tag] != "O":
            ent_type = labels[tag][2:]
            if labels[tag].startswith("B-"):
                entities.append([token_start, token_end, ent_type])
            elif labels[tag].startswith("I-") and entities:
                entities[-1][1] = token_end
        start = token_end

    return (text, {"entities": [tuple(ent) for ent in entities]})

In [7]:
from spacy.training import Example
from spacy.tokens import DocBin

doc_bin = DocBin()
for example in dataset["train"]:
    text, annotations = convert_conll_to_spacy_format(example)
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    doc_bin.add(example.reference)
doc_bin.to_disk("train.spacy")

for example in dataset["validation"]:
    text, annotations = convert_conll_to_spacy_format(example)
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    doc_bin.add(example.reference)
doc_bin.to_disk("dev.spacy")

In [8]:
!python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy --output ./output

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     44.28    0.00    0.00    0.00    0.00
  0     200        277.27   2996.40   51.56   53.47   49.79    0.52
  0     400        290.58   2281.65   68.98   68.74   69.22    0.69
  0     600        240.43   1997.41   76.49   77.49   75.51    0.76
  0     800        432.80   1988.48   80.59   81.30   79.89    0.81
  0    1000       1128.01   2315.70   86.51   87.15   85.87    0.87
  1    1200        461.59   2012.91   88.95   89.24   88.65    0.89
  1    1400        478.27   1745.72   90.13   89.93   90.33    0.90
  1    1600        635.11   2047.83   92.90   93.02   92.77    0.9

In [11]:
import spacy

nlp = spacy.load("./output/model-best")
doc = nlp("Hugging Face is based in New York.")
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label :{ent.label_}")

Entity: Hugging Face, Label :ORG
Entity: New York, Label :LOC


In [12]:
doc = nlp("Hi, I am Sarah and I live in New York. I have an Iphone 14pro")
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label :{ent.label_}")

Entity: Sarah, Label :PER
Entity: New York, Label :LOC
Entity: Iphone 14pro, Label :MISC
