In [12]:
import spacy
from spacy.training.example import Example
from spacy.tokens import DocBin

nlp = spacy.blank("en")  # Create a blank English model
ner = nlp.add_pipe("ner")  # Add NER pipeline

# Define training data (sample)
TRAIN_DATA = [
    ("Apple is looking at buying U.K. startup for $1 billion", 
     {"entities": [(0, 5, "ORG"), (27, 31, "LOC"), (44, 54, "MONEY")]}),

    ("Elon Musk founded SpaceX", 
     {"entities": [(0, 9, "PERSON"), (18, 24, "ORG")]}),

    ("Tesla is opening a new plant in Texas", 
     {"entities": [(0, 5, "ORG"), (34, 39, "LOC")]}),  # Added Texas as GPE

    ("Amazon's headquarters is in Seattle", 
     {"entities": [(0, 6, "ORG"), (27, 34, "LOC")]}),

    ("Google is based in California", 
     {"entities": [(0, 6, "ORG"), (17, 27, "LOC")]}),


]

# Add labels to the model
for _, annotations in TRAIN_DATA:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])

# Convert data into spaCy format
db = DocBin()
for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    ents = [doc.char_span(start, end, label) for start, end, label in annotations["entities"]]
    doc.ents = [ent for ent in ents if ent is not None]
    db.add(doc)

db.to_disk("./train.spacy")  # Save the formatted training data


In [13]:
import os
print("File exists:", os.path.exists("./train.spacy"))


File exists: True


In [14]:
db = DocBin().from_disk("./train.spacy")
docs = list(db.get_docs(nlp.vocab))

for doc in docs:
    print([(ent.text, ent.label_) for ent in doc.ents])


[('Apple', 'ORG'), ('U.K.', 'LOC'), ('$1 billion', 'MONEY')]
[('Elon Musk', 'PERSON'), ('SpaceX', 'ORG')]
[('Tesla', 'ORG')]
[('Amazon', 'ORG')]
[('Google', 'ORG')]


In [15]:
import random

nlp.begin_training()
for epoch in range(20):  # Adjust epochs as needed
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], losses=losses)
    print(f"Epoch {epoch+1}, Loss: {losses}")




Epoch 1, Loss: {'ner': 27.67866127938032}
Epoch 2, Loss: {'ner': 20.772579601034522}
Epoch 3, Loss: {'ner': 10.623432686115848}
Epoch 4, Loss: {'ner': 10.884079404712537}
Epoch 5, Loss: {'ner': 7.701406372836451}
Epoch 6, Loss: {'ner': 5.871904224225555}
Epoch 7, Loss: {'ner': 4.1351512459421285}
Epoch 8, Loss: {'ner': 2.024194936417084}
Epoch 9, Loss: {'ner': 1.2571193380157102}
Epoch 10, Loss: {'ner': 0.2827654943323666}
Epoch 11, Loss: {'ner': 0.007764039087490088}
Epoch 12, Loss: {'ner': 0.003119634221266556}
Epoch 13, Loss: {'ner': 5.925263782684832e-05}
Epoch 14, Loss: {'ner': 6.018586171279166e-06}
Epoch 15, Loss: {'ner': 3.2223673323862414e-06}
Epoch 16, Loss: {'ner': 6.277056919783784e-07}
Epoch 17, Loss: {'ner': 4.1590309486718154e-07}
Epoch 18, Loss: {'ner': 6.857662104340117e-07}
Epoch 19, Loss: {'ner': 2.1894220469427437e-07}
Epoch 20, Loss: {'ner': 1.4637296083601193e-06}


In [36]:
import spacy

nlp = spacy.load("en_core_web_sm")  # Pre-trained model
doc = nlp("Elon Musk is buying a Redmi showroom in India by 2026")

for ent in doc.ents:
    print(ent.text, ent.label_)


Elon Musk PERSON
Redmi ORG
India GPE
2026 DATE


In [35]:
import spacy

nlp = spacy.load("en_core_web_sm")  # Pre-trained model
doc = nlp("Tesla is looking at buying U.K. startup by Steve Jobs for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.label_)


Tesla ORG
U.K. GPE
Steve Jobs PERSON
$1 billion MONEY
