In [1]:
!pip install spacy datasets
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
# Cell 3: Setup and Data Loading
import os
from pathlib import Path
import spacy
from spacy.tokens import DocBin
from spacy.training import Example

# Dataset paths (Kaggle)
dataset_path = Path("/kaggle/input/ner-conll2003-dataset")
train_file = dataset_path / "eng.train"
val_file = dataset_path / "eng.testa"
test_file = dataset_path / "eng.testb"

# Verify files
print("Files found:")
print(f"- Train: {train_file.exists()}")
print(f"- Val: {val_file.exists()}")
print(f"- Test: {test_file.exists()}")

Files found:
- Train: False
- Val: False
- Test: False


In [5]:
import os

# Define dataset paths (update these paths based on your dataset location)
train_file = "/kaggle/input/conll2003-dataset/conll2003/eng.train"
val_file = "/kaggle/input/conll2003-dataset/conll2003/eng.testa"
test_file = "/kaggle/input/conll2003-dataset/conll2003/eng.testb"

# Check if files exist
for file in [train_file, val_file, test_file]:
    print(f"File found: {os.path.exists(file)} - {file}")

# Print first 20 lines of the train file to inspect the format
with open(train_file, "r", encoding="utf-8") as file:
    lines = file.readlines()

# Print first few lines
print("\n".join(lines[:20]))


File found: True - /kaggle/input/conll2003-dataset/conll2003/eng.train
File found: True - /kaggle/input/conll2003-dataset/conll2003/eng.testa
File found: True - /kaggle/input/conll2003-dataset/conll2003/eng.testb
-DOCSTART- -X- -X- O



EU NNP B-NP B-ORG

rejects VBZ B-VP O

German JJ B-NP B-MISC

call NN I-NP O

to TO B-VP O

boycott VB I-VP O

British JJ B-NP B-MISC

lamb NN I-NP O

. . O O



Peter NNP B-NP B-PER

Blackburn NNP I-NP I-PER



BRUSSELS NNP B-NP B-LOC

1996-08-22 CD I-NP O



The DT B-NP O

European NNP I-NP B-ORG



In [6]:
def read_conll_file(filepath):
    """Reads a CoNLL-2003 formatted file and extracts sentences with entity labels."""
    sentences = []
    sentence = []
    
    with open(filepath, "r", encoding="utf-8") as file:
        for line in file:
            if line.strip() == "":
                if sentence:  
                    sentences.append(sentence)
                    sentence = []
            else:
                parts = line.split()
                if len(parts) > 3:  
                    word, tag = parts[0], parts[-1]  # Word and its entity tag
                    sentence.append((word, tag))
    
    return sentences

# Read the train and validation datasets
train_sentences = read_conll_file(train_file)
val_sentences = read_conll_file(val_file)

# Print a sample of the extracted data
print("Sample extracted sentence with labels:")
print(train_sentences[:3])  # Print first 3 sentences


Sample extracted sentence with labels:
[[('-DOCSTART-', 'O')], [('EU', 'B-ORG'), ('rejects', 'O'), ('German', 'B-MISC'), ('call', 'O'), ('to', 'O'), ('boycott', 'O'), ('British', 'B-MISC'), ('lamb', 'O'), ('.', 'O')], [('Peter', 'B-PER'), ('Blackburn', 'I-PER')]]


In [7]:
import spacy
from spacy.tokens import DocBin

# Load an empty English spaCy model
nlp = spacy.blank("en")

def convert_to_spacy_format(sentences):
    """Converts tokenized sentences into spaCy's binary training format."""
    doc_bin = DocBin()
    
    for sentence in sentences:
        words, labels = zip(*sentence)  # Separate words and labels
        text = " ".join(words)  # Reconstruct sentence
        doc = nlp.make_doc(text)  # Create spaCy doc object
        entities = []
        start = 0

        for i, word in enumerate(words):
            end = start + len(word)

            if labels[i] != "O":  # Ignore non-entity words
                label_type = labels[i].split("-")[-1]  # Extract label type
                span = doc.char_span(start, end, label=label_type)

                if span:  # Avoid None values
                    entities.append(span)

            start = end + 1  # Move to the next word
        
        doc.ents = entities  # Assign entities
        doc_bin.add(doc)  # Add doc to the DocBin
    
    return doc_bin

# Convert datasets
train_data = convert_to_spacy_format(train_sentences)
val_data = convert_to_spacy_format(val_sentences)

# Save as binary files for training
train_data.to_disk("./train.spacy")
val_data.to_disk("./val.spacy")
print("✅ Data successfully converted to spaCy format!")


✅ Data successfully converted to spaCy format!


In [8]:
import spacy
from spacy.training import Example
from spacy.tokens import DocBin

# Load an empty English spaCy model
nlp = spacy.blank("en")

# Add Named Entity Recognition (NER) pipeline
ner = nlp.add_pipe("ner")

# Load training data
train_data = DocBin().from_disk("./train.spacy")
val_data = DocBin().from_disk("./val.spacy")

# Convert DocBin to Example format
train_examples = []
for doc in train_data.get_docs(nlp.vocab):
    train_examples.append(Example.from_dict(nlp.make_doc(doc.text), {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]}))

# Add entity labels to NER
for example in train_examples:
    for ent in example.reference.ents:
        ner.add_label(ent.label_)

# Training configuration
optimizer = nlp.begin_training()
n_iter = 5  # Number of epochs

for epoch in range(n_iter):
    losses = {}
    for batch in spacy.util.minibatch(train_examples, size=8):
        nlp.update(batch, losses=losses, drop=0.5)
    print(f"Epoch {epoch+1}/{n_iter}, Loss: {losses['ner']:.4f}")

# Save the trained model
output_dir = "./ner_model"
nlp.to_disk(output_dir)
print(f"✅ Model training completed and saved to {output_dir}")


[2025-04-02 07:24:21,849] [INFO] Created vocabulary
[2025-04-02 07:24:21,851] [INFO] Finished initializing nlp object
  d_xhat = N * dY - sum_dy - dist * var ** (-1.0) * sum_dy_dist


Epoch 1/5, Loss: 28519.2202
Epoch 2/5, Loss: 16901.2711
Epoch 3/5, Loss: 13308.3688
Epoch 4/5, Loss: 11350.4216
Epoch 5/5, Loss: 10264.6780
✅ Model training completed and saved to ./ner_model


In [9]:
import spacy

# Load the trained model
output_dir = "./ner_model"
nlp_ner = spacy.load(output_dir)

# Test sentences
test_sentences = [
    "Barack Obama was the president of the United States.",
    "Apple Inc. is based in Cupertino, California.",
    "Lionel Messi plays for Inter Miami CF.",
    "Google was founded by Larry Page and Sergey Brin."
]

# Run NER on test sentences
for sentence in test_sentences:
    doc = nlp_ner(sentence)
    print(f"\nText: {sentence}")
    print("Entities:")
    for ent in doc.ents:
        print(f"  {ent.text} - {ent.label_}")



Text: Barack Obama was the president of the United States.
Entities:
  Barack - PER
  Obama - PER
  United - LOC
  States - LOC

Text: Apple Inc. is based in Cupertino, California.
Entities:
  Apple - ORG
  Inc. - ORG
  Cupertino - PER
  California - LOC

Text: Lionel Messi plays for Inter Miami CF.
Entities:
  Lionel - PER
  Messi - PER
  Inter - ORG
  Miami - ORG
  CF - ORG

Text: Google was founded by Larry Page and Sergey Brin.
Entities:
  Larry - PER
  Page - PER
  Sergey - PER
  Brin - PER


In [10]:
import spacy
from spacy.scorer import Scorer
from spacy.training import Example
from spacy.tokens import DocBin

# Load trained model
output_dir = "./ner_model"
nlp_ner = spacy.load(output_dir)

# Load validation data
val_data = DocBin().from_disk("./val.spacy")

# Convert validation data into examples
examples = []
for doc in val_data.get_docs(nlp_ner.vocab):
    example = Example(nlp_ner(doc.text), doc)
    examples.append(example)

# Evaluate model
scorer = Scorer()
scores = scorer.score(examples)

# Calculate accuracy manually
correct_predictions = scores["ents_per_type"]  # Contains entity-wise correct/missed counts
total_correct = sum([v["p"] * v["r"] / v["f"] if v["f"] > 0 else 0 for v in correct_predictions.values()])  # Approximate correct entities
total_predictions = sum([v["p"] for v in correct_predictions.values()])  # All predicted entities

accuracy = total_correct / total_predictions if total_predictions > 0 else 0.0

# Print evaluation metrics
print(f"Precision: {scores['ents_p']:.4f}")
print(f"Recall: {scores['ents_r']:.4f}")
print(f"F1-score: {scores['ents_f']:.4f}")
print(f"Accuracy: {accuracy:.4f}")


Precision: 0.8635
Recall: 0.8534
F1-score: 0.8584
Accuracy: 0.9854
