In [16]:
import spacy
from spacy.tokens import Doc
from spacy.training import Example
from datasets import load_dataset
from tqdm import tqdm
import random
import os
import numpy as np
from spacy.util import minibatch, compounding

# Load dataset
dataset = load_dataset("conll2003")
print("Dataset structure:", dataset)

# Define NER label mapping (from numeric to string format that spaCy uses)
# Get label names directly from the dataset
label_list = dataset["train"].features["ner_tags"].feature.names
print("Labels:", label_list)

# Function to convert tokens and tags to spaCy examples
def create_spacy_examples(nlp, dataset_split, max_samples=None):
    examples = []
    
    sample_count = len(dataset_split) if max_samples is None else min(max_samples, len(dataset_split))
    
    for i in tqdm(range(sample_count), desc=f"Processing {sample_count} examples"):
        item = dataset_split[i]
        tokens = item["tokens"]
        ner_tags = item["ner_tags"]
        
        # Create a spaCy Doc with the tokens
        # We need to set proper spaces to ensure correct character offsets
        spaces = [True] * len(tokens)
        if spaces:  # Make sure the last token doesn't have a trailing space
            spaces[-1] = False
        
        doc = Doc(nlp.vocab, words=tokens, spaces=spaces)
        
        # Extract entities based on BIO tags
        entities = []
        current_entity = None
        
        for token_idx, (token, tag_id) in enumerate(zip(doc, ner_tags)):
            tag = label_list[tag_id]
            
            # Starting a new entity
            if tag.startswith("B-"):
                if current_entity is not None:
                    # Add the previous entity
                    entities.append(current_entity)
                # Start a new entity
                current_entity = {
                    "start": token.idx,
                    "end": token.idx + len(token.text),
                    "label": tag[2:]  # Remove the "B-" prefix
                }
            # Inside an entity
            elif tag.startswith("I-") and current_entity is not None:
                # Only extend if it's the same entity type
                if current_entity["label"] == tag[2:]:
                    current_entity["end"] = token.idx + len(token.text)
                else:
                    # If entity type has changed, add the previous one and start a new one
                    entities.append(current_entity)
                    current_entity = {
                        "start": token.idx,
                        "end": token.idx + len(token.text),
                        "label": tag[2:]
                    }
            # Outside any entity
            elif tag == "O":
                if current_entity is not None:
                    entities.append(current_entity)
                    current_entity = None
        
        # Add any entity that's still open at the end
        if current_entity is not None:
            entities.append(current_entity)
        
        # Create the entity spans dict for the Example object
        spans = {"entities": [(ent["start"], ent["end"], ent["label"]) for ent in entities]}
        
        # Create and add the example
        examples.append(Example.from_dict(doc, spans))
    
    return examples


Dataset structure: DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})
Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [17]:

# Train a custom NER model
def train_spacy_model(train_examples, validation_examples=None):
    # Load a blank English model
    nlp = spacy.blank("en")
    
    # Create a new NER component and add it to the pipeline
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.get_pipe("ner")
    
    # Add entity labels from our dataset
    for label in set([label[2:] for label in label_list if label.startswith("B-")]):
        ner.add_label(label)
    
    # Configure training
    n_iter = 30
    
    # Initialize the optimizer
    optimizer = nlp.initialize(lambda: train_examples)
    
    # Training loop
    batch_sizes = compounding(4.0, 32.0, 1.001)  # Gradually increase batch size
    
    # Store metrics for each epoch
    training_losses = []
    validation_metrics = []
    
    print(f"Training with {len(train_examples)} examples")
    print(f"Validation with {len(validation_examples) if validation_examples else 0} examples")
    
    for i in range(n_iter):
        # Shuffle training data
        random.shuffle(train_examples)
        
        # Create batches and train
        losses = {}
        batches = minibatch(train_examples, size=batch_sizes)
        
        # Training step
        for batch in tqdm(list(batches), desc=f"Epoch {i+1}/{n_iter}"):
            nlp.update(batch, drop=0.2, losses=losses)
        
        # Track training loss
        epoch_loss = losses.get("ner", 0)
        training_losses.append(epoch_loss)
        print(f"Epoch {i+1}/{n_iter}, Loss: {epoch_loss:.4f}")
        
    return nlp, {"training_losses": training_losses, "validation_metrics": validation_metrics}


In [18]:
# Create a blank spaCy model
nlp = spacy.blank("en")

# Create examples from the dataset
print("Creating training examples...")
train_examples = create_spacy_examples(nlp, dataset["train"], max_samples=None)

print("Creating validation examples...")
validation_examples = create_spacy_examples(nlp, dataset["validation"], max_samples=500)

print("Creating test examples...")
test_examples = create_spacy_examples(nlp, dataset["test"], max_samples=500)

# Train the model
print("\nTraining the model...")
trained_model, metrics = train_spacy_model(train_examples, validation_examples)

# Save the model
if not os.path.exists("./models"):
    os.makedirs("./models")

trained_model.to_disk("./models/conll_ner_model")
print("Model saved to ./models/conll_ner_model")


Creating training examples...


Processing 14041 examples: 100%|██████████| 14041/14041 [00:04<00:00, 3195.46it/s]


Creating validation examples...


Processing 500 examples: 100%|██████████| 500/500 [00:00<00:00, 3028.69it/s]


Creating test examples...


Processing 500 examples: 100%|██████████| 500/500 [00:00<00:00, 3866.58it/s]



Training the model...
Training with 14041 examples
Validation with 500 examples


Epoch 1/30: 100%|██████████| 1548/1548 [01:39<00:00, 15.52it/s]


Epoch 1/30, Loss: 14734.8828


Epoch 2/30: 100%|██████████| 567/567 [01:01<00:00,  9.28it/s]


Epoch 2/30, Loss: 7467.6943


Epoch 3/30: 100%|██████████| 439/439 [00:45<00:00,  9.58it/s]


Epoch 3/30, Loss: 5149.2837


Epoch 4/30: 100%|██████████| 439/439 [00:45<00:00,  9.60it/s]


Epoch 4/30, Loss: 4257.0566


Epoch 5/30: 100%|██████████| 439/439 [00:45<00:00,  9.66it/s]


Epoch 5/30, Loss: 3407.7698


Epoch 6/30: 100%|██████████| 439/439 [00:45<00:00,  9.70it/s]


Epoch 6/30, Loss: 2972.8726


Epoch 7/30: 100%|██████████| 439/439 [00:46<00:00,  9.47it/s]


Epoch 7/30, Loss: 2608.0769


Epoch 8/30: 100%|██████████| 439/439 [00:45<00:00,  9.73it/s]


Epoch 8/30, Loss: 2452.7244


Epoch 9/30: 100%|██████████| 439/439 [00:49<00:00,  8.87it/s]


Epoch 9/30, Loss: 2352.5393


Epoch 10/30: 100%|██████████| 439/439 [01:13<00:00,  5.98it/s]


Epoch 10/30, Loss: 1928.2369


Epoch 11/30: 100%|██████████| 439/439 [01:21<00:00,  5.41it/s]


Epoch 11/30, Loss: 1905.8416


Epoch 12/30: 100%|██████████| 439/439 [01:31<00:00,  4.79it/s]


Epoch 12/30, Loss: 1701.5107


Epoch 13/30: 100%|██████████| 439/439 [00:49<00:00,  8.86it/s]


Epoch 13/30, Loss: 1561.4275


Epoch 14/30: 100%|██████████| 439/439 [00:45<00:00,  9.57it/s]


Epoch 14/30, Loss: 1575.1489


Epoch 15/30: 100%|██████████| 439/439 [01:16<00:00,  5.76it/s]


Epoch 15/30, Loss: 1426.1211


Epoch 16/30: 100%|██████████| 439/439 [01:11<00:00,  6.15it/s]


Epoch 16/30, Loss: 1352.1597


Epoch 17/30: 100%|██████████| 439/439 [01:12<00:00,  6.07it/s]


Epoch 17/30, Loss: 1376.2960


Epoch 18/30: 100%|██████████| 439/439 [01:00<00:00,  7.23it/s]


Epoch 18/30, Loss: 1154.5421


Epoch 19/30: 100%|██████████| 439/439 [00:57<00:00,  7.65it/s]


Epoch 19/30, Loss: 1186.5900


Epoch 20/30: 100%|██████████| 439/439 [01:14<00:00,  5.90it/s]


Epoch 20/30, Loss: 1254.2452


Epoch 21/30: 100%|██████████| 439/439 [01:18<00:00,  5.62it/s]


Epoch 21/30, Loss: 1067.8771


Epoch 22/30: 100%|██████████| 439/439 [01:14<00:00,  5.93it/s]


Epoch 22/30, Loss: 1019.4557


Epoch 23/30: 100%|██████████| 439/439 [01:14<00:00,  5.87it/s]


Epoch 23/30, Loss: 970.7191


Epoch 24/30: 100%|██████████| 439/439 [01:12<00:00,  6.05it/s]


Epoch 24/30, Loss: 944.7495


Epoch 25/30: 100%|██████████| 439/439 [01:14<00:00,  5.87it/s]


Epoch 25/30, Loss: 892.5607


Epoch 26/30: 100%|██████████| 439/439 [01:14<00:00,  5.92it/s]


Epoch 26/30, Loss: 978.9185


Epoch 27/30: 100%|██████████| 439/439 [01:13<00:00,  5.99it/s]


Epoch 27/30, Loss: 878.4814


Epoch 28/30: 100%|██████████| 439/439 [01:14<00:00,  5.90it/s]


Epoch 28/30, Loss: 809.5173


Epoch 29/30: 100%|██████████| 439/439 [01:13<00:00,  5.93it/s]


Epoch 29/30, Loss: 830.5732


Epoch 30/30: 100%|██████████| 439/439 [01:14<00:00,  5.88it/s]


Epoch 30/30, Loss: 750.6155
Model saved to ./models/conll_ner_model


In [19]:

# Evaluate model on examples
def evaluate_examples(nlp, examples):
    tp = fp = fn = 0
    
    # Entity-wise evaluation
    entity_scores = {}
    
    for ex in examples:
        # Check if 'entities' exists in spans, if not skip this example
        if "entities" not in ex.reference.spans:
            continue
            
        gold_entities = set([(e[0], e[1], e[2]) for e in ex.reference.spans["entities"]])
        
        # Get predictions
        pred_doc = nlp(ex.reference.text)
        pred_entities = set([(e.start_char, e.end_char, e.label_) for e in pred_doc.ents])
        
        # Update counts
        for entity in gold_entities:
            if entity in pred_entities:
                tp += 1
                
                # Entity-specific metrics
                if entity[2] not in entity_scores:
                    entity_scores[entity[2]] = {"tp": 0, "fp": 0, "fn": 0}
                entity_scores[entity[2]]["tp"] += 1
            else:
                fn += 1
                
                # Entity-specific metrics
                if entity[2] not in entity_scores:
                    entity_scores[entity[2]] = {"tp": 0, "fp": 0, "fn": 0}
                entity_scores[entity[2]]["fn"] += 1
        
        for entity in pred_entities:
            if entity not in gold_entities:
                fp += 1
                
                # Entity-specific metrics
                if entity[2] not in entity_scores:
                    entity_scores[entity[2]] = {"tp": 0, "fp": 0, "fn": 0}
                entity_scores[entity[2]]["fp"] += 1
    
    # Calculate overall metrics
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    # Calculate entity-specific metrics
    for entity_type, counts in entity_scores.items():
        entity_tp = counts["tp"]
        entity_fp = counts["fp"]
        entity_fn = counts["fn"]
        
        entity_precision = entity_tp / (entity_tp + entity_fp) if (entity_tp + entity_fp) > 0 else 0
        entity_recall = entity_tp / (entity_tp + entity_fn) if (entity_tp + entity_fn) > 0 else 0
        entity_f1 = 2 * entity_precision * entity_recall / (entity_precision + entity_recall) if (entity_precision + entity_recall) > 0 else 0
        
        entity_scores[entity_type]["precision"] = entity_precision
        entity_scores[entity_type]["recall"] = entity_recall
        entity_scores[entity_type]["f1"] = entity_f1
    
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "entity_scores": entity_scores
    }

# Display detailed evaluation results
def display_evaluation_results(scores):
    print("\n--- Evaluation Results ---")
    print(f"Overall Precision: {scores['precision']:.4f}")
    print(f"Overall Recall: {scores['recall']:.4f}")
    print(f"Overall F1 Score: {scores['f1']:.4f}")
    
    print("\nEntity-specific scores:")
    for entity_type, metrics in scores["entity_scores"].items():
        print(f"\n{entity_type}:")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall: {metrics['recall']:.4f}")
        print(f"  F1 Score: {metrics['f1']:.4f}")

# Function to display example predictions
def show_example_predictions(nlp, examples, num_examples=5):
    print("\n--- Example Predictions ---")
    
    for i, example in enumerate(examples[:num_examples]):
        text = example.reference.text
        # gold_entities = [(text[start:end], label) for start, end, label in example.reference.spans["entities"]]
        
        # Get predictions
        doc = nlp(text)
        pred_entities = [(ent.text, ent.label_) for ent in doc.ents]
        
        print(f"\nExample {i+1}:")
        print(f"Text: {text}")
        # print("Gold entities:", gold_entities)
        print("Predicted entities:", pred_entities)


In [20]:

# Create a blank spaCy model
nlp = spacy.blank("en")

# Create examples from the dataset
print("Creating training examples...")
train_examples = create_spacy_examples(nlp, dataset["train"], max_samples=2000)

print("Creating validation examples...")
validation_examples = create_spacy_examples(nlp, dataset["validation"], max_samples=500)

print("Creating test examples...")
test_examples = create_spacy_examples(nlp, dataset["test"], max_samples=500)

# # Train the model
# print("\nTraining the model...")
# trained_model, metrics = train_spacy_model(train_examples, validation_examples)

# # Save the model
# if not os.path.exists("./models"):
#     os.makedirs("./models")

# trained_model.to_disk("./models/conll_ner_model")
# print("Model saved to ./models/conll_ner_model")


Creating training examples...


Processing 2000 examples: 100%|██████████| 2000/2000 [00:00<00:00, 3568.72it/s]


Creating validation examples...


Processing 500 examples: 100%|██████████| 500/500 [00:00<00:00, 4648.49it/s]


Creating test examples...


Processing 500 examples: 100%|██████████| 500/500 [00:00<00:00, 5012.48it/s]


In [21]:
# load saved model
trained_model = spacy.load("./models/conll_ner_model")
# Evaluate on test data
print("\nEvaluating on test data...")
test_scores = evaluate_examples(trained_model, test_examples)
display_evaluation_results(test_scores)

# Show some example predictions
show_example_predictions(trained_model, test_examples)

# Function to test on custom text
def test_ner(text):
    doc = trained_model(text)
    print("\n--- Custom Text Prediction ---")
    print(f"Text: {text}")
    print("Predicted entities:")
    for ent in doc.ents:
        print(f"  {ent.text} ({ent.label_})")

# Test on a custom text
test_ner("John Smith from Google visited New York last week for a conference about AI technology.")



Evaluating on test data...

--- Evaluation Results ---
Overall Precision: 0.0000
Overall Recall: 0.0000
Overall F1 Score: 0.0000

Entity-specific scores:

--- Example Predictions ---

Example 1:
Text: SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .
Predicted entities: [('JAPAN', 'PER'), ('LUCKY', 'PER'), ('CHINA', 'ORG')]

Example 2:
Text: Nadim Ladki
Predicted entities: [('Nadim Ladki', 'PER')]

Example 3:
Text: AL-AIN , United Arab Emirates 1996-12-06
Predicted entities: [('AL', 'LOC'), ('AIN', 'ORG'), ('United Arab Emirates', 'PER')]

Example 4:
Text: Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday .
Predicted entities: [('Japan', 'LOC'), ('Asian Cup', 'MISC'), ('Syria', 'LOC')]

Example 5:
Text: But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan .
Predicted entities: [('China', 'LOC'), ('Uzbekistan', 'LOC')]

--- Cus

In [25]:
text = "Hi, Apple Inc. is a company in California."
text = "Apple Inc. announced the launch of its latest iPhone 15 in California on September 12, 2023. CEO Tim Cook highlighted new features, including an advanced A17 Bionic chip and improved camera technology. Analysts predict that the device will boost Apple's market share, especially in the United States and Europe."
doc = trained_model(text)
print("\n--- Custom Text Prediction ---")
print(f"Text: {text}")
print("Predicted entities:")
for ent in doc.ents:
    print(f"  {ent.text} ({ent.label_})")




--- Custom Text Prediction ---
Text: Apple Inc. announced the launch of its latest iPhone 15 in California on September 12, 2023. CEO Tim Cook highlighted new features, including an advanced A17 Bionic chip and improved camera technology. Analysts predict that the device will boost Apple's market share, especially in the United States and Europe.
Predicted entities:
  Apple Inc. (ORG)
  California (LOC)
  Tim Cook (PER)
  A17 Bionic (PER)
  Apple (ORG)
  United States (LOC)
  Europe (LOC)
