In [1]:
import spacy
from spacy.tokens import Doc
from spacy.training import Example
from datasets import load_dataset
from tqdm import tqdm
import random
import os
import numpy as np
from spacy.util import minibatch

dataset = load_dataset("conll2003")
print("Dataset structure:", dataset)

label_list = dataset["train"].features["ner_tags"].feature.names
print("Labels:", label_list)

epochs = 35
max_samples = None
batch_size = 32


# Create a spaCy Doc with the tokens
def create_training_examples(nlp, dataset_split, max_samples=None):
    examples = []
    
    sample_count = len(dataset_split) if max_samples is None else min(max_samples, len(dataset_split))
    
    for i in tqdm(range(sample_count), desc=f"Processing {sample_count} examples"):
        item = dataset_split[i]
        tokens = item["tokens"]
        ner_tags = item["ner_tags"]
        
        spaces = [True] * len(tokens)
        if spaces:  
            spaces[-1] = False
        
        doc = Doc(nlp.vocab, words=tokens, spaces=spaces)
        entities = []
        current_entity = None
        
        for token_idx, (token, tag_id) in enumerate(zip(doc, ner_tags)):
            tag = label_list[tag_id]
            
            if tag.startswith("B-") or tag.startswith("I-") or tag == "O":
                if current_entity is not None:
                    # Add the previous entity
                    entities.append(current_entity)
                    current_entity = None
                # Start a new entity
                current_entity = {
                    "start": token.idx,
                    "end": token.idx + len(token.text),
                    "label": tag 
                }
        
        if current_entity is not None:
            entities.append(current_entity)
        
        reference = doc.copy()
        for ent in entities:
            span = reference.char_span(ent["start"], ent["end"], label=ent["label"])
            if span is not None:
                reference.ents = list(reference.ents) + [span]
        
        # Create example
        example = Example(doc, reference)
        examples.append(example)
    return examples


  from .autonotebook import tqdm as notebook_tqdm


Dataset structure: DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})
Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [2]:

def train_spacy_model(train_examples):
    nlp = spacy.blank("en")
    
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.get_pipe("ner")
    
    unique_labels = set()
    for label in label_list:
        unique_labels.add(label)
    
    for label in unique_labels:
        print(f"Adding label: {label}")
        ner.add_label(label)
    
    n_iter = epochs
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    
    training_losses = []
    
    print(f"Training with {len(train_examples)} examples")
    
    # Train the model
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        
        for i in range(n_iter):
            random.shuffle(train_examples)
            
            losses = {}
            batches = minibatch(train_examples, size=batch_size)
            
            for batch in tqdm(list(batches), desc=f"Epoch {i+1}/{n_iter}"):
                nlp.update(batch, drop=0.2, losses=losses)
            
            epoch_loss = losses.get("ner", 0)
            training_losses.append(epoch_loss)
            print(f"Epoch {i+1}/{n_iter}, Loss: {epoch_loss:.4f}")
        
    
    return nlp, {"training_losses": training_losses}


In [3]:

nlp = spacy.blank("en")

# Create training and validation data
print("Preparing training data...")
train_examples = create_training_examples(nlp, dataset["train"], max_samples=max_samples) 


Preparing training data...


Processing 14041 examples: 100%|██████████| 14041/14041 [00:05<00:00, 2763.68it/s]


In [4]:

# Train model
print("Training model...")
trained_model, metrics = train_spacy_model(train_examples)



Training model...
Adding label: B-ORG
Adding label: I-ORG
Adding label: B-MISC
Adding label: O
Adding label: B-LOC
Adding label: I-PER
Adding label: I-MISC
Adding label: I-LOC
Adding label: B-PER
Training with 14041 examples


Epoch 1/35: 100%|██████████| 439/439 [00:41<00:00, 10.58it/s]


Epoch 1/35, Loss: 24167.9258


Epoch 2/35: 100%|██████████| 439/439 [00:42<00:00, 10.41it/s]


Epoch 2/35, Loss: 9244.8447


Epoch 3/35: 100%|██████████| 439/439 [00:44<00:00,  9.78it/s]


Epoch 3/35, Loss: 6270.2588


Epoch 4/35: 100%|██████████| 439/439 [00:45<00:00,  9.57it/s]


Epoch 4/35, Loss: 4702.3926


Epoch 5/35: 100%|██████████| 439/439 [00:43<00:00, 10.12it/s]


Epoch 5/35, Loss: 3676.5750


Epoch 6/35: 100%|██████████| 439/439 [00:44<00:00,  9.93it/s]


Epoch 6/35, Loss: 3160.5959


Epoch 7/35: 100%|██████████| 439/439 [00:48<00:00,  8.98it/s]


Epoch 7/35, Loss: 2689.3250


Epoch 8/35: 100%|██████████| 439/439 [00:46<00:00,  9.39it/s]


Epoch 8/35, Loss: 2391.2646


Epoch 9/35: 100%|██████████| 439/439 [00:48<00:00,  9.10it/s]


Epoch 9/35, Loss: 2204.1631


Epoch 10/35: 100%|██████████| 439/439 [00:46<00:00,  9.46it/s]


Epoch 10/35, Loss: 1926.3594


Epoch 11/35: 100%|██████████| 439/439 [00:44<00:00,  9.78it/s]


Epoch 11/35, Loss: 1740.3558


Epoch 12/35: 100%|██████████| 439/439 [00:43<00:00, 10.16it/s]


Epoch 12/35, Loss: 1672.2102


Epoch 13/35: 100%|██████████| 439/439 [00:46<00:00,  9.45it/s]


Epoch 13/35, Loss: 1452.6094


Epoch 14/35: 100%|██████████| 439/439 [00:44<00:00,  9.80it/s]


Epoch 14/35, Loss: 1459.4565


Epoch 15/35: 100%|██████████| 439/439 [00:44<00:00,  9.93it/s]


Epoch 15/35, Loss: 1337.9694


Epoch 16/35: 100%|██████████| 439/439 [00:42<00:00, 10.21it/s]


Epoch 16/35, Loss: 1266.2065


Epoch 17/35: 100%|██████████| 439/439 [00:42<00:00, 10.42it/s]


Epoch 17/35, Loss: 1185.5085


Epoch 18/35: 100%|██████████| 439/439 [00:41<00:00, 10.62it/s]


Epoch 18/35, Loss: 1103.0460


Epoch 19/35: 100%|██████████| 439/439 [00:43<00:00, 10.11it/s]


Epoch 19/35, Loss: 1201.1912


Epoch 20/35: 100%|██████████| 439/439 [00:42<00:00, 10.32it/s]


Epoch 20/35, Loss: 1037.9600


Epoch 21/35: 100%|██████████| 439/439 [00:40<00:00, 10.74it/s]


Epoch 21/35, Loss: 982.6530


Epoch 22/35: 100%|██████████| 439/439 [00:41<00:00, 10.47it/s]


Epoch 22/35, Loss: 913.4857


Epoch 23/35: 100%|██████████| 439/439 [00:41<00:00, 10.52it/s]


Epoch 23/35, Loss: 960.3520


Epoch 24/35: 100%|██████████| 439/439 [00:41<00:00, 10.60it/s]


Epoch 24/35, Loss: 952.8929


Epoch 25/35: 100%|██████████| 439/439 [00:42<00:00, 10.40it/s]


Epoch 25/35, Loss: 906.9959


Epoch 26/35: 100%|██████████| 439/439 [00:43<00:00,  9.99it/s]


Epoch 26/35, Loss: 879.9451


Epoch 27/35: 100%|██████████| 439/439 [00:43<00:00, 10.14it/s]


Epoch 27/35, Loss: 845.0806


Epoch 28/35: 100%|██████████| 439/439 [00:43<00:00, 10.10it/s]


Epoch 28/35, Loss: 854.9001


Epoch 29/35: 100%|██████████| 439/439 [00:41<00:00, 10.66it/s]


Epoch 29/35, Loss: 816.9431


Epoch 30/35: 100%|██████████| 439/439 [00:41<00:00, 10.65it/s]


Epoch 30/35, Loss: 729.6969


Epoch 31/35: 100%|██████████| 439/439 [00:40<00:00, 10.82it/s]


Epoch 31/35, Loss: 692.9923


Epoch 32/35: 100%|██████████| 439/439 [00:41<00:00, 10.65it/s]


Epoch 32/35, Loss: 738.2781


Epoch 33/35: 100%|██████████| 439/439 [00:40<00:00, 10.77it/s]


Epoch 33/35, Loss: 727.5371


Epoch 34/35: 100%|██████████| 439/439 [00:40<00:00, 10.81it/s]


Epoch 34/35, Loss: 664.5427


Epoch 35/35: 100%|██████████| 439/439 [00:41<00:00, 10.62it/s]

Epoch 35/35, Loss: 708.0412





In [5]:
# Save model to pickle
import pickle
with open("spacy_ner_model.pkl", "wb") as f:
    pickle.dump(trained_model, f)

In [15]:
def predict(model, text):
    doc = model(text)
    token_entities = []
    token = []
    for ent in doc.ents:
        token.append(ent.text)
        token_entities.append(ent.label_)
    return token,token_entities

text = "Apple Inc is an American multinational technology company headquartered in Cupertino, California."
tokens,entities = predict(trained_model, text)
print(tokens)
print(entities)

dataset = load_dataset("conll2003")
test_examples = dataset["test"][:500]



['Apple', 'Inc', 'is', 'an', 'American', 'multinational', 'technology', 'company', 'headquartered', 'in', 'Cupertino', ',', 'California', '.']
['B-ORG', 'I-ORG', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'O']


In [14]:
from collections import Counter

def calculate_ner_metrics(true_entities, pred_entities, labels):
    assert len(true_entities) == len(pred_entities), "Mismatch in number of sentences"

    # Flatten lists
    y_true = [label for seq in true_entities for label in seq]
    y_pred = [label for seq in pred_entities for label in seq]

    # Calculate TP, FP, FN for each label
    label_counts = {label: Counter() for label in labels}

    for true, pred in zip(y_true, y_pred):
        if true == pred:
            label_counts[true]['TP'] += 1
        else:
            label_counts[true]['FN'] += 1
            label_counts[pred]['FP'] += 1

    # Compute metrics for each label
    metrics = {}
    total_tp = total_fp = total_fn = 0

    for label, counts in label_counts.items():
        tp, fp, fn = counts['TP'], counts['FP'], counts['FN']
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        total_tp += tp
        total_fp += fp
        total_fn += fn

        metrics[label] = {"Precision": precision, "Recall": recall, "F1-score": f1}

    # Micro-averaged scores
    overall_precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
    overall_recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
    overall_f1 = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0

    # Accuracy
    accuracy = sum(1 for true, pred in zip(y_true, y_pred) if true == pred) / len(y_true)

    metrics["Overall"] = {
        "Precision": overall_precision,
        "Recall": overall_recall,
        "F1-score": overall_f1,
        "Accuracy": accuracy
    }

    return metrics


def evaluate(model, test_dataset, label_list):

    
    true_entities = []
    pred_entities = []
    tokens = test_dataset["tokens"]
    ner_tags = test_dataset["ner_tags"]
    x = 5
    for i in range(len(tokens)):
        token = tokens[i]
        tag = ner_tags[i]
        tag_with_label = [label_list[tag_id] for tag_id in tag]
        text = " ".join(token)
        _, predicted_entities = predict(model, text)
        true_entities.append(tag_with_label)
        pred_entities.append(predicted_entities)
    
    metrics = calculate_ner_metrics(true_entities, pred_entities, label_list)
    print("NER Metrics:")
    for label, scores in metrics.items():
        print(f"Label: {label}")
        for metric, score in scores.items():
            print(f"{metric}: {score:.4f}")
        print()
    print(len(metrics))
    return metrics


import pickle
with open("spacy_ner_model.pkl", "rb") as f:
    model = pickle.load(f)

test_dataset = test_examples
label_list = dataset["train"].features["ner_tags"].feature.names
print("Labels:", label_list)
metric = evaluate(model, test_dataset,label_list=label_list)   
    

Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
NER Metrics:
Label: O
Precision: 0.8614
Recall: 0.8657
F1-score: 0.8635

Label: B-PER
Precision: 0.1667
Recall: 0.1667
F1-score: 0.1667

Label: I-PER
Precision: 0.1667
Recall: 0.1667
F1-score: 0.1667

Label: B-ORG
Precision: 0.1250
Recall: 0.1429
F1-score: 0.1333

Label: I-ORG
Precision: 0.0000
Recall: 0.0000
F1-score: 0.0000

Label: B-LOC
Precision: 0.0000
Recall: 0.0000
F1-score: 0.0000

Label: I-LOC
Precision: 0.0000
Recall: 0.0000
F1-score: 0.0000

Label: B-MISC
Precision: 0.3333
Recall: 0.2857
F1-score: 0.3077

Label: I-MISC
Precision: 0.0000
Recall: 0.0000
F1-score: 0.0000

Label: Overall
Precision: 0.7553
Recall: 0.7553
F1-score: 0.7553
Accuracy: 0.7553

10
