# Named Entity Recognition (NER)

# 1. Create Dataset

In [5]:
# # 1. Setup
# !pip install -U spacy
# !python -m spacy download en_core_web_sm

In [6]:
import pandas as pd
import spacy
import json

In [7]:
# Step 1: Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Step 2: Load your CSV file
df = pd.read_csv("whisper_with_groundtruth_100.csv")

# Step 3: Auto-label with spaCy NER
def extract_entities(text):
    doc = nlp(str(text))
    return [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

df["entities"] = df["whisper_transcript"].astype(str).apply(extract_entities)

# Step 4: Convert to training format for inspection
train_data = []
for _, row in df.iterrows():
    text = row["whisper_transcript"]
    entities = row["entities"]
    if entities:
        train_data.append({"text": text, "annotations": {"entities": entities}})

# Step 5: Save to CSV
ner_df = pd.DataFrame(train_data)
ner_df.to_csv("ner_train_data.csv", index=False)

# 2. Explore Data

In [5]:
import ast

In [6]:
# Step 1: Load the CSV file
df = pd.read_csv("ner_train_data.csv")

In [11]:
# Step 1: Load the CSV file
df = pd.read_csv("ner_train_data.csv")

# Step 2: Parse stringified lists into Python objects
df["annotations"] = df["annotations"].apply(ast.literal_eval)

# Step 3: Print entity spans and labels for the first 10 rows
for i in range(min(3, len(df))):
    row = df.iloc[i]
    text = row["text"]
    entities = row["annotations"]["entities"]

    print(f"\n--- Row {i+1} ---")
    print("Original Text:")
    print(text)
    print("Entities:")
    for start, end, label in entities:
        print(f"{label}: '{text[start:end]}'")


--- Row 1 ---
Original Text:
vivid light of a judgment day. The girl, moreover, was not prone to take for granted that she herself lived in the mind of others.
Entities:
DATE: 'a judgment day'

--- Row 2 ---
Original Text:
asked Isabella Breply, why, as a kind of compliment, a compliment on what? On your so beautifully existing. He liked me too much, she presently declared. That's a way we all have.
Entities:
PERSON: 'Isabella Breply'

--- Row 3 ---
Original Text:
Henrietta doesn't. Oh, hang Henrietta, said Ralph Corsley. If you ask me, I'm delighted at it. Is that why your father did it for your amusement? I differ with Miss Stackpole.
Entities:
ORG: 'Henrietta'
ORG: 'Henrietta'
PERSON: 'Ralph Corsley'
PERSON: 'Stackpole'


# Train

In [12]:
import pandas as pd
import ast
import random
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
from tqdm import tqdm

# Step 1: Load CSV and parse annotations
df = pd.read_csv("ner_train_data.csv")
df["annotations"] = df["annotations"].apply(ast.literal_eval)

# Step 2: Format into spaCy training format
TRAIN_DATA = []
for i, row in df.iterrows():
    TRAIN_DATA.append((row["text"], row["annotations"]))

# Step 3: Create blank English NLP pipeline and add NER
nlp = spacy.blank("en")

if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Step 4: Add entity labels
for _, annotations in TRAIN_DATA:
    for start, end, label in annotations["entities"]:
        ner.add_label(label)

# Step 5: Train the model
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for i in range(10):  # number of epochs
        random.shuffle(TRAIN_DATA)
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.5))
        for batch in tqdm(batches, desc=f"Epoch {i+1}"):
            for text, annotations in batch:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], drop=0.3, losses=losses)
        print(f"Losses at epoch {i+1}:", losses)

# Step 6: Save the trained model
output_dir = "ner_custom_model"
nlp.to_disk(output_dir)
print(f"Model saved to: {output_dir}")


Epoch 1: 583it [11:12,  1.15s/it]


Losses at epoch 1: {'ner': np.float32(38593.55)}


Epoch 2: 583it [11:10,  1.15s/it]


Losses at epoch 2: {'ner': np.float32(30071.406)}


Epoch 3: 583it [11:17,  1.16s/it]


Losses at epoch 3: {'ner': np.float32(26914.508)}


Epoch 4: 583it [10:46,  1.11s/it]


Losses at epoch 4: {'ner': np.float32(25240.418)}


Epoch 7: 583it [09:57,  1.02s/it]


Losses at epoch 7: {'ner': np.float32(24523.87)}


Epoch 8: 583it [10:07,  1.04s/it]


Losses at epoch 8: {'ner': np.float32(24345.117)}


Epoch 9: 583it [10:07,  1.04s/it]


Losses at epoch 9: {'ner': np.float32(24128.111)}


Epoch 10: 583it [10:01,  1.03s/it]


Losses at epoch 10: {'ner': np.float32(23705.365)}
Model saved to: ner_custom_model


# Try

In [1]:
import spacy

# Load your trained model
nlp2 = spacy.load("ner_custom_model")

# Run it on new text
text = "Grace is a student registed in Northwestern University on June 2024."
doc = nlp2(text)

# Print recognized entities
print("Entities in Text:")
for ent in doc.ents:
    print(ent.text, ent.label_)


Entities in Text:
Grace PERSON
Northwestern University FAC
June 2024 DATE
