In [1]:
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
import json
import random
import os

print("--- Starting NER Model Training ---")

# --- 1. Define Paths ---
RAW_DATA_PATH = "../data/raw/Entity Recognition in Resumes.json"
MODEL_OUTPUT_PATH = "../app/models/ner_model/" # Path to save the trained model

# --- 2. Load and Convert the Data ---
print(f"Loading raw data from: {RAW_DATA_PATH}")

TRAIN_DATA = []
with open(RAW_DATA_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        text = data.get('content')
        annotations = data.get('annotation')

        if not text or not annotations:
            continue

        entities = []
        for ann in annotations:
            points = ann.get('points')
            label = ann.get('label')
            if not points or not label:
                continue

            # Convert Dataturks format to spaCy format
            start = points[0]['start']
            end = points[0]['end']

            # Ensure no overlapping entities, simple version
            entities.append((start, end + 1, label[0])) # +1 as spaCy is exclusive

        TRAIN_DATA.append((text, {"entities": entities}))

print(f"Loaded and converted {len(TRAIN_DATA)} resume annotations.")

# --- 3. Prepare Data for spaCy ---

# We are creating a new, blank English model
nlp = spacy.blank("en")
print("Created blank 'en' model")

# Create a DocBin to store the training data
db = DocBin()

for text, annotations in TRAIN_DATA:
    try:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)

        # Filter out any entities that spaCy finds invalid
        valid_ents = []
        for ent in example.reference.ents:
            if ent.start_char < ent.end_char:
                span = doc.char_span(ent.start_char, ent.end_char, label=ent.label_)
                if span is not None:
                    valid_ents.append(span)

        doc.ents = valid_ents
        db.add(doc)
    except Exception as e:
        print(f"Error processing doc: {e}")

# Save the DocBin to disk
db.to_disk("./train.spacy")
print("Saved training data to train.spacy")

# --- 4. Run the Training ---
print("\n--- ðŸš€ Starting Model Training ---")

# We will use the spaCy CLI to train.
# First, create a config file.
# Run this in your terminal:
# python -m spacy init config --lang en --pipeline ner --optimize efficiency config.cfg
#
# This creates a base_config.cfg. We need to edit it to point to our data.
config_text = """
[paths]
train = "./train.spacy"
dev = "./train.spacy"

[system]
gpu_allocator = "pytorch"

[nlp]
lang = "en"
pipeline = ["tok2vec","ner"]
batch_size = 1000

[components]

[components.tok2vec]
factory = "tok2vec"

[components.ner]
factory = "ner"

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"

[training.optimizer]
@optimizers = "Adam.v1"

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 1000

[initialize]
vectors = "en_core_web_sm"
"""

# Save the config file
with open("config.cfg", "w", encoding="utf-8") as f:
    f.write(config_text)

print("Saved config.cfg. Now, run the training command in your terminal:")
print("\n" + "="*30)
print(f"python -m spacy train config.cfg --output {MODEL_OUTPUT_PATH} --n-epoch 20")
print("="*30 + "\n")
print("After training, the best model will be saved in 'app/models/ner_model/model-best'")

--- Starting NER Model Training ---
Loading raw data from: ../data/raw/Entity Recognition in Resumes.json
Loaded and converted 220 resume annotations.


  from .autonotebook import tqdm as notebook_tqdm
Application Development Associate - A..." with entities "[(1295, 1622, 'Skills'), (993, 1154, 'Skills'), (9...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Active member of IIIT Committee in ..." with entities "[(1155, 1199, 'Email Address'), (743, 1141, 'Skill...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Hyderabad, Telangana - Email..." with entities "[(3749, 3757, 'Skills'), (3709, 3718, 'Skills'), (...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Operational Analyst (SQL DBA) Enginee..." with entities "[(8098, 8384, 'Skills'), (8008, 8050, 'Skills'), (...". Use `spacy.training.offsets_to_biluo_tags

Created blank 'en' model
Error processing doc: [E103] Trying to set conflicting doc.ents: '(38, 58, 'Companies worked at')' and '(38, 44, 'Companies worked at')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.
Error processing doc: [E103] Trying to set conflicting doc.ents: '(1803, 1821, 'Skills')' and '(1803, 1809, 'Companies worked at')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.
Error processing doc: [E103] Trying to set conflicting doc.ents: '(707, 712, 'Location')' and '(677, 719, 'College Name')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.
Error processing doc: [E103] Trying to set conflicting doc.ents: '(1417, 1423, 'Compa

Test Engineer

Mangalore, Karnataka - E..." with entities "[(2110, 2404, 'Skills'), (2055, 2064, 'Location'),...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Ulhasnagar, Maharashtra - Em..." with entities "[(605, 753, 'Skills'), (403, 407, 'Graduation Year...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Test Engineer - Infosys Limited

- Em..." with entities "[(3132, 3611, 'Skills'), (3005, 3083, 'Skills'), (...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Product development engineer ..." with entities "[(3036, 3078, 'Skills'), (2922, 3018, 'Skills'), (...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment

Error processing doc: [E103] Trying to set conflicting doc.ents: '(819, 835, 'Designation')' and '(812, 835, 'Designation')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.
Error processing doc: [E103] Trying to set conflicting doc.ents: '(13883, 13887, 'Skills')' and '(13883, 13886, 'Skills')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.
Error processing doc: [E103] Trying to set conflicting doc.ents: '(3385, 3391, 'Companies worked at')' and '(3345, 3896, 'Skills')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.
Error processing doc: [E103] Trying to set conflicting doc.ents: '(2886, 2889, 'Companies worked at')' and '(2861, 3074, '

Principal Engineer Technical Staff ..." with entities "[(2656, 2689, 'Skills'), (2586, 2626, 'College Nam...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Bengaluru, Karnataka - Email me on ..." with entities "[(3517, 3878, 'Skills'), (3387, 3481, 'Skills'), (...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
hyderbad, Telangana - Email me on In..." with entities "[(271, 290, 'Skills'), (231, 261, 'College Name'),...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Quantitative Analyst

- Email me o..." with entities "[(773, 848, 'Skills'), (735, 740, 'Graduation Year...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the al

Error processing doc: [E103] Trying to set conflicting doc.ents: '(4774, 4778, 'Location')' and '(4744, 4789, 'College Name')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.
Error processing doc: [E103] Trying to set conflicting doc.ents: '(3461, 3465, 'Graduation Year')' and '(1416, 3890, 'Skills')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.
Error processing doc: [E103] Trying to set conflicting doc.ents: '(15, 50, 'Designation')' and '(15, 49, 'Designation')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.
Error processing doc: [E103] Trying to set conflicting doc.ents: '(7777, 7784, 'Years of Experience')' and '(7745, 7837, 'Skil

Cluster HR Manager - Velammal New

Chennai,..." with entities "[(3758, 4638, 'Skills'), (3725, 3742, 'College Nam...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Jr. ASP.NET Developer in True Vision..." with entities "[(1085, 1500, 'Skills'), (848, 961, 'Skills'), (83...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
7 years of experience in IT Netw..." with entities "[(5471, 5838, 'Skills'), (5457, 5461, 'Graduation ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Pune, Maharashtra - Email me on Inde..." with entities "[(1901, 1906, 'Skills'), (1812, 1816, 'Graduation ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check

Error processing doc: [E103] Trying to set conflicting doc.ents: '(2907, 2938, 'Designation')' and '(2907, 2937, 'Designation')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.
Error processing doc: [E103] Trying to set conflicting doc.ents: '(1105, 1113, 'Companies worked at')' and '(1080, 1121, 'Designation')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.
Error processing doc: [E103] Trying to set conflicting doc.ents: '(3981, 3990, 'Location')' and '(3973, 3990, 'Companies worked at')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.
Error processing doc: [E103] Trying to set conflicting doc.ents: '(1258, 1273, 'Companies worked at')'

specialist - Technology Process

IN..." with entities "[(7858, 7886, 'College Name'), (7853, 7856, 'Degre...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
MICROSOFT - Backup Administrator..." with entities "[(3917, 3975, 'Skills'), (3790, 3835, 'College Nam...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Microsoft SQL-SERVER

Dhule, Mahara..." with entities "[(1482, 1487, 'Location'), (1369, 1413, 'Email Add...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
New Delhi, Delhi - Email me on Indee..." with entities "[(937, 980, 'Email Address'), (580, 923, 'Skills')...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alig