In [18]:
import random
import spacy
from spacy.training import Example
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv("../data/sample_resumes/Resume/resumeDataSet2_transformed_filtered.csv")
df = df.reindex(np.random.permutation(df.index))
data = df.copy().iloc[0:200]

In [19]:
# Load the model
nlp = spacy.load("en_core_web_sm")

# Clear existing entity ruler if any
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("entity_ruler")

# Add entity ruler
ruler = nlp.add_pipe("entity_ruler", before='ner')

# Define skill categories
SweSkills = ["Python", "Java", "JavaScript", "C++", "React", "Angular", "Node.js", "Git", "CI/CD"]
DsSkills = ["Python", "R", "SQL", "Machine Learning", "Data Analysis", "Statistics", "TensorFlow", "PyTorch"]
DoSkills = ["Linux", "Docker", "Kubernetes", "AWS", "Azure", "CI/CD", "Jenkins", "Terraform", "Ansible"]

# Create patterns
patterns = []
for skill in SweSkills:
    patterns.append({"label": "SWE", "pattern": skill})
for skill in DsSkills:
    patterns.append({"label": "DATASCIENCE", "pattern": skill})
for skill in DoSkills:
    patterns.append({"label": "DEVOPS", "pattern": skill})

# Add patterns to ruler
ruler.add_patterns(patterns)

In [20]:
# Create training examples
examples = []
for text in data['Resume_str']:
    # Create docs
    doc = nlp.make_doc(text)
    processed_doc = nlp(text)
    
    # Get entities
    entities = []
    for ent in processed_doc.ents:
        entities.append((ent.start_char, ent.end_char, ent.label_))
    
    # Create reference doc
    ref_doc = nlp.make_doc(text)
    spans = []
    for start, end, label in entities:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            spans.append(span)
    ref_doc.ents = spans
    
    # Create example
    example = Example(doc, ref_doc)
    examples.append(example)

In [21]:
# Define get_examples function
def get_examples():
    return examples

# Initialize the model
optimizer = nlp.initialize(get_examples=get_examples)

# Training loop
for i in range(10):
    random.shuffle(examples)
    losses = {}
    for example in examples:
        nlp.update([example], sgd=optimizer, losses=losses)
    print(f"Iteration {i}, Losses: {losses}")

Iteration 0, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 15493.518325675966}
Iteration 1, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 4866.753208094015}
Iteration 2, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 2628.4833126113463}
Iteration 3, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 1766.408451013135}
Iteration 4, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 1167.9157375047607}
Iteration 5, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 912.3970398268485}
Iteration 6, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 788.7775304283356}
Iteration 7, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 683.5223332765775}
Iteration 8, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 621.0065984775883}
Iteration 9, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 706.1355272609607}


In [22]:
# Test on a few examples
print("\nTesting the model on sample text:")
for i, text in enumerate(data['Resume_str'][:2]):
    doc = nlp(text)
    print(f"\nExample {i+1}:")
    print("Text preview:", text[:200], "...\n")
    
    for ent in doc.ents:
        print(f"  - '{ent.text}' ({ent.label_}) [{ent.start_char}:{ent.end_char}]")


Testing the model on sample text:

Example 1:
Text preview: Skills Strong CS fundamentals and problem solving Ethereum, Smart Contracts, Solidity skills Golang, Node, Angular, React Culturally fit for startup environment MongoDB, PostGresql, MySql Enthusiastic ...

  - 'Skills' (PERSON) [0:6]
  - 'Ethereum' (ORG) [50:58]
  - 'Golang' (GPE) [93:99]
  - 'Node' (PERSON) [101:105]
  - 'Angular' (SWE) [107:114]
  - 'React' (SWE) [116:121]
  - 'PostGresql' (ORG) [170:180]
  - 'MySql' (PERSON) [182:187]
  - 'AWS' (DEVOPS) [227:230]
  - 'Docker' (DEVOPS) [232:238]
  - 'Microservices' (ORDINAL) [240:253]
  - 'January' (SWE) [305:312]
  - 'Engineering' (ORDINAL) [334:345]
  - 'Blockchain' (ORDINAL) [524:534]
  - 'Skill' (ORDINAL) [566:571]
  - '16' (SWE) [603:605]
  - 'CONTRACTS-' (ORG) [614:624]
  - 'Exprience' (GPE) [655:664]
  - '9' (SWE) [667:668]
  - 'Exprience' (GPE) [721:730]
  - 'Xinfin' (GPE) [803:809]
  - 'Hybrid' (ORDINAL) [834:840]
  - 'Tradefinex' (PERSON) [964:974]
  - 'Land' (TIM

  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches = self.matcher(doc, allow_missing=True, as_spans=False)


In [23]:
# Save the trained model
output_dir = "../models/spacy_skill_ner"
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")

Model saved to ../models/spacy_skill_ner


In [24]:
# Example of how to load and use the saved model
def extract_skills(resume_text):
    # Load the saved model
    loaded_nlp = spacy.load("../models/spacy_skill_ner")
    
    doc = loaded_nlp(resume_text)
    skills = {
        "SWE": [],
        "DATASCIENCE": [],
        "DEVOPS": []
    }
    
    for ent in doc.ents:
        if ent.label_ in skills:
            if ent.text not in skills[ent.label_]:
                skills[ent.label_].append(ent.text)
    
    return skills

# Test the inference
sample_text = """
TECHNICAL SKILLS
Programming Languages: Python, Java, JavaScript
Data Science: SQL, Machine Learning, TensorFlow
DevOps: Docker, AWS, Kubernetes
"""

extracted_skills = extract_skills(sample_text)
print("\nExtracted Skills:")
for category, skills in extracted_skills.items():
    print(f"{category}: {', '.join(skills)}")


Extracted Skills:
SWE: Python, Java, JavaScript
DATASCIENCE: SQL
DEVOPS: Docker, AWS


  matches = self.matcher(doc, allow_missing=True, as_spans=False)
