In [1]:
import random
import spacy
from spacy.training import Example
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv("../data/sample_resumes/Resume/resumeDataSet2_transformed_filtered.csv")
df = df.reindex(np.random.permutation(df.index))
data = df.copy().iloc[0:200]

In [2]:
# Load the model
nlp = spacy.load("en_core_web_sm")

# Clear existing entity ruler if any
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("entity_ruler")

# Add entity ruler
ruler = nlp.add_pipe("entity_ruler", before='ner')

# Define skill categories
SweSkills = ["Python", "Java", "JavaScript", "C++", "React", "Angular", "Node.js", "Git", "CI/CD"]
DsSkills = ["Python", "R", "SQL", "Machine Learning", "Data Analysis", "Statistics", "TensorFlow", "PyTorch"]
DoSkills = ["Linux", "Docker", "Kubernetes", "AWS", "Azure", "CI/CD", "Jenkins", "Terraform", "Ansible"]

# Create patterns
patterns = []
for skill in SweSkills:
    patterns.append({"label": "SWE", "pattern": skill})
for skill in DsSkills:
    patterns.append({"label": "DATASCIENCE", "pattern": skill})
for skill in DoSkills:
    patterns.append({"label": "DEVOPS", "pattern": skill})

# Add patterns to ruler
ruler.add_patterns(patterns)

In [3]:
# Create training examples
examples = []
for text in data['Resume_str']:
    # Create docs
    doc = nlp.make_doc(text)
    processed_doc = nlp(text)
    
    # Get entities
    entities = []
    for ent in processed_doc.ents:
        entities.append((ent.start_char, ent.end_char, ent.label_))
    
    # Create reference doc
    ref_doc = nlp.make_doc(text)
    spans = []
    for start, end, label in entities:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            spans.append(span)
    ref_doc.ents = spans
    
    # Create example
    example = Example(doc, ref_doc)
    examples.append(example)

In [4]:
# Define get_examples function
def get_examples():
    return examples

# Initialize the model
optimizer = nlp.initialize(get_examples=get_examples)

# Training loop
for i in range(10):
    random.shuffle(examples)
    losses = {}
    for example in examples:
        nlp.update([example], sgd=optimizer, losses=losses)
    print(f"Iteration {i}, Losses: {losses}")

Iteration 0, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 17338.905635804145}
Iteration 1, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 5512.826236850817}
Iteration 2, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 2961.821882210253}
Iteration 3, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 2251.9016194236706}
Iteration 4, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 1598.2339108574188}
Iteration 5, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 1401.3457851694914}
Iteration 6, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 1226.3585886710127}
Iteration 7, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 953.3223121622555}
Iteration 8, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 892.1314600995261}
Iteration 9, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 754.1162122070535}


In [5]:
# Test on a few examples
print("\nTesting the model on sample text:")
for i, text in enumerate(data['Resume_str'][:2]):
    doc = nlp(text)
    print(f"\nExample {i+1}:")
    print("Text preview:", text[:200], "...\n")
    
    for ent in doc.ents:
        print(f"  - '{ent.text}' ({ent.label_}) [{ent.start_char}:{ent.end_char}]")


Testing the model on sample text:

Example 1:
Text preview: Education Details 
January 2015 Bachelor of Engineering EXTC Mumbai, Maharashtra Mumbai University
January 2012 Diploma Industrial Electronics Vashi, MAHARASHTRA, IN Fr. Agnel Polytechnic
ETL Devel ...

  - 'MAHARASHTRA' (ORG) [152:163]
  - 'Fr' (ORG) [168:170]
  - 'Exprience' (GPE) [253:262]
  - 'California' (GPE) [317:327]
  - 'Mar' (LANGUAGE) [354:357]
  - 'California' (GPE) [408:418]
  - 'BSC' (ORG) [524:527]
  - 'Informatica' (ORG) [664:675]
  - '9.6.1' (CARDINAL) [676:681]
  - 'Oracle' (PERSON) [683:689]
  - '11' (CARDINAL) [690:692]
  - 'Facets' (GPE) [695:701]
  - 'Informatica' (ORG) [750:761]
  - 'Center' (GPE) [768:774]
  - '9.6.1' (CARDINAL) [775:780]
  - 'Oracle' (PERSON) [783:789]
  - '11' (CARDINAL) [790:792]
  - 'SQL' (DATASCIENCE) [795:798]
  - 'SQL' (DATASCIENCE) [803:806]
  - 'UNIX' (ORG) [809:813]
  - 'Facets' (GPE) [815:821]
  - 'Informatica' (ORG) [980:991]
  - 'Informatica' (ORG) [1206:1217]
  - 'Writing

  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches = self.matcher(doc, allow_missing=True, as_spans=False)


In [7]:
# Save the trained model
output_dir = "../models/spacy_skill_ner"
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")

Model saved to ../models/spacy_skill_ner


In [8]:
# Example of how to load and use the saved model
def extract_skills(resume_text):
    # Load the saved model
    loaded_nlp = spacy.load("../models/spacy_skill_ner")
    
    doc = loaded_nlp(resume_text)
    skills = {
        "SWE": [],
        "DATASCIENCE": [],
        "DEVOPS": []
    }
    
    for ent in doc.ents:
        if ent.label_ in skills:
            if ent.text not in skills[ent.label_]:
                skills[ent.label_].append(ent.text)
    
    return skills

# Test the inference
sample_text = """
TECHNICAL SKILLS
Programming Languages: Python, Java, JavaScript
Data Science: SQL, Machine Learning, TensorFlow
DevOps: Docker, AWS, Kubernetes
"""

extracted_skills = extract_skills(sample_text)
print("\nExtracted Skills:")
for category, skills in extracted_skills.items():
    print(f"{category}: {', '.join(skills)}")


Extracted Skills:
SWE: Python, Java, JavaScript
DATASCIENCE: SQL
DEVOPS: Machine, AWS


  matches = self.matcher(doc, allow_missing=True, as_spans=False)


In [None]:
# Using model w/ the main CSv to create visualizations with DFs and grabbing highest counted skills/  

totalSkills = {
    "SWE": [],
    "DS": [],
    "DO": [],
}
for idx, resume_text in enumerate(data["Resume_str"][:51]):
    # print(f"Resume ", idx+1)
    skills = extract_skills(resume_text)
    for i in range(len(skills["SWE"])):
        totalSkills["SWE"].append(skills["SWE"][i])

    for i in range(len(skills["DATASCIENCE"])):
        totalSkills["DS"].append(skills["DATASCIENCE"][i])

    for i in range(len(skills["DEVOPS"])):
        totalSkills["DO"].append(skills["DEVOPS"][i])


print(totalSkills)
print(len(totalSkills["SWE"]))
print(len(totalSkills["DS"]))
print(len(totalSkills["DO"]))

unique_swe_skills = {}
unique_dataScience_skills = {}
unique_devOps_skills = {}

for skill in totalSkills["SWE"]:
    unique_swe_skills[skill] = unique_swe_skills.get(skill, 0) +1 
for skill in totalSkills["DS"]:
    unique_dataScience_skills[skill] = unique_dataScience_skills.get(skill, 0) +1 
for skill in totalSkills["DO"]:
    unique_devOps_skills[skill] = unique_devOps_skills.get(skill, 0) +1 

top_swe_skill = max(unique_swe_skills, key=unique_swe_skills.get)
top_swe_count = unique_swe_skills[top_swe_skill]

top_ds_skill = max(unique_dataScience_skills, key=unique_dataScience_skills.get)
top_ds_count = unique_dataScience_skills[top_ds_skill]

top_do_skill = max(unique_dataScience_skills, key=unique_dataScience_skills.get)
top_do_count = unique_dataScience_skills[top_do_skill]

unique_swe_df = pd.DataFrame(list(unique_swe_skills.items()), columns=["skills", "count"])
unique_dS_df= pd.DataFrame(list(unique_dataScience_skills.items()), columns=["skills", "count"])
unique_dO_df = pd.DataFrame(list(unique_devOps_skills.items()), columns=["skills", "count"])

print(f"The highest count of unique swe skills {top_swe_skill} : {top_swe_count}")
print(f"The highest count of unique swe skills {top_ds_skill} : {top_ds_count}")
print(f"The highest count of unique swe skills {top_do_skill} : {top_do_count}")

print(unique_swe_df)
print(unique_dS_df)
print(unique_dO_df)

print(unique_swe_skills)
print(unique_devOps_skills)
print(unique_dataScience_skills)



  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches = self.matcher(doc, allow_missing=True, as_spans=False)
  matches 

{'SWE': ['Java', 'Angular', 'React', 'MySql', 'Node.js', 'Java', 'Java', 'C++', 'Python', 'fault', 'Git', 'Python', 'C++', 'Python', 'Java', 'Java', 'Bill', 'Python', 'Java', 'JavaScript', 'Angular', 'JavaScript', 'Git', 'JavaScript', 'Astellas', 'Java', 'C++', 'Java', 'Performance', 'C++', 'Java', 'Java', 'Python', 'Java', 'JavaScript', 'C++', 'JavaScript', 'Angular', 'Java', 'React', 'Java', 'JavaScript', 'Angular', 'Python', 'Python', 'fault', 'Git', 'Java', 'JavaScript', 'Angular', 'Java', 'Java', 'Java', 'C++', 'JavaScript', 'Angular', 'Java', 'React', 'Python', 'C++', 'Java', 'Java', 'Bill', 'C++', 'Python', 'Java'], 'DS': ['SQL', 'SQL', 'SQL', 'SQL', 'SQL', 'Cloud', 'SQL', 'Python', 'restfull', 'Cloud', 'SQL', 'Python', 'SQL', 'Devops', 'Bill', 'COSMOSS', 'WebLogic', 'SQL', 'Knockout', 'SQL', 'Node', 'UML', 'Sikkim', 'Blockchain', 'SQL', 'SQL', 'Attendance', 'Python', 'SQL', 'SQL', 'Point', 'Python', 'Devops', 'SQL', 'CSS', 'UI', 'SQL', 'SQL', 'jsp', 'OmegaSoft', 'Python', 'rest

In [12]:
print(data["Resume_str"])

266    Education Details \r\nJanuary 2015 Bachelor of...
55     Skills â¢ Language: Java â¢ Operating System...
323    Skills Strong CS fundamentals and problem solv...
51     Computer Skills: Languages And Script: JSP, Se...
99     Education Details \r\nMay 2013 Master Computer...
                             ...                        
105    Education Details \r\nMay 2013 Master Computer...
121    Education Details \r\nJune 2013 to June 2016 D...
172    Skills VISA B1-VISA (USA) Onsite Visits to Swe...
192    CORE COMPETENCIES ~ Ant ~ Maven ~ GIT ~ Bitbuc...
70     Education Details \r\nAugust 2010 to May 2017 ...
Name: Resume_str, Length: 200, dtype: object
