In [1]:
print("Welcome")

Welcome


In [5]:
import spacy
import pickle
import random
import json

In [2]:
import os

os.getcwd()

'c:\\Users\\arshi\\Downloads\\Desktop\\Bro-Project\\SkillForge.ai\\research'

In [3]:
os.chdir('c:\\Users\\arshi\\Downloads\\Desktop\\Bro-Project\\SkillForge.ai')

In [4]:
os.getcwd()

'c:\\Users\\arshi\\Downloads\\Desktop\\Bro-Project\\SkillForge.ai'

In [7]:
file_path = r'C:\Users\arshi\Downloads\Desktop\Bro-Project\SkillForge.ai\data\train_data.json'

In [8]:
with open(file_path, 'r') as file:
    train_data = json.load(file)

In [12]:
len(train_data)

200

In [19]:
text_sample = train_data[0]

In [30]:
def convert_json_to_spacy_format(json_data):
    """
    Convert JSON resume data into spaCy NER training format.
    
    Args:
        json_data (list): List of dictionaries containing resume data in the provided JSON format.
        
    Returns:
        list: List of tuples in (text, annotations) format suitable for spaCy NER training.
    """
    training_data = []
    
    for entry in json_data:
        if not isinstance(entry, list) or len(entry) != 2:
            continue
            
        text = entry[0]
        entities = entry[1].get("entities", [])
        
        # Convert entity format from [start, end, label] to (start, end, label)
        formatted_entities = []
        for entity in entities:
            if len(entity) == 3:
                formatted_entities.append(tuple(entity))
        
        training_data.append((text, {"entities": formatted_entities}))
    
    return training_data

In [31]:
convert_data = convert_json_to_spacy_format(train_data)

In [33]:
len(convert_data)

200

In [34]:
from spacy.training import Example
from spacy.util import minibatch, compounding
from pathlib import Path

In [13]:
nlp = spacy.blank('en')

In [35]:
def train_ner_model(training_data, model_name="resume_ner", output_dir=None, n_iter=100):
    """
    Train a spaCy NER model with the provided training data.
    
    Args:
        training_data (list): List of training examples in (text, annotations) format
        model_name (str): Name for the new model
        output_dir (str): Directory to save the trained model (None to not save)
        n_iter (int): Number of training iterations
        
    Returns:
        spacy.Language: The trained NER model
    """
    # Load a blank English model or existing model
    try:
        nlp = spacy.load(model_name)
        print(f"Loaded existing model '{model_name}'")
    except:
        nlp = spacy.blank("en")
        print(f"Created blank 'en' model")
    
    # Add NER pipeline if it doesn't exist
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")
    else:
        ner = nlp.get_pipe("ner")
    
    # Add entity labels to the NER component
    for _, annotations in training_data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])
    
    # Disable other pipelines during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        # Initialize the model with random weights
        nlp.initialize(lambda: [Example.from_dict(nlp.make_doc(text), annotations) 
                      for text, annotations in training_data])
        
        # Training loop
        print("Beginning training...")
        for itn in range(n_iter):
            random.shuffle(training_data)
            losses = {}
            
            # Batch the examples using spaCy's minibatch
            batches = minibatch(training_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                examples = []
                for text, annotations in batch:
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    examples.append(example)
                
                nlp.update(examples, drop=0.5, losses=losses)
            
            print(f"Iteration {itn + 1}/{n_iter}, Loss: {losses['ner']:.4f}")
    
    # Save the model if output directory is specified
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print(f"Model saved to {output_dir}")
    
    return nlp

In [37]:
def filter_overlapping_entities(training_data):
    """
    Filter out overlapping entities from training data
    """
    cleaned_data = []
    
    for text, annotations in training_data:
        entities = annotations.get("entities", [])
        
        # Sort entities by start position
        entities.sort(key=lambda x: x[0])
        
        filtered_entities = []
        prev_end = -1
        
        for ent in entities:
            start, end, label = ent
            if start >= prev_end:
                filtered_entities.append(ent)
                prev_end = end
        
        cleaned_data.append((text, {"entities": filtered_entities}))
    
    return cleaned_data

In [38]:
convert_data = filter_overlapping_entities(convert_data)

In [39]:
trained_model = train_ner_model(
        convert_data,
        model_name="resume_ner",
        output_dir="./resume_ner_model",
        n_iter=50
    )

Created blank 'en' model




Beginning training...
Iteration 1/50, Loss: 47579.8359
Iteration 2/50, Loss: 6992.9209
Iteration 3/50, Loss: 7047.4023
Iteration 4/50, Loss: 6989.9585
Iteration 5/50, Loss: 5767.9517
Iteration 6/50, Loss: 5175.3687
Iteration 7/50, Loss: 5107.7129
Iteration 8/50, Loss: 6294.2598
Iteration 9/50, Loss: 9640.9434
Iteration 10/50, Loss: 5812.9209
Iteration 11/50, Loss: 4823.1514
Iteration 12/50, Loss: 4462.7983
Iteration 13/50, Loss: 4387.7583
Iteration 14/50, Loss: 4272.4111
Iteration 15/50, Loss: 3902.9854
Iteration 16/50, Loss: 4150.3872
Iteration 17/50, Loss: 3922.3477
Iteration 18/50, Loss: 3707.1101
Iteration 19/50, Loss: 3955.3198
Iteration 20/50, Loss: 4617.3809
Iteration 21/50, Loss: 3459.7190
Iteration 22/50, Loss: 3379.2861
Iteration 23/50, Loss: 3316.2708
Iteration 24/50, Loss: 3322.6455
Iteration 25/50, Loss: 3336.6458
Iteration 26/50, Loss: 3447.4082
Iteration 27/50, Loss: 3363.4622
Iteration 28/50, Loss: 3277.3762
Iteration 29/50, Loss: 3184.9900
Iteration 30/50, Loss: 3022.3

In [45]:
test_text = "'Govardhana K Senior Software Engineer  Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68  Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX.  Designations & Promotions  Willing to relocate: Anywhere  WORK EXPERIENCE  Senior Software Engineer  Cloud Lending Solutions -  Bangalore, Karnataka -  January 2018 to Present  Present  Senior Consultant  Oracle -  Bangalore, Karnataka -  November 2016 to December 2017  Staff Consultant  Oracle -  Bangalore, Karnataka -  January 2014 to October 2016  Associate Consultant  Oracle -  Bangalore, Karnataka -  November 2012 to December 2013  EDUCATION  B.E in Computer Science Engineering  Adithya Institute of Technology -  Tamil Nadu  September 2008 to June 2012  https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-download&ikw=download-top&co=IN https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-download&ikw=download-top&co=IN   SKILLS  APEX. (Less than 1 year), Data Structures (3 years), FLEXCUBE (5 years), Oracle (5 years), Algorithms (3 years)  LINKS  https://www.linkedin.com/in/govardhana-k-61024944/  ADDITIONAL INFORMATION  Technical Proficiency:  Languages: Core Java, Go Lang, Data Structures & Algorithms, Oracle PL-SQL programming, Sales Force with APEX. Tools: RADTool, Jdeveloper, NetBeans, Eclipse, SQL developer, PL/SQL Developer, WinSCP, Putty Web Technologies: JavaScript, XML, HTML, Webservice  Operating Systems: Linux, Windows Version control system SVN & Git-Hub Databases: Oracle Middleware: Web logic, OC4J Product FLEXCUBE: Oracle FLEXCUBE Versions 10.x, 11.x and 12.x  https://www.linkedin.com/in/govardhana-k-61024944/"
doc = trained_model(test_text)
print("\nTest prediction:")
# for ent in doc.ents:
#     print(f"{ent.text} ({ent.label_})")
   
skills = [ent.text for ent in doc.ents if ent.label_ == 'Skills']


Test prediction:


In [43]:
train_data[0]

['Govardhana K Senior Software Engineer  Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68  Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX.  Designations & Promotions  Willing to relocate: Anywhere  WORK EXPERIENCE  Senior Software Engineer  Cloud Lending Solutions -  Bangalore, Karnataka -  January 2018 to Present  Present  Senior Consultant  Oracle -  Bangalore, Karnataka -  November 2016 to December 2017  Staff Consultant  Oracle -  Bangalore, Karnataka -  January 2014 to October 2016  Associate Consultant  Oracle -  Bangalore, Karnataka -  November 2012 to December 2013  EDUCATION  B.E in Computer Science Engineering  Adithya Institute of Technology -  Tamil Nadu  September 2008 to June 2012  https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-

In [46]:
skills

['APEX. (Less than 1 year), Data Structures (3 years), FLEXCUBE (5 years), Oracle (5 years), Algorithms (3 years)',
 'Technical Proficiency:  Languages: Core Java, Go Lang, Data Structures & Algorithms, Oracle PL-SQL programming, Sales Force with APEX. Tools: RADTool, Jdeveloper, NetBeans, Eclipse, SQL developer, PL/SQL Developer, WinSCP, Putty Web Technologies: JavaScript, XML, HTML, Webservice  Operating Systems: Linux, Windows Version control system SVN & Git-Hub Databases: Oracle Middleware: Web logic, OC4J Product FLEXCUBE: Oracle FLEXCUBE Versions 10.x, 11.x and 12.x  https://www.linkedin.com/in/govardhana-k-61024944/']

## Second way to create model using span

In [53]:
def convert_to_spans_format(training_data, nlp):
    """
    Convert entities to spans format with validation for:
    - Invalid offsets (start > end)
    - Offsets exceeding text length
    - Proper token alignment
    """
    spans_data = []
    error_count = 0
    
    for text, annotations in training_data:
        doc = nlp.make_doc(text)
        spans = []
        
        for ent in annotations.get("entities", []):
            if len(ent) != 3:
                continue
                
            start, end, label = ent
            
            # Validate offsets
            if start >= end:
                error_count += 1
                continue
                
            if end > len(text):
                error_count += 1
                continue
                
            span = doc.char_span(start, end, label=label)
            if span is not None:
                spans.append(span)
            else:
                error_count += 1
        
        spans_data.append((text, {"spans": spans}))
    
    if error_count > 0:
        print(f"Warning: Skipped {error_count} invalid entity spans")
    
    return spans_data

In [56]:
nlp1 = spacy.blank("en")

In [54]:
convert_data_span = filter_overlapping_entities(train_data)

In [57]:
train_data_span = convert_to_spans_format(convert_data,nlp1)



In [59]:
def train_ner_model_span(training_data, model_name="resume_ner", output_dir=None, n_iter=100):
    """Train with error handling for overlapping entities"""
    # Load or create model
    try:
        nlp = spacy.load(model_name)
        print(f"Loaded existing model '{model_name}'")
    except:
        nlp = spacy.blank("en")
        print(f"Created blank 'en' model")
    
    # Add NER pipeline
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")
    else:
        ner = nlp.get_pipe("ner")
    
    # Add labels
    for _, annotations in training_data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])
    
    # Training with error handling
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.initialize()
        
        print("Beginning training...")
        for itn in range(n_iter):
            random.shuffle(training_data)
            losses = {}
            
            batches = minibatch(training_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                examples = []
                for text, annotations in batch:
                    doc = nlp.make_doc(text)
                    try:
                        example = Example.from_dict(doc, annotations)
                        examples.append(example)
                    except ValueError as e:
                        print(f"Skipping problematic example: {str(e)}")
                        continue
                
                if examples:  # Only update if we have valid examples
                    nlp.update(examples, drop=0.5, losses=losses, sgd=optimizer)
            
            print(f"Iteration {itn + 1}/{n_iter}, Loss: {losses.get('ner', 0):.4f}")
    
    if output_dir:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True)
        nlp.to_disk(output_dir)
        print(f"Model saved to {output_dir}")
    
    return nlp

In [60]:
trained_model2 = train_ner_model_span(
    train_data_span,
    model_name="resume_ner2",
    output_dir="./resume_ner_model2",
    n_iter=50
)

Created blank 'en' model
Beginning training...
Skipping problematic example: [E879] Unexpected type for 'spans' data. Provide a dictionary mapping keys to a list of spans, with each span represented by a tuple (start_char, end_char). The tuple can be optionally extended with a label and a KB ID.
Skipping problematic example: [E879] Unexpected type for 'spans' data. Provide a dictionary mapping keys to a list of spans, with each span represented by a tuple (start_char, end_char). The tuple can be optionally extended with a label and a KB ID.
Skipping problematic example: [E879] Unexpected type for 'spans' data. Provide a dictionary mapping keys to a list of spans, with each span represented by a tuple (start_char, end_char). The tuple can be optionally extended with a label and a KB ID.
Skipping problematic example: [E879] Unexpected type for 'spans' data. Provide a dictionary mapping keys to a list of spans, with each span represented by a tuple (start_char, end_char). The tuple can be

In [None]:
test_text = "Govardhana K Senior Software Engineer  Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68  Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX.  Designations & Promotions  Willing to relocate: Anywhere  WORK EXPERIENCE  Senior Software Engineer  Cloud Lending Solutions -  Bangalore, Karnataka -  January 2018 to Present  Present  Senior Consultant  Oracle -  Bangalore, Karnataka -  November 2016 to December 2017  Staff Consultant  Oracle -  Bangalore, Karnataka -  January 2014 to October 2016  Associate Consultant  Oracle -  Bangalore, Karnataka -  November 2012 to December 2013  EDUCATION  B.E in Computer Science Engineering  Adithya Institute of Technology -  Tamil Nadu  September 2008 to June 2012  https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-download&ikw=download-top&co=IN https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-download&ikw=download-top&co=IN   SKILLS  APEX. (Less than 1 year), Data Structures (3 years), FLEXCUBE (5 years), Oracle (5 years), Algorithms (3 years)  LINKS  https://www.linkedin.com/in/govardhana-k-61024944/  ADDITIONAL INFORMATION  Technical Proficiency:  Languages: Core Java, Go Lang, Data Structures & Algorithms, Oracle PL-SQL programming, Sales Force with APEX. Tools: RADTool, Jdeveloper, NetBeans, Eclipse, SQL developer, PL/SQL Developer, WinSCP, Putty Web Technologies: JavaScript, XML, HTML, Webservice  Operating Systems: Linux, Windows Version control system SVN & Git-Hub Databases: Oracle Middleware: Web logic, OC4J Product FLEXCUBE: Oracle FLEXCUBE Versions 10.x, 11.x and 12.x  https://www.linkedin.com/in/govardhana-k-61024944/"
doc = trained_model2(test_text)
print("\nTest prediction:")
# for ent in doc.ents:
#     print(f"{ent.text} ({ent.label_})")
   
skills2 = [ent.text for ent in doc.ents if ent.label_ == 'Skills']


Test prediction:


In [62]:
skills2

[]

In [63]:
print('model creation failed')

model creation failed
