## Importing Modules

In [60]:
import re
import json
import spacy
import logging
from spacy.training import Example,offsets_to_biluo_tags
from spacy.tokens import Doc, Span
import random
from spacy.util import minibatch, compounding
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support,  accuracy_score
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score

## Loading json files 

In [63]:
train_file_path = 'C:/Users/madhu/OneDrive/Desktop/ACM/Named_Entity_Recognition/NER/traindata.json' 
test_file_path = 'C:/Users/madhu/OneDrive/Desktop/ACM/Named_Entity_Recognition/NER/testdata.json'

### Loading train data


In [66]:
train_data=[]
with open(train_file_path , 'r', encoding='utf-8') as file:
    for line in file:
        resume = json.loads(line)
        train_data.append(resume)

if train_data:
    print(train_data[0])      
        

{'content': 'Govardhana K\nSenior Software Engineer\n\nBengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/\nb2de315d95905b68\n\nTotal IT experience 5 Years 6 Months\nCloud Lending Solutions INC 4 Month • Salesforce Developer\nOracle 5 Years 2 Month • Core Java Developer\nLanguages Core Java, Go Lang\nOracle PL-SQL programming,\nSales Force Developer with APEX.\n\nDesignations & Promotions\n\nWilling to relocate: Anywhere\n\nWORK EXPERIENCE\n\nSenior Software Engineer\n\nCloud Lending Solutions -  Bangalore, Karnataka -\n\nJanuary 2018 to Present\n\nPresent\n\nSenior Consultant\n\nOracle -  Bangalore, Karnataka -\n\nNovember 2016 to December 2017\n\nStaff Consultant\n\nOracle -  Bangalore, Karnataka -\n\nJanuary 2014 to October 2016\n\nAssociate Consultant\n\nOracle -  Bangalore, Karnataka -\n\nNovember 2012 to December 2013\n\nEDUCATION\n\nB.E in Computer Science Engineering\n\nAdithya Institute of Technology -  Tamil Nadu\n\nSeptember 2008 to June 2012\n\n

### Loading test data


In [69]:
test_data=[]
with open(test_file_path , 'r', encoding='utf-8') as file:
    for line in file:
        resume = json.loads(line)
        test_data.append(resume)


## Preprocessing steps

### Removing white spaces and new line characters

In [73]:
def clean_text(text):
    text = re.sub(r'\s+', ' ', text).strip()
    return text

for entry in train_data:
    entry['content'] = clean_text(entry['content'])

for entry in test_data:
    entry['content'] = clean_text(entry['content'])

### Removing overlapping entities

In [76]:
def remove_overlapping_entities(annotations):
    entities = sorted(annotations['entities'], key=lambda x: (x[0], x[1]))
    filtered_entities = []
    prev_start, prev_end = -1, -1

    for start, end, label in entities:
        if start >= prev_end:
            filtered_entities.append((start, end, label))
            prev_start, prev_end = start, end
        else:
            if end > prev_end:
                if filtered_entities:
                    filtered_entities.pop()  # Remove the last entity
                filtered_entities.append((start, end, label))
                prev_start, prev_end = start, end

    annotations['entities'] = filtered_entities
    return annotations

### Function to filter misaligned entities

In [112]:
def filter_misaligned_entities(nlp, text, entities):
    biluo_tags = offsets_to_biluo_tags(nlp.make_doc(text), entities)
    return [entity for entity, tag in zip(entities, biluo_tags) if tag != 'O']


## Function to convert data to spaCy format


In [87]:
def convert_data_to_spacy(data):
    processed_data = []

    for data in data:
        text = data['content']
        entities = []
        for annotation in data['annotation']:
            # Only a single point in text annotation.
            point = annotation['points'][0]
            start = point['start']
            end = point['end'] + 1
            entity_text = text[start:end].strip()

            # Correct start and end positions based on stripped text.
            start = text.index(entity_text)
            end = start + len(entity_text)

            labels = annotation['label']
            # Handle both list of labels or a single label.
            if not isinstance(labels, list):
                labels = [labels]

            for label in labels:
                # Dataturks indices are both inclusive [start, end] but Spacy is not [start, end)
                entities.append((point['start'], point['end'] + 1, label))

        annotations = {"entities": entities}
        annotations = remove_overlapping_entities(annotations)  # Clean overlapping entities
         # Filter out misaligned entities
        entities = filter_misaligned_entities(nlp, text, annotations['entities'])
        processed_data.append((text, annotations))

    return processed_data


### Converting the train data

In [90]:
train_data = convert_data_to_spacy(train_data)

In [92]:
print(train_data[0])

('Govardhana K Senior Software Engineer Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68 Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX. Designations & Promotions Willing to relocate: Anywhere WORK EXPERIENCE Senior Software Engineer Cloud Lending Solutions - Bangalore, Karnataka - January 2018 to Present Present Senior Consultant Oracle - Bangalore, Karnataka - November 2016 to December 2017 Staff Consultant Oracle - Bangalore, Karnataka - January 2014 to October 2016 Associate Consultant Oracle - Bangalore, Karnataka - November 2012 to December 2013 EDUCATION B.E in Computer Science Engineering Adithya Institute of Technology - Tamil Nadu September 2008 to June 2012 https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-download&ikw=download-top&co

## Load blank english model 

In [95]:
 nlp = spacy.blank('en')  # Create blank Language class

### Adding the NER pipeline component

In [98]:
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner', last=True)

### Adding labels to the NER component

In [101]:
for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

### Disabling other pipeline components to only train NER


In [104]:
 # Get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # Only train NER
    optimizer = nlp.begin_training()

## Training the model

In [107]:
# Training the model
def train_spacy(train_data, iterations=10):
    for itn in range(iterations):
        print(f"Starting iteration {itn}")
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update(
                [example],  # Batch of Example objects
                drop=0.2,  # Dropout - make it harder to memorize data
                sgd=optimizer,  # Callable to update weights
                losses=losses
            )
        print(losses)
    return nlp

In [109]:
nlp_model = train_spacy(train_data)

Starting iteration 0
{'ner': 4314.65722655211}
Starting iteration 1
{'ner': 601.3107491989562}
Starting iteration 2
{'ner': 925.4968483394929}
Starting iteration 3
{'ner': 414.3662813244367}
Starting iteration 4
{'ner': 606.333228907427}
Starting iteration 5
{'ner': 699.9589178571306}
Starting iteration 6
{'ner': 786.1347077396551}
Starting iteration 7
{'ner': 597.2934646112649}
Starting iteration 8
{'ner': 382.6191091736131}
Starting iteration 9
{'ner': 558.6145360851781}


## Predictions on test set

In [115]:
for i, data in enumerate(test_data):
    text = data['content']
    doc = nlp_model(text)
    print(f"Entities in test text {i}:")
    for ent in doc.ents:
        print(f" - {ent.text}: {ent.label_}")
    print("\n")


Entities in test text 0:
 - Abhishek Jha: Name
 - Application Development Associate - Accenture: Designation
 - indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a: Email Address


Entities in test text 1:
 - Afreen Jamadar: Name
 - Active member: Designation


Entities in test text 2:
 - Akhil Yadav: Name
 - Polemaina: Location


Entities in test text 3:
 - Alok Khandai: Name
 - Operational Analyst (SQL DBA): Designation
 - Bengaluru: Location


Entities in test text 4:
 - Ananya Chavan: Name


Entities in test text 5:
 - Anvitha Rao: Name
 - Automation developer: Designation
 - of Technology - Bengaluru, Karnataka September 2012: College Name


Entities in test text 6:
 - arjun ks: Name


Entities in test text 7:
 - Arun Elumalai: Name


Entities in test text 8:
 - Ashalata Bisoyi: Name


Entities in test text 9:
 - Ashok Kunam: Name
 - Team Lead: Designation
 - Microsoft: Companies worked at


Entities in test text 10:
 - Asish Ratha: Name
 - Subject matter Expert: Designation


Entities in 

## Saving the model

In [118]:
output_path = 'C:/Users/madhu/OneDrive/Desktop/ACM/Named_Entity_Recognition/NER/ner_model'
nlp.to_disk(output_path)
print(f"Model saved to {output_path}")

Model saved to C:/Users/madhu/OneDrive/Desktop/ACM/Named_Entity_Recognition/NER/ner_model
