## Import needed packages

In [1]:
# Install needed packages
!pip install spacy
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [18]:
# Import needed packages

import json
import logging
import math
import pickle
import random
import re
import sys
import spacy

import numpy as np
import pandas as pd

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

from seqeval.metrics import f1_score

from spacy import displacy
from spacy.training.example import Example
from spacy.training import offsets_to_biluo_tags

from itertools import groupby, chain

In [19]:
# Stop displayin warnings
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

## Prepare the data

The dataset has 220 items of which 220 items have been manually labeled.

The labels are divided into following 10 categories:
*   Name
*   College Name
*   Degree
*   Graduation Year
*   Years of Experience
*   Companies worked at
*   Designation
*   Skills
*   Location
*   Email Address

In [6]:
# Import the data
df = pd.read_json ('Entity Recognition in Resumes.json',lines=True)

In [7]:
# Display the first ten different lines
df.head()

Unnamed: 0,content,annotation,extras
0,Abhishek Jha\nApplication Development Associat...,"[{'label': ['Skills'], 'points': [{'start': 12...",
1,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta...",
2,"Akhil Yadav Polemaina\nHyderabad, Telangana - ...","[{'label': ['Skills'], 'points': [{'start': 37...",
3,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80...",
4,Ananya Chavan\nlecturer - oracle tutorials\n\n...,"[{'label': ['Degree'], 'points': [{'start': 20...",


In [8]:
# Remove the \n of each CV
for i in range(len(df)):
  df["content"][i] = df["content"][i].replace("\n", " ")
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["content"][i] = df["content"][i].replace("\n", " ")


Unnamed: 0,content,annotation,extras
0,Abhishek Jha Application Development Associate...,"[{'label': ['Skills'], 'points': [{'start': 12...",
1,Afreen Jamadar Active member of IIIT Committee...,"[{'label': ['Email Address'], 'points': [{'sta...",
2,"Akhil Yadav Polemaina Hyderabad, Telangana - E...","[{'label': ['Skills'], 'points': [{'start': 37...",
3,Alok Khandai Operational Analyst (SQL DBA) Eng...,"[{'label': ['Skills'], 'points': [{'start': 80...",
4,Ananya Chavan lecturer - oracle tutorials Mum...,"[{'label': ['Degree'], 'points': [{'start': 20...",


In [9]:
# Remove the 'extras' column 
df = df.drop(['extras'], axis=1)
df.head()

Unnamed: 0,content,annotation
0,Abhishek Jha Application Development Associate...,"[{'label': ['Skills'], 'points': [{'start': 12..."
1,Afreen Jamadar Active member of IIIT Committee...,"[{'label': ['Email Address'], 'points': [{'sta..."
2,"Akhil Yadav Polemaina Hyderabad, Telangana - E...","[{'label': ['Skills'], 'points': [{'start': 37..."
3,Alok Khandai Operational Analyst (SQL DBA) Eng...,"[{'label': ['Skills'], 'points': [{'start': 80..."
4,Ananya Chavan lecturer - oracle tutorials Mum...,"[{'label': ['Degree'], 'points': [{'start': 20..."


In [10]:
# JSON formatting functions
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
  """Convert data to SpaCy format.
  Args:
    dataturks_JSON_FilePath (String): The path to the JSON file.

  Returns:
    list: The training data.
  """
  
  try:
    training_data = []
    lines=[]
    with open(dataturks_JSON_FilePath, 'r') as f:
      lines = f.readlines()

    for line in lines:
      data = json.loads(line)
      text = data['content'].replace("\n", " ")
      entities = []
      data_annotations = data['annotation']
      if data_annotations is not None:
        for annotation in data_annotations:
          # Only a single point in text annotation.
          point = annotation['points'][0]
          labels = annotation['label']
          # Handle both list of labels or a single label.
          if not isinstance(labels, list):
            labels = [labels]

          for label in labels:
            point_start = point['start']
            point_end = point['end']
            point_text = point['text']
                        
            lstrip_diff = len(point_text) - len(point_text.lstrip())
            rstrip_diff = len(point_text) - len(point_text.rstrip())
            if lstrip_diff != 0:
                            point_start = point_start + lstrip_diff
            if rstrip_diff != 0:
                            point_end = point_end - rstrip_diff
            entities.append((point_start, point_end + 1 , label))
      training_data.append((text, {"entities" : entities}))
    return training_data
  except Exception as e:
    logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
    return None 

In [11]:
# Remove extra spaces from each CV
def trim_entity_spans(data: list) -> list:
  """Removes leading and trailing white spaces from entity spans.
  Args:
    data (list): The data to be cleaned in spaCy JSON format.

  Returns:
    list: The cleaned data.
  """
  
  invalid_span_tokens = re.compile(r'\s')

  cleaned_data = []
  for text, annotations in data:
    entities = annotations['entities']
    valid_entities = []
    for start, end, label in entities:
      valid_start = start
      valid_end = end
      while valid_start < len(text) and invalid_span_tokens.match(text[valid_start]):
        valid_start += 1
      while valid_end > 1 and invalid_span_tokens.match(text[valid_end - 1]):
        valid_end -= 1
      valid_entities.append([valid_start, valid_end, label])
    cleaned_data.append([text, {'entities': valid_entities}])
  return cleaned_data

In [12]:
# Get each line of the file
with open('Entity Recognition in Resumes.json', 'r') as f:
      lines = f.readlines()

In [13]:
# Clean the data
data = trim_entity_spans(convert_dataturks_to_spacy('Entity Recognition in Resumes.json'))
data[0]

["Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

In [14]:
# Clean the annotated data
def clean_entities(training_data):
  """Removes overlapping entities.
  Args:
    training_data (list): The training data in SpaCy format.

  Returns:
    list: The cleaned data (with no overlapping entities).
  """

  clean_data = []
  for text, annotation in training_data:
        
    entities = annotation.get('entities')
    entities_copy = entities.copy()
        
    # Append entity only if it is longer than its overlapping entity
    i = 0
    for entity in entities_copy:
      j = 0
      for overlapping_entity in entities_copy:
        # Skip self
        if i != j:
          e_start, e_end, oe_start, oe_end = entity[0], entity[1], overlapping_entity[0], overlapping_entity[1]
          # Delete any entity that overlaps, keep if longer
          if ((e_start >= oe_start and e_start <= oe_end) \
          or (e_end <= oe_end and e_end >= oe_start)) \
          and ((e_end - e_start) <= (oe_end - oe_start)):
            entities.remove(entity)
        j += 1
      i += 1
    clean_data.append((text, {'entities': entities}))
                
  return clean_data

In [15]:
# Clean the data
data = clean_entities(data)

In [16]:
# Split the data
def train_test_split(data, test_size, random_state):
  """Split the data into train and test sets.
  Args:
    data (list): The cleaned data.
    test_size (number): The size of the test set.
    random_state (number): The radnom state number (for sets reproduction).

  Returns:
    list: The train data.
    list: The test data.
  """
  
  random.Random(random_state).shuffle(data)
  test_idx = len(data) - math.floor(test_size * len(data))
  train_set = data[0: test_idx]
  test_set = data[test_idx: ]
  return train_set, test_set

In [17]:
# Get the train and test sets
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 42)

## Train the NER model

In [20]:
# Training function
def train_spacy():
  """Train the ner model baesd on the training data.
  Returns:
    object: The ner object.
  """  

  nlp = spacy.blank('en')  # Create blank Language class
  # Create the built-in pipeline components and add them to the pipeline
  # nlp.create_pipe works for built-ins that are registered with spaCy
  if 'ner' not in nlp.pipe_names:
    # ner = nlp.create_pipe('ner')
    ner = nlp.add_pipe("ner", last=True)
        
  # Add labels
  for _, annotations in train_data:
    for ent in annotations.get("entities"):
      ner.add_label(ent[2])
            
  # Get names of other pipes to disable them during training
  other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
  with nlp.disable_pipes(*other_pipes):  # Only train NER
    optimizer = nlp.begin_training()
    for itn in range(10):
      print("Statring iteration " + str(itn))
      random.shuffle(train_data)
      losses = {}
      for text, annotations in train_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        # Update the model
        nlp.update([example], losses=losses, drop=0.3, sgd=optimizer)
      print(losses)
      print("----------------------------")
  return nlp

In [21]:
# Train the ner model
nlp = train_spacy()

Statring iteration 0




{'ner': 13065.77521394494}
----------------------------
Statring iteration 1
{'ner': 5119.908427166601}
----------------------------
Statring iteration 2
{'ner': 4329.97508010868}
----------------------------
Statring iteration 3
{'ner': 4367.289222588205}
----------------------------
Statring iteration 4
{'ner': 3584.574239579521}
----------------------------
Statring iteration 5
{'ner': 3156.016597302485}
----------------------------
Statring iteration 6
{'ner': 3101.6413720193923}
----------------------------
Statring iteration 7
{'ner': 2891.3553231821174}
----------------------------
Statring iteration 8
{'ner': 2624.8537775919494}
----------------------------
Statring iteration 9
{'ner': 2573.18014525528}
----------------------------


## Test the NER model

In [22]:
# Convert prediction to the bilou format
def doc_to_bilou(nlp, text):
  """Convert ner model predictions to the bilou format.
  Args:
    nlp (object): ner model.
    text (string): text to predict.

  Returns:
    list: list of the prediction in the bilou format.
  """
  
  doc = nlp(text)
  tokens = [(tok.text, tok.idx, tok.ent_type_) for tok in doc]
  entities = []
  for entity, group in groupby(tokens, key=lambda t: t[-1]):
    if not entity:
      continue
    group = list(group)
    _, start, _ = group[0]
    word, last, _ = group[-1]
    end = last + len(word)
        
    entities.append((start, end, entity))

  gold = offsets_to_biluo_tags(nlp(text), entities = entities)
  pred_ents = gold
    
  return pred_ents

In [23]:
# Get the predicted and actual labels
y_test = []
y_pred = []

for text, annots in test_data:
  gold = offsets_to_biluo_tags(nlp.make_doc(text), entities = annots.get("entities"))
  ents = gold
  pred_ents = doc_to_bilou(nlp, text)
    
  y_test.append(ents)
  y_pred.append(pred_ents)



In [24]:
# Pre-process the test predictions
lb = LabelBinarizer()
y_true_combined = lb.fit_transform(list(chain.from_iterable(y_test)))
y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

In [25]:
# ner model report
def ner_report(y_true, y_pred):
  """Classification report for a list of BIO-encoded sequences.
     It computes token-level metrics and discards "O" labels.
  Args:
    y_true (list): List of predictions from the test set.
    y_pred (list): List of predictions from the ner model.

  Returns:
    object: Classification report.
  """
  lb = LabelBinarizer()
  y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
  y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
  tagset = set(lb.classes_)
  tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
  class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
  return classification_report(
    y_true_combined,
    y_pred_combined,
    labels = [class_indices[cls] for cls in tagset],
    target_names = tagset
    ), accuracy_score(y_true_combined, y_true_combined
    ), f1_score(y_test, y_pred)

In [26]:
# Get the ner model report
report, accuracy, score = ner_report(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
# Print the model report
print(report)

                       precision    recall  f1-score   support

                    -       0.00      0.00      0.00       428
       B-College Name       0.67      0.57      0.62        54
       I-College Name       0.50      0.53      0.52       103
       L-College Name       0.63      0.54      0.58        54
       U-College Name       1.00      0.17      0.29         6
B-Companies worked at       0.57      0.65      0.61        48
I-Companies worked at       0.00      0.00      0.00        10
L-Companies worked at       0.44      0.50      0.47        48
U-Companies worked at       0.41      0.45      0.43        92
             B-Degree       0.88      0.47      0.61        45
             I-Degree       0.81      0.47      0.59       132
             L-Degree       0.83      0.44      0.58        45
             U-Degree       0.30      0.60      0.40         5
        B-Designation       0.64      0.52      0.57        91
        I-Designation       0.65      0.37      0.47  

In [28]:
# Get the model accuracy
print(accuracy)

1.0


In [29]:
# Get the model f1-score
print(score)

0.5294117647058824


## Save the model

In [30]:
# Save the model to disk
filename = 'ner_resumes.sav'
pickle.dump(nlp, open(filename, 'wb'))

## Try the model on a sample from the test data

In [31]:
# Load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

In [32]:
# Define colors and options for display
colors = {
    "Skills": "#ffe599", 
    "College Name": "#5e72e4",
    "Graduation Year": "#f4f5f7",
    "Designation": "#11cdef",
    "Name": "#2dce89", 
    "Degree": "#f5365c",
    "Companies worked at": "#fb6340",
    "Location": "#849cbc",
    "Email Address": "#4a6a48",
    "Years of Experience": "#e0ecfc"
    }
options = {"ents": ['Skills', 
                    'College Name', 
                    'Graduation Year',
                    'Designation',
                    'Name',
                    'Degree',
                    'Companies worked at',
                    'Location',
                    'Email Address',
                    'Years of Experience'
                  ], 
           "colors": colors}

In [35]:
# Try the model predictions on one resume
for text, annots in test_data:
  doc = loaded_model(text)
  tokens = [(tok.text, tok.idx, tok.ent_type_) for tok in doc]
  entities = []
  for entity, group in groupby(tokens, key=lambda t: t[-1]):
    if not entity:
      continue
    group = list(group)
    _, start, _ = group[0]
    word, last, _ = group[-1]
    end = last + len(word)
        
    entities.append((start, end, entity))
  displacy.render(doc, style = "ent", options=options, jupyter = True)
  break