## Import libraries


In [1]:
import re
import json
import re 
import json
from pprint import pprint

## Convert data from Dataturks to spaCy format

In [4]:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks JSON-lines export into spaCy-style training tuples.

    Every non-blank line of the file is parsed as one JSON object that carries
    a ``content`` string and an ``annotation`` entry (a list of annotations or
    ``None``). For each annotation only its first ``points`` item is used, and
    one entity is emitted per label attached to that annotation.

    Args:
        dataturks_JSON_FilePath (str): Path to the Dataturks export file
            (one JSON document per line).

    Returns:
        list: One ``(text, {"entities": [(start, end, label), ...]})`` tuple
        per document. Newlines in ``text`` are replaced by single spaces, and
        ``end`` is the annotated end offset (after whitespace adjustment)
        plus one.
    """
    training_data = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing empty line) instead of
            # letting json.loads fail on them.
            if not line.strip():
                continue

            data = json.loads(line)
            # One-for-one character replacement, so annotation offsets
            # remain valid for the returned text.
            text = data['content'].replace("\n", " ")
            entities = []
            data_annotations = data['annotation']
            if data_annotations is not None:
                for annotation in data_annotations:
                    # Only a single point per text annotation is used.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # Handle both a list of labels and a single label.
                    if not isinstance(labels, list):
                        labels = [labels]

                    # Shrink the span by the amount of leading/trailing
                    # whitespace in the annotated text. This does not depend
                    # on the label, so it is computed once per annotation.
                    point_text = point['text']
                    lstrip_diff = len(point_text) - len(point_text.lstrip())
                    rstrip_diff = len(point_text) - len(point_text.rstrip())
                    point_start = point['start'] + lstrip_diff
                    point_end = point['end'] - rstrip_diff

                    for label in labels:
                        # +1 turns the annotated end offset into an
                        # exclusive end.
                        entities.append((point_start, point_end + 1, label))
            training_data.append((text, {"entities": entities}))
    return training_data

def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Spans that consist only of whitespace (or are otherwise empty after
    trimming) are dropped, so the result never contains a span whose start
    is at or beyond its end.

    Args:
        data (list): The data to be cleaned in spaCy JSON format, i.e. items
            of the form ``(text, {'entities': [(start, end, label), ...]})``.

    Returns:
        list: The cleaned data as ``[text, {'entities': [[start, end, label],
        ...]}]`` items.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            valid_start = start
            valid_end = end
            # Advance the start past leading whitespace.
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            # Pull the end back over trailing whitespace, but never past the
            # (already trimmed) start; otherwise a whitespace-only span would
            # come out inverted.
            while valid_end > valid_start and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            # Keep only spans that still cover at least one character.
            if valid_start < valid_end:
                valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data

## Analyze the Dataset

In [5]:
# Load the Dataturks export, convert it to spaCy-style tuples, then trim
# whitespace from the entity spans; show the first record as a sanity check.
dataset_path = 'data/Entity Recognition in Resumes.json'
data = trim_entity_spans(
    convert_dataturks_to_spacy(dataset_path)
)
pprint(data[0])

['Abhishek Jha Application Development Associate - Accenture  Bengaluru, '
 'Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  '
 '• To work for an organization which provides me the opportunity to improve '
 "my skills and knowledge for my individual and company's growth in best "
 'possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK '
 'EXPERIENCE  Application Development Associate  Accenture -  November 2017 to '
 'Present  Role: Currently working on Chat-bot. Developing Backend Oracle '
 'PeopleSoft Queries for the Bot which will be triggered based on given input. '
 'Also, Training the bot for different possible utterances (Both positive and '
 'negative), which will be given as input by the user.  EDUCATION  B.E in '
 'Information science and engineering  B.v.b college of engineering and '
 'technology -  Hubli, Karnataka  August 2013 to June 2017  12th in '
 'Mathematics  Woodbine modern school  April 2011 to March 2013  10th  '
 'Kendriy

In [6]:
# Number of converted records (one per line of the export).
print("Total dataset: ", len(data))

Total dataset:  220


In [24]:
# Each record is [text, annotations]; index 0 of the first record is its text.
print('Sample resume texts:')
pprint(
    data[0][0]
)

Sample resume texts:
('Abhishek Jha Application Development Associate - Accenture  Bengaluru, '
 'Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  '
 '• To work for an organization which provides me the opportunity to improve '
 "my skills and knowledge for my individual and company's growth in best "
 'possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK '
 'EXPERIENCE  Application Development Associate  Accenture -  November 2017 to '
 'Present  Role: Currently working on Chat-bot. Developing Backend Oracle '
 'PeopleSoft Queries for the Bot which will be triggered based on given input. '
 'Also, Training the bot for different possible utterances (Both positive and '
 'negative), which will be given as input by the user.  EDUCATION  B.E in '
 'Information science and engineering  B.v.b college of engineering and '
 'technology -  Hubli, Karnataka  August 2013 to June 2017  12th in '
 'Mathematics  Woodbine modern school  April 2011 to March 20

In [31]:
# Index 1 of a record holds the {'entities': [[start, end, label], ...]} dict.
print("Entity with start and end position in the text:")
pprint(
    data[0][1]
)

Entity with start and end position in the text:
{'entities': [[1296, 1622, 'Skills'],
              [993, 1154, 'Skills'],
              [939, 957, 'College Name'],
              [883, 905, 'College Name'],
              [856, 860, 'Graduation Year'],
              [771, 814, 'College Name'],
              [727, 769, 'Designation'],
              [407, 416, 'Companies worked at'],
              [372, 405, 'Designation'],
              [95, 145, 'Email Address'],
              [60, 69, 'Location'],
              [49, 58, 'Companies worked at'],
              [13, 46, 'Designation'],
              [0, 12, 'Name']]}


In [61]:
# List the entity labels (third field of every span) for the first three records.
for index, person in enumerate(data[:3]):
    print("person number: ", index + 1)
    for entity in person[1]['entities']:
        print(entity[2])
    print('\n')


person number:  1
Skills
Skills
College Name
College Name
Graduation Year
College Name
Designation
Companies worked at
Designation
Email Address
Location
Companies worked at
Designation
Name


person number:  2
Email Address
Skills
Graduation Year
College Name
Degree
Graduation Year
College Name
Degree
Email Address
Location
Name


person number:  3
Skills
Skills
Skills
Skills
Skills
Skills
Skills
Skills
Skills
College Name
Degree
Location
Companies worked at
Designation
Location
Companies worked at
Designation
Email Address
Location
Name




### Train Test Split

In [7]:
import random
import math

def train_test_split(data, test_size, random_state):
    """Randomly split ``data`` into a train set and a test set.

    The input is copied before shuffling, so the caller's sequence keeps its
    original order and calling the function again with the same arguments
    yields the same split.

    Args:
        data: Sequence of samples to split.
        test_size (float): Fraction of samples (between 0 and 1 inclusive)
            assigned to the test set; the test set size is
            ``floor(test_size * len(data))``.
        random_state: Seed passed to ``random.Random`` for the shuffle.

    Returns:
        tuple: ``(train_set, test_set)`` as two lists.

    Raises:
        ValueError: If ``test_size`` is not within ``[0, 1]``.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(
            "test_size must be between 0 and 1, got {!r}".format(test_size))

    # Shuffle a copy: shuffling `data` itself would reorder the caller's list
    # and make a second call produce a different split.
    shuffled = list(data)
    random.Random(random_state).shuffle(shuffled)

    test_idx = len(shuffled) - math.floor(test_size * len(shuffled))
    train_set = shuffled[:test_idx]
    test_set = shuffled[test_idx:]

    return train_set, test_set

In [8]:
# Hold out 10% of the records for evaluation, with a fixed seed for reproducibility.
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)
print('train set len: ', len(train_data))
print('test set len: ', len(test_data))

train set len:  198
test set len:  22
