# Named Entity Recognition Dataset

## Dataset

In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/resume-entities-for-ner/Entity Recognition in Resumes.json
/kaggle/input/resume-enitity-ner/Resume.json


In [8]:
import re

## Cleaning Entities

### Processing Indexes

In [9]:
import json

def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    training_data = []
    lines=[]
    with open(dataturks_JSON_FilePath, 'r') as f:
        lines = f.readlines()

    for line in lines:
        data = json.loads(line)
        text = data['content'].replace("\n", " ")
        entities = []
        data_annotations = data['annotation']
        if data_annotations is not None:
            for annotation in data_annotations:
                # only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    point_start = point['start']
                    point_end = point['end']
                    point_text = point['text']

                    lstrip_diff = len(point_text) - len(point_text.lstrip())
                    rstrip_diff = len(point_text) - len(point_text.rstrip())
                    if lstrip_diff != 0:
                        point_start = point_start + lstrip_diff
                    if rstrip_diff != 0:
                        point_end = point_end - rstrip_diff
                    print((point_start, point_end + 1 , label,point_text))
                    entities.append((point_start, point_end + 1 , label))
        training_data.append((text, {"entities" : entities}))
    return training_data

In [10]:
def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')
    span_entities=[]
    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end

            # Ensure valid_start and valid_end are within bounds
            if valid_start < 0:
                valid_start = 0
            if valid_end > len(text):
                valid_end = len(text)

            while valid_start < len(text) and invalid_span_tokens.match(text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(text[valid_end - 1]):
                valid_end -= 1

            if valid_start < valid_end:  # Ensure valid spans
                valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
        span_entities.append(valid_entities)
    return cleaned_data,span_entities


In [11]:
data = convert_dataturks_to_spacy("/kaggle/input/resume-enitity-ner/Resume.json")

(1749, 1755, 'Companies worked at', 'Oracle')
(1696, 1702, 'Companies worked at', 'Oracle')
(1417, 1423, 'Companies worked at', 'Oracle')
(1356, 1793, 'Skills', 'Languages: Core Java, Go Lang, Data Structures & Algorithms, Oracle\nPL-SQL programming, Sales Force with APEX.\nTools: RADTool, Jdeveloper, NetBeans, Eclipse, SQL developer,\nPL/SQL Developer, WinSCP, Putty\nWeb Technologies: JavaScript, XML, HTML, Webservice\n\nOperating Systems: Linux, Windows\nVersion control system SVN & Git-Hub\nDatabases: Oracle\nMiddleware: Web logic, OC4J\nProduct FLEXCUBE: Oracle FLEXCUBE Versions 10.x, 11.x and 12.x')
(1209, 1215, 'Companies worked at', 'Oracle')
(1136, 1247, 'Skills', 'APEX. (Less than 1 year), Data Structures (3 years), FLEXCUBE (5 years), Oracle (5 years),\nAlgorithms (3 years)\n')
(928, 932, 'Graduation Year', '2012')
(858, 889, 'College Name', 'Adithya Institute of Technology')
(821, 856, 'Degree', 'B.E in Computer Science Engineering')
(787, 791, 'Graduation Year', '2012')
(74

In [12]:
converted_data,valid_entities = trim_entity_spans(data)

In [13]:
def correct_annotations(data):
    corrected_data = []
    for text, annotations in data:
        entities = annotations['entities']
        entities = sorted(entities, key=lambda x: x[0])  # Sort entities by start position
        corrected_entities = []

        i = 0
        while i < len(entities):
            start1, end1, label1 = entities[i]
            if i + 1 < len(entities):
                start2, end2, label2 = entities[i + 1]
                if start2 < end1:  # Check for overlap
                    # Resolve overlap by adjusting the end position of the first entity
                    end1 = start2 - 1  # Adjust end position of the first entity
                    if end1 >= start1:  # Append the corrected entity if valid
                        corrected_entities.append((start1, end1, label1))
                else:
                    corrected_entities.append((start1, end1, label1))
            else:
                corrected_entities.append((start1, end1, label1))
            i += 1

        corrected_data.append((text, {"entities": corrected_entities}))

    return corrected_data


In [14]:
corrected_data = correct_annotations(converted_data)

### Overlapping Entities

In [15]:
def find_overlapping_entities(data):
    overlapping_entities = []
    for text, annotations in data:
        entities = annotations['entities']
        entities = sorted(entities, key=lambda x: x[0])  # Sort entities by start position
        for i in range(len(entities) - 1):
            start1, end1, label1 = entities[i]
            start2, end2, label2 = entities[i + 1]
            if start2 < end1:  # Check for overlap
                overlapping_entities.append(((start1, end1, label1), (start2, end2, label2)))
    return overlapping_entities

# Sample Data
data = corrected_data
overlapping_entities = find_overlapping_entities(data)

if overlapping_entities:
    print("Overlapping entities found:")
    for ent1, ent2 in overlapping_entities:
        print(f"Overlap between {ent1} and {ent2}")
else:
    print("No overlapping entities found.")


No overlapping entities found.


### Entity Mapping

In [16]:
!pip install spacy==2.1.4

Collecting spacy==2.1.4
  Downloading spacy-2.1.4-cp37-cp37m-manylinux1_x86_64.whl (29.8 MB)
[K     |████████████████████████████████| 29.8 MB 459 kB/s eta 0:00:011
Collecting plac<1.0.0,>=0.9.6
  Downloading plac-0.9.6-py2.py3-none-any.whl (20 kB)
Collecting blis<0.3.0,>=0.2.2
  Downloading blis-0.2.4-cp37-cp37m-manylinux1_x86_64.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 40.9 MB/s eta 0:00:01
[?25hCollecting preshed<2.1.0,>=2.0.1
  Downloading preshed-2.0.1-cp37-cp37m-manylinux1_x86_64.whl (82 kB)
[K     |████████████████████████████████| 82 kB 217 kB/s  eta 0:00:01
[?25hCollecting thinc<7.1.0,>=7.0.2
  Downloading thinc-7.0.8-cp37-cp37m-manylinux1_x86_64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 33.0 MB/s eta 0:00:01
[?25hCollecting jsonschema<3.1.0,>=2.6.0
  Downloading jsonschema-3.0.2-py2.py3-none-any.whl (54 kB)
[K     |████████████████████████████████| 54 kB 1.6 MB/s  eta 0:00:01
Installing collected packages: preshed, plac, blis, 

In [17]:
from spacy.lang.en import English  # Or whichever language you need
from spacy.gold import biluo_tags_from_offsets

def bilou_tags(data):
    
    docs  = []
    annots = []
    nlp = English()
    for text, annotations in data:
        offsets = annotations["entities"]
        doc = nlp(text)
        tags = biluo_tags_from_offsets(doc, offsets)
        for i in range(len(tags)):
            if tags[i].startswith("U"):
                tags[i] = "B" + tags[i][1:]
            elif tags[i].startswith("L"):
                tags[i] = "I" + tags[i][1:]
            if not (doc[i].text.isalnum() or len(doc[i].text) > 1):
                tags[i] = "O"
        docs.append([token.text for token in doc])
        annots.append(tags)
        
    df_data = pd.DataFrame({'docs': docs, 'annots': annots})

    return df_data

df_data = bilou_tags(data)
# [(k, v) for k, v in zip(df_data["docs"][0], df_data["annots"][0])]

### Removing Mislabeled Examples

In [18]:
for i in range(len(df_data)):
    if "-" in df_data.loc[i, "annots"]:
        df_data.drop(i, axis = "index", inplace = True)
df_data.reset_index(inplace = True)
len(df_data)

95

## Modeling

### Conditional Random Fields

#### Sentence Getter

In [19]:
from nltk import pos_tag
sentences = [[(w, p, t) for w, p, t in zip(df_data["docs"][i], [y for x, y in pos_tag(df_data["docs"][i])], df_data["annots"][i]) if w.isalnum() or len(w) > 1] for i in range(0, len(df_data))]

#### Feature Extraction

In [20]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2]
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2]
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2]
        })
    else:
        features['EOS'] = True
    return features
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
    return [label for token, postag, label in sent]
def sent2tokens(sent):
    return [token for token, postag, label in sent]

#### Train-Test Split

In [21]:
%%time
from sklearn.model_selection import train_test_split

X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

CPU times: user 244 ms, sys: 26.8 ms, total: 271 ms
Wall time: 271 ms


#### Training

In [22]:
!pip install python-crfsuite

Collecting python-crfsuite
  Downloading python_crfsuite-0.9.10-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 11.3 MB/s eta 0:00:01
[?25hInstalling collected packages: python-crfsuite
Successfully installed python-crfsuite-0.9.10


In [23]:
import pycrfsuite

In [24]:
%%time

trainer = pycrfsuite.Trainer(verbose = True)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 882 ms, sys: 22.3 ms, total: 904 ms
Wall time: 904 ms


In [25]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 100,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [26]:
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [27]:
%%time
trainer.train('resume-parsor_CRF.crfsuite')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 27131
Seconds required: 0.165

L-BFGS optimization
c1: 1.000000
c2: 0.001000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 50646.992468
Feature norm: 1.000000
Error norm: 42657.468756
Active features: 13958
Line search trials: 1
Line search step: 0.000018
Seconds required for this iteration: 0.368

***** Iteration #2 *****
Loss: 43993.861562
Feature norm: 4.793605
Error norm: 12199.661783
Active features: 13449
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.182

***** Iteration #3 *****
Loss: 36199.056839
Feature norm: 4.223523
Error norm: 12165.193464
Active features: 11236
Line search trials: 1
Line search step: 1.000000
Seconds required for

In [28]:
trainer.logparser.last_iteration

{'num': 100,
 'scores': {},
 'loss': 2643.969529,
 'feature_norm': 43.220932,
 'error_norm': 339.67034,
 'active_features': 2239,
 'linesearch_trials': 1,
 'linesearch_step': 1.0,
 'time': 0.17}

In [36]:
tagger = pycrfsuite.Tagger()
tagger.open('/kaggle/working/resume-parsor_CRF.crfsuite')

<contextlib.closing at 0x7e78e754ad10>

#### Evaluation

In [30]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from itertools import chain

def ner_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_)
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset
    ), accuracy_score(y_true_combined, y_pred_combined)

In [31]:
%%time
y_pred = [tagger.tag(xseq) for xseq in X_test]

CPU times: user 108 ms, sys: 5.86 ms, total: 114 ms
Wall time: 113 ms


In [32]:
report, accuracy = ner_report(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
print(report)

                       precision    recall  f1-score   support

       B-College Name       0.90      0.45      0.60        20
       I-College Name       0.86      0.59      0.70        61
B-Companies worked at       0.65      0.26      0.37        42
I-Companies worked at       0.43      0.12      0.19        25
             B-Degree       1.00      0.62      0.77        16
             I-Degree       0.95      0.95      0.95        37
        B-Designation       0.83      0.60      0.70        25
        I-Designation       0.96      0.57      0.72        40
      B-Email Address       0.89      0.89      0.89         9
      I-Email Address       1.00      0.78      0.88         9
    B-Graduation Year       1.00      0.33      0.50        12
           B-Location       0.33      0.36      0.35        11
           I-Location       0.00      0.00      0.00         1
               B-Name       1.00      0.83      0.91        12
               I-Name       0.91      0.83      0.87  

In [34]:
print(accuracy)

0.9205322491221586


In [41]:
# Example test sentence
input_sentence = """Sharan

AI / Machine Learning

Delhi, India Email me on Indeed
•
20+ years of experience in data handling, design, and development
•
Data Warehouse: Data analysis, star/snow flake scema data modelling and design specific to
data warehousing and business intelligence
•
Database: Experience in database designing, scalability, back-up and recovery, writing and
optimizing SQL code and Stored Procedures, creating functions, views, triggers and indexes.
Cloud platform: Worked on Microsoft Azure cloud services like Document DB, SQL Azure,
Stream Analytics, Event hub, Power BI, Web Job, Web App, Power BI, Azure data lake
analytics(U-SQL)
Willing to relocate anywhere

WORK EXPERIENCE
Software Engineer
Microsoft – Bangalore, Karnataka
January 2000 to Present
1. Microsoft Rewards Live dashboards:
Description: - Microsoft rewards is loyalty program that rewards Users for browsing and shopping
online. Microsoft Rewards members can earn points when searching with Bing, browsing with
Microsoft Edge and making purchases at the Xbox Store, the Windows Store and the Microsoft
Store. Plus, user can pick up bonus points for taking daily quizzes and tours on the Microsoft
rewards website. Rewards live dashboards gives a live picture of usage world-wide and by
markets like US, Canada, Australia, new user registration count, top/bottom performing rewards
offers, orders stats and weekly trends of user activities, orders and new user registrations. the
PBI tiles gets refreshed in different frequencies starting from 5 seconds to 30 minutes.
Technology/Tools used

EDUCATION
Indian Institute of Technology – Mumbai
2001

SKILLS
Machine Learning, Natural Language Processing, and Big Data Handling

ADDITIONAL INFORMATION
Professional Skills
• Excellent analytical, problem solving, communication, knowledge transfer and interpersonal
skills with ability to interact with individuals at all the levels
• Quick learner and maintains cordial relationship with project manager and team members and
good performer both in team and independent job environments
• Positive attitude towards superiors &amp; peers
• Supervised junior developers throughout project lifecycle and provided technical assistance """


In [44]:
import nltk
import pycrfsuite

# Ensure you have the necessary NLTK data downloaded (if not already)
# nltk.download('punkt')

# Define your word2features function and other helper functions here as per your previous setup

# Example input sentence
input_tokens = nltk.word_tokenize(input_sentence)
input_pos_tags = nltk.pos_tag(input_tokens)

# Prepare the input sentence in the required format for feature extraction
input_sent = list(zip(input_tokens, [tag for token, tag in input_pos_tags]))

# Function to extract features for a token using word2features
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

# Extract features for the input sentence
input_features = sent2features(input_sent)

# Load the CRFsuite model
tagger = pycrfsuite.Tagger()
tagger.open('/kaggle/working/resume-parsor_CRF.crfsuite')

# Predict labels for the input sentence
predicted_tags = tagger.tag(input_features)

# Print the input sentence tokens and predicted tags
for token, tag in zip(input_tokens, predicted_tags):
    if tag != 'O':
        print(f"{token}\t{tag}")

Sharan	B-Name
AI	I-Name
•	B-Email Address
20+	B-Years of Experience
years	I-Years of Experience
Software	B-Designation
Engineer	I-Designation
Microsoft	B-Companies worked at
Microsoft	B-Companies worked at
Microsoft	B-Companies worked at
Microsoft	B-Companies worked at
Microsoft	B-Companies worked at
Indian	B-College Name
Institute	I-College Name
of	I-College Name
Technology	I-College Name
