# Data preparation

In [2]:
import json
# Load the resume texts: "resumes.json" holds one JSON object per line and
# only the 'content' field of each object is kept.
documents = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open("resumes.json", encoding="utf-8") as fp:
    for line in fp:
        # A blank line (e.g. a trailing newline at the end of the file) is not
        # valid JSON and would make json.loads raise; skip it.
        if not line.strip():
            continue
        doc_json = json.loads(line)
        documents.append(doc_json['content'])

# Microsoft Azure

In [8]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient

# The key is read from the environment so it never ends up in the notebook.
api_key = os.getenv('AZURE_API_KEY')
if not api_key:
    # Stop here with an actionable message instead of passing None (or an
    # empty key) on to the credential / client and failing later.
    raise RuntimeError(
        "The AZURE_API_KEY environment variable is not set; "
        "export it before starting the notebook kernel."
    )
credential = AzureKeyCredential(api_key)
endpoint = "https://astrok2-ner-test.cognitiveservices.azure.com/"

# Client used by the Azure entity-recognition cell below.
text_analytics_client = TextAnalyticsClient(endpoint, credential)

In [73]:
import math
# Largest piece of text sent to Azure in one request; longer documents are
# split into several requests and their entities concatenated.
AZURE_MAX_CHARS = 5120


def _azure_chunks(text, max_chars):
    """Yield consecutive pieces of ``text``, each at most ``max_chars`` long.

    While more than ``max_chars`` characters remain, the text is cut at the
    last space inside the current window so that words are not split; the
    space itself starts the next piece. If the window contains no usable
    space (none at all, or only the one it starts with), the piece is cut at
    exactly ``max_chars`` characters so that every iteration makes progress.
    A text that already fits is yielded unchanged as a single piece.
    """
    start = 0
    while start + max_chars < len(text):
        # Search from start + 1: a space at the very start of the window
        # would produce an empty piece and the loop would never advance.
        cut = text.rfind(' ', start + 1, start + max_chars)
        if cut == -1:
            cut = start + max_chars
        yield text[start:cut]
        start = cut
    yield text[start:]


# Run Azure entity recognition on every document and store the entities of
# document i in results/azure/<i>.json.
for i, doc in enumerate(documents):
    r_json = {'entities': []}
    for chunk in _azure_chunks(doc, AZURE_MAX_CHARS):
        response = text_analytics_client.recognize_entities([chunk], language="en")
        # One document is sent per request, so its result is response[0].
        r_json['entities'].extend(
            {'text': e.text, 'category': e.category, 'confidence_score': e.confidence_score}
            for e in response[0].entities
        )
    # Open the output file only once all requests succeeded, so a failed
    # request does not leave an empty or truncated JSON file behind.
    with open('results/azure/' + str(i) + ".json", "w") as f:
        json.dump(r_json, f)

# Amazon Comprehend

In [5]:
import boto3

# Amazon Comprehend client. No credentials or region are passed here, so
# boto3's default configuration is used.
comprehend = boto3.client('comprehend')

Calling DetectEntities
{
    "Entities": [
        {
            "BeginOffset": 14,
            "EndOffset": 19,
            "Score": 0.9925345182418823,
            "Text": "today",
            "Type": "DATE"
        },
        {
            "BeginOffset": 23,
            "EndOffset": 30,
            "Score": 0.9988563060760498,
            "Text": "Seattle",
            "Type": "LOCATION"
        }
    ],
    "ResponseMetadata": {
        "HTTPHeaders": {
            "content-length": "200",
            "content-type": "application/x-amz-json-1.1",
            "date": "Sun, 18 Oct 2020 00:32:06 GMT",
            "x-amzn-requestid": "52d7335f-1882-4ab5-8d4e-e5db60e3bbfd"
        },
        "HTTPStatusCode": 200,
        "RequestId": "52d7335f-1882-4ab5-8d4e-e5db60e3bbfd",
        "RetryAttempts": 0
    }
}
End of DetectEntities



In [99]:
# Documents whose UTF-8 encoding is larger than COMPREHEND_MAX_BYTES are sent
# in pieces of at most COMPREHEND_CHUNK_CHARS characters.
COMPREHEND_MAX_BYTES = 5000
COMPREHEND_CHUNK_CHARS = 4850


def _comprehend_chunks(text, max_chars, max_bytes):
    """Yield consecutive pieces of ``text`` that respect both size limits.

    Every piece is at most ``max_chars`` characters long AND at most
    ``max_bytes`` bytes when encoded as UTF-8. Pieces are cut at the last
    space inside the current window so that words are not split (the space
    itself starts the next piece); when the window has no usable space it is
    cut at the window end, so every iteration makes progress.
    """
    start = 0
    while True:
        window = text[start:start + max_chars]
        encoded = window.encode('utf-8')
        if len(encoded) <= max_bytes:
            if start + len(window) >= len(text):
                # The rest of the text fits in one request: last piece.
                yield window
                return
        else:
            # Too many bytes for this many characters (multi-byte text):
            # truncate on the byte level; errors='ignore' drops a character
            # that was cut in half at the end.
            window = encoded[:max_bytes].decode('utf-8', errors='ignore')
        # Search from index 1: a space at the very start of the window would
        # produce an empty piece and the loop would never advance.
        cut = window.rfind(' ', 1)
        if cut == -1:
            cut = len(window)
        yield window[:cut]
        start += cut


# Run Comprehend entity detection on every document and store the result of
# document i in results/amazon/<i>.json.
for i, doc in enumerate(documents):
    if len(doc.encode('utf-8')) > COMPREHEND_MAX_BYTES:
        # NOTE(review): each piece is sent on its own, so the BeginOffset /
        # EndOffset values stored for a split document are relative to the
        # piece, not to the whole document - confirm before using offsets.
        r_json = {'Entities': []}
        for chunk in _comprehend_chunks(doc, COMPREHEND_CHUNK_CHARS, COMPREHEND_MAX_BYTES):
            r = comprehend.detect_entities(Text=chunk, LanguageCode='en')
            r_json['Entities'].extend(r['Entities'])
    else:
        # Small documents: the complete service response is stored as is.
        r_json = comprehend.detect_entities(Text=doc, LanguageCode='en')
    # Open the output file only once all requests succeeded, so a failed
    # request does not leave an empty or truncated JSON file behind.
    with open('results/amazon/' + str(i) + ".json", "w") as f:
        json.dump(r_json, f)

# Google Cloud Natural Language

In [101]:
from google.cloud import language
from google.oauth2 import service_account
from google.cloud.language import enums
from google.cloud.language import types

# Build the Natural Language API client from the service-account key file
# 'services.json'; the relative path is resolved against the working directory.
client = language.LanguageServiceClient.from_service_account_json('services.json')

# Define functions
def pull_googlenlp(client, text, invalid_types=('OTHER',), **data):
    """Extract entities from ``text`` with the Google Natural Language API.

    Args:
        client: object providing ``annotate_text(document=..., features=...)``
            and ``enums.Entity.Type`` (the language service client).
        text: plain-text content to analyse.
        invalid_types: collection of entity type names that are left out of
            the result.
        **data: accepted for call-site compatibility; not used.

    Returns:
        A dict ``{'entities': [...]}`` where each item has the keys
        ``'name'``, ``'type'`` (the enum member name) and ``'salience'``.
    """
    document = types.Document(content=text, type=language.enums.Document.Type.PLAIN_TEXT)
    # Only entity extraction is requested; every other analysis is disabled.
    features = {'extract_syntax': False,
                'extract_entities': True,
                'extract_document_sentiment': False,
                'extract_entity_sentiment': False,
                'classify_text': False
               }
    response = client.annotate_text(document=document, features=features)

    result = {'entities': []}
    for entity in response.entities:
        # Translate the numeric type of this entity into its enum name once,
        # and use that name both for filtering and for the output.
        type_name = client.enums.Entity.Type(entity.type).name
        if type_name in invalid_types:
            continue
        result['entities'].append({'name': entity.name, 'type': type_name, 'salience': entity.salience})
    return result

In [104]:
# Store the Google entities of document i in results/google/<i>.json.
for i, doc in enumerate(documents):
    out_path = 'results/google/{}.json'.format(i)
    with open(out_path, "w") as f:
        entities_json = pull_googlenlp(client, doc)
        json.dump(entities_json, f)

# Stanford NLP Tagger via NLTK

In [1]:
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# Stanford NER tagger: the 3-class English model, run through the
# stanford-ner 4.0.0 jar (both paths are relative to the working directory).
st = StanfordNERTagger(
    'classifiers/english.all.3class.distsim.crf.ser.gz',
    path_to_jar='stanford-ner-4.0.0.jar',
    encoding='utf-8',
)

In [47]:
import re

def bio_chunking(classified_text):
    """Merge runs of equally tagged tokens into multi-word entities.

    ``classified_text`` is an iterable of ``(token, tag)`` pairs. Tokens made
    up solely of non-word characters are skipped and do not interrupt a run.
    Consecutive tokens with the same tag other than "O" are joined with a
    single space; an "O" token or a different tag closes the current run.

    Returns a list of ``(text, tag)`` tuples in order of appearance.
    """
    merged = []
    words = []     # tokens of the entity currently being collected
    label = "O"    # tag of that entity ("O" while nothing is being collected)
    for token, tag in classified_text:
        if re.fullmatch(r'[\W]+', token) is not None:
            continue
        if tag != "O" and tag == label:
            # Same entity continues.
            words.append(token)
            continue
        # The tag changed: close the running entity, if there is one ...
        if words:
            merged.append((' '.join(words), label))
        # ... and start a new one unless this token is outside any entity.
        words = [] if tag == "O" else [token]
        label = tag
    if words:
        merged.append((' '.join(words), label))
    return merged

In [48]:
# Tag every document with the Stanford tagger, merge the token tags into
# entities and store them in results/stanford/<i>.json.
for i, doc in enumerate(documents):
    out_path = 'results/stanford/{}.json'.format(i)
    with open(out_path, "w") as f:
        tokenized_text = word_tokenize(doc)
        # classified_text ends up holding the merged (text, tag) entities.
        classified_text = bio_chunking(st.tag(tokenized_text))
        r_json = {
            'entities': [
                {'text': text, 'category': category}
                for text, category in classified_text
            ]
        }
        json.dump(r_json, f)

In [40]:
classified_text

[('Abhishek Jha', 'PERSON'),
 ('Karnataka', 'LOCATION'),
 ('Bangalore Karnataka', 'LOCATION'),
 ('Hubli Karnataka', 'LOCATION'),
 ('Mac Non', 'ORGANIZATION')]

# SpaCy

In [11]:
# The model has to be installed once beforehand, from a shell:
#   python -m spacy download en_core_web_lg
import spacy
nlp = spacy.load("en_core_web_lg")

In [45]:
# Run the spaCy pipeline on every document and store its entities, with
# character offsets, in results/spacy/<i>.json.
for i, doc in enumerate(documents):
    out_path = 'results/spacy/{}.json'.format(i)
    with open(out_path, "w") as f:
        classified_doc = nlp(doc)
        r_json = {
            'entities': [
                {
                    'text': e.text,
                    'category': e.label_,
                    'start': e.start_char,
                    'end': e.end_char,
                }
                for e in classified_doc.ents
            ]
        }
        json.dump(r_json, f)