In [1]:
import argparse
import time
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader 
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import pandas as pd
import json



In [2]:
import json
import re

# JSON formatting functions
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks JSON-lines export into spaCy-style training tuples.

    Each non-blank line of the file is parsed as one JSON record with a
    ``content`` string and an ``annotation`` list (or ``null``).

    Args:
        dataturks_JSON_FilePath: Path to the UTF-8 encoded JSON-lines file.

    Returns:
        list: One ``(text, {"entities": [(start, end, label), ...]})`` tuple per
        record. ``text`` is the record content with newlines replaced by
        spaces; ``end`` is exclusive (the stored inclusive end plus one).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``content`` or ``annotation``.
    """
    training_data = []
    with open(dataturks_JSON_FilePath, 'r', encoding="utf-8") as f:
        for line in f:
            # Blank lines (typically a trailing newline at the end of the
            # export) carry no record and would make json.loads raise.
            if not line.strip():
                continue

            data = json.loads(line)
            text = data['content'].replace("\n", " ")
            entities = []
            data_annotations = data['annotation']
            if data_annotations is not None:
                for annotation in data_annotations:
                    # Only a single point per text annotation is used.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # Handle both a list of labels and a single label.
                    if not isinstance(labels, list):
                        labels = [labels]

                    # Shrink the span by the leading/trailing whitespace of the
                    # annotated text. This is the same for every label, so it
                    # is computed once per annotation.
                    point_text = point['text']
                    lstrip_diff = len(point_text) - len(point_text.lstrip())
                    rstrip_diff = len(point_text) - len(point_text.rstrip())
                    point_start = point['start'] + lstrip_diff
                    point_end = point['end'] - rstrip_diff

                    for label in labels:
                        # +1 turns the inclusive end offset into an exclusive one.
                        entities.append((point_start, point_end + 1, label))
            training_data.append((text, {"entities": entities}))
    return training_data

def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    The two trimming passes are bounded by each other, so a span that
    consists only of whitespace collapses to an empty span
    (``start == end``) instead of an inverted one (``start > end``). An end
    offset that lies beyond the text is clamped to ``len(text)``.

    Args:
        data (list): The data to be cleaned in spaCy JSON format, i.e. an
            iterable of ``(text, {'entities': [(start, end, label), ...]})``.

    Returns:
        list: The cleaned data as ``[text, {'entities': [[start, end, label],
        ...]}]`` items, one per input item and one entity per input entity.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            valid_start = start
            # Clamp so an end offset past the text cannot index out of range.
            valid_end = min(end, len(text))
            # Move the start forward over whitespace, never past the end.
            while valid_start < valid_end and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            # Move the end backward over whitespace, never before the start.
            while valid_end > valid_start and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data

In [3]:
# Parse the Dataturks export into (text, {"entities": ...}) records and strip
# surrounding whitespace from every entity span.
# NOTE(review): a later cell reads "data/traindata.json" — confirm which path is correct.
data = trim_entity_spans(convert_dataturks_to_spacy("traindata.json"))
# Cell output: the first record, as a quick sanity check.
data[0]

["Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

In [4]:
# Load the raw JSON-lines export into a DataFrame and flatten newlines in the
# resume text to spaces. The vectorised string accessor replaces a per-row
# Python lambda; regex=False makes "\n" a literal match.
df_data = (
    pd.read_json("data/traindata.json", lines=True)
    .assign(content=lambda frame: frame["content"].str.replace("\n", " ", regex=False))
)
# Cell output: the loaded frame (pandas truncates the display).
df_data

Unnamed: 0,content,annotation,extras
0,Abhishek Jha Application Development Associate...,"[{'label': ['Skills'], 'points': [{'start': 12...",
1,Afreen Jamadar Active member of IIIT Committee...,"[{'label': ['Email Address'], 'points': [{'sta...",
2,"Akhil Yadav Polemaina Hyderabad, Telangana - E...","[{'label': ['Skills'], 'points': [{'start': 37...",
3,Alok Khandai Operational Analyst (SQL DBA) Eng...,"[{'label': ['Skills'], 'points': [{'start': 80...",
4,Ananya Chavan lecturer - oracle tutorials Mum...,"[{'label': ['Degree'], 'points': [{'start': 20...",
...,...,...,...
215,"Mansi Thanki Student Jamnagar, Gujarat - Emai...","[{'label': ['College Name'], 'points': [{'star...",
216,Anil Kumar Microsoft Azure (Basic Management) ...,"[{'label': ['Location'], 'points': [{'start': ...",
217,Siddharth Choudhary Microsoft Office Suite - E...,"[{'label': ['Skills'], 'points': [{'start': 78...",
218,Valarmathi Dhandapani Investment Banking Opera...,"[{'label': ['Skills'], 'points': [{'start': 92...",


In [5]:
from nltk.corpus import stopwords
en_stops = set(stopwords.words('english'))


def _label_words(text, entities, stop_words):
    """Split ``text`` on whitespace and pair every kept word with a label.

    A word is kept when it is alphanumeric or contains ".com", and is not in
    ``stop_words`` (the comparison is case-sensitive). A kept word receives
    the label of the first entity, in sorted ``(start, end, label)`` order,
    whose span text contains that word as a whitespace-separated token;
    otherwise it receives "O".

    NOTE(review): matching is by word identity, not by position, so every
    occurrence of a word anywhere in the document gets the entity label —
    confirm this is intended.

    Args:
        text: The document text.
        entities: Iterable of ``(start, end, label)`` spans into ``text``.
        stop_words: Collection of words to drop.

    Returns:
        tuple: ``(words, labels)`` — two lists of equal length.
    """
    # Build the word -> label lookup once per document instead of re-sorting
    # the entities and re-splitting every span for every word.
    # setdefault keeps the label of the first matching entity in sorted order.
    word_to_label = {}
    for ent_start, ent_end, ent_label in sorted(entities):
        for token in text[ent_start:ent_end].split():
            word_to_label.setdefault(token, ent_label)

    words = []
    labels = []
    for word in text.split():
        if (word.isalnum() or ".com" in word) and word not in stop_words:
            words.append(word)
            labels.append(word_to_label.get(word, "O"))
    return words, labels


# One list of kept words and one parallel list of labels per document.
entities_mapped = []
clean_content = []
for text, annotations in data:
    doc_words, doc_labels = _label_words(text, annotations['entities'], en_stops)
    clean_content.append(doc_words)
    entities_mapped.append(doc_labels)

# clean_content is stored as a single space-joined string per document;
# entities_mapped keeps the list of labels.
df_data = pd.DataFrame({
    "clean_content": [" ".join(doc_words) for doc_words in clean_content],
    "entities_mapped": entities_mapped,
})
df_data

Unnamed: 0,clean_content,entities_mapped
0,Abhishek Jha Application Development Associate...,"[Name, Name, Designation, Designation, Designa..."
1,Afreen Jamadar Active member IIIT Committee Th...,"[Name, Name, O, O, O, O, O, O, O, O, Email Add..."
2,Akhil Yadav Polemaina Telangana Email indeed.c...,"[Name, Name, Name, O, O, Email Address, Email ..."
3,Alok Khandai Operational Analyst Engineer UNIS...,"[Name, Name, Designation, Designation, Designa..."
4,Ananya Chavan lecturer oracle tutorials Mahara...,"[Name, Name, Designation, Companies worked at,..."
...,...,...
215,Mansi Thanki Student Gujarat Email indeed.com/...,"[Name, Name, Designation, O, O, O, O, O, O, O,..."
216,Anil Kumar Microsoft Azure Delhi Email indeed....,"[Name, Name, Designation, Designation, Locatio..."
217,Siddharth Choudhary Microsoft Office Suite Exp...,"[Name, Name, Designation, Designation, Designa..."
218,Valarmathi Dhandapani Investment Banking Karna...,"[Name, Name, Designation, Designation, O, O, E..."


In [6]:
# NOTE(review): presumably the maximum token sequence length per document — TODO confirm.
MAX_LEN = 300
# NOTE(review): presumably the training batch size — TODO confirm.
bs = 16

In [7]:
# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA (get_device_name(0) raises
# when no GPU is present).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Cell output: name of the first GPU, or "cpu" when running without one.
torch.cuda.get_device_name(0) if device.type == "cuda" else "cpu"

'GeForce GTX 1060 6GB'

In [8]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER", do_lower_case=True)

# load untrained model
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

In [9]:
# Encode every cleaned document to a fixed length of MAX_LEN tokens.
# padding="max_length" on its own pads to the model maximum and never
# truncates, so long resumes would exceed what the model accepts;
# truncation=True with max_length=MAX_LEN bounds every encoding.
tokenized_texts = [
    tokenizer(sent, padding="max_length", truncation=True, max_length=MAX_LEN)
    for sent in df_data["clean_content"]
]

In [10]:
# Cell output: the tokenizer encoding of the first document, for inspection.
tokenized_texts[0]

{'input_ids': [101, 170, 1830, 27516, 4638, 1377, 179, 2328, 4048, 1718, 6500, 9603, 3313, 24181, 11782, 1777, 1968, 10632, 5750, 119, 3254, 120, 187, 120, 170, 1830, 27516, 4638, 1377, 118, 179, 2328, 120, 1275, 1162, 1559, 1161, 1604, 1665, 1830, 1559, 17101, 1830, 1665, 25631, 1161, 1106, 1250, 2369, 2790, 3767, 4607, 4196, 3044, 2510, 3213, 1436, 1936, 4988, 22036, 24181, 11782, 1777, 1968, 1250, 2541, 4048, 1718, 6500, 9603, 3313, 1185, 2707, 10615, 1504, 1675, 1971, 1684, 4297, 1171, 6696, 1137, 25001, 7983, 18874, 15027, 3377, 171, 3329, 13746, 1359, 1549, 2013, 171, 3329, 1472, 1936, 15462, 3923, 1116, 3112, 1549, 7758, 1972, 1869, 2598, 3752, 2134, 3752, 2815, 24181, 11782, 1777, 1968, 12686, 12909, 1204, 1381, 179, 10038, 1504, 5247, 6686, 3591, 16405, 2030, 1278, 170, 1643, 13217, 1349, 8943, 1381, 5368, 180, 6696, 16383, 191, 25382, 27399, 170, 1643, 13217, 1630, 8943, 1349, 4196, 172, 122, 8539, 122, 8539, 2635, 122, 8539, 2635, 1449, 122, 179, 15677, 122, 2509, 1869, 4301

In [11]:
# All tag names the labelling step can produce. "O" (a word outside every
# entity) is emitted for unmatched words and must therefore have an index;
# it is appended last so the indices of the existing tags stay unchanged.
tags_vals = ["UNKNOWN", "Name", "Degree","Skills","College Name","Email Address","Designation","Companies worked at","Empty","Graduation Year","Years of Experience","Location","O"]
# Tag name -> integer id, in the order listed above.
tag2idx = {t: i for i, t in enumerate(tags_vals)}

In [12]:
# Per-document label lists: one list of tag names for each row of df_data.
labels = df_data['entities_mapped'].tolist()

In [13]:
# Print the distinct tag names that occur in the document at index 61.
print(set(labels[61]))

{'Designation', 'Graduation Year', 'Companies worked at', 'Degree', 'Skills', 'O', 'UNKNOWN', 'College Name', 'Name', 'Email Address', 'Years of Experience'}


In [14]:
# Cell output: display df_data again (pandas truncates the display).
df_data

Unnamed: 0,clean_content,entities_mapped
0,Abhishek Jha Application Development Associate...,"[Name, Name, Designation, Designation, Designa..."
1,Afreen Jamadar Active member IIIT Committee Th...,"[Name, Name, O, O, O, O, O, O, O, O, Email Add..."
2,Akhil Yadav Polemaina Telangana Email indeed.c...,"[Name, Name, Name, O, O, Email Address, Email ..."
3,Alok Khandai Operational Analyst Engineer UNIS...,"[Name, Name, Designation, Designation, Designa..."
4,Ananya Chavan lecturer oracle tutorials Mahara...,"[Name, Name, Designation, Companies worked at,..."
...,...,...
215,Mansi Thanki Student Gujarat Email indeed.com/...,"[Name, Name, Designation, O, O, O, O, O, O, O,..."
216,Anil Kumar Microsoft Azure Delhi Email indeed....,"[Name, Name, Designation, Designation, Locatio..."
217,Siddharth Choudhary Microsoft Office Suite Exp...,"[Name, Name, Designation, Designation, Designa..."
218,Valarmathi Dhandapani Investment Banking Karna...,"[Name, Name, Designation, Designation, O, O, E..."


In [None]:
# TODO: convert the tokenized inputs and their labels into PyTorch tensors
# TODO: feed the inputs to the BERT model and train it
# TODO: evaluate the trained model on the test dataset