In [1]:
import argparse
import time
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader 
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
import json

import functools as ftools
import tqdm

In [2]:
import json
import re

# JSON formatting functions
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks JSON-lines export into spaCy-style training tuples.

    Every non-blank line of the file is parsed as a JSON object with a
    ``content`` string and an ``annotation`` list (or ``null``). Newlines in
    ``content`` are replaced by single spaces, a one-for-one substitution that
    leaves the character offsets of the annotations untouched.

    Args:
        dataturks_JSON_FilePath: Path to the UTF-8 encoded export file.

    Returns:
        list: One ``(text, {"entities": [(start, end, label), ...]})`` tuple
        per record. ``start``/``end`` are the annotated offsets after removing
        leading/trailing whitespace of the annotated text; ``end`` is
        exclusive (the stored inclusive end plus one).
    """
    training_data = []
    with open(dataturks_JSON_FilePath, 'r', encoding="utf-8") as f:
        # Stream the file line by line instead of loading every line first.
        for line in f:
            # A blank line (typically at the end of the file) is not a record
            # and would make json.loads raise.
            if not line.strip():
                continue

            data = json.loads(line)
            text = data['content'].replace("\n", " ")
            entities = []
            # `annotation` is null for records that were never labelled.
            for annotation in data['annotation'] or []:
                points = annotation['points']
                # Without a point there is no span to record.
                if not points:
                    continue
                # only a single point in text annotation.
                point = points[0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                # The offsets depend only on the point, so compute them once
                # and share them between all labels of the annotation.
                point_text = point['text']
                lstrip_diff = len(point_text) - len(point_text.lstrip())
                rstrip_diff = len(point_text) - len(point_text.rstrip())
                span_start = point['start'] + lstrip_diff
                span_end = point['end'] - rstrip_diff + 1
                entities.extend((span_start, span_end, label) for label in labels)
            training_data.append((text, {"entities": entities}))
    return training_data

def trim_entity_spans(data: list) -> list:
    """Shrink every entity span so it neither starts nor ends on whitespace.

    Args:
        data (list): Records in spaCy JSON format, i.e. pairs of
            ``(text, {"entities": [(start, end, label), ...]})``.

    Returns:
        list: ``[text, {"entities": [[start, end, label], ...]}]`` records
        carrying the adjusted offsets.
    """
    whitespace = re.compile(r'\s')

    def _shrink(text, start, end):
        # Move the start forward past leading whitespace ...
        while start < len(text) and whitespace.match(text[start]):
            start += 1
        # ... and pull the exclusive end back over trailing whitespace.
        while end > 1 and whitespace.match(text[end - 1]):
            end -= 1
        return [start, end]

    return [
        [
            text,
            {'entities': [_shrink(text, start, end) + [label]
                          for start, end, label in annotations['entities']]},
        ]
        for text, annotations in data
    ]

In [3]:
# Parse the Dataturks export into [text, {"entities": [...]}] records and trim
# whitespace off the entity span boundaries.
data = trim_entity_spans(convert_dataturks_to_spacy("data/traindata.json"))
# Display the first record as a sanity check.
data[0]

["Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

In [4]:
# Load the same JSON-lines file as a DataFrame and replace newlines in the
# text with spaces, the same substitution convert_dataturks_to_spacy applies.
# NOTE(review): `df_data` is rebuilt from `data` in the next cell, so this
# frame appears to serve only as a preview of the raw records.
df_data = pd.read_json("data/traindata.json", lines = True)
df_data["content"] = df_data["content"].apply(lambda x: x.replace("\n", " "))
df_data

Unnamed: 0,content,annotation,extras
0,Abhishek Jha Application Development Associate...,"[{'label': ['Skills'], 'points': [{'start': 12...",
1,Afreen Jamadar Active member of IIIT Committee...,"[{'label': ['Email Address'], 'points': [{'sta...",
2,"Akhil Yadav Polemaina Hyderabad, Telangana - E...","[{'label': ['Skills'], 'points': [{'start': 37...",
3,Alok Khandai Operational Analyst (SQL DBA) Eng...,"[{'label': ['Skills'], 'points': [{'start': 80...",
4,Ananya Chavan lecturer - oracle tutorials Mum...,"[{'label': ['Degree'], 'points': [{'start': 20...",
...,...,...,...
215,"Mansi Thanki Student Jamnagar, Gujarat - Emai...","[{'label': ['College Name'], 'points': [{'star...",
216,Anil Kumar Microsoft Azure (Basic Management) ...,"[{'label': ['Location'], 'points': [{'start': ...",
217,Siddharth Choudhary Microsoft Office Suite - E...,"[{'label': ['Skills'], 'points': [{'start': 78...",
218,Valarmathi Dhandapani Investment Banking Opera...,"[{'label': ['Skills'], 'points': [{'start': 92...",


In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
en_stops = set(stopwords.words('english'))

# Build one row per resume: the words that are kept and, aligned with them,
# one entity label per word ("O" when the word occurs in no annotated span).
entities_mapped = []
clean_content = []
for text, annotations in data:
    # Neither the sorted entity order nor the words inside each span depend on
    # the word being labelled, so compute them once per document instead of
    # once per word.
    span_vocabularies = [
        (set(text[ent_start:ent_end].split()), ent_label)
        for ent_start, ent_end, ent_label in sorted(annotations['entities'])
    ]
    words = []
    labels = []

    for word in text.split():
        # Keep alphanumeric words and anything containing ".com" (the e-mail /
        # profile links); drop English stop words.
        if not ((word.isalnum() or ".com" in word) and word not in en_stops):
            continue
        words.append(word)
        # The word takes the label of the first span (in sorted order) whose
        # text contains it as a whitespace-separated word.
        labels.append(next(
            (ent_label for vocabulary, ent_label in span_vocabularies if word in vocabulary),
            "O",
        ))

    entities_mapped.append(labels)
    clean_content.append(words)

# `clean_content` is stored as one space-joined string per resume, so that
# splitting it again yields exactly one word per entry of `entities_mapped`.
df_data = pd.DataFrame({
    "clean_content": [" ".join(words) for words in clean_content],
    "entities_mapped": entities_mapped,
})
df_data

[nltk_data] Downloading package stopwords to /home/txetx/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,clean_content,entities_mapped
0,Abhishek Jha Application Development Associate...,"[Name, Name, Designation, Designation, Designa..."
1,Afreen Jamadar Active member IIIT Committee Th...,"[Name, Name, O, O, O, O, O, O, O, O, Email Add..."
2,Akhil Yadav Polemaina Telangana Email indeed.c...,"[Name, Name, Name, O, O, Email Address, Email ..."
3,Alok Khandai Operational Analyst Engineer UNIS...,"[Name, Name, Designation, Designation, Designa..."
4,Ananya Chavan lecturer oracle tutorials Mahara...,"[Name, Name, Designation, Companies worked at,..."
...,...,...
215,Mansi Thanki Student Gujarat Email indeed.com/...,"[Name, Name, Designation, O, O, O, O, O, O, O,..."
216,Anil Kumar Microsoft Azure Delhi Email indeed....,"[Name, Name, Designation, Designation, Locatio..."
217,Siddharth Choudhary Microsoft Office Suite Exp...,"[Name, Name, Designation, Designation, Designa..."
218,Valarmathi Dhandapani Investment Banking Karna...,"[Name, Name, Designation, Designation, O, O, E..."


In [6]:
# Check that words are aligned with labels: every row must carry exactly one
# label per whitespace-separated word of its cleaned text.
assert all(
    len(word_labels) == len(content.split())
    for word_labels, content in zip(df_data['entities_mapped'], df_data['clean_content'])
), "every word in clean_content must have exactly one label in entities_mapped"

In [7]:
# Sliding-window settings; vectorize_df forwards the first three to
# spanning_window as w_size, w_min and stride respectively.
MAX_LEN = 128  # w_size: length of each input window, special tokens included
MIN_LEN = 32  # w_min: an incomplete trailing window shorter than this is dropped
STRIDE = 32  # stride: offset between the starts of consecutive windows
bs = 8  # default batch size of ResumeNERLM.fit

## Tokenize

In [8]:
# Prefer the GPU but fall back to the CPU, so the notebook also runs on a
# machine without CUDA instead of failing on the device-name lookup.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Display the device in use: the GPU model name, or "cpu".
torch.cuda.get_device_name(0) if device.type == "cuda" else "cpu"

'NVIDIA GeForce GTX 1080 Ti'

In [9]:
# Load the tokenizer that belongs to the "dslim/bert-base-NER" checkpoint.
# NOTE(review): do_lower_case=True presumably lower-cases the text before
# tokenization; confirm this matches the casing the checkpoint's vocabulary
# was built with.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER", do_lower_case=True)

In [18]:
def align_labels(text,labels):
    """Repeat each word-level label once per sub-token of its word.

    Args:
        text: Whitespace-separated words.
        labels: One label per word of ``text``.

    Returns:
        list: The labels expanded to sub-token granularity, in word order.
    """
    expanded = []
    for word, tag in zip(text.split(), labels):
        # The encoding of a lone word carries two extra ids (presumably the
        # [CLS]/[SEP] pair), which do not belong to the word itself.
        piece_count = len(tokenizer(word)["input_ids"]) - 2
        expanded.extend([tag] * piece_count)
    return expanded

def spanning_window(input_ids,
                    attention_mask,
                    token_type_ids,
                    labels,
                    w_size,
                    stride,
                    w_min,
                    cls_id=101,
                    sep_id=102,
                    pad_id=0,
                    pad_label="O"):
    """Cut tokenized documents into overlapping, fixed-length windows.

    Every window starts with ``cls_id``, holds up to ``w_size - 2`` document
    tokens and is closed with ``sep_id``; windows that reach the end of the
    document are padded with ``pad_id`` (attention mask 0). The label list of
    a window has no entry for the leading start token, so it is one element
    shorter than the inputs: ``w_size - 1`` labels, the closing/padding
    positions being labelled ``pad_label``.

    Assumes each document's ``input_ids`` is a list wrapped in one start and
    one end token, i.e. ``len(input_ids[k]) == len(labels[k]) + 2``.

    Args:
        input_ids: Per-document lists of token ids, special tokens included.
        attention_mask: Per-document attention masks, aligned with ``input_ids``.
        token_type_ids: Per-document segment ids, aligned with ``input_ids``.
        labels: Per-document label lists, one label per non-special token.
        w_size: Length of every produced input window.
        stride: Distance between the starts of consecutive windows.
        w_min: An incomplete trailing window is dropped when it holds fewer
            than this many ids (start and end token included).
        cls_id: Id written at the first position of every window.
        sep_id: Id closing a full window.
        pad_id: Id used for padding.
        pad_label: Label assigned to the closing and padding positions.

    Returns:
        tuple: ``(input_ids, attention_mask, token_type_ids, labels)``, each a
        list with one entry per window.
    """
    body_len = w_size - 2

    input_ids_post = []
    attention_mask_post = []
    token_type_ids_post = []
    labels_post = []

    for doc_ids, doc_mask, doc_types, doc_labels in zip(input_ids, attention_mask, token_type_ids, labels):
        for begin in range(0, len(doc_labels), stride):
            # The encoded inputs are shifted by one relative to the labels
            # because of the start token at index 0.
            token_slice = slice(begin + 1, begin + 1 + body_len)

            iid = [cls_id] + doc_ids[token_slice]
            am = [1] + doc_mask[token_slice]
            tti = [0] + doc_types[token_slice]
            lb = doc_labels[begin:begin + body_len]

            if len(lb) == body_len:
                # Decide how to close the window BEFORE touching `iid`: the
                # mask must describe the token that is appended, and testing
                # `iid[-1]` after the append would always see the end token
                # and mask it out.
                if iid[-1] == sep_id:
                    closing_id, closing_mask = pad_id, 0
                else:
                    closing_id, closing_mask = sep_id, 1
                iid = iid + [closing_id]
                am = am + [closing_mask]
                tti = tti + [0]
                lb = lb + [pad_label]
            else:
                # Incomplete trailing window: skip it when it is too short to
                # be worth keeping, otherwise pad it up to the window size.
                if len(iid) < w_min:
                    continue

                missing_length = body_len - len(lb)
                iid = iid + [pad_id] * missing_length
                am = am + [0] * missing_length
                tti = tti + [0] * missing_length
                # One extra label accounts for the end token of the document.
                lb = lb + [pad_label] * (missing_length + 1)

            input_ids_post.append(iid)
            attention_mask_post.append(am)
            token_type_ids_post.append(tti)
            labels_post.append(lb)

    return (input_ids_post,
            attention_mask_post,
            token_type_ids_post,
            labels_post)

# Label vocabulary; the position of a tag in this list is its class index.
tags_vals = [
    "UNKNOWN",
    "Name",
    "Degree",
    "Skills",
    "College Name",
    "Email Address",
    "Designation",
    "Companies worked at",
    "Empty",
    "Graduation Year",
    "Years of Experience",
    "Location",
    "O",
]
# Reverse lookup: tag name -> class index.
tag2idx = dict(zip(tags_vals, range(len(tags_vals))))

def vectorize_df(df):
    """Tokenize a frame of resumes and cut it into fixed-length windows.

    Args:
        df: DataFrame with a ``clean_content`` column (space-joined words) and
            an ``entities_mapped`` column (one label per word).

    Returns:
        tuple: ``(inputs, label_ids)``. ``inputs`` maps every tokenizer output
        name to a tensor with one row per window; ``label_ids`` is an int64
        tensor holding the ``tag2idx`` index of every window label.
    """
    texts = df["clean_content"]
    encoded = tokenizer(texts.tolist())
    token_labels = [
        align_labels(document, word_labels)
        for document, word_labels in zip(texts, df['entities_mapped'])
    ]

    # Replace the per-document encodings by their sliding windows.
    windows = spanning_window(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        token_type_ids=encoded["token_type_ids"],
        labels=token_labels,
        w_size=MAX_LEN,
        stride=STRIDE,
        w_min=MIN_LEN,
    )
    (encoded["input_ids"],
     encoded["attention_mask"],
     encoded["token_type_ids"],
     window_labels) = windows

    # Map the tag names of every window to their class indices.
    label_ids = [[tag2idx[tag] for tag in window] for window in window_labels]

    inputs = {name: torch.tensor(values) for name, values in encoded.items()}
    targets = torch.tensor(label_ids, dtype=torch.int64)

    return inputs, targets

## Split

In [19]:
# Hold out 10% of the resumes for evaluation. The split is made on whole
# documents, before windowing, and is seeded so that a re-run of the notebook
# reproduces the same train/test partition.
df_train, df_test = train_test_split(df_data, test_size=0.1, random_state=42)

In [20]:
# Turn each split into windowed model inputs (a dict of tensors) and the
# matching tensor of label indices.
X_train, y_train = vectorize_df(df_train)
X_test, y_test = vectorize_df(df_test)

## Create tensors

In [21]:
# Shapes are (number of windows, MAX_LEN) for the train and test split.
print(X_train["input_ids"].size())
print(X_test["input_ids"].size())

torch.Size([2866, 128])
torch.Size([393, 128])


## Model

In [22]:
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

In [23]:
# load pretrained model
class ResumeNERLM(nn.Module):
    """Resume NER tagger: a pretrained transformer plus a linear tag classifier.

    The last hidden state of the language model is passed, for every position
    except the first one, through ``classification_layer`` to obtain one logit
    per tag. The module owns its optimizer and loss so that ``fit`` can train
    it directly.

    Args:
        out_classes: Number of tags to predict.
        lm: Name or path of the pretrained checkpoint to load.
        device: Device the sub-modules are moved to and batches are sent to.
    """

    def __init__(self,
                 out_classes=len(tags_vals),
                 lm="dslim/bert-base-NER",
                 device=device):
        super().__init__()

        conf = AutoConfig.from_pretrained(lm)
        conf.output_hidden_states = True
        # from_config() only builds the architecture with freshly initialised
        # weights; from_pretrained() is what actually loads the checkpoint.
        model = AutoModelForTokenClassification.from_pretrained(lm, config=conf).to(device)

        # Size the new head from the checkpoint's configuration rather than
        # assuming a hidden width of 768.
        classification_layer = nn.Linear(conf.hidden_size, out_classes).to(device)

        self.model = model
        self.classification_layer = classification_layer

        # Optimise the parameters of the whole module. Passing only
        # `model.parameters()` would leave `classification_layer` frozen at
        # its random initialisation.
        self.optimizer = optim.Adam(self.parameters(), lr=5e-5)
        self.loss = nn.CrossEntropyLoss()

        self.device = device

    def forward(self,**x):
        """Compute tag logits for every position but the first.

        Args:
            **x: Keyword inputs forwarded unchanged to the language model.

        Returns:
            torch.Tensor: Logits of shape ``(batch, seq_len - 1, out_classes)``.
        """
        hidden = self.model(**x)["hidden_states"][-1]
        # Position 0 has no label, so it is dropped; the linear layer acts on
        # the last dimension and therefore classifies all remaining positions
        # in a single call.
        return self.classification_layer(hidden[:, 1:])

    def fit(self,X,y,bs=bs,epochs=6):
        """Train on windowed inputs with shuffled mini-batches.

        The caller is expected to have put the module in training mode.

        Args:
            X: Dict of input tensors, each of shape ``(samples, seq_len)``.
            y: Tensor of label indices; column ``i`` is the label of input
                position ``i + 1``.
            bs: Mini-batch size.
            epochs: Number of passes over the data.
        """
        samples,seq_len = X["input_ids"].size()

        for e in range(epochs):
            # Fresh random order of the samples for every epoch.
            perm = np.random.permutation(samples)

            trange = tqdm.trange(0,samples,bs)
            loss_arr = []
            for b_start in trange:
                self.optimizer.zero_grad()
                batch_idx = perm[b_start:b_start+bs]

                xi = {k: v[batch_idx].to(self.device) for k,v in X.items()}
                yi = y[batch_idx].to(self.device)

                logits = self(**xi)
                targets = yi[:, :seq_len-1]

                # Cross-entropy over all (sample, position) pairs at once.
                # Every position has the same number of samples, so this
                # equals the mean of the per-position batch means.
                loss = self.loss(logits.reshape(-1, logits.size(-1)),
                                 targets.reshape(-1))

                loss_arr.append(loss.item())
                # Show the running mean loss of the current epoch.
                trange.set_postfix(loss=np.mean(loss_arr))

                loss.backward()
                self.optimizer.step()

    def predict(self,X,bs=32):
        """Predict a tag index for every position but the first.

        Runs in evaluation mode without tracking gradients and restores the
        previous training/evaluation mode afterwards.

        Args:
            X: Dict of input tensors, each of shape ``(samples, seq_len)``.
            bs: Number of samples per forward pass.

        Returns:
            torch.Tensor: int64 CPU tensor of shape ``(samples, seq_len - 1)``
            with the index of the highest-scoring tag for each position.
        """
        samples,seq_len = X["input_ids"].size()
        trange = tqdm.trange(0,samples,bs)

        was_training = self.training
        self.eval()
        predictions = []
        try:
            with torch.no_grad():
                for b_start in trange:
                    b_slice = slice(b_start,b_start+bs)

                    xi = {k: v[b_slice].to(self.device) for k,v in X.items()}
                    predictions.append(self(**xi).argmax(dim=-1).cpu())
        finally:
            self.train(was_training)

        if not predictions:
            # No samples: return an empty result of the matching width.
            return torch.empty((0, max(seq_len - 1, 0)), dtype=torch.int64)
        return torch.cat(predictions, dim=0)
            
            
            
        
# Instantiate with the defaults: one output class per entry of `tags_vals`,
# placed on `device`.
model = ResumeNERLM()

## Training

In [None]:
# Put the module in training mode, then run the training loop with fit's
# default batch size and number of epochs.
model.train()
model.fit(X_train,y_train)

 19%|█████▌                        | 67/359 [00:09<00:40,  7.22it/s, loss=0.807]

## Evaluation