<a href="https://colab.research.google.com/github/AshishKumarAnguria/ResumeParser/blob/main/ResumeParser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd

import spacy
from spacy.gold import biluo_tags_from_offsets
nlp = spacy.load('en_core_web_lg')


In [4]:
!python -m spacy download en_core_web_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [3]:
prefixes = ('\\n', ) + nlp.Defaults.prefixes
print(prefixes)
print("____________________________________________________")
prefix_regex = spacy.util.compile_prefix_regex(prefixes)
print(prefix_regex)
print("____________________________________________________")
nlp.tokenizer.prefix_search = prefix_regex.search
print(prefix_regex.search)
print("____________________________________________________")

('\\n', '§', '%', '=', '—', '–', '\\+(?![0-9])', '…', '……', ',', ':', ';', '\\!', '\\?', '¿', '؟', '¡', '\\(', '\\)', '\\[', '\\]', '\\{', '\\}', '<', '>', '_', '#', '\\*', '&', '。', '？', '！', '，', '、', '；', '：', '～', '·', '।', '،', '۔', '؛', '٪', '\\.\\.+', '…', "\\'", '"', '”', '“', '`', '‘', '´', '’', '‚', ',', '„', '»', '«', '「', '」', '『', '』', '（', '）', '〔', '〕', '【', '】', '《', '》', '〈', '〉', '\\$', '£', '€', '¥', '฿', 'US\\$', 'C\\$', 'A\\$', '₽', '﷼', '₴', '[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F\\u218A\\u218

In [4]:
entity_dict = {
    'Name': 'NAME', 
    'College Name': 'CLG',
    'Degree': 'DEG',
    'Graduation Year': 'GRADYEAR',
    'Years of Experience': 'YOE',
    'Companies worked at': 'COMPANY',
    'Designation': 'DESIG',
    'Skills': 'SKILLS',
    'Location': 'LOC',
    'Email Address': 'EMAIL'
}


In [5]:
df = pd.read_json('/content/drive/MyDrive/ResumeParser/Entity Recognition in Resumes.json', lines=True)
df.head()

Unnamed: 0,content,annotation,extras
0,Abhishek Jha\nApplication Development Associat...,"[{'label': ['Skills'], 'points': [{'start': 12...",
1,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta...",
2,"Akhil Yadav Polemaina\nHyderabad, Telangana - ...","[{'label': ['Skills'], 'points': [{'start': 37...",
3,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80...",
4,Ananya Chavan\nlecturer - oracle tutorials\n\n...,"[{'label': ['Degree'], 'points': [{'start': 20...",


In [6]:
df = df.drop(['extras'], axis=1)
df.head()
#len(df)

Unnamed: 0,content,annotation
0,Abhishek Jha\nApplication Development Associat...,"[{'label': ['Skills'], 'points': [{'start': 12..."
1,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta..."
2,"Akhil Yadav Polemaina\nHyderabad, Telangana - ...","[{'label': ['Skills'], 'points': [{'start': 37..."
3,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80..."
4,Ananya Chavan\nlecturer - oracle tutorials\n\n...,"[{'label': ['Degree'], 'points': [{'start': 20..."


In [None]:
df.to_csv('/content/drive/MyDrive/ResumeParser/Data.csv')

**mergeIntervals**  
It resolve the overlapping intervals of labels if present

In [7]:
def mergeIntervals(intervals):
    sorted_by_lower_bound = sorted(intervals, key=lambda tup: tup[0])
    merged = []

    for higher in sorted_by_lower_bound:
        if not merged:
            merged.append(higher)
        else:
            lower = merged[-1]
            if higher[0] <= lower[1]:#### 1.1->interchanged
                if lower[2] is higher[2]:#### 1.2->interchanged 
                    upper_bound = max(lower[1], higher[1])
                    merged[-1] = (lower[0], upper_bound, lower[2])
                else:
                    if lower[1] > higher[1]:
                        merged[-1] = lower
                    else:
                        merged[-1] = (lower[0], higher[1], higher[2])
            else:
                merged.append(higher)

    return merged

**get_entities**   
This function assign final labels from `entity_dict`

In [8]:
def get_entities(df):
    entities = []
    for i in range(len(df)):
        entity = []
        for annot in df['annotation'][i]:
            try:
                ent = entity_dict[annot['label'][0]]
                start = annot['points'][0]['start']
                end = annot['points'][0]['end'] + 1
                entity.append((start, end, ent))
            except:
                pass
        #print("XXXXX",entity)
        entity_ = mergeIntervals(entity)
       #print("AAAAA",entity_)
        #entity=sorted(entity, key=lambda tup: tup[0])
        #set(entity)
        #list(entity)
        entities.append(entity_)
    return entities

In [9]:
df['annotation'][0]

[{'label': ['Skills'],
  'points': [{'end': 1621,
    'start': 1295,
    'text': '\n• Programming language: C, C++, Java\n• Oracle PeopleSoft\n• Internet Of Things\n• Machine Learning\n• Database Management System\n• Computer Networks\n• Operating System worked on: Linux, Windows, Mac\n\nNon - Technical Skills\n\n• Honest and Hard-Working\n• Tolerant and Flexible to Different Situations\n• Polite and Calm\n• Team-Player'}]},
 {'label': ['Skills'],
  'points': [{'end': 1153,
    'start': 993,
    'text': 'C (Less than 1 year), Database (Less than 1 year), Database Management (Less than 1 year),\nDatabase Management System (Less than 1 year), Java (Less than 1 year)'}]},
 {'label': ['College Name'],
  'points': [{'end': 956, 'start': 939, 'text': 'Kendriya Vidyalaya'}]},
 {'label': ['College Name'],
  'points': [{'end': 904, 'start': 883, 'text': 'Woodbine modern school'}]},
 {'label': ['Graduation Year'],
  'points': [{'end': 860, 'start': 856, 'text': '2017\n'}]},
 {'label': ['College 

In [10]:
df['entities'] = get_entities(df)
df.head()

Unnamed: 0,content,annotation,entities
0,Abhishek Jha\nApplication Development Associat...,"[{'label': ['Skills'], 'points': [{'start': 12...","[(0, 12, NAME), (13, 46, DESIG), (49, 58, COMP..."
1,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta...","[(0, 14, NAME), (62, 68, LOC), (104, 148, EMAI..."
2,"Akhil Yadav Polemaina\nHyderabad, Telangana - ...","[{'label': ['Skills'], 'points': [{'start': 37...","[(0, 21, NAME), (22, 31, LOC), (65, 117, EMAIL..."
3,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80...","[(0, 12, NAME), (13, 51, DESIG), (54, 60, COMP..."
4,Ananya Chavan\nlecturer - oracle tutorials\n\n...,"[{'label': ['Degree'], 'points': [{'start': 20...","[(0, 13, NAME), (14, 22, DESIG), (24, 41, COMP..."


**get_train_data**  
1. Convert labels into `BILUO` form
2. Break all resumes text into sentences
3. Return sentences and their labels

In [11]:
def get_train_data(df):
    tags = []
    sentences = []

    for i in range(len(df)):
        text = df['content'][i]
        entities = df['entities'][i]
    
        doc = nlp(text)
    
        tag = biluo_tags_from_offsets(doc, entities)
        tmp = pd.DataFrame([list(doc), tag]).T
        loc = []
        for i in range(len(tmp)):
            if tmp[0][i].text is '.' and tmp[1][i] is 'O':
                loc.append(i)
        loc.append(len(doc))
    
        last = 0
        data = []
        for pos in loc:
            data.append([list(doc)[last:pos], tag[last:pos]])
            last = pos
    
        for d in data:
            tag = ['O' if t is '-' else t for t in d[1]]
            if len(set(tag)) > 1:
                sentences.append(d[0])
                tags.append(tag)
    
    return sentences, tags


In [12]:
sentences, tags = get_train_data(df)

In [None]:
print("FF",sentences[1],"XXX",tags[1])

FF [., 
, 
, Willing, to, relocate, to, :, Bangalore, ,, Karnataka, 
, 
, WORK, EXPERIENCE, 
, 
, Application, Development, Associate, 
, 
, Accenture, -, 
, 
, November, 2017, to, Present, 
, 
, Role, :, Currently, working, on, Chat, -, bot] XXX ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DESIG', 'I-DESIG', 'L-DESIG', 'O', 'O', 'U-COMPANY', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [13]:
# getting all the tags present
tag_vals = set(['X', '[CLS]', '[SEP]'])
for i in range(len(tags)):
    tag_vals = tag_vals.union(tags[i])
tag_vals

{'B-CLG',
 'B-COMPANY',
 'B-DEG',
 'B-DESIG',
 'B-EMAIL',
 'B-GRADYEAR',
 'B-LOC',
 'B-NAME',
 'B-SKILLS',
 'B-YOE',
 'I-CLG',
 'I-COMPANY',
 'I-DEG',
 'I-DESIG',
 'I-EMAIL',
 'I-GRADYEAR',
 'I-LOC',
 'I-NAME',
 'I-SKILLS',
 'I-YOE',
 'L-CLG',
 'L-COMPANY',
 'L-DEG',
 'L-DESIG',
 'L-EMAIL',
 'L-GRADYEAR',
 'L-LOC',
 'L-NAME',
 'L-SKILLS',
 'L-YOE',
 'O',
 'U-CLG',
 'U-COMPANY',
 'U-DEG',
 'U-DESIG',
 'U-EMAIL',
 'U-GRADYEAR',
 'U-LOC',
 'U-SKILLS',
 'U-YOE',
 'X',
 '[CLS]',
 '[SEP]'}

In [14]:
# dictionaries to convert text tag into numeric ids and vice-versa
tag2idx = {t: i for i, t in enumerate(tag_vals)}
idx2tag = {tag2idx[key] : key for key in tag2idx.keys()}

In [17]:
from tqdm import trange
import torch
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam


In [15]:
!pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 24.1MB/s eta 0:00:01[K     |█████▎                          | 20kB 31.9MB/s eta 0:00:01[K     |████████                        | 30kB 35.8MB/s eta 0:00:01[K     |██████████▋                     | 40kB 30.9MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 24.7MB/s eta 0:00:01[K     |███████████████▉                | 61kB 27.1MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 23.4MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 24.7MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 23.9MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 25.4MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 25.4MB/s eta 0:00:01[K     |████████████

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(n_gpu)

1


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

100%|██████████| 213450/213450 [00:00<00:00, 823435.23B/s]


**get_tokenized_train_data**
1. Convert sentences to tokens
2. Assign labels to tokenized text

In [None]:
def get_tokenized_train_data(sentences, tags):

    tokenized_texts = []
    word_piece_labels = []

    for word_list, label in zip(sentences, tags):
    
        # Add [CLS] at the front
        temp_lable = ['[CLS]']
        temp_token = ['[CLS]']
    
        for word, lab in zip(word_list, label):
            token_list = tokenizer.tokenize(word.text)
           
            for m, token in enumerate(token_list):
                temp_token.append(token)
                if m == 0:
                    temp_lable.append(lab)
                else:
                    temp_lable.append('X')  
            
        # Add [SEP] at the end
        temp_lable.append('[SEP]')
        temp_token.append('[SEP]')
        #print("AAA:",temp_lable)
        #print("FFF:",temp_token)
        #print("______________________________________________________________________")
        tokenized_texts.append(temp_token)
        word_piece_labels.append(temp_lable)
    
    return tokenized_texts, word_piece_labels

In [None]:
tokenized_texts, word_piece_labels = get_tokenized_train_data(sentences, tags)

In [None]:
print(tokenized_texts[0])
print(word_piece_labels[0])

['[CLS]', 'A', '##b', '##his', '##he', '##k', 'J', '##ha', 'Application', 'Development', 'Associate', '-', 'A', '##cc', '##ent', '##ure', 'Bengal', '##uru', ',', 'Karnataka', '-', 'Em', '##ail', 'me', 'on', 'Indeed', ':', 'indeed', '.', 'com', '/', 'r', '/', 'A', '##b', '##his', '##he', '##k', '-', 'J', '##ha', '/', '10', '##e', '##7', '##a', '##8', '##c', '##b', '##7', '##32', '##b', '##c', '##43', '##a', '•', 'To', 'work', 'for', 'an', 'organization', 'which', 'provides', 'me', 'the', 'opportunity', 'to', 'improve', 'my', 'skills', 'and', 'knowledge', 'for', 'my', 'individual', 'and', 'company', "'", 's', 'growth', 'in', 'best', 'possible', 'ways', '[SEP]']
['[CLS]', 'B-NAME', 'X', 'X', 'X', 'X', 'L-NAME', 'X', 'B-DESIG', 'I-DESIG', 'L-DESIG', 'O', 'U-COMPANY', 'X', 'X', 'X', 'U-LOC', 'X', 'O', 'O', 'O', 'O', 'X', 'O', 'O', 'B-EMAIL', 'I-EMAIL', 'I-EMAIL', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X',

In [None]:
MAX_LEN = 512
bs = 4
for txt in tokenized_texts:
  if len(txt)>MAX_LEN:
    print(len(txt))

679
977
567
1054
1231
674
543
920
1269
756
513
642
590
599
680


In [None]:
# BERT model take input of fixed size of 512 so we are padding short sequences
# Convert tokens to numeric ids
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
print(len(input_ids[0]))
print(input_ids[0])

Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (679 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (977 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (567 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (1054 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (1231 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length

512
[  101   138  1830 27516  4638  1377   147  2328 22491  3273  9666   118
   138 19515  3452  3313  7756 12328   117 12247   118 18653 11922  1143
  1113 10364   131  5750   119  3254   120   187   120   138  1830 27516
  4638  1377   118   147  2328   120  1275  1162  1559  1161  1604  1665
  1830  1559 17101  1830  1665 25631  1161   794  1706  1250  1111  1126
  2369  1134  2790  1143  1103  3767  1106  4607  1139  4196  1105  3044
  1111  1139  2510  1105  1419   112   188  3213  1107  1436  1936  3242
   102     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     

In [None]:
# pad tags in the same ways as input_ids
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in word_piece_labels], maxlen=MAX_LEN, value=tag2idx["O"], 
                     padding="post", dtype="long", truncating="post")
print(len(tags[0]))
print(tags[0])

512
[32 31  9  9  9  9  7  9  2  3 25 23 38  9  9  9 40  9 23 23 23 23  9 23
 23 22 12 12  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
  9  9  9  9  9  9  9 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23  9 23 23 23 23 23 17 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 2

In [None]:
# 1 is for token present in sentence
# 0 is for padded tokens
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [None]:
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = train_test_split(input_ids, tags, attention_masks, random_state=2020, 
                                                                                 test_size=0.3)

In [None]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [None]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [None]:
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(tag2idx))

100%|██████████| 404400730/404400730 [00:10<00:00, 37941080.42B/s]


In [None]:
model.cuda();

In [None]:
# Setting parameter for optimizers
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)



# Training Step

In [None]:
epochs = 15
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

output_path='/content/drive/MyDrive/ResumeParser'
torch.save(
    {
        "model_state_dict": model.state_dict()
    },
    f'{output_path}/model-state.bin',
)


Epoch:   7%|▋         | 1/15 [01:02<14:40, 62.91s/it]

Train loss: 0.8525728643158056


Epoch:  13%|█▎        | 2/15 [02:08<13:48, 63.75s/it]

Train loss: 0.3635777681522126


Epoch:  20%|██        | 3/15 [03:14<12:51, 64.26s/it]

Train loss: 0.26486069758007996


Epoch:  27%|██▋       | 4/15 [04:19<11:50, 64.62s/it]

Train loss: 0.19220463314304387


Epoch:  33%|███▎      | 5/15 [05:25<10:48, 64.88s/it]

Train loss: 0.15675966460665647


Epoch:  40%|████      | 6/15 [06:30<09:45, 65.02s/it]

Train loss: 0.12953785498266238


Epoch:  47%|████▋     | 7/15 [07:35<08:41, 65.14s/it]

Train loss: 0.09056485470139632


Epoch:  53%|█████▎    | 8/15 [08:41<07:36, 65.19s/it]

Train loss: 0.07316902035806519


Epoch:  60%|██████    | 9/15 [09:46<06:31, 65.21s/it]

Train loss: 0.06018215560619414


Epoch:  67%|██████▋   | 10/15 [10:51<05:26, 65.24s/it]

Train loss: 0.04451287872103607


Epoch:  73%|███████▎  | 11/15 [11:56<04:20, 65.23s/it]

Train loss: 0.03404399084715839


Epoch:  80%|████████  | 12/15 [13:02<03:15, 65.21s/it]

Train loss: 0.03070319276540982


Epoch:  87%|████████▋ | 13/15 [14:07<02:10, 65.23s/it]

Train loss: 0.025613784235154355


Epoch:  93%|█████████▎| 14/15 [15:12<01:05, 65.22s/it]

Train loss: 0.020911096536595185


Epoch: 100%|██████████| 15/15 [16:17<00:00, 65.18s/it]

Train loss: 0.018586255797358352





**extract_data**  
It takes tokenized sentence and their labels and give the required entity.  
To get name - `extract_data('NAME',sentence,logits)`  
_entity_ can be any value(not key) present in _entity_dict_

In [30]:
def extract_data(entity,sentence,logits):
  p=False
  q=False
  test=''
  for j in range(len(logits)):
    if 'U-'+entity == idx2tag[logits[j]]:
      p=True
      q=True
    if 'B-'+entity == idx2tag[logits[j]]:
      if not p:
        q=True
    if '-' in idx2tag[logits[j]] and ('B-'+entity != idx2tag[logits[j]] or 'U-'+entity != idx2tag[logits[j]] ):
      if p:
        q=False
        p=False
        test+="|"
    if 'O' == idx2tag[logits[j]] and p:
      q=False
      p=False
      test+="|"
    if 'L-'+entity == idx2tag[logits[j]]:
      test+=" "
      p=True
    if 'I-'+entity == idx2tag[logits[j]]:
      test+=" "  
    if q:
      if '[' not in sentence[j]:
        test+=sentence[j].replace("#",'')

  if len(test):
    test=list(set(test.split("|")))
    print(entity,":",test)
  



In [None]:
!pip install --no-deps seqeval[gpu]

In [None]:
from seqeval.metrics import classification_report, accuracy_score, f1_score
model.eval()

y_true = []
y_pred = []
out_name=[]
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for batch in train_dataloader:
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch
    with torch.no_grad():
        logits = model(input_ids, token_type_ids=None, attention_mask=input_mask,)
    logits = logits.detach().cpu().numpy()
    logits = [list(p) for p in np.argmax(logits, axis=2)]
    label_ids = label_ids.to('cpu').numpy()
    input_mask = input_mask.to('cpu').numpy()
    n_input_ids = input_ids.to('cpu').numpy()
    for i,mask in enumerate(input_mask):
        temp_1 = [] # Real one
        temp_2 = [] # Predict one
        temp_3 = []
        b_name=False
        u_name=False
        name=[]
        sentence=tokenizer.convert_ids_to_tokens(input_ids[i].to('cpu').numpy())
        #____________________________________________________________________________________________________
        test=''
        q=False
        p=False
        #print("Detected:",test)
        print("InResume:")
        extract_data('NAME',sentence,label_ids[i])
        extract_data('EMAIL',sentence,logits[i])
        extract_data('SKILLS',sentence,logits[i])
        

        print("___________________________________________________________________________________________________________")
        #____________________________________________________________________________________________________
        
        for j, m in enumerate(mask):
            if m:
                if idx2tag[label_ids[i][j]] != "X" and idx2tag[label_ids[i][j]] != "[CLS]" and idx2tag[label_ids[i][j]] != "[SEP]" : # Exclude the X label
                    temp_1.append(idx2tag[label_ids[i][j]])
                    temp_2.append(idx2tag[logits[i][j]])
            else:
                break    
        y_true.append(temp_1)
        y_pred.append(temp_2)
        out_name.append(temp_3)
'''
print("f1 socre: %f"%(f1_score(y_true, y_pred)))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))

print(classification_report(y_true, y_pred,digits=4)) 
'''

In [None]:
import numpy as np
import pandas as pd
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_PATH = 'bert-base-cased'
STATE_DICT = torch.load("/content/drive/MyDrive/ResumeParser/model-state.bin", map_location=DEVICE)
#TOKENIZER = BertTokenizerFast("./vocab/vocab.txt", lowercase=True)#######need to change

model_L = BertForTokenClassification.from_pretrained(
    'bert-base-cased', state_dict=STATE_DICT['model_state_dict'], num_labels=len(tag2idx))
model_L.to(DEVICE)

y_true = []
y_pred = []
out_name=[]
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch
    with torch.no_grad():
        logits = model_L(input_ids, token_type_ids=None, attention_mask=input_mask,)
    logits = logits.detach().cpu().numpy()
    logits = [list(p) for p in np.argmax(logits, axis=2)]
    label_ids = label_ids.to('cpu').numpy()
    input_mask = input_mask.to('cpu').numpy()
    n_input_ids = input_ids.to('cpu').numpy()
    for i,mask in enumerate(input_mask):
        temp_1 = [] # Real one
        temp_2 = [] # Predict one
        temp_3 = []
        b_name=False
        u_name=False
        name=[]
        sentence=tokenizer.convert_ids_to_tokens(input_ids[i].to('cpu').numpy())
        #____________________________________________________________________________________________________
        test=''
        q=False
        p=False
        #print("Detected:",test)
        print("InResume:")
        #extract_data('SKILLS',sentence,label_ids[i])
        extract_data('NAME',sentence,logits[i])
        extract_data('COMPANY',sentence,logits[i])
        extract_data('SKILLS',sentence,logits[i])
        extract_data('DEG',sentence,logits[i])

        print("___________________________________________________________________________________________________________")
        #____________________________________________________________________________________________________

'''
Extract Text Resume
Break into lines
Tokenise each lines
Pass it into model

'''

        

# Loading model from memory

In [19]:
import numpy as np
import pandas as pd
#DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE=device
#DEVICE="cuda:0"
MAX_LEN = 512
MODEL_PATH = 'bert-base-cased'
STATE_DICT = torch.load("/content/drive/MyDrive/ResumeParser/model-state.bin", map_location=DEVICE)
#TOKENIZER = BertTokenizerFast("./vocab/vocab.txt", lowercase=True)#######need to change
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
model_L = BertForTokenClassification.from_pretrained(
    'bert-base-cased', state_dict=STATE_DICT['model_state_dict'], num_labels=len(tag2idx))
model_L.to(DEVICE);
#model_L.cuda();
#model_L.to(DEVICE)

100%|██████████| 213450/213450 [00:00<00:00, 15558080.58B/s]
100%|██████████| 404400730/404400730 [00:07<00:00, 54672836.72B/s]


In [28]:

def get_tokenized_data(word_list,tokenizer):

    tokenized_texts = []
    word_piece_labels = []
        # Add [CLS] at the front
        #temp_lable = ['[CLS]']
    temp_token = ['[CLS]']

    for word in word_list:
        token_list = tokenizer.tokenize(word)
        
        for m, token in enumerate(token_list):
            temp_token.append(token)  
        
    # Add [SEP] at the end
    #temp_lable.append('[SEP]')
    temp_token.append('[SEP]')
    #print("AAA:",temp_lable)
    #print("FFF:",temp_token)
    #print("______________________________________________________________________")
    tokenized_texts=[temp_token]
    input_ids=pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
        #word_piece_labels.append(temp_lable)
    print("T:",temp_token)
    print("W:",word_list)
    #print("I:",input_ids)
    return input_ids

# Extracting Info from resume
NOT WORKING PROPERLY

In [37]:

f = open("/content/drive/MyDrive/ResumeParser/Manish Kumar _raw", "r")
txt=f.read()
resume = txt.split('\n')

f.close()
#model_L.cuda();
#resume=df['content'].values[0]
for sentence in resume:
    word_list=sentence.split(" ")
    input_ids=get_tokenized_data(word_list,tokenizer)
    attention_masks = [float(i>0) for i in input_ids[0]]
    attention_masks = [attention_masks]
    #input_ids=[input_ids]

    input_ids = torch.tensor(input_ids)
    input_ids=input_ids.to(DEVICE)
    input_mask = torch.tensor(attention_masks)
    input_mask=input_masks.to(DEVICE)
    
    with torch.no_grad():
        logits = model_L(input_ids, token_type_ids=None, attention_mask=input_mask).to(DEVICE)
    logits = logits.detach().cpu().numpy()
    logits = [list(p) for p in np.argmax(logits, axis=2)]
    #label_ids = label_ids.to('cpu').numpy()
    input_mask = input_mask.to('cpu').numpy()
    n_input_ids = input_ids.to('cpu').numpy()
    id2token=tokenizer.convert_ids_to_tokens(n_input_ids[0])
    #____________________________________________________________________________________________________
    
    #print("Detected:",test)
    #extract_data('SKILLS',sentence,label_ids[i])
    extract_data('NAME',id2token,logits[0])
    extract_data('COMPANY',id2token,logits[0])
    extract_data('SKILLS',id2token,logits[0])
    extract_data('DEG',id2token,logits[0])

    print("___________________________________________________________________________________________________________")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
W: ['p']
NAME : ['p']
SKILLS : ['']
DEG : ['']
___________________________________________________________________________________________________________
T: ['[CLS]', 'u', '[SEP]']
W: ['u']
SKILLS : ['']
DEG : ['']
___________________________________________________________________________________________________________
T: ['[CLS]', 't', '[SEP]']
W: ['t']
NAME : ['t']
SKILLS : ['']
DEG : ['']
___________________________________________________________________________________________________________
T: ['[CLS]', '[SEP]']
W: ['', '']
SKILLS : ['']
DEG : ['']
___________________________________________________________________________________________________________
T: ['[CLS]', 'b', '[SEP]']
W: ['b']
NAME : ['b']
SKILLS : ['']
DEG : ['']
___________________________________________________________________________________________________________
T: ['[CLS]', 'y', '[SEP]']
W: ['y']
SKILLS : ['']
DEG : ['']
___________________