In [67]:
!pip install -U spacy
!pip install spacy-transformers
!pip install PyMuPDF

In [86]:
# Importing required libraries

import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json
from spacy import displacy
import sys, fitz

In [69]:
# Show the installed spaCy version so the run can be reproduced later
spacy.__version__

'3.4.3'

In [71]:
# Loading JSON file (dataset). The context manager closes the file handle as soon
# as parsing is done; JSON is read explicitly as UTF-8 instead of the locale default.
with open("CV-Parsing-using-Spacy-3/data/training/train_data.json", encoding="utf-8") as dataset_file:
    cv_data = json.load(dataset_file)

In [72]:
# Number of records loaded from train_data.json
len(cv_data)

200

In [73]:
# Setting up configuration file: `init fill-config` reads base_config.cfg, auto-fills
# every remaining setting, and saves the complete result as config.cfg
!python -m spacy init fill-config CV-Parsing-using-Spacy-3/data/training/base_config.cfg CV-Parsing-using-Spacy-3/data/training/config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
CV-Parsing-using-Spacy-3/data/training/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [74]:
def get_spacy_doc(file, data):
    """Convert annotated texts into a spaCy ``DocBin`` for NER training.

    Args:
        file: Writable text stream used as an error log. Entities that cannot
            be turned into a span, and documents whose entities cannot be set,
            are reported here (one line each).
        data: Iterable of ``(text, annotation)`` pairs, where
            ``annotation['entities']`` is a sequence of
            ``(start, end, label)`` character offsets into ``text``.

    Returns:
        A ``DocBin`` holding one ``Doc`` per example whose entities could be
        assigned. Examples that fail at that step are logged and left out.
    """
    nlp = spacy.blank('en')
    db = DocBin()

    for text, annot in tqdm(data):
        doc = nlp.make_doc(text)

        ents = []
        # Character offsets already claimed by an accepted entity. A set keeps
        # the overlap test linear instead of rescanning a growing list.
        taken_offsets = set()

        # When annotations overlap, keep the first usable entity and skip the rest.
        for start, end, label in annot['entities']:
            char_range = range(start, end)
            if not taken_offsets.isdisjoint(char_range):
                continue

            # Getting span of data; 'strict' yields None unless the offsets
            # line up exactly with token boundaries.
            try:
                span = doc.char_span(start, end, label=label, alignment_mode='strict')
            except Exception as exc:
                # Log instead of silently swallowing; Exception (not a bare
                # except) lets KeyboardInterrupt/SystemExit propagate.
                file.write(str([start, end]) + "    char_span failed: " + repr(exc) + '\n')
                continue

            # If the given index span has no value then we add it to the error file
            if span is None:
                err_data = str([start, end]) + "    " + str(text) + '\n'
                file.write(err_data)
                continue

            # Reserve the offsets only once the span is known to be valid, so a
            # rejected annotation cannot block a later, valid one.
            taken_offsets.update(char_range)
            ents.append(span)

        try:
            # Setting up custom entities and adding it to docbin object
            doc.ents = ents
            db.add(doc)
        except Exception as exc:
            # Best effort: leave this document out, but record why.
            file.write("document skipped: " + repr(exc) + '\n')

    return db
            

In [75]:
# Creating train and test dataset

from sklearn.model_selection import train_test_split

# A fixed seed keeps the 80/20 split, and therefore the trained model and its
# scores, reproducible across kernel restarts.
train, test = train_test_split(cv_data, test_size=0.2, random_state=42)

In [76]:
# Sizes of the train and test partitions
len(train), len(test)

(160, 40)

In [77]:
# Extracting and saving spacy files to the disk for training purpose.
# Annotations that cannot be converted are logged to error.txt; the context
# manager closes that file even if the conversion raises part-way through.
with open('error.txt', 'w', encoding='utf-8') as file:
    db = get_spacy_doc(file, train)
    db.to_disk('train_data.spacy')

    db = get_spacy_doc(file, test)
    db.to_disk('test_data.spacy')

100%|██████████| 160/160 [00:01<00:00, 91.07it/s] 
100%|██████████| 40/40 [00:00<00:00, 80.34it/s]


In [78]:
# Training the custom spacy NER model from the filled config.
# train_data.spacy is the training set and test_data.spacy is passed as the dev set;
# results are written to ./output and `--gpu-id 0` runs the training on GPU 0.
!python -m spacy train CV-Parsing-using-Spacy-3/data/training/config.cfg --output ./output --paths.train ./train_data.spacy --paths.dev ./test_data.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-11-15 14:21:11,829] [INFO] Set up nlp object from config
INFO:spacy:Set up nlp object from config
[2022-11-15 14:21:11,840] [INFO] Pipeline: ['transformer', 'ner']
INFO:spacy:Pipeline: ['transformer', 'ner']
[2022-11-15 14:21:11,844] [INFO] Created vocabulary
INFO:spacy:Created vocabulary
[2022-11-15 14:21:11,845] [INFO] Finished initializing nlp object
INFO:spacy:Finished initializing nlp object
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are in

# Model Test

In [87]:
# Loading best model from the same relative directory the training command wrote
# to (`--output ./output`), instead of a hardcoded absolute /content path.
nlp = spacy.load('output/model-best')

In [88]:
# Labels known to the "ner" component of the loaded pipeline
nlp.get_pipe("ner").labels  

('College Name',
 'Companies worked at',
 'Degree',
 'Designation',
 'Email Address',
 'Graduation Year',
 'Location',
 'Name',
 'Skills',
 'UNKNOWN',
 'Years of Experience')

In [127]:
# Setting up colors for custom entities.
# Keys must match the model's entity labels exactly, otherwise displaCy falls
# back to its default colour for that label.
colors = {
    'College Name' : '#E6B0AA',
    'Companies worked at' : '#AF7AC5',
    'Degree' : '#5499C7',
    'Designation' : '#5DADE2',
    'Email Address' : '#48C9B0',  # was misspelled 'Email Addresss', so it never matched
    'Graduation Year' : '#73C6B6',
    'Location' : '#2ECC71',
    'Name' : '#F9E79F',
    'Skills' : '#EB984E',
    'UNKNOWN' : '#D35400',
    'Years of Experience' : '#839192'}


# Render every label the NER component knows, using the colours above
options = {"ents": list(nlp.get_pipe("ner").labels), "colors": colors}

In [121]:
# Reading resume. The path is relative to the working directory, like the other
# repository paths in this notebook, so it is not tied to a /content checkout.
fname = 'CV-Parsing-using-Spacy-3/data/test/Aish_Resume.pdf'
doc = fitz.open(fname)

In [122]:
# Converting resume into text: take the text of every page (previously only the
# first page was kept, truncating multi-page resumes), then collapse all runs of
# whitespace and newlines into single spaces.
text = " ".join(page.get_text() for page in doc).split()
text = " ".join(text)
text

"Aishwarya Pachaiyappan Data Scientist Data scientist with Two years of experience in analysing large datasets and coming up with data-driven insights .Proﬁcient in Predictive modelling,data processing and data mining algorithms.I'm a energetic, and geeky individual whose desire to learn is endless aishwaryayamunap02@gmail.com 9677227289 Chennai, India WORK EXPERIENCE Data Scientist Infosys Limited 06/2019 - Present, Chennai, India 3years of experience as Engineer -Industrial IT • Developed Classiﬁcation algorithm to provide result based on classiﬁcation from large set of data and developed NLP algorithm to provide suggestion based on similarity check Knowledge in Creating Models using Python for Data Prediction and Classiﬁcation using libraries such as NumPy, Pandas, Matplotlib and Scikit learn Experience in database management and Microsoft Dynamics 365 CRM Contact : +91 9677227289 EDUCATION Bachelor of Computer Application Madaras Christian college 06/2016 - 05/2019, Chennai,India P

In [123]:
# Run the trained pipeline on the resume text and list each distinct entity once

doc = nlp(text)
text_list = []  # entity strings already printed, in first-seen order

for entity in doc.ents:
    # Guard clause: the same surface text is reported only the first time it appears
    if entity.text in text_list:
        continue
    text_list.append(entity.text)
    print(entity.text, '\t\t\t\t\t', entity.label_)

Aishwarya Pachaiyappan 					 Name
Data Scientist 					 Designation
Data scientist 					 Designation
Two years 					 Years of Experience
Infosys 					 Companies worked at
Bachelor of Computer Application 					 Degree
Madaras Christian college 					 College Name
B.Ed. in Hindi Pandit Hindi Prachar 					 Degree
Technical skills => Data Science, Linear & Logistics Regression, Naive Bayes, KNN, Decision Tree, Random Forest, GBDT, XGBoost, SVM, LSTM, MS Dynamics CRM, Data visualization tools, Python, SQL, etc Area => Machine Learning, Data Analysis, NLP , Sentiment Analysis, Document Classiﬁcation, Tokenization, Feature Engineering, Model Training, Feature Selection, Validation, Cleansing, Statistical & Predictive analysis. 					 Skills
Loan Default Prediction (02/2020 - 02/2021) 					 Skills
Infosys Certiﬁed 					 Companies worked at


In [128]:
# Render the parsed resume inline, highlighting entities according to `options`
displacy.render(doc, style="ent", jupyter = True, options = options)