In [1]:
# en_core_web_sm(12 mb)
# en_core_web_md(43 mb)
# en_core_web_lg(741 mb)
# en_core_web_trf(438 mb)

In [59]:
import spacy 

# Load the small English pipeline (sizes of the available variants are
# listed in the first cell).
nlp = spacy.load("en_core_web_sm")
# Check whether the "ner" component is part of the loaded pipeline;
# the component names are echoed as the cell output.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [61]:
#1. docx to text
import docx2txt

# Resume to convert; switch the active line to process another file
# from ./ResumeData/.
# file_name="CV Wyc.docx"
file_name="VinothRajendran[6_0]-converted.docx"

# Extract the plain text of the .docx document.
text = docx2txt.process(f"./ResumeData/{file_name}")

# Save the text next to the other .txt files (the next step annotates it).
# The encoding is pinned to UTF-8 because resumes contain non-ASCII
# characters (e.g. the typographic apostrophe in "Family’s"); with the
# platform default encoding the write can fail or produce a file that
# UTF-8 based tools misread.
with open(f"./txtData/{file_name}.txt","w", encoding="utf-8") as f:
    f.write(text)
    

In [5]:
### Test with default model


In [6]:
from spacy import displacy

# Run the currently loaded pipeline over the extracted resume text and
# highlight the entities it predicts directly in the notebook.
doc = nlp(text)
displacy.render(doc, style="ent", jupyter=True)

In [8]:
# Print each predicted entity as "<text>  ==  <label>".
# (Items of doc.ents are entity spans, which may cover several words.)
for ent in doc.ents:
    print(ent.text, " == ", ent.label_)

VINOTH  ==  ORG
RAJENDRAN  ==  ORG
Software Professional  ==  ORG
1B  ==  CARDINAL
12th street  ==  FAC
Thiruvottiyur  ==  GPE
Sep 2015 to March 2019  ==  DATE
April 2019  ==  DATE
Dec 2021  ==  DATE
BCA  ==  ORG
Madras University  ==  ORG
Computer Engineering  ==  ORG
DCE  ==  ORG
Vijaya  ==  GPE
21 April 1996  ==  DATE
6.3  ==  CARDINAL
Indian  ==  NORP
English  ==  LANGUAGE
Tamil  ==  GPE
6 years  ==  DATE
3.5 years  ==  DATE
Cassandra Administrator  ==  PERSON
24  ==  CARDINAL
Apache Cassandra and Datastax Enterprise Cassandra

  ==  ORG
Cassandra  ==  GPE
Apache Cassandra  ==  PRODUCT
Cassandra  ==  GPE
Cassandra Query Language  ==  PERSON
CQL  ==  ORG
Family’s  ==  PRODUCT
Cassandra Worked  ==  GPE
Cassandra Cluster  ==  PERSON
Cassandra  Involved  ==  PERSON
HEAP  ==  ORG
GC  ==  ORG
Cassandra nodes  ==  GPE
Apache Cassandra  ==  PRODUCT
Set Cassandra  ==  PERSON
24  ==  CARDINAL
7  ==  CARDINAL
TECHNICAL SKILLS



	Operating Systems: Linux  ==  ORG
Windows Languages  ==  ORG
CQ

In [10]:
# Look up the human-readable description of an entity label
# (here "GPE", one of the labels printed above).
spacy.explain("GPE")

'Countries, cities, states'

## Custom NER:
spaCy's pre-trained models detect text chunks and categorize them into 18 types of entities.

Steps to build the custom NER model for detecting the job role in job postings in spaCy 3.0:

1. Annotate the data to train the model.
2. Convert the annotated data into the spaCy bin object.
3. Generate the config file from the spaCy website.
4. Train the model in the command line.
5. Load and test the saved model.

In [11]:
#2. Annotate the txt file

In [12]:
#3. Convert .jsonl to .spacy

In [62]:
#select file
# Name of the .jsonl annotation file to convert.
jsonl_file="root.jsonl"

# Path of that file inside the ./AnnotationJSONL/ folder; the conversion
# cell below reads it.
annotation_file_name=f"./AnnotationJSONL/{jsonl_file}"

In [63]:
import json
import logging
from pathlib import Path

import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

#filter the doccano data so it can be converted to the spaCy format
def fillterDoccanoData(doccano_JSONL_FilePath):
    """Read a JSONL annotation export and keep the records that have entities.

    Every non-blank line must be a JSON object with a ``data`` key (the
    annotated text) and a ``label`` key (the list of entity annotations;
    the conversion cell unpacks each one as ``start, end, label``).

    Args:
        doccano_JSONL_FilePath: Path (``str`` or path-like) of the JSONL file.

    Returns:
        A list of ``(text, {"entities": entities})`` tuples, one for each
        record with at least one entity, in file order. Returns ``None``
        when the file cannot be read or parsed; the error is logged with
        its traceback.
    """
    try:
        training_data = []
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(doccano_JSONL_FilePath, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines (typically a trailing newline at the end of
                # the export) are not records and would break json.loads.
                if not line.strip():
                    continue
                data = json.loads(line)
                text = data['data']
                entities = data['label']
                # Records without any annotation are not useful for training.
                if len(entities)>0:
                    training_data.append((text, {"entities" : entities}))
        return training_data
    except Exception as e:
        # %-style arguments keep the handler itself from failing when the
        # path is not a plain string (e.g. a pathlib.Path).
        logging.exception("Unable to process %s\nerror = %s", doccano_JSONL_FilePath, e)
        return None

In [64]:
#read Doccano Annotation file .jsonl
TRAIN_DATA=fillterDoccanoData(annotation_file_name) # annotation_file_name is the selected .jsonl file
if TRAIN_DATA is None:
    # fillterDoccanoData logs the underlying error and returns None on
    # failure; stop with a clear message instead of a TypeError raised
    # when iterating over None.
    raise RuntimeError(f"Could not load annotations from {annotation_file_name}")

# A blank English pipeline is sufficient here: make_doc only tokenizes.
# It has its own name so the pretrained `nlp` pipeline loaded earlier in
# the notebook is not replaced when this cell runs.
blank_nlp = spacy.blank("en")
db = DocBin() # container that is serialized into the .spacy training file
skipped_docs = 0
# Loop names are chosen so the notebook-level `text` and `doc` variables
# from the earlier cells are left untouched.
for sample_text, annot in tqdm(TRAIN_DATA): # (text, {"entities": [...]}) pairs
    train_doc = blank_nlp.make_doc(sample_text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # character offsets + label
        # "contract" shrinks the character range to the tokens fully inside
        # it; char_span returns None when no token fits.
        span = train_doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print(f"Skipping entity {label!r} at characters {start}-{end}: no token-aligned span")
        else:
            ents.append(span)
    try:
        train_doc.ents = ents # label the text with the ents
    except ValueError as err:
        # spaCy rejects conflicting (e.g. overlapping) entity spans; report
        # the document and keep going instead of hiding every exception.
        skipped_docs += 1
        print(f"Skipping document ({err}):", sample_text, annot)
    else:
        db.add(train_doc)

if skipped_docs:
    print(f"{skipped_docs} document(s) were skipped because of invalid entity spans")

# Make sure the target folder exists before writing the DocBin.
output_path = Path("./SpaCyTrainData/roottrain.spacy")
output_path.parent.mkdir(parents=True, exist_ok=True)
db.to_disk(output_path) # save the docbin object

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.95it/s]


In [16]:
#4. Load Config file
# https://spacy.io/usage/training

In [65]:
# Generate the training config configV.cfg for an English pipeline that
# contains only an "ner" component (see the summary printed below).
# NOTE(review): --force is expected to overwrite an existing configV.cfg - confirm against the spaCy CLI docs.
!python -m spacy init config --lang en --pipeline ner configV.cfg --force

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
configV.cfg
You can now add your data and train your pipeline:
python -m spacy train configV.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
#5. Train model

In [66]:
# Train the pipeline described by configV.cfg on roottrain.spacy, evaluating
# every 10 steps and stopping after 100 steps; results go to ./training/
# (the next cells load ./training/model-best).
# NOTE(review): `--gpu-id -0` is reported as "Using GPU: 0" in the log below,
# i.e. it selects GPU 0, not CPU - writing `0` would be clearer.
# NOTE(review): ./SpaCyTrainData/Vdev.spacy is not created by any cell visible
# in this notebook - confirm it exists before running.
!python -m spacy train configV.cfg --output ./training/ --paths.train ./SpaCyTrainData/roottrain.spacy --paths.dev ./SpaCyTrainData/Vdev.spacy --training.eval_frequency 10 --training.max_steps 100 --gpu-id -0


#Explanation of the general command form:
#<python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy>
#--paths.dev   --> validation data
#--paths.train --> training data

[38;5;4mℹ Saving to output directory: training[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-04-01 15:50:18,551] [INFO] Set up nlp object from config
[2022-04-01 15:50:18,559] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-04-01 15:50:18,562] [INFO] Created vocabulary
[2022-04-01 15:50:18,562] [INFO] Finished initializing nlp object
[2022-04-01 15:50:19,156] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    454.24    0.00    0.00    0.00    0.00
 10      10         24.70   7631.21    0.00    0.00    0.00    0.00
 20      20        249.90   1025.85    0.00    0.00    0.00    0.00
 30      30        607.49    677.43    8.89   28.57    5.26    0.09
 40      40        204.32    467.24    0.00    0.00  

In [44]:
#load trained custom model

In [68]:
custNlp = spacy.load(r"./training/model-best") # load the model-best pipeline from the ./training/ output folder

In [50]:
#Test on Existing Data

In [73]:
import docx2txt
# Resume used for this check of the custom model.
Cv_PathToTest="./ResumeData/CV Wyc.docx"
cv_text = docx2txt.process(Cv_PathToTest)

doc = custNlp(cv_text) # run the custom NER model over the extracted text

spacy.displacy.render(doc, style="ent", jupyter=True) # display the predicted entities in Jupyter
# Alternative view: (token, entity type) pairs for tokens that belong to an entity.
# [(X, X.ent_type_) for X in doc if X.ent_type_]

In [None]:
## Test with New Data

In [70]:
import docx2txt
# Resume used for the "new data" check of the custom model.
Cv_PathToTest="./ResumeData/Balaji[5_8].docx"
cv_text = docx2txt.process(Cv_PathToTest)

doc = custNlp(cv_text) # run the custom NER model over the extracted text

spacy.displacy.render(doc, style="ent", jupyter=True) # display the predicted entities in Jupyter
# Alternative view: (token, entity type) pairs for tokens that belong to an entity.
# [(X, X.ent_type_) for X in doc if X.ent_type_]

In [54]:
########################33

############TEST

# Second training run with the same config and step settings, but using
# Vtrain.spacy / Bdev.spacy and writing to ./trainingTEST/.
# NOTE(review): neither Vtrain.spacy nor Bdev.spacy is created by a cell
# visible in this notebook - confirm both exist before running.
# NOTE(review): `--gpu-id -0` is reported as "Using GPU: 0" in the log below.
!python -m spacy train configV.cfg --output ./trainingTEST/ --paths.train ./SpaCyTrainData/Vtrain.spacy --paths.dev ./SpaCyTrainData/Bdev.spacy --training.eval_frequency 10 --training.max_steps 100 --gpu-id -0


#Explanation of the general command form:
#<python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy>
#--paths.dev   --> validation data
#--paths.train --> training data


[38;5;2m✔ Created output directory: trainingTEST[0m
[38;5;4mℹ Saving to output directory: trainingTEST[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-03-31 16:46:14,364] [INFO] Set up nlp object from config
[2022-03-31 16:46:14,371] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-03-31 16:46:14,373] [INFO] Created vocabulary
[2022-03-31 16:46:14,373] [INFO] Finished initializing nlp object
[2022-03-31 16:46:14,773] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    456.74    0.00    0.00    0.00    0.00
 10      10         24.50   7613.03    0.00    0.00    0.00    0.00
 20      20        247.95   1098.15    0.00    0.00    0.00    0.00
 30      30       1178.33    892.60    0.00    0.00    0.00    

In [55]:
custNlp_1 = spacy.load(r"./trainingTEST/model-best") # load the model-best pipeline from the ./trainingTEST/ output folder

In [57]:
import docx2txt
# Resume used to check the model from the ./trainingTEST/ run.
Cv_PathToTest="./ResumeData/VinothRajendran[6_0]-converted.docx"
cv_text = docx2txt.process(Cv_PathToTest)

doc = custNlp_1(cv_text) # run the second custom model over the extracted text

spacy.displacy.render(doc, style="ent", jupyter=True) # display the predicted entities in Jupyter
# Alternative view: (token, entity type) pairs for tokens that belong to an entity.
# [(X, X.ent_type_) for X in doc if X.ent_type_]