<a href="https://colab.research.google.com/github/Ekaagra08/Resume-Parsing-NER-Rule_based-/blob/main/Custom_Resume_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install spacy==2.1.4



In [11]:
import json
import re
import random
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score


In [12]:
import spacy
from spacy import displacy
!python -m spacy download en_core_web_md
import en_core_web_md

Collecting en_core_web_md==2.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz (95.4 MB)
[K     |████████████████████████████████| 95.4 MB 1.2 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [13]:
def convert_json_for_spacy(FilePath):
    try:
      converted_data = []
      lines=[]
      with open(FilePath, 'r') as f:
          lines = f.readlines()

      for line in lines:
          data = json.loads(line)
          text = data['content'].replace("\n", " ")
          entities = []
          data_annotations = data['annotation']
          if data_annotations is not None:
              for annotation in data_annotations:
                  #only a single point in text annotation.
                  point = annotation['points'][0]
                  labels = annotation['label']
                  # handle both list of labels or a single label.
                  if not isinstance(labels, list):
                      labels = [labels]

                  for label in labels:
                      point_start = point['start']
                      point_end = point['end']
                      point_text = point['text']

                      lstrip_diff = len(point_text) - len(point_text.lstrip())
                      rstrip_diff = len(point_text) - len(point_text.rstrip())
                      if lstrip_diff != 0:
                          point_start = point_start + lstrip_diff
                      if rstrip_diff != 0:
                          point_end = point_end - rstrip_diff
                      entities.append((point_start, point_end + 1 , label))
          
          converted_data.append((text, {"entities" : entities}))
      return converted_data
      
    except Exception as e:
      logging.exception("Unable to process " + FilePath + "\n" + "error = " + str(e))
      return None    

################################################################################################

def trim_entity_spans(data: list) -> list:
# removes extra white spaces from entity span to prevent overlaping
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    
    return cleaned_data


In [14]:
train_data_clean = trim_entity_spans(convert_json_for_spacy("/content/drive/MyDrive/Resume Parsing (NER + Rule-based)/traindata.json"))
print(train_data_clean[0])

['Govardhana K Senior Software Engineer  Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68  Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX.  Designations & Promotions  Willing to relocate: Anywhere  WORK EXPERIENCE  Senior Software Engineer  Cloud Lending Solutions -  Bangalore, Karnataka -  January 2018 to Present  Present  Senior Consultant  Oracle -  Bangalore, Karnataka -  November 2016 to December 2017  Staff Consultant  Oracle -  Bangalore, Karnataka -  January 2014 to October 2016  Associate Consultant  Oracle -  Bangalore, Karnataka -  November 2012 to December 2013  EDUCATION  B.E in Computer Science Engineering  Adithya Institute of Technology -  Tamil Nadu  September 2008 to June 2012  https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-

In [15]:
def train_spacyNER():

  # creating blank eng-language class and add the built-in pipeline components to the pipeline
  c_nlp = spacy.blank("en")
  if 'ner' not in c_nlp.pipe_names:
    ner = c_nlp.create_pipe('ner')
    c_nlp.add_pipe(ner,last = True)

  # adding custom lables from resume
  for _, annotation in train_data_clean:
    for ent in annotation.get('entities'):
      ner.add_label(ent[2])    

  # other pipes to disabled during training
  other_pipes = [pipe for pipe in c_nlp.pipe_names if pipe != 'ner']
  with c_nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = c_nlp.begin_training()
    for itn in range(10):
      print("Statring iteration " + str(itn + 1))
      random.shuffle(train_data_clean)
      losses = {}
      for text, annotations in train_data_clean:
        c_nlp.update(
            [text],  # batch of texts
            [annotations],  # batch of annotations
            drop=0.2,  # dropout - make it harder to memorise data
            sgd=optimizer,  # callable to update weights
            losses=losses)
      print(losses)
  return c_nlp   

In [16]:
nlp = train_spacyNER()

Statring iteration 1
{'ner': 31139.00876891847}
Statring iteration 2
{'ner': 23121.033560894088}
Statring iteration 3
{'ner': 16716.233918247923}
Statring iteration 4
{'ner': 13632.157929476387}
Statring iteration 5
{'ner': 13088.061687318463}
Statring iteration 6
{'ner': 10983.932519705359}
Statring iteration 7
{'ner': 10801.565972050092}
Statring iteration 8
{'ner': 9212.50325682725}
Statring iteration 9
{'ner': 10858.665097953852}
Statring iteration 10
{'ner': 9099.141343083038}


In [17]:
doc=nlp(train_data_clean[0][0])
for ent in doc.ents:
  print(f'{ent.label_.upper():{25}}- {ent.text}')

NAME                     - Manisha Bharti
DESIGNATION              - Software Automation Engineer
LOCATION                 - Pune
EMAIL ADDRESS            - indeed.com/r/Manisha-Bharti/3573e36088ddc073
YEARS OF EXPERIENCE      - 3.5 years
COMPANIES WORKED AT      - Infosys Limited
LOCATION                 - Pune
DESIGNATION              - NOT WORKING
DESIGNATION              - Software Automation Engineer
COMPANIES WORKED AT      - Infosys Limited
DESIGNATION              - System Engineer Trainee
COMPANIES WORKED AT      - Infosys Limited
DEGREE                   - B.Tech in CSE
COLLEGE NAME             - Meghnad saha institute of technology
GRADUATION YEAR          - 2013
SKILLS                   - Uft/qtp,alm/qc,jira,jenkins,automation testing,cicd,service vitualization,uipath    ADDITIONAL INFORMATION  Operating Systems Windows 10 / 8 / 7 / Vista / XP  Domains Banking and Finance
COMPANIES WORKED AT      - Oracle
COMPANIES WORKED AT      - Oracle
SKILLS                   - Has
SKIL

In [18]:
#test the model and evaluate it
examples = convert_json_for_spacy("/content/drive/MyDrive/Resume Parsing (NER + Rule-based)/testdata.json")
tp=0
tr=0
tf=0
ta=0
c=0        
for text,annot in examples:

    f=open("resume"+str(c)+".txt","w")
    doc_to_test=nlp(text)
    d={}
    for ent in doc_to_test.ents:
        d[ent.label_]=[]
    for ent in doc_to_test.ents:
        d[ent.label_].append(ent.text)

    for i in set(d.keys()):

        f.write("\n\n")
        f.write(i +":"+"\n")
        for j in set(d[i]):
            f.write(j.replace('\n','')+"\n")
    d={}
    for ent in doc_to_test.ents:
        d[ent.label_]=[0,0,0,0,0,0]
    for ent in doc_to_test.ents:
        doc_gold_text= nlp.make_doc(text)
        gold = GoldParse(doc_gold_text, entities=annot.get("entities"))
        y_true = [ent.label_ if ent.label_ in x else 'Not '+ent.label_ for x in gold.ner]
        y_pred = [x.ent_type_ if x.ent_type_ ==ent.label_ else 'Not '+ent.label_ for x in doc_to_test]  
        if(d[ent.label_][0]==0):
            #f.write("For Entity "+ent.label_+"\n")   
            #f.write(classification_report(y_true, y_pred)+"\n")
            (p,r,f,s)= precision_recall_fscore_support(y_true,y_pred,average='weighted')
            a=accuracy_score(y_true,y_pred)
            d[ent.label_][0]=1
            d[ent.label_][1]+=p
            d[ent.label_][2]+=r
            d[ent.label_][3]+=f
            d[ent.label_][4]+=a
            d[ent.label_][5]+=1
    c+=1
for i in d:
    print("\n For Entity "+i+"\n")
    print("Accuracy : "+str((d[i][4]/d[i][5])*100)+"%")
    print("Precision : "+str(d[i][1]/d[i][5]))
    print("Recall : "+str(d[i][2]/d[i][5]))
    print("F-score : "+str(d[i][3]/d[i][5]))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


 For Entity Name

Accuracy : 99.82683982683983%
Precision : 0.9982714019140732
Recall : 0.9982683982683983
F-score : 0.9979805495430495

 For Entity Location

Accuracy : 99.3073593073593%
Precision : 0.9931216097593915
Recall : 0.9930735930735931
F-score : 0.9903150762281195

 For Entity Email Address

Accuracy : 99.48051948051948%
Precision : 1.0
Recall : 0.9948051948051948
F-score : 0.9973958333333334

 For Entity Designation

Accuracy : 100.0%
Precision : 1.0
Recall : 1.0
F-score : 1.0

 For Entity Companies worked at

Accuracy : 99.13419913419914%
Precision : 0.9914170174135513
Recall : 0.9913419913419913
F-score : 0.9877533258734302

 For Entity College Name

Accuracy : 100.0%
Precision : 1.0
Recall : 1.0
F-score : 1.0

 For Entity Graduation Year

Accuracy : 100.0%
Precision : 1.0
Recall : 1.0
F-score : 1.0

 For Entity Skills

Accuracy : 100.0%
Precision : 1.0
Recall : 1.0
F-score : 1.0


  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
import pickle

filename = 'NER_model.pkl'
pickle.dump(nlp, open(filename, 'wb'))

In [19]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.19.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.7 MB)
[K     |████████████████████████████████| 8.7 MB 25.0 MB/s 
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.19.4


In [39]:
import sys, fitz

def parse(filepath):
  try:
    doc = fitz.open(filepath)
    textblob = ""
    for pg in doc:
      textblob += str(pg.get_text())

  except:
    print("\nError in parse: Could not read the file ")

  finally:
    doc.close()
    
  return(textblob)  

In [None]:
filepath = "/content/drive/MyDrive/Resume/Ekaagra Dubey Resume.pdf"
text = parse(filepath)
print(text)

In [66]:
doc = NER_model(text)
def present
for ent in doc.ents:
  print(f'{ent.label_.upper():{25}}- {ent.text}')

NAME                     - EKAAGRA DUBEY
LOCATION                 - ekaagra@gmail.com |
LOCATION                 - Delhi
LOCATION                 - Delhi
SKILLS                   - Duke University
-
AWS Machine Learning - Amazon Web Services
-
Python Programming & Data Exploration
LOCATION                 - NIIT
COLLEGE NAME             - Hackathons.



In [190]:
class resparser():
  def __init__(self,model,filepath):
    self.model = model
    self.filepath = filepath

  def parse(self):

    ''' Parses the PDF and displayes resume lables and entities '''
 
    try:
      doc = fitz.open(self.filepath)
      self.textblob = ""
      for pg in doc:
        self.textblob += str(pg.get_text())

    except:
      print("\nError in parse: Could not read the file ")

    finally:
      doc.close()
    
    nlp = self.model
    doc = nlp(self.textblob)
    COLOR = '\033[92m' #GREEN
    BOLD = '\033[1m'
    RESET = '\033[0m' #RESET COLOR

    for ent in doc.ents:
      print(f"{COLOR}{BOLD}{ent.label_.upper():{25}}{RESET}: {ent.text}")
    
  def text(self):
    ''' returns: Text format of parsed PDF resume '''
    return self.textblob




In [193]:
NER_model = pickle.load(open("/content/NER_model.pkl", 'rb'))

In [194]:
r3 = resparser(model=NER_model, filepath="/content/drive/MyDrive/TPO resume.pdf")

In [195]:
r3.parse()

[95m[1mNAME                     [0m: EKAAGRA DUBEY
[95m[1mSKILLS                   [0m: CSS
[95m[1mSKILLS                   [0m: PHP
[95m[1mSKILLS                   [0m: Duke University (Coursera), Online 
Jun 2020 - Jun 2020 
 
• 
AWS Machine Learning 
Amazon Web Services, Online Feb 
2020 - Mar 2020 
 
• 
Python Programming And Data Exploration 
NIIT, Delhi 
Aug 2019 - Nov 2019 
 
SOFT SKILLS 
• 
Problem solving 
• 
Work ethics 
• 
Leadership 
• 
Adaptability 
• 
Time management 
• 
Interpersonal communication 
 
EXTRA-CURRICULAR ACTIVITIES  
• 
Participated in the Goldman Sachs Engineering Virtual program with Forage. certificate link 
• 
Participation in multiple Data science and machine learning hackathons 
• 
Football  
• 
Drawing and art 
  



In [146]:
print(r3.text())

EKAAGRA DUBEY    
Email: ekaagra@gmail.com  
Linkedin: https://www.linkedin.com/in/ekaagra-dubey-e08/   
Github:  https://github.com/Ekaagra08  
Phone No. : 9958582804, 7982035804  
Greater Noida West, Uttar Pradesh, India 201306 
 
 
ABOUT  
A data science enthusiast. Practice in executing full end to end data science projects. Ramping up 
projects within time, budget and quality parameters, as per project management and best practice 
guidelines, targeting assignments in Data Science, Data analytics, Machine Learning and Deep 
learning.  
  
EDUCATION  
• 
Bachelor of Technology (B.Tech), Information Technology 
Passing year: 2022 
ADGITM college (Guru Gobind Singh Indraprastha University)  
CGPA: 8.8/10  
 
• 
Senior Secondary (XII), Science  
Cambridge School Srinivaspuri (CBSE board)  
Passing year: 2018  
Percentage: 84.00%  
 
SKILLS  
Python  
Operating system 
Keras, tensorflow 
C++ 
Problem solving 
Web Scraping  
Java 
Machine learning  
NLP 
Data Analytics  
Deep learning  