In [1]:
import spacy
import pickle
import random

In [2]:
train_data = pickle.load(open('train_data.pkl','rb'))

In [3]:
train_data[0]

('Govardhana K Senior Software Engineer  Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68  Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX.  Designations & Promotions  Willing to relocate: Anywhere  WORK EXPERIENCE  Senior Software Engineer  Cloud Lending Solutions -  Bangalore, Karnataka -  January 2018 to Present  Present  Senior Consultant  Oracle -  Bangalore, Karnataka -  November 2016 to December 2017  Staff Consultant  Oracle -  Bangalore, Karnataka -  January 2014 to October 2016  Associate Consultant  Oracle -  Bangalore, Karnataka -  November 2012 to December 2013  EDUCATION  B.E in Computer Science Engineering  Adithya Institute of Technology -  Tamil Nadu  September 2008 to June 2012  https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-

In [4]:
nlp = spacy.blank('en')

def train_model(train_data):
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last = True)
        
    for _, annotation in train_data:
        for ent in annotation['entities']:
            ner.add_label(ent[2])
            
      # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Statring iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                try:
                    nlp.update(
                        [text],  # batch of texts
                        [annotations],  # batch of annotations
                        drop=0.2,  # dropout - make it harder to memorise data
                        sgd=optimizer,  # callable to update weights
                        losses=losses)
                except Exception as e:
                    pass
            print(losses)
            

In [5]:
train_model(train_data)

Statring iteration 0
{'ner': 14737.120422966}
Statring iteration 1
{'ner': 7955.150993422885}
Statring iteration 2
{'ner': 8473.91751971065}
Statring iteration 3
{'ner': 7447.42095894929}
Statring iteration 4
{'ner': 6401.651772810741}
Statring iteration 5
{'ner': 5931.555485118302}
Statring iteration 6
{'ner': 4628.242715452393}
Statring iteration 7
{'ner': 5874.112340425644}
Statring iteration 8
{'ner': 3959.689565258011}
Statring iteration 9
{'ner': 5031.564532470516}


In [6]:
nlp.to_disk('nlp_model')

In [7]:
nlp_model = spacy.load('nlp_model')

In [8]:
train_data[0][0]

'Mansi Thanki Student  Jamnagar, Gujarat - Email me on Indeed: indeed.com/r/Mansi-Thanki/04b8914a81df5a81  project on "Water Quality Of Different Areas Of Ahmedabad City  WORK EXPERIENCE  Microsoft Excel, Microsoft Power Point, Microsoft Word, File Management, Internet Use and C  15 days training at Tata Chemicals Limited -  Mithapur, Gujarat  Mithapur (during 6th Sem) Computer Skill: Microsoft Excel, Microsoft Power Point, Microsoft Word, File Management, Internet Use and C language  EDUCATION  BE in Environmental Engineering  Government Engineering College Bhuj -  Bhuj, Gujarat  2009 to 2013  https://www.indeed.com/r/Mansi-Thanki/04b8914a81df5a81?isid=rex-download&ikw=download-top&co=IN'

In [9]:
doc = nlp_model(train_data[0][0])
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - Mansi Thanki
DESIGNATION                   - Student
LOCATION                      - Jamnagar
DESIGNATION                   - Microsoft Excel
DEGREE                        - BE in Environmental Engineering
COLLEGE NAME                  - Government Engineering College Bhuj


In [10]:
# !pip uninstall PyMuPDF

In [11]:
# pip install PyMuPDF==1.16.14

In [12]:
# pip install fitz

In [13]:
import sys, fitz
fname = 'Alice Clark CV.pdf'
doc = fitz.open(fname)
text = ""
for page in doc:
    text = text + str(page.getText())

txt = " ".join(text.split('\n'))
print(txt)

Alice Clark  AI / Machine Learning    Delhi, India Email me on Indeed  •  20+ years of experience in data handling, design, and development  •  Data Warehouse: Data analysis, star/snow flake scema data modelling and design specific to  data warehousing and business intelligence  •  Database: Experience in database designing, scalability, back-up and recovery, writing and  optimizing SQL code and Stored Procedures, creating functions, views, triggers and indexes.  Cloud platform: Worked on Microsoft Azure cloud services like Document DB, SQL Azure,  Stream Analytics, Event hub, Power BI, Web Job, Web App, Power BI, Azure data lake  analytics(U-SQL)  Willing to relocate anywhere    WORK EXPERIENCE  Software Engineer  Microsoft – Bangalore, Karnataka  January 2000 to Present  1. Microsoft Rewards Live dashboards:  Description: - Microsoft rewards is loyalty program that rewards Users for browsing and shopping  online. Microsoft Rewards members can earn points when searching with Bing, bro

In [14]:
doc = nlp_model(txt)
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - Alice Clark
LOCATION                      - Delhi
DESIGNATION                   - Software Engineer
COMPANIES WORKED AT           - Microsoft –
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COLLEGE NAME                  - Indian Institute of Technology – Mumbai
SKILLS                        - Machine Learning, Natural Language Processing, and Big Data Handling    ADDITIONAL INFORMATION  Professional Skills
SKILLS                        - • Excellent analytical, problem solving, communication, knowledge transfer and interpersonal  skills with ability to interact with individuals at all the levels  • Quick learner and maintains cordial relationship with project manager and team members and  good performer both in team and independent job environments  • Positive attitude towards superiors &amp; 