In [1]:
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.util import minibatch, compounding
from spacy.training.example import Example
from pathlib import Path
import json
import time
import random
#Data loading/ Data manipulation
import pandas as pd
import numpy as np
import jsonlines

In [2]:
df = pd.read_csv("Resume/Resume.csv")
df = df.reindex(np.random.permutation(df.index))
data = df.copy().iloc[0:500]
data.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
343,13087952,TEACHER Farrah M. Bauman ...,"<div class=""fontsize fontface vmargins hmargin...",TEACHER
198,34051710,SR. GRAPHICS DESIGNER Summary ...,"<div class=""fontsize fontface vmargins hmargin...",DESIGNER
433,24240349,SUBSTITUTE TEACHER Summary A...,"<div class=""fontsize fontface vmargins hmargin...",TEACHER
1604,13764840,BRAND MANAGER Summary Brand ...,"<div class=""fontsize fontface vmargins hmargin...",APPAREL
441,37348041,ADVOCATE Summary Seeking a p...,"<div class=""fontsize fontface vmargins hmargin...",ADVOCATE


In [3]:
def load_data(file):
    with open(file, 'r',encoding='utf-8') as f:
        data = json.load(f)
    return(data)


In [4]:
def save_data(file, data):
    with open(file , 'w' , encoding='utf-8') as f:
        json.dump(data , f , indent = 4)

In [5]:
def generate_more_data(file):
    data = load_data(file)
    new_char = []
    for item in data:
        new_char.append(item)
    final_characters = []
    for character in new_char:
        if "" != character:
            final_characters.append(character)
    final_char = list(set(final_characters))
    return final_char
    


In [6]:
def create_training_data(file, type):
    data = generate_more_data(file)
    patterns=[]
    for item in data:
        pattern = {
                    "label": type ,
                    "pattern" : item
                 }
        patterns.append(pattern)
    return patterns

In [7]:
def generate_rules(pattern, name):
    nlp = English()
    ruler = EntityRuler(nlp)
    
    ruler = nlp.add_pipe('entity_ruler')
    ruler.add_patterns(pattern)
    nlp.to_disk('{}_ner'.format(name))
    

In [8]:
def test_model(model,text):
    doc = model(text)
    results = []
    entities = []
    for ent in doc.ents:
        entities.append((ent.start_char,ent.end_char , ent.label_))
    if len(entities)>0:
        results = [text , {'entities':entities}]

    return results

In [9]:

def generate_training_data(model , data):
    Training_Data = []
    for d in data:
        segments =  d.split('\n\n')
        for seg in segments:
            seg = seg.strip()
            seg = seg.replace("\n"," ")
            results = test_model(model, seg)
            if results != None and results !=[] :
                Training_Data.append(results)
    return Training_Data

In [10]:
def train_spacy(data, iterations):
    start = time.process_time()
    TRAIN_DATA = data
    nlp = spacy.blank("en")
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        ner = nlp.add_pipe('ner',last = True)
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            start2 = time.process_time()
            print ("Starting iteration " + str(itn+1))
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            
            for batch in batches:
                for text, annotations in batch:
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                texts, annotations = zip(*batch)
                nlp.update(
                            [example],
                            drop=0.2,
                            sgd=optimizer,
                            losses=losses
                )
            print (losses)
            print("Time for iteration " + str(itn+1)+":",time.process_time() - start2, "sec\n")
    print("Total Time",time.process_time() - start, "sec")
    return (nlp)

In [11]:
def all_in_one(json_file , data , label ):
    patterns = create_training_data(json_file,label)
    generate_rules(patterns, label)
    rules = spacy.load('{}_ner'.format(label))
    Training_Data = generate_training_data(rules , data)
    save_data("Training_Data_{}.json".format(label),Training_Data)
    print('copy the code to load the training data :\n TRAIN_DATA_{} = load_data("Training_Data_{}.json") '.format(label,label))

## 1.eating Company model


In [14]:
# characters = generate_more_data('Companies.json')

In [59]:
company_patterns = create_training_data('Companies.json','COMPANY')

In [63]:
company_patterns

[{'label': 'COMPANY', 'pattern': 'Kurzweil Educational Systems'},
 {'label': 'COMPANY', 'pattern': 'Regalo Kitchens'},
 {'label': 'COMPANY', 'pattern': 'Harley-Davidson Inc.'},
 {'label': 'COMPANY', 'pattern': 'Investors Clinic Infratech Private Limited'},
 {'label': 'COMPANY', 'pattern': 'Williams Sonoma Inc'},
 {'label': 'COMPANY', 'pattern': 'Kohler Company'},
 {'label': 'COMPANY', 'pattern': 'AND1'},
 {'label': 'COMPANY', 'pattern': 'Benchmark Electronics'},
 {'label': 'COMPANY', 'pattern': 'J.P. Morgan Chase & Co.'},
 {'label': 'COMPANY', 'pattern': 'Avant Garde Appraisal'},
 {'label': 'COMPANY', 'pattern': 'Data Bridge Market Research'},
 {'label': 'COMPANY', 'pattern': 'Stryker Corp'},
 {'label': 'COMPANY', 'pattern': 'Consolidated Edison Inc.'},
 {'label': 'COMPANY', 'pattern': 'First Data Corp.'},
 {'label': 'COMPANY', 'pattern': 'Ross Stores Inc'},
 {'label': 'COMPANY', 'pattern': 'Duke Energy'},
 {'label': 'COMPANY', 'pattern': 'Amkor Technology'},
 {'label': 'COMPANY', 'pat

In [64]:
generate_rules(company_patterns, 'COMPANY')

In [65]:
company_rules = spacy.load('COMPANY_ner')

In [66]:
Training_Data_Company = generate_training_data(company_rules , data['Resume_str'])

In [95]:
# all_in_one('Company.json',data['Resume_str'],'COMPANY')

In [67]:
save_data("Training_Data_Company.json",Training_Data_Company)

## #Training

In [68]:
TRAIN_DATA_Company = load_data("Training_Data_COMPANY.json") 

In [72]:
nlp_company = train_spacy(TRAIN_DATA_Company , 50)   

Starting iteration 1
{'ner': 7403.826793408808}
Time for iteration 1: 14.96875 sec

Starting iteration 2
{'ner': 136.94826408380936}
Time for iteration 2: 13.40625 sec

Starting iteration 3
{'ner': 108.67391396989173}
Time for iteration 3: 13.28125 sec

Starting iteration 4
{'ner': 108.36475825126031}
Time for iteration 4: 14.328125 sec

Starting iteration 5
{'ner': 98.63942780365065}
Time for iteration 5: 14.125 sec

Starting iteration 6
{'ner': 87.91711913030002}
Time for iteration 6: 13.96875 sec

Starting iteration 7
{'ner': 60.76499486642481}
Time for iteration 7: 13.03125 sec

Starting iteration 8
{'ner': 69.2269151490821}
Time for iteration 8: 13.96875 sec

Starting iteration 9
{'ner': 34.24185930832607}
Time for iteration 9: 13.078125 sec

Starting iteration 10
{'ner': 377.5546529713121}
Time for iteration 10: 13.453125 sec

Starting iteration 11
{'ner': 52.785897381514204}
Time for iteration 11: 13.515625 sec

Starting iteration 12
{'ner': 38.58656690532691}
Time for iteration

In [73]:
nlp_company.to_disk('Company_model')

## #Testing

In [74]:
for text in df['Resume_str'].iloc[750:800]:

    for ent in nlp_company(text).ents:
         
        print(ent.text, ent.label_)
        

Infor COMPANY
Infor COMPANY
Total Quality COMPANY
... COMPANY
Deloitte COMPANY
Deloitte COMPANY
Microsoft COMPANY
Microsoft COMPANY
Microsoft COMPANY
Microsoft COMPANY
FactSet COMPANY
Microsoft COMPANY
Microsoft COMPANY
Google COMPANY
Google COMPANY
Microsoft COMPANY
Citrix COMPANY
Twitter COMPANY
Microsoft COMPANY
Microsoft COMPANY
Microsoft COMPANY
Microsoft COMPANY
Best Buy COMPANY
Amazon COMPANY
Citrix COMPANY
Citrix COMPANY
Microsoft COMPANY
Google COMPANY
LinkedIn COMPANY
Microsoft COMPANY
Microsoft COMPANY
Microsoft COMPANY
Gap COMPANY
Google COMPANY
Google COMPANY
Google COMPANY
Google COMPANY
Google COMPANY
Google COMPANY
Google COMPANY
Google COMPANY
Google COMPANY
Google COMPANY
Google COMPANY
Google COMPANY
Microsoft COMPANY
Microsoft COMPANY
Microsoft COMPANY
Microsoft COMPANY
Microsoft COMPANY
Citrix COMPANY
VLookup COMPANY
Microsoft COMPANY
Microsoft COMPANY
Microsoft COMPANY
Microsoft COMPANY


## 2.Creating Job model

In [18]:
all_in_one('Jobs.json',data['Resume_str'],'JOBS')

copy the code to load the training data :
 TRAIN_DATA_JOBS = load_data("Training_Data_JOBS.json") 


## #Training

In [19]:
TRAIN_DATA_JOB = load_data("Training_Data_JOBS.json") 

In [20]:
nlp_job = train_spacy(TRAIN_DATA_JOB , 30) 

Starting iteration 1
{'ner': 4529.982361423046}
Time for iteration 1: 20.90625 sec

Starting iteration 2
{'ner': 428.1034681043575}
Time for iteration 2: 20.4375 sec

Starting iteration 3
{'ner': 298.4955564291333}
Time for iteration 3: 19.6875 sec

Starting iteration 4
{'ner': 256.61046730290457}
Time for iteration 4: 19.875 sec

Starting iteration 5
{'ner': 222.30583960599864}
Time for iteration 5: 19.59375 sec

Starting iteration 6
{'ner': 156.60888734678366}
Time for iteration 6: 20.0625 sec

Starting iteration 7
{'ner': 133.72382892135386}
Time for iteration 7: 20.203125 sec

Starting iteration 8
{'ner': 182.1530448441432}
Time for iteration 8: 19.59375 sec

Starting iteration 9
{'ner': 140.0385483820626}
Time for iteration 9: 19.65625 sec

Starting iteration 10
{'ner': 127.99565015565321}
Time for iteration 10: 20.0 sec

Starting iteration 11
{'ner': 89.9441712210246}
Time for iteration 11: 19.5 sec

Starting iteration 12
{'ner': 81.72932763982838}
Time for iteration 12: 20.09375

In [21]:
nlp_job.to_disk('Jobs_model')

## #Testing

In [31]:
for text in df['Resume_str'].iloc[750:800]:

    for ent in nlp_job(text).ents:
         
        print(ent.text, ent.label_)
        

Manager JOBS
Teller JOBS
Agent JOBS
Manager JOBS
Manager JOBS
Auditor JOBS
Hostess JOBS
Cashier JOBS
Designer JOBS
Manager JOBS
Director JOBS
Director JOBS
Designer JOBS
Designer JOBS
Designer JOBS
Graphic Designer JOBS
Customer Service Representative JOBS
Clerk JOBS
Manager JOBS
Professor JOBS
Paralegal JOBS
Manager JOBS
Director JOBS
Manager JOBS
Manager JOBS
Director JOBS
Professor JOBS
Manager JOBS
Physician JOBS
Director JOBS
Director JOBS
Manager JOBS
Manager JOBS
Manager JOBS
Sales Manager JOBS
Manager JOBS
Manager JOBS
Educational JOBS
Manager JOBS
Accountant JOBS
Accountant JOBS
Director JOBS
Director JOBS
Manager JOBS
Accountant JOBS
Accountant JOBS
Director JOBS
Compensation JOBS
Adjuster JOBS
Counselor JOBS
Property JOBS
Agent JOBS
Counselor JOBS
Physician JOBS
Manager JOBS
Specialist JOBS
Director JOBS
Director JOBS
Director JOBS
Manager JOBS
Clinical JOBS
Clinical JOBS
Manager JOBS
Director JOBS
Director JOBS
Construction Manager JOBS
Engineer JOBS
Director JOBS
Doctor JO

## 3.Creating Education model

In [53]:
all_in_one('Education.json',data['Resume_str'],'EDUCATION')

copy the code to load the training data :
 TRAIN_DATA_EDUCATION = load_data("Training_Data_EDUCATION.json") 


## #Training

In [54]:
TRAIN_DATA_EDUCATION = load_data("Training_Data_EDUCATION.json") 

In [55]:
nlp_education = train_spacy(TRAIN_DATA_EDUCATION , 30) 

Starting iteration 1
{'ner': 5744.359347480603}
Time for iteration 1: 9.25 sec

Starting iteration 2
{'ner': 251.76508041171246}
Time for iteration 2: 8.84375 sec

Starting iteration 3
{'ner': 48.084731241128466}
Time for iteration 3: 8.375 sec

Starting iteration 4
{'ner': 41.86431409665979}
Time for iteration 4: 8.546875 sec

Starting iteration 5
{'ner': 20.446692312522597}
Time for iteration 5: 8.515625 sec

Starting iteration 6
{'ner': 20.907547699762816}
Time for iteration 6: 8.734375 sec

Starting iteration 7
{'ner': 22.954360278071373}
Time for iteration 7: 8.375 sec

Starting iteration 8
{'ner': 18.880495986052413}
Time for iteration 8: 8.5 sec

Starting iteration 9
{'ner': 12.155890301682021}
Time for iteration 9: 7.953125 sec

Starting iteration 10
{'ner': 14.263146145917572}
Time for iteration 10: 8.15625 sec

Starting iteration 11
{'ner': 11.728171947591989}
Time for iteration 11: 8.265625 sec

Starting iteration 12
{'ner': 12.556757428908519}
Time for iteration 12: 9.20312

In [56]:
nlp_education.to_disk('Education_model')

## #Testing

In [57]:
for text in df['Resume_str'].iloc[750:800]:

    for ent in nlp_education(text).ents:
         
        print(ent.text, ent.label_)
        

Diploma EDUCATION
Diploma EDUCATION
Master of Science EDUCATION
Master of Science EDUCATION
Master's degree EDUCATION
Diploma EDUCATION
Diploma EDUCATION
Bachelor of Professional EDUCATION
Bachelor's degree EDUCATION
Bachelor's degree EDUCATION
Diploma EDUCATION
Diploma EDUCATION
Master of Science EDUCATION
Diploma EDUCATION
Bachelor of Dental EDUCATION
Master of Science EDUCATION
Master's Degree EDUCATION
Master of Science EDUCATION
Diploma EDUCATION
Diploma EDUCATION
Diploma EDUCATION
Master of Science EDUCATION
Bachelor  of EDUCATION
Diploma EDUCATION
Diploma EDUCATION
Master of Technology EDUCATION
Master of Chiropractic EDUCATION


## 4.Creating Skill model

In [40]:
def load_jsonl(file):
    with open(file , 'r', encoding = 'utf-8') as f:
        data = [json.loads(line) for line in f]
    return data
skill_patterns = load_jsonl("jz_skill_patterns.jsonl")

In [42]:
generate_rules(skill_patterns, 'SKILL')

In [43]:
skill_rules = spacy.load('SKILL_ner')

In [44]:
Training_Data_Skill = generate_training_data(skill_rules , data['Resume_str'])

In [46]:
save_data("Training_Data_Skill.json",Training_Data_Skill)

## #Training

In [49]:
TRAIN_DATA_SKILL = load_data("Training_Data_Skill.json") 

In [50]:
nlp_skill = train_spacy(TRAIN_DATA_SKILL , 30) 

Starting iteration 1
{'ner': 6408.335494675233}
Time for iteration 1: 21.46875 sec

Starting iteration 2
{'ner': 846.4045612367999}
Time for iteration 2: 20.765625 sec

Starting iteration 3
{'ner': 540.6862681862888}
Time for iteration 3: 20.046875 sec

Starting iteration 4
{'ner': 655.778152417762}
Time for iteration 4: 21.21875 sec

Starting iteration 5
{'ner': 475.99520087548495}
Time for iteration 5: 20.828125 sec

Starting iteration 6
{'ner': 564.2282106241917}
Time for iteration 6: 21.546875 sec

Starting iteration 7
{'ner': 509.8512804234038}
Time for iteration 7: 21.75 sec

Starting iteration 8
{'ner': 419.5681777034657}
Time for iteration 8: 20.46875 sec

Starting iteration 9
{'ner': 324.69948997156706}
Time for iteration 9: 21.21875 sec

Starting iteration 10
{'ner': 305.94626629902194}
Time for iteration 10: 20.71875 sec

Starting iteration 11
{'ner': 338.48816361351834}
Time for iteration 11: 21.578125 sec

Starting iteration 12
{'ner': 289.51283205946083}
Time for iteratio

In [51]:
nlp_skill.to_disk('Skill_model')

## #Testing

In [52]:
for text in df['Resume_str'].iloc[750:800]:

    for ent in nlp_skill(text).ents:
         
        print(ent.text, ent.label_)
        

engineering SKILL
marketing SKILL
C SKILL
multimedia SKILL
Engineering SKILL
Business Administration SKILL
Design SKILL
Business Administration SKILL
Business Administration SKILL
finance SKILL
finance SKILL
Vault SKILL
Software SKILL
security SKILL
vault SKILL
payments SKILL
Accounting SKILL
finance SKILL
marketing SKILL
Operations management SKILL
Documentation SKILL
marketing SKILL
Server SKILL
server SKILL
schedule SKILL
DESIGN SKILL
Design SKILL
marketing SKILL
Spring SKILL
design SKILL
Design SKILL
marketing SKILL
Project management SKILL
Design SKILL
Design SKILL
spring SKILL
support SKILL
schedule SKILL
material SKILL
release SKILL
design SKILL
schedule SKILL
schedule SKILL
schedule SKILL
support SKILL
Certificate SKILL
Certificate SKILL
BUSINESS SKILL
Business SKILL
testing SKILL
Business SKILL
Project management SKILL
Business process SKILL
testing SKILL
Business SKILL
Project Management SKILL
Business SKILL
System Analysis SKILL
business intelligence SKILL
Oracle SKILL
secur

Marketing SKILL
marketing SKILL
Marketing SKILL
Business SKILL
INTERACTION SKILL
design SKILL
design SKILL
design SKILL
interaction SKILL
Interaction SKILL
Design SKILL
engineering SKILL
User interface SKILL
design SKILL
testing SKILL
design SKILL
Interaction SKILL
design SKILL
Design SKILL
Software SKILL
Software SKILL
testing SKILL
HTML SKILL
3D SKILL
3D SKILL
SQL SKILL
CDC/ SKILL
design SKILL
Project management SKILL
Design SKILL
Interaction SKILL
Design SKILL
Reality SKILL
Interaction SKILL
interaction SKILL
software SKILL
software SKILL
testing SKILL
interface analysis SKILL
software SKILL
Engineering SKILL
Engineering SKILL
reality SKILL
software SKILL
software SKILL
support SKILL
design SKILL
design SKILL
multimedia SKILL
C SKILL
testing SKILL
testing SKILL
business SKILL
software SKILL
Design SKILL
marketing SKILL
engineering SKILL
engineering SKILL
engineering SKILL
design SKILL
design SKILL
workflow SKILL
design SKILL
testing SKILL
software SKILL
visualization SKILL
software 

API SKILL
specification SKILL
C++ SKILL
API SKILL
framework SKILL
server SKILL
API SKILL
C++ SKILL
C++ SKILL
server SKILL
Software SKILL
Engineering SKILL
server SKILL
Software SKILL
C++ SKILL
business SKILL
Software SKILL
software SKILL
C++ SKILL
Windows SKILL
Computer Science SKILL
Computer Science SKILL
Engineering SKILL
Engineering SKILL
API SKILL
C++ SKILL
documentation SKILL
HTML5 SKILL
XML SKILL
JavaScript SKILL
C SKILL
Windows SKILL
specification SKILL
Visual C++ SKILL
workflow SKILL
business SKILL
design SKILL
business SKILL
HTML SKILL
CSS SKILL
Business SKILL
design SKILL
collaboration SKILL
Communications SKILL
design SKILL
graphic design SKILL
support SKILL
business SKILL
business SKILL
communications SKILL
support SKILL
Library SKILL
library SKILL
support SKILL
business SKILL
business SKILL
finance SKILL
documentation SKILL
database SKILL
business SKILL
business SKILL
marketing SKILL
business SKILL
business SKILL
graphic design SKILL
material SKILL
Graphic Design SKILL
Cer