In [17]:
import spacy
import pickle
import random

In [18]:
train_data = pickle.load(open('train_data.pkl', 'rb'))
train_data[0]

('Govardhana K Senior Software Engineer  Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68  Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX.  Designations & Promotions  Willing to relocate: Anywhere  WORK EXPERIENCE  Senior Software Engineer  Cloud Lending Solutions -  Bangalore, Karnataka -  January 2018 to Present  Present  Senior Consultant  Oracle -  Bangalore, Karnataka -  November 2016 to December 2017  Staff Consultant  Oracle -  Bangalore, Karnataka -  January 2014 to October 2016  Associate Consultant  Oracle -  Bangalore, Karnataka -  November 2012 to December 2013  EDUCATION  B.E in Computer Science Engineering  Adithya Institute of Technology -  Tamil Nadu  September 2008 to June 2012  https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-

In [19]:
nlp = spacy.blank('en')

def train_model(train_data):
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last = True)
    
    for _, annotation in train_data:
        for ent in annotation['entities']:
            ner.add_label(ent[2])
            
    
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Statring iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}
            index = 0
            for text, annotations in train_data:
                try:
                    nlp.update(
                        [text],  # batch of texts
                        [annotations],  # batch of annotations
                        drop=0.2,  # dropout - make it harder to memorise data
                        sgd=optimizer,  # callable to update weights
                        losses=losses)
                except Exception as e:
                    pass
                
            print(losses)
    

In [20]:
train_model(train_data)

Statring iteration 0
{'ner': 15437.056169815718}
Statring iteration 1
{'ner': 11535.532822716163}
Statring iteration 2
{'ner': 9503.211848936737}
Statring iteration 3
{'ner': 7992.478447735519}
Statring iteration 4
{'ner': 6079.627460361285}
Statring iteration 5
{'ner': 4881.53824571216}
Statring iteration 6
{'ner': 6299.140179365195}
Statring iteration 7
{'ner': 4565.099246710874}
Statring iteration 8
{'ner': 4049.9170016815456}
Statring iteration 9
{'ner': 4763.480781305039}


In [21]:
nlp.to_disk('nlp_model')

In [22]:
nlp_model = spacy.load('nlp_model')

In [23]:
train_data[0][0]

'Anand S Bangalore, Karnataka - Email me on Indeed: indeed.com/r/Anand-S/ce230cad6115ae68  WORK EXPERIENCE  Space auditing  Microsoft -  2017 to 2017  Auditing the space.  EDUCATION  Vijaya main  Vijaya  SKILLS  Good listener,take up responsibilities,good at communication,great at taking challenges,excellent in various sports like soccer,cricket,kabbadi,cycling,running,swimming, fluent in English,kannada, known languages hindi,tail,telugu  https://www.indeed.com/r/Anand-S/ce230cad6115ae68?isid=rex-download&ikw=download-top&co=IN'

In [24]:
doc = nlp_model(train_data[0][0])
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - Anand S
LOCATION                      - Bangalore
EMAIL ADDRESS                 - indeed.com/r/Anand-S/ce230cad6115ae68
DESIGNATION                   - Space auditing
COMPANIES WORKED AT           - Microsoft
SKILLS                        - Vijaya main  Vijaya
SKILLS                        - Good listener,take up responsibilities,good at communication,great at taking challenges,excellent in various sports like soccer,cricket,kabbadi,cycling,running,swimming, fluent in English,kannada, known languages hindi,tail,telugu


In [25]:
train_data[2][0]

"Shiksha Bhatnagar chnadigarh - Email me on Indeed: indeed.com/r/Shiksha-Bhatnagar/70e68b28225ca499  WORK EXPERIENCE  online job in home  Microsoft and copy past -  Chandigarh, Chandigarh -  August 2016 to July 2017  i need a online job so that i can attend  my regular college and i want to earn money that's it a part time online job so that i can do it on my phone or laptop  EDUCATION  pass 12 in medical  chandigarh university -  Chandigarh, Chandigarh  September 2016 to August 2019  SKILLS  Microsoft office and java (Less than 1 year)  ADDITIONAL INFORMATION  i want to earn money by my hard work or smart work p  https://www.indeed.com/r/Shiksha-Bhatnagar/70e68b28225ca499?isid=rex-download&ikw=download-top&co=IN"

In [26]:
doc = nlp_model(train_data[2][0])
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - Shiksha Bhatnagar
LOCATION                      - chnadigarh
EMAIL ADDRESS                 - indeed.com/r/Shiksha-Bhatnagar/70e68b28225ca499
LOCATION                      - Chandigarh
LOCATION                      - Chandigarh
SKILLS                        - Microsoft office and java (Less than 1 year)
