### Objective of the script:
This script implements a naive mechanism for extracting the name, contact number and email ID from a collection of resumes and saving the results in an Excel file.
<br>
<br>
Extraction of phone number and email id: Simple regex is used
<br>
Extraction of name: Stanford's NER is used, which is currently the state-of-the-art (SOTA) method for Named Entity extraction

In [None]:
### LOAD REQUIRED LIBRARIES ###

import PyPDF2                                       #library to read PDF files
import docx2txt                                     #library to read docx files
import re                                           #library for regex
import os                                           #library for listing directory and path
import nltk                                         #library for NLP
import pandas as pd                                 #library for extraction to excel
import datetime                                     #library for extraction of datetime
from nltk.tag.stanford import StanfordNERTagger     #library for Stanford NER
# Module-level Stanford NER tagger; extract_resume_info() calls st.tag() on the
# tokens of each resume to find PERSON entities.
# Both paths below are relative (no drive or leading slash) -- presumably they
# are resolved from the notebook's working directory; TODO confirm.
st = StanfordNERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz', 'stanford-ner/stanford-ner.jar')     #paths to the Stanford NER classifier model and the tagger jar

In [None]:
### FUNCTION TO ACHIEVE THE EXTRACTION ###

# Patterns are compiled once here instead of being re-parsed for every resume.
_EMAIL_PATTERN = re.compile(r'\S+@\S+')
_DIGIT_RUN_PATTERN = re.compile(r'\d+')
# Characters deleted (after title-casing) before phone and name extraction.
_STRIP_CHARS = str.maketrans('', '', ':\n.()!')


def _read_resume_text(path):
    """Return the raw text of the resume stored at *path*."""
    # splitext() + lower() recognises 'john.doe.pdf' and 'CV.PDF', and does not
    # raise for names without any dot (a plain split('.')[1] would).
    if os.path.splitext(path)[1].lower() == '.pdf':
        # The context manager guarantees the handle is closed even if a page
        # fails to parse.
        with open(path, 'rb') as pdf_file:
            # NOTE(review): PdfFileReader/numPages/getPage/extractText are the
            # legacy PyPDF2 names; confirm the installed version still has them.
            reader = PyPDF2.PdfFileReader(pdf_file)
            # Every page is prefixed with a single space, first page included.
            return ''.join(
                ' ' + reader.getPage(page_no).extractText()
                for page_no in range(reader.numPages)
            )
    # Anything that is not a PDF is handed to docx2txt.
    return docx2txt.process(path)


def _find_email(text_all):
    """Return the first email-like token in *text_all*, or 'Not Found'."""
    # A space is added after each colon so that a label glued to the address
    # ('Email:john@x.com') is not captured as part of the \S+ match.
    matches = _EMAIL_PATTERN.findall(text_all.replace(':', ': '))
    return matches[0] if matches else 'Not Found'


def _normalise_text(text_all):
    """Return *text_all* with hyphens removed, title-cased and punctuation stripped."""
    # Hyphens must go before title() (it affects capitalisation); the rest is
    # removed afterwards in a single pass.
    # NOTE(review): newlines are deleted rather than replaced by a space, so the
    # last word of a line is fused with the first word of the next one.
    return text_all.replace('-', '').title().translate(_STRIP_CHARS)


def _find_phone(text):
    """Return the first run of more than six digits in *text*, or 'Not Found'."""
    for digits in _DIGIT_RUN_PATTERN.findall(text):
        if len(digits) > 6:
            return digits
    return 'Not Found'


def _find_name(text):
    """Return the first two PERSON tokens in *text* joined by a space, or 'Not Found'."""
    person_tokens = []
    for sent in nltk.sent_tokenize(text):
        # Returning as soon as two tokens are collected avoids tagging another
        # sentence whose result would be discarded.
        for token, label in st.tag(nltk.tokenize.word_tokenize(sent)):
            if label == 'PERSON':
                person_tokens.append(token)
                if len(person_tokens) == 2:
                    return person_tokens[0] + ' ' + person_tokens[1]
    # Fewer than two PERSON tokens in the whole text counts as no name.
    return 'Not Found'


def extract_resume_info(full_path_to_resume_folder, full_path_to_output_folder):
    """Extract name, phone and email from every resume in a folder into an Excel file.

    Every entry returned by os.listdir() for the resume folder is processed:
    entries ending in '.pdf' (any case) are read with PyPDF2, all others are
    passed to docx2txt. For each resume one row is recorded with the columns
    applicant_path, applicant_name, applicant_phone and applicant_email; a
    field that cannot be extracted is stored as 'Not Found'.

    Args:
        full_path_to_resume_folder: Folder containing the resume files.
        full_path_to_output_folder: Folder in which the summary workbook
            'resume_summary_<YYYYMMDDHHMMSS>.xlsx' is written.

    Returns:
        None. The result is the Excel file; progress is printed to stdout.
    """
    resumes = os.listdir(full_path_to_resume_folder)
    print ('Number of resumes found: {}'.format(len(resumes)))

    rows = []
    for counter, resume in enumerate(resumes, start=1):
        resume_path = full_path_to_resume_folder + '/' + resume
        text_all = _read_resume_text(resume_path)

        # Email is searched in the raw text; phone and name in the cleaned text.
        text = _normalise_text(text_all)
        rows.append((
            resume_path,
            _find_name(text),
            _find_phone(text),
            _find_email(text_all),
        ))

        # Trailing spaces + '\r' overwrite the previous progress line in place.
        print('Completed {} out of {}'.format(counter, str(len(resumes))), end = '          \r')

    df = pd.DataFrame(
        rows,
        columns=['applicant_path', 'applicant_name', 'applicant_phone', 'applicant_email'],
    )

    # Zero-padded fields keep the suffix unambiguous and sortable; unpadded
    # concatenation maps e.g. 1 Nov and 11 Jan to the same digits.
    append_datetime = datetime.datetime.now().strftime('%Y%m%d%H%M%S')

    df.to_excel(full_path_to_output_folder + '/' +'resume_summary_' + append_datetime +'.xlsx')

In [None]:
### EXECUTION OF THE FUNCTION ###

# First argument: folder that holds the resumes; second argument: folder in
# which the Excel summary is written. Adjust both paths to your machine.
extract_resume_info('D:\\AnuragHalder\\Learning\\Resume Extraction\\resumes', 'D:\\AnuragHalder\\Learning\\Resume Extraction') #Note: each backslash in these Windows paths is doubled because the paths are ordinary (non-raw) string literals