In [None]:
import json
import re
import spacy

In [None]:
from spacy.matcher import Matcher

In [301]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [306]:
import pandas as pd

In [167]:
import enchant

# Resume Data extraction
Data like **Name, Email ID, Phone No, Education** present in the resume are extracted using this sript.

Resume extraction can be done in *two* ways, 1. Rule based 2. Machine learning
The entire script broadly uses *Rule based* analysis of the data

Libraries used:
1. re(RegEx)
2. nltk(Natural Language Toolkit)
3. spacy(spaCy)
4. json
5. pandas(Pandas)
6. enchant(PyEnchant)

*About Functions*
1. read_data(url) : 
    This function takes path of the .txt format of resume file and returns the text present in it after removing any unnecessary unicode characters like bullets etc.

**The attributes Name, Email, Phone No. are assumed to be in the first 100-200 words of the resume and proceeded accordingly**

------------------------------------------------------------------------------------------------------------------------
2. extractPhno(text) : 
    This function takes one sentence or a string and extracts Phone No. using RegEx pattern matching. Output is a list of all phone no. present in it.

3. extractEmailid(text):
    This function takes one sentence or a string and extracts Email ID using RegEx pattern matching. Output is a list of all Email IDs present in it.

4. extractName(sentence): Extracts the name of the candidate. It checks with all possible Indian Names present in 'allNames.txt' which is downloded from internet, and gives the output.

----------------------------------------------------------------------------------------------------------------------

5. extractEdu(sentences): This function is written assuming the resumes do not have any degrees other than 'Bachelors' and 'Masters' as primary **and coming from engineering domain**. This function makes use of 'education.txt' as source file to get the results.

6. extractExp(sentences): A 'TechSkills.txt' file is generated with all possible technologies and used for finding them in resumes. Since these are definite, using a database is much better. The 'TechSkill.txt' can be populated with all other technologies also so that better results can be obtained.

-----------------------------------------------------------------------------------------------------------------------
*Also if the person has mentioned a skill anywhere in resume then, that should be about some work he has done in the past which would count as an experience. So if we find these skills on any part of the document, it should be counted in, and so what the function does.*

------------------------------------------------------------------------------------------------------------------------
**Due to time constraint, I couldn't populate the file, but the file still contains most popular computer skills which every programmer is expected of**

In [675]:
def read_data(url):
    file = open(url, 'r')
    data = file.read()
    return data.encode('ascii', errors='ignore').decode()

In [None]:
def extractPhno(text):
    phno = []
    pattern = re.compile(r'(([+]|\d){10,14})')
    entries = re.split("\n", text)
    for entry in entries:
        text = entry.replace(" ", "")
        text = text.replace("-", "")
        ans = pattern.search(text)
        if ans is not None:
            phno.append(ans[0][-10:])
    return phno

In [None]:
def extractEmailid(text):
    pattern = re.compile(r'(\w|\d|[.])+[@](\w+)[.](\w+)')
    entries = re.split("\n+", text)
    email = []
    for entry in entries:
        ans = pattern.search(entry)
        if ans is not None:
            email.append(ans[0])
    return email

In [550]:
def extractName(sent):
    sent = re.sub(r'[^a-zA-z]', ' ', sent)
    sent = nlp(sent)
    dictionary = enchant.Dict('en_US')
    indian_names = set(open('allNames.txt', 'r').read().split())
    ans = []
    new_ans = []
    count = 0
    for word in sent:
        if count==15:
            break
        if word.tag_=='NNP':
            w = str(word)
            if not dictionary.check(w.lower()):
                new_ans.append(w)
                if w.lower() in indian_names:
                    ans.append(w)        
        count+=1
    if len(ans)==0 or len(new_ans)<3 :
        return new_ans[:2]
    else:
        return ans

In [664]:
def extractEdu(sentences):
    ans_edu = []
    for sent in sentences:
        pattern = re.compile(r'((Master of)|(Bachelor of)|(Bachelors in)|(Masters in))\s\w+\s\w+', re.IGNORECASE)
        match = re.search(pattern, sent)
        if match is not None:
            temp = []
            for w in word_tokenize(match.group()):
                if w not in stopwords.words('english'):
                    temp.append(w)
            ans_edu.append(' '.join(temp))
            continue
        edu_list = eval('dict({})'.format(open('education.txt', 'r').read()))
        all_words = word_tokenize(sent)
        for word in all_words:
            if edu_list.get(word.upper()) is not None:
                ans_edu.append(edu_list.get(word.upper()))
    #print(edu_list)
    return ans_edu

In [708]:
def extractExp(sentences):
    techSkills = set(open('TechSkills.txt').read().split('\n'))
    ans = set()
    for sent in sentences:
        all_words = word_tokenize(sent)
        for w in all_words:
            if w.upper() in techSkills:
                ans.add(w.upper())
    return list(ans)

## Testing Playground

In [699]:
d = read_data('resumes/Amarnath.txt')

In [700]:
sentences = sent_tokenize(d)

In [582]:
phno = extractPhno(d[:200])

In [583]:
emailid = extractEmailid(d[:200])

In [655]:
extractName(sentences[0])

['Amarnath']

In [665]:
education= extractEdu(sentences)

In [674]:
print(education)

['Bachelors']


In [709]:
extractExp(sentences)

['C', 'DATABASE', 'HTML', 'C++', 'SQL']

Playground **ends here** 

# Writing to different .json files

In [718]:
resumenames = ['Abbas- MS Dyna.txt', 'Amarnath.txt', 'Arup_Kumar_H1 B_NC.txt', 'CV of Binnu Thomas.txt', 'Gangadhar Vasanthapuram_Spruce InfoTech.txt']
count = 0
for i in resumenames:
    d = read_data('resumes/'+i)
    phno = extractPhno(d[:200])
    emailid = extractEmailid(d[:200])
    sentences = sent_tokenize(d)
    name = extractName(sentences[0])
    education = extractEdu(sentences)
    exp = extractExp(sentences)
    output = {
        'name' : name,
        'email' : emailid,
        'phone' : phno,
        'edu' : education,
        'exp' : exp
    }
    with open(i[:-4]+'.txt', 'w') as outfile:
        json.dump(output, outfile)
    count+=1