### NLP Project - Resume Classification

#### Feature Extraction

##### Installed Required Library

In [2]:
#Uncomment & Run to Install
#!pip install pyresparser

In [3]:
#Importing Required Libraries
import numpy as np
import pandas as pd
import spacy
from spacy.matcher import Matcher
from os import listdir
from os.path import isfile, join
import re
#Downloading Stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk.corpus import stopwords

#To Extract Skills
from pyresparser import ResumeParser
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [4]:
#Loading the csv Dataset that we got from Previous EDA Steps
df = pd.read_csv('/content/Resume_df.csv')
#Taking only the 'Text' & 'Clean Text' Column for Now
df = df.iloc[:,1:3]
#df = df.iloc[:,-1:]
df.head()

Unnamed: 0,Text,Clean Text
0,Kotani Durga Prasad Objective: Aspirant for a ...,kotani durga prasad objective aspirant positio...
1,Ui-Developer/ React JS Developer NAME: KRISHN...,ui-developer/ react js developer name krishna ...
2,MAREEDU LOKESH BABU PROFESSIONAL OVERVIEW ...,mareedu lokesh babu professional overview arou...
3,KAMALAKAR REDDY. A Linked In: https://www.lin...,kamalakar reddy a linked in http //www.linkedi...
4,Thirupathamma Balla SUMMARY: 2.8 year of IT ex...,thirupathamma balla summary 2.8 year it experi...


##### Name Extraction

In [5]:
# load pre-trained model
nlp = spacy.load('en_core_web_sm')
# initialize matcher with a vocab
matcher = Matcher(nlp.vocab)

#Function to Extract 'Name'
def extract_name(resume_text):
    nlp_text = nlp(resume_text)
    # First name and Last name are always Proper Nouns
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
    matcher.add('NAME', [pattern], None)
    matches = matcher(nlp_text)
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        return span.text

names = []
for i in range(len(df)):
  names.append(extract_name(df['Clean Text'][i]))

In [6]:
#Creating a 'Features' DataFrame
Features = pd.DataFrame(names,columns = ['Names'])
Features.head()

Unnamed: 0,Names
0,kotani durga
1,developer/ react
2,mareedu lokesh
3,kamalakar reddy
4,thirupathamma balla


##### Mobile No Extraction

In [7]:
#Function To Extract Mobile Number
def extract_mobile_number(text):
    phone = re.findall(re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'), text)

    if phone:
        number = ''.join(phone[0])
        if len(number) > 10:
            return '+' + number
        else:
            return number

mobile = []
for i in range(len(df)):
  mobile.append(extract_mobile_number(df['Clean Text'][i]))

In [8]:
#Adding the "Mobile Number" Extracted
Features['Mobile_Number'] = mobile
Features.head()

Unnamed: 0,Names,Mobile_Number
0,kotani durga,9112345678
1,developer/ react,9112345678
2,mareedu lokesh,9112345678
3,kamalakar reddy,777682196
4,thirupathamma balla,9112345678


##### Email ID Extraction

In [9]:
#Function To Extract Email ID
def extract_email(email):
    email = re.findall("[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", email)
    if email:
        try:
            return email[0].split()[0].strip(';')
        except IndexError:
            return None

email = []
for i in range(len(df)):
  email.append(extract_email(df['Text'][i]))

In [10]:
#Adding the 'Email Id' Extracted
Features['Email'] = email
Features.head()

Unnamed: 0,Names,Mobile_Number,Email
0,kotani durga,9112345678,abc@xyz.com
1,developer/ react,9112345678,abc@xyz.com
2,mareedu lokesh,9112345678,abc@xyz.com
3,kamalakar reddy,777682196,abc@xyz.com
4,thirupathamma balla,9112345678,abc@xyz.com


##### Skills Extraction

In [11]:
#Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
#Getting Each Resumes Path
mypath = '/content/drive/MyDrive/Resumes_Dataset'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
#Using Resume Parser Library To Extract Skills
skills = []
for path in onlyfiles:
  skills.append(ResumeParser(f'/content/drive/My Drive/Resumes_Dataset/{path}').get_extracted_data()['skills'])

In [13]:
#Converting the List into an Array of List
skills = np.array(skills).reshape(-1,1)
#Converting into an DF
skills_df = pd.DataFrame(skills,columns = ['Skills'])
skills_df.head()

  


Unnamed: 0,Skills
0,"[Design, Mobile, Website, Javascript, Marketin..."
1,"[Design, Website, Javascript, Ui, Agile, Json,..."
2,"[Design, Website, Javascript, Windows, Invento..."
3,"[Design, Data collection, Javascript, Ui, Wind..."
4,"[Analytical, Testing, Engineering, Json, Sql, ..."


In [14]:
#Adding the "Skills" Extracted to Our "Features"
Features['Skills'] = skills_df['Skills']
Features.head()

Unnamed: 0,Names,Mobile_Number,Email,Skills
0,kotani durga,9112345678,abc@xyz.com,"[Design, Mobile, Website, Javascript, Marketin..."
1,developer/ react,9112345678,abc@xyz.com,"[Design, Website, Javascript, Ui, Agile, Json,..."
2,mareedu lokesh,9112345678,abc@xyz.com,"[Design, Website, Javascript, Windows, Invento..."
3,kamalakar reddy,777682196,abc@xyz.com,"[Design, Data collection, Javascript, Ui, Wind..."
4,thirupathamma balla,9112345678,abc@xyz.com,"[Analytical, Testing, Engineering, Json, Sql, ..."


##### Education Extraction

In [16]:
# load pre-trained model
nlp = spacy.load('en_core_web_sm')
# Grad all general stop words
STOPWORDS = set(stopwords.words('english'))
# Education Degrees
EDUCATION = [
            'BE','B.E.', 'B.E', 'BS','Bachelor of Technology','Senior Secondary' 'B.S', 'B.SC', 'B E', 'B. E.','B. E','B S','B. S','B. SC'
            'ME', 'M.E', 'M.E.', 'MS', 'M.S', 'B-TECH','M-TECH','M E', 'M. E','B.COM','B.ED','L.L.B.','LLB','LLM','L.L.M.', 'M. E.', 'M S', 'M. S',
            'BTECH', 'B.TECH', 'M.TECH', 'MTECH','B TECH', 'B. TECH', 'M. TECH', 'M TECH',
            'B. TECH','M. TECH','B TECH','M TECH','M.D.','NDA','N.D.A.','PHD','PGDM','P.G.D.M.' 'MBA','M.B.A.','MCA','M.C.A.','MS','M.S.','MD',
            'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII',
            'BBA','B.B.A.','BCA','B.C.A.','BA','B.A.', 
        ]


education = []
#Function to Extract Education
def extract_education(resume_text):
    nlp_text = nlp(resume_text)
    # Sentence Tokenizer
    nlp_text = [sent.text.strip() for sent in nlp_text.sents]

    edu = {}
    # Extract education degree
    for index, text in enumerate(nlp_text):
        for tex in text.split():
            # Replace all special symbols
            tex = re.sub(r'[?|$|.|!|,]', r'', tex)
            if tex.upper() in EDUCATION and tex not in STOPWORDS:
                edu[tex] = text + nlp_text[index]   

    # Extract year
    education = []
    if edu:
        for key in edu.keys():
            year = re.search(re.compile(r'(((20|19)(\d{2})))'), edu[key])  
            if year:
                education.append((key, ''.join(year[0])))
            else:
                education.append(key)
        return education
    else:
        EDU_PATTERN = re.findall(r'Bachelors? \D+|Masters? \D+',resume_text)   
        return EDU_PATTERN
        

# Calling function
for text in df['Text']:
    education.append(extract_education(text))

In [17]:
#Adding Education Extracted
Features['Education'] = education
Features.head()

Unnamed: 0,Names,Mobile_Number,Email,Skills,Education
0,kotani durga,9112345678,abc@xyz.com,"[Design, Mobile, Website, Javascript, Marketin...",[BTech]
1,developer/ react,9112345678,abc@xyz.com,"[Design, Website, Javascript, Ui, Agile, Json,...",[B-Tech]
2,mareedu lokesh,9112345678,abc@xyz.com,"[Design, Website, Javascript, Windows, Invento...",[BTech]
3,kamalakar reddy,777682196,abc@xyz.com,"[Design, Data collection, Javascript, Ui, Wind...",[]
4,thirupathamma balla,9112345678,abc@xyz.com,"[Analytical, Testing, Engineering, Json, Sql, ...","[(SSC, 2014)]"


##### Years of Experience Extraction

In [18]:
#Function to Extract Years of Experience
def extract_years_of_experience(text):
    lines = sent_tokenize(text)           
    experience = []
    for sentence in lines:
        if re.search('experience',sentence.lower()):        
            sen_tokenized = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(sen_tokenized)            
            entities = nltk.chunk.ne_chunk(tagged)          
            for subtree in entities.subtrees():
                for leaf in subtree.leaves():
                    if leaf[1] == 'CD':                     
                        experience.append(leaf[0])
                        
    exp = []
    for ele in experience:
        if len(ele) <= 3 or (len(ele) <= 4 and ele[-1] == '0' 
                                and ele not in ('2020','2010','2000')):       
            exp.append(ele)
    if exp:
        return exp[0]
    else:
        return np.nan

years_of_experience = []
for i in range(len(df)):
    years_of_experience.append(extract_years_of_experience(df['Text'][i]))

In [19]:
#Adding "Years of Experience" Extracted
Features['Years_of_Experience'] = years_of_experience
Features.head()

Unnamed: 0,Names,Mobile_Number,Email,Skills,Education,Years_of_Experience
0,kotani durga,9112345678,abc@xyz.com,"[Design, Mobile, Website, Javascript, Marketin...",[BTech],3.1
1,developer/ react,9112345678,abc@xyz.com,"[Design, Website, Javascript, Ui, Agile, Json,...",[B-Tech],3.2
2,mareedu lokesh,9112345678,abc@xyz.com,"[Design, Website, Javascript, Windows, Invento...",[BTech],2.0
3,kamalakar reddy,777682196,abc@xyz.com,"[Design, Data collection, Javascript, Ui, Wind...",[],3.0
4,thirupathamma balla,9112345678,abc@xyz.com,"[Analytical, Testing, Engineering, Json, Sql, ...","[(SSC, 2014)]",2.8


##### Links Extraction 

In [20]:
links = []
def extract_links(text):
    x = re.findall(r'(https://(www.)?[a-z]+.com(/in)?/[a-z0-9]+)',text)   
    links = []
    for strings in x:
        for link in strings:
            if len(link) > 4:
                links.append(link)
    return links

for i in df['Text']:
    links.append(extract_links(text))

In [21]:
#Adding "Links" Extracted
Features['links'] = links
Features.head()

Unnamed: 0,Names,Mobile_Number,Email,Skills,Education,Years_of_Experience,links
0,kotani durga,9112345678,abc@xyz.com,"[Design, Mobile, Website, Javascript, Marketin...",[BTech],3.1,"[https://www.linkedin.com/fake, https://www.gi..."
1,developer/ react,9112345678,abc@xyz.com,"[Design, Website, Javascript, Ui, Agile, Json,...",[B-Tech],3.2,"[https://www.linkedin.com/fake, https://www.gi..."
2,mareedu lokesh,9112345678,abc@xyz.com,"[Design, Website, Javascript, Windows, Invento...",[BTech],2.0,"[https://www.linkedin.com/fake, https://www.gi..."
3,kamalakar reddy,777682196,abc@xyz.com,"[Design, Data collection, Javascript, Ui, Wind...",[],3.0,"[https://www.linkedin.com/fake, https://www.gi..."
4,thirupathamma balla,9112345678,abc@xyz.com,"[Analytical, Testing, Engineering, Json, Sql, ...","[(SSC, 2014)]",2.8,"[https://www.linkedin.com/fake, https://www.gi..."


##### University Extraction

In [22]:
#Loading the csv which contains List of University
univ = pd.read_csv('/content/List_of_universities.csv')
univ.head()

Unnamed: 0,University Name
0,Abhilashi University
1,Acharya N.G. Ranga Agricultural University
2,Acharya Nagarjuna University
3,Adamas University
4,Adesh University


In [23]:
#Funtion to get University Names
fun_uni = []

def get_uni_names(text): 
    University_Names = []
    lines = sent_tokenize(text)          
    for sentence in lines:                             
            if re.search('university',sentence.lower()) or re.search('education',sentence.lower()) or re.search('qualifications',sentence.lower()): # Search for words like education in sentence
                if re.search(r'\s{3,5}',sentence):  
                    sens = re.split(r'\s{3,5}', sentence)
                    for j in sens:
                        if re.search('university',j.lower()) or re.search('education',j.lower()) or re.search('qualifications',j.lower()):  # Search for keywords like education in splitted sentences
                            edu_sen = j
                            for k in univ['University Name']:  # Loop to run through each university name
                                if re.search(k.lower(),edu_sen.lower()):
                                    University_Names.append(k)   # If university is found, append it.
                                    

                else:                               # If there are no 3-5 spaces found, then run following
                    edu_sen = sentence
                    for k in univ['University Name']:      # Loop to run through each university name
                        if re.search(k.lower(),edu_sen.lower()):
                            University_Names.append(k)           # If university is found, append it.
            

                if University_Names:
                    continue
                else:
                    # print(sentence)
                    University_Names.append(sentence)    # If particular university name not found, append whole sentence with keywords like education.

    # Remove duplicates    
    for j in range(len(University_Names)-1):
        if University_Names[j] == University_Names[j+1]:
            del University_Names[j]
    
    return University_Names

for i in range(len(df)):
    fun_uni.append(get_uni_names(df['Text'][i]))

In [24]:
#Adding Universities Extracted
Features['Universities_Names'] = fun_uni
Features.head(10)

Unnamed: 0,Names,Mobile_Number,Email,Skills,Education,Years_of_Experience,links,Universities_Names
0,kotani durga,9112345678,abc@xyz.com,"[Design, Mobile, Website, Javascript, Marketin...",[BTech],3.1,"[https://www.linkedin.com/fake, https://www.gi...",[Education Details: B.Tech (Computer Science ...
1,developer/ react,9112345678,abc@xyz.com,"[Design, Website, Javascript, Ui, Agile, Json,...",[B-Tech],3.2,"[https://www.linkedin.com/fake, https://www.gi...",[QUALIFICATION: B-Tech from JNTU-Kakinada Univ...
2,mareedu lokesh,9112345678,abc@xyz.com,"[Design, Website, Javascript, Windows, Invento...",[BTech],2.0,"[https://www.linkedin.com/fake, https://www.gi...",[]
3,kamalakar reddy,777682196,abc@xyz.com,"[Design, Data collection, Javascript, Ui, Wind...",[],3.0,"[https://www.linkedin.com/fake, https://www.gi...",[TITLE : lernbook DESCRIPTION: Fortunapix wor...
4,thirupathamma balla,9112345678,abc@xyz.com,"[Analytical, Testing, Engineering, Json, Sql, ...","[(SSC, 2014)]",2.8,"[https://www.linkedin.com/fake, https://www.gi...",[Education Course Institution Percentage Year ...
5,haripriya battina,919908576,haripriyabattini@gmai.com,"[C, Sql, Javascript, Ui, Windows, Gmail, Infor...","[BTech, (SSC, 2012), MS]",1.0,"[https://www.linkedin.com/fake, https://www.gi...","[WORK EXPERIENCE EDUCATION 1., Andhra Univers..."
6,kanumuru deepak,9112345678,abc@xyz.com,"[Design, Website, Javascript, Apis, Ui, Api, W...","[BTech, SSC]",2.0,"[https://www.linkedin.com/fake, https://www.gi...",[ACADEMIC QUALIFICATIONS: Qualification Instit...
7,developer/ react,9112345678,abc@xyz.com,"[Design, Website, Javascript, Ui, Agile, Json,...",[B-Tech],3.2,"[https://www.linkedin.com/fake, https://www.gi...",[QUALIFICATION: B-Tech from JNTU-Kakinada Univ...
8,sarala madasu,9112345678,abc@xyz.com,"[Debugging, Time management, Research, Usabili...",[],3.0,"[https://www.linkedin.com/fake, https://www.gi...","[Education B.Tech.,CSE Education B.Tech.,CSE 2..."
9,venkatalakshmi pedireddy,9112345678,abc@xyz.com,"[Procurement, Erp, Logistics, Engineering, Sys...","[SSC, (Btech, 2011)]",3.0,"[https://www.linkedin.com/fake, https://www.gi...","[Ltd 05/2018 - Present, Visakapatnam Achievem..."


##### Saving the "Features" DF for Next Step of Model Building

In [25]:
Features.to_csv('Features.csv', index = None)