# NLP Project

## Resume Analyzer

### Importing Libraries

In [39]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from scipy.spatial.distance import euclidean
import io
import PyPDF2
import pytesseract
from PIL import Image
import pathlib
import docx2txt 
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import pathlib
import os
import spacy
import pdfminer
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
import ocrmypdf
import warnings
warnings.filterwarnings("ignore")


### Importing the Trained Model (trained on the Kaggle Resume Dataset)

In [2]:
# Load the pickled artefacts used by `category`: the classifier, the text
# vectorizer and the label encoder.
# NOTE: unpickling can execute arbitrary code -- only load files from a
# trusted source.
# Each file is opened in a `with` block so its handle is closed right after
# loading instead of being left for the garbage collector.
with open('svc_model', 'rb') as model_file:
    svc = pickle.load(model_file)
with open('word_vectorizer', 'rb') as vectorizer_file:
    word_vectorizer = pickle.load(vectorizer_file)
with open('Label_encoder', 'rb') as encoder_file:
    enc = pickle.load(encoder_file)

#### NLP

In [3]:
# Shared spaCy pipeline (small English model); `extract_experience` and
# `resume_score` call it to tokenise text and find named entities.
nlp = spacy.load('en_core_web_sm')

####  Text Extraction Functions

In [5]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF as one whitespace-flattened string.

    Every page is run through pdfminer's ``PDFPageInterpreter`` and the
    output is collected in an in-memory buffer. If a page raises
    ``PDFSyntaxError`` the file is OCRed in place with ``ocrmypdf`` and the
    page is processed once more.

    Args:
        pdf_path: Path of the PDF to read. The OCR fallback writes its
            result back to this same path.

    Returns:
        The extracted text with every newline and tab replaced by a space.
    """
    resource_manager = PDFResourceManager()
    output_string = io.StringIO()
    converter = TextConverter(resource_manager, output_string)
    try:
        interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(pdf_path, 'rb') as pdf_file:
            for page in PDFPage.get_pages(pdf_file):
                try:
                    interpreter.process_page(page)
                except pdfminer.pdfparser.PDFSyntaxError:
                    # NOTE(review): this rewrites the file while it is still
                    # open for reading and then retries the page object that
                    # came from the pre-OCR file -- confirm this fallback
                    # actually yields the OCRed text.
                    ocrmypdf.ocr(pdf_path, pdf_path, redo_ocr=True)
                    interpreter.process_page(page)
        text = output_string.getvalue()
    finally:
        # Release the converter and buffer even when a page fails to parse.
        converter.close()
        output_string.close()
    # Single pass: newlines and tabs both become spaces.
    return text.translate(str.maketrans({'\n': ' ', '\t': ' '}))


def extract_text_from_doc(doc):
    """Return the text of a .docx file with newlines and tabs turned into spaces.

    Args:
        doc: Path (or file object) handed straight to ``docx2txt.process``.

    Returns:
        The document text as a single flattened string.
    """
    raw_text = docx2txt.process(doc)
    # One translation table maps both whitespace characters to a space.
    whitespace_to_space = str.maketrans({'\n': ' ', '\t': ' '})
    return raw_text.translate(whitespace_to_space)

#### Category Function 

In [6]:
def category(text):
    """Predict the category label for a piece of text.

    The text is vectorised with the module-level ``word_vectorizer``,
    classified with ``svc``, and the prediction is mapped back to its label
    with ``enc``.

    Args:
        text: The text to classify.

    Returns:
        The first (and only) decoded label for ``text``.
    """
    features = word_vectorizer.transform([text])
    encoded_prediction = svc.predict(features)
    decoded_labels = enc.inverse_transform(encoded_prediction)
    return decoded_labels[0]

#### Details Extraction Functions

In [7]:
def applicant_name(docx):
    """Collect the text of every PERSON entity in a parsed document.

    Args:
        docx: An object exposing ``ents``, each with ``label_`` and ``text``
            (e.g. a spaCy ``Doc``).

    Returns:
        A list of entity texts labelled ``PERSON``, in document order;
        empty when there are none.
    """
    return [entity.text for entity in docx.ents if entity.label_ == 'PERSON']

In [8]:
def phone_extract(docx):
    """Return the first number-like token that is at least 10 characters long.

    Args:
        docx: An iterable of tokens exposing ``like_num`` and ``text``
            (e.g. a spaCy ``Doc``).

    Returns:
        The text of the first matching token, or ``None`` if no token
        qualifies.
    """
    candidates = (
        tok.text for tok in docx if tok.like_num and len(tok.text) >= 10
    )
    return next(candidates, None)

In [9]:
def skills_extract(docx):
    """Collect the text of every SKILL entity in a parsed document.

    The previous version returned from inside the loop, so it stopped after
    the first SKILL entity and returned ``None`` when none was present. This
    version always returns the complete list.

    Args:
        docx: An object exposing ``ents``, each with ``label_`` and ``text``
            (e.g. a spaCy ``Doc``).

    Returns:
        A list of entity texts labelled ``SKILL``, in document order; an
        empty list (still falsy, like the old ``None``) when there are none.
    """
    return [ent.text for ent in docx.ents if ent.label_ == "SKILL"]
            

In [10]:
def extract_experience(text):
    """Return the first DATE entity whose text mentions "year".

    Args:
        text: Input passed to the module-level ``nlp`` pipeline.

    Returns:
        The stripped text of the first matching entity, or ``None`` when no
        DATE entity contains "year" (case-insensitive).
    """
    date_mentions = (
        entity.text for entity in nlp(text).ents if entity.label_ == "DATE"
    )
    for mention in date_mentions:
        if "year" in mention.lower():
            return mention.strip()
    return None

In [36]:
def resume_score(resume,Job_desc):
    """Score a resume against a job description and report PASS/FAIL.

    Both texts are parsed with the module-level ``nlp`` pipeline, reduced to
    their lower-cased alphabetic tokens and vectorised with a TF-IDF model
    fitted on the job description only. Five component scores are then
    combined with fixed weights.

    Args:
        resume: Raw resume text.
        Job_desc: Raw job-description text.

    Returns:
        A 7-tuple of strings: result (PASS when the weighted total is >= 60),
        predicted resume category, cosine similarity, Euclidean-distance
        score, skills score, experience score and the weighted total.
    """
    stop_words = list(spacy.lang.en.stop_words.STOP_WORDS)
    resume = nlp(resume)
    # Fix: parse the Job_desc argument. The original parsed the global `job`,
    # so the parameter was ignored (and the call failed if `job` was unset).
    Job_desc = nlp(Job_desc)

    # Keep alphabetic tokens only, lower-cased and re-joined into one string.
    resume_filtered = ' '.join(
        token.text.lower() for token in resume if token.is_alpha
    )
    Job_desc_filtered = ' '.join(
        token.text.lower() for token in Job_desc if token.is_alpha
    )

    ## Cosine similarity (vocabulary comes from the job description only)
    vectorizer = TfidfVectorizer(sublinear_tf=True,stop_words=stop_words)
    vectorizer.fit([Job_desc_filtered])
    J_vector = vectorizer.transform([Job_desc_filtered])
    R_vector = vectorizer.transform([resume_filtered])
    similarity_score = cosine_similarity(J_vector,R_vector)[0][0]*100

    ## Euclidean distance
    # NOTE(review): a larger distance means the texts differ more, yet it is
    # added to the total with a positive weight -- confirm this is intended.
    J_vector_array = J_vector.toarray().ravel()
    R_vector_array = R_vector.toarray().ravel()
    euclidean_distance = euclidean(J_vector_array, R_vector_array)*100

    ## User category
    user_category = category(resume_filtered)

    ## Job vs user category score
    # NOTE(review): this is 0 or 1 while the similarity scores are scaled by
    # 100, so it barely moves the total -- confirm the intended scale.
    job_category = category(Job_desc_filtered)
    category_match_score = 1 if job_category == user_category else 0

    ## Skills score (computed once instead of calling the extractor twice)
    # spaCy's en_core_web_sm is not able to detect skills.
    skills = skills_extract(resume)
    skills_score = len(skills) if skills else 0

    ## Experience score (computed once; this is the character length of the
    ## matched text, not a number of years)
    experience = extract_experience(resume)
    experience_score = len(experience) if experience else 0

    ## Weights for each component:
    ## w1 --> cosine similarity 45%
    ## w2 --> Euclidean distance 45%
    ## w3 --> category match 5%
    ## w4 --> skills score 2.5%
    ## w5 --> experience score 2.5%
    w1=0.45
    w2=0.45
    w3=0.05
    w4=0.025
    w5=0.025
    total_scores = (w1*similarity_score)+(w2*euclidean_distance)+(w3*category_match_score)+(w4*skills_score)+(w5*experience_score)
    if (total_scores >= 60):
        result = 'PASS'
    else:
        result = 'FAIL'
    return f'RESULT {result}', f'User Category: {user_category}.',f'co similarity:{similarity_score}%',f'E_distance score {euclidean_distance}', f"Skills score {skills_score}", f'Experience scores {experience_score}',f'Total Resume Score {total_scores}'   
            


In [37]:
# Sample job description (comma-separated keywords) used by the demo call below.
job='''machine learning, data science, pandas  , numpy, sql, deep learning, computer vision , data visualisation , python '''

In [44]:
# Demo: score the resume in 'CV.pdf' against `job`; index 1 of the returned
# tuple is the predicted-category string.
resume_score(extract_text_from_pdf('CV.pdf'),job)[1]

'User Category: Data Science.'