In [None]:
import spacy
import sys
import fitz
import math
import threading
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from dateutil import parser
from datetime import datetime
from sentence_transformers import SentenceTransformer, util
from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

Change the `base` path below to match the location of your local project checkout.

In [None]:

# Root directory of the project checkout; the spaCy model paths used by the
# loader functions are built by appending to this string.
base = "D:/Git/NLP-Resume-Parser"

In [None]:
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        file_path: Path of the PDF document to open with PyMuPDF (``fitz``).

    Returns:
        The text of all pages joined in page order, or ``None`` when any
        error occurs while opening or reading the file (the error is printed).
    """
    try:
        with fitz.open(file_path) as pdf_document:
            page_texts = [
                pdf_document[index].get_text()
                for index in range(pdf_document.page_count)
            ]
        return "".join(page_texts)
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return None


def parse_entities_to_dict(doc):
    """Group the entity texts of a parsed document by entity label.

    Args:
        doc: Object exposing ``ents``; each entity must provide ``label_``
            and ``text`` (e.g. a spaCy ``Doc``).

    Returns:
        A ``defaultdict(list)`` mapping each label to its entity texts in
        document order. Every text is suffixed with a single space.
    """
    grouped = defaultdict(list)
    for entity in doc.ents:
        bucket = grouped[entity.label_]
        bucket.append(entity.text + " ")
    return grouped

In [None]:
 
def find_and_extract_lines(file, target_word):
    """Collect the lines that follow a heading word on each page of a PDF.

    On every page, the first line that contains ``target_word`` as a
    space-delimited word (case-insensitive) is located and up to ten lines
    after it are collected. Only the first match per page is used.

    Args:
        file: Path of the PDF document to open with PyMuPDF (``fitz``).
        target_word: Word to look for, e.g. ``"skills"``.

    Returns:
        A flat list with the collected lines of all pages (empty when the
        word never appears).
    """
    needle = f' {target_word.lower()} '
    result_lines = []

    # The context manager releases the document even if extraction fails;
    # previously the handle was never closed.
    with fitz.open(file) as doc:
        for page_number in range(doc.page_count):
            lines = doc[page_number].get_text("text").split('\n')

            for i, line in enumerate(lines):
                # Padding the line with spaces lets a word at the very start
                # or end of the line match too.
                if needle in f' {line.lower()} ':
                    # Keep the next 10 lines, then continue with the next page.
                    result_lines.extend(lines[i + 1:i + 11])
                    break

    return result_lines


In [None]:
def loadResumeModel(file_path):
    """Run the resume NER model on a CV file and return its entities.

    Args:
        file_path: Path of the CV. Only names ending in ``.pdf`` are supported
            (``.docx`` support is not implemented).

    Returns:
        The ``defaultdict(list)`` built by ``parse_entities_to_dict``:
        entity label -> list of entity texts.

    Raises:
        SystemExit: If ``file_path`` does not end in ``.pdf``.
        ValueError: If no text could be extracted from the PDF.
    """
    # NOTE: the spaCy pipeline is loaded from disk on every call.
    nlp = spacy.load(base+'/models/resumeModel/output/model-best')

    if not file_path.endswith('.pdf'):
        print("Unsupported file format")
        sys.exit(1)

    text = extract_text_from_pdf(file_path)
    if text is None:
        # extract_text_from_pdf signals failure with None; fail here with a
        # clear message instead of handing None to the pipeline.
        raise ValueError(f"Could not extract text from PDF: {file_path}")

    doc = nlp(text)
    resume_entities = parse_entities_to_dict(doc)
    print('resume_entities-------------------------------------------------------')
    print(resume_entities)
    return resume_entities

In [None]:
def loadJdModel(file_path):
    """Run the job-description NER model on a JD file and return its entities.

    Args:
        file_path: Path of the job description. Only names ending in ``.pdf``
            are supported.

    Returns:
        The ``defaultdict(list)`` built by ``parse_entities_to_dict``:
        entity label -> list of entity texts.

    Raises:
        SystemExit: If ``file_path`` does not end in ``.pdf``.
        ValueError: If no text could be extracted from the PDF.
    """
    # NOTE: the spaCy pipeline is loaded from disk on every call.
    jdModel = spacy.load(base+'/models/JdModel/output/model-best')

    if not file_path.endswith('.pdf'):
        print("Unsupported file format")
        sys.exit(1)

    text = extract_text_from_pdf(file_path)
    if text is None:
        # extract_text_from_pdf signals failure with None; fail here with a
        # clear message instead of handing None to the pipeline.
        raise ValueError(f"Could not extract text from PDF: {file_path}")

    jd_doc = jdModel(text)

    # Uses the module-level helper; the nested copy that used to live here
    # was an exact duplicate of it.
    jd_entities = parse_entities_to_dict(jd_doc)
    print('jd_entities-------------------------------------------------------')
    print(jd_entities)
    return jd_entities


In [None]:
# Initialize the Sentence Transformer model once at module level so that
# sentence_transformer_similarity() reuses the same instance on every call.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

def sentence_transformer_similarity(sentence1, sentence2):
    """Cosine similarity between the embeddings of two sentences.

    Both sentences are encoded with the module-level ``sentence_model`` and
    compared with ``util.pytorch_cos_sim``.

    Args:
        sentence1: First text to embed.
        sentence2: Second text to embed.

    Returns:
        The similarity as a plain Python number (via ``.item()``).
    """
    first_vec, second_vec = (
        sentence_model.encode(sentence, convert_to_tensor=True)
        for sentence in (sentence1, sentence2)
    )
    return util.pytorch_cos_sim(first_vec, second_vec).item()

def compute_tfidf_similarity(text1, text2):
    """Cosine similarity of two texts under a TF-IDF model fitted on just them.

    Args:
        text1: First document.
        text2: Second document.

    Returns:
        The single similarity value taken from the 1x1 result matrix.
    """
    tfidf = TfidfVectorizer().fit_transform([text1, text2])
    first_row, second_row = tfidf[0:1], tfidf[1:2]
    return cosine_similarity(first_row, second_row)[0][0]

def preprocess_text(text):
    """Lower-case ``text`` after collapsing each non-alphanumeric run to a space.

    Args:
        text: String to normalize.

    Returns:
        The normalized string; leading/trailing spaces produced by the
        replacement are kept.
    """
    collapsed = re.sub(r'[^A-Za-z0-9]+', ' ', text)
    return collapsed.lower()

In [None]:
def preprocess_skills(skills):
    """Split and normalize raw skill strings into a set of skill names.

    Each entry is split on ``|``, ``,`` and the word `` and ``. Every fragment
    then has non-alphanumeric runs replaced by a space, is stripped and
    lower-cased. Empty fragments are dropped.

    Args:
        skills: Iterable of raw skill strings.

    Returns:
        A ``set`` of normalized skill names.
    """
    fragments = (
        fragment
        for entry in skills
        for fragment in re.split(r'\||,| and ', entry)
    )
    normalized = (
        re.sub(r'[^A-Za-z0-9]+', ' ', fragment).strip().lower()
        for fragment in fragments
    )
    return {skill for skill in normalized if skill}


def skill_matching(jd_skills, cv_skills):
    """Semantic similarity between the JD's skills and the CV's skills.

    Both skill lists are normalized with ``preprocess_skills``, joined into
    one string each and compared with ``sentence_transformer_similarity``.

    Args:
        jd_skills: Raw skill strings from the job description.
        cv_skills: Raw skill strings from the CV.

    Returns:
        The similarity score, or ``0.0`` when either side has no usable
        skill after normalization.
    """
    jd_skills_processed = preprocess_skills(jd_skills)
    cv_skills_processed = preprocess_skills(cv_skills)

    # With nothing to compare on one side there is no match; avoid scoring
    # an empty string against the other side.
    if not jd_skills_processed or not cv_skills_processed:
        return 0.0

    # Sets of strings iterate in hash order, which can differ between
    # interpreter runs; sorting makes the joined text (and therefore the
    # score) reproducible.
    jd_skills_text = ' '.join(sorted(jd_skills_processed))
    cv_skills_text = ' '.join(sorted(cv_skills_processed))

    return sentence_transformer_similarity(jd_skills_text, cv_skills_text)

In [None]:
def compute_experience_months(experience_list):
    """Sum the durations described by a list of experience strings, in months.

    Each entry is lower-cased and stripped, then interpreted as one of:

    * a duration containing "year" and/or "month", e.g. "1 year 5 months",
      "3+ years", "1.5 years";
    * a bare integer, which is taken as a number of years;
    * anything else is handed to ``parse_dates``: two dates count as a
      start/end range, a single date counts as a start date running until
      the current month.

    Args:
        experience_list: Iterable of experience strings.

    Returns:
        Total number of months as an ``int``.
    """
    # A number with an optional decimal part, an optional "+", then the unit.
    # Without the "+"/decimal handling "3+ years" counted as 0 and
    # "1.5 years" as 5 years.
    year_pattern = r'(\d+(?:\.\d+)?)\s*\+?\s*year'
    month_pattern = r'(\d+(?:\.\d+)?)\s*\+?\s*month'

    total_months = 0
    for exp in experience_list:
        exp = exp.lower().strip()

        if 'year' in exp or 'month' in exp:
            year_match = re.search(year_pattern, exp)
            month_match = re.search(month_pattern, exp)
            years = float(year_match.group(1)) if year_match else 0
            months = float(month_match.group(1)) if month_match else 0
            total_months += round(12 * years + months)
        elif re.fullmatch(r'\d+', exp):
            # Only digits: assume years. fullmatch (not match) so that
            # strings such as "2019 - 2021" fall through to date parsing
            # instead of crashing in int().
            total_months += 12 * int(exp)
        else:
            dates = parse_dates(exp)
            if len(dates) == 2:
                start_date, end_date = dates
                total_months += (end_date.year - start_date.year) * 12 + end_date.month - start_date.month
            elif len(dates) == 1:
                start_date = dates[0]
                current_date = datetime.now().date()
                total_months += (current_date.year - start_date.year) * 12 + current_date.month - start_date.month
    return total_months


def parse_dates(date_string):
    """Extract date-like tokens from a string and parse them with dateutil.

    Args:
        date_string: Free text such as "Jan 2020 - Mar 2021".

    Returns:
        A list of ``datetime.date`` objects in order of appearance. Tokens
        that match the pattern but cannot be parsed are skipped.
    """
    date_pattern = r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?|\d{1,4})[\s./-]*(?:[0-3]?[0-9],?[\s./-]*\d{2,4})?\b'

    parsed_dates = []
    # IGNORECASE: compute_experience_months lower-cases its input before
    # calling this function, so the capitalized month names in the pattern
    # would otherwise never match.
    for token in re.findall(date_pattern, date_string, flags=re.IGNORECASE):
        try:
            parsed_dates.append(parser.parse(token).date())
        except (ValueError, OverflowError):
            # The token looked like a date but dateutil could not interpret
            # it; skip it rather than abort the whole computation.
            continue
    return parsed_dates


def compute_relevant_experience_similarity(jd, cv):
    """Score how well the CV's work history fits the JD's job post.

    The CV job title most similar to the JD's first ``JOBPOST`` entry gives a
    relevance score. The experience entry at the same position as that title
    is compared with the JD's required experience.

    Args:
        jd: Mapping of JD entity label -> list of texts. Reads ``JOBPOST``
            and ``EXPERIENCE``.
        cv: Mapping of CV entity label -> list of texts. Reads ``WORKED AS``
            and ``YEARS OF EXPERIENCE``.

    Returns:
        ``0.75 * title relevance + 0.25 * experience ratio``. The experience
        ratio is capped at 1 and is 0 when the CV lists no experience.
    """
    job_posts = jd.get('JOBPOST') or []
    jd_job_post = job_posts[0].lower() if job_posts else ""
    jd_experience_required = compute_experience_months(jd.get('EXPERIENCE') or [])

    cv_experience = cv['YEARS OF EXPERIENCE'] if 'YEARS OF EXPERIENCE' in cv else []
    experience_present = any(cv_experience)

    highest_relevance_score = 0
    most_relevant_experience = 0

    # Indexing (rather than .get) is kept on purpose: when cv is the
    # defaultdict built by parse_entities_to_dict this records an empty
    # 'WORKED AS' entry exactly as before.
    job_titles = cv['WORKED AS']

    # Without a job post there is nothing to compare titles against.
    if jd_job_post:
        for idx, job_title in enumerate(job_titles):
            relevance_score = sentence_transformer_similarity(jd_job_post, job_title)

            if relevance_score > highest_relevance_score:
                highest_relevance_score = relevance_score
                if experience_present and idx < len(cv_experience):
                    most_relevant_experience = compute_experience_months([cv_experience[idx]])
                else:
                    # No experience entry lines up with this title; do not
                    # keep a value that belongs to a less relevant job.
                    most_relevant_experience = 0

    if experience_present and jd_experience_required > 0:
        covered = max(min(most_relevant_experience, jd_experience_required), 0)
        experience_similarity = covered / jd_experience_required
    elif experience_present:
        # No measurable requirement (0 months): treat it as met instead of
        # dividing by zero.
        experience_similarity = 1.0
    else:
        experience_similarity = 0

    return (0.75 * highest_relevance_score) + (0.25 * experience_similarity)




In [None]:

def compute_degree_similarity(jd_degree, cv_degree):
    """Semantic similarity between the first JD degree and the first CV degree.

    Args:
        jd_degree: List of degree strings from the job description.
        cv_degree: List of degree strings from the CV.

    Returns:
        The sentence-transformer similarity of the two normalized degree
        strings, or ``0.0`` when either list is empty.
    """
    # Guard against empty lists; indexing [0] would raise IndexError.
    if not jd_degree or not cv_degree:
        return 0.0

    jd_degree_processed = preprocess_text(jd_degree[0])
    cv_degree_processed = preprocess_text(cv_degree[0])
    return sentence_transformer_similarity(jd_degree_processed, cv_degree_processed)

In [None]:

def compute_language_requirement_similarity(jd, cv):
    """Weighted TF-IDF similarity between JD and CV language entities.

    Args:
        jd: Mapping of JD entity label -> list of texts. Reads ``LANGUAGE``.
        cv: Mapping of CV entity label -> list of texts. Reads ``LANGUAGE``.

    Returns:
        ``0.15 * similarity``, or ``0`` when either side lists no language.
    """
    jd_languages = jd.get('LANGUAGE')
    cv_languages = cv.get('LANGUAGE')
    if not jd_languages or not cv_languages:
        return 0

    # The entity values are lists of strings. Join them into one text per
    # side before normalizing; preprocess_text expects a string and the
    # TF-IDF helper expects two strings.
    jd_language = preprocess_text(' '.join(jd_languages))
    cv_language = preprocess_text(' '.join(cv_languages))
    if not jd_language.strip() or not cv_language.strip():
        return 0

    return 0.15 * compute_tfidf_similarity(jd_language, cv_language)

In [None]:

def compute_awards_certifications_similarity(jd, cv):
    """Bonus for a CV award or certification that is relevant to the job post.

    Every entry of ``cv['AWARDS']`` and ``cv['CERTIFICATION']`` is compared
    with the JD's first ``JOBPOST`` entry using
    ``sentence_transformer_similarity``.

    Args:
        jd: Mapping of JD entity label -> list of texts. Reads ``JOBPOST``.
        cv: Mapping of CV entity label -> list of texts.

    Returns:
        ``0.1`` if any award/certification scores above 0.5, otherwise ``0``
        (also when the JD has no job post).
    """
    job_posts = jd.get('JOBPOST')
    if not job_posts:
        # Nothing to compare against; indexing [0] would raise IndexError.
        return 0
    jd_text = job_posts[0].lower()

    # Awards first, then certifications, as one sequence.
    credentials = list(cv['AWARDS']) + list(cv['CERTIFICATION'])
    for credential in credentials:
        # The first hit above the threshold settles the result, so the
        # remaining entries need not be encoded.
        if sentence_transformer_similarity(jd_text, credential.lower()) > 0.5:
            return 0.1

    return 0


In [None]:

def enhanced_similarity_score(jd, cv):
    """Combine the component similarities into one normalized score.

    Components and weights: skills 0.4, experience 0.4, degree 0.2,
    language 0.15 (each only when the JD has that entity), plus a flat 0.1
    awards/certifications bonus. The weighted sum is divided by the sum of
    the weights of the components that were included.

    Args:
        jd: Mapping of JD entity label -> list of texts.
        cv: Mapping of CV entity label -> list of texts.

    Returns:
        Tuple ``(normalized_score, scoreDetail)`` where ``scoreDetail`` is a
        human-readable breakdown of each component.
    """
    total_score = 0
    total_weight = 0
    scoreDetail = "\nScore Details ---->\n"

    # Skills similarity
    if jd.get('SKILLS'):
        jd_skills = jd['SKILLS']
        cv_skills = cv.get('SKILLS', [])
        skill_similarity = skill_matching(list(jd_skills), list(cv_skills))

        total_score += 0.4 * skill_similarity
        total_weight += 0.4
        scoreDetail += f"\nSkills similarity: {skill_similarity} \nSkills Contribution to Total Score: {0.4 * skill_similarity}\n"

    # Experience similarity
    if jd.get('EXPERIENCE'):
        experience_similarity = compute_relevant_experience_similarity(jd, cv)

        total_score += 0.4 * experience_similarity
        total_weight += 0.4
        scoreDetail += f"\nExperience similarity: {experience_similarity} \nExperience Contribution to Total Score: {0.4 * experience_similarity}\n"

    # Degree similarity
    if jd.get('DEGREE'):
        jd_degree = jd['DEGREE']
        cv_degree = cv.get('DEGREE', [''])
        degree_similarity = compute_degree_similarity(jd_degree, cv_degree)

        total_score += 0.2 * degree_similarity
        total_weight += 0.2
        scoreDetail += f"\nDegree similarity: {degree_similarity} \nDegree Contribution to Total Score: {0.2 * degree_similarity}\n"

    # Language requirement similarity (the helper already applies the 0.15 weight)
    if jd.get('LANGUAGE'):
        language_requirement_similarity = compute_language_requirement_similarity(jd, cv)

        total_score += language_requirement_similarity
        total_weight += 0.15
        scoreDetail += f"\nLanguage similarity: {language_requirement_similarity} \nLanguage Contribution to Total Score: {language_requirement_similarity}\n"

    # Awards and certifications bonus; its weight only counts when granted.
    awards_certifications_bonus = compute_awards_certifications_similarity(jd, cv)
    total_score += awards_certifications_bonus
    total_weight += 0.1 if awards_certifications_bonus else 0

    # The bonus is added to the total unscaled, so the reported contribution
    # is the bonus itself (it used to be shown multiplied by 0.1).
    scoreDetail += f"\nAwards/Certifications similarity: {awards_certifications_bonus} \nAwards/Certifications Contribution to Total Score: {awards_certifications_bonus}\n"

    # Normalize the score based on the weights of the components included
    normalized_score = total_score / total_weight if total_weight else 0

    return normalized_score, scoreDetail




In [None]:
def transform_score(final_score):
    """Map a raw score onto a 0-100 scale with a logistic (sigmoid) curve.

    The score is first multiplied by 100, then passed through a sigmoid
    centred on 50 with slope 0.1.

    Args:
        final_score: Raw similarity score.

    Returns:
        The transformed score rounded to three decimals.
    """
    percent = final_score * 100
    exponent = -0.1 * (percent - 50)
    return round(100 / (1 + math.exp(exponent)), 3)

In [None]:

def calculatMatch(jd_entities, resume_entities):
    """Score a CV against a JD and build the textual report.

    Args:
        jd_entities: Mapping of JD entity label -> list of texts.
        resume_entities: Mapping of CV entity label -> list of texts.

    Returns:
        Tuple ``(cv_content, cvmatch, final_score)``: the report text, the
        match-quality label and the transformed score.
    """
    final_score, scoreDetail = enhanced_similarity_score(jd_entities, resume_entities)
    final_score = transform_score(final_score)

    # Bands: (40, 60] good, (60, 85] very good, above 85 excellent.
    if final_score > 85.0:
        cvmatch = 'Excellent Match'
    elif final_score > 60.0:
        cvmatch = 'Very Good Match'
    elif final_score > 40.0:
        cvmatch = 'Good Match'
    else:
        cvmatch = 'Poor Match'

    cv_content = "\n<----------------------------------------->\n\nCV Details ---->\n\n"

    # One line per entity label; labels with no values are reported as "none".
    for key, values in resume_entities.items():
        listed = ', '.join([value.strip() for value in values]) if values else "none"
        cv_content += "\t-> " + key.upper() + ":  " + listed + "\n"

    cv_content += "\n<-----------------------------------------> \n\n"
    cv_content = cv_content + scoreDetail
    cv_content = f"\nFinal Enhanced Similarity Score: {final_score} \nMatch Quality: {cvmatch}\n\n" + cv_content

    # NAME may be missing or empty; fall back to a placeholder instead of
    # failing on None[0] / [][0].
    names = resume_entities.get('NAME')
    name = names[0].replace('\xa0', ' ') if names else 'Unknown'
    cv_content = f"\nNAME --> {name}\n" + cv_content

    return cv_content, cvmatch, final_score

In [None]:
def skillsPreProcess(cv, cv_file):
    """Merge every source of CV skills into ``cv['SKILLS']``.

    Sources: the lines following a "skills" heading in the PDF (via
    ``find_and_extract_lines``) and the ``SKILLS`` / ``Skills`` entity lists.
    Duplicates are removed and the ``Skills`` key is dropped.

    Args:
        cv: Mapping of CV entity label -> list of texts; modified in place.
        cv_file: Path of the CV PDF.

    Returns:
        The same ``cv`` mapping.
    """
    upper_key_skills = cv.get('SKILLS', [])
    mixed_key_skills = cv.get('Skills', [])

    # Lines found under the "skills" heading of the PDF.
    heading_lines = list(find_and_extract_lines(cv_file, "skills"))

    unique_skills = set(heading_lines + upper_key_skills + mixed_key_skills)
    cv['SKILLS'] = list(unique_skills)

    # Drop the alternative spelling of the key if it exists.
    cv.pop('Skills', None)

    return cv


In [None]:
def extract_first_two_words_from_first_line(file):
    """Return the first two words of a PDF's first page.

    Note that, despite the name, the words are the first two
    whitespace-separated tokens of the whole first-page text, not
    specifically of its first line.

    Args:
        file: Path of the PDF document to open with PyMuPDF (``fitz``).

    Returns:
        A one-element list with the two words joined by a space, or an empty
        list when the document has no pages.
    """
    result_lines = []

    # The context manager closes the document; previously the handle was
    # never released.
    with fitz.open(file) as doc:
        if doc.page_count > 0:
            first_page_text = doc[0].get_text("text")
            first_words = first_page_text.split()[0:2]
            result_lines.append(' '.join(first_words))

    return result_lines

In [None]:
def namePreProcess(cv, cv_file):
    """Ensure the CV mapping has a ``NAME`` entry.

    When ``NAME`` is missing or empty, it is filled with the first two words
    of the PDF's first page.

    Args:
        cv: Mapping of CV entity label -> list of texts; modified in place.
        cv_file: Path of the CV PDF, used only for the fallback.

    Returns:
        The same ``cv`` mapping.
    """
    # `not` also covers an empty list, which `is None` let through.
    if not cv.get('NAME'):
        cv['NAME'] = extract_first_two_words_from_first_line(cv_file)

    return cv


In [None]:
def checkCv(jd_file, cv_file):
    """Run the full pipeline: extract entities, enrich the CV, score the match.

    Args:
        jd_file: Path of the job-description PDF.
        cv_file: Path of the CV PDF.

    Returns:
        Tuple ``(cv_content, cvmatch, final_score)`` from ``calculatMatch``.
    """
    cv_entities = loadResumeModel(cv_file)
    jd_entities = loadJdModel(jd_file)

    # Enrich the CV entities: skills first, then the name fallback.
    cv_entities = namePreProcess(skillsPreProcess(cv_entities, cv_file), cv_file)

    report, match_label, score = calculatMatch(jd_entities, cv_entities)
    return report, match_label, score

Set `sender_email` and `sender_password` in the cell below to enable sending email.

In [None]:

def sendMail(message, receiver_email='example@mail.com'):
    """Send the screening result by email through Gmail's SMTP server.

    Failures are printed, not raised, so a failed delivery never propagates
    to the caller.

    Args:
        message: Plain-text report to include in the mail body.
        receiver_email: Address the mail is sent to.
    """
    import os  # local import: `os` is not among this file's top-level imports

    # Credentials: the SENDER_EMAIL / SENDER_PASSWORD environment variables
    # take precedence, so real secrets need not be stored in the notebook.
    # The literals below remain as the fallback.
    sender_email = os.environ.get('SENDER_EMAIL', 'example@mail.com----------------------------')
    sender_password = os.environ.get('SENDER_PASSWORD', '******************************************')

    subject = 'Resume Screening output Score'
    body = 'Email from the Resume Screening app\n' + message

    # Separate name so the `message` parameter is not shadowed.
    email_message = MIMEMultipart()
    email_message['From'] = sender_email
    email_message['To'] = receiver_email
    email_message['Subject'] = subject
    email_message.attach(MIMEText(body, 'plain'))

    # Connect to the SMTP server (in this case, Gmail's SMTP server)
    smtp_server = 'smtp.gmail.com'
    smtp_port = 587

    try:
        # The context manager issues QUIT and closes the socket on exit and
        # tolerates an already-dropped connection, so cleanup cannot raise
        # out of this function the way quit() in a finally block could.
        with smtplib.SMTP(smtp_server, smtp_port) as server:
            server.starttls()  # Use TLS for a secure connection
            server.login(sender_email, sender_password)
            server.sendmail(sender_email, receiver_email, email_message.as_string())
            print('Email sent successfully!')
    except Exception as e:
        print(f'Error: {e}')



In [None]:

# Flask application object; static files are exposed under the /static URL path.
app = Flask(__name__, static_url_path='/static')

# Enable cross-origin requests on the app using flask_cors' defaults.
CORS(app, )

@app.route('/')
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page


@app.route('/post_example', methods=['POST'])
def post_example():
    """Score a CV against a JD and optionally email the result.

    Form fields:
        jdFile: Path of the job-description PDF.
        cvFile: Path of the CV PDF.
        threshold: Numeric score above which the result is emailed.
        email: Optional address that receives the result.

    Returns:
        JSON with ``cv_content``, ``match`` and ``score``; or a 400 response
        with an ``error`` key when ``threshold`` is not a number.
    """
    # SECURITY NOTE(review): jdFile and cvFile are file-system paths taken
    # verbatim from the client, so a caller can make the server open any PDF
    # it can read. Consider restricting them to an upload directory.
    jd_file = request.form['jdFile']
    cv_file = request.form['cvFile']

    # .get() makes the address genuinely optional. With form['email'] the
    # value could never be None, so the old `is not None` check always passed.
    email = request.form.get('email')

    # The result is emailed only when the score exceeds this threshold.
    try:
        threshold = float(request.form['threshold'])
    except ValueError:
        return jsonify({'error': 'threshold must be a number'}), 400

    cv_content, cvmatch, final_score = checkCv(jd_file, cv_file)

    message = "File Name - >" + cv_file + "\n" + cv_content

    print(message)

    data = {'cv_content': message, 'match': cvmatch, 'score': final_score}

    if final_score > threshold and email and email.strip():
        # Send in a background thread so the response is not delayed by SMTP.
        email_thread = threading.Thread(target=sendMail, args=(message, email))
        email_thread.start()

    # Send a JSON response
    return jsonify(data)


# Start Flask's built-in server when this code runs as the main module.
# NOTE(review): host='0.0.0.0' presumably exposes the service to other
# machines on the network — confirm that this is intended.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080, debug=False)


Testing API cell

In [None]:
# jd_file = 'C:/Users/awais/Desktop/Ai/ReasumaScreening/models/Examples/JDs/jd.pdf'
# cv_file = 'C:/Users/awais/Desktop/Ai/ReasumaScreening/models/Examples/CVs/python-developer-3.pdf' 
 
# cv_content, cvmatch , final_score = checkCv(jd_file,cv_file)

# message = "File Name - > " + cv_file + "\n" + cv_content
# print('\n\n--message-------------------------------------------------------------------------------------------------------------------------------------------------------------\n')
# print(message)