# Sprouts AI Assignment

Home Assignment: Resume Matching and Scoring System
Please deliver the following:
### 1. Design document explaining your approach to build a resume matching and scoring system
#### for a given job description.The deliverable should include
#### A. Architecture diagram of the overall system
#### B. Component design for the following:
#### a. Job description parser
#### b. Resume parser
#### c. Matching engine
#### d. Scoring algorithm
#### C. Details on how you will extract and match the following attributes between job description and resume:
#### a. Job title
#### b. Location
#### c. Industry/domain
#### d. Education degree
#### e. Technical skills

### 2. Functional code in Python to implement the system described in the design. The code should:
#### a. Take as input a job description and a resume
#### b. Create models for the the job descriptions and the resumes
#### c. Implement similarity scoring using appropriate NLP techniques to match the resume to the job description
#### d. Output a match score from 0 to 100 indicating the suitability of the resume for the job.
### 3. Test cases and validation results for your implementation.
### 4. Deployment documentation explaining how to containerize and deploy the system as a microservice.
### 5. Project repository on Github containing all the above documents, code and test cases.
### Evaluation Criteria:
#### - Quality of design and approach
#### - Correct implementation of techniques
#### - Robustness of code
#### - Deployability of solution
#### - Overall functional matching and scoring system delivered in time
#### We are looking for demonstrable NLP skills, coding quality, and the ability to deliver a functional system end-to-end within the timeline. All the best!

In [34]:
import os
import re
import spacy
import pandas as pd
import pdfplumber
from datetime import datetime
from dateutil import relativedelta
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine
import torch
from datetime import datetime
from dateutil.relativedelta import relativedelta  # Corrected import
import re
from spacy.matcher import Matcher



# Constants and Configuration
PDF_DIRECTORY = 'C:/Users/Acer/Desktop/JD and Resume matcher/Resume/'
JD_DIRECTORY = 'C:/Users/Acer/Desktop/JD and Resume matcher/Job descriptions/'
SKILLS_FILE_PATH = 'C:/Users/Acer/Desktop/JD and Resume matcher/Resume_parser/skills_orignal.txt'
WORLD_UNIVERSITIES_PATH = 'C:/Users/Acer/Desktop/JD and Resume matcher/Resume_parser/world-universities.csv'
TITLES_COMBINED_PATH = 'C:/Users/Acer/Desktop/JD and Resume matcher/Resume_parser/titles_combined.txt'

# Initialize matchers with a vocab
matcher = Matcher(nlp.vocab)
designitionmatcher = Matcher(nlp.vocab)
skillsmatcher = Matcher(nlp.vocab)

# Load titles and skills from external files
with open(TITLES_COMBINED_PATH, "r", encoding='utf-8') as file:
    designation = [line.strip().lower() for line in file]
    for title in designation:
        # Create a pattern for each title
        pattern = [{"LOWER": word} for word in title.split()]  # Splitting title into words
        designitionmatcher.add(title, [pattern]) 

with open(SKILLS_FILE_PATH, "r", encoding='utf-8') as file:
    skill = [line.strip().lower() for line in file]
    for skill_name in skill:
        if skill_name:  # Check if the skill_name is not empty
            # Create a pattern for each skill
            pattern = [{"LOWER": word} for word in skill_name.split()]  # Splitting skill into words
            skillsmatcher.add(skill_name, [pattern])

# Load Spacy Model
nlp = spacy.load('en_core_web_sm')

def convert_pdf_to_txt(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        return ''.join([page.extract_text() for page in pdf.pages if page.extract_text()])

def extract_skills(nlp_text, SKILLS_FILE_PATH):
    tokens = [token.text for token in nlp_text if not token.is_stop]
    noun_chunks = list(nlp_text.noun_chunks)

    # Load skills from a text file
    with open(SKILLS_FILE_PATH, 'r', encoding='utf-8') as file:
        skills = set(line.strip().lower() for line in file.readlines())

    skillset = set()
    # Check for one-grams
    for token in tokens:
        if token.lower() in skills:
            skillset.add(token.capitalize())

    # Check for bi-grams and tri-grams in noun chunks
    for chunk in noun_chunks:
        chunk_text = chunk.text.lower().strip()  # Convert chunk to string and apply lower()
        if chunk_text in skills:
            skillset.add(chunk_text.capitalize())
    
    return skillset

def calculate_experience(resume_text): 
        def correct_year(result):
            if len(result) < 2:
                if int(result) > int(str(date.today().year)[-2:]):
                    result = str(int(str(date.today().year)[:-2]) - 1) + result
                else:
                    result = str(date.today().year)[:-2] + result
            return result

        # try:
        experience = 0
        start_month = -1
        start_year = -1
        end_month = -1
        end_year = -1

        not_alpha_numeric = r'[^a-zA-Z\d]'
        number = r'(\d{2})'

        months_num = r'(01)|(02)|(03)|(04)|(05)|(06)|(07)|(08)|(09)|(10)|(11)|(12)'
        months_short = r'(jan)|(feb)|(mar)|(apr)|(may)|(jun)|(jul)|(aug)|(sep)|(oct)|(nov)|(dec)'
        months_long = r'(january)|(february)|(march)|(april)|(may)|(june)|(july)|(august)|(september)|(october)|(november)|(december)'
        month = r'(' + months_num + r'|' + months_short + r'|' + months_long + r')'
        regex_year = r'((20|19)(\d{2})|(\d{2}))'
        year = regex_year
        start_date = month + not_alpha_numeric + r"?" + year
        
        # end_date = r'((' + number + r'?' + not_alpha_numeric + r"?" + number + not_alpha_numeric + r"?" + year + r')|(present|current))'
        end_date = r'((' + number + r'?' + not_alpha_numeric + r"?" + month + not_alpha_numeric + r"?" + year + r')|(present|current|till date|today))'
        longer_year = r"((20|19)(\d{2}))"
        year_range = longer_year + r"(" + not_alpha_numeric + r"{1,4}|(\s*to\s*))" + r'(' + longer_year + r'|(present|current|till date|today))'
        date_range = r"(" + start_date + r"(" + not_alpha_numeric + r"{1,4}|(\s*to\s*))" + end_date + r")|(" + year_range + r")"

        
        regular_expression = re.compile(date_range, re.IGNORECASE)
        
        regex_result = re.search(regular_expression, resume_text)
        
        while regex_result:
          
          try:
            date_range = regex_result.group()
            # print(date_range)
            # print("*"*100)
            try:
              
                year_range_find = re.compile(year_range, re.IGNORECASE)
                year_range_find = re.search(year_range_find, date_range)
                # print("year_range_find",year_range_find.group())
                                
                # replace = re.compile(r"(" + not_alpha_numeric + r"{1,4}|(\s*to\s*))", re.IGNORECASE)
                replace = re.compile(r"((\s*to\s*)|" + not_alpha_numeric + r"{1,4})", re.IGNORECASE)
                replace = re.search(replace, year_range_find.group().strip())
                # print(replace.group())
                # print(year_range_find.group().strip().split(replace.group()))
                start_year_result, end_year_result = year_range_find.group().strip().split(replace.group())
                # print(start_year_result, end_year_result)
                # print("*"*100)
                start_year_result = int(correct_year(start_year_result))
                if (end_year_result.lower().find('present') != -1 or 
                    end_year_result.lower().find('current') != -1 or 
                    end_year_result.lower().find('till date') != -1 or 
                    end_year_result.lower().find('today') != -1): 
                    end_month = date.today().month  # current month
                    end_year_result = date.today().year  # current year
                else:
                    end_year_result = int(correct_year(end_year_result))


            except Exception as e:
                # logging.error(str(e))
                start_date_find = re.compile(start_date, re.IGNORECASE)
                start_date_find = re.search(start_date_find, date_range)

                non_alpha = re.compile(not_alpha_numeric, re.IGNORECASE)
                non_alpha_find = re.search(non_alpha, start_date_find.group().strip())

                replace = re.compile(start_date + r"(" + not_alpha_numeric + r"{1,4}|(\s*to\s*))", re.IGNORECASE)
                replace = re.search(replace, date_range)
                date_range = date_range[replace.end():]
        
                start_year_result = start_date_find.group().strip().split(non_alpha_find.group())[-1]

                # if len(start_year_result)<2:
                #   if int(start_year_result) > int(str(date.today().year)[-2:]):
                #     start_year_result = str(int(str(date.today().year)[:-2]) - 1 )+start_year_result
                #   else:
                #     start_year_result = str(date.today().year)[:-2]+start_year_result
                # start_year_result = int(start_year_result)
                start_year_result = int(correct_year(start_year_result))

                if date_range.lower().find('present') != -1 or date_range.lower().find('current') != -1:
                    end_month = date.today().month  # current month
                    end_year_result = date.today().year  # current year
                else:
                    end_date_find = re.compile(end_date, re.IGNORECASE)
                    end_date_find = re.search(end_date_find, date_range)

                    end_year_result = end_date_find.group().strip().split(non_alpha_find.group())[-1]

                    # if len(end_year_result)<2:
                    #   if int(end_year_result) > int(str(date.today().year)[-2:]):
                    #     end_year_result = str(int(str(date.today().year)[:-2]) - 1 )+end_year_result
                    #   else:
                    #     end_year_result = str(date.today().year)[:-2]+end_year_result
                    # end_year_result = int(end_year_result)
                    try:
                        end_year_result = int(correct_year(end_year_result))
                    except Exception as e:
                        logging.error(str(e))
                        end_year_result = int(re.search("\d+",correct_year(end_year_result)).group())

            if (start_year == -1) or (start_year_result <= start_year):
                start_year = start_year_result
            if (end_year == -1) or (end_year_result >= end_year):
                end_year = end_year_result

            resume_text = resume_text[regex_result.end():].strip()
            regex_result = re.search(regular_expression, resume_text)
          except Exception as e:
            logging.error(str(e))
            resume_text = resume_text[regex_result.end():].strip()
            regex_result = re.search(regular_expression, resume_text)
            
        return end_year - start_year  # Use the obtained month attribute

def extract_email(text):
    email = re.findall(r"([^@|\s]+@[^@]+\.[^@|\s]+)", text)
    return email[0].split()[0].strip(';') if email else None

def extract_mobile_number(text):
    mob_num_regex = r'''(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)
                        [-\.\s]*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})'''
    phone = re.findall(re.compile(mob_num_regex), text)
    return ''.join(phone[0]) if phone else None

def get_degree(text):
    doc = custom_nlp2(text)
    return [ent.text.replace("\n", " ") for ent in doc.ents if ent.label_ == 'Degree']


def job_designition(text):
    __nlp = nlp(text)
    matches = designitionmatcher(__nlp)
    return list(set([__nlp[start:end].text for match_id, start, end in matches]))

def extract_name(text):
    nlp_text = nlp(text)
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
    matcher.add('NAME', None, pattern)
    matches = matcher(nlp_text)
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        return span.text
    return None

def bert_encode(text, model, tokenizer):
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        output = model(**encoded_input)
    return output.pooler_output[0]

def cosine_similarity(vec1, vec2):
    # Calculate cosine similarity and convert the range from 0 to 100
    return (1 - cosine(vec1, vec2)) * 100


def process_resumes():
    all_extracted_info_resume = []
    for pdf_file in os.listdir(PDF_DIRECTORY):
        if pdf_file.endswith('.pdf'):
            pdf_path = os.path.join(PDF_DIRECTORY, pdf_file)
            resume_text = convert_pdf_to_txt(pdf_path)
            resume_doc = nlp(resume_text)
            
            extracted_info = {
                "File Name": pdf_file,
                "Email": extract_email(resume_text),
                "Phone": extract_mobile_number(resume_text),
                "Name": extract_name(resume_text),
                "Skills": extract_skills(resume_doc,SKILLS_FILE_PATH),
                "Total Experience": calculate_experience(resume_text),
                "Designation": job_designition(resume_text),
                # Add other extracted fields as necessary
            }
            all_extracted_info_resume.append(extracted_info)
    return all_extracted_info_resume

def process_job_descriptions():
    all_extracted_info_jd = []
    for pdf_file in os.listdir(JD_DIRECTORY):
        if pdf_file.endswith('.pdf'):
            pdf_path = os.path.join(JD_DIRECTORY, pdf_file)
            jd_text = convert_pdf_to_txt(pdf_path)
            jd_doc = nlp(jd_text)
            
            expected_roles = job_designition(jd_text)

            jd_info = {
                "File Name": pdf_file,
                "skills_required": extract_skills(jd_doc, SKILLS_FILE_PATH),
                "expected_role": expected_roles
            }
            all_extracted_info_jd.append(jd_info)
    return all_extracted_info_jd


def match_resumes_with_jds(resumes, job_descriptions):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    resume_embeddings = [bert_encode(" ".join(resume["Skills"]) + " " + str(resume["Total Experience"]) + " " + " ".join(resume["Designation"]), model, tokenizer) for resume in resumes]
    jd_embeddings = [bert_encode(" ".join(jd.get("skills_required", [])) + " " + " ".join(jd.get("expected_role", [])),model, tokenizer) for jd in job_descriptions]
    for i, resume_emb in enumerate(resume_embeddings):
        print(f"Resume {i+1} Similarity Scores:")
        for j, jd_emb in enumerate(jd_embeddings):
            similarity = cosine_similarity(resume_emb, jd_emb)
            print(f"\tto JD {j+1}: {similarity:.2f}%")  # Display as a percentage

# Main execution
all_resumes = process_resumes()
all_jds = process_job_descriptions()
match_resumes_with_jds(all_resumes, all_jds)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Resume 1 Similarity Scores:
	to JD 1: 97.45%
	to JD 2: 98.99%
	to JD 3: 98.01%
	to JD 4: 98.99%
Resume 2 Similarity Scores:
	to JD 1: 97.47%
	to JD 2: 99.60%
	to JD 3: 98.45%
	to JD 4: 99.32%
Resume 3 Similarity Scores:
	to JD 1: 98.71%
	to JD 2: 97.59%
	to JD 3: 98.24%
	to JD 4: 97.90%
Resume 4 Similarity Scores:
	to JD 1: 97.88%
	to JD 2: 98.83%
	to JD 3: 99.34%
	to JD 4: 99.45%
Resume 5 Similarity Scores:
	to JD 1: 71.56%
	to JD 2: 82.08%
	to JD 3: 75.17%
	to JD 4: 80.23%
Resume 6 Similarity Scores:
	to JD 1: 97.96%
	to JD 2: 98.37%
	to JD 3: 98.81%
	to JD 4: 98.98%


In [29]:
all_resumes

[{'File Name': 'CutShort-Chethan.pdf',
  'Email': 'chetan.muliya133@gmail.com',
  'Phone': '7208669778',
  'Name': 'Chethan Muliya',
  'Skills': {'Adobe',
   'Android',
   'App',
   'Application',
   'Apps',
   'Architecture',
   'Barrier',
   'Bdm',
   'Codes',
   'Components',
   'Connect',
   'Debugging',
   'Design',
   'Features',
   'Firebase',
   'Github',
   'Information technology',
   'It',
   'Java',
   'Kotlin',
   'Languages',
   'Linkedin',
   'Mobile',
   'Mumbai',
   'New features',
   'Pages',
   'Pdf',
   'Player',
   'Prototype',
   'Publishing',
   'Qr',
   'Qr code',
   'Record',
   'Responsiveness',
   'Retrofit',
   'Technology',
   'Testing',
   'Translation',
   'Ui',
   'Unit testing',
   'Video',
   'Web'},
  'Total Experience': 8,
  'Designation': ['Android Developer', 'founder'],
  'education_degree': ['Bachelor of Science in Information Technology']},
 {'File Name': 'CutShort-Hari-resume-8c27.pdf',
  'Email': 'hram3348@gmail.com',
  'Phone': '8248052413',


In [32]:
all_jds

[{'File Name': 'Android Developer.pdf',
  'skills_required': {'Agile',
   'Algorithms',
   'Amazon',
   'Amazon web services',
   'Android',
   'Api',
   'App',
   'Apple',
   'Box',
   'Client',
   'Command',
   'Data structures',
   'Design',
   'Design patterns',
   'Distributed systems',
   'Django',
   'Ecommerce',
   'Ecosystem',
   'Framework',
   'Git',
   'Google',
   'Grasp',
   'It',
   'Javascript',
   'Jquery',
   'Json',
   'Laravel',
   'Learning',
   'Linux',
   'Machine learning',
   'Mercurial',
   'Mobile',
   'Mobile applications',
   'Mvc',
   'Node.js',
   'Object-oriented programming',
   'Omni',
   'Parsing',
   'Plus',
   'Resolve',
   'Ruby',
   'Server',
   'Sprints',
   'Sqlite',
   'Store',
   'Subversion',
   'Technology',
   'Transparency',
   'Version control tools',
   'Video',
   'Web',
   'Xml'},
  'expected_role': ['Android Developer'],
  'required_experience': 0},
 {'File Name': 'IOS Developer .pdf',
  'skills_required': {'Agile',
   'Algorithms',
 