# AI-Driven Candidate Matching System Powered by Gemini LLM (gemini-2.5-flash)

In [22]:
import sys, os

# Put the parent directory on the import path so that the `src` package
# (imported further down in this notebook) can be resolved from here.
sys.path.append(os.path.abspath(".."))

In [23]:
# Taking necessary imports

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

# Extracting key features from the resume

In [None]:
# Example: load one resume PDF to walk through the extraction steps.
# NOTE(review): this is a machine-specific absolute Windows path, so the cell fails on any
# other machine; consider a path relative to the project root — TODO confirm the layout.
# NOTE(review): the recorded output below reports a different source file name
# (Aman_ds.pdf) than the one loaded here — confirm which file is intended.

document = PyPDFLoader(r"D:\Projects\CandidateScreeningSystem\resumes\AmanResume.pdf")
# load() yields a list of Document objects; resume[0] is what the later cells clean and parse
resume = document.load()

In [25]:
resume

[Document(metadata={'producer': 'Skia/PDF m142 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of TCS Aman Resume', 'source': 'D:\\Projects\\CandidateScreeningSystem\\resumes\\Aman_ds.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Aman  Sharma  \n+91\n \n8799706360\n \n|\n \naman.ds24@duk.ac.in\n \n|\n \nhttp://www.linkedin.com/in/aman-sharma-4272ab231\n \n|\n \nhttps://aman18sh.github.io/portfolio/\n  \nE\nDUCATION\n \n \n  \nDigital  University  Kerala                                                                                                                                                                   Kerala,  India  M.Sc   in  Data  Analytics  &  Computational  Science\n \nAugust  2024  -  June  2026  Relevant  Coursework:  Parallel  Computing,  Deep  learning,  DSA    \nDelhi  Skill  and  Entrepreneurship  University  New  Delhi,  India  B.Sc  in  Data  Analytics  December  2021  -  June  2024  Relevant  Coursework:  Dat

In [26]:
resume[0]

Document(metadata={'producer': 'Skia/PDF m142 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of TCS Aman Resume', 'source': 'D:\\Projects\\CandidateScreeningSystem\\resumes\\Aman_ds.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Aman  Sharma  \n+91\n \n8799706360\n \n|\n \naman.ds24@duk.ac.in\n \n|\n \nhttp://www.linkedin.com/in/aman-sharma-4272ab231\n \n|\n \nhttps://aman18sh.github.io/portfolio/\n  \nE\nDUCATION\n \n \n  \nDigital  University  Kerala                                                                                                                                                                   Kerala,  India  M.Sc   in  Data  Analytics  &  Computational  Science\n \nAugust  2024  -  June  2026  Relevant  Coursework:  Parallel  Computing,  Deep  learning,  DSA    \nDelhi  Skill  and  Entrepreneurship  University  New  Delhi,  India  B.Sc  in  Data  Analytics  December  2021  -  June  2024  Relevant  Coursework:  Data

In [27]:
# preprocessing the resume

import re

def clean_text(text):
    """Normalise scraped resume / job-post text before it is handed to the LLM.

    Processing order:
      1. drop HTML tags and http(s) URLs;
      2. shield e-mail addresses and "N years"-style experience phrases behind
         placeholder tokens;
      3. replace every character other than ASCII letters, digits, ``@ . - _`` and
         whitespace with a space;
      4. squeeze runs of two or more whitespace characters into one space and strip
         the ends (a lone newline or tab is left as it is);
      5. put the shielded fragments back verbatim.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string.
    """
    html_tag = r'<[^>]*?>'
    url = (
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    text = re.sub(url, '', re.sub(html_tag, '', text))

    # Shield e-mail addresses: token -> original address, in discovery order.
    email_tokens = {}
    for idx, address in enumerate(re.findall(r'[\w\.-]+@[\w\.-]+', text)):
        token = f"__EMAIL{idx}__"
        email_tokens[token] = address
        text = text.replace(address, token)

    # Shield experience phrases such as "2.5 years" or "2-5 yrs" the same way.
    year_span = r'\b\d+(?:[\.-]\d+)?(?:\s*-\s*\d+(?:\.\d+)?)?\s*(?:years?|yrs?)\b'
    year_tokens = {}
    for idx, span in enumerate(re.findall(year_span, text, flags=re.IGNORECASE)):
        token = f"__NUM{idx}__"
        year_tokens[token] = span
        text = text.replace(span, token)

    # Placeholders only contain letters, digits and "_", so they survive this filter.
    text = re.sub(r'[^a-zA-Z0-9@.\-_\s]', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text).strip()

    # Restore experience phrases first, then e-mail addresses.
    for token, original in {**year_tokens, **email_tokens}.items():
        text = text.replace(token, original)

    return text


cleaned_resume = clean_text(resume[0].page_content)
print(cleaned_resume)

Aman Sharma 91 8799706360 aman.ds24@duk.ac.in E
DUCATION Digital University Kerala Kerala India M.Sc in Data Analytics Computational Science August 2024 - June 2026 Relevant Coursework Parallel Computing Deep learning DSA Delhi Skill and Entrepreneurship University New Delhi India B.Sc in Data Analytics December 2021 - June 2024 Relevant Coursework Data Analysis Python DBMS Machine learning Deep learning W
ORK E
XPERIENCE 99acres.com - Info Edge India Ltd Noida U.P. India Research Analyst Intern January 2024 June 2024 Conducted hierarchical analysis of 3000 localities using Google Maps for accurate administrative structuring. Streamlined data validation and classification workflows ensuring accuracy and RERA compliance across 10 states. Supported research and operational enhancements contributing to improved sales and user engagement. Ministry of Statistics and Programme Implementation New Delhi India Data Analyst Intern July 2023 August 2023 Produced 30 daily interactive data visualiz

In [28]:
# setting up the llm

import os

# `load_dotenv` is imported at the top of the notebook but was never called, so on a
# fresh kernel GEMINI_API_KEY was only available if it had been exported in the shell.
# With default arguments the call does nothing when no .env file is found and does not
# override variables that are already set.
load_dotenv()

# `model2` is kept as an alias of `llm` so any code that refers to either name still works.
llm = model2 = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.7,
    # Indexing (not .get) so a missing key fails here with a clear KeyError.
    google_api_key=os.environ["GEMINI_API_KEY"],
)

In [29]:
# defining the prompt for llm to extract the key features from resume in json format

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

# Parser that turns the model's reply into Python objects; its format instructions are
# injected into the prompt below through `partial_variables`.
parser = JsonOutputParser()

# Resume-extraction prompt. `resume_data` is supplied at invoke time (the cleaned resume
# text); `format_instructions` is pre-filled from the parser.
prompt = PromptTemplate(
    template=
    """
    ### SCRAPED TEXT FROM RESUME:
    {resume_data}
    ### INSTRUCTION:
    The scraped text above is from a candidate's resume.
    Your task is to extract the key information and return it in a structured JSON format with the following fields:

    - `name`: Full name of the candidate.
    - `email`: Candidate’s email address.
    - `phone`: Candidate’s phone number (if available).
    - `role`: Current or most recent job titles/ internships / designation with company details.
    - `experience_years`: Total years of relevant professional experience.
    - `skills`: List of technical and soft skills mentioned (normalized and deduplicated).
    - `education`: All Eduction qualification and institution.
    - `projects`: List of key projects with their titles and short descriptions.
    - `certifications`: List of certifications or courses (if any).
    - `summary`: A brief 2–3 sentence professional summary based on the resume.

    {format_instructions}
    """,
    input_variables=["resume_data"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

In [30]:
chain = prompt | llm | parser

In [31]:
response = chain.invoke({'resume_data':cleaned_resume})
print(response)

{'name': 'Aman Sharma', 'email': 'aman.ds24@duk.ac.in', 'phone': '918799706360', 'role': [{'title': 'Research Analyst Intern', 'company': '99acres.com - Info Edge India Ltd', 'dates': 'January 2024 - June 2024'}, {'title': 'Data Analyst Intern', 'company': 'Ministry of Statistics and Programme Implementation', 'dates': 'July 2023 - August 2023'}], 'experience_years': 0.67, 'skills': ['Python', 'C', 'SQL', 'HTML', 'CSS', 'JavaScript', 'VS Code', 'PyCharm', 'Jupyter Notebook', 'Git', 'GitHub', 'Machine Learning', 'Deep Learning', 'Natural Language Processing', 'Gen AI', 'Database Management System (DBMS)', 'Data Analysis', 'Data Visualization', 'Flask', 'PyTorch', 'TensorFlow', 'Power BI', 'MySQL', 'Pyspark', 'English', 'Hindi', 'Leadership', 'Teamwork', 'Communication', 'Time Management'], 'education': [{'degree': 'M.Sc in Data Analytics Computational Science', 'institution': 'Digital University Kerala', 'dates': 'August 2024 - June 2026', 'relevant_coursework': ['Parallel Computing', '

In [32]:
def resume_to_text(resume_json):
    """Render the LLM-extracted resume fields as a plain-text block.

    The extraction model may return ``null`` (``None``) for a list field, a bare scalar
    instead of a list, list items that are not strings, or projects without a
    ``title``. Those cases are normalised here instead of raising ``TypeError`` or
    ``KeyError``. For well-formed input the output is unchanged.

    Args:
        resume_json: Dict parsed from the model's JSON reply. Keys read: ``name``,
            ``role``, ``experience_years``, ``skills``, ``education``, ``projects``,
            ``summary``, ``certifications``. Missing keys render as ``None`` or empty.

    Returns:
        A multi-line string with one ``Label: value`` line per field.
    """
    def _as_list(value):
        # None -> [], list/tuple -> list, any other scalar -> single-item list.
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    skills = ", ".join(str(item) for item in _as_list(resume_json.get('skills')))
    certifications = ", ".join(
        str(item) for item in _as_list(resume_json.get('certifications'))
    )
    # Projects are normally dicts with a 'title'; tolerate plain strings and missing titles.
    project_titles = [
        project.get('title') if isinstance(project, dict) else project
        for project in _as_list(resume_json.get('projects'))
    ]

    return f"""
    Name: {resume_json.get('name')}
    Role: {resume_json.get('role')}
    Experience: {resume_json.get('experience_years')} years
    Skills: {skills}
    Education: {resume_json.get('education')}
    Projects: {project_titles}
    Summary: {resume_json.get('summary')}
    Certifications: {certifications}
    """

In [33]:
resume_text = resume_to_text(response)
print(resume_text)


    Name: Aman Sharma
    Role: [{'title': 'Research Analyst Intern', 'company': '99acres.com - Info Edge India Ltd', 'dates': 'January 2024 - June 2024'}, {'title': 'Data Analyst Intern', 'company': 'Ministry of Statistics and Programme Implementation', 'dates': 'July 2023 - August 2023'}]
    Experience: 0.67 years
    Skills: Python, C, SQL, HTML, CSS, JavaScript, VS Code, PyCharm, Jupyter Notebook, Git, GitHub, Machine Learning, Deep Learning, Natural Language Processing, Gen AI, Database Management System (DBMS), Data Analysis, Data Visualization, Flask, PyTorch, TensorFlow, Power BI, MySQL, Pyspark, English, Hindi, Leadership, Teamwork, Communication, Time Management
    Education: [{'degree': 'M.Sc in Data Analytics Computational Science', 'institution': 'Digital University Kerala', 'dates': 'August 2024 - June 2026', 'relevant_coursework': ['Parallel Computing', 'Deep learning', 'DSA']}, {'degree': 'B.Sc in Data Analytics', 'institution': 'Delhi Skill and Entrepreneurship Univ

In [34]:
from typing import List
from langchain_core.documents import Document

In [35]:
# Wrap the rendered resume text in a Document; name, experience and skills from the
# parsed response are copied into its metadata.
resume_doc = Document(metadata={"name":response['name'],"experience":response['experience_years'],"skills":response['skills']},page_content=resume_text)
resume_doc
# This Document will be embedded and stored in the vector DB

Document(metadata={'name': 'Aman Sharma', 'experience': 0.67, 'skills': ['Python', 'C', 'SQL', 'HTML', 'CSS', 'JavaScript', 'VS Code', 'PyCharm', 'Jupyter Notebook', 'Git', 'GitHub', 'Machine Learning', 'Deep Learning', 'Natural Language Processing', 'Gen AI', 'Database Management System (DBMS)', 'Data Analysis', 'Data Visualization', 'Flask', 'PyTorch', 'TensorFlow', 'Power BI', 'MySQL', 'Pyspark', 'English', 'Hindi', 'Leadership', 'Teamwork', 'Communication', 'Time Management']}, page_content="\n    Name: Aman Sharma\n    Role: [{'title': 'Research Analyst Intern', 'company': '99acres.com - Info Edge India Ltd', 'dates': 'January 2024 - June 2024'}, {'title': 'Data Analyst Intern', 'company': 'Ministry of Statistics and Programme Implementation', 'dates': 'July 2023 - August 2023'}]\n    Experience: 0.67 years\n    Skills: Python, C, SQL, HTML, CSS, JavaScript, VS Code, PyCharm, Jupyter Notebook, Git, GitHub, Machine Learning, Deep Learning, Natural Language Processing, Gen AI, Datab

# Extracting Key features from the job posting

In [36]:
job_post="""Senior Software Design Engineer 2-5 YRS  Noida, India  Immediate Joiners Preferred  Job Description:  · Translating system requirements into the design and development of systems. · Transform vital business needs into code and drive innovation. · Collaborating and integrating code into enterprise systems. · Creative problem-solving skills. · Attitude to achieve the goals with complete ownership and hard work.   Technical Skills:  Must Have:  · 2+ Years of experience in UI/Front-end development of rich interactive web-based applications or modules. · Strong proficiency in JavaScript, including DOM manipulation and the JavaScript object model. · Experience working as a React.js Developer. Thorough understanding of React.js, Redux and its core principles. · In-depth knowledge of JavaScript, CSS, HTML, HTML 5, JSON, XML and front-end languages. · Knowledge of REACT tools including React.js, Webpack, Enzyme. · Knowledge of MVC Pattern and Architecture. Working experience with popular React.js workflows (such as Flux or Redux) · Experience with user interface design. · Experience with web debugging. · Experience in all testing phases of development including Unit & Integration Testing. · Excellent troubleshooting skills. · Team player · Excellent time-management skills    Beneficial:  · Working experience with Core Java. ·Working with Databases (MSSQL, Oracle, Postgres) · Working experience on source code version control systems (SVN, GIT)  · Working experience on Issue Tracking tool e.g. Jira, Bugzilla · Working experience on Code Analysis tools e.g. Sonar Qube, Checkmarks. · Experience in Restful Web Services integration. · Independent worker · Great interpersonal and communication skills   Responsibilities  · Participate as a team member in all phases of software development lifecycle. · Perform design, development and testing on individual tasks · Managing initiatives on UI/UX work · Participate in integrated testing of product/ package. 
· Building reusable components and front-end libraries for future use. · Translating designs and wireframes into high-quality code. · Learn and understand user interactions. · Optimizing components for maximum performance across a vast array of web-capable devices and browsers · Keep compliance with Quality Systems and Procedures. ...Less Key Skills: React, HTML, Javascript, ReactJS, Redux
"""

In [37]:
print(job_post)

Senior Software Design Engineer 2-5 YRS  Noida, India  Immediate Joiners Preferred  Job Description:  · Translating system requirements into the design and development of systems. · Transform vital business needs into code and drive innovation. · Collaborating and integrating code into enterprise systems. · Creative problem-solving skills. · Attitude to achieve the goals with complete ownership and hard work.   Technical Skills:  Must Have:  · 2+ Years of experience in UI/Front-end development of rich interactive web-based applications or modules. · Strong proficiency in JavaScript, including DOM manipulation and the JavaScript object model. · Experience working as a React.js Developer. Thorough understanding of React.js, Redux and its core principles. · In-depth knowledge of JavaScript, CSS, HTML, HTML 5, JSON, XML and front-end languages. · Knowledge of REACT tools including React.js, Webpack, Enzyme. · Knowledge of MVC Pattern and Architecture. Working experience with popular React.

In [38]:
cleaned_post = clean_text(job_post)
cleaned_post

'Senior Software Design Engineer 2-5 YRS Noida India Immediate Joiners Preferred Job Description Translating system requirements into the design and development of systems. Transform vital business needs into code and drive innovation. Collaborating and integrating code into enterprise systems. Creative problem-solving skills. Attitude to achieve the goals with complete ownership and hard work. Technical Skills Must Have 2 Years of experience in UI Front-end development of rich interactive web-based applications or modules. Strong proficiency in JavaScript including DOM manipulation and the JavaScript object model. Experience working as a React.js Developer. Thorough understanding of React.js Redux and its core principles. In-depth knowledge of JavaScript CSS HTML HTML 5 JSON XML and front-end languages. Knowledge of REACT tools including React.js Webpack Enzyme. Knowledge of MVC Pattern and Architecture. Working experience with popular React.js workflows such as Flux or Redux Experien

In [39]:
# defining prompt for the llm to extract key features from the job post

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

# Parser for the model's JSON reply; its format instructions are injected into the prompt.
parser = JsonOutputParser()

# Job-post extraction prompt. `job_post` is supplied at invoke time; the model is asked to
# return a top-level `job_posting` list and to use null for any field it cannot find, so
# downstream code has to tolerate None values.
job_post_prompt = PromptTemplate(
    template="""
    ### SCRAPED TEXT FROM WEBSITE:
    {job_post}

    ### INSTRUCTION:
    The scraped text above is from the careers or jobs page of a company website.
    Your task is to extract job postings and return them in **strict JSON format**.

    job posting should be represented with the following keys, 
    job_posting is the main key and its value is the list containing all the following field:

    - `role`: The job title or designation.
    - `company`: Name of the company (if mentioned).
    - `location`: Job location (city, state, or remote/hybrid if specified).
    - `experience_required`: Required experience in years as integer.
    - `skills`: A list of technical and soft skills explicitly mentioned.
    - `skill_classification`: Categorize the extracted skills into the following buckets:
        - `must_have`: essential or required skills.
        - `important`: valuable but not strictly required skills.
        - `nice_to_have`: additional or preferred skills.
        if any of skill not availble mention ["None"]
    - `description`: A clean summary (2–3 sentences) combining key responsibilities and qualifications.
    - `employment_type`: Full-time, Part-time, Internship, Contract, etc., if available.
    - `posted_date`: Date of posting (if available).

    If some fields are missing in the text, use `null` for that field.

    ### REQUIRED JSON FORMAT:
    {format_instructions}

    ### RETURN ONLY VALID JSON, NO PREAMBLE:
    """,
    input_variables=["job_post"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

In [40]:
chain = job_post_prompt | llm | parser

In [41]:
# Send the cleaned posting (computed above as `cleaned_post`) rather than the raw text,
# matching the resume flow, where the cleaned resume is what the LLM receives.
post_json = chain.invoke({'job_post': cleaned_post})

In [42]:
post_json

{'job_posting': [{'role': 'Senior Software Design Engineer',
   'company': None,
   'location': 'Noida, India',
   'experience_required': 2,
   'skills': ['JavaScript',
    'DOM manipulation',
    'JavaScript object model',
    'React.js',
    'Redux',
    'CSS',
    'HTML',
    'HTML 5',
    'JSON',
    'XML',
    'Front-end development',
    'Webpack',
    'Enzyme',
    'MVC Pattern',
    'Flux',
    'User Interface Design',
    'Web Debugging',
    'Unit Testing',
    'Integration Testing',
    'Troubleshooting',
    'Teamwork',
    'Time Management',
    'Creative Problem Solving',
    'Ownership',
    'Hard Work',
    'Core Java',
    'Databases',
    'MSSQL',
    'Oracle',
    'Postgres',
    'Version Control',
    'SVN',
    'GIT',
    'Issue Tracking',
    'Jira',
    'Bugzilla',
    'Code Analysis',
    'Sonar Qube',
    'Checkmarks',
    'RESTful Web Services',
    'Independent Work',
    'Interpersonal Skills',
    'Communication Skills'],
   'skill_classification': {'must_h

In [43]:
print(post_json['job_posting'])

[{'role': 'Senior Software Design Engineer', 'company': None, 'location': 'Noida, India', 'experience_required': 2, 'skills': ['JavaScript', 'DOM manipulation', 'JavaScript object model', 'React.js', 'Redux', 'CSS', 'HTML', 'HTML 5', 'JSON', 'XML', 'Front-end development', 'Webpack', 'Enzyme', 'MVC Pattern', 'Flux', 'User Interface Design', 'Web Debugging', 'Unit Testing', 'Integration Testing', 'Troubleshooting', 'Teamwork', 'Time Management', 'Creative Problem Solving', 'Ownership', 'Hard Work', 'Core Java', 'Databases', 'MSSQL', 'Oracle', 'Postgres', 'Version Control', 'SVN', 'GIT', 'Issue Tracking', 'Jira', 'Bugzilla', 'Code Analysis', 'Sonar Qube', 'Checkmarks', 'RESTful Web Services', 'Independent Work', 'Interpersonal Skills', 'Communication Skills'], 'skill_classification': {'must_have': ['JavaScript', 'DOM manipulation', 'JavaScript object model', 'React.js', 'Redux', 'CSS', 'HTML', 'HTML 5', 'JSON', 'XML', 'Front-end development', 'Webpack', 'Enzyme', 'MVC Pattern', 'Flux', '

In [44]:
# json job post to regular text
def job_post_to_text(job_json):
    """Render one extracted job posting as plain text.

    The extraction prompt tells the model to use ``null`` for missing fields, so
    ``skills``, ``skill_classification`` and each of its buckets may be ``None``.
    These are rendered as empty values instead of raising ``TypeError`` or
    ``AttributeError``. For well-formed input the output is unchanged.

    Args:
        job_json: Dict for a single posting. Keys read: ``role``, ``company``,
            ``location``, ``experience_required``, ``employment_type``,
            ``posted_date``, ``skills``, ``skill_classification`` (with
            ``must_have`` / ``important`` / ``nice_to_have``) and ``description``.

    Returns:
        The rendered text with leading and trailing whitespace stripped.
    """
    def _joined(values):
        # None -> "", list-like -> comma-separated, any other scalar -> its str() form.
        if values is None:
            return ""
        if isinstance(values, (list, tuple, set)):
            return ", ".join(str(value) for value in values)
        return str(values)

    classification = job_json.get("skill_classification")
    if not isinstance(classification, dict):
        # null or an unexpected shape: treat every bucket as empty
        classification = {}

    return f"""
Role: {job_json.get('role')}
Company: {job_json.get('company')}
Location: {job_json.get('location')}
Experience Required: {job_json.get('experience_required')}
Employment Type: {job_json.get('employment_type')}
Posted Date: {job_json.get('posted_date')}

Skills: {_joined(job_json.get('skills'))}

Skill Classification:
  Must Have: {_joined(classification.get('must_have'))}
  Important: {_joined(classification.get('important'))}
  Nice To Have: {_joined(classification.get('nice_to_have'))}

Description:
{job_json.get('description')}
""".strip()


In [45]:
post_text = job_post_to_text(post_json['job_posting'][0])
post_text

'Role: Senior Software Design Engineer\nCompany: None\nLocation: Noida, India\nExperience Required: 2\nEmployment Type: None\nPosted Date: None\n\nSkills: JavaScript, DOM manipulation, JavaScript object model, React.js, Redux, CSS, HTML, HTML 5, JSON, XML, Front-end development, Webpack, Enzyme, MVC Pattern, Flux, User Interface Design, Web Debugging, Unit Testing, Integration Testing, Troubleshooting, Teamwork, Time Management, Creative Problem Solving, Ownership, Hard Work, Core Java, Databases, MSSQL, Oracle, Postgres, Version Control, SVN, GIT, Issue Tracking, Jira, Bugzilla, Code Analysis, Sonar Qube, Checkmarks, RESTful Web Services, Independent Work, Interpersonal Skills, Communication Skills\n\nSkill Classification:\n  Must Have: JavaScript, DOM manipulation, JavaScript object model, React.js, Redux, CSS, HTML, HTML 5, JSON, XML, Front-end development, Webpack, Enzyme, MVC Pattern, Flux, User Interface Design, Web Debugging, Unit Testing, Integration Testing, Troubleshooting, T

In [47]:
# Wrap the rendered posting text in a Document; three fields of the first extracted
# posting are copied into its metadata.
document_post = Document(
    metadata={
        field: post_json['job_posting'][0][field]
        for field in ("experience_required", "employment_type", "posted_date")
    },
    page_content=post_text,
)

In [49]:
print(document_post.page_content)

Role: Senior Software Design Engineer
Company: None
Location: Noida, India
Experience Required: 2
Employment Type: None
Posted Date: None

Skills: JavaScript, DOM manipulation, JavaScript object model, React.js, Redux, CSS, HTML, HTML 5, JSON, XML, Front-end development, Webpack, Enzyme, MVC Pattern, Flux, User Interface Design, Web Debugging, Unit Testing, Integration Testing, Troubleshooting, Teamwork, Time Management, Creative Problem Solving, Ownership, Hard Work, Core Java, Databases, MSSQL, Oracle, Postgres, Version Control, SVN, GIT, Issue Tracking, Jira, Bugzilla, Code Analysis, Sonar Qube, Checkmarks, RESTful Web Services, Independent Work, Interpersonal Skills, Communication Skills

Skill Classification:
  Must Have: JavaScript, DOM manipulation, JavaScript object model, React.js, Redux, CSS, HTML, HTML 5, JSON, XML, Front-end development, Webpack, Enzyme, MVC Pattern, Flux, User Interface Design, Web Debugging, Unit Testing, Integration Testing, Troubleshooting, Teamwork, Ti

# Resume Embedding (Stored in the Pinecone Vector DB)

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Pinecone as PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

def embedding_model():
    """Build the Gemini embedding client used by this notebook.

    Returns:
        A ``GoogleGenerativeAIEmbeddings`` instance for model ``gemini-embedding-001``
        with task type ``RETRIEVAL_DOCUMENT``. The API key is read from the
        ``GEMINI_API_KEY`` environment variable; a missing variable raises ``KeyError``.
    """
    return GoogleGenerativeAIEmbeddings(
        google_api_key=os.environ["GEMINI_API_KEY"],
        task_type="RETRIEVAL_DOCUMENT",
        model="gemini-embedding-001",
    )

In [None]:
# loading the functions
from src.functions import load_pdf_file, filter_to_minimal_docs, resume_to_text, resume_features_extraction, embedding_model
from src.prompts import resume_prompt

In [None]:
# Load the resume PDFs and extract the key features of each candidate with the LLM.
# The three helpers come from src.functions (not shown in this notebook).
extracted_resume = load_pdf_file(data='resume')
filter_resume = filter_to_minimal_docs(extracted_resume)
resume_extraction = resume_features_extraction(filter_resume,llm,clean_text,resume_prompt, resume_to_text, parser) # extracted resume documents with key features

In [None]:

# Initializing the gemini embedding model (gemini-embedding-001)
embeddings = embedding_model()

# Configuring Pinecone, which stores the embedding of each candidate resume
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))

# Index name for the Pinecone database
index_name = "candidate-matching"

# Rebuild the index from scratch on every run: drop it only when it already exists...
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# ...then always create it. Creation and upload must run whether or not an old index
# existed, so they sit outside the `if`; otherwise a first run against an empty project
# creates nothing and leaves `vectorstore` undefined.
pc.create_index(
    name=index_name,
    dimension=3072,  # must equal the embedding model's vector size — TODO confirm for gemini-embedding-001
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

# Embed every extracted resume document and store it in the fresh index
vectorstore = PineconeVectorStore.from_documents(
    documents=resume_extraction,
    index_name=index_name,
    embedding=embeddings,
)


# Job Description Extraction

In [8]:
from src.functions import embedding_model, load_pdf_file, filter_to_minimal_docs, resume_features_extraction, resume_to_text, jobpost_feature_extraction, job_post_to_text

In [9]:
embeddings = embedding_model()

In [None]:
job_description_input = input("Enter the Job Description")

In [None]:
# Run the pasted job description through the extraction helper from src.functions
# (not shown here); it is given the LLM, the cleaner, the prompt, the renderer and the parser.
job_description_document = jobpost_feature_extraction(
        job_description_input, llm, clean_text, job_post_prompt, job_post_to_text, parser
    )

# Plain-text form of the job description, reused by the retrieval and matching cells
job_description = job_description_document.page_content

In [None]:
print(job_description)

# Hybrid Candidate Retrieval/Search

In [None]:
# Dense retrieval (semantic search)

# Connect to the existing vector DB index (index: candidate-matching)
embeddings = embedding_model()
index_name = "candidate-matching"
doc_search = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)

# Similarity search limited to the 3 closest resumes
retriever = doc_search.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Query with the rendered job-description text
retrieved_docs = retriever.invoke(job_description_document.page_content)

In [None]:
# Sparse retrieval (keyword search with BM25)

from rank_bm25 import BM25Okapi


def _bm25_tokenize(text):
    """Lower-case, split on whitespace and trim punctuation from both ends of each token.

    Plain ``str.split`` leaves tokens such as ``"python,"`` or ``"'redux']"``, which never
    match the bare term. Inner characters are kept, so ``react.js`` and ``c++`` survive.
    """
    edge_punctuation = ".,;:!?()[]{}'\"`"
    trimmed = (raw.strip(edge_punctuation) for raw in text.lower().split())
    return [token for token in trimmed if token]


docs = [doc.page_content for doc in resume_extraction]
tokenized_docs = [_bm25_tokenize(d) for d in docs]
bm25 = BM25Okapi(tokenized_docs)

# The query must be tokenised exactly like the corpus for terms to line up
keywords = _bm25_tokenize(job_description)
bm25_scores = bm25.get_scores(keywords)

# Indices of the 3 best-scoring resumes, best first (fewer if the corpus is smaller)
top_ids = bm25_scores.argsort()[-3:][::-1]
sparse_results = [resume_extraction[i] for i in top_ids]

In [None]:
# Metadata-based search: keep candidates whose experience meets the job's requirement.


def _as_years(value):
    """Coerce an LLM-extracted experience value to float; None or unparsable -> 0.0.

    The extraction prompts allow null for missing fields, and a value may arrive as a
    numeric string. Comparing those directly raises TypeError.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


# A posting with no stated requirement is treated as requiring 0 years.
required_experience = _as_years(job_description_document.metadata.get("experience_required"))

metadata_filtered = [
        c for c in resume_extraction
        if _as_years(c.metadata.get("experience")) >= required_experience
    ]

In [None]:
# Union of the three retrieval strategies, de-duplicated per candidate document.

combined = retrieved_docs + sparse_results + metadata_filtered


def _dedup_key(doc):
    """Use the document's ``id`` metadata when present, otherwise its text.

    Documents without an ``id`` entry would otherwise raise KeyError here.
    """
    doc_id = doc.metadata.get("id")
    return doc.page_content if doc_id is None else doc_id


# Dict keyed by the dedup key: keeps first-seen order, last duplicate wins.
unique_docs = list({_dedup_key(doc): doc for doc in combined}.values())

# Candidate Matching for the Job Post

In [None]:
# Give the candidate context and the job description to the LLM so it can rank candidates

from src.prompts import candidata_matching_prompt

# One text block per shortlisted candidate, separated by blank lines
candidate_context = "\n\n".join(doc.page_content for doc in unique_docs)

# No output parser is attached here, so the chain returns whatever the chat model returns.
# NOTE(review): `chain` and `response` reuse names from the extraction cells above, where
# `response` was the parsed resume dict — re-running cells out of order will mix them up.
chain = candidata_matching_prompt | llm

response = chain.invoke({
        "candidate_context": candidate_context,
        "job_description": job_description
    })

In [None]:
# The candidate-matching result, with its explanation, will be shown on the UI.
# The chain has no output parser, so `response` is the chat model's message object;
# print only its text, falling back to the object itself if it has no `.content`.

print(getattr(response, "content", response))

# Thank You
 Best Regards |
 Aman Sharma |
 Digital University Kerala (Formerly IIITMK) | aman.ds24@duk.ac.in, amansharmaaa9313@gmail.com | +91 8799706360


- Linkedin: https://www.linkedin.com/in/aman-sharma-4272ab231
- Portfolio: https://aman18sh.github.io/portfolio/
- Medium : https://medium.com/@amansharmaaa9313
- Github : https://github.com/Aman18sh

In [1]:
from rank_bm25 import BM25Okapi

docs = [
    "Deep learning models require large datasets.",
    "Machine learning algorithms include supervised and unsupervised methods.",
    "Dataset preprocessing is crucial for machine learning."
]

tokenized_docs = [doc.lower().split() for doc in docs]
bm25 = BM25Okapi(tokenized_docs)

query = "machine learning dataset".lower().split()
scores = bm25.get_scores(query)

print(scores)


[0.10774747 0.18946199 0.61164647]


In [2]:
tokenized_docs

[['deep', 'learning', 'models', 'require', 'large', 'datasets.'],
 ['machine',
  'learning',
  'algorithms',
  'include',
  'supervised',
  'and',
  'unsupervised',
  'methods.'],
 ['dataset', 'preprocessing', 'is', 'crucial', 'for', 'machine', 'learning.']]

In [3]:
bm25

<rank_bm25.BM25Okapi at 0x212685525f0>

In [4]:
query

['machine', 'learning', 'dataset']

In [5]:
scores

array([0.10774747, 0.18946199, 0.61164647])