# load dependencies


In [2]:
from pypdf import PdfReader
from pathlib import Path

from tqdm.auto import tqdm

In [3]:
pdf_path = Path("../../data/raw_data/resumeScraper/data/INFORMATION-TECHNOLOGY/")

In [4]:
# read all files 
from concurrent.futures import ThreadPoolExecutor, as_completed
import os 

total_pdfs = len(list(pdf_path.glob("*.pdf")))
cv_data = []

def process_pdf(file_path):
    reader = PdfReader(file_path)
    return [page.extract_text() for page in reader.pages]

num_threads = os.cpu_count()

with ThreadPoolExecutor(max_workers=num_threads) as executor:
    future_to_file = {executor.submit(process_pdf, file): file for file in pdf_path.glob("*.pdf")}
    
    for future in tqdm(as_completed(future_to_file), total=total_pdfs):
        cv_data.append(future.result())

# Join multi-page CVs into a single string
cv_data = [" ".join(cv) if len(cv) > 1 else cv[0] for cv in cv_data]

  0%|          | 0/120 [00:00<?, ?it/s]

In [None]:
cv_data.to_csv("output/09_cv_data.csv", index=False)

# Evaluation: 1st stage


In [5]:
import _resume_eval_import_helper

In [6]:
import os 
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from uuid import uuid4
import pandas as pd
import json
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI 
from langchain_anthropic import ChatAnthropic
from langchain_ollama import ChatOllama

from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables.base import RunnableSequence
from prompts.two_stage_eval_jd import TWO_STAGE_EVAL_JD_PROMPT
from prompts.two_stage_eval_cv import TWO_STAGE_EVAL_CV_PROMPT

import logging
import time
from datetime import datetime 
from typing import Dict, Any, Union, List, Tuple

from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm

In [7]:
# load environment variables 
load_dotenv(find_dotenv("../../.env"))

# get current time 
current_time = datetime.now().strftime(("%Y%m%d_%H%M"))
output_dir = f"./output_{current_time}/"

# create output directory 
os.makedirs(output_dir, exist_ok=True)

temperature = 0
max_tokens = 2048

# set up logger 
log_file = os.path.join(output_dir, "evaluation_log.txt")
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO, filename=log_file, datefmt="%Y-%m-%d %H:%M:%S")

In [8]:
job_id = "cbc8a72c-2452-433c-838d-a9ee5fab7e69"

job_description = """
About the job
Key Responsibilities

Develop and implement advanced statistical and machine learning models to solve complex problems
Lead the data-driven decision-making process, from data collection and analysis to implementation and monitoring of solutions
Manage data science projects, ensuring they meet business requirements and are delivered on time
Mentor junior data scientists, providing guidance and support in their professional development
Apply advanced statistical and machine learning techniques to extract insights and generate solutions from large and diverse data sources.
Develop and deploy scalable and robust data products and applications that support decision making and service improvement.
Collaborate with cross-functional teams to integrate data science solutions into business processes
Communicate and present data-driven findings and recommendations to various stakeholders and senior management.
Mentor and coach junior data scientists and analysts in the team.
Manage and motivate a team of data scientists and analysts, and foster a culture of collaboration, innovation, and excellence.

Requirements

at least 10 years of relevant working experience in data projects or data science or analytics, preferably in the Singapore public sector.
Proficient in Python, R, SQL, and other programming languages and tools for data analysis and manipulation.
Strong knowledge & experience in building complex Data Platform system equipped with latest Enterprise Data warehousing and modern design applications ready for consumption for Data Analysis & Business decision making using AIML & GenAI technologies.
Strong knowledge and experience in applying statistical and machine learning methods, such as regression, classification, clustering, natural language processing, computer vision, etc.
Experience in developing and deploying data products and applications using cloud platforms, such as AWS, Azure, or Google Cloud.
Proficient in working with large language models, such as GPT-3, BERT, or Transformer-XL, and applying them to various natural language tasks, such as text generation, summarization, question answering, etc.
Experience in building and deploying AI and ML models using the Artificial Intelligence and Machine Learning (AIML) platform, a government-wide platform that provides end-to-end capabilities for data ingestion, processing, modelling, and deployment.
Exhibit strong Data Architect skills to provide solution & Design for complex Data Platform projects.
Excellent communication and presentation skills, with the ability to translate complex data and technical concepts into clear and actionable insights for non-technical audiences.
Passionate about solving public sector challenges and creating positive social impact through data and analytics.
Demonstrated leadership and people management skills, with the ability to inspire, empower, and develop a diverse and high-performing team.
"""

In [9]:
# evaluate resume 
def two_stage_eval_jd(model_tuples: List[Tuple[str, RunnableSequence]], job_description: str, job_id: str, output_dir: str) -> Union[pd.DataFrame, None]:
    model_results = {}
    for model_name, grader in model_tuples:
        try:
            result = grader.invoke({"job_description": job_description})
            model_results[model_name] = result

            # save model result 
            json_file = os.path.join(output_dir, f"{job_id}_{model_name}.json")
            with open(json_file, "w") as f:
                json.dump(result, f, indent=4)
            time.sleep(2.1)  # Add a small delay to avoid rate limiting

        except Exception as e:
            error_msg = f"Error with {model_name} for job_id: {job_id}. Error: {str(e)}"
            logging.error(error_msg)
            print(error_msg)

    if not model_results:
        error_msg = f"All models failed for job_id: {job_id}."
        logging.error(error_msg)
        print(error_msg)
        return None
    
def two_stage_eval_cv(model_tuples: List[Tuple[str, RunnableSequence]], job_requirements: str, job_id: str, cv: str, cv_id: str, output_dir: str) -> Union[pd.DataFrame, None]:
    model_results = {}
    for model_name, grader in model_tuples:
        try:
            result = grader.invoke({"job_requirements": job_requirements, "resume": cv})
            model_results[model_name] = result

            # save model result 
            json_file = os.path.join(output_dir, f"{job_id}_{cv_id}_{model_name}.json")
            with open(json_file, "w") as f:
                json.dump(result, f, indent=4)
            time.sleep(2.1)  # Add a small delay to avoid rate limiting

        except Exception as e:
            error_msg = f"Error with {model_name} for job_id: {job_id}. Error: {str(e)}"
            logging.error(error_msg)
            print(error_msg)

    if not model_results:
        error_msg = f"All models failed for job_id: {job_id}."
        logging.error(error_msg)
        print(error_msg)
        return None
    

In [10]:
groq_llm = ChatGroq(model="llama3-70b-8192", temperature=temperature, max_tokens=max_tokens)

jd_eval_prompt = PromptTemplate(
    input_variables=["job_description"],
    template=TWO_STAGE_EVAL_JD_PROMPT
    )

groq_grader = jd_eval_prompt | groq_llm | JsonOutputParser()

model_tuples = [
    ("groq", groq_grader)
]

def process_jobs(job_data):
    job_id, job_description = job_data
    return two_stage_eval_jd(model_tuples, job_description, job_id, output_dir)

def process_all_jobs():
    job_data = [(job_id, job_description)]  

    total_jobs = len(job_data)
    
    with ThreadPoolExecutor(max_workers=1) as executor:
        futures = [] 
        for job in job_data:
            futures.append(executor.submit(process_jobs, job))
            
        for future in tqdm(as_completed(futures), total=total_jobs, desc="Processing all jobs"):
            try:
                future = future.result()
            except Exception as e:
                print(f"Error: {e}")

In [11]:
process_all_jobs()

Processing all jobs:   0%|          | 0/1 [00:00<?, ?it/s]

# 2nd stage: resume evaluation


In [12]:
# load job data json 

job_data = []

for file in Path(output_dir).glob("*.json"):
        file_name = file.stem
        job_id = file_name.split("_")[0]
        model_name = file_name.split("_")[1]
        with open(file, "r") as f:
            job_description = json.load(f)
            
            job_data.append((job_id, job_description))


In [13]:
job_data

[('cbc8a72c-2452-433c-838d-a9ee5fab7e69',
  {'technical_skills': {'essential': ['Python',
     'R',
     'SQL',
     'Data Platform system',
     'Enterprise Data warehousing',
     'AIML & GenAI technologies',
     'statistical and machine learning methods (regression, classification, clustering, natural language processing, computer vision)',
     'cloud platforms (AWS, Azure, or Google Cloud)',
     'large language models (GPT-3, BERT, or Transformer-XL)',
     'Artificial Intelligence and Machine Learning (AIML) platform'],
    'advantageous': []},
   'soft_skills': ['excellent communication and presentation skills',
    'ability to translate complex data and technical concepts into clear and actionable insights',
    'passionate about solving public sector challenges and creating positive social impact',
    'demonstrated leadership and people management skills',
    'ability to inspire, empower, and develop a diverse and high-performing team'],
   'level_of_exp': 'senior',
   'ed

In [14]:
cv_data = [(str(uuid4()), cv_data[i]) for i in range(len(cv_data))]

In [15]:
groq_llm = ChatGroq(model="llama3-70b-8192", temperature=temperature, max_tokens=max_tokens)

cv_eval_prompt = PromptTemplate(
    input_variables=["job_requirements", "resume"],
    template=TWO_STAGE_EVAL_CV_PROMPT
    )

groq_grader = cv_eval_prompt | groq_llm | JsonOutputParser()

model_tuples = [
    ("groq", groq_grader)
]


In [16]:
def process_job_cv_pairs(jod_data, cv_data):
    job_id, job_requirements = jod_data
    cv_id, cv = cv_data
    return two_stage_eval_cv(model_tuples, job_requirements, job_id, cv, cv_id, output_dir)

def process_all_pairs():
    
    total_pairs = len(job_data) * len(cv_data)
    
    with ThreadPoolExecutor(max_workers=1) as executor:
        futures = [] 
        for job in job_data:
            for cv in cv_data:
                futures.append(executor.submit(process_job_cv_pairs, job, cv))
            
        for future in tqdm(as_completed(futures), total=total_pairs, desc="Processing job-cv pairs"):
            try:
                future = future.result()
            except Exception as e:
                print(f"Error: {e}")


In [17]:
process_all_pairs()

Processing job-cv pairs:   0%|          | 0/120 [00:00<?, ?it/s]

Error with groq for job_id: cbc8a72c-2452-433c-838d-a9ee5fab7e69. Error: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}
All models failed for job_id: cbc8a72c-2452-433c-838d-a9ee5fab7e69.
Error with groq for job_id: cbc8a72c-2452-433c-838d-a9ee5fab7e69. Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01j1rzqwkaf1sa3624x7yr1a2c` on tokens per minute (TPM): Limit 6000, Used 0, Requested 6161. Please try again in 1.61s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
All models failed for job_id: cbc8a72c-2452-433c-838d-a9ee5fab7e69.


## additional data


In [18]:
data_path = "../repos/Resume-Screening-RAG-Pipeline/data/supplementary-data/"
cleaned_resume_path = data_path + "cleaned_resume.csv"

cleaned_resume = pd.read_csv(cleaned_resume_path)
ds_pool = cleaned_resume[cleaned_resume["Category"] == "Data Science"].reset_index(drop=True)

In [19]:
from pprint import pprint

for r in ds_pool["Resume"].values:
    pprint(r)
    print("\n")
    print("\n")
    

('Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, '
 'matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, '
 'SVM, Nave Bayes, KNN, Random Forest, Decision Trees, Boosting techniques, '
 'Cluster Analysis, Word Embedding, Sentiment Analysis, Natural Language '
 'processing, Dimensionality reduction, Topic Modelling (LDA, NMF), PCA & '
 'Neural Nets. * Database Visualizations: Mysql, SqlServer, Cassandra, Hbase, '
 'ElasticSearch D3.js, DC.js, Plotly, kibana, matplotlib, ggplot, Tableau. * '
 'Others: Regular Expression, HTML, CSS, Angular 6, Logstash, Kafka, Python '
 'Flask, Git, Docker, computer vision - Open CV and understanding of Deep '
 'learning.Education Details \n'
 'Data Science Assurance Associate \n'
 'Data Science Assurance Associate - Ernst & Young LLP\n'
 'Skill Details \n'
 'JAVASCRIPT- Exprience - 24 months\n'
 'jQuery- Exprience - 24 months\n'
 'Python- Exprience - 24 monthsCompany Details \n'
 'company - Ernst & Youn

In [20]:
cv_data = ds_pool[["ID", "Resume"]].values

In [21]:
cv_data

array([['af844c8b-de2a-47ff-9b7d-4466af8826c2',
        'Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, SVM, Nave Bayes, KNN, Random Forest, Decision Trees, Boosting techniques, Cluster Analysis, Word Embedding, Sentiment Analysis, Natural Language processing, Dimensionality reduction, Topic Modelling (LDA, NMF), PCA & Neural Nets. * Database Visualizations: Mysql, SqlServer, Cassandra, Hbase, ElasticSearch D3.js, DC.js, Plotly, kibana, matplotlib, ggplot, Tableau. * Others: Regular Expression, HTML, CSS, Angular 6, Logstash, Kafka, Python Flask, Git, Docker, computer vision - Open CV and understanding of Deep learning.Education Details \nData Science Assurance Associate \nData Science Assurance Associate - Ernst & Young LLP\nSkill Details \nJAVASCRIPT- Exprience - 24 months\njQuery- Exprience - 24 months\nPython- Exprience - 24 monthsCompany Details \ncompany - Ernst & Young LLP\nde

In [22]:
def process_all_pairs():
    
    total_pairs = len(job_data) * len(cv_data)
    
    with ThreadPoolExecutor(max_workers=1) as executor:
      
        futures = [] 
        for job in job_data:
            for cv in cv_data:
                futures.append(executor.submit(process_job_cv_pairs, job, cv))
            
        for future in tqdm(as_completed(futures), total=total_pairs, desc="Processing job-cv pairs"):
            try:
                future = future.result()
            except Exception as e:
                print(f"Error: {e}")

In [23]:
process_all_pairs()

Processing job-cv pairs:   0%|          | 0/10 [00:00<?, ?it/s]

# evaluate the results


In [24]:
results = []
errors = []

for file in tqdm(Path(output_dir).glob("*.json"), total=len(list(Path(output_dir).glob("*.json"))), desc="Evaluating results"):
    try:
        file_name = file.stem
        job_id, cv_id, model_name = file_name.split("_")
        with open(file, "r") as f:
            result = json.load(f)
        
        data = {
            "job_id": job_id,
            "cv_id": cv_id,
            "model_name": model_name,
            "original_technical_skills": result["resume_evaluation"]["original_scores"].get("technical_skills", None),
            "original_soft_skills": result["resume_evaluation"]["original_scores"].get("soft_skills", None),
            "original_experience": result["resume_evaluation"]["original_scores"].get("experience", None),
            "original_education": result["resume_evaluation"]["original_scores"].get("education", None),
            "recalibrated_technical_skills": result["recalibrated_scores"].get("technical_skills", None),
            "recalibrated_soft_skills": result["recalibrated_scores"].get("soft_skills", None),
            "recalibrated_experience": result["recalibrated_scores"].get("experience", None),
            "recalibrated_education": result["recalibrated_scores"].get("education", None),
            "inferred_experience": ", ".join(result["deeper_analysis"].get("inferred_experience", [])),
            "suitability": result["assessment"].get("suitability", None),
            "strengths": result["assessment"].get("strengths", None),
            "concerns": result["assessment"].get("concerns", None)
        }
        
        results.append(data)
    except Exception as e:
        errors.append({"file": str(file), "error": str(e)})
        print(f"Error processing {file}: {e}")

# Convert results to a DataFrame
df = pd.DataFrame(results)


Evaluating results:   0%|          | 0/129 [00:00<?, ?it/s]

Error processing output_20240902_1904/cbc8a72c-2452-433c-838d-a9ee5fab7e69_groq.json: not enough values to unpack (expected 3, got 2)


## calculate the fit score


In [25]:
# Define weights
weights = {
    "technical_skills": 0.6,
    "soft_skills": 0.1,
    "experience": 0.2,
    "education": 0.1
}

# Define score types
score_types = ["original", "recalibrated"]

# Calculate overall scores using pandas' dot product
for score_type in score_types:
    columns = [f"{score_type}_{skill}" for skill in weights.keys()]
    df[score_type+"_overall_score"] = df[columns].values.dot(pd.Series(weights).values)

In [27]:
df["suitability"].value_counts()

suitability
no     79
kiv    49
Name: count, dtype: int64

In [29]:
df.sort_values("recalibrated_overall_score", ascending=False).to_csv("output/09_sorted_results.csv", index=False)

In [30]:
df[df["suitability"]=='kiv']

Unnamed: 0,job_id,cv_id,model_name,original_technical_skills,original_soft_skills,original_experience,original_education,recalibrated_technical_skills,recalibrated_soft_skills,recalibrated_experience,recalibrated_education,inferred_experience,suitability,strengths,concerns,original_overall_score,recalibrated_overall_score
2,cbc8a72c-2452-433c-838d-a9ee5fab7e69,8752682f-2310-4fa9-a33c-08f9b02360a3,groq,40,30,80,50,50,40,90,50,"project management, systems administration, we...",kiv,The candidate has strong experience in project...,The candidate lacks essential technical skills...,48.0,57.0
3,cbc8a72c-2452-433c-838d-a9ee5fab7e69,f70d634d-cfcf-48fd-bafa-a73ef937a464,groq,40,80,80,90,40,90,90,90,"leadership and people management skills, abili...",kiv,The candidate has excellent communication and ...,The candidate lacks essential technical skills...,57.0,60.0
5,cbc8a72c-2452-433c-838d-a9ee5fab7e69,325b809d-2e28-4d3a-9beb-7992dcb9ffd9,groq,30,40,60,80,40,50,70,80,"database security, risk assessment, compliance...",kiv,The candidate has strong experience in databas...,The candidate lacks essential technical skills...,42.0,51.0
7,cbc8a72c-2452-433c-838d-a9ee5fab7e69,ac863d97-9dac-4736-9198-4c5feb246f2f,groq,40,60,80,80,30,70,90,80,"project management, leadership, team managemen...",kiv,The candidate has strong leadership and manage...,The candidate lacks essential technical skills...,54.0,51.0
8,cbc8a72c-2452-433c-838d-a9ee5fab7e69,2d5c21af-5794-43ff-a6a5-bc97b91224f5,groq,70,60,80,80,75,65,85,80,"ability to work with big data, experience with...",kiv,The candidate has strong technical skills in m...,The candidate lacks experience in Data Platfor...,72.0,76.5
9,cbc8a72c-2452-433c-838d-a9ee5fab7e69,05366a81-5dbc-44ba-907e-84eb084817ce,groq,30,60,80,80,40,70,90,80,"leadership and people management skills, abili...",kiv,The candidate has strong leadership and people...,The candidate lacks essential technical skills...,48.0,57.0
11,cbc8a72c-2452-433c-838d-a9ee5fab7e69,e8f25172-07e1-45ab-b935-4e9a8ea3ea82,groq,40,60,80,60,40,80,90,60,"leadership and people management skills, abili...",kiv,"The candidate has excellent soft skills, inclu...",The candidate lacks essential technical skills...,52.0,56.0
12,cbc8a72c-2452-433c-838d-a9ee5fab7e69,6030173d-67f4-4601-abe8-c85dcf6ea745,groq,40,60,80,80,50,70,90,80,"project management, team management, technical...",kiv,The candidate has strong experience in IT and ...,The candidate lacks essential technical skills...,54.0,63.0
14,cbc8a72c-2452-433c-838d-a9ee5fab7e69,b4400273-d8c4-4739-aa8c-3b426078f589,groq,30,60,80,80,20,70,90,80,"leadership and people management skills, abili...",kiv,The candidate has strong leadership and people...,The candidate lacks essential technical skills...,48.0,45.0
19,cbc8a72c-2452-433c-838d-a9ee5fab7e69,af0e3df3-8a06-4878-b68d-5bcb06a36ed1,groq,40,60,80,60,30,70,90,60,"project management, leadership, data analysis",kiv,The candidate has strong experience in project...,The candidate lacks essential technical skills...,52.0,49.0


In [None]:
job_description