In [9]:
from dotenv import load_dotenv
import os
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import re
import time

load_dotenv(".env")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"FYP-Goo"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = os.environ.get('LANGCHAIN_API_KEY')

# Extraction

In [1]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("GooYeJui.pdf", extract_images=True)
pages = loader.load()

In [2]:
pages

[Document(page_content='Profile Summary\nIndustry ExperienceRelevant Skills\nGOO YE JUI\nA recent graduate with a strong foundation in both front-end and back-end development, combined with a\npassion for innovation and a commitment to making lives better through technology. Eager to contribute to\nthe vision of Unit Nukleus GovTech by leveraging technical expertise to empower the nation and enhance\ndigital government services. \nFull-stack web development (HTML 5, CSS,\nJavaScript, PHP, SQL, Python, .NET, React)\nNatural Language Processing: spaCy, NLTK,\nTensorFlow, PyTorch\nGenerative AI related : LangChain, Llama\nIndex\nPetronas Digital Sdn Bhd - Data Science Intern Sept 2023 - Jun 2024\nDeveloped (in 2 months) an Generative AI Based Resume Parser for Group HRM. The AI-powered parser\nautomates the extraction and parsing of candidate information from resumes, saving HR professionals\nvaluable time and effort in manually reviewing and categorizing resumes. Responsible for all fron

In [9]:
from typing import List, Optional
from langchain_core.pydantic_v1 import BaseModel, Field
class Candidate(BaseModel):
    """Information about a candidate from his/her resume."""

    # ^ Doc-string for the entity Person.
    # This doc-string is sent to the LLM as the description of the schema Person,
    # and it can help to improve extraction results.

    # Note that:
    # 1. Each field is an `optional` -- this allows the model to decline to extract it!
    # 2. Each field has a `description` -- this description is used by the LLM.
    # Having a good description can help improve extraction results.

    name: Optional[str] = Field(..., description="The name of the candidate")
    phone_number: Optional[str] = Field(
        ..., description="The phone number of the candidate"
    )
    email: Optional[str] = Field(
        ..., description="The email of the candidate"
    )
    local: Optional[str] = Field(
        ..., description="Is the candidate Malaysian(Yes or No)?"
    )
    expected_salary: Optional[str] = Field(
        ..., description="Candidate's expected salary in RM if known. (If the currency is Ringgit Malaysia, assign the numerical value or range values only Eg:'3000-3100'. If in other currency, assign alongside currency)"
    )
    current_location: Optional[List] = Field(
        ..., description="Candidate's current location if known. If the candidate does not mention the country, assign the country based on the state and city (return it in a python list containing dictionary format like this 'Country': '', 'State': '', 'City': '' )"
    )
    education_background: Optional[List] = Field(
        ..., description="Every single candidate's education background. (field of study, level (always expand to long forms), cgpa, university, Start Date, Year of Graduation (Year in 4-digits only, remove month). All in a python dict format."
    )
    professional_certificate: Optional[List] = Field(
        ..., description="Candidate's professional certificates if known"
    )
    skill_group: Optional[List] = Field(
        ..., description="Candidate's skill groups if known"
    )
    technology_programs_tool: Optional[List] = Field(
        ..., description="Technology (Tools, Program, System) that the candidate knows if known."
    )
    language: Optional[List] = Field(
        ..., description="Languages that the candidate knows"
    )
    previous_job_roles: Optional[List] = Field(
        ..., description="Every single one of the candidate's (job title, job company, Industries (strictly classify according to to The International Labour Organization), start date and end date (only assign date time format if available. Do not assign duration), job location, Job Duration (Years) (if not in years, convert to years)) (If duration is stated, update the job duration instead.) in a python dict format."
    )


In [12]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from datetime import datetime

# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples into the prompt template to improve extraction quality
# 2) Introduce additional parameters to take context into account (e.g., include metadata
#    about the document from which the text was extracted.)
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm with 20 years experience in the recruiting industry. You will be provided with candidate's resume. "
            "Extract relevant candidate's information mentioned in the following candidate's resume together with their properties. "
            "If you do not know the value of an attribute asked to extract, "
            "1) Please provide an accurate answers, no guessing."
            "2) Please return 'N/A' only if the information is not mentioned."
            "3) The response should strictly follow the Python dictionary format."
            "4) No need to return any reasoning as this is only for extraction of information."
            "5) Extracted Properties of all Start date and End date: "
            "* if the month is not stated, assume that start/end date is in the middle of the year. "
            "* should never include english words such as 'months', 'years', 'days'. "
            "* Instead, dates should be dates converted to the following format: "
            "* date values assigned are strictly in Python datetime format "
            """Strict Format of either one: 
                YYYY
                YYYY-MM or YYYYMM
                YYYY-MM-DD or YYYYMMDD
            6) Ensure that for any duration (year) calculation: 
            * Any end date that indicates "Present", refers to today's date, which is {current_date}. 
            * Do not assume the work experiences are continuous without breaks.
            * Method of duration calculation: Subtract the end date from start date to get the number of months. Finally sum up all relevant durations and convert to years. 
            * Triple check your calculations. ","""
        ),
        ("human", "{text}"),
    ]
)

llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.3)
runnable = prompt | llm.with_structured_output(schema=Candidate)
result = runnable.invoke({"text": pages,"current_date":datetime.now()})

In [74]:
result.dict()

{'name': 'GOO YE JUI',
 'phone_number': '+60184040438',
 'email': 'yjyejui626@gmail.com',
 'local': 'N/A',
 'expected_salary': 'N/A',
 'current_location': ['Country: Malaysia',
  'State: Penang',
  'City: Bukit Mertajam'],
 'education_background': [{'field_of_study': 'Bachelor Of Computer Science (Data Engineering)',
   'level': "Bachelor's Degree",
   'cgpa': '3.97',
   'university': 'Universiti Teknologi Malaysia',
   'start_date': '2020',
   'year_of_graduation': '2024'},
  {'field_of_study': 'Foundation in Science',
   'level': 'Foundation',
   'cgpa': '3.78',
   'university': 'Universiti Teknologi Malaysia',
   'start_date': '2019',
   'year_of_graduation': '2020'}],
 'professional_certificate': ['Microsoft Certified: Azure AI Fundamentals',
  'Google Data Analytics Certificate by Coursera',
  'Alteryx Foundational Micro-Credential',
  'Alteryx Designer Core Certification',
  'AWS Academy Graduate - AWS Academy Cloud Foundations',
  'AWS Academy Graduate - AWS Academy Machine Lear

In [84]:
import pandas as pd

# Convert Candidate object to dictionary
candidate_dict = result.dict()

# Create DataFrame with one row using the dictionary
df = pd.DataFrame([candidate_dict])
df.to_excel('results.xlsx')

In [83]:
print(df['education_background'][0])

[{'field_of_study': 'Bachelor Of Computer Science (Data Engineering)', 'level': "Bachelor's Degree", 'cgpa': '3.97', 'university': 'Universiti Teknologi Malaysia', 'start_date': '2020', 'year_of_graduation': '2024'}, {'field_of_study': 'Foundation in Science', 'level': 'Foundation', 'cgpa': '3.78', 'university': 'Universiti Teknologi Malaysia', 'start_date': '2019', 'year_of_graduation': '2020'}]


In [90]:
import secrets,string
batch_token = "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(12))
batch_token

'9zuTpclGO06S'

# Evaluation

In [1]:
import pandas as pd

data_dict = pd.read_excel('results.xlsx',index_col=0)
data_dict

Unnamed: 0,name,phone_number,email,local,expected_salary,current_location,education_background,professional_certificate,technical_skill,technology_programs_tool,language,previous_job_roles
0,Ang Teik Hun,60124773683,teikhun0422@hotmail.com,Yes,,"['Country: Malaysia', 'State: Penang', 'City: ...","[{'field_of_study': 'Applied Data Analytics', ...","['Microsoft Certified: Azure Fundamentals', 'M...","['Python', 'R', 'SQL']",['Power BI'],[],"[{'job_title': 'Data Scientist', 'job_company'..."
1,GOO YE JUI,60184040438,yjyejui626@gmail.com,,,"['Country: Malaysia', 'State: Penang', 'City: ...",[{'field_of_study': 'Bachelor Of Computer Scie...,"['Microsoft Certified: Azure AI Fundamentals',...","['Full-stack web development (HTML 5, CSS, Jav...",[],"['English', 'Mandarin', 'Malay', 'French']","[{'job_title': 'Data Science Intern', 'job_com..."


In [24]:
import re
import time
import pandas as pd
import openai
from openai import OpenAI

def evaluate_education_background(row, input, weightage):
    max_retries = 5
    retry_count = 0
    
    try:
        edu_prompt_system = f"""[Instruction] You will be provided with details such as the preferred field of study, job_title, and the candidate's field of study.
        Please act as an impartial judge and evaluate the candidate's field of study based on the job title and preferred education background. For this evaluation, you should primarily consider the following accuracy:
        [Accuracy]
        Score 1: The candidate's field of study is completely unrelated to {input} and the job title stated.
        Score 3: The candidate's field of study has minor relevance but does not align with {input} and the job title stated.
        Score 5: The candidate's field of study has moderate relevance but contains inaccuracies to {input} and the job title stated.
        Score 7: The candidate's field of study aligns with {input} and the job title stated but has minor errors or omissions on either one of them.
        Score 10: The candidate's field of study is completely accurate and aligns very well with {input} and the job title stated.
        
        [Rules]
        1. If the candidate has several education background, you should always consider the most related to {input} and the job title only.
        2. You should always ignore those that are unrelated to {input} and the job title and make sure they do not affect the total scoring.
        3. You should only assess the candidate's Field of Study and it's level. Ignore any other criterias.

        [Steps]
        Step 1 : Start the evaluation by giving reasons, Be as objective as possible.
        Step 2 : You must rate the candidate on a scale of 1 to 10 by strictly following this format: "[[rating]]", 
        for example:
        "Education Background Rating: [[6]].

        [Question]
        How will you rate the candidate's education background based on the provided job title with preferred education background?
        """

        edu_prompt_user = f"""
        Preferred Field of Study: {input}
        
        Job Title: {job_title}

        [The Start of Candidate's Education Background]
        {row['education_background']}
        [The End of Candidate's Education Background]
        """
        
        client = OpenAI()
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": edu_prompt_system},
                {"role": "user", "content": edu_prompt_user}
            ],
            model="gpt-3.5-turbo-0125",
            temperature=0.3,
            n=3,
        )
        
        print("Response from edu", response)
        
    except openai.RateLimitError as e:
        print(f"OpenAI rate limit exceeded. Pausing for one minute before resuming... (From RateLimitError)")
        print(e)
        time.sleep(30)
        retry_count += 1

        if retry_count >= max_retries:
            print("Exceeded maximum retries for evaluating education background.... (From RateLimitError)")
            return response
    
    # Extract the number using regex
    def extract_gpt_response_rating(response):
        ratings = []
        pattern = r'\[\[([\d]+)\]\]'

        for i in range(len(response.choices)):
            match = re.search(pattern, response.choices[i].message.content)
            if match:
                rating = int(match.group(1))
                ratings.append(rating)
            else:
                # ratings = 0
                ratings.append(0)
        return ratings
    
    # Calculate average rating
    def calculate_average_rating(ratings):
        if not ratings:
            return 0
        return round(sum(ratings) / len(ratings))

    # Calculate weighted score
    def calculate_weighted_score(average_rating, weightage):
        if average_rating is None:
            return 0
        return round(average_rating / 10 * weightage)
            
    edu_rating = extract_gpt_response_rating(response)
    average_rating = calculate_average_rating(edu_rating)
    edu_weighted_score = calculate_weighted_score(average_rating, weightage)
    
    print(f"Candidate: {row['name']}\t\t1. EDU Score:{edu_weighted_score}/{weightage}\t C: refer data_dict E: {input}\t ")
    
    return edu_weighted_score

# Example usage:
# Assuming 'df' is your dataframe containing the data
# input and weightage are assumed to be defined earlier
input = "Bachelor's Degree in Data Science or Computer Science"
job_title = "Executive (Data Scientist)"
data_dict['education_background_score'] = data_dict.apply(lambda row: evaluate_education_background(row, input, 20), axis=1)


Response from edu ChatCompletion(id='chatcmpl-9LUAnl6RssJDRM1F1beTBp2XDF7dl', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Step 1: The candidate's field of study is in Applied Data Analytics at the Master's level, which is a closely related field to Data Science. However, it is not a Bachelor's Degree in Data Science or Computer Science as preferred. There is a minor relevance between the candidate's field of study and the job title of Executive (Data Scientist).\n\nStep 2: Education Background Rating: [[5]].", role='assistant', function_call=None, tool_calls=None)), Choice(finish_reason='stop', index=1, logprobs=None, message=ChatCompletionMessage(content="Step 1: The candidate's field of study is in Applied Data Analytics at the Master's level, which is a related field to Data Science. However, the candidate's education background is at the Master's level, not at the Bachelor's level as required. Additionally, the job title is f

In [None]:
data_dict['education_background'][0] = '[{\'field_of_study\': \'Applied Data Analytics\', \'level\': "Master\'s", \'cgpa\': \'6.42\', \'university\': \'Australian National University\', \'Start Date\': \'2021-07\', \'year_of_graduation\': \'2022\'}]'


In [11]:
import ast
import re

def evaluate_cgpa(data_dict,input_cgpa, weightage):
    out_weighted_cgpa_score = 0.0
    c_cgpa = 0 #total 

    def get_normalize_cgpa(cgpa_str,standard_scale = 4.0):
        # Regex pattern to match CGPA values and their max scales
        pattern = r'(\d+(?:\.\d+)?)(?:/(\d+(?:\.\d+)?))?'

        # Searching for the pattern in the text
        match = re.search(pattern, cgpa_str)
        if match:
            cgpa = float(match.group(1))
            max_cgpa = float(match.group(2)) if match.group(2) else standard_scale

            print(cgpa,max_cgpa)

            # Normalize CGPA to the standard scale
            normalized_cgpa = (cgpa / max_cgpa) * standard_scale
            print (f"""normalised cgpa:  {normalized_cgpa}, raw cgpa extracted: {cgpa_str}""")
            return normalized_cgpa
        else: # if N/A in resume, cpga -> 0.0 
            print ("normalised cgpa:  CPGA not found. Default CGPA = 0.0/4.0")
            return float("0")


    if 'education_background' not in data_dict:
        print(f"Candidate: {data_dict['name']}\t\t 2. CGPA Score:{out_weighted_cgpa_score}/{weightage}\t C CGPA(normalised): {c_cgpa} VS E: {input_cgpa} \t ")
        return 0.4 * weightage
    else: 
        print ("CGPA method 2: Getting latest available cgpa")
        data_list = ast.literal_eval(data_dict.education_background)
        print(data_list)
        data_list.sort(key=lambda x: int(x['year_of_graduation']), reverse=True)
        print(data_list[0]['cgpa'])
        if data_list[0]['cgpa']  != "N/A" :
            c_cgpa = get_normalize_cgpa(data_list[0]['cgpa'])

        if float(c_cgpa) >= float(input_cgpa):
            out_weighted_cgpa_score = 1.0 * weightage
        else:
            out_weighted_cgpa_score = 0.4 * weightage
        print(f"Candidate: {data_dict['name']}\t\t 2. CGPA Score:{out_weighted_cgpa_score}/{weightage}\t C CGPA(normalised): {c_cgpa} VS E: {input_cgpa} \t ")

    return out_weighted_cgpa_score

input = "3.5"
data_dict['cgpa_score'] = data_dict.apply(lambda row: evaluate_cgpa(row, input, 20), axis=1)

CGPA method 2: Getting latest available cgpa
[{'field_of_study': 'Applied Data Analytics', 'level': "Master's", 'cgpa': '6.42', 'university': 'Australian National University', 'Start Date': '2021-07', 'year_of_graduation': '2022'}]
6.42
6.42 4.0
normalised cgpa:  6.42, raw cgpa extracted: 6.42
Candidate: Ang Teik Hun		 2. CGPA Score:20.0/20	 C CGPA(normalised): 6.42 VS E: 3.5 	 
CGPA method 2: Getting latest available cgpa
[{'field_of_study': 'Bachelor Of Computer Science (Data Engineering)', 'level': "Bachelor's Degree", 'cgpa': '3.97', 'university': 'Universiti Teknologi Malaysia', 'start_date': '2020', 'year_of_graduation': '2024'}, {'field_of_study': 'Foundation in Science', 'level': 'Foundation', 'cgpa': '3.78', 'university': 'Universiti Teknologi Malaysia', 'start_date': '2019', 'year_of_graduation': '2020'}]
3.97
3.97 4.0
normalised cgpa:  3.97, raw cgpa extracted: 3.97
Candidate: GOO YE JUI		 2. CGPA Score:20.0/20	 C CGPA(normalised): 3.97 VS E: 3.5 	 


In [3]:
job_title = "Executive (Data Scientist)"
job_description = """Responsibilities:
Lead and mentor a team of data scientists and analysts, providing guidance and support to ensure high-quality deliverables
Collaborate with cross-functional teams to define data-driven strategies and objectives
Develop advanced statistical models and machine learning algorithms to analyze and interpret complex datasets
Design and implement data pipelines and workflows to streamline data collection, processing, and analysis
Identify and explore new data sources and technologies to enhance our analytical capabilities
Communicate findings and recommendations to key stakeholders through compelling visualizations, presentations, and reports
Stay current with the latest developments in data science and technology, and proactively identify opportunities for innovation and improvement
"""
job_requirement = """Qualifications:
Bachelor's degree in Computer Science, Statistics, Mathematics, or a related field; advanced degree preferred
 years of experience in data science or a related field, with a proven track record of leading successful projects and initiatives
Proficiency in programming languages such as Python, R, or SQL, and experience with data analysis and visualization libraries (e.g., pandas, scikit-learn, matplotlib, seaborn)
Strong understanding of statistical methods, machine learning techniques, and data mining algorithms
Experience with big data technologies (e.g., Hadoop, Spark) and cloud platforms (e.g., AWS, Azure, GCP) preferred
Excellent communication and interpersonal skills, with the ability to effectively collaborate with cross-functional teams and communicate complex technical concepts to non-technical stakeholders
Proven leadership abilities, with the ability to inspire and motivate team members to achieve common goals
"""

In [4]:
from assess_criteria_class import JobParser


JD = JobParser(job_title,job_description,job_requirement)
JD_skills = JD.extract_additional_skills()
JD_skills

printing skills from jobdescription ['Python', 'R', 'SQL', 'pandas', 'scikit-learn', 'matplotlib', 'seaborn', 'Hadoop', 'Spark', 'AWS', 'Azure', 'GCP', 'statistical methods', 'machine learning techniques', 'data mining algorithms', 'communication skills', 'leadership abilities']


<assess_criteria_class.JobParser at 0x28d20df2930>

In [5]:
def evaluate_skill_groups(self,data_dict,input,weightage):
    JD = JobParser(self.pdf_dir, self.job_title)
    JD_skills = JD.extract_additional_skills()
    result_list = [skill.strip().lower() for skill in input.split(",")]
    data_dict_lower = [x.lower() for x in data_dict['Skill Groups']]
    # Convert all strings in the list to lowercase
    jd_skills_lower = [x.lower() for x in JD_skills.jd_skills]

    if not data_dict_lower or (len(data_dict_lower) == 1 and data_dict_lower[0] == 'N/A'):  # If the list is empty or contains only 'N/A'
        pos = list(data_dict.keys()).index('Skill Groups')
        items = list(data_dict.items())
        items.insert(pos+1, ('Skill Groups Score', 0))
        data_dict = dict(items)
        return data_dict
            
    #Define embeddings model
    embeddings_model = AzureOpenAIEmbeddings(azure_deployment='ada002')

    #Embeds both list
    embedding1 = embeddings_model.embed_documents(data_dict_lower) #candidate skill groups
    embedding2 = embeddings_model.embed_documents(jd_skills_lower+result_list) #required skill groups

    #Calculate the cosine similarity score from embeddings
    similarity_test = cosine_similarity(embedding1,embedding2)

    def similarity_range_score(similarity_scores):
        categorical_scores = []

        for score in similarity_scores:
            if score >= 0.88:
                categorical_scores.append(1.0)
            elif score >= 0.85:
                categorical_scores.append(0.5)
            elif score >= 0.8:
                categorical_scores.append(0.3)
            else:
                categorical_scores.append(0.0)
        print(categorical_scores)

        return categorical_scores

        
    res = round(np.mean(similarity_range_score(similarity_test.max(axis=0)))*weightage,2)
    
    
    pos = list(data_dict.keys()).index('Skill Groups')
    items = list(data_dict.items())
    items.insert(pos+1, ('Skill Groups Score', res))
    data_dict = dict(items)
    
    print(f"Candidate: {data_dict['Name']}\t\t3. SkillGroup Score:{res}/{weightage}\tC similairty score: {res} E: {input} \t ")
        
    return data_dict

<assess_criteria_class.JobParser at 0x28d20df2930>