In [23]:
from dotenv import load_dotenv
import os
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import re
import time

load_dotenv(".env")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"FYP-Goo"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = os.environ.get('LANGCHAIN_API_KEY')

# Extraction

In [25]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("GooYeJui.pdf", extract_images=True)
pages = loader.load()

In [26]:
pages

[Document(page_content='Profile Summary\nIndustry ExperienceRelevant Skills\nGOO YE JUI\nA recent graduate with a strong foundation in both front-end and back-end development, combined with a\npassion for innovation and a commitment to making lives better through technology. Eager to contribute to\nthe vision of Unit Nukleus GovTech by leveraging technical expertise to empower the nation and enhance\ndigital government services. \nFull-stack web development (HTML 5, CSS,\nJavaScript, PHP, SQL, Python, .NET, React)\nNatural Language Processing: spaCy, NLTK,\nTensorFlow, PyTorch\nGenerative AI related : LangChain, Llama\nIndex\nPetronas Digital Sdn Bhd - Data Science Intern Sept 2023 - Jun 2024\nDeveloped (in 2 months) an Generative AI Based Resume Parser for Group HRM. The AI-powered parser\nautomates the extraction and parsing of candidate information from resumes, saving HR professionals\nvaluable time and effort in manually reviewing and categorizing resumes. Responsible for all fron

In [120]:
from typing import List, Optional
from langchain_core.pydantic_v1 import BaseModel, Field
class Candidate(BaseModel):
    """Information about a candidate from his/her resume."""

    # ^ Doc-string for the entity Person.
    # This doc-string is sent to the LLM as the description of the schema Person,
    # and it can help to improve extraction results.

    # Note that:
    # 1. Each field is an `optional` -- this allows the model to decline to extract it!
    # 2. Each field has a `description` -- this description is used by the LLM.
    # Having a good description can help improve extraction results.

    name: Optional[str] = Field(..., description="The name of the candidate")
    phone_number: Optional[str] = Field(
        ..., description="The phone number of the candidate"
    )
    email: Optional[str] = Field(
        ..., description="The email of the candidate"
    )
    local: Optional[str] = Field(
        ..., description="Is the candidate Malaysian(Yes or No)?"
    )
    expected_salary: Optional[str] = Field(
        ..., description="Candidate's expected salary in RM if known. (If the currency is Ringgit Malaysia, assign the numerical value or range values only Eg:'3000-3100'. If in other currency, assign alongside currency)"
    )
    current_location: Optional[List] = Field(
        ..., description="Candidate's current location if known. If the candidate does not mention the country, assign the country based on the state and city. Return it in a python dict format with these three keys, example: {'Country': '', 'State': '', 'City': ''} "
    )
    education_background: Optional[List] = Field(
        ..., description="Every single candidate's education background. (field_of_study, level (always expand to long forms), cgpa (Example: 3.5/4.0), university, start_date, year_of_graduation (Year in 4-digits only, remove month). All in a python dict format."
    )
    professional_certificate: Optional[List] = Field(
        ..., description="Candidate's professional certificates stated in the resume, return each certificate as a string in a python list."
    )
    skill_group: Optional[List] = Field(
        ..., description="Every single candidate's skill groups stated in the resume, return each skills as a string in a python list."
    )
    technology_programs_tool: Optional[List] = Field(
        ..., description="Every single candidate's Technology (Tools, Program, System) related to job title stated in the resume, return each technology as a string in a python list."
    )
    language: Optional[List] = Field(
        ..., description="Languages that is stated in the resume, return each language as a string in a python list."
    )
    previous_job_roles: Optional[List] = Field(
        ..., description="Every single one of the candidate's (job_title, job_company, Industries (strictly classify according to to The International Labour Organization), start_date and end_date (only assign date time format if available. Do not assign duration), job_location, job_duration (return the job_duration in years), return in a python dict format."
    )


In [121]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from datetime import datetime

# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples into the prompt template to improve extraction quality
# 2) Introduce additional parameters to take context into account (e.g., include metadata
#    about the document from which the text was extracted.)
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm with 20 years experience in the recruiting industry. You will be provided with candidate's resume. \n"
            "Extract relevant candidate's information mentioned in the following candidate's resume together with their properties. \n"
            "1) Please provide an accurate answers, no guessing. \n"
            "2) Please return 'N/A' only if the information is not mentioned. \n"
            "3) No need to return any reasoning as this is only for extraction of information. \n"
            "4) Extracted Properties of all Start date and End date: \n"
            "* if the month is not stated, assume that start/end date is in the middle of the year. \n"
            "* should never include english words such as 'months', 'years', 'days'. \n"
            "* Instead, dates should be dates converted to the following format: \n"
            "* date values assigned are strictly in Python datetime format. \n"
            """Strict Format of either one: 
                YYYY
                YYYY-MM or YYYYMM
                YYYY-MM-DD or YYYYMMDD
            6) Ensure that for any duration (year) calculation: 
            * Any end date that indicates "Present", refers to today's date, which is {current_date}. 
            * Do not assume the work experiences are continuous without breaks.
            * Method of duration calculation: Subtract the end date from start date to get the number of months. Finally sum up all relevant durations and convert to years. 
            * Triple check your calculations. ","""
        ),
        ("human", "{text}"),
    ]
)

llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.3)
runnable = prompt | llm.with_structured_output(schema=Candidate)
result = runnable.invoke({"text": pages,"current_date":datetime.now()})

In [122]:
result

Candidate(name='GOO YE JUI', phone_number='+60184040438', email='yjyejui626@gmail.com', local='N/A', expected_salary='N/A', current_location=[{'Country': 'Malaysia', 'State': 'Penang', 'City': 'Bukit Mertajam'}], education_background=[{'field_of_study': 'Bachelor Of Computer Science (Data Engineering)', 'level': "Bachelor's Degree", 'cgpa': '3.97', 'university': 'Universiti Teknologi Malaysia', 'start_date': '2020', 'year_of_graduation': '2024'}, {'field_of_study': 'Foundation in Science', 'level': 'Foundation', 'cgpa': '3.78', 'university': 'Universiti Teknologi Malaysia', 'start_date': '2019', 'year_of_graduation': '2020'}], professional_certificate=['Microsoft Certified: Azure AI Fundamentals', 'Google Data Analytics Certificate by Coursera', 'Alteryx Foundational Micro-Credential', 'Alteryx Designer Core Certification', 'AWS Academy Graduate - AWS Academy Cloud Foundations', 'AWS Academy Graduate - AWS Academy Machine Learning Foundations', 'AWS Academy Graduate - AWS Academy Data 

In [123]:
result.dict()

{'name': 'GOO YE JUI',
 'phone_number': '+60184040438',
 'email': 'yjyejui626@gmail.com',
 'local': 'N/A',
 'expected_salary': 'N/A',
 'current_location': [{'Country': 'Malaysia',
   'State': 'Penang',
   'City': 'Bukit Mertajam'}],
 'education_background': [{'field_of_study': 'Bachelor Of Computer Science (Data Engineering)',
   'level': "Bachelor's Degree",
   'cgpa': '3.97',
   'university': 'Universiti Teknologi Malaysia',
   'start_date': '2020',
   'year_of_graduation': '2024'},
  {'field_of_study': 'Foundation in Science',
   'level': 'Foundation',
   'cgpa': '3.78',
   'university': 'Universiti Teknologi Malaysia',
   'start_date': '2019',
   'year_of_graduation': '2020'}],
 'professional_certificate': ['Microsoft Certified: Azure AI Fundamentals',
  'Google Data Analytics Certificate by Coursera',
  'Alteryx Foundational Micro-Credential',
  'Alteryx Designer Core Certification',
  'AWS Academy Graduate - AWS Academy Cloud Foundations',
  'AWS Academy Graduate - AWS Academy Ma

In [126]:
import pandas as pd

# Convert Candidate object to dictionary
candidate_dict = result.dict()

# Create DataFrame with one row using the dictionary
df = pd.DataFrame([candidate_dict])
df.to_excel('results.xlsx')

In [31]:
print(df['education_background'][0])

[{'field_of_study': 'Bachelor Of Computer Science (Data Engineering)', 'level': "Bachelor's Degree", 'cgpa': 3.97, 'university': 'Universiti Teknologi Malaysia', 'start_date': '2020', 'year_of_graduation': '2024'}, {'field_of_study': 'Foundation in Science', 'level': 'Foundation', 'cgpa': 3.78, 'university': 'Universiti Teknologi Malaysia', 'start_date': '2019', 'year_of_graduation': '2020'}]


In [90]:
import secrets,string
batch_token = "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(12))
batch_token

'9zuTpclGO06S'

# Evaluation

In [127]:
import pandas as pd

data_dict = pd.read_excel('results.xlsx',index_col=0)
data_dict

Unnamed: 0,name,phone_number,email,local,expected_salary,current_location,education_background,professional_certificate,skill_group,technology_programs_tool,language,previous_job_roles
0,GOO YE JUI,60184040438,yjyejui626@gmail.com,,,"[{'Country': 'Malaysia', 'State': 'Penang', 'C...",[{'field_of_study': 'Bachelor Of Computer Scie...,"['Microsoft Certified: Azure AI Fundamentals',...","['Full-stack web development', 'Natural Langua...","['HTML 5', 'CSS', 'JavaScript', 'PHP', 'SQL', ...","['English', 'Mandarin', 'Malay', 'French']","[{'job_title': 'Data Science Intern', 'job_com..."


In [33]:
# function to parse range inputs 
def parse_range(input_string):
    """
    Parses the range string.

    Args:
    input_string: A string containing formats like "<5.6", ">5", "=5.0", or "2.0-5".

    Returns:
    tuple: A tuple containing the lower limit, upper limit, and condition.
    """
    match = re.match(r'^\s*(<|>|=)?\s*([0-9]+(?:\.[0-9]+)?)(?:\s*-\s*([0-9]+(?:\.[0-9]+)?))?\s*$', input_string)
    condition = ""
    in_threshold_lower_limit = 0
    in_threshold_upper_limit = 99999

    if match:
        condition = match.group(1)
        values = match.group(2)

        if condition == "<":
            in_threshold_upper_limit = float(values)
        elif condition == ">":
            in_threshold_lower_limit = float(values)
        elif condition == "=":
            in_threshold_lower_limit = in_threshold_upper_limit = float(values)
        elif match.group(3): # range 
            condition = "range"
            in_threshold_lower_limit = float(values)
            in_threshold_upper_limit = float(match.group(3))
        else: # exact value, same as "="
            condition = "="
            in_threshold_lower_limit = in_threshold_upper_limit = float(values)
        # print(f"\tLower Limit: {in_threshold_lower_limit}, Upper Limit: {in_threshold_upper_limit}, Condition: {condition}")
        
    else:
        # print (f"\tVal = {input_string}  Parse Range funtion detected: Invalid input format")
        in_threshold_lower_limit, in_threshold_upper_limit = 0, 9999999

    return in_threshold_lower_limit, in_threshold_upper_limit, condition
    # # Example usage
    # input_string = "11.59-888"
    # lower_limit, upper_limit, condition = parse_range(input_string)

In [24]:
import re
import time
import pandas as pd
import openai
from openai import OpenAI

def evaluate_education_background(row, input, weightage):
    max_retries = 5
    retry_count = 0
    
    try:
        edu_prompt_system = f"""[Instruction] You will be provided with details such as the preferred field of study, job_title, and the candidate's field of study.
        Please act as an impartial judge and evaluate the candidate's field of study based on the job_title and preferred education background. For this evaluation, you should primarily consider the following accuracy:
        [Accuracy]
        Score 1: The candidate's field of study is completely unrelated to {input} and the job_title stated.
        Score 3: The candidate's field of study has minor relevance but does not align with {input} and the job_title stated.
        Score 5: The candidate's field of study has moderate relevance but contains inaccuracies to {input} and the job_title stated.
        Score 7: The candidate's field of study aligns with {input} and the job_title stated but has minor errors or omissions on either one of them.
        Score 10: The candidate's field of study is completely accurate and aligns very well with {input} and the job_title stated.
        
        [Rules]
        1. If the candidate has several education background, you should always consider the most related to {input} and the job_title only.
        2. You should always ignore those that are unrelated to {input} and the job_title and make sure they do not affect the total scoring.
        3. You should only assess the candidate's Field of Study and it's level. Ignore any other criterias.

        [Steps]
        Step 1 : Start the evaluation by giving reasons, Be as objective as possible.
        Step 2 : You must rate the candidate on a scale of 1 to 10 by strictly following this format: "[[rating]]", 
        for example:
        "Education Background Rating: [[6]].

        [Question]
        How will you rate the candidate's education background based on the provided job_title with preferred education background?
        """

        edu_prompt_user = f"""
        Preferred Field of Study: {input}
        
        job_title: {job_title}

        [The Start of Candidate's Education Background]
        {row['education_background']}
        [The End of Candidate's Education Background]
        """
        
        client = OpenAI()
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": edu_prompt_system},
                {"role": "user", "content": edu_prompt_user}
            ],
            model="gpt-3.5-turbo-0125",
            temperature=0.3,
            n=3,
        )
        
        print("Response from edu", response)
        
    except openai.RateLimitError as e:
        print(f"OpenAI rate limit exceeded. Pausing for one minute before resuming... (From RateLimitError)")
        print(e)
        time.sleep(30)
        retry_count += 1

        if retry_count >= max_retries:
            print("Exceeded maximum retries for evaluating education background.... (From RateLimitError)")
            return response
    
    # Extract the number using regex
    def extract_gpt_response_rating(response):
        ratings = []
        pattern = r'\[\[([\d]+)\]\]'

        for i in range(len(response.choices)):
            match = re.search(pattern, response.choices[i].message.content)
            if match:
                rating = int(match.group(1))
                ratings.append(rating)
            else:
                # ratings = 0
                ratings.append(0)
        return ratings
    
    # Calculate average rating
    def calculate_average_rating(ratings):
        if not ratings:
            return 0
        return round(sum(ratings) / len(ratings))

    # Calculate weighted score
    def calculate_weighted_score(average_rating, weightage):
        if average_rating is None:
            return 0
        return round(average_rating / 10 * weightage)
            
    edu_rating = extract_gpt_response_rating(response)
    average_rating = calculate_average_rating(edu_rating)
    edu_weighted_score = calculate_weighted_score(average_rating, weightage)
    
    print(f"Candidate: {row['name']}\t\t1. EDU Score:{edu_weighted_score}/{weightage}\t C: refer data_dict E: {input}\t ")
    
    return edu_weighted_score

# Example usage:
# Assuming 'df' is your dataframe containing the data
# input and weightage are assumed to be defined earlier
input = "Bachelor's Degree in Data Science or Computer Science"
job_title = "Executive (Data Scientist)"
data_dict['education_background_score'] = data_dict.apply(lambda row: evaluate_education_background(row, input, 20), axis=1)


Response from edu ChatCompletion(id='chatcmpl-9LUAnl6RssJDRM1F1beTBp2XDF7dl', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Step 1: The candidate's field of study is in Applied Data Analytics at the Master's level, which is a closely related field to Data Science. However, it is not a Bachelor's Degree in Data Science or Computer Science as preferred. There is a minor relevance between the candidate's field of study and the job title of Executive (Data Scientist).\n\nStep 2: Education Background Rating: [[5]].", role='assistant', function_call=None, tool_calls=None)), Choice(finish_reason='stop', index=1, logprobs=None, message=ChatCompletionMessage(content="Step 1: The candidate's field of study is in Applied Data Analytics at the Master's level, which is a related field to Data Science. However, the candidate's education background is at the Master's level, not at the Bachelor's level as required. Additionally, the job title is f

In [None]:
data_dict['education_background'][0] = '[{\'field_of_study\': \'Applied Data Analytics\', \'level\': "Master\'s", \'cgpa\': \'6.42\', \'university\': \'Australian National University\', \'Start Date\': \'2021-07\', \'year_of_graduation\': \'2022\'}]'


In [11]:
import ast
import re

def evaluate_cgpa(data_dict,input_cgpa, weightage):
    out_weighted_cgpa_score = 0.0
    c_cgpa = 0 #total 

    def get_normalize_cgpa(cgpa_str,standard_scale = 4.0):
        # Regex pattern to match CGPA values and their max scales
        pattern = r'(\d+(?:\.\d+)?)(?:/(\d+(?:\.\d+)?))?'

        # Searching for the pattern in the text
        match = re.search(pattern, cgpa_str)
        if match:
            cgpa = float(match.group(1))
            max_cgpa = float(match.group(2)) if match.group(2) else standard_scale

            print(cgpa,max_cgpa)

            # Normalize CGPA to the standard scale
            normalized_cgpa = (cgpa / max_cgpa) * standard_scale
            print (f"""normalised cgpa:  {normalized_cgpa}, raw cgpa extracted: {cgpa_str}""")
            return normalized_cgpa
        else: # if N/A in resume, cpga -> 0.0 
            print ("normalised cgpa:  CPGA not found. Default CGPA = 0.0/4.0")
            return float("0")


    if 'education_background' not in data_dict:
        print(f"Candidate: {data_dict['name']}\t\t 2. CGPA Score:{out_weighted_cgpa_score}/{weightage}\t C CGPA(normalised): {c_cgpa} VS E: {input_cgpa} \t ")
        return 0.4 * weightage
    else: 
        print ("CGPA method 2: Getting latest available cgpa")
        data_list = ast.literal_eval(data_dict.education_background)
        print(data_list)
        data_list.sort(key=lambda x: int(x['year_of_graduation']), reverse=True)
        print(data_list[0]['cgpa'])
        if data_list[0]['cgpa']  != "N/A" :
            c_cgpa = get_normalize_cgpa(data_list[0]['cgpa'])

        if float(c_cgpa) >= float(input_cgpa):
            out_weighted_cgpa_score = 1.0 * weightage
        else:
            out_weighted_cgpa_score = 0.4 * weightage
        print(f"Candidate: {data_dict['name']}\t\t 2. CGPA Score:{out_weighted_cgpa_score}/{weightage}\t C CGPA(normalised): {c_cgpa} VS E: {input_cgpa} \t ")

    return out_weighted_cgpa_score

input = "3.5"
data_dict['cgpa_score'] = data_dict.apply(lambda row: evaluate_cgpa(row, input, 20), axis=1)

CGPA method 2: Getting latest available cgpa
[{'field_of_study': 'Applied Data Analytics', 'level': "Master's", 'cgpa': '6.42', 'university': 'Australian National University', 'Start Date': '2021-07', 'year_of_graduation': '2022'}]
6.42
6.42 4.0
normalised cgpa:  6.42, raw cgpa extracted: 6.42
Candidate: Ang Teik Hun		 2. CGPA Score:20.0/20	 C CGPA(normalised): 6.42 VS E: 3.5 	 
CGPA method 2: Getting latest available cgpa
[{'field_of_study': 'Bachelor Of Computer Science (Data Engineering)', 'level': "Bachelor's Degree", 'cgpa': '3.97', 'university': 'Universiti Teknologi Malaysia', 'start_date': '2020', 'year_of_graduation': '2024'}, {'field_of_study': 'Foundation in Science', 'level': 'Foundation', 'cgpa': '3.78', 'university': 'Universiti Teknologi Malaysia', 'start_date': '2019', 'year_of_graduation': '2020'}]
3.97
3.97 4.0
normalised cgpa:  3.97, raw cgpa extracted: 3.97
Candidate: GOO YE JUI		 2. CGPA Score:20.0/20	 C CGPA(normalised): 3.97 VS E: 3.5 	 


In [1]:
job_title = "Executive (Data Scientist)"
job_description = """Responsibilities:
Lead and mentor a team of data scientists and analysts, providing guidance and support to ensure high-quality deliverables
Collaborate with cross-functional teams to define data-driven strategies and objectives
Develop advanced statistical models and machine learning algorithms to analyze and interpret complex datasets
Design and implement data pipelines and workflows to streamline data collection, processing, and analysis
Identify and explore new data sources and technologies to enhance our analytical capabilities
Communicate findings and recommendations to key stakeholders through compelling visualizations, presentations, and reports
Stay current with the latest developments in data science and technology, and proactively identify opportunities for innovation and improvement
"""
job_requirement = """Qualifications:
Bachelor's degree in Computer Science, Statistics, Mathematics, or a related field; advanced degree preferred
 years of experience in data science or a related field, with a proven track record of leading successful projects and initiatives
Proficiency in programming languages such as Python, R, or SQL, and experience with data analysis and visualization libraries (e.g., pandas, scikit-learn, matplotlib, seaborn)
Strong understanding of statistical methods, machine learning techniques, and data mining algorithms
Experience with big data technologies (e.g., Hadoop, Spark) and cloud platforms (e.g., AWS, Azure, GCP) preferred
Excellent communication and interpersonal skills, with the ability to effectively collaborate with cross-functional teams and communicate complex technical concepts to non-technical stakeholders
Proven leadership abilities, with the ability to inspire and motivate team members to achieve common goals
"""

In [4]:
from assess_criteria_class import JobParser


JD = JobParser(job_title,job_description,job_requirement)
JD_skills = JD.extract_additional_skills()
JD_skills

printing skills from jobdescription ['Python', 'R', 'SQL', 'pandas', 'scikit-learn', 'matplotlib', 'seaborn', 'Hadoop', 'Spark', 'AWS', 'Azure', 'GCP', 'statistical methods', 'machine learning techniques', 'data mining algorithms', 'communication skills', 'leadership abilities']


<assess_criteria_class.JobParser at 0x28d20df2930>

In [16]:
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from assess_criteria_class import JobParser
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def evaluate_skill_groups(data_dict,input,weightage):
    job_title = "Executive (Data Scientist)"
    job_description = """Responsibilities:
    Lead and mentor a team of data scientists and analysts, providing guidance and support to ensure high-quality deliverables
    Collaborate with cross-functional teams to define data-driven strategies and objectives
    Develop advanced statistical models and machine learning algorithms to analyze and interpret complex datasets
    Design and implement data pipelines and workflows to streamline data collection, processing, and analysis
    Identify and explore new data sources and technologies to enhance our analytical capabilities
    Communicate findings and recommendations to key stakeholders through compelling visualizations, presentations, and reports
    Stay current with the latest developments in data science and technology, and proactively identify opportunities for innovation and improvement
    """
    job_requirement = """Qualifications:
    Bachelor's degree in Computer Science, Statistics, Mathematics, or a related field; advanced degree preferred
    years of experience in data science or a related field, with a proven track record of leading successful projects and initiatives
    Proficiency in programming languages such as Python, R, or SQL, and experience with data analysis and visualization libraries (e.g., pandas, scikit-learn, matplotlib, seaborn)
    Strong understanding of statistical methods, machine learning techniques, and data mining algorithms
    Experience with big data technologies (e.g., Hadoop, Spark) and cloud platforms (e.g., AWS, Azure, GCP) preferred
    Excellent communication and interpersonal skills, with the ability to effectively collaborate with cross-functional teams and communicate complex technical concepts to non-technical stakeholders
    Proven leadership abilities, with the ability to inspire and motivate team members to achieve common goals
    """
    JD = JobParser(job_title,job_description,job_requirement)
    JD_skills = JD.extract_additional_skills()
    result_list = [skill.strip().lower() for skill in input.split(",")]
    data_dict_lower = [x.lower() for x in data_dict['technical_skill']]
    # Convert all strings in the list to lowercase
    jd_skills_lower = [x.lower() for x in JD_skills.jd_skills]
            
    #Define embeddings model
    embeddings_model = OpenAIEmbeddings(model='text-embedding-ada-002')

    #Embeds both list
    embedding1 = embeddings_model.embed_documents(data_dict_lower) #candidate skill groups
    embedding2 = embeddings_model.embed_documents(jd_skills_lower+result_list) #required skill groups

    #Calculate the cosine similarity score from embeddings
    similarity_test = cosine_similarity(embedding1,embedding2)

    def similarity_range_score(similarity_scores):
        categorical_scores = []

        for score in similarity_scores:
            if score >= 0.88:
                categorical_scores.append(1.0)
            elif score >= 0.85:
                categorical_scores.append(0.5)
            elif score >= 0.8:
                categorical_scores.append(0.3)
            else:
                categorical_scores.append(0.0)
        print(categorical_scores)

        return categorical_scores

        
    res = round(np.mean(similarity_range_score(similarity_test.max(axis=0)))*weightage,2)
    
    print(f"Candidate: {data_dict['name']}\t\t3. SkillGroup Score:{res}/{weightage}\tC similairty score: {res} E: {input} \t ")
        
    return res

input = "python"
data_dict['technical_skills_score'] = data_dict.apply(lambda row: evaluate_skill_groups(row, input, 20), axis=1)

printing skills from jobdescription ['Python', 'R', 'SQL', 'pandas', 'scikit-learn', 'matplotlib', 'seaborn', 'Hadoop', 'Spark', 'AWS', 'Azure', 'GCP']
[0.3, 1.0, 0.3, 0.3, 0.0, 0.0, 0.0, 0.3, 0.3, 0.3, 0.3, 0.0, 0.3]
Candidate: Ang Teik Hun		3. SkillGroup Score:5.23/20	C similairty score: 5.23 E: python 	 
printing skills from jobdescription ['Python', 'R', 'SQL', 'pandas', 'scikit-learn', 'matplotlib', 'seaborn', 'Hadoop', 'Spark', 'AWS', 'Azure', 'GCP', 'statistics', 'machine learning', 'data mining', 'communication', 'leadership']
[0.3, 1.0, 0.3, 0.3, 0.0, 0.3, 0.0, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.0, 0.0, 0.3, 0.3, 0.3]
Candidate: GOO YE JUI		3. SkillGroup Score:5.44/20	C similairty score: 5.44 E: python 	 


In [21]:
data_dict['previous_job_roles'][0]

"[{'job_title': 'Data Scientist', 'job_company': 'Petroliam Nasional Berhad', 'Industries': 'Oil and Gas', 'start_date': '2020-11', 'end_date': '2021-07', 'job_location': 'KL', 'Job Duration (Years)': 0.6}, {'job_title': 'Tutor', 'job_company': 'Australian National University', 'Industries': 'Education', 'start_date': '2022-06', 'end_date': '2022-12', 'job_location': 'Canberra', 'Job Duration (Years)': 0.5}]"

In [81]:
def evaluate_total_working_exp_years(data_dict, input_string, weightage):
    
    c_total_yr_exp, out_weighted_score = 0.0, 0.0

    def parse_date(date_str):
        # Handle other values gracefully
        if date_str.lower() in ["n/a", "none"]:
            return None
        elif date_str.lower() in [ "present", "current", "now"]: 
            return datetime.now()     
        # Expanded corrections for non-standard month abbreviations to standard ones
        corrections = {
            "Jan": "Jan", "Feb": "Feb", "Mar": "Mar", "Apr": "Apr",
            "May": "May", "Jun": "Jun", "Jul": "Jul", "Aug": "Aug",
            "Sep": "Sep", "Sept": "Sep",  # Both 'Sep' and 'Sept' to 'Sep'
            "Oct": "Oct", "Nov": "Nov", "Dec": "Dec",
            "Mac": "Mar",  # Non-standard, common in some regions
            # Add additional non-standard abbreviations as needed
        }
        # Replace non-standard abbreviations with their standard equivalents
        for incorrect, correct in corrections.items():
            if incorrect in date_str:
                date_str = date_str.replace(incorrect, correct)

        # Extensive list of date formats to try parsing the date strings
        date_formats = [
            "%Y",
            "%B %Y",  # Full month name and year
            "%m %Y",
            "%d %m %Y",
            "%d-%m-%Y",
            "%Y-%m-%d",
            "%b %Y",  # Abbreviated month name and year
            "%Y-%m",
            "%m-%Y",
            "%d %B %Y",
            "%d %b %Y",
            "%Y-%m-%d %H:%M:%S",
            "%Y-%m-%d %H:%M:%S.%f",
            "%m/%d/%Y",
            "%m/%d/%y",
            "%d/%m/%Y",
            "%d/%m/%y",
        ]

        for fmt in date_formats:
            try:
                return datetime.strptime(date_str, fmt)
            except ValueError:
                continue  # If current format fails, try the next

        # If all formats fail, print an error and return None
        print(f"Error parsing date '{date_str}': does not match expected formats.")
        return None
    
    def gpt_calc_total_exp():
        # Check if 'previous_job_roles' exists and is a list
        
        total_duration = 0
        for role in data_dict['previous_job_roles']:
            try:
                # Attempt to convert job duration to float and add to total
                duration_str = role.get("job_duration", "0")  # Default to "0" if not found
                duration = float(duration_str)
                total_duration += duration
            except ValueError:
                # Handle case where conversion to float fails
                print(f"Error converting job duration to float for role: {role.get('job_title')}. Skipping this entry.")
                continue  # Skip this entry and continue with the next
        print (f"gpt4 total yr: {total_duration}")
                
        return round(total_duration, 2)

    # Manual: Total duration
    total_experience_gpt4 = gpt_calc_total_exp()
    # Use parse_range to get the lower and upper limits and condition
    in_threshold_lower_limit, in_threshold_upper_limit, condition = parse_range(input_string)
    try:
        c_total_yr_exp = float(total_experience_gpt4)
        if c_total_yr_exp < in_threshold_lower_limit:
            out_weighted_score = 0  # does not meet requirement
        elif in_threshold_lower_limit <= c_total_yr_exp <= in_threshold_upper_limit:
            out_weighted_score = 1.0 * weightage  # within range ir equal 
        elif c_total_yr_exp > in_threshold_upper_limit:
            out_weighted_score = 0.5 * weightage  # overqualified
        else:
            out_weighted_score = 0
        print(f"Candidate: {data_dict['name']}\t\t4.Total years of experience Score:{out_weighted_score}/ {weightage}\t C:{c_total_yr_exp}, Required years: {input_string}\n ")
    except ValueError:
        # Handle the case where conversion to float fails
        out_weighted_score = 0  
    
    return total_experience_gpt4,out_weighted_score

In [86]:
import json 
input = "python"

data_dict['previous_job_roles'][0] = data_dict['previous_job_roles'][0].replace("'", '"')
data_dict['previous_job_roles'][0] = json.loads(data_dict['previous_job_roles'][0])
data_dict[['work_experience_duration', 'work_experience_duration_score']] = data_dict.apply(lambda row: pd.Series(evaluate_total_working_exp_years(row, input, 20)), axis=1)

gpt4 total yr: 17.0
Candidate: GOO YE JUI		4.Total years of experience Score:20.0/ 20	 C:17.0, Required years: python
 


In [98]:
from openai import OpenAI
import math


def evaluate_year_exp_role(data_dict, input, weightage):

    def extract_yoer_similar(data_dict):
        max_retries = 5
        retry_count = 0 
        try:
            yoer_prompt_system = f"""[Instruction] 
            You will be provided with details such as the candidate's previous job roles. Please act as a hiring manager with 20 years experience to evaluate the candidate's previous job roles.
            1. Identify job roles that are similar to {job_title}. You should also consider roles that are related to {job_title}.
            2. Return all of the duration of the related job roles into a python list.
            3. The output format should strictly follow the format in the example provided.
            Example of the output: Total duration: [[2,3,4]]

            [Question]
            What are the job durations for the job roles that are related to {job_title} in the candidate's previous job experience?
            """

            yoer_prompt_user = f"""
            Candidate's Previous Job Roles: {data_dict["previous_job_roles"]}
            """
            client = OpenAI()
            response = client.chat.completions.create(
                model="gpt-3.5-turbo-0125", # 3.5 turbo
                messages=[
                    {"role": "system", "content": yoer_prompt_system},
                    {"role": "user", "content": yoer_prompt_user}
                ],
                temperature=0.3,
            )
            print("Response from yoer",response)
            return response.choices[0].message.content
        except openai.RateLimitError as e:
            print(f"OpenAI rate limit exceeded. Pausing for one minute before resuming... (From RateLimitError)")
            print(e)
            time.sleep(30)
            retry_count += 1

            if retry_count >= max_retries:
                print("Exceeded maximum retries for parsing PDF.... (From RateLimitError)")
                return response


    def extract_duration(string):
        matches = re.findall(r'\[\[([0-9., ]+)\]\]', string)
        if matches:
            # Split by comma and directly convert each element to float
            list_of_floats = [float(x.strip()) for x in matches[0].split(",")]
            return list_of_floats
        else:
            print("No matches found for the pattern.")
            return []  # Fix to return a list directly

    def sum_floats_in_list(lst):
        if lst != 0:
            return math.fsum(lst)
        else:
            return 0

    def calculate_yoer(yoer_total, input_string, weightage):

        c_total_yr_exp = float(yoer_total)
        out_weighted_score = 0
        
        # Use parse_range to get the lower and upper limits and condition
        in_threshold_lower_limit, in_threshold_upper_limit, condition = parse_range(input_string)

        # Calculate the candidate's score based on their experience
        if c_total_yr_exp < in_threshold_lower_limit:
            out_weighted_score = 0  # does not meet requirement
        elif in_threshold_lower_limit <= c_total_yr_exp <= in_threshold_upper_limit:
            out_weighted_score = 1.0 * weightage  # within range ir equal 
        elif c_total_yr_exp > in_threshold_upper_limit:
            out_weighted_score = 0.5 * weightage  # overqualified
        else:
            out_weighted_score = 0


        return out_weighted_score


    response_yoer = extract_yoer_similar(data_dict)
    yoer_list = extract_duration(response_yoer)
    yoer_total = sum_floats_in_list(yoer_list)
    res = calculate_yoer(yoer_total, input, weightage)
    print(f"Candidate: {data_dict['name']}\t\t8. Yr of Exp in Role Score:{res}/{weightage}\t C: {yoer_total} E: {input}")
    
    return yoer_total,res
        

In [99]:
input = ">3"
data_dict[['related_work_experience_duration', 'related_work_experience_duration_score']] = data_dict.apply(lambda row: pd.Series(evaluate_year_exp_role(row, input, 20)), axis=1)

Response from yoer ChatCompletion(id='chatcmpl-9NNtS2EpRkEq1I7tG45VP7hYMrVfz', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Total duration: [[2, 2, 2]]', role='assistant', function_call=None, tool_calls=None))], created=1715360418, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=12, prompt_tokens=487, total_tokens=499))
Candidate: GOO YE JUI		8. Yr of Exp in Role Score:20.0/20	 C: 6.0 E: >3


In [100]:
data_dict

Unnamed: 0,name,phone_number,email,local,expected_salary,current_location,education_background,professional_certificate,skill_group,technology_programs_tool,language,previous_job_roles,work_experience_duration,work_experience_duration_score,related_work_experience_duration,related_work_experience_duration_score
0,GOO YE JUI,60184040438,yjyejui626@gmail.com,,,"['Country: Malaysia', 'State: Penang', 'City: ...",[{'field_of_study': 'Bachelor Of Computer Scie...,"['Microsoft Certified: Azure AI Fundamentals',...","['Full-stack web development', 'Natural Langua...","['HTML 5', 'CSS', 'JavaScript', 'PHP', 'SQL', ...","['English', 'Mandarin', 'Malay', 'French']","[{'job_title': 'Data Science Intern', 'job_com...",17.0,20.0,6.0,20.0


In [130]:
import urllib3

def evaluate_current_location(data_dict, input, weightage):

    dataset_path = 'daerah-working-set.csv'
    city_data = pd.read_csv(dataset_path)

    def get_coordinates(city_name, country):
        # Try to get the coordinates from the dataset
        print("city name and country",city_name,country)
        try:
            city_info = city_data[city_data['Negeri'] == city_name]
            if country.lower() == "malaysia":
                if city_info.empty==True:
                    city_info = city_data[city_data['Bandar'] == city_name]
                latitude, longitude = city_info['Lat'].values[0], city_info['Lon'].values[0]
                print("method1")
                return latitude, longitude
        except IndexError:
            try:
                http = urllib3.PoolManager(1, headers={'user-agent': 'cv_parser_geocoder'})
                url = f'https://nominatim.openstreetmap.org/search?q={city_name}%2C+Malaysia&format=jsonv2&limit=1'
                resp = http.request('GET', url)
                loc = json.loads(resp.data.decode())
                return loc[0]['lat'],loc[0]['lon']
            except:
                return None,None
            

    def get_city_coast(latitude, longitude):
        east_coast_range =  (2.618, 6.2733, 101.3765, 103.6015)
        north_coast_range = (3.6857, 6.6999, 99.7166, 101.5265)
        middle_coast_range = (2.6884, 3.7801, 100.9878, 101.8911)
        south_coast_range =  (1.4645, 2.9702, 101.7863, 103.9107)
        east_malaysia_range = (1.0104, 6.9244, 109.7889, 119.0566)

        try:
            # Check which coast the city falls into
            if is_in_region(latitude, longitude, east_malaysia_range):
                return "East Malaysia"
            elif is_in_region(latitude, longitude, middle_coast_range):
                return "Middle Coast"
            elif is_in_region(latitude, longitude, east_coast_range):
                return "East Coast"
            elif is_in_region(latitude, longitude, north_coast_range):
                return "North Coast"
            elif is_in_region(latitude, longitude, south_coast_range):
                return "South Coast"
            else:
                return "Out of Malaysia"
        except TypeError:
            return "Location Not Detected"

    def is_in_region(latitude, longitude, region_range):
        min_lat, max_lat, min_lon, max_lon = region_range
        return min_lat <= latitude <= max_lat and min_lon <= longitude <= max_lon
    
    state_mapping = {'wilayah persekutuan': 'WP', 'selangor': 'Selangor', 'johor': 'Johor', 'penang': 'Penang', 'pulau pinang': 'Penang', 'sabah': 'Sabah', 'sarawak': 'Sarawak', 'perak': 'Perak', 'kedah': 'Kedah', 'pahang': 'Pahang', 'terengganu': 'Terengganu', 'kelantan': 'Kelantan', 'negeri sembilan': 'N.Sembilan', 'melaka': 'Melaka','melacca': 'Melaka','perlis': 'Perlis'}
    
    def clean_state(data_dict):
        try:
            for key, value in state_mapping.items():
                if key.lower() in data_dict['current_location'][0]['State'].lower():
                    data_dict['current_location'][0]['State'] = value
                    break
            return data_dict
        except:
            return data_dict

    def clean_location_string(location_str):
        try:
            # Split the string into city and country
            location_parts = list(map(str.strip, location_str.split(',')))

            # Handle the case when location_str only has city and country
            if len(location_parts) == 2:
                state, country = location_parts

                for key, value in state_mapping.items():
                    if key.lower() in state.lower():
                        state = value
                        break

                city = 'N/A'
            elif len(location_parts) == 3:
                city, state, country = location_parts

                for key, value in state_mapping.items():
                    if key.lower() in state.lower():
                        state = value
                        break
            else:
                country = location_parts[0]
                state = 'N/A'
                city = 'N/A'

            # Create the result dictionary
            result = {'Country': country, 'State': state, 'City': city}

            return result
        except ValueError:
            return location_str
    
    def evaluate_coordinate(cleaned_location,data_dict):
        #Get coordinates for required location and candidate location
        latitude1, longitude1 = get_coordinates(cleaned_location['State'],cleaned_location['Country'])
        print(latitude1, longitude1)
        latitude2, longitude2 = get_coordinates(data_dict['current_location'][0]['State'], data_dict['current_location'][0]['Country'])
        print(latitude2, longitude2)
        #Define the coast of required location and candidate location
        coast1 = get_city_coast(latitude1, longitude1)
        coast2 = get_city_coast(latitude2, longitude2)
        #Located at the same region(coast)
        if coast1 == coast2:
            return weightage*0.5
        #Located at different region
        else:
            return 0


    def evaluate_location(cleaned_location,data_dict,weightage):
        # try:
        print(cleaned_location)
        print(data_dict['current_location'])
        # If candidate is in Malaysia
        if cleaned_location['Country'].lower() == "malaysia" and data_dict['current_location'][0]['Country'].lower() == "malaysia":
            # If Option 1 in excel
            if cleaned_location['State'].lower() == 'n/a' and cleaned_location['City'].lower() == 'n/a':
                return weightage
            
            # If same state
            elif (data_dict['current_location'][0]['State'].lower() == cleaned_location['State'].lower()):
                # State = N/A
                if cleaned_location['State'].lower() == 'n/a':
                    if cleaned_location['City'].lower() == 'n/a':
                        return 0
                    else:
                        print("weightage here")
                        return weightage
                # State != N/A
                else:
                    return weightage
                
            # if not same state
            elif (data_dict['current_location'][0]['State'].lower() != cleaned_location['State'].lower()):
                # same city
                if (data_dict['current_location'][0]['City'].lower() == cleaned_location['City'].lower() == "N/A"):
                    return 0
                else:
                    return evaluate_coordinate(cleaned_location,data_dict)
                
            # if same city
            elif (data_dict['current_location'][0]['City'].lower() == cleaned_location['City'].lower()):
                # City = N/A
                if cleaned_location['City'].lower() == 'n/a':
                    return 0
                else:
                    print("weightage here")
                    return weightage
            else:
                return 0
                
        # If candidate is overseas
        else:
            if data_dict['current_location'][0]['Country'] == cleaned_location['Country']:
                print(cleaned_location['Country'],data_dict['current_location'][0]['Country'])
                return weightage
            else:
                return 0
        # except TypeError as e:
        #     print("Different Country detected")
        #     print(e)
        #     return 0

    # Example usage:
    cleaned_location = clean_location_string(input)
    cleaned_dict = clean_state(data_dict)
    out_location_score =  evaluate_location(cleaned_location,cleaned_dict,weightage)
    print (f"Candidate: {data_dict['name']}\t\t 11. Location Score: {out_location_score}/{weightage}\t  E:{cleaned_location} C: {data_dict['current_location']}\n")
    return out_location_score

In [140]:
input = "Malaysia"

data_dict['current_location'][0] = data_dict['current_location'][0].replace("'", '"')
data_dict['current_location'][0] = json.loads(data_dict['current_location'][0])
data_dict['current_location_score'] = data_dict.apply(lambda row: evaluate_current_location(row, input,10), axis=1)

{'Country': 'Malaysia', 'State': 'N/A', 'City': 'N/A'}
[{'Country': 'Malaysia', 'State': 'Penang', 'City': 'Bukit Mertajam'}]
Candidate: GOO YE JUI		 11. Location Score: 10/10	  E:{'Country': 'Malaysia', 'State': 'N/A', 'City': 'N/A'} C: [{'Country': 'Malaysia', 'State': 'Penang', 'City': 'Bukit Mertajam'}]



In [161]:
def evaluate_targetted_employer (data_dict, in_target_employer, in_weightage_employer): 
    out_targetted_employer_score =  0 
    targEmp_industries_included = []

    # parse into include and excluded target comapanies 
    included_input = []
    excluded_input = []
    exclusion_match = ""

    def validate_input_format(input_string): 
        """
        Check if CVMATCHING template format correct for 
        Example: 
            True - "include(Shell, BP) ,  exclude( KLCC, Novella Clinical, Fidelity Investments)    
            True - "include(), exclude()"   
            True - "include(Shell, BP) , exclude()"    
            True - "include() , exclude(Shell, BP)" 
            False -  "include() , exclude(Shell, " 

        """
        # Regular expression pattern to match the valid format
        # pattern = r'^(include\([\w\s,]*\)\s*,\s*exclude\([\w\s,]*\)\s*)+$'
        pattern = r'^(include\((.*?)\)\s*,\s*exclude\((.*?)\)\s*)+$' # include special characters in company names eg &.
        
        # Check if the input string matches the pattern
        if re.match(pattern, input_string):
            return True
        else:
            return False
        
    def parse_targemp_input (correct_format_inputstring):    
        '''
            include_input updates to space-removed values 
            excluded_input updates to space-removed values 

            Example: 
            included_input: ['PetronasDigital']
            excluded_input: ['KLCC', 'NovellaClinical', 'FidelityInvestments']

            Reasoning: 
            More robust matching when spaces are removed. ExxonMobil matches Exxon Mobil inputted by User 
        ''' 
        # Regular expression pattern to match the include and exclude sections
        pattern = r'(include|exclude)\((.*?)\)'
        matches = re.findall(pattern, correct_format_inputstring)

        for match in matches:
            action, values = match
            values_list = [value.strip().replace(" ", "") for value in values.split(',')]
            
            if action == 'include':
                included_input.extend(values_list)
            elif action == 'exclude':
                excluded_input.extend(values_list)
        return True 

    # Preprocessing input & resume employers 
    def clean_employer_lst(input_str):
        """
        removes common words for better string matching 
        """
        # List of common words to remove
        common_words_to_remove = ["sdn", "bhd", "berhad", "ptd", "ltd", "inc", "co", "llc", "or", "and", "&"]
        pattern = r'\b(?:' + '|'.join(re.escape(word) for word in common_words_to_remove) + r')\b|-|\s+'
        cleaned_str = re.sub(pattern, '', input_str, flags=re.IGNORECASE)
        cleaned_list = [word.strip() for word in cleaned_str.split(',')]
        return cleaned_list
    
    def extract_indsutries (gpt_response): 
        """
        Extract industries from customised gpt output response. Example: 
            gpt_response = "[[Marketing, Food & Beverage, Shipping, Fashion, Cosmetics]]"
            output = ["Marketing", "Food & Beverage", "Shipping", "Fashion", "Cosmetics"]
        """
        # Ensure input is a string and follows the expected format
        if not isinstance(gpt_response, str) or not gpt_response.startswith("[[") or not gpt_response.endswith("]]"):
            return ["Unknown"]

        # Extract the content inside the outer brackets and split by comma
        # The slice [2:-2] removes the outermost brackets "[[" and "]]"
        industries = [industry.strip() for industry in gpt_response[2:-2].split(',')]

        return industries
    def get_employer_industries_gpt4 (company_name, company_location = ""): 
        max_retries = 5
        retry_count = 0
        # Classify employer industry by gpt
        system_p = f"""You are a helpful assistant. Given a company name and details, your task is to classify the given company's industry it is involve in as per The International Labour Organization.
        1. Classify the industries the company falls into according to The International Labour Organization, based on the company. 
        2. Output only all of industries in python list.
        3. The output format should strictly follow the format in the example provided below - enclosed with double brackets, comma-seperated
        4. A company can be classified in more than 1 industries. 
        Example of the output:
            example 1:  [[Marketing, Food & Beverage, Shipping, Fashion, Cosmetics]]
            example 2: [[Finance]]
            example 3: [[Unknown]] if the company is unfamiliar or you are unsure, output this. 

        """
        in_target_employer_petronas_description = "Petronas is a Malaysian oil and gas company that is involved in upstream and downstream activities. It is the largest oil and gas company in Malaysia, with operations in more than 30 countries around the world. Petronas is involved in exploration, production, refining, marketing, trading, and distribution of petroleum products. It also has interests in petrochemicals, shipping, engineering services, power generation, and other related businesses."
        p_example = f'[The Start of Company description] {in_target_employer_petronas_description}[The End of Company description] '
        p_example_response_format = "[[Oil and Gas, Petrochemicals, Refining, Retail, Shipping, Exploration and Production, Engineering and Construction]]"
        

        p = f'Classify the industries according to The International Labour Organization of the given company. Return results in the aforementioned output format. Given Company: {company_name}, located at {company_location}'
        try:
            client = OpenAI()
            response = client.chat.completions.create(
                model="gpt-3.5-turbo-0125", 
                messages=[
                    {"role": "system", "content": system_p},
                    {"role": "user", "content": p_example},
                    {"role": "assistant", "content": p_example_response_format},
                    {"role": "user", "content": p}
                ]
            )
            try:
                result = response.choices[0].message.content
                industries_lst = extract_indsutries (result) if result else None 
                print (f'GPT response on industry: {result}\tEXTRACTED INDUSTRIES: {industries_lst}')
                return industries_lst
            except KeyError:
                return "undectected"
        except openai.RateLimitError as e:
            print(f"OpenAI rate limit exceeded. Pausing for one minute before resuming... (From RateLimitError)")
            print(e)
            time.sleep(30)
            retry_count += 1

            if retry_count >= max_retries:
                print("Exceeded maximum retries for parsing PDF.... (From RateLimitError)")
                return response
        except Exception as ire:
            print("InvalidReqError",ire)
            return "undetected"

    
    def check_if_matching_employer_industry():
        '''
            Input: 
                user_input_bool: True if input is a list (ie from CVMatching xlsx since can be >1 company)
                in_target_employer: Company Name
            Used when candidate has not work in target employer specified, check for matching industries: 
                1. Ask GPT to classify the industries based on this description 
                2. Check against candidate's previous job company industries
                3. if candidate worked in similar industries: 50%, else 0%
        '''
        # variables
        out_targetted_employer_score =  0


        if (targEmp_industries_included == []): 
            init_input_employer_industry()

        candidate_industries = data_dict["Industries"]
        
        # find matches between overall industries and included()
        list1 = [x.lower().replace(" ", "") for x in candidate_industries if x.lower().replace(" ", "") != "unknown"]   
        list2 = [x.lower().replace(" ", "") for x in targEmp_industries_included if x.lower().replace(" ", "") != "unknown"]
        matches = [x for x in list1 if x in list2]

        print(f"GPT-ed Classified Industries.\t Included:{included_input} \tExcluded {excluded_input}. Included Industries{targEmp_industries_included}\t Candidate's data_dict['Industries']: {candidate_industries}\t Matched industries are: {matches}")
        if matches:
            print (f"Candidate: {data_dict['name']}\t\t 12. Targeted Employer Score: {out_targetted_employer_score}/{in_weightage_employer}\t Result: Case 2: Matching Industries are {matches}\n")
            res_employer = f"Matching industries detected: {matches}"
            res_employer_score = 0.5*float(in_weightage_employer)
            return res_employer,res_employer_score
        else:
            print (f"Candidate: {data_dict['name']}\t\t 12. Targeted Employer Score: {out_targetted_employer_score}/{in_weightage_employer}\t Result: Case 3: NO MATCHING INDUSTRY \n ")
            res_employer = f"No exact match and no matching industry from past employers detected"
            res_employer_score = 0
            return res_employer,res_employer_score

    def worked_in_excluded(candidate_employers, excluded): 
        excluded_matches = []
        for x in candidate_employers:
                if x in excluded:
                    excluded_matches = f"Exclusion detected[{x}]"
                    return excluded_matches, True 
        return excluded_matches, False      
    
    def init_input_employer_industry ():
        """
        Initialises list for related-industries in criteria file by user
        
        """ 
        target_employer_industries = set()
        for employer in included_input:
            print(f"xlsx included employer {employer}")
            if employer: 
                a = get_employer_industries_gpt4(employer)
                target_employer_industries.update(a)
        # Assuming 'target_employer_industries_lst' is a list of lists (each inner list contains industries for an employer)
        targEmp_industries_included = list (target_employer_industries)
        print (f"RESUME PARSER CLASS INTIALISED: XLSX Target Employer related googlesearch industries.\tIncluded {included_input}\t self.targEmp_industries: {targEmp_industries_included}\n")
        return True 
                
    # User/Employer Template input validation
    try:
        # Assuming validate_input_format raises an exception if validation fails
        if not validate_input_format(in_target_employer):
            raise ValueError("CVMatching Template.xlsx input string is invalid at 12.Target Employer")

        # If validation passes, proceed with parsing
        parse_targemp_input(in_target_employer)  # included, excluded is updated
        print(f"included: {included_input}, \t excluded: {excluded_input}")
    except ValueError as e:
        # Handle the validation error
        error_message = f"Warning, 12. Target Employer in file CVMatching Template.xlsx {e}"
        print(error_message)
        # self.targEmp_exclusion_matched = "CVMatching Template.xlsx input string is invalid at 12.Target Employer"
        return -1
        
    # Preprocessing inputs 
    req_employers = clean_employer_lst("".join(included_input))  # clean input from excel from common words 
    candidate_employers = clean_employer_lst(",".join([role["job_company"] for role in data_dict["previous_job_roles"] if isinstance(role, dict)]))
    print(f"12. Evaluating Target Employer\tIncluded: {included_input} \t excluded: {excluded_input}\tCandidate's previous employers: {candidate_employers}")

    # Preprocessing Data_dict of candidate: Reassign GPT classified industries for candidate's each previous employer 
    overall_industries = set()
    for x in data_dict["previous_job_roles"]:
        if isinstance(x, dict):
            q = x['job_company'] 
            l = x["job_location"] 
            industries_list = get_employer_industries_gpt4(q, l)  # This now returns a list directly
            
            # Directly assign the list without splitting
            x["Industries"] = industries_list
            
            # Update overall industries without needing to split; handle single-value lists correctly
            overall_industries.update([j.strip().strip('.') for j in industries_list])
            
            # Adjust the print statement to directly use industries_list
            print(f"{q} located at {l} is gpt-classified as a company in industries: {industries_list}")
            
    data_dict["Industries"] = list(overall_industries)
    # Scoring Method
    # 1. Check if candidate work in excluded companies 
    exclusion_match, excluded_flag = worked_in_excluded(candidate_employers, excluded_input)
    if excluded_flag:
        return exclusion_match,0
    else:
        # 2. Check for exact match with cleaned lists (employer and user)
        matched_employer = []
        for candidate in candidate_employers:
            # Skip if candidate is empty or whitespace
            if not candidate.strip():
                continue

            for required in req_employers:
                if re.search(fr'{re.escape(candidate)}', required, re.IGNORECASE):
                    matched_employer.append(candidate)
                    break # breaks right after matching 1 employer
                
        if not matched_employer:# 3: Check for related industry in candidate's past employers 
            print ('\t...12. Target Employer: Checking for any past employers matching to industry of target employer')
            return check_if_matching_employer_industry()
            
        else: # exact match employer 
            print (f"Candidate: {data_dict['name']}\t\t 12. Targeted Employer Score: {out_targetted_employer_score}/{in_weightage_employer}\t  Result: Case 1: MATCHING EMPLOYER \t Matches = {matched_employer}\n)")
            res_employer =  f"Inclusion detected: {matched_employer}"
            res_employer_score = float(in_weightage_employer)
    return res_employer,res_employer_score

In [162]:
input = "include(Shell, BP) , exclude()"
# data_dict['previous_job_roles'][0] = data_dict['previous_job_roles'][0].replace("'", '"')
# data_dict['previous_job_roles'][0] = json.loads(data_dict['previous_job_roles'][0])
data_dict[['targeted_employer', 'targeted_employer_score']] = data_dict.apply(lambda row: pd.Series(evaluate_targetted_employer(row, input, 20)), axis=1)

included: ['Shell', 'BP'], 	 excluded: ['']
12. Evaluating Target Employer	Included: ['Shell', 'BP'] 	 excluded: ['']	Candidate's previous employers: ['PetronasDigital']
GPT response on industry: [[Unknown]]	EXTRACTED INDUSTRIES: ['Unknown']
Petronas Digital Sdn Bhd located at N/A is gpt-classified as a company in industries: ['Unknown']
	...12. Target Employer: Checking for any past employers matching to industry of target employer
xlsx included employer Shell
GPT response on industry: [[Oil and Gas, Petrochemicals, Refining, Retail, Shipping, Exploration and Production, Engineering and Construction, Energy]]	EXTRACTED INDUSTRIES: ['Oil and Gas', 'Petrochemicals', 'Refining', 'Retail', 'Shipping', 'Exploration and Production', 'Engineering and Construction', 'Energy']
xlsx included employer BP
GPT response on industry: [[Oil and Gas, Petrochemicals, Refining, Retail, Shipping, Exploration and Production, Engineering and Construction]]	EXTRACTED INDUSTRIES: ['Oil and Gas', 'Petrochemic

In [181]:
from spacy.language import Language
from spacy_langdetect import LanguageDetector
import traceback

def evaluate_language_score(data_dict, input, weightage):
    match_percentage = 0
    try:
        custom_languages = ["Bahasa Melayu", "Bahasa Malaysia", "Malay", "Melayu", "Bahasa"]
        def check_custom_languages(input_list):
            return set(lang.lower()for lang in custom_languages if lang.lower() in input_list)
        
        languages_str = ', '.join(data_dict['language'])

        nlp = spacy.load('en_core_web_md')
        
        def get_lang_detector(nlp, name):
            return LanguageDetector()

        if languages_str == "N, /, A" or languages_str == "N/A":
            Language.factory("language_detector", func=get_lang_detector)
            nlp.add_pipe('language_detector', last=True)
            doc1 = nlp(str(data_dict))
            if doc1._.language['language']=='en':
                languages_str='English'
                data_dict['language'] = ['English']
        doc1 = nlp(languages_str)
        doc2 = nlp(input)
        
        languages1 = set(ent.text.strip() for ent in doc1.ents if ent.label_ == "LANGUAGE")
        languages2 = set(ent.text.strip() for ent in doc2.ents if ent.label_ == "LANGUAGE")

        languages1.update(check_custom_languages(languages_str))
        languages2.update(check_custom_languages(input))

        matched_languages = set(l.lower() for l in languages1).intersection(set(l.lower() for l in languages2))

        # Calculate the percentage of matches
        if languages1:
            match_percentage = len(matched_languages) / len(languages2) * 100
        else:
            match_percentage = 0
        language_score = round(match_percentage/100*weightage)
        print("Matched Languages: ",matched_languages)
        print (f"Candidate: {data_dict['name']}\t\t 14. Language Score: {language_score}/{weightage}\t C:{languages1} {languages_str}, E: {input} \n")
        
        return language_score
        
    except Exception as e:
        print("Error on language",e)
        traceback.print_exc()  # This will print the traceback information
        return data_dict

In [182]:
input = "English, Malay, Mandarin"

# data_dict['language'][0] = data_dict['language'][0].replace("'", '"')
# data_dict['language'][0] = json.loads(data_dict['language'][0])
data_dict['language_score'] = data_dict.apply(lambda row: evaluate_language_score(row, input,10), axis=1)

Matched Languages:  {'mandarin', 'english'}
Candidate: GOO YE JUI		 14. Language Score: 7/10	 C:{'Mandarin', 'English'} English, Mandarin, Malay, French, E: English, Malay, Mandarin 



In [222]:
# 15 
def evaluate_salary_score(data_dict, input, weightage):
    """
    Checks if the candidate's expected salary matches the employer's range.

    Args:
    in_salary (str): Employer's expected salary range.
    c_exp_salary (str): Candidate's expected salary.

    Returns:
    int: Score indicating the match percentage.
    """
    # Assign 0 score for N/A or empty values
    if np.isnan(data_dict['expected_salary']):
        out_salary_score = 0
    else: 
        # Parse employer's expected salary range
        in_exp_sal_llimit, in_exp_sal_ulimit, in_exp_sal_condition = parse_range(input)

        # Parse candidate's expected salary, calculate average if it's a range
        c_exp_sal = 0 # default is 0 
        c_exp_sal_llimit, c_exp_sal_ulimit, c_exp_sal_condition = parse_range(data_dict['expected_salary'])
        if c_exp_sal_llimit != c_exp_sal_ulimit:
            # Alternative: Calculate average for a range
                # c_exp_sal = (c_exp_sal_llimit + c_exp_sal_ulimit) / 2  
            c_exp_sal = c_exp_sal_llimit # assume lower limit when cv states sal range for now 
        else:
            c_exp_sal = c_exp_sal_llimit  # Use lower limit as single value if cv input not a range

        # Check if the candidate's expected salary falls within the employer's range
        if in_exp_sal_llimit <= c_exp_sal <= in_exp_sal_ulimit:
            res = 1  # 100% 
        else:
            res = 0
        
        out_salary_score = res * weightage

    print (f"Candidate: {data_dict['name']}\t\t 15. Exp Salary in RM Score: {out_salary_score}\t Employer: {input}, Candidate: {data_dict['expected_salary']}\n ")

    return out_salary_score

In [223]:
input = "3000"
data_dict['expected_salary_score'] = data_dict.apply(lambda row: evaluate_salary_score(row, input,10), axis=1)

Candidate: GOO YE JUI		 15. Exp Salary in RM Score: 0	 Employer: 3000, Candidate: nan
 


In [225]:
data_dict

Unnamed: 0,name,phone_number,email,local,expected_salary,current_location,education_background,professional_certificate,skill_group,technology_programs_tool,language,previous_job_roles,current_location_score,targeted_employer,targeted_employer_score,language_score,expected_salary_score
0,GOO YE JUI,60184040438,yjyejui626@gmail.com,,,"[{'Country': 'Malaysia', 'State': 'Penang', 'C...",[{'field_of_study': 'Bachelor Of Computer Scie...,"['Microsoft Certified: Azure AI Fundamentals',...","['Full-stack web development', 'Natural Langua...","['HTML 5', 'CSS', 'JavaScript', 'PHP', 'SQL', ...","[English, Mandarin, Malay, French]","[{'job_title': 'Data Science Intern', 'job_com...",10,No exact match and no matching industry from p...,0,7,0


In [250]:
def evaluate_prof_cert_phrase(data_dict, input, weightage):

    def detect_match_phrases(resume, match_phrases):
        matches = []
        for phrase in match_phrases:
            print("phrase",phrase)
            if type(phrase) == list:
                for x in phrase:
                    print("printing x",x)
                    pattern = re.compile(fr'\b{re.escape(x)}\b', re.IGNORECASE)
                    matches.extend(pattern.findall(resume.lower()))
                    print("printing matches",matches)
            else:
                # Use case-insensitive matching and convert to lowercase
                print(phrase)
                pattern = re.compile(fr'{phrase}', re.IGNORECASE)
                print(pattern)
                print("resume",resume)
                matches.extend(pattern.findall(resume.lower()))

        # Remove duplicates by converting the list to a set and back to a list
        unique_matches = list(set(matches))

        return unique_matches
    
    def evaluate_candidate_score(matched_phrases, match_phrase_input, weightage):
        # Calculate the score based on the weightage
        score = round(len(matched_phrases) / len(match_phrase_input) * weightage,2)

        return score
    

    # Read the abbreviation CSV file
    # file_path = os.path.join(os.path.dirname(__file__), 'CVMatching_Prof_Cert_Wikipedia.xlsx')
    file_path = 'CVMatching_Prof_Cert_Wikipedia.xlsx'
    abb_csv = pd.read_excel(file_path)
    abb_csv = abb_csv[['Name', 'Abbreviation']]
    abb_csv = abb_csv.dropna(subset=['Abbreviation']).reset_index(drop=True)

    abb_csv['Name_lower'] = abb_csv['Name'].str.lower()
    unique_elements = [ue.strip() for ue in input.split(",")]

    # Retrieve 'Professional Certificate' field from data_dict
    professional_certificates = data_dict['professional_certificate']
    
    for phrase in unique_elements.copy():
        # Convert the current phrase to lowercase for case-insensitive comparison
        phrase_lower = phrase.lower()
        
        # Check if the lowercase phrase is an exact match in any lowercase entry in the 'Name' or 'Abbreviation' columns
        match = abb_csv[(abb_csv['Name_lower'] == phrase_lower) | (abb_csv['Abbreviation'].str.lower() == phrase_lower)]
        
        # If there is a match, remove both the abbreviation and the full name from the unique elements
        if not match.empty:
            # Update with matched abbreviations and names
            unique_elements.append([match['Name'].values[0],match['Abbreviation'].values[0]])
            unique_elements.remove(phrase)

    # Convert data_dict to a string
    data_dict_str = ''.join(professional_certificates)

    # Detect matched phrases
    matched_phrases = detect_match_phrases(data_dict_str, unique_elements)
    print('matched_phrases', matched_phrases,len(matched_phrases))
    print('unique_elements', unique_elements,len(unique_elements))
    # Evaluate candidate score
    score = evaluate_candidate_score(matched_phrases, unique_elements, weightage)
    
    print (f"Candidate: {data_dict['name']}\t\t 10. Prof Cert Score: {score}/{weightage}\t Employer's Certs: {input},  Candidate's Certs: {professional_certificates}\n ")
    
    return score

In [251]:
input = "CFA, Microsoft Certified: Azure AI Fundamentals"
data_dict['expected_salary_score'] = data_dict.apply(lambda row: evaluate_prof_cert_phrase(row, input,10), axis=1)

phrase Microsoft Certified: Azure AI Fundamentals
Microsoft Certified: Azure AI Fundamentals
re.compile('Microsoft Certified: Azure AI Fundamentals', re.IGNORECASE)
resume ['Microsoft Certified: Azure AI Fundamentals', 'Google Data Analytics Certificate by Coursera', 'Alteryx Foundational Micro-Credential', 'Alteryx Designer Core Certification', 'AWS Academy Graduate - AWS Academy Cloud Foundations', 'AWS Academy Graduate - AWS Academy Machine Learning Foundations', 'AWS Academy Graduate - AWS Academy Data Analytics', 'AWS Academy Graduate - AWS Academy Machine Learning for Natural Language Processing', 'AWS Academy Graduate - AWS Academy Data Engineering', 'AWS Academy Graduate - AWS Academy Cloud Web Application Builder', 'AWS Academy Graduate - AWS Academy Cloud Data Pipeline Builder']
phrase ['Chartered Financial Analyst', 'CFA']
printing x Chartered Financial Analyst
printing matches ['microsoft certified: azure ai fundamentals']
printing x CFA
printing matches ['microsoft certifi

In [276]:
def evaluate_year_grad_score(data_dict, input_year, weightage): 
    out_yr_grad  =  0 

    if 'education_background' not in data_dict:
        print("No educational background provided.")
        return "No educational background provided.",0
    else: 
        # Sort education background by year of graduation once, after preprocessing
        data_list = ast.literal_eval(data_dict.education_background)
        print(data_list)
        data_list.sort(key=lambda x: int(x['year_of_graduation']), reverse=True)
        # Preprocess and validate year of graduation entries
        res = ""
        for edu in data_list:
            year_of_graduation = str(edu['year_of_graduation'])  # Ensure it's a string for comparison
            print(year_of_graduation)
            if not year_of_graduation.isdigit() and year_of_graduation.lower() not in ['present', 'current']:
                edu['year_of_graduation'] = 'N/A'
            elif year_of_graduation.lower() in ['present', 'current']:
                edu['year_of_graduation'] = 'Still Studying'

            year_of_graduation = str(edu['year_of_graduation']) 
            res += year_of_graduation + ", "
            if year_of_graduation == input_year:
                out_yr_grad = weightage

        # Print the result
        res = res if res else "Not Specified" 
        print(f"16. Year of Grad: {out_yr_grad}\t Employer: {input_year},  Candidate: {res}")

    return res,out_yr_grad


In [277]:
input = "2024"
data_dict[['year_of_graduation', 'year_of_graduation_score']] = data_dict.apply(lambda row: pd.Series(evaluate_year_grad_score(row, input, 20)), axis=1)

[{'field_of_study': 'Bachelor Of Computer Science (Data Engineering)', 'level': "Bachelor's Degree", 'cgpa': '3.97', 'university': 'Universiti Teknologi Malaysia', 'start_date': '2020', 'year_of_graduation': '2024'}, {'field_of_study': 'Foundation in Science', 'level': 'Foundation', 'cgpa': '3.78', 'university': 'Universiti Teknologi Malaysia', 'start_date': '2019', 'year_of_graduation': '2020'}]
2024
2020
16. Year of Grad: 20	 Employer: 2024,  Candidate: 2024, 2020, 


In [278]:
data_dict

Unnamed: 0,name,phone_number,email,local,expected_salary,current_location,education_background,professional_certificate,skill_group,technology_programs_tool,language,previous_job_roles,current_location_score,targeted_employer,targeted_employer_score,language_score,expected_salary_score,year_of_graduation_score,year_of_graduation
0,GOO YE JUI,60184040438,yjyejui626@gmail.com,,,"[{'Country': 'Malaysia', 'State': 'Penang', 'C...",[{'field_of_study': 'Bachelor Of Computer Scie...,"['Microsoft Certified: Azure AI Fundamentals',...","['Full-stack web development', 'Natural Langua...","['HTML 5', 'CSS', 'JavaScript', 'PHP', 'SQL', ...","[English, Mandarin, Malay, French]","[{'job_title': 'Data Science Intern', 'job_com...",10,No exact match and no matching industry from p...,0,7,5.0,20,"2024, 2020,"


In [292]:
import openai
def gpt_recommendation_summary(data_dict):
    max_retries = 5
    retry_count = 0
    
    data_df = pd.DataFrame.from_dict([data_dict])
    df = data_df[['education_background', 'skill_group',
    'technology_programs_tool',
    'previous_job_roles', 'professional_certificate', 'language']]
    candidate_info = df.to_dict()
    
    try:
        yoer_prompt_system = f"""[Instruction]
        You are the {job_title} recruiter, state all the alignments and misalignments of the candidate's qualifications and experience with the job description, and job requirements.

        [Question]
        - [Job Description]
        {job_description}
        - [Job Requirements]
        {job_requirement}
        """

        yoer_prompt_user = f"""
        {candidate_info}
        """
        client = OpenAI()
        response = client.chat.completions.create(
            model="gpt-3.5-turbo-0125", # 3.5 turbo 
            messages=[
                {"role": "system", "content": yoer_prompt_system},
                {"role": "user", "content": yoer_prompt_user}
            ],
            temperature=0.3
        )

        return response.choices[0].message.content
    except openai.RateLimitError as e:
        print(f"OpenAI rate limit exceeded. Pausing for one minute before resuming... (From RateLimitError)")
        print(e)
        time.sleep(30)
        retry_count += 1
        response = "Error"

        if retry_count >= max_retries:
            print("Exceeded maximum retries for parsing PDF.... (From RateLimitError)")
            return response

In [293]:
data_dict['gpt_recommendation_summary'] = data_dict.apply(lambda row: gpt_recommendation_summary(row), axis=1)

In [295]:
print(data_dict['gpt_recommendation_summary'][0])

**Alignments:**
1. **Education Background:** The candidate holds a Bachelor's degree in Computer Science, which aligns with the job requirement of a Bachelor's degree in Computer Science, Statistics, Mathematics, or a related field.
2. **Skill Set:** The candidate has skills in Python, SQL, and data analysis, which are required for the job. Additionally, the candidate's experience with Python aligns with the job requirement of proficiency in programming languages such as Python, R, or SQL.
3. **Professional Certificates:** The candidate has various certifications related to data analytics and machine learning, which demonstrate a commitment to continuous learning and align with the job requirement of staying current with the latest developments in data science and technology.

**Misalignments:**
1. **Education Background:** The candidate's field of study is more focused on Data Engineering, which may not directly align with the preferred fields of Computer Science, Statistics, or Mathe