In [7]:
from dotenv import load_dotenv
import os

load_dotenv(".env")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"FYP-Goo"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = os.environ.get('LANGCHAIN_API_KEY')

In [1]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("GooYeJui.pdf", extract_images=True)
pages = loader.load()

In [2]:
pages

[Document(page_content='Profile Summary\nIndustry ExperienceRelevant Skills\nGOO YE JUI\nA recent graduate with a strong foundation in both front-end and back-end development, combined with a\npassion for innovation and a commitment to making lives better through technology. Eager to contribute to\nthe vision of Unit Nukleus GovTech by leveraging technical expertise to empower the nation and enhance\ndigital government services. \nFull-stack web development (HTML 5, CSS,\nJavaScript, PHP, SQL, Python, .NET, React)\nNatural Language Processing: spaCy, NLTK,\nTensorFlow, PyTorch\nGenerative AI related : LangChain, Llama\nIndex\nPetronas Digital Sdn Bhd - Data Science Intern Sept 2023 - Jun 2024\nDeveloped (in 2 months) an Generative AI Based Resume Parser for Group HRM. The AI-powered parser\nautomates the extraction and parsing of candidate information from resumes, saving HR professionals\nvaluable time and effort in manually reviewing and categorizing resumes. Responsible for all fron

In [9]:
from typing import List, Optional
from langchain_core.pydantic_v1 import BaseModel, Field
class Candidate(BaseModel):
    """Information about a candidate from his/her resume."""

    # ^ Doc-string for the entity Person.
    # This doc-string is sent to the LLM as the description of the schema Person,
    # and it can help to improve extraction results.

    # Note that:
    # 1. Each field is an `optional` -- this allows the model to decline to extract it!
    # 2. Each field has a `description` -- this description is used by the LLM.
    # Having a good description can help improve extraction results.

    name: Optional[str] = Field(..., description="The name of the candidate")
    phone_number: Optional[str] = Field(
        ..., description="The phone number of the candidate"
    )
    email: Optional[str] = Field(
        ..., description="The email of the candidate"
    )
    local: Optional[str] = Field(
        ..., description="Is the candidate Malaysian(Yes or No)?"
    )
    expected_salary: Optional[str] = Field(
        ..., description="Candidate's expected salary in RM if known. (If the currency is Ringgit Malaysia, assign the numerical value or range values only Eg:'3000-3100'. If in other currency, assign alongside currency)"
    )
    current_location: Optional[List] = Field(
        ..., description="Candidate's current location if known. If the candidate does not mention the country, assign the country based on the state and city (return it in a python list containing dictionary format like this 'Country': '', 'State': '', 'City': '' )"
    )
    education_background: Optional[List] = Field(
        ..., description="Every single candidate's education background. (field of study, level (always expand to long forms), cgpa, university, Start Date, Year of Graduation (Year in 4-digits only, remove month). All in a python dict format."
    )
    professional_certificate: Optional[List] = Field(
        ..., description="Candidate's professional certificates if known"
    )
    skill_group: Optional[List] = Field(
        ..., description="Candidate's skill groups if known"
    )
    technology_programs_tool: Optional[List] = Field(
        ..., description="Technology (Tools, Program, System) that the candidate knows if known."
    )
    language: Optional[List] = Field(
        ..., description="Languages that the candidate knows"
    )
    previous_job_roles: Optional[List] = Field(
        ..., description="Every single one of the candidate's (job title, job company, Industries (strictly classify according to to The International Labour Organization), start date and end date (only assign date time format if available. Do not assign duration), job location, Job Duration (Years) (if not in years, convert to years)) (If duration is stated, update the job duration instead.) in a python dict format."
    )


In [12]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from datetime import datetime

# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples into the prompt template to improve extraction quality
# 2) Introduce additional parameters to take context into account (e.g., include metadata
#    about the document from which the text was extracted.)
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm with 20 years experience in the recruiting industry. You will be provided with candidate's resume. "
            "Extract relevant candidate's information mentioned in the following candidate's resume together with their properties. "
            "If you do not know the value of an attribute asked to extract, "
            "1) Please provide an accurate answers, no guessing."
            "2) Please return 'N/A' only if the information is not mentioned."
            "3) The response should strictly follow the Python dictionary format."
            "4) No need to return any reasoning as this is only for extraction of information."
            "5) Extracted Properties of all Start date and End date: "
            "* if the month is not stated, assume that start/end date is in the middle of the year. "
            "* should never include english words such as 'months', 'years', 'days'. "
            "* Instead, dates should be dates converted to the following format: "
            "* date values assigned are strictly in Python datetime format "
            """Strict Format of either one: 
                YYYY
                YYYY-MM or YYYYMM
                YYYY-MM-DD or YYYYMMDD
            6) Ensure that for any duration (year) calculation: 
            * Any end date that indicates "Present", refers to today's date, which is {current_date}. 
            * Do not assume the work experiences are continuous without breaks.
            * Method of duration calculation: Subtract the end date from start date to get the number of months. Finally sum up all relevant durations and convert to years. 
            * Triple check your calculations. ","""
        ),
        ("human", "{text}"),
    ]
)

llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.3)
runnable = prompt | llm.with_structured_output(schema=Candidate)
result = runnable.invoke({"text": pages,"current_date":datetime.now()})

In [22]:
result

Candidate(name='GOO YE JUI', phone_number='+60184040438', email='yjyejui626@gmail.com', local='N/A', expected_salary='N/A', current_location=['Country: Malaysia', 'State: Penang', 'City: Bukit Mertajam'], education_background=[{'field_of_study': 'Bachelor Of Computer Science (Data Engineering)', 'level': "Bachelor's Degree", 'cgpa': '3.97', 'university': 'Universiti Teknologi Malaysia', 'start_date': '2020', 'year_of_graduation': '2024'}, {'field_of_study': 'Foundation in Science', 'level': 'Foundation', 'cgpa': '3.78', 'university': 'Universiti Teknologi Malaysia', 'start_date': '2019', 'year_of_graduation': '2020'}], professional_certificate=['Microsoft Certified: Azure AI Fundamentals', 'Google Data Analytics Certificate by Coursera', 'Alteryx Foundational Micro-Credential', 'Alteryx Designer Core Certification', 'AWS Academy Graduate - AWS Academy Cloud Foundations', 'AWS Academy Graduate - AWS Academy Machine Learning Foundations', 'AWS Academy Graduate - AWS Academy Data Analytic

In [47]:
result.education_background

[{'field_of_study': 'Bachelor Of Computer Science (Data Engineering)',
  'level': "Bachelor's Degree",
  'cgpa': '3.97',
  'university': 'Universiti Teknologi Malaysia',
  'start_date': '2020',
  'year_of_graduation': '2024'},
 {'field_of_study': 'Foundation in Science',
  'level': 'Foundation',
  'cgpa': '3.78',
  'university': 'Universiti Teknologi Malaysia',
  'start_date': '2019',
  'year_of_graduation': '2020'}]

In [46]:
for education in result.education_background:
    print(str(education))

{'field_of_study': 'Bachelor Of Computer Science (Data Engineering)', 'level': "Bachelor's Degree", 'cgpa': '3.97', 'university': 'Universiti Teknologi Malaysia', 'start_date': '2020', 'year_of_graduation': '2024'}
{'field_of_study': 'Foundation in Science', 'level': 'Foundation', 'cgpa': '3.78', 'university': 'Universiti Teknologi Malaysia', 'start_date': '2019', 'year_of_graduation': '2020'}


In [69]:
import pandas as pd

# Initialize an empty list to store row dictionaries
rows = []

# Iterate over each result and append it to the DataFrame
for field_name, candidate_info in result.items():
    # Create a dictionary representing the row
    row_dict = {field_name: str(candidate_info)}
    # Append the row dictionary to the list
    rows.append(row_dict)

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(rows)

# Display the DataFrame
print(df)

{'name': 'GOO YE JUI'}


AttributeError: 'DataFrame' object has no attribute 'append'

In [66]:
df

Unnamed: 0,name,phone_number,email,local,expected_salary,current_location,education_background,professional_certificate,skill_group,technology_programs_tool,language,previous_job_roles
