In [13]:
import os
import openai
import pandas as pd
import tiktoken  # for counting tokens
from scipy import spatial
import pretty_errors
import timeit
from preprocess import individual_preprocess
from dotenv import load_dotenv
from utils.SummariseJob import summarise_job_gpt
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

In [14]:
# --- Configuration ---------------------------------------------------------
# Secrets and data locations come from the local .env file, never from literals.
load_dotenv('.env')
openai.api_key = os.getenv("OPENAI_API_KEY")
SAVE_PATH = os.getenv("SAVE_PATH")
E5_BASE_TOTAL_JOBS = os.getenv("E5_BASE_TOTAL_JOBS")
OPENAI_TOTAL_JOBS = os.getenv("OPENAI_TOTAL_JOBS")

# Start the timer; `ask` reports its elapsed time relative to this moment,
# i.e. relative to when this cell was executed.
start_time = timeit.default_timer()

# models
EMBEDDING_MODEL = "text-embedding-ada-002"
#GPT_MODEL = "gpt-3.5-turbo"
GPT_MODEL = "gpt-4"
#GPT_MODEL = "gpt-3.5-turbo-16k"

# Load the embedded file.
# The functions below read the "ids", "embeddings" and "text_data" columns.
embeddings_path = E5_BASE_TOTAL_JOBS

# os.getenv returns None when the variable is missing; fail here with a clear
# message instead of letting read_parquet choke on a None path.
if not embeddings_path:
    raise RuntimeError(
        "E5_BASE_TOTAL_JOBS is not set; add the path of the embeddings "
        "parquet file to .env before running this notebook."
    )

df = pd.read_parquet(embeddings_path)


In [15]:
def average_pool(last_hidden_states: Tensor,
                attention_mask: Tensor) -> Tensor:
    """Average the hidden states along dim 1, counting only unmasked positions.

    Positions whose `attention_mask` entry is zero are replaced by 0.0 before
    summing, and the sum is divided by the number of non-zero mask entries
    in each row.
    """
    # True wherever the mask is zero; the trailing axis lets it broadcast
    # over the hidden dimension.
    ignored = ~attention_mask[..., None].bool()
    kept_states = last_hidden_states.masked_fill(ignored, 0.0)

    summed = kept_states.sum(dim=1)
    valid_counts = attention_mask.sum(dim=1)[..., None]
    return summed / valid_counts


In [16]:
# Tokenizer/model pairs already loaded in this kernel, keyed by checkpoint name.
_E5_CACHE: dict = {}


def _load_e5(checkpoint: str = 'intfloat/e5-base-v2'):
    """Return the (tokenizer, model) pair for `checkpoint`, loading it at most once."""
    if checkpoint not in _E5_CACHE:
        _E5_CACHE[checkpoint] = (
            AutoTokenizer.from_pretrained(checkpoint),
            AutoModel.from_pretrained(checkpoint),
        )
    return _E5_CACHE[checkpoint]


def ids_ranked_by_relatedness_e5(query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    #Modify this to get more jobs
    top_n: int = 10
) -> tuple[tuple[str, ...], tuple[float, ...]]:
    """Rank the rows of `df` by relatedness to `query` and return the best `top_n`.

    Args:
        query: Free text to embed; it is passed through `individual_preprocess`
            before tokenization.
        df: Frame with an "ids" column and an "embeddings" column.
        relatedness_fn: Callable scoring (query_embedding, row_embedding);
            higher means more related. Defaults to cosine similarity.
        top_n: Maximum number of results to return.

    Returns:
        Two tuples of equal length, (ids, relatednesses), sorted from most to
        least related. Both are empty when `df` has no rows.
    """
    # zip(*[]) cannot be unpacked into two names, so an empty frame used to crash.
    if df.empty:
        return (), ()

    # Local import: the file only imports `Tensor` from torch at module level.
    import torch

    # Loading the checkpoint is expensive and `ask` calls this function twice
    # per query, so the pair is cached for the lifetime of the kernel.
    tokenizer, model = _load_e5()

    query_preprocess = individual_preprocess(query)

    # Tokenize the input texts
    batch_dict = tokenizer(query_preprocess, max_length=512, padding=True, truncation=True, return_tensors='pt')

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**batch_dict)
    query_embedding = average_pool(outputs.last_hidden_state, batch_dict['attention_mask']).detach().numpy().flatten()

    # Column-wise iteration yields the same pairs as iterrows without
    # constructing a Series per row.
    ids_and_relatednesses = [
        (row_id, relatedness_fn(query_embedding, row_embedding))
        for row_id, row_embedding in zip(df["ids"], df["embeddings"])
    ]
    ids_and_relatednesses.sort(key=lambda pair: pair[1], reverse=True)
    ids, relatednesses = zip(*ids_and_relatednesses)
    return ids[:top_n], relatednesses[:top_n]

In [17]:
#tiktoken function -> to count tokens
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens `text` encodes to under the tiktoken encoding for `model`."""
    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)

In [18]:

# Marker placed on both sides of the CV text in the user message built by `ask`.
delimiters = "####"

# System message sent to the chat model by `ask`.
# NOTE: the "/" at the end of most lines is part of the string and reaches the
# model verbatim; it is not a Python line continuation.
system_prompt=f"""

You are a job recruiter for a large recruitment agency./
You will be provided with a candidate's CV./
The CV will be delimited with {delimiters} characters./
You will also be provided with the Job IDs (delimited by angle brackets) /
and corresponding descriptions (delimited by triple dashes)/
for the available job openings./

Perform the following steps:/

Step 1 - Classify the provided CV into a suitability category for each job opening./
Step 2 - For each ID briefly explain in one sentence your reasoning behind the chosen suitability category./
Step 3 - Only provide your output in json format with the keys: id, suitability and explanation./

Do not classify a CV into a suitability category until you have classify the CV yourself.

Suitability categories: Highly Suitable, Moderately Suitable,/
Potentially Suitable, Marginally Suitable and Not Suitable./

Highly Suitable: CVs in this category closely align with the job opening, demonstrating extensive relevant experience, skills, and qualifications. The candidate possesses all or most of the necessary requirements and is an excellent fit for the role./
Moderately Suitable: CVs falling into this category show a reasonable match to the job opening. The candidate possesses some relevant experience, skills, and qualifications that align with the role, but there may be minor gaps or areas for improvement. With some additional training or development, they could become an effective candidate./
Potentially Suitable: CVs in this category exhibit potential and may possess transferable skills or experience that could be valuable for the job opening. Although they may not meet all the specific requirements, their overall profile suggests that they could excel with the right support and training./
Marginally Suitable: CVs falling into this category show limited alignment with the job opening. The candidate possesses a few relevant skills or experience, but there are significant gaps or deficiencies in their qualifications. They may require substantial training or experience to meet the requirements of the role./
Not Suitable: CVs in this category do not match the requirements and qualifications of the job opening. The candidate lacks the necessary skills, experience, or qualifications, making them unsuitable for the role./
"""

# Header that `query_message` uses as the start of the job-openings message;
# the per-job "ID:<...>" entries are appended after it.
introduction_prompt = """


\n Available job openings:\n

"""


# Earlier, shorter sample CV abstract; not referenced by the code visible in this notebook.
abstract_cv_past = """Data Analyst: Cleansed, analyzed, and visualized data using Python, SQL Server, and Power BI.
Legal Assistant: Drafted legal documents, collaborated on negotiation outlines, and handled trademark registrations.
Data Analyst Jr.: Implemented A/B testing, utilized data analysis tools, and developed real-time visualizations.
Special Needs Counselor: Led and assisted individuals with disabilities, provided personal care, and facilitated camp activities.
Total years of professional experience: 3 years."""

# Sample CV abstract passed to `ask` in the demo cell below.
# NOTE(review): the text begins with "('" — looks like leftover tuple-repr formatting
# from whatever produced it; confirm whether it should be stripped.
abstract_cv = """('Qualifications: \n- LLB Law degree from Universidad de las Américas Puebla (UDLAP) with an accumulated average of 9.4/10.\n- Currently on an international exchange at the University of Bristol for the final year of studying Law.\n- Member of the Honours Program at UDLAP, conducting research on FinTech, Financial Inclusion, Blockchain, Cryptocurrencies, and Smart Contracts.\n\nPrevious job titles:\n- Data Analyst at Tata Consultancy Services México, where I cleansed, interpreted, and analyzed data using Python and SQL Server to produce visual reports with Power BI.\n- Legal Assistant at BLACKSHIIP Venture Capital, responsible for proofreading and drafting legal documents, as well as assisting with negotiations of International Share Purchase Agreements.\n\nResponsibilities/Key Duties:\n- Developed and introduced A/B testing to make data-driven business decisions as a Data Analyst Jr. at AMATL GRÁFICOS.\n- Taught mental arithmetic as a Mathematics Instructor at ALOHA Mental Arithmetic.\n- Led and assisted individuals with physical and mental disabilities as a Special Needs Counsellor at Camp Merrywood and YMCA Camp Independence.\n\nSkills:\n- Proficient in Python, SQL Server, Tableau, Power BI, Bash/Command Line, Git & GitHub, and Office 365.\n- Strong written and verbal communication skills, teamwork, ability to work under pressure, attention to detail, and leadership skills.\n- Knowledge in machine learning, probabilities & statistics, and proofreading.\n\nOther Achievements:\n- Published paper on "Smart Legal Contracts: From Theory to Reality" and participated in the IDEAS Summer Program on Intelligence, Data, Ethics, and Society at the University of California, San Diego."""

In [19]:
def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> tuple[str, str, float]:
    """Build the job-openings message for GPT from the jobs most related to `query`.

    The most related job ids are taken from `ids_ranked_by_relatedness_e5`.
    Each job's "text_data" is summarised with `summarise_job_gpt` and appended
    to the message until adding another entry would push the combined token
    count of message + entry + query past `token_budget`.

    Args:
        query: The candidate's CV text.
        df: Frame with "ids" and "text_data" columns (plus whatever
            `ids_ranked_by_relatedness_e5` needs).
        model: Model name used to pick the tokenizer for token counting.
        token_budget: Maximum token count allowed for message + query.

    Returns:
        (query_user, message, total_cost_summarise_job): the query as a string,
        the assembled job-openings message, and the summed cost reported by
        `summarise_job_gpt` (the caller prints it as USD).
    """
    # Relatedness scores are not needed here; only the ranked ids are used.
    ranked_ids, _relatednesses = ids_ranked_by_relatedness_e5(query, df)

    query_user = f"{query}"
    message = introduction_prompt
    total_cost_summarise_job = 0

    for job_id in ranked_ids:
        # First row whose id matches; raises IndexError if the id is absent.
        job_description = df.loc[df['ids'] == job_id, 'text_data'].iloc[0]

        # The summary has to exist before its token count can be checked, so the
        # cost of the entry that finally overflows the budget is still counted.
        job_summarised, cost = summarise_job_gpt(job_description)
        total_cost_summarise_job += cost

        # Add job descriptions to the message along with job ID
        next_id = f'\nID:<{job_id}>\nJob Description:---{job_summarised}---\n'
        if num_tokens(message + next_id + query_user, model=model) > token_budget:
            break
        message += next_id

    return query_user, message, total_cost_summarise_job


In [20]:
# Disabled variant of query_message, kept inside a string literal so it is never
# defined. It differs from the live version by sending the raw job_description
# (no GPT summarisation) and by returning only (query_user, message).
"""def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    #Return a message for GPT, with relevant source texts pulled from a dataframe.
    ids, relatednesses = ids_ranked_by_relatedness_e5(query, df)
    #Basically giving the most relevant IDs from the previous function
    introduction = introduction_prompt
    query_user = f"{query}"
    message = introduction
    #total_cost_summarise_job = 0
    for id in ids:
        #Get the text for GPT to answer the question
        job_description = df[df['ids'] == id]['text_data'].values[0] 
        
        #Summarise the job description with GPT-3.5
        #job_summarised, cost = summarise_job(job_description)
        #total_cost_summarise_job += cost
        # Add job descriptions to the message along with job ID
        next_id = f'\nID:<{id}>\nJob Description:---{job_description}---\n'
        if (
            num_tokens(message + next_id + query_user, model=model)
            > token_budget
        ):
            break
        else:
            message += next_id
    return query_user, message"""

'def query_message(\n    query: str,\n    df: pd.DataFrame,\n    model: str,\n    token_budget: int\n) -> str:\n    #Return a message for GPT, with relevant source texts pulled from a dataframe.\n    ids, relatednesses = ids_ranked_by_relatedness_e5(query, df)\n    #Basically giving the most relevant IDs from the previous function\n    introduction = introduction_prompt\n    query_user = f"{query}"\n    message = introduction\n    #total_cost_summarise_job = 0\n    for id in ids:\n        #Get the text for GPT to answer the question\n        job_description = df[df[\'ids\'] == id][\'text_data\'].values[0] \n        \n        #Summarise the job description with GPT-3.5\n        #job_summarised, cost = summarise_job(job_description)\n        #total_cost_summarise_job += cost\n        # Add job descriptions to the message along with job ID\n        next_id = f\'\nID:<{id}>\nJob Description:---{job_description}---\n\'\n        if (\n            num_tokens(message + next_id + query_user, mo

In [21]:
# Approximate USD price per 1,000 tokens, as (prompt_rate, completion_rate).
_USD_PER_1K_TOKENS = {
    "gpt-4": (0.03, 0.06),
    "gpt-3.5-turbo": (0.0015, 0.002),
    "gpt-3.5-turbo-16k": (0.003, 0.004),
}


def ask(
    #This query is your question, only parameter to fill in function
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096,
    print_gpt_messages: bool = True,
    print_cost_and_relatednesses: bool = True
) -> str:
    """Classify a CV against the most related job openings using a chat model.

    Args:
        query: The CV text to classify.
        df: Frame of jobs and embeddings; defaults to the frame loaded at the
            top of the notebook (bound when this cell is executed).
        model: Chat model used for token counting, the completion call and
            the cost estimate.
        token_budget: Token limit handed to `query_message`.
        print_gpt_messages: Print the message list before calling the API.
        print_cost_and_relatednesses: Print token usage, approximate cost,
            the ranked ids with their relatedness and the elapsed time.

    Returns:
        The content of the first choice returned by the chat completion.
    """
    query_user, job_id_description, total_cost_summarise_job = query_message(query, df, model=model, token_budget=token_budget)

    # NOTE(review): the job openings are sent with role "assistant"; confirm this
    # is intentional rather than "user".
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"{delimiters}{query_user}{delimiters}"},
        {"role": "assistant", "content": job_id_description}
    ]
    if print_gpt_messages:
        print(messages)
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )
    response_message = response["choices"][0]["message"]["content"]

    if print_cost_and_relatednesses:
        usage = response['usage']
        total_tokens = usage['total_tokens']
        prompt_tokens = usage['prompt_tokens']
        completion_tokens = usage['completion_tokens']
        print(f"\nPROMPT TOKENS USED:{prompt_tokens}\n", f"COMPLETION TOKENS USED:{completion_tokens}\n", f"\nTOTAL TOKENS USED:{total_tokens}\n", )

        #Approximate cost
        # Priced by the model actually called (`model`), not the notebook-wide
        # GPT_MODEL default, so overriding `model` yields the right estimate.
        # Models without an entry in the table print no cost lines.
        rates = _USD_PER_1K_TOKENS.get(model)
        if rates is not None:
            prompt_rate, completion_rate = rates
            prompt_cost = round((prompt_tokens / 1000) * prompt_rate, 3)
            completion_cost = round((completion_tokens / 1000) * completion_rate, 3)
            cost_classify = prompt_cost + completion_cost
            print("COST FOR CLASSIFYING:", f"${cost_classify} USD")
            print("COST FOR SUMMARISING:", f"${total_cost_summarise_job} USD")
            final_cost = total_cost_summarise_job + cost_classify
            print(f"FINAL COST: ${final_cost} USD")

        #relatednesses
        # query_message does not hand back the scores, so the ranking is recomputed.
        ranked_ids, relatednesses = ids_ranked_by_relatedness_e5(query=query, df=df)
        print("\nTHE IDs ARE RANKED BY RELEVANCE:\n")
        # `relatedness` must keep this name: the "=" specifier prints it in the output.
        for job_id, relatedness in zip(ranked_ids, relatednesses):
            print(f"ID: {job_id} has the following {relatedness=:.3f}")

        # Measured from the module-level start_time set in the configuration cell.
        elapsed_time = timeit.default_timer() - start_time
        print("\n", f"DreamedJobAI finished! all in: {elapsed_time:.2f} seconds", "\n")
    return response_message

In [22]:
# Disabled variant of ask, kept inside a string literal so it is never defined.
# It pairs with the two-value query_message variant: token_budget defaults to
# 8192 and the summarisation-cost lines are commented out.
"""def ask(
    #This query is your question, only parameter to fill in function
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 8192,
    print_gpt_messages: bool = True,
    print_cost_and_relatednesses: bool = True
) -> str:
    #Answers a query using GPT and a dataframe of relevant texts and embeddings.
    query_user, job_id_description = query_message(query, df, model=model, token_budget=token_budget)
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"{delimiters}{query_user}{delimiters}"},
        {"role": "assistant", "content": job_id_description}
    ]
    if print_gpt_messages:
        print(messages)
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )
    response_message = response["choices"][0]["message"]["content"]
    
    if print_cost_and_relatednesses:
        total_tokens = response['usage']['total_tokens']
        prompt_tokens = response['usage']['prompt_tokens']
        completion_tokens = response['usage']['completion_tokens']
        print(f"\nPROMPT TOKENS USED:{prompt_tokens}\n", f"COMPLETION TOKENS USED:{completion_tokens}\n", f"\nTOTAL TOKENS USED:{total_tokens}\n", )
        #Approximate cost
        if GPT_MODEL == "gpt-4":
            prompt_cost = round((prompt_tokens / 1000) * 0.03, 3)
            completion_cost = round((completion_tokens / 1000) * 0.06, 3)
            cost_classify = prompt_cost + completion_cost
            print(f"COST FOR CLASSIFYING:", f"${cost_classify} USD")
            #print(f"COST FOR SUMMARISING:", f"${total_cost_summarise_job} USD")
            #final_cost = total_cost_summarise_job + cost_classify
            #print(f"FINAL COST: ${final_cost} USD")
        elif GPT_MODEL == "gpt-3.5-turbo":
            prompt_cost = round((prompt_tokens / 1000) * 0.0015, 3)
            completion_cost = round((completion_tokens / 1000) * 0.002, 3)
            cost_classify = prompt_cost + completion_cost
            print(f"COST FOR CLASSIFYING:", f"${cost_classify} USD")
            #print(f"COST FOR SUMMARISING:", f"${total_cost_summarise_job} USD")
            #final_cost = total_cost_summarise_job + cost_classify
            #print(f"FINAL COST: ${final_cost} USD")
        elif GPT_MODEL == "gpt-3.5-turbo-16k":
            prompt_cost = round((prompt_tokens / 1000) * 0.003, 3)
            completion_cost = round((completion_tokens / 1000) * 0.004, 3)
            cost_classify = prompt_cost + completion_cost
            print(f"COST FOR CLASSIFYING:", f"${cost_classify} USD")
            #print(f"COST FOR SUMMARISING:", f"${total_cost_summarise_job} USD")
            #final_cost = total_cost_summarise_job + cost_classify
            #print(f"FINAL COST: ${final_cost} USD")

        #relatednesses
        ids, relatednesses = ids_ranked_by_relatedness_e5(query=query, df=df)
        print(f"\nTHE IDs ARE RANKED BY RELEVANCE:\n")
        for id, relatedness in zip(ids, relatednesses):
            print(f"ID: {id} has the following {relatedness=:.3f}")
        
        elapsed_time = timeit.default_timer() - start_time
        print("\n", f"DreamedJobAI finished! all in: {elapsed_time:.2f} seconds", "\n") 
    return response_message"""

'def ask(\n    #This query is your question, only parameter to fill in function\n    query: str,\n    df: pd.DataFrame = df,\n    model: str = GPT_MODEL,\n    token_budget: int = 8192,\n    print_gpt_messages: bool = True,\n    print_cost_and_relatednesses: bool = True\n) -> str:\n    #Answers a query using GPT and a dataframe of relevant texts and embeddings.\n    query_user, job_id_description = query_message(query, df, model=model, token_budget=token_budget)\n    messages = [\n        {"role": "system", "content": system_prompt},\n        {"role": "user", "content": f"{delimiters}{query_user}{delimiters}"},\n        {"role": "assistant", "content": job_id_description}\n    ]\n    if print_gpt_messages:\n        print(messages)\n    response = openai.ChatCompletion.create(\n        model=model,\n        messages=messages,\n        temperature=0\n    )\n    response_message = response["choices"][0]["message"]["content"]\n    \n    if print_cost_and_relatednesses:\n        total_tokens

In [23]:
# Demo run: classify the sample CV and print the model's reply.
# This calls the OpenAI API (summaries plus classification).
print(ask(query=abstract_cv))

[{'role': 'system', 'content': "\n\nYou are a job recruiter for a large recruitment agency./\nYou will be provided with a candidate's CV./\nThe CV will be delimited with #### characters./\nYou will also be provided with the Job IDs (delimited by angle brackets) /\nand corresponding descriptions (delimited by triple dashes)/\nfor the available job openings./\n\nPerform the following steps:/\n\nStep 1 - Classify the provided CV into a suitability category for each job opening./\nStep 2 - For each ID briefly explain in one sentence your reasoning behind the chosen suitability category./\nStep 3 - Only provide your output in json format with the keys: id, suitability and explanation./\n\nDo not classify a CV into a suitability category until you have classify the CV yourself.\n\nSuitability categories: Highly Suitable, Moderately Suitable,/\nPotentially Suitable, Marginally Suitable and Not Suitable./\n\nHighly Suitable: CVs in this category closely align with the job opening, demonstratin

In [24]:
# Prompt asking the model for a four-bullet abstract of a CV.
# The string ends with "CV IS BELOW: " — presumably the caller appends the CV
# text after it (the call site is not visible in this part of the notebook).
# The instruction list is numbered 1-5; "2." previously appeared twice.
gpt_3_summary = """Write an abstract description of the following CV in four bullet points. 

Let's think step by step to summarise it into four bullet points:

1. Focus on the main skills and responsibilities of each role. 
2. One of the bullet points is the total years of experience.
3. Omit the employer names. 
4. Your answer must be in an active voice. 
5. Double-check that the summary is in four bullet points, three summarising the main skills and responsibilities and the last one is about the years of experience. 
CV IS BELOW: """