In [33]:
import os
import openai
import psycopg2
import pandas as pd
import tiktoken  # for counting tokens
from scipy import spatial
import pretty_errors
import timeit
import logging
import time
import asyncio
from openai.error import OpenAIError
import json
from typing import Callable
from utils.preprocess import individual_preprocess
from dotenv import load_dotenv
from utils.SummariseJob import summarise_job_gpt
from utils.AsyncSummariseJob import async_summarise_description
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from utils.handy import e5_base_v2_query, LoggingGPT4, filter_last_two_weeks
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff


In [34]:
load_dotenv('.env')
openai.api_key = os.getenv("OPENAI_API_KEY")
user = os.getenv("user")
password = os.getenv("password")
host = os.getenv("host")
port = os.getenv("port")
database = os.getenv("database")
SAVE_PATH = os.getenv("SAVE_PATH")
E5_BASE_V2_DATA = os.getenv("E5_BASE_V2_DATA")


#Start the timer
start_time = timeit.default_timer()

# models
EMBEDDING_MODEL = "text-embedding-ada-002"
#GPT_MODEL = "gpt-3.5-turbo"
GPT_MODEL = "gpt-4"
#GPT_MODEL = "gpt-3.5-turbo-16k"
""""
Load the embedded file
"""

logging.basicConfig(filename='/Users/juanreyesgarcia/Library/CloudStorage/OneDrive-FundacionUniversidaddelasAmericasPuebla/DEVELOPER/PROJECTS/DreamedJobAI/logs/LoggingGPT4.log', level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


embeddings_path = E5_BASE_V2_DATA

df_unfiltered = pd.read_parquet(embeddings_path)

df = filter_last_two_weeks(df_unfiltered)


In [35]:
def ids_ranked_by_relatedness_e5(query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    #Modify this to get more jobs
    top_n: int = 10
) -> tuple[list[str], list[float]]:
    
    #the query is embedded using e5
    query_embedding = e5_base_v2_query(query=query)

    ids_and_relatednesses = [
        (row["id"], relatedness_fn(query_embedding, row["embedding"]))
        for i, row in df.iterrows()
    ]
    ids_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    ids, relatednesses = zip(*ids_and_relatednesses)
    return ids[:top_n], relatednesses[:top_n]     
    #Returns a list of strings and relatednesses, sorted from most related to least.


In [36]:
#tiktoken function -> to count tokens
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

In [37]:

delimiters = "####"

system_prompt=f"""

You are a job recruiter for a large recruitment agency./
You will be provided with a candidate's CV./
The CV will be delimited with {delimiters} characters./
You will also be provided with the Job IDs (delimited by angle brackets) /
and corresponding descriptions (delimited by triple dashes)/
for the available job openings./

Perform the following steps:/

Step 1 - Classify the provided CV into a suitability category for each job opening./
Step 2 - For each ID briefly explain in one sentence your reasoning behind the chosen suitability category./
Step 3 - Only provide your output in json format with the keys: id, suitability and explanation./

Do not classify a CV into a suitability category until you have classify the CV yourself.

Suitability categories: Highly Suitable, Moderately Suitable, Potentially Suitable, Marginally Suitable and Not Suitable./

Highly Suitable: CVs in this category closely align with the job opening, demonstrating extensive relevant experience, skills, and qualifications. The candidate possesses all or most of the necessary requirements and is an excellent fit for the role./
Moderately Suitable: CVs falling into this category show a reasonable match to the job opening. The candidate possesses some relevant experience, skills, and qualifications that align with the role, but there may be minor gaps or areas for improvement. With some additional training or development, they could become an effective candidate./
Potentially Suitable: CVs in this category exhibit potential and may possess transferable skills or experience that could be valuable for the job opening. Although they may not meet all the specific requirements, their overall profile suggests that they could excel with the right support and training./
Marginally Suitable: CVs falling into this category show limited alignment with the job opening. The candidate possesses a few relevant skills or experience, but there are significant gaps or deficiencies in their qualifications. They may require substantial training or experience to meet the requirements of the role./
Not Suitable: CVs in this category do not match the requirements and qualifications of the job opening. The candidate lacks the necessary skills, experience, or qualifications, making them unsuitable for the role./
"""

introduction_prompt = """


\n Available job openings:\n

"""


abstract_cv_past = """Data Analyst: Cleansed, analyzed, and visualized data using Python, SQL Server, and Power BI.
Legal Assistant: Drafted legal documents, collaborated on negotiation outlines, and handled trademark registrations.
Data Analyst Jr.: Implemented A/B testing, utilized data analysis tools, and developed real-time visualizations.
Special Needs Counselor: Led and assisted individuals with disabilities, provided personal care, and facilitated camp activities.
Total years of professional experience: 3 years."""

abstract_cv = """('Qualifications: \n- LLB Law degree from Universidad de las Américas Puebla (UDLAP) with an accumulated average of 9.4/10.\n- Currently on an international exchange at the University of Bristol for the final year of studying Law.\n- Member of the Honours Program at UDLAP, conducting research on FinTech, Financial Inclusion, Blockchain, Cryptocurrencies, and Smart Contracts.\n\nPrevious job titles:\n- Data Analyst at Tata Consultancy Services México, where I cleansed, interpreted, and analyzed data using Python and SQL Server to produce visual reports with Power BI.\n- Legal Assistant at BLACKSHIIP Venture Capital, responsible for proofreading and drafting legal documents, as well as assisting with negotiations of International Share Purchase Agreements.\n\nResponsibilities/Key Duties:\n- Developed and introduced A/B testing to make data-driven business decisions as a Data Analyst Jr. at AMATL GRÁFICOS.\n- Taught mental arithmetic as a Mathematics Instructor at ALOHA Mental Arithmetic.\n- Led and assisted individuals with physical and mental disabilities as a Special Needs Counsellor at Camp Merrywood and YMCA Camp Independence.\n\nSkills:\n- Proficient in Python, SQL Server, Tableau, Power BI, Bash/Command Line, Git & GitHub, and Office 365.\n- Strong written and verbal communication skills, teamwork, ability to work under pressure, attention to detail, and leadership skills.\n- Knowledge in machine learning, probabilities & statistics, and proofreading.\n\nOther Achievements:\n- Published paper on "Smart Legal Contracts: From Theory to Reality" and participated in the IDEAS Summer Program on Intelligence, Data, Ethics, and Society at the University of California, San Diego."""

In [38]:
async def async_query_summary(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    #Return a message for GPT, with relevant source texts pulled from a dataframe.
    ids, relatednesses = ids_ranked_by_relatedness_e5(query, df)
    #Basically giving the most relevant IDs from the previous function
    introduction = introduction_prompt
    query_user = f"{query}"
    message = introduction
    # Create a list of tasks
    tasks = [async_summarise_description(df[df['id'] == id]['original'].values[0]) for id in ids]

    # Run the tasks concurrently
    results = await asyncio.gather(*tasks)    

    for id, result in zip(ids, results):
        job_description_summary, cost, elapsed_time = result
        logging.info(f"TOTAL COST: {cost}. TOTAL ELAPSED TIME: {elapsed_time}")
        next_id = f'\nID:<{id}>\nJob Description:---{job_description_summary}---\n'
        if (
            num_tokens(message + next_id + query_user, model=model)
            > token_budget
        ):
            break
        else:
            message += next_id
    return query_user, message

In [39]:
#@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
async def ask(
    #This query is your question, only parameter to fill in function
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 8192,
    log_gpt_messages: bool = True
) -> str:
    #Answers a query using GPT and a dataframe of relevant texts and embeddings.
    query_user, job_id_description = await async_query_summary(query, df, model=model, token_budget=token_budget)
    
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"{delimiters}{query_user}{delimiters}"},
        {"role": "assistant", "content": job_id_description}
    ]
    if log_gpt_messages:
        logging.info(messages)
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )
    response_message = response["choices"][0]["message"]["content"]
    
    #if print_cost_and_relatednesses:
    total_tokens = response['usage']['total_tokens']
    prompt_tokens = response['usage']['prompt_tokens']
    completion_tokens = response['usage']['completion_tokens']
    logging.info(f"\nPROMPT TOKENS USED:{prompt_tokens}\n COMPLETION TOKENS USED:{completion_tokens}\n \nTOTAL TOKENS USED:{total_tokens}\n")
    #Approximate cost
    if GPT_MODEL == "gpt-4":
        prompt_cost = round((prompt_tokens / 1000) * 0.03, 3)
        completion_cost = round((completion_tokens / 1000) * 0.06, 3)
        cost_classify = prompt_cost + completion_cost
        logging.info(f"MODEL USED: {GPT_MODEL}. COST FOR CLASSIFYING: ${cost_classify} USD")
    elif GPT_MODEL == "gpt-3.5-turbo":
        prompt_cost = round((prompt_tokens / 1000) * 0.0015, 3)
        completion_cost = round((completion_tokens / 1000) * 0.002, 3)
        cost_classify = prompt_cost + completion_cost
        logging.info(f"MODEL USED: {GPT_MODEL}. COST FOR CLASSIFYING: ${cost_classify} USD")
    elif GPT_MODEL == "gpt-3.5-turbo-16k":
        prompt_cost = round((prompt_tokens / 1000) * 0.003, 3)
        completion_cost = round((completion_tokens / 1000) * 0.004, 3)
        cost_classify = prompt_cost + completion_cost
        logging.info(f"MODEL USED: {GPT_MODEL}. COST FOR CLASSIFYING: ${cost_classify} USD")

    #relatednesses
    ids, relatednesses = ids_ranked_by_relatedness_e5(query=query, df=df)
    for id, relatedness in zip(ids, relatednesses):
        logging.info(f"ID: {id} has the following {relatedness=:.3f}")
    
    elapsed_time = timeit.default_timer() - start_time
    logging.info(f"\n DreamedJobAI finished! all in: {elapsed_time:.2f} seconds \n")
    
    return response_message

In [41]:

async def check_output_GPT4(input_cv: str) -> str:
    default = '[{"id": "", "suitability": "", "explanation": ""}]'
    default_json = json.loads(default)
    
    for _ in range(6):
        i = _ + 1
        try:
            python_string = await ask(input_cv)
            try:
                data = json.loads(python_string)
                logging.info(f"Response is a valid json object. Done in loop number: {i}")
                return data
            except json.JSONDecodeError:
                pass
        except OpenAIError as e:
            logging.warning(f"{e}. Retrying in 10 seconds. Number of retries: {i}")
            time.sleep(10)
            pass
        except Exception as e:
            logging.warning(f"{e}. Retrying in 5 seconds. Number of retries: {i}")
            time.sleep(5)
            pass

    logging.error("Check logs!!!! Main function was not callable. Setting json to default")
    return default_json

#checked_json = check_output_GPT4(ask, abstract_cv)
checked_json = await check_output_GPT4(abstract_cv)
#exp = check_output_GPT4(get_data, 0)

logging.info(f"type of the json object: {type(checked_json)} Data: {checked_json}")
#print(type(exp), exp)


  pass
  for group in groupby(strings, lambda s: s[0] == first[0])) \


KeyboardInterrupt: 

In [None]:
async def ids_json_loads(data: list[dict[str, str, str]] = None) -> str:
    if data is None:
        data = await checked_json
    
    ids = ""
    for item in data:
        if "id" in item:
            if ids:
                ids += ", "
            ids += f"'{item['id']}'"

    return f"({ids})"

ids_ready = await ids_json_loads()
logging.info(f"Getting the ids from the json object: {type(ids_ready)}, {ids_ready}")



In [None]:
def set_dataframe_display_options():
    pd.set_option('display.max_columns', None)  # Show all columns
    pd.set_option('display.max_rows', None)  # Show all rows
    pd.set_option('display.width', None)  # Disable column width restriction
    pd.set_option('display.expand_frame_repr', False)  # Disable wrapping to multiple lines
    pd.set_option('display.max_colwidth', None)  # Display full contents of each column

# Call the function to set the desired display options
set_dataframe_display_options()

In [None]:
def join_postgre_data_with_ids(ids:str) -> pd.DataFrame:
    conn = psycopg2.connect(user=user, password=password, host=host, port=port, database=database)
    # Create a cursor object
    cur = conn.cursor()
    cur.execute( f"SELECT id, title, link, location FROM test WHERE id IN {ids}")

    # Fetch all rows from the table
    rows = cur.fetchall()

    # Separate the columns into individual lists
    all_ids = [row[0] for row in rows]
    all_titles = [row[1] for row in rows]
    all_links = [row[2] for row in rows]
    all_locations = [row[3] for row in rows]

    df = pd.DataFrame({
        'id': all_ids,
        'title': all_titles,
        'link': all_links,
        'location': all_locations
    })


    # Close the database connection
    cur.close()
    conn.close()

    return df

df = join_postgre_data_with_ids(ids=ids_ready)

print(df)


Empty DataFrame
Columns: [id, title, link, location]
Index: []


In [None]:
def adding_all_data(df: pd.DataFrame, suitable_jobs: list) -> pd.DataFrame:
    for index, row in df.iterrows():
        entry_id = row['id']
        for json_item in suitable_jobs:
            if int(json_item['id']) == entry_id:
                suitability = json_item['suitability']
                explanation = json_item['explanation']
                df.at[index, 'suitability'] = suitability
                df.at[index, 'explanation'] = explanation
                break
    return df

updated_data = adding_all_data(df=df, suitable_jobs=checked_json)

print(updated_data)


Empty DataFrame
Columns: [id, title, link, location]
Index: []


In [None]:
def sort_df_by_suitability(df: pd.DataFrame = df) -> pd.DataFrame:
    custom_order = {
        'Highly Suitable': 1,
        'Moderately Suitable': 2,
        'Potentially Suitable': 3,
        'Marginally Suitable': 4,
        'Not Suitable': 5
    }
    df['suitability_rank'] = df['suitability'].map(custom_order)
    sorted_df = df.sort_values(by='suitability_rank')
    sorted_df = sorted_df.drop(columns='suitability_rank')
    return sorted_df

sorted_df = sort_df_by_suitability()

print(sorted_df)

test_output= sorted_df.to_csv("/Users/juanreyesgarcia/Library/CloudStorage/OneDrive-FundacionUniversidaddelasAmericasPuebla/DEVELOPER/PROJECTS/DreamedJobAI/data/test_output.csv", index=False)

  return compile(source, filename, mode, flags,


KeyError: 'suitability'

In [None]:
gpt_3_summary = """Write an abstract description of the following CV in four bullet points. 

Let's think step by step to summarise it into four bullet points:

1. Focus on the main skills and responsibilities of each role. 
2. One of the bullet points is the total years of experience.
2. Omit the employer names. 
3. Your answer must be in an active voice. 
4. Double-check that the summary is in four bullet points, three summarising the main skills and responsibilities and the last one is about the years of experience. 
CV IS BELOW: """