In [1]:
import os
import openai
import pandas as pd
import tiktoken  # for counting tokens
import ast  # for converting embeddings saved as strings back to arrays
from scipy import spatial
import pretty_errors
from preprocess import individual_preprocess
from displayfunction import display
from dotenv import load_dotenv

In [2]:

# Read credentials and data locations from the local .env file.
load_dotenv('.env')
openai.api_key = os.getenv("OPENAI_API_KEY")
SAVE_PATH = os.getenv("SAVE_PATH")
E5_BASE_TOTAL_JOBS = os.getenv("E5_BASE_TOTAL_JOBS")
OPENAI_TOTAL_JOBS = os.getenv("OPENAI_TOTAL_JOBS")


# models
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"
#GPT_MODEL = "gpt-4"

# Load the embedded file
embeddings_path = OPENAI_TOTAL_JOBS
if not embeddings_path:
    # os.getenv returns None when the variable is absent; fail here with a
    # clear message instead of letting read_parquet choke on a None path.
    raise RuntimeError(
        "OPENAI_TOTAL_JOBS is not set; define it in .env with the path to the "
        "parquet file that holds the job embeddings."
    )

df = pd.read_parquet(embeddings_path)

# Peek at the embeddings column. No string-to-list conversion is performed
# here: the column is used exactly as read from the parquet file.
x = df["embeddings"]
print(x)

0       [0.0011792784789577127, -0.0062871999107301235...
1       [0.00448582973331213, -0.009026281535625458, -...
2       [0.0032345347572118044, -0.008394144475460052,...
3       [0.0010413312120363116, -0.0062429034151136875...
4       [0.008784311823546886, -0.005815159995108843, ...
                              ...                        
1738    [-0.026897858828306198, 0.006618379149585962, ...
1739    [-0.027289917692542076, -0.014915977604687214,...
1740    [-0.009569555521011353, -0.0051603857427835464...
1741    [-0.025336530059576035, -0.012708911672234535,...
1742    [-0.02954697422683239, -0.026450172066688538, ...
Name: embeddings, Length: 1743, dtype: object


"""
## 2. Search

Now we'll define a search function that:
- Takes a user query and a dataframe with `ids` & `embeddings` columns
- Embeds the user query with the OpenAI API
- Uses cosine similarity between the query embedding and each job embedding to rank the jobs
- Returns two parallel sequences:
    - The top N job IDs, ranked by relevance
    - Their corresponding relevance scores

"""

In [3]:

# search function
def ids_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    # Raise this to get more jobs back
    top_n: int = 15
) -> tuple[list[str], list[float]]:
    """Rank the job IDs in ``df`` by relatedness to ``query``.

    The query is run through ``individual_preprocess``, embedded with
    ``EMBEDDING_MODEL`` via the OpenAI API, and compared against every value
    in ``df["embeddings"]`` using ``relatedness_fn``.

    Args:
        query: Free-text description of what the user is looking for.
        df: Dataframe providing an ``ids`` column and an ``embeddings`` column.
        relatedness_fn: Callable ``(query_embedding, row_embedding) -> score``;
            higher means more related. Defaults to cosine similarity.
        top_n: Maximum number of results to return.

    Returns:
        Two parallel lists, most related first: the job IDs and their scores.
        Both are empty when ``df`` has no rows.
    """
    # Nothing to rank: skip the embedding request and avoid unpacking an
    # empty zip(), which would raise ValueError.
    if len(df) == 0:
        return [], []

    query_preprocess = individual_preprocess(query)
    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query_preprocess,
    )
    # Embedding of the query (e.g. "Data Engineer") that every job is scored against
    query_embedding = query_embedding_response["data"][0]["embedding"]

    # Iterate the two columns directly; building a Series per row with
    # iterrows() is unnecessary here.
    ids_and_relatednesses = [
        (job_id, relatedness_fn(query_embedding, embedding))
        for job_id, embedding in zip(df["ids"], df["embeddings"])
    ]
    ids_and_relatednesses.sort(key=lambda pair: pair[1], reverse=True)

    top = ids_and_relatednesses[:top_n]
    return [job_id for job_id, _ in top], [score for _, score in top]


In [4]:
# Rank every job against a sample query and list the top matches.
# NOTE(review): the "query: " prefix looks like an E5-style convention; confirm
# it is intended when embedding with EMBEDDING_MODEL.
query = "query: Python Remote Worldwide or MX (Mexico)"
ids, relatednesses = ids_ranked_by_relatedness(query, df)
print("\nTHE IDs ARE RANKED BY RELEVANCE:\n")
# Loop variable is `job_id` rather than `id` so the builtin id() is not
# rebound for the rest of the notebook session.
for job_id, relatedness in zip(ids, relatednesses):
    print(f"ID: {job_id} has the following {relatedness=:.3f}")


THE IDs ARE RANKED BY RELEVANCE:

ID: 34207 has the following relatedness=0.837
ID: 35211 has the following relatedness=0.834
ID: 36738 has the following relatedness=0.830
ID: 33752 has the following relatedness=0.830
ID: 35222 has the following relatedness=0.829
ID: 34204 has the following relatedness=0.828
ID: 34294 has the following relatedness=0.826
ID: 34155 has the following relatedness=0.826
ID: 34191 has the following relatedness=0.824
ID: 35209 has the following relatedness=0.823
ID: 34244 has the following relatedness=0.823
ID: 40019 has the following relatedness=0.822
ID: 34186 has the following relatedness=0.820
ID: 34237 has the following relatedness=0.818
ID: 38202 has the following relatedness=0.818


In [None]:
#tiktoken function -> to count tokens
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count how many tokens ``text`` occupies under the tiktoken encoding for ``model``."""
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(text)
    return len(token_ids)

In [None]:
# Instruction text that opens the user message built by query_message(); the
# ranked job IDs and descriptions are appended after it.
# NOTE(review): "called them" reads like a typo for "call them"; left as-is
# because this text is sent to the model verbatim.
introduction_prompt = """

To assist you, here are the job IDs and the respective job descriptions ranked by relatedness to the user's specifications.
Review these jobs and output the ID(s) of the job(s) that meet the following criteria:
1. The job(s) contain the user's requirements; or
2. The job(s) contain both "requirements" and "preferences". In this case, called them: "Dream Jobs"

If no job meets all “requirements”, respond with "I am sorry, I do not have your dream job just yet."

"""

In [None]:
def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the user message for GPT from the jobs most related to ``query``.

    The message starts with ``introduction_prompt``, followed by as many
    ranked job entries (ID + description) as fit, and ends with the user's
    requirements and preferences.

    Args:
        query: The user's requirements and preferences.
        df: Dataframe providing ``ids`` and ``text_data`` columns (plus
            whatever ``ids_ranked_by_relatedness`` reads).
        model: Model name used by ``num_tokens`` to count tokens.
        token_budget: Maximum token count allowed for the returned message.

    Returns:
        The assembled message text.
    """
    # Most relevant job IDs first; the scores are not needed here.
    ids, _ = ids_ranked_by_relatedness(query, df)
    # A blank line separates the job list from the user's request.
    question = f"\n\nRequirements and Preferences of the user: {query}"
    message = introduction_prompt
    for job_id in ids:
        # Text of the job that GPT will read; first match wins if IDs repeat.
        job_description = df[df['ids'] == job_id]['text_data'].values[0]

        # Job entry: the ID and its description, each fenced in triple quotes.
        next_id = f'\n\nJob\'s ID:\n"""\n{job_id}\n"""\nJob Description:\n"""\n{job_description}\n"""'
        # Stop at the first entry that would push the message over budget.
        if num_tokens(message + next_id + question, model=model) > token_budget:
            break
        message += next_id
    return message + question


In [None]:
# System message sent as the first chat message by ask(): sets the assistant's
# persona and states the job-matching task.
system_prompt="""

You are DreamJobAI, an expert in job searching. 
Users will share the “requirements” and “preferences” of their dream job.
You are going to be provided with jobs ranked by relatedness to the user's “requirements” and “preferences”, use those jobs to complete your task.
*Your task is to find jobs which either meet the user's "requirements" or the user's "requirements" and “preferences”.* 
"""


In [None]:
# Last user message in the conversation built by ask(); it repeats the task
# statement that closes system_prompt.
user_reminder = """

*Your task is to find jobs which either meet the user's "requirements" or the user's "requirements" and “preferences”.* 

"""

In [None]:
# Few-shot example pair used by ask(): passed to the chat model as system
# messages named "example_user" and "example_assistant" to show the expected
# answer layout.
user_example_1= """

REQUIREMENTS: A job with Python. PREFERENCES: Data Engineer, Machine Learning """

assistant_example_1= """ 

These jobs meet your requirements, because they require someone fluent in Python:

Job's ID: 1
Job's ID: 2
Job's ID: 3
Job's ID: 4
Job's ID: 5

I found your Dream Job! It requires Python (requirement) and it is in Machine Learning (preferences):

Job's ID: 3

"""



In [None]:
def ask(
    #This query is your question, only parameter to fill in function
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096,
    print_messages: bool = True,
) -> str:
    """Answer a query using GPT and a dataframe of job texts and embeddings.

    Builds the prompt with ``query_message``, sends it to the chat model
    together with the system prompt, one few-shot example and a reminder, then
    prints token usage, an approximate cost and the relatedness ranking.

    Args:
        query: The user's requirements and preferences.
        df: Dataframe of jobs passed through to ``query_message`` and
            ``ids_ranked_by_relatedness``.
        model: Chat model to call; also selects the rate for the cost estimate.
        token_budget: Token cap handed to ``query_message``.
            NOTE(review): this only caps the message built by
            ``query_message``; the system prompt, example messages, reminder
            and the model's reply are not counted against it. Confirm 4096
            leaves enough room in the model's context window.
        print_messages: When true, print the full message list before sending.

    Returns:
        The text content of the first choice in the model's response.
    """
    message = query_message(query, df, model=model, token_budget=token_budget)
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message},
        {"role": "system", "name":"example_user", "content": user_example_1},
        {"role": "system", "name": "example_assistant", "content": assistant_example_1},
        {"role": "user", "content": user_reminder}
    ]
    if print_messages:
        print(messages)
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )
    response_message = response["choices"][0]["message"]["content"]

    # Tokens
    total_tokens = response['usage']['total_tokens']
    print(f"\nTOTAL TOKENS USED:{total_tokens}\n")

    # Approximate cost, keyed on the model actually used for this call (the
    # `model` argument), not the notebook-wide GPT_MODEL default.
    usd_per_1k_tokens = {"gpt-4": 0.045, "gpt-3.5-turbo": 0.002}.get(model)
    if usd_per_1k_tokens is not None:
        approximate_cost = round((total_tokens / 1000) * usd_per_1k_tokens, 3)
        print("APPROXIMATE COST FOR QUERY:", f"${approximate_cost} USD")

    # Relatednesses. query_message does not expose the ranking it computed, so
    # this re-runs it, which issues a second embedding request for the query.
    ids, relatednesses = ids_ranked_by_relatedness(query=query, df=df)
    print("\nTHE IDs ARE RANKED BY RELEVANCE:\n")
    for job_id, relatedness in zip(ids, relatednesses):
        print(f"ID: {job_id} has the following {relatedness=:.3f}")
    return response_message

In [None]:
# Run the whole pipeline for one sample request and print the model's answer.
# This makes live OpenAI API calls (query embedding and chat completion).
print(ask('REQUIREMENTS: Python Remote Worldwide or MX (Mexico). PREFERENCES: Data Engineer or Machine Learning'))

In [None]:
#print(ask('REQUIREMENTS: Python developer. PREFERENCES: Machine Learning or Data Engineer'))