In [17]:
import os
import openai
import pandas as pd
import tiktoken  # for counting tokens
import ast  # for converting embeddings saved as strings back to arrays
from scipy import spatial
import pretty_errors
from preprocess import individual_preprocess
from displayfunction import display
from dotenv import load_dotenv

In [18]:

# Pull secrets and paths from the local .env file into the process environment.
load_dotenv('.env')
openai.api_key = os.getenv("OPENAI_API_KEY")
SAVE_PATH = os.getenv("SAVE_PATH")
if not SAVE_PATH:
    # Fail here with a clear message instead of a cryptic TypeError (or a
    # lookup at the filesystem root) when the path is built below.
    raise RuntimeError(
        "SAVE_PATH is not set; define it in .env or in the environment."
    )

# models
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"
#GPT_MODEL = "gpt-4"

# Load the embedded file
embeddings_path = os.path.join(SAVE_PATH, "jobs_test5.csv")

df = pd.read_csv(embeddings_path)

# convert embeddings from CSV str type back to list type
df['embedding'] = df['embedding'].apply(ast.literal_eval)

"""
## 2. Search

Now we'll define a search function that:
- Takes a user query and a dataframe with id & embedding columns
- Embeds the user query with the OpenAI API
- Uses distance between the query embedding and each job embedding to rank the jobs
- Returns two sequences:
    - The top N job IDs, ranked by relevance
    - Their corresponding relevance scores

"""

In [19]:

# search function
def ids_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    # Raise this to get more jobs back
    top_n: int = 20
) -> tuple[tuple[str, ...], tuple[float, ...]]:
    """Rank the rows of ``df`` by how related their embedding is to ``query``.

    The query is passed through ``individual_preprocess``, echoed to stdout,
    embedded with ``EMBEDDING_MODEL`` via the OpenAI API, and then scored
    against every value of the ``embedding`` column with ``relatedness_fn``.

    Args:
        query: Free-text description of what the user is looking for.
        df: Frame providing an ``id`` column and an ``embedding`` column.
        relatedness_fn: Callable ``(query_embedding, row_embedding) -> score``;
            higher means more related. Defaults to ``1 - cosine distance``.
        top_n: Maximum number of results to return.

    Returns:
        A pair ``(ids, relatednesses)`` of equally long tuples, ordered from
        most to least related. Both are empty when there is nothing to rank.
    """
    query_preprocess = individual_preprocess(query)
    print(query_preprocess)
    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query_preprocess,
    )
    # This is the query (e.g., Data Engineer) every row is compared against.
    query_embedding = query_embedding_response["data"][0]["embedding"]

    # Walk the two columns directly; iterrows() would build a Series per row.
    scored = [
        (job_id, relatedness_fn(query_embedding, embedding))
        for job_id, embedding in zip(df["id"], df["embedding"])
    ]
    # sorted() is stable, so ties keep their dataframe order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)[:top_n]

    if not ranked:
        # zip(*[]) yields nothing to unpack, so handle the empty case explicitly.
        return (), ()

    ids, relatednesses = zip(*ranked)
    return ids, relatednesses


In [20]:
# Sample request used to sanity-check the ranking on its own.
query = (
    "Which jobs are good for a Python developer with three years of experience? "
    "I would like something related with Machine Learning or Data Engineering"
)

# Prints the (ids, relatednesses) pair returned by the search.
print(ids_ranked_by_relatedness(query, df))

job good python developer three year experience would like something related machine learning data engineering
(('2rPkshzK', 's9Qra2aO', 'lnstGfwL', 'QjJFvjn6', 'OFkXaI5H', '7pzvYS6y', '3qfbh5tM', 'gZaZ7bVt', '6A7XWtEj', 'DTwBPWYC', 'odXP3QLQ', 'wmvenYiI', 'GNl5gHGV', '6hlGa5dz', 'EPHJArCf', 'hHycd9Hz', 'Mb0O4TVn', 'nVHCI9OO', 'Y1y3y2fY', '0uvNXfS0'), (0.8447327505625101, 0.8384938610487017, 0.8363065683593817, 0.8356716883209322, 0.832334516584875, 0.8321505784975677, 0.8234213614182629, 0.8208861703149908, 0.8185780231693931, 0.8183886419248051, 0.8183886419248051, 0.8183886419248051, 0.813835545267674, 0.812246906023451, 0.8114809437229167, 0.810214728049277, 0.8091536896949553, 0.8091536896949553, 0.8091536896949553, 0.8091536896949553))


In [7]:
#tiktoken function -> to count tokens
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens `text` encodes to under the tiktoken encoding for `model`."""
    tokens = tiktoken.encoding_for_model(model).encode(text)
    return len(tokens)

In [None]:
# Preamble text asking the model to review the ranked job IDs and pick 5-10.
# NOTE(review): nothing in the visible code reads this module-level name;
# `query_message` assigns its own local `introduction` with different wording.
# Confirm whether this constant is still needed.
introduction = """

To assist you, here are job IDs ranked by relatedness to the users specifications.
Review these jobs and identify 5-10 that may include the user's preferences and must meet their requirements.

"""

In [8]:
def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the user message for GPT: an instruction, job IDs, then the query.

    Job IDs come from ``ids_ranked_by_relatedness`` (most related first) and
    are appended one at a time until adding another would push the whole
    message over ``token_budget``.

    Args:
        query: The user's skills / request; it is also used to rank the jobs.
        df: Frame of jobs, forwarded to ``ids_ranked_by_relatedness``.
        model: Model name used by ``num_tokens`` to pick the tokenizer.
        token_budget: Maximum token count allowed for the returned message.

    Returns:
        The instruction text, zero or more job-ID sections, and finally the
        "Skills of the user" section.
    """
    # Only the IDs go into the prompt; the scores are not needed here.
    ids, _ = ids_ranked_by_relatedness(query, df)
    introduction = 'These are all the IDs of all the available jobs. Use them to find the jobs that match the skills of the user. If you cannot find a single job that might match then say "I could not find a suitable job."'
    # Was "\n\Skills": "\S" is not an escape sequence, so a literal backslash
    # ended up in the prompt. A blank line before the section was intended.
    question = f"\n\nSkills of the user: {query}"
    message = introduction
    for job_id in ids:
        next_id = f'\n\nJob\'s ID:\n"""\n{job_id}\n"""'
        # Count the trailing question too, so the final message stays in budget.
        if num_tokens(message + next_id + question, model=model) > token_budget:
            break
        message += next_id
    return message + question


In [None]:
# System-role message sent first in every chat request built by `ask`.
# NOTE(review): the fallback sentence here ("I am sorry, I do not have your
# dream job just yet.") differs from the one in `query_message`'s instruction
# text ("I could not find a suitable job.") -- confirm which one is intended.
system_prompt="""

You are DreamJobAI, an expert in job searching. 
Users will share their dream job “preferences” and “requirements”. 
Your goal is to find jobs in your database that closely match their specifications. 
Jobs you suggest may not have all “preferences” but must meet all “requirements”. 
If no job meets all “requirements”, respond with "I am sorry, I do not have your dream job just yet."
"""


In [None]:
# Output-format instruction; `ask` sends it as an assistant-role message placed
# after the user message.
# NOTE(review): the table name `personal` and the IDs in the example are
# presumably illustrative -- verify against the real schema.
assistant_prompt= """
Your output must be a PostgreSQL query. For example: 

SELECT * FROM personal WHERE ID IN ('vcRRzC7K', '8JNjW9zL', '3KmzR6pF', '5GmzR6pF', '9JNjW9zL', '2KmzR6pF', '6GmzR6pF')

"""

In [9]:
def ask(
    # This query is your question; it is the only argument without a default
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096,
    print_message: bool = False,
) -> str:
    """Answer a query using GPT and a dataframe of job IDs and embeddings.

    Builds the user message with ``query_message``, sends it together with
    ``system_prompt`` and ``assistant_prompt`` to the chat completion API,
    prints the total token usage and the ranked IDs with their relatedness
    scores, and returns the model's reply.

    Args:
        query: The user's question / skills description.
        df: Frame of jobs; defaults to the module-level ``df`` as it was when
            this function was defined.
        model: Chat model to call; also selects the tokenizer for budgeting.
        token_budget: Token limit applied to the user message only.
        print_message: When true, print the user message before sending it.

    Returns:
        The text content of the first choice in the API response.
    """
    message = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(message)
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message},
        # The output-format instructions go last, as an assistant-role message.
        {"role": "assistant", "content": assistant_prompt}
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0.2
    )
    response_message = response["choices"][0]["message"]["content"]

    # Token usage (the stray quote characters inside the old literal are gone).
    print("\nTOTAL TOKENS USED:\n", response['usage']['total_tokens'])

    # NOTE(review): query_message already ranked the IDs for this query, so
    # this call embeds the same query a second time.
    ids, relatednesses = ids_ranked_by_relatedness(query=query, df=df)
    print("\nTHE IDs ARE RANKED BY RELEVANCE:\n")
    for job_id, relatedness in zip(ids, relatednesses):
        print(f"ID: {job_id} has the following {relatedness=:.3f}")
    return response_message

In [None]:
# End-to-end run: a free-form question from a Python developer.
print(
    ask(
        "Which jobs are good for a Python developer with three years of experience? "
        "I would like something related with Machine Learning or Data Engineering"
    )
)

In [None]:
# End-to-end run: a skills list with desired positions (mixed English/Spanish input).
print(
    ask(
        "three years of professional experience. "
        "Fluent in Python, SQL Server, PostgreSQL, MySQL, Tableau, Git, Github y PowerBI. "
        "Puesto deseado: Software Engineer, Data Scientist, Data Engineer, Data Analyst"
    )
)