In [2]:
import asyncio
import os
from openai import AsyncOpenAI
from jinja2 import Template
import pandas as pd
import json
from copy import deepcopy

In [3]:
from dotenv import load_dotenv
# Let python-dotenv populate the environment before the key is looked up.
load_dotenv()

# Resolves to None when OPENAI_API_KEY is not set in the environment.
openai_api_key = os.environ.get("OPENAI_API_KEY")

## Loading the test data

In [4]:
# Read the comparison records. JSON text is UTF-8, so decode it explicitly
# instead of depending on the platform's default encoding.
with open('first_comparison.json', encoding="utf-8") as f:
    comparison_data = json.load(f)

In [7]:
# Containers for the records flagged as problematic and for their query strings.
problematic_data, problematic_queries = [], []

In [8]:
# Keep only the records whose "problem" field is truthy, plus their query text.
# Both lists are rebuilt from scratch so that re-running this cell yields the
# same result instead of appending the same records a second time.
problematic_data = [item for item in comparison_data if item.get("problem")]
problematic_queries = [item["query"] for item in problematic_data]

In [10]:
# Bare last expression: the notebook renders the list of flagged queries.
problematic_queries

['Locate human resources specialists who exhibit expertise in talent acquisition, employee engagement strategies, performance management, and HRIS system functionality. Candidates should be positioned within the dynamic environment of New York City, New York, while consciously avoiding any associations with the hospitality sector.',
 'marketplaces with revenue of USD 50m+, executive level role, based in Nordic countries',
 'Please provide HR executives locate in the midwestern united states who have at least 15-years professional experience and have recruiting, talent acquisition or management in their background but who also possess HR generalist experience. They should be at a Senior Director level or above at a company exceeding $2 Billion in revenue',
 'GM or VP for grocery companies in finland, make sure they have relevant experience',
 'general managers with full end to end P&L experience of at least 200 million at companies in the mobility, assistive device, and rehab, ostomy ca

In [11]:
# System-role message for the augmentation chat: generate_queries_a sends it
# unchanged as the "system" content of every request.
DATA_AUGMENTATION_SYSTEM_PROMPT = """
You are an AI agent tasked with data augmentation for recruiter search queries. Your goal is to generate additional queries that are both similar to and diverse from a given sample query. This augmented data will be used to finetune a model, so it's crucial that the generated queries are distinct while still capturing the essence of the original query.
"""
# User-role message template. generate_queries_a renders it with jinja2,
# substituting the sample query for the {{SAMPLE_QUERY}} placeholder; the single
# braces in the JSON example below are literal text, not template syntax.
# The prompt asks for a JSON object with a "similar" and a "diverse" list of
# five queries each, wrapped in <output> tags.
DATA_AUGMENTATION_USER_PROMPT = """
Here is the sample query you will be working with:
<sample_query>
{{SAMPLE_QUERY}}
</sample_query>

Your task is to generate two sets of queries based on this sample:

1. Similar Queries:
Generate 5 queries that are very similar to the sample query. These should:
- Maintain the same overall intent and use case
- Use similar wording and structure
- Include minor variations in specifics (e.g., slightly different job titles, locations, or requirements)
- Be distinct enough from each other and the original to provide valuable training data

2. Diverse Queries:
Generate 5 queries that are more diverse but still related to the sample query. These should:
- Explore different aspects or variations of the original query's intent
- Use different wording, structure, or focus
- Potentially broaden or narrow the scope of the search
- Introduce related concepts or requirements not present in the original query
- Remain relevant to executive or specialized hiring scenarios

Please provide your output in the following JSON format:
<output>
{
  "similar": [
    "query1",
    "query2",
    "query3",
    "query4",
    "query5"
  ],
  "diverse": [
    "query1",
    "query2",
    "query3",
    "query4",
    "query5"
  ]
}
</output>

Additional guidelines:
- Ensure that each generated query is unique and provides value for model training
- Maintain a professional tone appropriate for executive recruitment
- Avoid introducing biases or discriminatory language
- Keep the queries realistic and relevant to actual hiring scenarios
- Do not simply replicate the sample query with minor word changes

Remember, the goal is to create a rich, varied dataset that will improve the model's understanding and performance in handling recruiter search queries.
"""

In [12]:


async def generate_queries_a(query, temperature=0.7, model="gpt-4o", **kwargs):
    """
    Request augmented variants of one recruiter search query from an OpenAI chat model.

    The query is rendered into DATA_AUGMENTATION_USER_PROMPT and sent together
    with DATA_AUGMENTATION_SYSTEM_PROMPT as a two-message chat completion request.

    Args:
        query (string): sample query substituted for SAMPLE_QUERY in the user prompt.
        temperature (float, optional): Temperature of gpt for generations. Defaults to 0.7.
        model (str, optional): The model you want to use. Defaults to "gpt-4o".
        **kwargs: extra request parameters merged into the call; a key named
            "model", "messages" or "temperature" overrides the value built here.

    Returns:
        dict: the attribute dictionary of the completion response, in which
        "choices" is a list of dicts and each choice's "message" is a dict as
        well. Every other value is left exactly as the SDK returned it.
    """
    user_message = Template(DATA_AUGMENTATION_USER_PROMPT).render({"SAMPLE_QUERY": query})
    openai_object = {
        "model": model,
        "messages": [
            {"role": "system", "content": DATA_AUGMENTATION_SYSTEM_PROMPT},
            {"role": "user", "content": user_message},
        ],
        "temperature": temperature,
    }
    # Caller-supplied parameters are applied last so they take precedence.
    openai_object.update(kwargs)

    # A fresh client is built per call; the context manager closes it (and the
    # HTTP connections it holds) as soon as the request has completed, instead
    # of leaving one open client behind for every query processed.
    async with AsyncOpenAI(api_key=openai_api_key) as aclient:
        response = await aclient.chat.completions.create(**openai_object)

    # Expose the response, its choices and their messages as plain dicts so
    # callers can use key access on those three levels.
    response = response.__dict__
    response["choices"] = [choice.__dict__ for choice in response["choices"]]
    for choice in response["choices"]:
        choice["message"] = choice["message"].__dict__
    return response




In [22]:
# Start with an empty collection of augmentation results.
augmented_queries = list()


In [24]:
# Empty flat list that will hold the individual augmented queries.
list_of_augmented_queries = list()

In [23]:
# Load previously saved augmentation results. JSON text is UTF-8, so decode it
# explicitly rather than relying on the platform's default encoding.
with open("augmented_data_problem_queries.json", encoding="utf-8") as f:
    augmented_queries = json.load(f)

In [25]:
# Flatten the results into one list of queries. Each item is a mapping whose
# values are collections of queries (presumably the "similar" and "diverse"
# lists requested by the prompt; confirm against the saved file).
# The list is rebuilt from scratch so re-running this cell does not append the
# same queries a second time.
list_of_augmented_queries = [
    generated_query
    for item in augmented_queries
    for query_group in item.values()
    for generated_query in query_group
]

In [29]:
# Serialise first: opening the file in "w" mode truncates it immediately, so a
# serialisation error raised after that point would leave an empty file behind.
flattened_json = json.dumps(list_of_augmented_queries, indent=2)
with open("augmented_data_problem_queries.json", "w", encoding="utf-8") as f:
    f.write(flattened_json)
# NOTE(review): this is the same path that is read into augmented_queries; after
# this write it holds a flat list rather than a list of mappings, so the
# flattening cell cannot be re-run against it. Confirm the overwrite is intended.

In [15]:
for query in problematic_queries:
    response = await generate_queries_a(query)
    augmented_queries.append(response)

In [17]:
# Reset the flat list of augmented queries to empty.
list_of_augmented_queries = list()