In [540]:
import csv
import json

import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm

In [541]:
# Source records: one dict per CSV row, keyed by column name.
df = pd.read_csv('../data/Cleaned_dataset.csv')
documents = df.to_dict(orient='records')

# Disabled one-off preparation, kept for reference: it renames the 'Unnamed: 0'
# column to 'id' and rewrites the CSV. Note that it refers to `data`, which is
# not defined in the cells shown here.
# data.rename(columns={'Unnamed: 0': 'id'}, inplace=True)
# data.to_csv('../data/Cleaned_dataset.csv')

# Client used by every chat-completion call below.
client = OpenAI()

In [542]:
# Prompt sent to the model for each record. The single-brace {placeholders} are
# filled by str.format from a record dict; the doubled braces around the JSON
# example are format escapes that render as literal { and }.
prompt_template = """
You are a user of an SQL Interview Preparation Assistant application.
Based on the provided question and answer, formulate 5 specific questions that a user might ask regarding SQL interviews.
Ensure that the questions are comprehensive and relevant to the given question and answer.
Avoid using too many words from the original records.

The record:

question: {question}
answer: {answer}
category: {category}
difficulty_level: {difficulty_level}
tags: {tags}
example_query: {example_query}
explanation: {explanation}
common_mistakes: {common_mistakes}
related_questions: {related_questions}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

# Sample prompt rendered from the first record only; used for the manual check below.
prompt = prompt_template.format(**documents[0])

def llm(prompt):
    """Send `prompt` as a single user message to gpt-4o-mini and return the reply text.

    Args:
        prompt: Text used as the "content" of the one user message.

    Returns:
        The `message.content` of the first choice in the completion.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [543]:
# One sample call with the first record's prompt; `questions` holds the raw reply text (not yet parsed).
questions = llm(prompt)

In [544]:
# Check that the sample reply parses as JSON; the parsed value is displayed as the cell output.
json.loads(questions)

{'questions': ['What is the purpose of using a LEFT JOIN in SQL queries?',
  'How does the COALESCE function work in SQL, and when should it be used?',
  'What common errors can occur when trying to aggregate data from multiple tables?',
  'How do you ensure that all records from one table are included in the results when performing a JOIN?',
  'Can you explain the significance of sorting results in SQL and how it affects data analysis?']}

In [545]:
def generate_questions(doc):
    """Ask gpt-4o-mini to generate evaluation questions for one record.

    Args:
        doc: Mapping whose keys fill the placeholders of `prompt_template`
            (question, answer, category, ...). Extra keys are ignored by
            `str.format`.

    Returns:
        The raw reply text (`message.content` of the first choice), which the
        caller parses with `json.loads`.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: the caller feeds the reply straight into json.loads, so a
        # reply wrapped in code fences or extra prose would abort the whole run.
        # This mode requires the prompt to mention JSON, which the template does.
        response_format={"type": "json_object"},
    )

    return response.choices[0].message.content

In [546]:
# Generate questions for every record and store them as (id, question) rows.
output_path = '../data/retrieval_evaluate_dataset.csv'

# newline='' hands line-ending control to the csv module; explicit UTF-8 keeps
# the file independent of the platform's default encoding.
with open(output_path, 'w', newline='', encoding='utf-8') as f:
    # csv.writer quotes fields that contain commas, quotes or newlines. The
    # generated questions regularly contain commas, so writing rows with a plain
    # f-string produced lines with a variable number of fields.
    # lineterminator='\n' keeps the same line endings the file had before.
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(['id', 'question'])

    for doc in tqdm(documents):
        doc_id = doc['id']

        questions_raw = generate_questions(doc)
        # Use a separate name so the notebook-level `questions` (the raw sample
        # reply string from the earlier cell) is not replaced by a dict.
        generated = json.loads(questions_raw)

        writer.writerows([doc_id, q] for q in generated['questions'])



  0%|          | 0/601 [00:00<?, ?it/s]

In [564]:
# Reload the generated evaluation set and inspect its shape, columns and first rows.
# NOTE(review): the generation cell writes '../data/retrieval_evaluate_dataset.csv', but this
# cell reads a .parquet file; the step that produces the parquet file is not shown here — confirm.
dfr = pd.read_parquet('../data/retrieval_evaluate_dataset.parquet')  # leftover note: on_bad_lines='skip' is a pd.read_csv option, not used here
print(dfr.shape,dfr.columns)
print(dfr.head())

(3005, 2) Index(['id', 'question'], dtype='object')
   id                                           question
0   0  How do I write a SQL query to display all prod...
1   0  What is the purpose of using COALESCE in SQL q...
2   0  Can you explain the difference between INNER J...
3   0  How can I sort the results of my SQL query in ...
4   0  What are common mistakes when aggregating data...
