In [221]:
# Import necessary libraries
import json
import pandas as pd
import openai
import elasticsearch
from groq import Groq
from dotenv import load_dotenv
import os
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm

In [222]:
# Load environment variables
load_dotenv()

True

In [223]:
# Setup project paths
base_folder = 'D:/Projects/AI-Restaurent-Chat-bot/'
input_data_folder = base_folder + 'input_data/'


In [224]:
# Get the current working directory (optional step)
os.getcwd()

'd:\\Projects\\AI-Restaurent-Chat-bot\\notebooks'

In [225]:
# Setup the OpenAI client to use either Groq, OpenAI.com, or Ollama API
load_dotenv(override=True)
API_HOST = os.getenv("API_HOST")
API_HOST

'groq'

In [226]:
if API_HOST == "groq":
    client = client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)
    MODEL_NAME = os.getenv("GROQ_MODEL")

elif API_HOST == "ollama":
    client = openai.OpenAI(
        base_url=os.getenv("OLLAMA_ENDPOINT"),
        api_key="nokeyneeded",
    )
    MODEL_NAME = os.getenv("OLLAMA_MODEL")

elif API_HOST == "github":
    client = openai.OpenAI(base_url="https://models.inference.ai.azure.com", api_key=os.getenv("GITHUB_TOKEN"))
    MODEL_NAME = os.getenv("GITHUB_MODEL")

else:
    client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    MODEL_NAME = os.getenv("OPENAI_MODEL")

In [227]:
client

<groq.Groq at 0x1d374321580>

In [228]:
MODEL_NAME

'llama3-8b-8192'

In [229]:
with open(input_data_folder + 'food_user_qa_dataset.json', 'rt') as f_in:
    data = json.load(f_in)

In [230]:
documents = []
for dish in data['dishes']:
    dish_name = dish['dish name']
    for doc in dish['documents']:
        doc['dish_name'] = dish_name  # Add dish_name to each document
        documents.append(doc)

In [231]:
documents[1:3]

[{'id': '142_2',
  'question': 'How many calories does it have?',
  'section': 'calories',
  'text': 'The almond fudge banana cake has 224.8 calories.',
  'dish_name': 'almond fudge banana cake'},
 {'id': '142_3',
  'question': 'How much total fat does it contain?',
  'section': 'nutritional',
  'text': 'The almond fudge banana cake has 14 grams of total fat (PDV).',
  'dish_name': 'almond fudge banana cake'}]

In [232]:
# The prompt template for generating questions
prompt_template = """
You are emulating a user who's ordering food from a restaurant.
Based on the following information about the dish, generate 5 questions a customer might ask which have similar meanings.
Use as few words from the dish description as possible while making the questions which have similar meanings.

Make sure to include the dish name in the questions.

Don't start with "Here are the 5 questions a customer might ask: or anything similar. Just give straight away questions.
section: {section}
question: {question} + {dish_name}
answer: {text}

Provide the output in JSON format:

["question1", "question2", "question3", "question4", "question5"]

""".strip()

In [233]:
# Function to generate questions based on the document
def generate_questions(doc):
    prompt = prompt_template.format(**doc)
    
    try:
        # Call Groq API to generate the response
        response = client.chat.completions.create(
            model=MODEL_NAME,  # Use 'llama3-8b-8192'
            messages=[{"role": "user", "content": prompt}]
        )
        
        # Print the response to debug
        print("Response from API:", response)
        
        # Extract the content of the response
        content = response.choices[0].message.content
        
        # Use regex to extract the list items between square brackets
        match = re.search(r'\[(.*?)\]', content, re.DOTALL)
        if match:
            # Split the content between commas and clean up the formatting
            generated_questions = [item.strip().strip('"') for item in match.group(1).split(',')]
        else:
            print(f"No questions found for doc {doc['id']}")
            generated_questions = []
    
    except Exception as e:
        print(f"Error during API call for doc {doc['id']}: {e}")
        generated_questions = []
    
    return generated_questions


In [234]:
sample_documents = documents [1:5]
documents # Modify the number of samples as needed


[{'id': '142_1',
  'question': 'What are the ingredients?',
  'section': 'ingredients',
  'text': "The almond fudge banana cake contains the following ingredients: ['dole banana', 'sugar', 'margarine', 'eggs', 'amaretto liqueur', 'vanilla extract', 'all-purpose flour', 'unsweetened cocoa powder', 'baking soda', 'salt', 'dole almond']",
  'dish_name': 'almond fudge banana cake'},
 {'id': '142_2',
  'question': 'How many calories does it have?',
  'section': 'calories',
  'text': 'The almond fudge banana cake has 224.8 calories.',
  'dish_name': 'almond fudge banana cake'},
 {'id': '142_3',
  'question': 'How much total fat does it contain?',
  'section': 'nutritional',
  'text': 'The almond fudge banana cake has 14 grams of total fat (PDV).',
  'dish_name': 'almond fudge banana cake'},
 {'id': '142_4',
  'question': 'What is the sugar content?',
  'section': 'nutritional',
  'text': 'The almond fudge banana cake has 87 grams of sugar (PDV).',
  'dish_name': 'almond fudge banana cake'},


In [235]:
# Store the results
results = {}
sample_documents = documents  # Adjust the number of samples as needed

In [236]:
import re

# Loop through each document and generate questions
for doc in tqdm(sample_documents):
    doc_id = doc['id']
    if doc_id in results:
        continue
    
    # Generate questions
    questions = generate_questions(doc)
    results[doc_id] = questions
    
    # Add a delay to avoid rate limits
    time.sleep(1)  # Wait 2 seconds before making the next API call

# Check the results
print(results)


  0%|          | 0/3584 [00:00<?, ?it/s]

Response from API: ChatCompletion(id='chatcmpl-5212f97e-5be5-48e1-a07e-9924386a0d20', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='["Is almond fudge banana cake made with real bananas?", "Does almond fudge banana cake contain dairy?", "Can I be sure there are no artificial flavors in almond fudge banana cake?", "Are the ingredients in almond fudge banana cake free from liqueurs?", "May I know the types of flour and sugar used in almond fudge banana cake?"]', role='assistant', function_call=None, tool_calls=None))], created=1725645363, model='llama3-8b-8192', object='chat.completion', system_fingerprint='fp_179b0f92c9', usage=CompletionUsage(completion_tokens=74, prompt_tokens=215, total_tokens=289, completion_time=0.061666667, prompt_time=0.035935813, queue_time=0.0023956440000000023, total_time=0.09760248), x_groq={'id': 'req_01j747n6arfd9v5mzxkvnsvfzk'})
Response from API: ChatCompletion(id='chatcmpl-6ea9f6d9-f49f-4a96-8755-ea58

KeyboardInterrupt: 

In [237]:
len(results)

2560

In [238]:
doc_index = {d['id']: d for d in documents}


In [239]:
# Define the course and document IDs
course_name = 'food-gold-data'
dish_name = doc_index[doc_id]['dish_name']

# Prepare data for writing to CSV
csv_data = []
for doc_id, questions in results.items():
    for question in questions:
        csv_data.append([question, dish_name, doc_id])


In [240]:
ground_df = pd.DataFrame(csv_data, columns=['question', 'course', 'document'])

In [241]:
ground_df.head()

Unnamed: 0,question,course,document
0,Is almond fudge banana cake made with real ban...,spanish lemon cake,142_1
1,Does almond fudge banana cake contain dairy?,spanish lemon cake,142_1
2,Can I be sure there are no artificial flavors ...,spanish lemon cake,142_1
3,Are the ingredients in almond fudge banana cak...,spanish lemon cake,142_1
4,May I know the types of flour and sugar used i...,spanish lemon cake,142_1


In [220]:
# Write the CSV file
ground_df.to_csv('ground-truth-data.csv', index=False)

PermissionError: [Errno 13] Permission denied: 'ground-truth-data.csv'