In [199]:
# Import necessary libraries
import json
import pandas as pd
import openai
import elasticsearch
from groq import Groq
from dotenv import load_dotenv
import os
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm

In [200]:
# Load environment variables
load_dotenv()

True

In [201]:
# Setup project paths
base_folder = 'D:/Projects/AI-Restaurent-Chat-bot/'
input_data_folder = base_folder + 'input_data/'


In [202]:
# Get the current working directory (optional step)
os.getcwd()

'd:\\Projects\\AI-Restaurent-Chat-bot\\notebooks'

In [203]:
# Setup the OpenAI client to use either Groq, OpenAI.com, or Ollama API
load_dotenv(override=True)
API_HOST = os.getenv("API_HOST")
API_HOST

'groq'

In [204]:
if API_HOST == "groq":
    client = client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)
    MODEL_NAME = os.getenv("GROQ_MODEL")

elif API_HOST == "ollama":
    client = openai.OpenAI(
        base_url=os.getenv("OLLAMA_ENDPOINT"),
        api_key="nokeyneeded",
    )
    MODEL_NAME = os.getenv("OLLAMA_MODEL")

elif API_HOST == "github":
    client = openai.OpenAI(base_url="https://models.inference.ai.azure.com", api_key=os.getenv("GITHUB_TOKEN"))
    MODEL_NAME = os.getenv("GITHUB_MODEL")

else:
    client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    MODEL_NAME = os.getenv("OPENAI_MODEL")

In [205]:
client

<groq.Groq at 0x1d3743147a0>

In [206]:
MODEL_NAME

'llama3-8b-8192'

In [207]:
with open(input_data_folder + 'food_user_qa_dataset.json', 'rt') as f_in:
    data = json.load(f_in)

In [208]:
documents = []
for dish in data['dishes']:
    dish_name = dish['dish name']
    for doc in dish['documents']:
        doc['dish_name'] = dish_name  # Add dish_name to each document
        documents.append(doc)

In [209]:
documents[1:3]

[{'id': '142_2',
  'question': 'How many calories does it have?',
  'section': 'calories',
  'text': 'The almond fudge banana cake has 224.8 calories.',
  'dish_name': 'almond fudge banana cake'},
 {'id': '142_3',
  'question': 'How much total fat does it contain?',
  'section': 'nutritional',
  'text': 'The almond fudge banana cake has 14 grams of total fat (PDV).',
  'dish_name': 'almond fudge banana cake'}]

In [210]:
# The prompt template for generating questions
prompt_template = """
You are emulating a user who's ordering food from a restaurant.
Based on the following information about the dish, generate 5 questions a customer might ask which have similar meanings.
Use as few words from the dish description as possible while making the questions which have similar meanings.

Make sure to include the dish name in the questions.

Don't start with "Here are the 5 questions a customer might ask: or anything similar. Just give straight away questions.
section: {section}
question: {question} + {dish_name}
answer: {text}

Provide the output in JSON format:

["question1", "question2", "question3", "question4", "question5"]

""".strip()

In [211]:
# Function to generate questions based on the document
def generate_questions(doc):
    prompt = prompt_template.format(**doc)
    
    try:
        # Call Groq API to generate the response
        response = client.chat.completions.create(
            model=MODEL_NAME,  # Use 'llama3-8b-8192'
            messages=[{"role": "user", "content": prompt}]
        )
        
        # Print the response to debug
        print("Response from API:", response)
        
        # Extract the content of the response
        content = response.choices[0].message.content
        
        # Use regex to extract the list items between square brackets
        match = re.search(r'\[(.*?)\]', content, re.DOTALL)
        if match:
            # Split the content between commas and clean up the formatting
            generated_questions = [item.strip().strip('"') for item in match.group(1).split(',')]
        else:
            print(f"No questions found for doc {doc['id']}")
            generated_questions = []
    
    except Exception as e:
        print(f"Error during API call for doc {doc['id']}: {e}")
        generated_questions = []
    
    return generated_questions


In [212]:
sample_documents = documents [1:5]
documents # Modify the number of samples as needed


[{'id': '142_1',
  'question': 'What are the ingredients?',
  'section': 'ingredients',
  'text': "The almond fudge banana cake contains the following ingredients: ['dole banana', 'sugar', 'margarine', 'eggs', 'amaretto liqueur', 'vanilla extract', 'all-purpose flour', 'unsweetened cocoa powder', 'baking soda', 'salt', 'dole almond']",
  'dish_name': 'almond fudge banana cake'},
 {'id': '142_2',
  'question': 'How many calories does it have?',
  'section': 'calories',
  'text': 'The almond fudge banana cake has 224.8 calories.',
  'dish_name': 'almond fudge banana cake'},
 {'id': '142_3',
  'question': 'How much total fat does it contain?',
  'section': 'nutritional',
  'text': 'The almond fudge banana cake has 14 grams of total fat (PDV).',
  'dish_name': 'almond fudge banana cake'},
 {'id': '142_4',
  'question': 'What is the sugar content?',
  'section': 'nutritional',
  'text': 'The almond fudge banana cake has 87 grams of sugar (PDV).',
  'dish_name': 'almond fudge banana cake'},


In [213]:
# Store the results
results = {}
sample_documents = documents  # Adjust the number of samples as needed

In [214]:
import re

# Loop through each document and generate questions
for doc in tqdm(sample_documents):
    doc_id = doc['id']
    if doc_id in results:
        continue
    
    # Generate questions
    questions = generate_questions(doc)
    results[doc_id] = questions
    
    # Add a delay to avoid rate limits
    time.sleep(1)  # Wait 2 seconds before making the next API call

# Check the results
print(results)


  0%|          | 0/3 [00:00<?, ?it/s]

Response from API: ChatCompletion(id='chatcmpl-215d03a7-7d3a-4bb8-b3c0-8d0b721f3e32', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='[\n"What\'s the calorie count of the almond fudge banana cake?",\n"How many calories do I get in an almond fudge banana cake?",\n"Is the almond fudge banana cake high in calories?",\n"Can you tell me the calorie total for the almond fudge banana cake?",\n"What\'s the almond fudge banana cake\'s calorie value?"\n]', role='assistant', function_call=None, tool_calls=None))], created=1725645346, model='llama3-8b-8192', object='chat.completion', system_fingerprint='fp_6a6771ae9c', usage=CompletionUsage(completion_tokens=70, prompt_tokens=164, total_tokens=234, completion_time=0.058333333, prompt_time=0.020221281, queue_time=0.0018132059999999978, total_time=0.078554614), x_groq={'id': 'req_01j747mngkf7fsm3fc47z1bja7'})
Response from API: ChatCompletion(id='chatcmpl-fe60471e-4bd7-4490-bd72-0ddbfd84205d', cho

In [215]:
results

{'142_2': ["What's the calorie count of the almond fudge banana cake?",
  'How many calories do I get in an almond fudge banana cake?',
  'Is the almond fudge banana cake high in calories?',
  'Can you tell me the calorie total for the almond fudge banana cake?',
  "What's the almond fudge banana cake's calorie value?"],
 '142_3': ["What's the total fat in the Almond Fudge Banana Cake?",
  'How much saturated fat does the Almond Fudge Banana Cake have?',
  "What's the calorie contribution from fat in the Almond Fudge Banana Cake?",
  'Can I ask for the fat breakdown in the Almond Fudge Banana Cake?',
  "How much of the Almond Fudge Banana Cake's calories come from fat?"],
 '142_4': ['How much sugar is in the almond fudge banana cake?',
  'Can I know the amount of sugar in almond fudge banana cake?',
  "What's the sugar level in almond fudge banana cake?",
  'Is the sugar content of almond fudge banana cake high?',
  'How many grams of sugar are in almond fudge banana cake?']}

In [216]:
doc_index = {d['id']: d for d in documents}


In [217]:
# Define the course and document IDs
course_name = 'food-gold-data'
dish_name = doc_index[doc_id]['dish_name']

# Prepare data for writing to CSV
csv_data = []
for doc_id, questions in results.items():
    for question in questions:
        csv_data.append([question, dish_name, doc_id])


In [218]:
ground_df = pd.DataFrame(csv_data, columns=['question', 'course', 'document'])

In [219]:
ground_df.head()

Unnamed: 0,question,course,document
0,What's the calorie count of the almond fudge b...,almond fudge banana cake,142_2
1,How many calories do I get in an almond fudge ...,almond fudge banana cake,142_2
2,Is the almond fudge banana cake high in calories?,almond fudge banana cake,142_2
3,Can you tell me the calorie total for the almo...,almond fudge banana cake,142_2
4,What's the almond fudge banana cake's calorie ...,almond fudge banana cake,142_2


In [220]:
# Write the CSV file
ground_df.to_csv('ground-truth-data.csv', index=False)

PermissionError: [Errno 13] Permission denied: 'ground-truth-data.csv'