In [25]:
import openai
from groq import Groq
import pandas as pd
from dotenv import load_dotenv
import os


In [26]:
# change path for base folder
base_folder = 'D:/Projects/AI-Restaurent-Chat-bot/'
input_data_folder = base_folder+'input_data/'

In [27]:
# Setup the OpenAI client to use either Groq, OpenAI.com, or Ollama API
load_dotenv(override=True)
API_HOST = os.getenv("API_HOST")
API_HOST

'groq'

In [28]:
if API_HOST == "groq":
    client = Groq(api_key=os.getenv('GROQ_API_KEY'))
    MODEL_NAME = os.getenv("GROQ_MODEL")

elif API_HOST == "ollama":
    client = openai.OpenAI(
        base_url=os.getenv("OLLAMA_ENDPOINT"),
        api_key="nokeyneeded",
    )
    MODEL_NAME = os.getenv("OLLAMA_MODEL")

elif API_HOST == "github":
    client = openai.OpenAI(base_url="https://models.inference.ai.azure.com", api_key=os.getenv("GITHUB_TOKEN"))
    MODEL_NAME = os.getenv("GITHUB_MODEL")

else:
    client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    MODEL_NAME = os.getenv("OPENAI_MODEL")

In [29]:
df = pd.read_csv(input_data_folder+"cakes_data.csv")

In [30]:
df.columns

Index(['id', 'name', 'minutes', 'tags', 'n_steps', 'steps', 'ingredients',
       'n_ingredients', 'user_id', 'rating', 'review', 'rating_category',
       'calories', 'total fat (PDV)', 'sugar (PDV)', 'sodium (PDV)',
       'protein (PDV)', 'saturated fat (PDV)', 'carbohydrates (PDV)',
       'calorie_status', 'price'],
      dtype='object')

In [31]:
df.head(1)

Unnamed: 0,id,name,minutes,tags,n_steps,steps,ingredients,n_ingredients,user_id,rating,...,rating_category,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV),calorie_status,price
0,142,almond fudge banana cake,110,"['weeknight', 'time-to-make', 'course', 'prepa...",13,"['mash bananas and set aside', 'beat sugar and...","['dole banana', 'sugar', 'margarine', 'eggs', ...",11,914114,4,...,tasty,224.8,14,87,10,7,9,11,Low Calory,24


In [32]:

# Step 1: Clean 'tags', 'ingredients', and 'steps' columns by removing unwanted characters
df['tags'] = df['tags'].str.replace("[", "").replace("]", "").replace("'", "")
df['ingredients'] = df['ingredients'].str.replace("[", "").replace("]", "").replace("'", "")
df['steps'] = df['steps'].str.replace("[", "").replace("]", "").replace("'", "")


In [33]:
# Step 2: Handle missing values (fill with "unknown" for categorical or median for numerical)
df['calories'].fillna(df['calories'].median(), inplace=True)
df['price'].fillna(df['price'].median(), inplace=True)
df.fillna("unknown", inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['calories'].fillna(df['calories'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['price'].fillna(df['price'].median(), inplace=True)


In [34]:
# Step 3: Optional - Drop duplicates if there are any
df.drop_duplicates(inplace=True)

In [35]:
df.head()

Unnamed: 0,id,name,minutes,tags,n_steps,steps,ingredients,n_ingredients,user_id,rating,...,rating_category,calories,total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV),calorie_status,price
0,142,almond fudge banana cake,110,"'weeknight', 'time-to-make', 'course', 'prepar...",13,"'mash bananas and set aside', 'beat sugar and ...","'dole banana', 'sugar', 'margarine', 'eggs', '...",11,914114,4,...,tasty,224.8,14,87,10,7,9,11,Low Calory,24
1,520,apple pie cake,40,"'60-minutes-or-less', 'time-to-make', 'course'...",4,"'cream butter , sugar and the egg', 'mix toget...","'sugar', 'egg', 'salt', 'butter', 'flour', 'ba...",12,22504,3,...,so so,359.1,26,136,18,7,40,16,Low Calory,20
2,822,heloise s cake mix cookies,8,"'15-minutes-or-less', 'time-to-make', 'course'...",4,"'mix all ingredients well to remove lumps', 's...","'cake mix', 'vegetable oil', 'eggs']",3,36327,5,...,tasty,137.4,11,46,6,2,5,5,Low Calory,44
3,916,momma s fair funnel cake,20,"'30-minutes-or-less', 'time-to-make', 'course'...",9,'beat eggs and sugar together and then add the...,"'eggs', 'sugar', 'milk', 'flour', 'salt', 'bak...",7,13640,5,...,tasty,398.9,9,34,16,27,13,23,Low Calory,31
4,1633,mc donald s pancakes,20,"'30-minutes-or-less', 'time-to-make', 'course'...",4,"'put all ingredients into a blender', 'blend o...","'carbonated lemon-lime beverage', 'egg', 'suga...",5,60908,5,...,tasty,225.6,14,39,16,8,10,10,Low Calory,31


In [46]:
def generate_questions(row):
    prompt = f"""
    Based on the following dish information:
    
    Name: {row['name']}
    Ingredients: {row['ingredients']}
    Price: {row['price']}
    Review: {row['review']}
    Rating: {row['rating']}
    Calories:{row['calories']}
    
    Generate 5 user questions that someone might ask when ordering food from a menu.
    """
    
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "system", "content": "You are a helpful assistant."},
                  {"role": "user", "content": prompt}],
        
    )
    
    return response['choices'][0]['message']['content']



In [47]:
# Loop through each row in the dataframe and generate questions
for index, row in df.iterrows():
    questions = generate_questions(row)
    generated_questions_list.append(questions)

TypeError: 'ChatCompletion' object is not subscriptable