In [1]:
# imports

import os
from dotenv import load_dotenv
from openai import OpenAI
import base64
from IPython.display import display, Image, Markdown
import pandas as pd
import json

In [2]:
# Load environment variables in a file called .env

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")

API key found and looks good so far!


## Perform OCR and transform to images

In [3]:
# Import the libraries
from pdf2image import convert_from_path
# Create a function to convert PDF to images and store in specific path
def pdf_to_images(pdf_path, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Convert pdf into images
    images = convert_from_path(pdf_path)
    image_paths = []

    # Save images and paths
    for i, image in enumerate(images):
        image_path = os.path.join(output_folder, f"page{i+1}.jpg")
        image.save(image_path, "JPEG")
        image_paths.append(image_path)
    return image_paths

In [4]:
pdf_path = "./resources/pdf/Things mother used to make.pdf"
output_folder = "./resources/output_images"
image_paths = pdf_to_images(pdf_path, output_folder)

In [4]:
# Set up connection to openai API

client = OpenAI(
    api_key=api_key
)
model = "gpt-4o-mini"

In [7]:
# Read and encode one image
image_path = "./resources/output_images/page23.jpg"
with open(image_path, "rb") as image_file:
    image_data = base64.b64encode(image_file.read()).decode('utf-8')

In [8]:
# Define the system prompt
system_prompt = """
Please analyze the content of this image and extract any related recipe information.
"""

In [9]:
# Call the OpenAI API use the chat completion method
response = client.chat.completions.create(
    model = model,
    messages = [
        # Provide the system prompt
        {"role": "system", "content": system_prompt},

        # The user message contains both the text and image URL / path
        {"role": "user", "content": [
            "This is the image from the recipe page.",
            {"type": "image_url",
             "image_url": {"url": f"data:image/jpeg;base64,{image_data}",
                           "detail": "low"}}
        ]}
    ]
)

In [38]:
# Define a function to get the gpt-response and display in markdown
def display_gpt_response(response):
    gpt_response = response.choices[0].message.content
    return display(Markdown(gpt_response))


In [11]:
display_gpt_response(response)

Here’s the recipe information based on the image you provided:

### Things Mother Used to Make
#### Breads

**Bannocks**
- **Ingredients:**
  - 1 Cupful of Thick Sour Milk
  - ½ Cupful of Sugar
  - 2 Cupfuls of Flour
  - ½ Cupful of Indian Meal
  - 1 Teaspoonful of Soda
  - A pinch of Salt

- **Instructions:**
  1. Make the mixture stiff enough to drop from a spoon.
  2. Drop mixture, size of a walnut, into boiling fat.
  3. Serve warm, with maple syrup.

---

**Boston Brown Bread**
- **Ingredients:**
  - 1 Cupful of Rye Meal
  - 1 Cupful of Graham Meal
  - 1 Cupful of Molasses
  - 1 Cupful of Flour
  - 1 Cupful of Sweet Milk
  - 1 Cupful of Sour Milk
  - ½ Teaspoonful of Soda
  - 1 Heaping Teaspoonful of Salt

- **Instructions:**
  1. Stir the meals and salt together.
  2. Beat the soda into the molasses until it foams; add sour milk, mix well, and pour into a tin cup which has been well greased.
  3. If you have none, use a brown-bread steamer. 

Feel free to ask if you need more information or additional recipes!

In [13]:
# Define improved system prompt
system_prompt2 = """
Please analyze the content of this image and extract any related recipe information into structure components.
Specifically, extra the recipe title, list of ingredients, step by step instructions, cuisine type, dish type, any relevant tags or metadata.
The output must be formatted in a way suited for embedding in a Retrieval Augmented Generation (RAG) system.
"""

In [14]:
# Call the api to extract the information
response = client.chat.completions.create(
    model = model,
    messages = [
        # Provide the system prompt
        {"role": "system", "content": system_prompt2},

         # The user message contains both the text and image URL / path
        {"role": "user", "content": [
            "This is the image from the recipe page",
            {"type": "image_url",
             "image_url": {"url": f"data:image/jpeg;base64,{image_data}",
                           "detail": "low"}}
        ]}
    ],
    temperature = 0, # Set the temperature to 0 for deterministic output
)

In [15]:
display_gpt_response(response)

Here’s the structured information extracted from the recipe image:

### Recipe Title
**Breads**

### Ingredients
#### Bannocks
- 1 Cupful of Thick Sour Milk
- 1 Cupful of Flour
- ½ Cupful of Indian Meal
- 1 Teaspoonful of Soda
- A pinch of Salt

#### Boston Brown Bread
- 1 Cupful of Rye Meal
- 1 Cupful of Sour Milk
- 1 Cupful of Graham Meal
- 1 Cupful of Molasses
- 1 Cupful of Flour
- ½ Teaspoonful of Salt
- 1 Heaping Teaspoonful of Soda
- 1 Cupful of Sweet Milk

### Instructions
1. For Bannocks: Make the mixture stiff enough to drop from a spoon. Drop mixture, size of a walnut, into boiling fat. Serve warm, with maple syrup.
2. For Boston Brown Bread: Stir the meals and salt together. Beat the soda into the molasses until it foams; add sour milk, mix well, and pour into a tin pan which has been well greased, if you have no brown-bread steamer.

### Cuisine Type
- American

### Dish Type
- Bread

### Tags/Metadata
- Quick Bread
- Traditional Recipe
- Breakfast Item

This format is suitable for embedding in a Retrieval Augmented Generation (RAG) system.

## Reading All Images in a Dataset

In [17]:
len(image_paths)

136

In [18]:
extracted_recipes = []

for image_path in image_paths:
    print(f"Processing image {image_path}")
    
    # Reading and decoding images
    with open(image_path, "rb") as image_file:
        image_data = base64.b64encode(image_file.read()).decode("utf-8")
    
    # Call the OpenAI API
    response = client.chat.completions.create(
        model = model,
        messages = [
            # Provide the system prompt
            {"role": "system", "content": system_prompt2},

            # The user message contains both the text and image URL / path
            {"role": "user", "content": [
                "This is the image from the recipe page",
                {"type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image_data}",
                            "detail": "low"}}
            ]}
        ],
        temperature = 0, # Set the temperature to 0 for deterministic output
    )

    # Extract the content and store it
    gpt_response = response.choices[0].message.content # Get the response content
    extracted_recipes.append({"image_path": image_path, "recipe_info": gpt_response}) # Store the path and extracted info
    print(f"Extracted information for {image_path}:\n{gpt_response}\n") # Print the extracted information for review    

Processing image ./resources/output_images/page1.jpg
Extracted information for ./resources/output_images/page1.jpg:
I can't extract specific recipe information from the image you provided. However, if you have text or details from the recipe, feel free to share, and I can help you organize that information!

Processing image ./resources/output_images/page2.jpg
Extracted information for ./resources/output_images/page2.jpg:
I can't extract any recipe information from the image you provided. If you have a specific recipe or text you'd like to share, please type it out, and I can help you format it or analyze it!

Processing image ./resources/output_images/page3.jpg
Extracted information for ./resources/output_images/page3.jpg:
It seems that the image you provided does not contain any recipe information, such as a title, ingredients, or instructions. It appears to be a page from a library catalog or a book reference. If you have a different image or more specific content related to a recip

### Filter out non-recipe content based on key-recipe related terms

In [19]:
filtered_recipes = []

for recipe in extracted_recipes:
    if any(keyword in recipe["recipe_info"].lower() for keyword in ["ingredients", "instructions", "recipe title"]):
        filtered_recipes.append(recipe)
    else:
        print(f"Skipping recipe: {recipe['image_path']}")

Skipping recipe: ./resources/output_images/page1.jpg
Skipping recipe: ./resources/output_images/page2.jpg
Skipping recipe: ./resources/output_images/page4.jpg
Skipping recipe: ./resources/output_images/page5.jpg
Skipping recipe: ./resources/output_images/page8.jpg
Skipping recipe: ./resources/output_images/page10.jpg
Skipping recipe: ./resources/output_images/page11.jpg
Skipping recipe: ./resources/output_images/page12.jpg
Skipping recipe: ./resources/output_images/page20.jpg
Skipping recipe: ./resources/output_images/page21.jpg
Skipping recipe: ./resources/output_images/page22.jpg
Skipping recipe: ./resources/output_images/page106.jpg
Skipping recipe: ./resources/output_images/page107.jpg
Skipping recipe: ./resources/output_images/page108.jpg
Skipping recipe: ./resources/output_images/page112.jpg
Skipping recipe: ./resources/output_images/page124.jpg
Skipping recipe: ./resources/output_images/page133.jpg
Skipping recipe: ./resources/output_images/page134.jpg
Skipping recipe: ./resourc

In [20]:
len(filtered_recipes)

116

In [6]:
# Define the output file path
output_file = "./resources/processed_data/recipe_info.json"

In [None]:
# Write the filtered list to a json file
with open(output_file, "w") as json_file:
    json.dump(filtered_recipes, json_file, indent = 4)

## Generating Embeddings

In [7]:
# Import libraries
import numpy as np

In [8]:
# Load the filtered recipes
with open(output_file, "r") as json_file:
    filtered_recipes = json.load(json_file)

### Generate embedding for each recipe

In [9]:

recipe_texts = [recipe["recipe_info"] for recipe in filtered_recipes]

## To organize your data per recipe (instead of per page), follow these steps with code snippets:

 1. **Update Preprocessing Script**
    - Iterate through your raw data and extract each recipe as a separate object.
    - Ensure each recipe object contains all relevant fields (e.g., title, ingredients, instructions, image_path).

      Example (Python):
        raw_text = """
        Page 1:
        Recipe: Chocolate Cake
        Ingredients: Flour, Sugar, Cocoa Powder, Eggs
        Instructions: Mix ingredients. Bake at 350F for 30 minutes.
        Image: images/chocolate_cake.jpg

        Recipe: Apple Pie
        Ingredients: Apples, Flour, Sugar, Butter
        Instructions: Prepare crust. Add apples. Bake at 375F for 45 minutes.
        Image: images/apple_pie.jpg
      """

      ```
        import re

        def extract_recipes(text):
            recipe_pattern = re.compile(
                r"Recipe:\s*(?P<title>.*?)\n"
                r"Ingredients:\s*(?P<ingredients>.*?)\n"
                r"Instructions:\s*(?P<instructions>.*?)\n"
                r"Image:\s*(?P<image_path>.*?)\n",
                re.DOTALL
            )
            recipes = []
            for match in recipe_pattern.finditer(text):
                recipe = {
                    "title": match.group("title").strip(),
                    "ingredients": match.group("ingredients").strip(),
                    "instructions": match.group("instructions").strip(),
                    "image_path": match.group("image_path").strip()
                }
                recipes.append(recipe)
            return recipes

        recipes = extract_recipes(raw_text)
      ```

 2. **System Prompt for Extraction**
    - If using an LLM, use a system prompt like:
      "You are a data extraction assistant. For the following text, extract each recipe as a separate JSON object with the following fields: title, ingredients, instructions, and image_path. Ignore page boundaries and focus on recipe boundaries."

 3. **Example Prompt for LLM**
    - Input:
      ```
      Page 1:
      Recipe: Chocolate Cake
      Ingredients: Flour, Sugar, Cocoa Powder, Eggs
      Instructions: Mix ingredients. Bake at 350F for 30 minutes.
      Image: images/chocolate_cake.jpg

      Recipe: Apple Pie
      Ingredients: Apples, Flour, Sugar, Butter
      Instructions: Prepare crust. Add apples. Bake at 375F for 45 minutes.
      Image: images/apple_pie.jpg
      ```
    - Output:
      ```json
        [
          {
            "title": "Chocolate Cake",
            "ingredients": "Flour, Sugar, Cocoa Powder, Eggs",
            "instructions": "Mix ingredients. Bake at 350F for 30 minutes.",
            "image_path": "images/chocolate_cake.jpg"
          },
          {
            "title": "Apple Pie",
            "ingredients": "Apples, Flour, Sugar, Butter",
            "instructions": "Prepare crust. Add apples. Bake at 375F for 45 minutes.",
            "image_path": "images/apple_pie.jpg"
          }
        ]
      ```

 4. **Post-processing**
    - Review the output to ensure each recipe is correctly separated and contains all necessary information.
    - Save the list of recipes to a file:

      ```
        import json

        output_file = "./resources/processed_data/recipe_info.json"
        with open(output_file, "w") as f:
            json.dump(recipes, f, indent=4)
      ```

 5. **Downstream Processing**
    - Now, when generating embeddings or performing retrieval, you will be working at the recipe level, not the page level.

 By following these steps and using the provided code snippets, your data will be organized per recipe, making all subsequent processing more accurate and efficient.


In [10]:
embedding_response = client.embeddings.create(
    input = recipe_texts,
    model = "text-embedding-3-large"
)

In [13]:
# Extract the embeddings
embeddings = [data.embedding for data in embedding_response.data]
len(embeddings[0]) # Each embedding vector size 3072

3072

In [14]:
# Convert the embeddings to numpy array
embedding_matrix = np.array(embeddings)

In [15]:
# Verify the embedding matrix
print(f"Generated embeddings for {len(filtered_recipes)} recipes.")
print(f"Each embedding is of size {len(embeddings[0])}")

Generated embeddings for 116 recipes.
Each embedding is of size 3072


## Retrieval System

Building FAISS Index and Metadata Integration


In [16]:
import faiss

In [17]:
# Print the embedding matrix shape
print(f"Embedding matrix shape: {embedding_matrix.shape}")

Embedding matrix shape: (116, 3072)


In [18]:
# Initialize the FAISS index
# Create a FAISS index for fast similarity search using L2 (Euclidean) distance.
# FAISS (Facebook AI Similarity Search) is a library for efficient similarity search and clustering of dense vectors.
# Here, we use IndexFlatL2, which is a simple index that computes L2 (Euclidean) distances between vectors.
# The dimension of the index must match the size of each embedding vector (embedding_matrix.shape[1]).
index = faiss.IndexFlatL2(embedding_matrix.shape[1])

# Add all recipe embeddings to the FAISS index for retrieval.
# This allows us to perform fast nearest neighbor searches over the recipe embeddings.
index.add(embedding_matrix)

In [19]:
# Save the index
# Save the FAISS index to disk so it can be quickly loaded later without recomputing the embeddings or rebuilding the index.
# This is useful for efficient retrieval in future sessions or deployments.
faiss.write_index(index, "./resources/retrieval/filtered_recipe_index.index")

 ### Why Save Recipe Metadata?
 
 Saving the metadata for each recipe is crucial for the following reasons:
 
  1. **Contextual Retrieval**
 - When we use the FAISS index to retrieve similar recipes based on embeddings, the index only returns the position (or ID) of the most similar vectors.
 - The metadata file allows us to map these positions back to the actual recipe information and associated image paths.
 
  2. **Rich User Experience**
 - Metadata contains detailed information about each recipe (such as title, ingredients, instructions, etc.) and the path to its image.
 - This enables us to present complete and meaningful results to users after retrieval, rather than just abstract vector IDs.
 
  3. **Separation of Concerns**
 - By storing metadata separately from the FAISS index, we keep the retrieval system efficient and lightweight.
 - The FAISS index handles fast similarity search, while the metadata file handles the descriptive content.
 
  4. **Scalability and Maintainability**
 - If we need to update or enrich the recipe information (e.g., add new fields or images), we can do so in the metadata file without rebuilding the FAISS index.
 
  5. **Reproducibility**
 - Saving metadata ensures that anyone using the system can reproduce the mapping from embeddings to recipe details, even if the main dataset changes or is reprocessed.
 
 In summary, saving the metadata is essential for linking the results of vector-based retrieval back to human-readable and actionable recipe information, supporting both the technical and user-facing aspects of the retrieval system.


In [20]:
# Save the metadata for each recipe
metadata = [{'recipe_info': recipe['recipe_info'], # Include recipe information
             'image_path': recipe['image_path']} for recipe in filtered_recipes] # Include image path

RECIPE_METADATA = "./resources/processed_data/recipe_metadata.json"

# Write metadata to a JSON file with indentation
with open(RECIPE_METADATA, "w") as json_file:
  json.dump(metadata, json_file, indent = 4)

In [22]:
# Generate the embeddings for the query
query = "How to make bread?"
k = 5 # Number of top results to retrieve
query_embedding = client.embeddings.create(
    input = [query],
    model = "text-embedding-3-large"
).data[0].embedding
print(f"The query embedding is {query_embedding}\n")

# The OpenAI embedding API returns the embedding as a 1-dimensional list (vector) of floating point numbers.
# However, FAISS (the vector similarity search library) expects input queries to be in the form of a 2-dimensional numpy array,
# where each row is a separate query vector (even if we only have one query).
# Therefore, we first convert the 1D embedding list into a numpy array using np.array(query_embedding).
# Then, we use .reshape(1, -1) to reshape this array into a 2D array with 1 row and as many columns as needed (the embedding dimension).
# This ensures compatibility with FAISS, which requires input of shape (num_queries, embedding_dim).
query_vector = np.array(query_embedding).reshape(1, -1)
print(f"The query vector is {query_vector}\n")

The query embedding is [-0.019720381125807762, -0.028134725987911224, -0.022066915407776833, 0.016603518277406693, -0.04792621359229088, -0.04887430742383003, 0.038018617779016495, 0.017954552546143532, 0.0015480617294088006, 0.004168656188994646, -0.004832322709262371, -0.013889594934880733, -0.012917797081172466, -0.0015850967029109597, 0.038018617779016495, -0.0238090418279171, 0.017018308863043785, -0.016674624755978584, -0.0072233001701533794, 0.002310981974005699, -0.031856000423431396, 0.02015887387096882, 0.01501545775681734, -0.007821785286068916, 0.006275205407291651, 0.01784789189696312, 0.003410179866477847, -0.011229002848267555, -0.03960667923092842, 0.042285047471523285, -0.0036649806424975395, 0.023678677156567574, -0.034676581621170044, -0.02115437388420105, -0.015904298052191734, 0.011306035332381725, -0.01473102904856205, 0.005285630933940411, 0.00849730335175991, 0.019412249326705933, -0.01242004707455635, 0.007833636365830898, 0.00040108870598487556, -0.00825435388

In [23]:
# Search the FAISS index for the nearest neighbors
# Perform a similarity search in the FAISS index using the query vector.
# The 'search' method returns the 'k' nearest neighbors (or fewer if there are less than 'k' items in the metadata).
# 'distances' contains the similarity scores (lower means more similar), and 'indices' contains the indices of the matching items in the index.
distances, indices = index.search(query_vector, min(k, len(metadata))) 

# Print the distances (similarity scores) between the query and the retrieved items.
print(f"The distances are {distances}\n")

# Print the indices of the top matching items in the FAISS index.
print(f"The indices are {indices}\n")

The distances are [[1.1303246 1.187998  1.1995922 1.2144247 1.2298532]]

The indices are [[19  4 11 18 12]]



In [24]:
# Store the indices and distances
stored_indices = indices[0].tolist()
stored_distances = distances[0].tolist()
print(f"The stored indices are {stored_indices}\n")
print(f"The stored distances are {stored_distances}\n")

The stored indices are [19, 4, 11, 18, 12]

The stored distances are [1.1303246021270752, 1.1879980564117432, 1.1995922327041626, 1.214424729347229, 1.2298531532287598]



In [25]:
# The indices in 'stored_indices' correspond to the positions of the most similar items in the FAISS index,
# which are aligned with the entries in the 'metadata' list. This means that for each index in 'stored_indices',
# metadata[i] gives us the metadata (such as recipe information) for the document that is most similar to the query.
print("The metadata content for the top results (as matched by stored_indices):")
for i, dist in zip(stored_indices, stored_distances):
    if 0 <= i < len(metadata):
        print(f"Distance: {dist}, Metadata: {metadata[i]['recipe_info']}")

The metadata content for the top results (as matched by stored_indices):
Distance: 1.1303246021270752, Metadata: Here’s the structured information extracted from the recipe image:

### Recipe Title
Nut Bread and Oatmeal Bread

### Ingredients

#### Nut Bread
- 2½ Cups of Flour
- 3 Teaspoons of Baking Powder
- ¾ Cup of Milk
- ½ Cup of Sugar
- 1 Cup of Nuts, chopped (optional)

#### Oatmeal Bread
- 2¾ Cups of Rolled Oats
- 1½ Cups of Molasses
- 1 Yeast Cake
- Water

### Instructions

#### Nut Bread
1. Mix flour, baking powder, and sugar together.
2. Add milk and chopped nuts (if using).
3. Stir until well combined.
4. Pour into a greased loaf pan.
5. Bake for one hour.

#### Oatmeal Bread
1. Boil water and add rolled oats; let cool.
2. Add molasses and yeast cake to the cooled mixture.
3. Stir until well combined.
4. Let rise in a warm place until doubled in size.
5. Shape into loaves and let rise again.
6. Bake for one hour.

### Cuisine Type
American

### Dish Type
Bread

### Tags/Meta

In [26]:
# Return the results
results = [(metadata[i]['recipe_info'], dist) for i, dist in zip(stored_indices, stored_distances) if 0 <= i < len(metadata)]
results # Output the results as a list of tuples containing recipe info and distance

[('Here’s the structured information extracted from the recipe image:\n\n### Recipe Title\nNut Bread and Oatmeal Bread\n\n### Ingredients\n\n#### Nut Bread\n- 2½ Cups of Flour\n- 3 Teaspoons of Baking Powder\n- ¾ Cup of Milk\n- ½ Cup of Sugar\n- 1 Cup of Nuts, chopped (optional)\n\n#### Oatmeal Bread\n- 2¾ Cups of Rolled Oats\n- 1½ Cups of Molasses\n- 1 Yeast Cake\n- Water\n\n### Instructions\n\n#### Nut Bread\n1. Mix flour, baking powder, and sugar together.\n2. Add milk and chopped nuts (if using).\n3. Stir until well combined.\n4. Pour into a greased loaf pan.\n5. Bake for one hour.\n\n#### Oatmeal Bread\n1. Boil water and add rolled oats; let cool.\n2. Add molasses and yeast cake to the cooled mixture.\n3. Stir until well combined.\n4. Let rise in a warm place until doubled in size.\n5. Shape into loaves and let rise again.\n6. Bake for one hour.\n\n### Cuisine Type\nAmerican\n\n### Dish Type\nBread\n\n### Tags/Metadata\n- Baking\n- Quick Bread\n- Yeast Bread\n- Homemade\n\nThis fo

In [27]:
# Define a function to query the embeddings
def query_embeddings(query, index, metadata, k = 5):
  # Generate the embeddings for the query
  query_embedding = client.embeddings.create(
      input = [query],
      model = "text-embedding-3-large"
  ).data[0].embedding
  print(f"The query embedding is {query_embedding}\n")
  query_vector = np.array(query_embedding).reshape(1, -1)
  print(f"The query vector is {query_vector}\n")

  # Search faiss index
  distances, indices = index.search(query_vector, min(k, len(metadata)))
  # print(f"The distances are {distances}\n")
  # print(f"The indices are {indices}\n")

  # Store the indices and distances
  stored_indices = indices[0].tolist()
  stored_distances = distances[0].tolist()
  print(f"The stored indices are {stored_indices}\n")
  print(f"The stored distances are {stored_distances}\n")

  # # Print the metadata content
  # print("The metadata content is")
  # for i, dist in zip(stored_indices, stored_distances):
  #   if 0 <=i < len(metadata):
  #     print(f"Distance: {dist}, Metadata: {metadata[i]['recipe_info']}")

  # Return the results
  results = [(
      metadata[i]['recipe_info'], dist) for i, dist in zip(
          stored_indices, stored_distances) if 0 <= i < len(metadata)]
  return results

In [28]:
# Test the retrieval system
query = "How to make a bread?"
results = query_embeddings(query, index, metadata)
print(f"The results are {results}")

The query embedding is [-0.020172448828816414, -0.033205073326826096, -0.01535551343113184, 0.008380736224353313, -0.03682389110326767, -0.03151792287826538, 0.0364571176469326, 0.021077154204249382, -0.005040056072175503, -0.004532688297331333, -0.005309022031724453, -0.014145166613161564, -0.005370150785893202, -0.013069301843643188, 0.032569337636232376, -0.02100379951298237, 0.02326555922627449, -0.0222385972738266, -0.012947045266628265, 0.0066385697573423386, -0.030906638130545616, 0.00922431144863367, 0.01015957910567522, -0.00924264919012785, 0.011327136307954788, 0.016810374334454536, 0.006085355766117573, -0.014878709800541401, -0.033840812742710114, 0.04256998002529144, -0.006797504145652056, 0.01698153465986252, -0.03596808761358261, -0.013057076372206211, -0.026358667761087418, 0.009584969840943813, -0.006656908430159092, 0.0187542662024498, 0.009872274473309517, 0.02245866134762764, -0.01398623175919056, 0.012317419983446598, -0.011345474980771542, -0.022226370871067047, 

In [29]:
# Combine the results
len(results)

5

In [33]:
# Combine the results into a single string
def combined_retrived_content(results):
  combined_content = "\n\n".join([result[0] for result in results]) # Join the recipe information with double newlines
  return combined_content

# Get the combined content from results
combined_content = combined_retrived_content(results)
print(f"The combined content is {combined_content}")

The combined content is Here’s the structured information extracted from the recipe image:

### Recipe Title
Nut Bread and Oatmeal Bread

### Ingredients

#### Nut Bread
- 2½ Cups of Flour
- 3 Teaspoons of Baking Powder
- ¾ Cup of Milk
- ½ Cup of Sugar
- 1 Cup of Nuts, chopped (optional)

#### Oatmeal Bread
- 2¾ Cups of Rolled Oats
- 1½ Cups of Molasses
- 1 Yeast Cake
- Water

### Instructions

#### Nut Bread
1. Mix flour, baking powder, and sugar together.
2. Add milk and chopped nuts (if using).
3. Stir until well combined.
4. Pour into a greased loaf pan.
5. Bake for one hour.

#### Oatmeal Bread
1. Boil water and add rolled oats; let cool.
2. Add molasses and yeast cake to the cooled mixture.
3. Stir until well combined.
4. Let rise in a warm place until doubled in size.
5. Shape into loaves and let rise again.
6. Bake for one hour.

### Cuisine Type
American

### Dish Type
Bread

### Tags/Metadata
- Baking
- Quick Bread
- Yeast Bread
- Homemade

This format is suitable for embeddi

## Generative System

In [34]:
# Define the system prompt
system_prompt3 = f"""
You are highly experienced and expert chef specialized in providing cooking advice.
Your main task is to provide information precise and accurate on the combined content.
You answer diretly to the query using only information from the provided {combined_content}.
If you don't know the answer, just say that you don't know.
Your goal is to help the user and answer the {query}
"""

In [35]:
# Define function to retrieve a response from the API
def generate_response(query, combined_content, system_prompt):
  response = client.chat.completions.create(
      model = model,
      messages = [
          {"role": "system", "content": system_prompt3}, # Provide system prompt for guidance
          {"role": "user", "content": query}, # Provide the query as user input
          {"role": "assistant", "content": combined_content} # Provide the combined content from the results
      ],
      temperature = 0, # Set temperature to 0 for deterministic output
  )
  return response

In [36]:
# Get the results from the API
query = "How to make bread?"
combined_content = combined_retrived_content(results)
response = generate_response(query, combined_content, system_prompt3)

In [39]:
# Display the outcome
display_gpt_response(response)

To make bread, you can follow these two recipes: Nut Bread and Oatmeal Bread.

### Nut Bread

#### Ingredients:
- 2½ Cups of Flour
- 3 Teaspoons of Baking Powder
- ¾ Cup of Milk
- ½ Cup of Sugar
- 1 Cup of Nuts, chopped (optional)

#### Instructions:
1. Mix flour, baking powder, and sugar together.
2. Add milk and chopped nuts (if using).
3. Stir until well combined.
4. Pour into a greased loaf pan.
5. Bake for one hour.

---

### Oatmeal Bread

#### Ingredients:
- 2¾ Cups of Rolled Oats
- 1½ Cups of Molasses
- 1 Yeast Cake
- Water

#### Instructions:
1. Boil water and add rolled oats; let cool.
2. Add molasses and yeast cake to the cooled mixture.
3. Stir until well combined.
4. Let rise in a warm place until doubled in size.
5. Shape into loaves and let rise again.
6. Bake for one hour.

### Tips:
- Ensure your yeast is fresh for the Oatmeal Bread to rise properly.
- You can customize the Nut Bread by adding different types of nuts or even dried fruits.

Enjoy your homemade bread!

## RAG System

In [40]:
# Build the function for Retrieval-Augmented Generation (RAG)
def rag_system(query, index, metadata, system_prompt, k = 5):
  # Retrieval System: Retrieve relevant results based on the query
  results = query_embeddings(query, index, metadata, k)

  # Content Merge: Combine the retrieved content into a single string
  combined_content = combined_retrived_content(results)

  # Generation: Generate a response based on the query and combined content
  response = generate_response(query, combined_content, system_prompt)

  # Return the generated response
  return response

In [41]:
# Test the rag system
query1 = "How to make the best chocolate cake?"
response = rag_system(query1, index, metadata, system_prompt3)

The query embedding is [-0.002604733919724822, -0.03968388959765434, -0.013297276571393013, 0.007617205381393433, -0.019327564164996147, 0.022370068356394768, 0.00810969714075327, -0.005398256704211235, -0.006457113660871983, 0.037473149597644806, -0.0003669747384265065, 0.007113769184798002, -0.05170068517327309, -0.007600788958370686, 0.0003343129646964371, -0.04031865671277046, 0.021166199818253517, -0.004692351911216974, -0.004487146623432636, -0.015858232975006104, 0.015004580840468407, -0.02479969523847103, -0.011732247658073902, 0.03399287164211273, 0.0030698650516569614, 0.006758080795407295, 0.022610843181610107, -0.0006980386096984148, -0.036904048174619675, 0.05677882209420204, 0.04460880532860756, 0.0018906210316345096, -0.016208449378609657, -0.014720030128955841, -0.02911173366010189, 0.016919827088713646, -0.004768961574882269, -0.001930293976329267, 0.03475897014141083, 0.014402646571397781, -0.005751208867877722, 5.8526144130155444e-05, 0.002621150342747569, 0.01079104

In [42]:
display_gpt_response(response)

To make a delicious chocolate cake, you can follow a basic chocolate cake recipe. Here’s a simple one:

### Chocolate Cake Recipe

#### Ingredients:
- 1 ¾ cups all-purpose flour
- 1 ¾ cups granulated sugar
- ¾ cup unsweetened cocoa powder
- 1 ½ teaspoons baking powder
- 1 ½ teaspoons baking soda
- 1 teaspoon salt
- 2 large eggs
- 1 cup whole milk
- ½ cup vegetable oil
- 2 teaspoons vanilla extract
- 1 cup boiling water

#### Instructions:
1. **Preheat the Oven**: Preheat your oven to 350°F (175°C). Grease and flour two 9-inch round cake pans.
   
2. **Mix Dry Ingredients**: In a large mixing bowl, combine the flour, sugar, cocoa powder, baking powder, baking soda, and salt. Whisk together until well combined.

3. **Add Wet Ingredients**: Add the eggs, milk, vegetable oil, and vanilla extract to the dry ingredients. Beat on medium speed for about 2 minutes until well blended.

4. **Incorporate Boiling Water**: Carefully stir in the boiling water (the batter will be thin). Mix until smooth.

5. **Pour into Pans**: Divide the batter evenly between the prepared cake pans.

6. **Bake**: Bake in the preheated oven for 30-35 minutes, or until a toothpick inserted in the center comes out clean.

7. **Cool**: Allow the cakes to cool in the pans for about 10 minutes, then remove from pans and transfer to wire racks to cool completely.

8. **Frost**: Once cooled, frost with your favorite chocolate frosting.

Enjoy your homemade chocolate cake! If you need a specific frosting recipe or any variations, feel free to ask!

In [43]:
# Test with a different query
query3 = "I want something vegan"
response = rag_system(query3, index, metadata, system_prompt3)
display_gpt_response(response)

The query embedding is [-0.034548308700323105, -0.026619188487529755, -0.017104245722293854, 0.03446335345506668, -0.01730247214436531, -0.008516724221408367, 5.26128314959351e-06, -0.018930774182081223, -0.033670444041490555, 0.023518336936831474, 0.003238904057070613, -0.012955616228282452, 0.0179679524153471, -0.007178685627877712, 0.036842089146375656, -0.008821146562695503, 0.009663615375757217, -0.01138395071029663, 0.007461868692189455, -0.018406886607408524, -0.023546654731035233, 0.00599993672221899, 0.037436775863170624, -0.0069061219692230225, 0.007702574133872986, 0.008198143914341927, -0.022074105218052864, -0.012240579351782799, 0.002099093049764633, 0.01846352219581604, 0.010208742693066597, 0.024481158703565598, -0.0050618937239050865, 0.05527729541063309, -0.013698970898985863, -0.003830048255622387, 0.038739416748285294, -0.0019114842871204019, 0.015617535449564457, 0.026930689811706543, -0.04777294769883156, 0.022003307938575745, -0.024226294830441475, 0.001365472329

The recipes provided do not include any vegan options. However, if you're looking for a vegan bread recipe, I can suggest a simple one:

### Vegan Banana Bread

#### Ingredients
- 3 ripe bananas, mashed
- 1/3 cup melted coconut oil
- 1 teaspoon vanilla extract
- 1 teaspoon baking soda
- Pinch of salt
- 3/4 cup sugar (or maple syrup)
- 1 1/2 cups all-purpose flour

#### Instructions
1. Preheat your oven to 350°F (175°C).
2. In a mixing bowl, mash the ripe bananas with a fork until smooth.
3. Stir in the melted coconut oil and vanilla extract.
4. Mix in the baking soda and salt.
5. Add the sugar and mix well.
6. Finally, stir in the flour until just combined.
7. Pour the batter into a greased loaf pan.
8. Bake for 50-60 minutes, or until a toothpick inserted into the center comes out clean.
9. Let it cool before slicing.

This recipe is simple, delicious, and completely vegan! If you need more vegan recipes or specific types of bread, feel free to ask!