In [1]:
# imports

import os
from dotenv import load_dotenv
from openai import OpenAI
import base64
from IPython.display import display, Image, Markdown
import pandas as pd
import json

In [2]:
# Load environment variables in a file called .env

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")

API key found and looks good so far!


## Perform OCR and transform to images

In [3]:
# Import the libraries
from pdf2image import convert_from_path
# Create a function to convert PDF to images and store in specific path
def pdf_to_images(pdf_path, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Convert pdf into images
    images = convert_from_path(pdf_path)
    image_paths = []

    # Save images and paths
    for i, image in enumerate(images):
        image_path = os.path.join(output_folder, f"page{i+1}.jpg")
        image.save(image_path, "JPEG")
        image_paths.append(image_path)
    return image_paths

In [4]:
pdf_path = "./resources/pdf/Things mother used to make.pdf"
output_folder = "./resources/output_images"
image_paths = pdf_to_images(pdf_path, output_folder)

In [5]:
# Set up connection to openai API

client = OpenAI(
    api_key=api_key
)
model = "gpt-4o-mini"

In [7]:
# Read and encode one image
image_path = "./resources/output_images/page23.jpg"
with open(image_path, "rb") as image_file:
    image_data = base64.b64encode(image_file.read()).decode('utf-8')

In [8]:
# Define the system prompt
system_prompt = """
Please analyze the content of this image and extract any related recipe information.
"""

In [9]:
# Call the OpenAI API use the chat completion method
response = client.chat.completions.create(
    model = model,
    messages = [
        # Provide the system prompt
        {"role": "system", "content": system_prompt},

        # The user message contains both the text and image URL / path
        {"role": "user", "content": [
            "This is the image from the recipe page.",
            {"type": "image_url",
             "image_url": {"url": f"data:image/jpeg;base64,{image_data}",
                           "detail": "low"}}
        ]}
    ]
)

In [10]:
# Define a function to get the gpt-response and display in markdown
def display_gpt_response(response):
    gpt_response = response.choices[0].message.content
    return display(Markdown(gpt_response))


In [11]:
display_gpt_response(response)

Here’s the recipe information based on the image you provided:

### Things Mother Used to Make
#### Breads

**Bannocks**
- **Ingredients:**
  - 1 Cupful of Thick Sour Milk
  - ½ Cupful of Sugar
  - 2 Cupfuls of Flour
  - ½ Cupful of Indian Meal
  - 1 Teaspoonful of Soda
  - A pinch of Salt

- **Instructions:**
  1. Make the mixture stiff enough to drop from a spoon.
  2. Drop mixture, size of a walnut, into boiling fat.
  3. Serve warm, with maple syrup.

---

**Boston Brown Bread**
- **Ingredients:**
  - 1 Cupful of Rye Meal
  - 1 Cupful of Graham Meal
  - 1 Cupful of Molasses
  - 1 Cupful of Flour
  - 1 Cupful of Sweet Milk
  - 1 Cupful of Sour Milk
  - ½ Teaspoonful of Soda
  - 1 Heaping Teaspoonful of Salt

- **Instructions:**
  1. Stir the meals and salt together.
  2. Beat the soda into the molasses until it foams; add sour milk, mix well, and pour into a tin cup which has been well greased.
  3. If you have none, use a brown-bread steamer. 

Feel free to ask if you need more information or additional recipes!

In [13]:
# Define improved system prompt
system_prompt2 = """
Please analyze the content of this image and extract any related recipe information into structure components.
Specifically, extra the recipe title, list of ingredients, step by step instructions, cuisine type, dish type, any relevant tags or metadata.
The output must be formatted in a way suited for embedding in a Retrieval Augmented Generation (RAG) system.
"""

In [14]:
# Call the api to extract the information
response = client.chat.completions.create(
    model = model,
    messages = [
        # Provide the system prompt
        {"role": "system", "content": system_prompt2},

         # The user message contains both the text and image URL / path
        {"role": "user", "content": [
            "This is the image from the recipe page",
            {"type": "image_url",
             "image_url": {"url": f"data:image/jpeg;base64,{image_data}",
                           "detail": "low"}}
        ]}
    ],
    temperature = 0, # Set the temperature to 0 for deterministic output
)

In [15]:
display_gpt_response(response)

Here’s the structured information extracted from the recipe image:

### Recipe Title
**Breads**

### Ingredients
#### Bannocks
- 1 Cupful of Thick Sour Milk
- 1 Cupful of Flour
- ½ Cupful of Indian Meal
- 1 Teaspoonful of Soda
- A pinch of Salt

#### Boston Brown Bread
- 1 Cupful of Rye Meal
- 1 Cupful of Sour Milk
- 1 Cupful of Graham Meal
- 1 Cupful of Molasses
- 1 Cupful of Flour
- ½ Teaspoonful of Salt
- 1 Heaping Teaspoonful of Soda
- 1 Cupful of Sweet Milk

### Instructions
1. For Bannocks: Make the mixture stiff enough to drop from a spoon. Drop mixture, size of a walnut, into boiling fat. Serve warm, with maple syrup.
2. For Boston Brown Bread: Stir the meals and salt together. Beat the soda into the molasses until it foams; add sour milk, mix well, and pour into a tin pan which has been well greased, if you have no brown-bread steamer.

### Cuisine Type
- American

### Dish Type
- Bread

### Tags/Metadata
- Quick Bread
- Traditional Recipe
- Breakfast Item

This format is suitable for embedding in a Retrieval Augmented Generation (RAG) system.

## Reading All Images in a Dataset

In [17]:
len(image_paths)

136

In [18]:
extracted_recipes = []

for image_path in image_paths:
    print(f"Processing image {image_path}")
    
    # Reading and decoding images
    with open(image_path, "rb") as image_file:
        image_data = base64.b64encode(image_file.read()).decode("utf-8")
    
    # Call the OpenAI API
    response = client.chat.completions.create(
        model = model,
        messages = [
            # Provide the system prompt
            {"role": "system", "content": system_prompt2},

            # The user message contains both the text and image URL / path
            {"role": "user", "content": [
                "This is the image from the recipe page",
                {"type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image_data}",
                            "detail": "low"}}
            ]}
        ],
        temperature = 0, # Set the temperature to 0 for deterministic output
    )

    # Extract the content and store it
    gpt_response = response.choices[0].message.content # Get the response content
    extracted_recipes.append({"image_path": image_path, "recipe_info": gpt_response}) # Store the path and extracted info
    print(f"Extracted information for {image_path}:\n{gpt_response}\n") # Print the extracted information for review    

Processing image ./resources/output_images/page1.jpg
Extracted information for ./resources/output_images/page1.jpg:
I can't extract specific recipe information from the image you provided. However, if you have text or details from the recipe, feel free to share, and I can help you organize that information!

Processing image ./resources/output_images/page2.jpg
Extracted information for ./resources/output_images/page2.jpg:
I can't extract any recipe information from the image you provided. If you have a specific recipe or text you'd like to share, please type it out, and I can help you format it or analyze it!

Processing image ./resources/output_images/page3.jpg
Extracted information for ./resources/output_images/page3.jpg:
It seems that the image you provided does not contain any recipe information, such as a title, ingredients, or instructions. It appears to be a page from a library catalog or a book reference. If you have a different image or more specific content related to a recip

### Filter out non-recipe content based on key-recipe related terms

In [19]:
filtered_recipes = []

for recipe in extracted_recipes:
    if any(keyword in recipe["recipe_info"].lower() for keyword in ["ingredients", "instructions", "recipe title"]):
        filtered_recipes.append(recipe)
    else:
        print(f"Skipping recipe: {recipe['image_path']}")

Skipping recipe: ./resources/output_images/page1.jpg
Skipping recipe: ./resources/output_images/page2.jpg
Skipping recipe: ./resources/output_images/page4.jpg
Skipping recipe: ./resources/output_images/page5.jpg
Skipping recipe: ./resources/output_images/page8.jpg
Skipping recipe: ./resources/output_images/page10.jpg
Skipping recipe: ./resources/output_images/page11.jpg
Skipping recipe: ./resources/output_images/page12.jpg
Skipping recipe: ./resources/output_images/page20.jpg
Skipping recipe: ./resources/output_images/page21.jpg
Skipping recipe: ./resources/output_images/page22.jpg
Skipping recipe: ./resources/output_images/page106.jpg
Skipping recipe: ./resources/output_images/page107.jpg
Skipping recipe: ./resources/output_images/page108.jpg
Skipping recipe: ./resources/output_images/page112.jpg
Skipping recipe: ./resources/output_images/page124.jpg
Skipping recipe: ./resources/output_images/page133.jpg
Skipping recipe: ./resources/output_images/page134.jpg
Skipping recipe: ./resourc

In [20]:
len(filtered_recipes)

116

In [22]:
# Define the output file path
output_file = "./resources/processed_data/recipe_info.json"

# Write the filtered list to a json file
with open(output_file, "w") as json_file:
    json.dump(filtered_recipes, json_file, indent = 4)