In [None]:
!pip install getdaft deltalake<0.17

In [None]:
# Flag for CI runs: when set to True, the guard in the next cell exits before any other cell runs.
CI = False

In [None]:
# Presigned URL generation further down requires AWS credentials,
# so stop executing the notebook right here when running in CI.
if CI:
    raise SystemExit()

## Multimodal data lake annotation and indexing

Let's go from: **Images in an S3 Bucket**

To: **Multimodal Data Lake** where we can run queries efficiently to power analytics, retrieval and more!

In [1]:
import daft

# Use anonymous S3 access
_anonymous_s3 = daft.io.S3Config(anonymous=True)
IO_CONFIG = daft.io.IOConfig(s3=_anonymous_s3)

# Make this IO configuration the default for query planning
daft.set_planning_config(default_io_config=IO_CONFIG)

DaftContext(_daft_execution_config=<daft.daft.PyDaftExecutionConfig object at 0x103a12b90>, _daft_planning_config=<daft.daft.PyDaftPlanningConfig object at 0x1037aff90>, _runner_config=_PyRunnerConfig(use_thread_pool=None), _disallow_set_runner=False, _runner=None)

In [2]:
# List every object matching the glob; each row carries the object's path and size
IMAGES_GLOB = "s3://daft-public-data/open-images/validation-images/*"
df = daft.from_glob_path(IMAGES_GLOB)
df.show()

path Utf8,size Int64,num_rows Int64
s3://daft-public-data/open-images/validation-images/0001eeaf4aed83f9.jpg,290621,
s3://daft-public-data/open-images/validation-images/0004886b7d043cfd.jpg,375363,
s3://daft-public-data/open-images/validation-images/000595fe6fee6369.jpg,462817,
s3://daft-public-data/open-images/validation-images/00075905539074f2.jpg,302326,
s3://daft-public-data/open-images/validation-images/0007cebe1b2ba653.jpg,970275,
s3://daft-public-data/open-images/validation-images/0007d6cf88afaa4a.jpg,614095,
s3://daft-public-data/open-images/validation-images/0008e425fb49a2bf.jpg,415082,
s3://daft-public-data/open-images/validation-images/0009bad4d8539bb4.jpg,359851,


### Working with URLs in Daft is really **easy and efficient**

* URLs are extremely common when working with multimodal data, most commonly as a `https://` URL or `s3://` object store URL
* Daft runs URL downloads using async Rust kernels, saturating your machine's network bandwidth even for millions of small files (see: [demo at PyData Global 2023](https://www.youtube.com/watch?v=QEOFwptwnXQ&ab_channel=PyData))

In [3]:
# Download the content behind every URL in `path` into a new binary column
image_bytes_expr = daft.col("path").url.download()
df = df.with_column("image_bytes", image_bytes_expr)
df.show()

path Utf8,size Int64,num_rows Int64,image_bytes Binary
s3://daft-public-data/open-images/validation-images/0001eeaf4aed83f9.jpg,290621,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""..."
s3://daft-public-data/open-images/validation-images/0004886b7d043cfd.jpg,375363,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""..."
s3://daft-public-data/open-images/validation-images/000595fe6fee6369.jpg,462817,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""..."
s3://daft-public-data/open-images/validation-images/00075905539074f2.jpg,302326,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""..."
s3://daft-public-data/open-images/validation-images/0007cebe1b2ba653.jpg,970275,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""..."
s3://daft-public-data/open-images/validation-images/0007d6cf88afaa4a.jpg,614095,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""..."
s3://daft-public-data/open-images/validation-images/0008e425fb49a2bf.jpg,415082,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""..."
s3://daft-public-data/open-images/validation-images/0009bad4d8539bb4.jpg,359851,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""..."


### Reading Images

Daft makes working with opaque file formats/encodings easy:

* Native types are available for images and tensors
* Support for arbitrary Python objects in columns so you can use all your favorite Python libraries as well for datatypes not yet supported by Daft (e.g. video, audio, PDFs)

In [4]:
# Decode the raw bytes into Daft's image type
decoded_image_expr = daft.col("image_bytes").image.decode()
df = df.with_column("image", decoded_image_expr)
df.show()

path Utf8,size Int64,num_rows Int64,image_bytes Binary,image Image[MIXED]
s3://daft-public-data/open-images/validation-images/0001eeaf4aed83f9.jpg,290621,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",
s3://daft-public-data/open-images/validation-images/0004886b7d043cfd.jpg,375363,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",
s3://daft-public-data/open-images/validation-images/000595fe6fee6369.jpg,462817,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",
s3://daft-public-data/open-images/validation-images/00075905539074f2.jpg,302326,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",
s3://daft-public-data/open-images/validation-images/0007cebe1b2ba653.jpg,970275,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",
s3://daft-public-data/open-images/validation-images/0007d6cf88afaa4a.jpg,614095,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",
s3://daft-public-data/open-images/validation-images/0008e425fb49a2bf.jpg,415082,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",
s3://daft-public-data/open-images/validation-images/0009bad4d8539bb4.jpg,359851,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",


### Thumbnail creation

Easily create thumbnails for your images using the `.image.resize(...)` Daft expression.

In [5]:
# Resize every decoded image to a 32x32 thumbnail
thumbnail_expr = daft.col("image").image.resize(32, 32)
df = df.with_column("image_thumbnail", thumbnail_expr)
df.show()

path Utf8,size Int64,num_rows Int64,image_bytes Binary,image Image[MIXED],image_thumbnail Image[MIXED]
s3://daft-public-data/open-images/validation-images/0001eeaf4aed83f9.jpg,290621,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",,
s3://daft-public-data/open-images/validation-images/0004886b7d043cfd.jpg,375363,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",,
s3://daft-public-data/open-images/validation-images/000595fe6fee6369.jpg,462817,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",,
s3://daft-public-data/open-images/validation-images/00075905539074f2.jpg,302326,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",,
s3://daft-public-data/open-images/validation-images/0007cebe1b2ba653.jpg,970275,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",,
s3://daft-public-data/open-images/validation-images/0007d6cf88afaa4a.jpg,614095,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",,
s3://daft-public-data/open-images/validation-images/0008e425fb49a2bf.jpg,415082,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",,
s3://daft-public-data/open-images/validation-images/0009bad4d8539bb4.jpg,359851,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",,


### Running multimodal LLMs

Since we are running on just our laptop, we will be offloading our "heavy compute" (running the GPT-4o model on our images) to the OpenAI API.

If instead we wanted to run our own models or algorithms, Daft also lets us run on GPUs with the `df.with_column(..., resource_request=ResourceRequest(num_gpus=1))` pattern.

In [6]:
import base64
import requests
import json
import os
import boto3

# Default value of the `prompt` argument of `run_gpt4o_on_urls`
DEFAULT_PROMPT = "What’s in this image?"

# The API key comes from the environment so it is never stored in the notebook itself
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None:
    raise RuntimeError("Please specify your OpenAI API key as the environment variable `OPENAI_API_KEY`.")

# HTTP headers sent with every request to the OpenAI API
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}",
}

@daft.udf(return_dtype=daft.DataType.string())
def generate_presigned_url(s3_urls, expires_in=3600):
    """Generate presigned `get_object` URLs for a batch of `s3://` object URLs.

    Args:
        s3_urls: Batch of `s3://<bucket>/<key>` strings; must expose `to_pylist()`.
        expires_in: Lifetime of each presigned URL in seconds (default: 3600).

    Returns:
        A list with one presigned URL per input, in input order. Null inputs
        are passed through as `None`.

    Raises:
        ValueError: If a URL has no `/` separating the bucket from the key.
    """
    scheme = "s3://"
    s3_client = boto3.client("s3")
    presigned_urls = []
    for s3_url in s3_urls.to_pylist():
        if s3_url is None:
            # Keep null rows aligned with their inputs instead of crashing
            presigned_urls.append(None)
            continue
        # Remove only the leading scheme. `str.strip("s3://")` treats its argument as a
        # character set and trims "s", "3", ":" and "/" from BOTH ends, which corrupts
        # buckets such as "staging" and keys such as "photos".
        path = s3_url[len(scheme):] if s3_url.startswith(scheme) else s3_url
        bucket, key = path.split("/", 1)
        url = s3_client.generate_presigned_url(
            ClientMethod="get_object", Params={"Bucket": bucket, "Key": key}, ExpiresIn=expires_in
        )
        presigned_urls.append(url)
    return presigned_urls

@daft.udf(return_dtype=daft.DataType.string())
def run_gpt4o_on_urls(images_urls, prompt=DEFAULT_PROMPT):
    """Run the gpt-4o LLM on each image URL by making an API call to OpenAI.

    Args:
        images_urls: Batch of image URLs; must expose `to_pylist()`.
        prompt: Text sent alongside every image (default: `DEFAULT_PROMPT`).

    Returns:
        A list with one JSON-encoded string per input URL, holding the decoded
        response body of the chat-completions request for that URL.
    """
    results = []
    for url in images_urls.to_pylist():
        payload = {
            "model": "gpt-4o",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        # Send the caller-supplied prompt; it used to be hard-coded here,
                        # which silently ignored the `prompt` argument.
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": url}},
                    ],
                }
            ],
            "max_tokens": 300,
        }

        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        # Re-serialize the body so it can be stored in a string column and queried as JSON later
        results.append(json.dumps(response.json()))

    return results


In [7]:
# Turn each S3 path into a temporary presigned URL (the UDF's default expiry applies)
presigned_url_expr = generate_presigned_url(daft.col("path"))
df = df.with_column("image_urls", presigned_url_expr)

# Send every presigned URL to the OpenAI endpoint
gpt_response_expr = run_gpt4o_on_urls(daft.col("image_urls"), prompt="What’s in this image?")
df = df.with_column("gpt_results", gpt_response_expr)

# Extract the generated text from the raw JSON response
description_expr = daft.col("gpt_results").json.query(".choices[0].message.content")
df = df.with_column("description", description_expr)

df.show(3)

path Utf8,size Int64,num_rows Int64,image_bytes Binary,image Image[MIXED],image_thumbnail Image[MIXED],image_urls Utf8,gpt_results Utf8,description Utf8
s3://daft-public-data/open-images/validation-images/0001eeaf4aed83f9.jpg,290621,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",,,https://daft-public-data.s3.amazonaws.com/open-images/validation-images/0001eeaf4aed83f9.jpg?AWSAccessKeyId=ASIA5WTJMZ7Y7QE2OF4U&Signature=FYuaPydmXnUL6JvTLdBGv%2F4B5Pc%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEMj%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJGMEQCIFn7KcWE9cRDdfGxb1JaVM4DrBOp9C%2Fa0%2FIF8vVRDkM5AiB7sJ5ZhEiTFhTSQXJtEcs3JzU3Sb0IwKiKpp5RjdORHiqcAwhhEAEaDDk0MTg5MjYyMDI3MyIMRJprXMJhgCCewdCDKvkCNRb2JdZ8IV9y6KEE8n%2FgBqW%2B%2FW%2FbIG9CVNGUnDbzpbzjlsVIAoM4nTfjBYrOHy2ktIIHsFVDOKYco8qPBGdRiaA%2F1NrEUizLgrLGE2qCnOI3zeHnn6kw8PE1pTalR20rTnHW%2FBkkIx9N2rDVVMa9%2BH8i3pVtWyHwlqkr8j%2BIorXge1av9fV1qkm2NgtUey5NyAoa7PpEtYpiKAk62SbXuriL%2BWPcUhoh2TcHDQxT8m6NwDDJ5N6TtwFQIX9Qpbz2NuH5ZZVS4JFaiRfP2%2FdRZq7%2BvTWjlbj%2Fg05eUU59OdT27kLe8VjDJRl%2FbIGZcbwz6aLOKLouuRF%2B0USTtjAYHtpZ%2FOH%2FFfhV1ppso4E3AWFSyjNP%2FNkiG6w1Z1M9%2BxcCqvUvZ6w3tIhfc4Goq5ArQDYb%2BJZUVsNjXGV7D3VyeFB5437d4WR8e7PbYcSln0Ar44W0iHG5EZQwWJN7UJqi6j1NdDC3gu6rrikEFjIHuPBLklQRgTPFsHQw0O2hswY6pwHFnzVt%2F1uWxqYxv0jgdofqTz%2FGKf2ePvdzlzHoRDzKqpR8FAtAR2fFr354WPwzUjz96%2F3sF%2F1EdsJEUKSjWcxIhw9%2FZcH7VT2FGo0PqCk2iNgurxnKQcIoaWgq2SsnTis2RZDbNkyWaB2ltCbwthEqH0Sg4npbHwKn5fAe9LH5W44e4uXA9PFzmMK4oAOJlwhxzxiJ4Jvs5jdLFRwvD4t7kiyhIhAlUQ%3D%3D&Expires=1718125792,"{""id"": ""chatcmpl-9YyMF9piO5bAvHaXriwQTj1FxNcEW"", ""object"": ""chat.completion"", ""created"": 1718122195, ""model"": ""gpt-4o-2024-05-13"", ""choices"": [{""index"": 0, ""message"": {""role"": ""assistant"", ""content"": ""The image shows the apron area of an airport with three airplanes parked. In the foreground, there is an airplane from American Airlines, and in the background, there are two airplanes from Virgin Atlantic. A ground service vehicle is also visible near the American Airlines plane. 
The scene includes some airport infrastructure, such as buildings and equipment, as well as a control tower in the distant background.""}, ""logprobs"": null, ""finish_reason"": ""stop""}], ""usage"": {""prompt_tokens"": 438, ""completion_tokens"": 76, ""total_tokens"": 514}, ""system_fingerprint"": ""fp_aa87380ac5""}","""The image shows the apron area of an airport with three airplanes parked. In the foreground, there is an airplane from American Airlines, and in the background, there are two airplanes from Virgin Atlantic. A ground service vehicle is also visible near the American Airlines plane. The scene includes some airport infrastructure, such as buildings and equipment, as well as a control tower in the distant background."""
s3://daft-public-data/open-images/validation-images/0004886b7d043cfd.jpg,375363,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",,,https://daft-public-data.s3.amazonaws.com/open-images/validation-images/0004886b7d043cfd.jpg?AWSAccessKeyId=ASIA5WTJMZ7Y7QE2OF4U&Signature=XPb6RXKunqth4C2D4hlAxOP1q%2Bw%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEMj%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJGMEQCIFn7KcWE9cRDdfGxb1JaVM4DrBOp9C%2Fa0%2FIF8vVRDkM5AiB7sJ5ZhEiTFhTSQXJtEcs3JzU3Sb0IwKiKpp5RjdORHiqcAwhhEAEaDDk0MTg5MjYyMDI3MyIMRJprXMJhgCCewdCDKvkCNRb2JdZ8IV9y6KEE8n%2FgBqW%2B%2FW%2FbIG9CVNGUnDbzpbzjlsVIAoM4nTfjBYrOHy2ktIIHsFVDOKYco8qPBGdRiaA%2F1NrEUizLgrLGE2qCnOI3zeHnn6kw8PE1pTalR20rTnHW%2FBkkIx9N2rDVVMa9%2BH8i3pVtWyHwlqkr8j%2BIorXge1av9fV1qkm2NgtUey5NyAoa7PpEtYpiKAk62SbXuriL%2BWPcUhoh2TcHDQxT8m6NwDDJ5N6TtwFQIX9Qpbz2NuH5ZZVS4JFaiRfP2%2FdRZq7%2BvTWjlbj%2Fg05eUU59OdT27kLe8VjDJRl%2FbIGZcbwz6aLOKLouuRF%2B0USTtjAYHtpZ%2FOH%2FFfhV1ppso4E3AWFSyjNP%2FNkiG6w1Z1M9%2BxcCqvUvZ6w3tIhfc4Goq5ArQDYb%2BJZUVsNjXGV7D3VyeFB5437d4WR8e7PbYcSln0Ar44W0iHG5EZQwWJN7UJqi6j1NdDC3gu6rrikEFjIHuPBLklQRgTPFsHQw0O2hswY6pwHFnzVt%2F1uWxqYxv0jgdofqTz%2FGKf2ePvdzlzHoRDzKqpR8FAtAR2fFr354WPwzUjz96%2F3sF%2F1EdsJEUKSjWcxIhw9%2FZcH7VT2FGo0PqCk2iNgurxnKQcIoaWgq2SsnTis2RZDbNkyWaB2ltCbwthEqH0Sg4npbHwKn5fAe9LH5W44e4uXA9PFzmMK4oAOJlwhxzxiJ4Jvs5jdLFRwvD4t7kiyhIhAlUQ%3D%3D&Expires=1718125792,"{""id"": ""chatcmpl-9YyMKH8QdViBtsU4higfq4arJV3SP"", ""object"": ""chat.completion"", ""created"": 1718122200, ""model"": ""gpt-4o-2024-05-13"", ""choices"": [{""index"": 0, ""message"": {""role"": ""assistant"", ""content"": ""The image depicts a collection of ornate, golden items. These appear to include intricately designed boxes or frames, and possibly some decorative trays or mirrors. The craftsmanship of the items is detailed, featuring ornate patterns and designs that suggest they may be vintage or antique pieces. 
The overall setting seems to reflect opulence or a display of fine decorative items, possibly within an antique shop or a private collection.""}, ""logprobs"": null, ""finish_reason"": ""stop""}], ""usage"": {""prompt_tokens"": 778, ""completion_tokens"": 79, ""total_tokens"": 857}, ""system_fingerprint"": ""fp_aa87380ac5""}","""The image depicts a collection of ornate, golden items. These appear to include intricately designed boxes or frames, and possibly some decorative trays or mirrors. The craftsmanship of the items is detailed, featuring ornate patterns and designs that suggest they may be vintage or antique pieces. The overall setting seems to reflect opulence or a display of fine decorative items, possibly within an antique shop or a private collection."""
s3://daft-public-data/open-images/validation-images/000595fe6fee6369.jpg,462817,,"b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01""...",,,https://daft-public-data.s3.amazonaws.com/open-images/validation-images/000595fe6fee6369.jpg?AWSAccessKeyId=ASIA5WTJMZ7Y7QE2OF4U&Signature=CczEwjR8H3FoP%2FwpQ%2F6UR33cmyA%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEMj%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJGMEQCIFn7KcWE9cRDdfGxb1JaVM4DrBOp9C%2Fa0%2FIF8vVRDkM5AiB7sJ5ZhEiTFhTSQXJtEcs3JzU3Sb0IwKiKpp5RjdORHiqcAwhhEAEaDDk0MTg5MjYyMDI3MyIMRJprXMJhgCCewdCDKvkCNRb2JdZ8IV9y6KEE8n%2FgBqW%2B%2FW%2FbIG9CVNGUnDbzpbzjlsVIAoM4nTfjBYrOHy2ktIIHsFVDOKYco8qPBGdRiaA%2F1NrEUizLgrLGE2qCnOI3zeHnn6kw8PE1pTalR20rTnHW%2FBkkIx9N2rDVVMa9%2BH8i3pVtWyHwlqkr8j%2BIorXge1av9fV1qkm2NgtUey5NyAoa7PpEtYpiKAk62SbXuriL%2BWPcUhoh2TcHDQxT8m6NwDDJ5N6TtwFQIX9Qpbz2NuH5ZZVS4JFaiRfP2%2FdRZq7%2BvTWjlbj%2Fg05eUU59OdT27kLe8VjDJRl%2FbIGZcbwz6aLOKLouuRF%2B0USTtjAYHtpZ%2FOH%2FFfhV1ppso4E3AWFSyjNP%2FNkiG6w1Z1M9%2BxcCqvUvZ6w3tIhfc4Goq5ArQDYb%2BJZUVsNjXGV7D3VyeFB5437d4WR8e7PbYcSln0Ar44W0iHG5EZQwWJN7UJqi6j1NdDC3gu6rrikEFjIHuPBLklQRgTPFsHQw0O2hswY6pwHFnzVt%2F1uWxqYxv0jgdofqTz%2FGKf2ePvdzlzHoRDzKqpR8FAtAR2fFr354WPwzUjz96%2F3sF%2F1EdsJEUKSjWcxIhw9%2FZcH7VT2FGo0PqCk2iNgurxnKQcIoaWgq2SsnTis2RZDbNkyWaB2ltCbwthEqH0Sg4npbHwKn5fAe9LH5W44e4uXA9PFzmMK4oAOJlwhxzxiJ4Jvs5jdLFRwvD4t7kiyhIhAlUQ%3D%3D&Expires=1718125792,"{""id"": ""chatcmpl-9YyMQg1KK0zOu9k81Sr9SyOt2R8F5"", ""object"": ""chat.completion"", ""created"": 1718122206, ""model"": ""gpt-4o-2024-05-13"", ""choices"": [{""index"": 0, ""message"": {""role"": ""assistant"", ""content"": ""The image shows a branch of a plant with numerous red berries. The leaves are green, glossy, and somewhat elliptical. The red berries are small, round, and tightly clustered along the stems. The overall appearance is indicative of a plant commonly associated with the winter season or the holiday period, such as holly. 
However, without more specific details, it is difficult to definitively identify the plant.""}, ""logprobs"": null, ""finish_reason"": ""stop""}], ""usage"": {""prompt_tokens"": 778, ""completion_tokens"": 80, ""total_tokens"": 858}, ""system_fingerprint"": ""fp_aa87380ac5""}","""The image shows a branch of a plant with numerous red berries. The leaves are green, glossy, and somewhat elliptical. The red berries are small, round, and tightly clustered along the stems. The overall appearance is indicative of a plant commonly associated with the winter season or the holiday period, such as holly. However, without more specific details, it is difficult to definitively identify the plant."""


In [9]:
# NOTE: this cell rebinds `df`. Running it a second time fails, because `image_thumbnail`
# is then already Binary and can no longer be image-encoded.

# Small multimodal data (such as thumbnails or full-form text) can be written inline
thumbnail_jpeg = daft.col("image_thumbnail").image.encode("JPEG")

df = df.select(
    "path",  # larger multimodal data (such as large images or documents) can be written as URLs
    thumbnail_jpeg,
    "size",  # metadata such as size in bytes and descriptions is stored as per normal
    "description",
)

DaftCoreException: DaftError::External Unable to create logical plan node.
Due to: DaftError::TypeError ImageEncode can only encode ImageArrays and FixedShapeImageArrays, got image_thumbnail#Binary

In [10]:
# Display the dataframe to check the columns that were selected
df

path Utf8,image_thumbnail Binary,size Int64,description Utf8


In [11]:
# Cap the table at 8 rows to save your OpenAI bill...
MAX_ROWS = 8
df = df.limit(MAX_ROWS)

# Write the result out as a Delta Lake table at `my_table.delta_lake`
df.write_delta("my_table.delta_lake")

                                                                                                                                                                  

## Now we have our "Multimodal Data Lake"!

1. Thumbnails readily available for visualization
2. URLs available for access to the raw data
3. Extracted metadata (`description`) available for querying

In [12]:
# Read the Delta Lake table at `my_table.delta_lake` back into a dataframe and display it
read_df = daft.read_deltalake("my_table.delta_lake")
read_df

path Utf8,image_thumbnail Binary,size Int64,description Utf8


In [13]:
# Decode the stored thumbnail bytes back into images, then keep only the rows
# whose description contains "dog"
read_df = (
    read_df
    .with_column("image_thumbnail", daft.col("image_thumbnail").image.decode())
    .where(daft.col("description").str.contains("dog"))
)

In [14]:
# Run the query defined above and display the resulting rows
read_df.collect()

                                                                                                                                                                  

path Utf8,size Int64,description Utf8,image_thumbnail Image[MIXED]
s3://daft-public-data/open-images/validation-images/0007cebe1b2ba653.jpg,970275,"""The image depicts a black and white dog, likely a Border Collie, energetically running on a grassy area in a park. In the background, a person wearing pink pants is visible, suggesting they might be walking or playing with the dog. Some trees and a large stick on the ground are also visible in the scene.""",
s3://daft-public-data/open-images/validation-images/0007d6cf88afaa4a.jpg,614095,"""The image shows a dog lying on the grass in an outdoor setting. The dog has a reddish-brown and black coat. The surrounding area consists of grass and some wildflowers in the background.""",
s3://daft-public-data/open-images/validation-images/0008e425fb49a2bf.jpg,415082,"""The image shows a German Shepherd dog standing on green grass with a gray wooden fence in the background. The dog appears to be panting, with its tongue hanging out. The lighting suggests it is a bright and sunny day.""",
