In [None]:
import pyarrow as pa

from daft.dataframe import DataFrame
from daft.expressions import col, udf
from daft.runners.blocks import DataBlock

from typing import List
import concurrent.futures
import threading
import PIL.Image
import boto3
import io

In [None]:
# Remote gzip-compressed CSV that the download step below fetches.
CSV_S3 = "s3://amazon-berkeley-objects/images/metadata/images.csv.gz"
# Local path where the downloaded archive is stored.
CSV_GZ = "/tmp/berkeley_images.csv.gz"
# Local path of the decompressed CSV that the DataFrame is later built from.
CSV = "/tmp/berkeley_images.csv"

import gzip
import shutil
import pathlib
import os
from daft.filesystem import get_filesystem_from_path

fs = get_filesystem_from_path(CSV_S3)

# Download the archive only once. The object is streamed to disk rather than
# buffered in memory, and it is written under a temporary name first so that
# an interrupted download can never leave a truncated file that the
# existence check below would accept on later runs.
if not os.path.exists(CSV_GZ):
    partial_gz = CSV_GZ + ".part"
    with fs.open(CSV_S3) as f, open(partial_gz, 'wb') as f_gz:
        shutil.copyfileobj(f, f_gz)
    os.replace(partial_gz, CSV_GZ)

# Decompress the archive to a plain CSV (redone on every run, overwriting CSV).
with gzip.open(CSV_GZ, 'rb') as f_in, open(CSV, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [None]:
# Build a DataFrame from the decompressed CSV, limited to 20 rows to keep the demo small.
df = DataFrame.from_csv(CSV).limit(20)

In [None]:
# Show the schema of the DataFrame built from the CSV.
df.schema()

In [None]:
# Add an "area" column computed as the product of the "height" and "width" columns.
df = df.with_column("area", col("height") * col("width"))

In [None]:
# Show the schema again; it should now include the "area" column.
df.schema()

In [None]:
# Convert to a pandas DataFrame so the notebook displays the rows.
df.to_pandas()

In [None]:
import pandas as pd

@udf(return_type=str)
def full_url(paths):
    """Turn relative image paths into full S3 URLs.

    Each element of ``paths`` is appended to the bucket's ``images/small/``
    prefix.

    Returns:
        A pandas Series holding one URL string per input path, in input order.
    """
    prefix = "s3://amazon-berkeley-objects/images/small/"
    urls = []
    for relative_path in paths:
        urls.append(prefix + relative_path)
    return pd.Series(urls)

# Add an "s3_url" column by applying the full_url UDF to the "path" column.
df = df.with_column("s3_url", full_url(col("path")))

In [None]:
# Display the rows including the new "s3_url" column.
# NOTE(review): df was already limited to 20 rows when it was created (assuming the
# cells above ran in order), so this extra limit(20) looks redundant but harmless.
df.limit(20).to_pandas()

In [None]:
@udf(return_type=PIL.Image.Image)
def download_batch(batch) -> List[PIL.Image.Image]:
    """Download a batch of S3 objects concurrently and decode them as RGB images.

    Args:
        batch: Iterable of ``s3://<bucket>/<key>`` URL strings.

    Returns:
        A list of ``PIL.Image.Image`` objects converted to RGB mode, one per
        input URL and in the same order as ``batch``.
    """
    # Created once per batch and shared by every download_single call, so each
    # worker thread builds its boto3 client a single time and then reuses it.
    # (Creating the threading.local inside download_single would hand every
    # call a fresh, empty object and defeat the per-thread cache.)
    local = threading.local()

    def download_single(obj: str) -> bytes:
        """Fetch one ``s3://`` object and return its raw bytes."""
        s3 = getattr(local, "s3_client", None)
        if s3 is None:
            s3 = local.s3_client = boto3.session.Session().client('s3')
        bucket, key = obj.replace("s3://", "").split("/", maxsplit=1)
        response = s3.get_object(Bucket=bucket, Key=key)
        body = response["Body"]
        try:
            return body.read()
        finally:
            # Always release the HTTP stream, even if the read fails.
            body.close()

    # executor.map yields results in input order, so images line up with batch.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        byte_contents = list(executor.map(download_single, batch))

    images = []
    for payload in byte_contents:
        with io.BytesIO(payload) as f:
            # convert() produces a new image, so nothing depends on the
            # buffer once it is closed.
            images.append(PIL.Image.open(f).convert("RGB"))
    return images

In [None]:
# Add an "image" column by applying the download_batch UDF to the "s3_url" column.
df = df.with_column("image", download_batch(col("s3_url")))

In [None]:
# Display the DataFrame object itself (its notebook repr).
df

In [None]:
# Convert to pandas to display the rows, now including the "image" column.
df.to_pandas()

In [None]:
# Add an "img_bnw" column by calling .convert("1") on the "image" column, which is
# exposed as PIL.Image.Image through as_py. In PIL, mode "1" is 1-bit black and white.
# NOTE(review): presumably as_py forwards the method call to each row's object — confirm.
df = df.with_column("img_bnw", col("image").as_py(PIL.Image.Image).convert("1"))

In [None]:
# Show the schema again after adding the "image" and "img_bnw" columns.
df.schema()

In [None]:
# Keep the pandas conversion in a variable so individual values can be inspected below.
pddf = df.to_pandas()

In [None]:
# Display the "image" value at index 0.
pddf["image"][0]

In [None]:
# Display the "img_bnw" value at index 0, for comparison with the image above.
pddf["img_bnw"][0]

In [None]:
# NOTE(review): this cell reads like a design sketch rather than runnable code.
# HTTPEndpoint, schema, aws_lambda and run_model are not imported or defined in
# any cell visible in this notebook, so running it as-is would raise NameError.
# TODO confirm whether these are meant to come from elsewhere.
http_endpoint = HTTPEndpoint(schema, backend=aws_lambda)

# Pipeline: read rows from the endpoint, download the image for each "url" value,
# run the model on the images, and write the "results" column back to the endpoint.
df = DataFrame.from_endpoint(http_endpoint)
df = df.with_column("image", download_batch(col("url")))
df = df.with_column("results", run_model(col("image")))
df.select(col("results")).write_endpoint(http_endpoint)

# Deploy the endpoint under the name "my_endpoint".
http_endpoint.deploy("my_endpoint")