# Daft Demo

## This demo of Daft covers the following:
- Initializing our cluster
- Daft Data Repos
- Use a Python Dataclass to define a Schema
- Load existing data from our Data Repos
- Write a function to download data from the web
- Write a function to decode and resize the image
- Write our own Schema for image storage
- Save downloaded images to the cloud
- Write our own embedding extractor for batch inference
- Save our embeddings to a data repo
- Preview our Schema

## Initializing our cluster

In [None]:
import daft

# Initialize Daft against a Ray cluster at the given address.
# NOTE(review): the address points at localhost:10001 — presumably a Ray client
# endpoint that is already running or port-forwarded; confirm before running.
daft.init(ray_address="ray://localhost:10001")

## Daft Data Repos

In [None]:
from daft import datarepo

# Obtain a data repo client and list the ids it knows about.
# `datarepo_client` is reused by later cells to open and create data repos.
datarepo_client = datarepo.get_client()
datarepo_client.list_ids()

## Defining our Own Schema
- ORM for binary data
- Translates to Parquet under the hood
- Support for logical types such as images, NumPy arrays, and any other types that you define yourself

In [None]:
import dataclasses
from daft import dataclass

@dataclass
class OpenImagesMetadata:
    """Row schema used in this notebook to query the "openimages-dc-8000-v6" data repo."""

    # URL of the image; this column is later fed to `download_batch`.
    url: str
    # NOTE(review): meaning/unit not shown here — presumably the file size in bytes; confirm.
    size: int
    # NOTE(review): presumably the Open Images identifier of the record; confirm.
    id: str

## Reading Data from our Data Repo

In [None]:
# Bind the repo handle to its own name: assigning it to `datarepo` would shadow
# the `daft.datarepo` module imported earlier in this notebook, so any later
# `datarepo.get_client()` call would fail until the import cell is re-run.
openimages_datarepo = datarepo_client.from_id("openimages-dc-8000-v6")

# Build a query over the repo whose rows follow the OpenImagesMetadata schema.
query = openimages_datarepo.query(OpenImagesMetadata)

In [None]:
query

## Previewing our Data

In [None]:
# Restrict the query to 5 rows, print the resulting query object, then run it
# and display the result.
# NOTE(review): the query is limited to 5 rows but `show(4)` is asked for 4 —
# confirm whether the mismatch is intentional.
sample = query.limit(5)
print(sample)
sample.execute().show(4)

In [None]:
import concurrent.futures
import requests
from typing import List

from daft.datarepo.query import functions as F

def download_single(url: str, timeout: float = 30.0) -> bytes:
    """Fetch ``url`` over HTTP and return the response body.

    This is a best-effort download: any failure yields an empty payload
    instead of raising, so one bad URL cannot abort a whole batch.

    Args:
        url: Address of the resource to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The raw response content when the server answers with status 200,
        otherwise ``b""`` (non-200 status, timeout, or connection error).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Covers connection errors, timeouts, invalid URLs, etc.
        return b""
    if response.status_code != 200:
        return b""
    return response.content

@F.batch_func(batch_size=64)
def download_batch(batch: List[str]) -> List[bytes]:
    """Download all URLs of ``batch`` concurrently on a thread pool.

    Each URL is handed to ``download_single``; the returned list holds one
    payload per URL, in the same order as the input.
    """
    pool = concurrent.futures.ThreadPoolExecutor()
    with pool:
        # Materialize the lazy map result while the pool is still open.
        return list(pool.map(download_single, batch))


### Download via a batched map

In [None]:
%%time

# Add a "payload" column obtained by applying `download_batch` to the "url" column.
# NOTE(review): queries in this notebook are run via a separate `.execute()`
# call, so this line presumably only builds the plan — confirm.
with_downloaded_column = query.with_column("payload", download_batch("url"))

In [None]:
with_downloaded_column

## Decode and Resize our downloaded images

In [None]:
import PIL.Image
import io

@F.func
def resized_pil_image(payload: bytes, size: int = 256) -> PIL.Image.Image:
    """Decode ``payload`` into a square RGB PIL image of side ``size``.

    If the bytes cannot be decoded, resized or converted, a blank RGB image
    of the same dimensions is returned instead of raising.
    """
    target = (size, size)
    with io.BytesIO(payload) as buffer:
        try:
            return PIL.Image.open(buffer).resize(target).convert("RGB")
        except Exception:
            # Best-effort: substitute a placeholder image for bad payloads.
            return PIL.Image.new("RGB", target)

In [None]:
# Add an "img" column by applying `resized_pil_image` to the "payload" column
# (the `size` argument is left at its default).
with_pil_column = with_downloaded_column.with_column("img", resized_pil_image("payload"))

In [None]:
with_pil_column

### Let's look at our images

In [None]:
%%time

# Execute the query for 5 rows, pull them into `items`, and display the
# "img" field of the first row as the cell output.
ds = with_pil_column.limit(5).execute()
items = ds.take(5)
items[0].img

### Defining our function for Batch Inference

In [None]:
from typing import Tuple

import numpy as np
import torch
import torchvision


@F.batch_func(batch_size=8)
class BatchInferModel:
    """Callable that turns batches of PIL images into ResNet-18 embeddings.

    The model and transforms are built once in ``__init__``; each call then
    embeds one batch of images. The embedding is the output of the network's
    ``avgpool`` node, flattened to one row per image.
    """

    def __init__(self):
        """Load the pretrained model and set up the image-to-tensor transform."""
        self.model_name = "resnet18"
        model = torchvision.models.resnet18(pretrained=True).eval()
        # Expose the `avgpool` node's output under the key "embedding".
        self.feature_extractor = torchvision.models.feature_extraction.create_feature_extractor(
            model=model,
            return_nodes={"avgpool": "embedding"},
        )
        # PIL image -> float tensor -> per-channel normalization.
        self.to_tensor = torchvision.transforms.Compose(
            [
                torchvision.transforms.ToTensor(),
                torchvision.transforms.Normalize(
                    mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225],
                ),
            ]
        )

    def prepare_batch(self, image_data: List[PIL.Image.Image]) -> torch.Tensor:
        """Convert each PIL image to a normalized tensor and stack them.

        All images must have identical dimensions, otherwise ``torch.stack``
        raises.
        """
        return torch.stack([self.to_tensor(img) for img in image_data])

    def __call__(self, image_data: List[PIL.Image.Image]) -> List[np.ndarray]:
        """Compute one embedding per image.

        Args:
            image_data: Images of one batch.

        Returns:
            A list with one array per input image, in input order. Each array
            is a single row of the batch embedding matrix, i.e. it has shape
            ``(1, D)``. An empty batch yields an empty list.
        """
        n_images = len(image_data)
        if n_images == 0:
            # torch.stack cannot handle an empty list; nothing to embed.
            return []
        with torch.no_grad():
            tensor = self.prepare_batch(image_data)
            # Flatten the pooled feature map to one row per image.
            embedding = self.feature_extractor(tensor.float())["embedding"].view(n_images, -1)
            np_embedding = embedding.cpu().numpy()
            # Splitting at rows 1..n-1 gives n arrays that each keep the
            # leading axis (shape (1, D)).
            return np.vsplit(np_embedding, np.arange(1, n_images))

## Running large-scale batch inference

In [None]:
%%time

# Extend the image query with the embedding and three summary columns
# (mean, standard deviation and second-axis length of each embedding array).
with_embeddings = (
    with_pil_column
    .with_column("embedding", BatchInferModel("img"))
    .with_column("mean", F.func(lambda e: e.mean(), return_type=float)("embedding"))
    .with_column("std", F.func(lambda e: e.std(), return_type=float)("embedding"))
    .with_column("dim", F.func(lambda e: e.shape[1], return_type=int)("embedding"))
)

In [None]:
with_embeddings

## Save our extracted embeddings to the cloud in Parquet

In [None]:
# Create Datarepo if it does not exist

@dataclass
class ProcessedEmbedding:
    """Row schema for the "open-images-8k-processed-embeddings" data repo."""

    # URL of the source image.
    url: str
    # Length of the embedding's second axis (column "dim" of the query).
    dim: int
    # Mean of the embedding values (column "mean" of the query).
    mean: float
    # Standard deviation of the embedding values (column "std" of the query).
    std: float
    # The embedding array produced by BatchInferModel.
    embedding: np.ndarray

# Create the target data repo with the ProcessedEmbedding schema.
# NOTE(review): `exists_ok=True` presumably makes this a no-op returning the
# existing repo when the id is already taken — confirm.
embeddings_datarepo = datarepo_client.create("open-images-8k-processed-embeddings", ProcessedEmbedding, exists_ok=True)


In [None]:
# Build the query that writes the embedding rows into `embeddings_datarepo`
# using the ProcessedEmbedding schema. The write itself happens when the
# query is executed in a later cell.
# NOTE(review): mode="overwrite" presumably replaces existing repo contents,
# and rows_per_partition=128 presumably caps rows per written file — confirm.
write_query = with_embeddings.write_datarepo(
    embeddings_datarepo,
    ProcessedEmbedding,
    mode="overwrite",
    rows_per_partition=128,
)

# Display the query object as the cell output.
write_query

In [None]:
%%time

ds = write_query.execute()

## Take a look at what was written

In [None]:
embeddings_datarepo.schema()

In [None]:
# Re-open the embeddings repo by id, read back 5 rows with the
# ProcessedEmbedding schema, and display the first one.
embeddings_datarepo = datarepo_client.from_id("open-images-8k-processed-embeddings")
ds = embeddings_datarepo.query(ProcessedEmbedding).limit(5).execute()
ds.take(1)