# Training on the COCO Dataset made simple with Daft

What is the COCO Dataset?

COCO (Common Objects in Context) is an image dataset with labels and annotations:

![coco-example.png](https://cocodataset.org/images/coco-examples.jpg)

With Daft, we can run data querying/processing really easily for:

1. Model Training
2. Model Evaluation
3. Dataset curation
4. Data exploration/understanding

In [None]:
from daft import DataFrame

In [None]:
# !aws sso login

## Connecting Daft to Ray

In [None]:
import daft

# Toggle for running against a remote Ray cluster. When False, RAY_ADDRESS stays None.
USE_REMOTE_CLUSTER = False
# Ray client address used for remote runs.
# NOTE(review): localhost:10001 presumably relies on a port-forward to the cluster head — confirm.
RAY_ADDRESS = "ray://localhost:10001" if USE_REMOTE_CLUSTER else None

In [None]:
import ray

# Only remote runs need an explicit connection: connect to the cluster at
# RAY_ADDRESS and pass the pip packages this notebook relies on as the
# runtime environment, then print the resources Ray reports.
if USE_REMOTE_CLUSTER:
    _worker_pip_packages = [
        "getdaft",
        "pillow",
        "s3fs",
        "torch",
        "torchvision",
        "IPython",
    ]
    ray.init(address=RAY_ADDRESS, runtime_env={"pip": _worker_pip_packages})
    print(ray.available_resources())

In [None]:
daft.context.set_runner_ray(address=RAY_ADDRESS)

## Read some data (stored as Parquet)

In [None]:
# Load the COCO 2017 tables from Parquet files on S3: one for the images,
# one for the annotations.
images_df = DataFrame.read_parquet("s3://daft-public-data/coco-2017-parquet/images.parquet")
annotations_df = DataFrame.read_parquet("s3://daft-public-data/coco-2017-parquet/annotations.parquet")

In [None]:
annotations_df

In [None]:
annotations_df.show()

# Data Querying

Let's get a rough sense of how the data is distributed by counting the number of rows for each `category_id`.

In [None]:
# Count the annotations in each category and show the 10 largest counts.
(
    annotations_df
    .groupby("category_id")
    .agg([(annotations_df["category_id"].alias("count"), "count")])
    .sort("count", desc=True)
    .show(10)
)

In [None]:
annotations_df = annotations_df.where(annotations_df["category_id"] < 10)

In [None]:
annotations_df = annotations_df.select("category_id", "bbox", "image_id")

In [None]:
annotations_df.collect()

# Under the hood

How does this run on Ray? Let's take a look at Daft's "logical plan".

In [None]:
annotations_df.explain()

Daft translates this Logical Plan into a series of Ray function calls to execute the plan, from reading files to running functions and models.

## Join Annotations with Images

In [None]:
# Match every annotation to its image (annotation `image_id` == image `id`),
# then keep only the four columns used by the rest of the notebook.
joined_df = annotations_df.join(images_df, left_on="image_id", right_on="id")
joined_df = joined_df.select(
    *[joined_df[column] for column in ("id", "coco_url", "bbox", "category_id")]
)

In [None]:
joined_df.collect()

## Repartitioning

We repartition our dataframe into 64 partitions, so that Daft can operate on each partition in parallel, using Ray as its resource scheduler.

In [None]:
# Report the partition count before and after repartitioning.
print(f"Before: `joined_df` has {joined_df.num_partitions()} partitions.")

# Repartition into 64 partitions and collect the result.
joined_df = joined_df.into_partitions(64).collect()

print(f"After: `joined_df` has {joined_df.num_partitions()} partitions.")

In [None]:
# Local runs are capped at 128 rows (presumably to keep the demo fast on one machine).
if not USE_REMOTE_CLUSTER:
    joined_df = joined_df.limit(128)

print(f"Dataframe has: {joined_df.count_rows()} rows")

# Visualizing Data

## Downloading data

In [None]:
joined_df = joined_df.with_column(
    "image_bytes",
    joined_df["coco_url"].url.download(),
)

In [None]:
joined_df.show()

## Load images from bytes

In [None]:
import io
import PIL.Image
import numpy as np

def bytes_to_pil(b: bytes) -> PIL.Image.Image:
    """Open raw image bytes as a PIL image; a ``None`` input yields ``None``."""
    if b is None:
        return None
    return PIL.Image.open(io.BytesIO(b))

In [None]:
joined_df = joined_df.with_column(
    "pil_image",
    joined_df["image_bytes"].apply(bytes_to_pil),
)

In [None]:
joined_df.show()

# Data Preprocessing

## Crop images

In [None]:
from daft import udf
from daft.types import ExpressionType
import numpy as np


@udf(
    return_dtype=ExpressionType.python(PIL.Image.Image),
    input_columns={"images": list, "bboxes": list},
)
def crop_images(images, bboxes):
    """Crop each image to its bounding box and resize the crop to 32x32.

    ``images`` and ``bboxes`` are walked in lockstep; every bbox is unpacked
    as ``[x, y, w, h]`` and turned into the ``(x, y, x + w, y + h)`` box
    passed to ``crop``. Returns the list of resized crops.
    """
    cropped = []
    for image, bbox in zip(images, bboxes):
        left, top, width, height = bbox
        region = image.crop((left, top, left + width, top + height))
        cropped.append(region.resize((32, 32)))
    return cropped


joined_df = joined_df.with_column(
    "cropped_img",
    crop_images(joined_df["pil_image"], joined_df["bbox"]),
)

## Convert image to numpy

In [None]:
def img_to_numpy(img) -> np.ndarray:
    """Convert a 32x32 image into an array of shape ``(32, 32, channels)``.

    A single-channel image is expanded to three channels by repeating that
    channel; any other channel count is returned as-is.
    """
    pixels = np.array(img).reshape((32, 32, -1))
    is_single_channel = pixels.shape == (32, 32, 1)
    return pixels.repeat(3, axis=2) if is_single_channel else pixels

joined_df = joined_df.with_column(
    "np_arr",
    joined_df["cropped_img"].apply(img_to_numpy),
)

In [None]:
joined_df.show()

# Daft DataFrame ➔ Ray Datasets

In [None]:
train_dataset = joined_df.select(
    joined_df["np_arr"].alias("image"),
    joined_df["category_id"].alias("label"),
).to_ray_dataset()

---

# Now... It's just Ray!

Training code adapted from Ray AIR's image classification tutorial: https://docs.ray.io/en/latest/ray-air/examples/torch_image_example.html

In [None]:
import ray

###
# Neural Network definition
###

import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Small convolutional classifier for 3-channel 32x32 images.

    Args:
        num_classes: Number of output logits produced by the final layer.
            Defaults to 10, which matches the original hard-coded size.
    """

    def __init__(self, num_classes: int = 10):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # Two rounds of (5x5 conv, 2x2 max-pool) shrink a 32x32 input to
        # 16 feature maps of 5x5 (32 -> 28 -> 14 -> 10 -> 5).
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x):
        """Return raw class logits of shape ``(batch, num_classes)``."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
###
# Train Loop
###
    
from ray import train
from ray.air import session, Checkpoint
from ray.train.torch import TorchCheckpoint
import torch.nn as nn
import torch.optim as optim
import torchvision


def train_loop_per_worker(config):
    """Train ``Net`` on this worker's shard of the ``"train"`` dataset.

    Args:
        config: Training configuration dict. ``"batch_size"`` is required.
            ``"num_epochs"`` (default 2) and ``"log_every"`` (default 2000,
            the number of mini-batches between loss printouts) are optional.

    At the end of every epoch the worker reports ``running_loss`` (the loss
    accumulated since the last printout) together with a checkpoint built
    from the model's state dict.
    """
    num_epochs = config.get("num_epochs", 2)
    log_every = config.get("log_every", 2000)

    model = train.torch.prepare_model(Net())

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    train_dataset_shard = session.get_dataset_shard("train")

    for epoch in range(num_epochs):
        running_loss = 0.0
        # A fresh batch iterator is created for every epoch.
        train_dataset_batches = train_dataset_shard.iter_torch_batches(
            batch_size=config["batch_size"], device=train.torch.get_device()
        )
        for i, batch in enumerate(train_dataset_batches):
            # get the inputs and labels
            inputs, labels = batch["image"], batch["label"]

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics every `log_every` mini-batches, then reset the accumulator
            running_loss += loss.item()
            if i % log_every == log_every - 1:
                print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / log_every:.3f}")
                running_loss = 0.0

        metrics = dict(running_loss=running_loss)
        checkpoint = TorchCheckpoint.from_state_dict(model.state_dict())
        session.report(metrics, checkpoint=checkpoint)

###
# Preprocessing pipeline
###
        
from ray.data.preprocessors import TorchVisionPreprocessor
import torchvision.transforms as transforms

# Per-image transform: convert to a tensor, then normalize each of the three
# channels with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)
# Apply the transform to the "image" column of the dataset.
preprocessor = TorchVisionPreprocessor(columns=["image"], transform=transform)

###
# Trainer definition
###

from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig, RunConfig
from ray.tune.syncer import SyncConfig

# Local runs use 2 training workers; the remote cluster uses 16.
num_workers = 16 if USE_REMOTE_CLUSTER else 2

# With `use_gpu=True` every training worker requests its own GPU, so only
# enable GPU training when the cluster has at least one GPU per worker.
# (A fixed threshold of 2 would request 16 GPUs on a remote cluster that may
# only have a handful.)
use_gpu = ray.available_resources().get("GPU", 0) >= num_workers

# Only remote runs get a RunConfig, which sets an S3 upload directory.
run_config = None
if USE_REMOTE_CLUSTER:
    run_config = RunConfig(sync_config=SyncConfig(upload_dir="s3://eventual-dev-scratch/ray-meetup-demo/"))

trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config={"batch_size": 8 if not USE_REMOTE_CLUSTER else 32},
    datasets={"train": train_dataset},
    scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    preprocessor=preprocessor,
    run_config=run_config,
)
result = trainer.fit()
latest_checkpoint = result.checkpoint

In [None]:
latest_checkpoint

**Now, use the model weights in `latest_checkpoint` to evaluate the train dataset!**

In [None]:
from ray.train.torch import TorchPredictor
from ray.train.batch_predictor import BatchPredictor

# Local run: build a BatchPredictor from the training checkpoint and run it
# over the training dataset, keeping the "label" column next to the predictions.
if not USE_REMOTE_CLUSTER:
    batch_predictor = BatchPredictor.from_checkpoint(
        checkpoint=latest_checkpoint,
        predictor_cls=TorchPredictor,
        model=Net(),
    )
    model_eval_results_ds: ray.data.Dataset = batch_predictor.predict(
        data=train_dataset,
        dtype=torch.float,
        feature_columns=["image"],
        keep_columns=["label"],
        # We will use GPU if available.
        # NOTE(review): this passes the total GPU count reported by Ray as the
        # per-worker GPU request — confirm that is intended.
        num_gpus_per_worker=ray.available_resources().get("GPU", 0)
    )

# Checkpoints seem to be broken for remote execution, so we write our own custom code to perform inference.
else:
    import pyarrow as pa
    import pandas as pd
    from ray.data.extensions.tensor_extension import ArrowTensorArray

    # `predict` closes over the checkpoint URI string rather than the checkpoint object.
    uri = latest_checkpoint.uri
    def predict(batch: pd.DataFrame) -> pa.Table:
        """Run the trained model on one batch; return its outputs alongside the labels.

        NOTE(review): `map_batches` below is called with batch_format="pyarrow", so
        `batch` is presumably a pyarrow Table rather than the annotated pandas
        DataFrame — confirm and adjust the annotation if so.
        """
        # The checkpoint is loaded from `uri` again on every batch.
        checkpoint = Checkpoint.from_uri(uri)
        model = checkpoint.get_model(Net())
        # Images are stacked into one array and permuted from (N, H, W, C) to
        # (N, C, H, W) before being fed to the model.
        # NOTE(review): unlike training, which goes through `preprocessor`
        # (ToTensor + Normalize), this path feeds the raw pixel values — confirm
        # whether the preprocessing should be applied here as well.
        return pa.Table.from_pydict({
            "predictions": ArrowTensorArray.from_numpy([one_hot.detach().numpy() for one_hot in model(torch.Tensor(np.array(list(batch["image"]))).permute(0, 3, 1, 2))]),
            "label": batch["label"],
        })

    model_eval_results_ds = train_dataset.map_batches(predict, batch_format="pyarrow")

In [None]:
model_eval_results_ds.schema()

---

# Ray Datasets ➔ Daft

In [None]:
predictions_df = DataFrame.from_ray_dataset(model_eval_results_ds)

In [None]:
predictions_df.collect()

In [None]:
# Replace each prediction vector with the index of its largest entry,
# i.e. the predicted label.
predictions_df = predictions_df.with_column(
    "predictions",
    predictions_df["predictions"].apply(lambda x: x.argmax(), return_dtype=int)
)
predictions_df.show()

In [None]:
# Add a "correct" column: the comparison of predicted and true label, cast to int
# so that it can be summed per label afterwards.
predictions_df = predictions_df.with_column(
    "correct",
    (predictions_df["predictions"] == predictions_df["label"]).cast(int)
)
predictions_df.show()

In [None]:
predictions_df.groupby("label").sum("correct").sort("label").show(10)