# Hugging Face CIFAR-100 Embeddings Example

In this notebook we will see how to use a pre-trained Vision Transformer (ViT) model to collect embeddings on the CIFAR-100 dataset.

This notebook demonstrates:

- Registering the `CIFAR-100` dataset from Hugging Face.
- Computing image embeddings with `transformers` and reducing them to 2D with UMAP.
- Adding the computed embeddings as metrics to a 3LC `Run`.

In [None]:
# --- Notebook parameters ---
# 3LC project that the tables and the run are created under.
PROJECT_NAME = "CIFAR-100 Embeddings"

# Names given to the tables created from the train and test splits.
TRAIN_DATASET_NAME = "hf-cifar-100-train"
TEST_DATASET_NAME = "hf-cifar-100-test"

# Hugging Face checkpoint used for both the feature extractor and the model.
MODEL = "google/vit-base-patch16-224"
# Torch device string that the model and the input batches are moved to.
DEVICE = "cuda:0"
# Number of samples per inference batch.
BATCH_SIZE = 32

# When True, the next cell pip-installs the notebook's dependencies.
INSTALL_DEPENDENCIES = False
# When False, the hidden cells switch to the public-examples locations and sync data to S3.
TLC_PUBLIC_EXAMPLES_DEVELOPER_MODE = True


In [None]:
# Optional one-time setup: the installs below run only when INSTALL_DEPENDENCIES is True.
if INSTALL_DEPENDENCIES:
    # Jupyter kernel and widget support.
    %pip --quiet install ipykernel ipywidgets
    # Hugging Face libraries used to load the dataset and the ViT model.
    %pip --quiet install datasets transformers
    # torch and torchvision are installed from the PyTorch cu118 wheel index.
    %pip --quiet install torch --index-url https://download.pytorch.org/whl/cu118
    %pip --quiet install torchvision --index-url https://download.pytorch.org/whl/cu118
    # 3LC together with its `umap` extra.
    %pip --quiet install tlc[umap]

In [None]:
### HIDDEN CELL ###

## Data & Alias management
# See comments in ../mnist.ipynb for details on data and alias management.

# This cell only takes effect when TLC_PUBLIC_EXAMPLES_DEVELOPER_MODE is False.
# Leave the flag set to True if you just want to run this notebook for local testing purposes.
if not TLC_PUBLIC_EXAMPLES_DEVELOPER_MODE:
    from tlc.client.utils import (
        TLC_PUBLIC_EXAMPLES_RUN_ROOT,
        TLC_PUBLIC_EXAMPLES_TABLE_ROOT,
        TLC_PUBLIC_EXAMPLES_CIFAR_100_DATA_ALIAS_NAME,
        TLC_PUBLIC_EXAMPLES_CIFAR_100_DATA_ALIAS_VALUE,
    )
    from tlc.core.objects.mutable_objects import Configuration
    from tlc.core.url import UrlAliasRegistry, Url

    # Point run and table output at the public-examples roots.
    print(f"Runs and Tables will be written to remote location: '{TLC_PUBLIC_EXAMPLES_RUN_ROOT}' and '{TLC_PUBLIC_EXAMPLES_TABLE_ROOT}'")
    Configuration.instance().run_root_url = TLC_PUBLIC_EXAMPLES_RUN_ROOT
    Configuration.instance().table_root_url = TLC_PUBLIC_EXAMPLES_TABLE_ROOT

    print(f"CIFAR-100 data will be written to local SAMPLE_ROOT: '{Configuration.instance().sample_root_url}'\n")

    # Local folder for this project's sample data: <sample root>/<PROJECT_NAME>.
    LOCAL_CIFAR100_DATA_LOCATION = (Url(Configuration.instance().sample_root_url) / PROJECT_NAME).to_str()

    print(f"In this notebook, the alias '{TLC_PUBLIC_EXAMPLES_CIFAR_100_DATA_ALIAS_NAME}' refers to the CIFAR-100 data at '{LOCAL_CIFAR100_DATA_LOCATION}'\n")

    # Print the same source path that the upload cell at the end of the notebook syncs from.
    # It is quoted because PROJECT_NAME contains a space, which would otherwise split the argument.
    print("After this run has completed, the data will be uploaded using the following command:")
    print(f'\taws s3 sync "{LOCAL_CIFAR100_DATA_LOCATION}" {TLC_PUBLIC_EXAMPLES_CIFAR_100_DATA_ALIAS_VALUE}')

    # Register the alias name against the local data folder.
    UrlAliasRegistry.instance().register_url_alias(
        TLC_PUBLIC_EXAMPLES_CIFAR_100_DATA_ALIAS_NAME,
        LOCAL_CIFAR100_DATA_LOCATION,
    )

## Prepare the data

To read the data into 3LC, we use `load_dataset` available under the Hugging Face integration. This returns a `TLCDataset`, which presents samples under `.get_sample_at_index(index)` with the same sample structure as a Hugging Face `datasets.Dataset`.

In [None]:
from tlc.integration.huggingface import load_dataset

In [None]:
# Both splits share the same project and `write_row_cache` setting;
# only the split and the table name differ.
shared_load_options = {"project_name": PROJECT_NAME, "write_row_cache": True}

cifar100_train = load_dataset("cifar100", split="train", dataset_name=TRAIN_DATASET_NAME, **shared_load_options)

cifar100_test = load_dataset("cifar100", split="test", dataset_name=TEST_DATASET_NAME, **shared_load_options)

In [None]:
# Display the `sample_structure` attribute of the training table.
cifar100_train.sample_structure

In [None]:
# Look up the "img" field of the first training sample; it is shown as the cell output.
cifar100_train[0]["img"]

In [None]:
# Keep a handle to the 'fine_label' entry of the sample structure; the next cell calls `int_to_str` on it.
label_element = cifar100_train.sample_structure.sample_structure['fine_label']

In [None]:
# Pass the first training sample's "fine_label" value through `int_to_str`
# (presumably yields the class name for that integer label; confirm against the 3LC docs).
label_element.int_to_str(cifar100_train[0]["fine_label"])

## Compute the embeddings

We then use the `transformers` library to compute embeddings and `umap-learn` to reduce the embeddings to two dimensions. 

In [None]:
from transformers import ViTFeatureExtractor, ViTModel
import torch
from torch.utils.data import DataLoader

# Resolve the target device once, then load the pre-trained checkpoint named by MODEL:
# the ViT model (moved to that device) and its matching feature extractor.
device = torch.device(DEVICE)
model = ViTModel.from_pretrained(MODEL).to(device)
feature_extractor = ViTFeatureExtractor.from_pretrained(MODEL)

In [None]:
def extract_feature(sample):
    """Preprocess one dataset sample for the ViT model.

    Args:
        sample: Mapping that holds the input image under the ``'img'`` key.

    Returns:
        The output of ``feature_extractor`` for that image, requested as
        PyTorch tensors (``return_tensors="pt"``).
    """
    return feature_extractor(images=sample['img'], return_tensors="pt")

In [None]:
from tqdm import tqdm

def infer_on_dataset(dataset):
    """Run the ViT model over ``dataset`` and collect one activation vector per sample.

    Args:
        dataset: Map-style dataset whose samples are the outputs of
            ``extract_feature`` (each carrying a ``pixel_values`` entry).

    Returns:
        A list with one CPU tensor per batch. Each tensor holds, for every
        sample in the batch, the last hidden state at sequence position 0.
    """
    activations = []
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Inference only: disabling autograd avoids building a graph for every forward pass.
    with torch.no_grad():
        for inputs in tqdm(dataloader, total=len(dataloader)):
            # Assumes each sample's pixel_values has a leading axis of size 1, which batching
            # stacks into axis 1 (TODO confirm). Squeeze only that axis: a bare squeeze() would
            # also drop the batch axis whenever a batch contains a single sample.
            inputs['pixel_values'] = inputs['pixel_values'].squeeze(1)
            inputs = inputs.to(DEVICE)
            outputs = model(**inputs)
            # Keep only sequence position 0 and move it off the accelerator.
            activations.append(outputs.last_hidden_state[:, 0, :].cpu())

    return activations

In [None]:
# Switch the model to evaluation mode before running inference.
model.eval()

# Preprocess each split with `extract_feature`, run it through the model, and gather the
# per-batch activations in a single list: train batches first, then test batches.
activations = []
for dataset in (cifar100_train, cifar100_test):
    dataset = dataset.map(extract_feature)
    activations += infer_on_dataset(dataset)

In [None]:
# Join the per-batch tensors along the first axis and convert the result to a NumPy array.
# NOTE(review): `activations` is rebound from a list of tensors to an array here, so this cell
# is expected to fail if re-run without re-running the inference cell first.
activations = torch.cat(activations, dim=0).numpy()
# Shown as the cell output.
activations.shape

In [None]:
import umap

# Project the activations down to two dimensions.
# UMAP is stochastic; a fixed `random_state` makes the 2D layout repeatable across runs.
# NOTE(review): seeding may limit UMAP's parallelism. Drop the argument if speed matters
# more than reproducibility.
reducer = umap.UMAP(n_components=2, random_state=42)
embeddings_2d = reducer.fit_transform(activations)

## Collect the embeddings as 3LC metrics

In this example the metrics are contained in a `numpy.ndarray` object. We can specify the schema of this data and provide it directly to 3LC using `Run.add_metrics_data()`.

In [None]:
import tlc

# Initialize 3LC for this project and keep the returned run; the embeddings are added to it below.
run = tlc.init(project_name=PROJECT_NAME)

In [None]:
# Schema for the "embeddings" metrics column: float32 values with one dimension
# whose size is fixed at 2 (value_min == value_max == 2).
embedding_value = tlc.Float32Value()
embedding_dimension = tlc.DimensionNumericValue(value_min=2, value_max=2)
embedding_schema = tlc.Schema(value=embedding_value, size0=embedding_dimension)

In [None]:
# Rows of `embeddings_2d` follow the inference order: all train samples first, then test samples.
num_train = len(cifar100_train)
embeddings_2d_train = embeddings_2d[:num_train]
embeddings_2d_test = embeddings_2d[num_train:]

In [None]:
### HIDDEN CELL ###
import numpy as np


def _pad_rows_with_ones(rows, target_length):
    """Return `rows` extended with rows of ones until it has `target_length` rows.

    `rows` is returned unchanged when it already has at least `target_length` rows.
    """
    missing = target_length - len(rows)
    if missing <= 0:
        return rows
    return np.vstack((rows, np.ones((missing, rows.shape[1]))))


try:
    # Bare name lookup: raises NameError unless `TwoBatchPatchDataLoader` has been defined.
    TwoBatchPatchDataLoader
except NameError:
    # Patch not active: keep the train/test split computed in the previous cell.
    pass
else:
    # The two batch patch is active, so we need to split and pad embeddings_2d to match input table lengths.
    # Only the probe above is guarded; a NameError raised below is a real error and must surface.

    # 2 batches of BATCH_SIZE belong to training; everything after them belongs to testing.
    num_embeddings_train = 2 * BATCH_SIZE
    embeddings_2d_train = embeddings_2d[:num_embeddings_train]
    embeddings_2d_test = embeddings_2d[num_embeddings_train:]

    # Pad the embeddings with ones to match the size of the original datasets.
    embeddings_2d_train = _pad_rows_with_ones(embeddings_2d_train, len(cifar100_train))
    embeddings_2d_test = _pad_rows_with_ones(embeddings_2d_test, len(cifar100_test))

In [None]:
# Add each split's 2D embeddings to the run as an "embeddings" column,
# passing that split's table URL as the input table.
tables_with_embeddings = (
    (cifar100_train, embeddings_2d_train),
    (cifar100_test, embeddings_2d_test),
)
for dataset, embeddings in tables_with_embeddings:
    embedding_rows = list(embeddings)  # one entry per row of the array
    run.add_metrics_data(
        {"embeddings": embedding_rows},
        override_column_schemas={"embeddings": embedding_schema},
        input_table_url=dataset.url,
    )

In [None]:
### HIDDEN CELL ###

# Runs only when TLC_PUBLIC_EXAMPLES_DEVELOPER_MODE is False. `Configuration` and the alias value
# used below are imported by the earlier hidden cell under the same condition.
if not TLC_PUBLIC_EXAMPLES_DEVELOPER_MODE:
    print("Uploading data to S3...")
    # Shell escape: sync <sample root>/<PROJECT_NAME> to the alias value.
    # The source path is quoted because PROJECT_NAME contains a space.
    !aws s3 sync "{(tlc.Url(Configuration.instance().sample_root_url)/PROJECT_NAME).to_str()}" {TLC_PUBLIC_EXAMPLES_CIFAR_100_DATA_ALIAS_VALUE}