# Add Embeddings to an Existing Table

In this example, we will extend an existing table with embeddings computed from a pre-trained model. The steps are:

- Write an initial table containing a single column of image URLs.
- Write a new table containing the input URLs and the embeddings computed from a pre-trained model.
- Apply dimensionality reduction to the extended table to get a final table containing the URLs, the embeddings, and the reduced embeddings.

In [None]:
from transformers import ViTImageProcessor, ViTModel
import torch
from torchvision import transforms
from PIL import Image
from pathlib import Path
import os
import tlc
import tqdm

## Write the initial table

We write a simple table containing a single column of image URLs from our COCO-128 dataset.

In [None]:
# Absolute, forward-slash path to the folder containing the COCO-128 images
data_path = Path("../data/coco128/images").absolute().as_posix()
project_name = "add-embeddings"
dataset_name = "coco128"

# The initial table has a single column, "image", holding the path to each image
table_writer = tlc.TableWriter(
    table_name="initial",
    dataset_name=dataset_name,
    project_name=project_name,
    description="COCO128 dataset",
    column_schemas={
        "image": tlc.ImagePath
    }
)

image_paths = []  # Store the list of image paths for later use

# os.listdir returns entries in arbitrary order; sorting makes the row order
# (and therefore the order of image_paths) reproducible across runs and platforms.
for image_name in sorted(os.listdir(data_path)):
    image_path = os.path.join(data_path, image_name)
    image_paths.append(image_path)
    table_writer.add_row({"image": image_path})

table = table_writer.finalize()

## Extend the table with embeddings from a pre-trained model

We will use the ViT model pre-trained on ImageNet to compute embeddings for the images in the table.
A benefit of using this model is that meaningful embeddings can be extracted easily from the `last_hidden_state` attribute of the model output: we use the hidden state of the first token of each image as its embedding.

In [None]:
# Load the pre-trained ViT model together with its matching image processor

model_name = "google/vit-base-patch16-224"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

image_processor = ViTImageProcessor.from_pretrained(model_name)
model = ViTModel.from_pretrained(model_name)
model = model.to(device)

# Loading the model emits a warning that 'vit.pooler.dense.bias' and 'vit.pooler.dense.weight'
# are newly initialized. This is harmless here: the embeddings are read from the last hidden
# state only, so the pooler layer is never relied upon.

In [None]:
# Rows of the input table have the form {"image": image_path}.
# Register a map function on the table so that it returns the images as plain tensors instead.

normalize = transforms.Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
preprocess = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor(), normalize])


def load_image_tensor(row):
    """Open the image at ``row["image"]`` as RGB and run it through ``preprocess``."""
    rgb_image = Image.open(row["image"]).convert("RGB")
    return preprocess(rgb_image)


table.map(load_image_tensor)

In [None]:
# Set up the TableWriter that will produce the table with embeddings

embedding_schema = tlc.Schema(
    # The NN-embedding number role makes this column get selected automatically for dimensionality reduction
    value=tlc.Float32Value(number_role=tlc.NUMBER_ROLE_NN_EMBEDDING),
    # The embedding size is 768
    size0=tlc.DimensionNumericValue(768, 768),
    # Keep the embedding out of the "sample-view" of the table
    sample_type="hidden",
    # Embedding values must not be edited after they have been computed
    writable=False,
)

extended_table_writer = tlc.TableWriter(
    table_name="added-embeddings",
    dataset_name=dataset_name,
    project_name=project_name,
    description="COCO128 dataset with added embeddings",
    column_schemas={
        "image": tlc.ImagePath,  # Path to the image (copied from the input table)
        "embedding": embedding_schema,
    },
)

In [None]:
# Create a DataLoader to iterate over the images in batches for faster inference

batch_size = 4

def batched_iterator(list_, batch_size, num_batches):
    """Lazily yield ``num_batches`` consecutive slices of ``list_``, each at most ``batch_size`` long.

    Slices that reach past the end of ``list_`` are shorter (possibly empty).
    """

    def _slice(batch_index):
        start = batch_index * batch_size
        return list_[start : start + batch_size]

    return (_slice(batch_index) for batch_index in range(num_batches))

# Shuffling stays off so that the i-th batch of image tensors can be
# associated with the i-th slice of image paths.
dataloader = torch.utils.data.DataLoader(
    dataset=table,
    shuffle=False,
    num_workers=0,
    batch_size=batch_size,
)

# Number of batches the loader will produce; reused when iterating over the image paths
num_batches = len(dataloader)


In [None]:
# Run inference on the images and write the embeddings to the extended table's TableWriter

for batch, filenames_batch in tqdm.tqdm(
    zip(
        dataloader,
        batched_iterator(image_paths, batch_size, num_batches),
    ),
    total=num_batches,
    desc="Running inference on batches",
):
    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(batch.to(device))
        # Select the hidden state of the first token (index 0) for every image in the batch.
        # The result is already 2-D (batch, hidden). Do NOT squeeze it: for a batch holding a
        # single image, squeeze() would drop the batch dimension and tolist() would then yield
        # one flat list of floats instead of a list containing one embedding.
        embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    # One image path and one embedding per row of the batch
    extended_table_writer.add_batch(
        {
            "image": filenames_batch,
            "embedding": embeddings.tolist(),
        }
    )

extended_table = extended_table_writer.finalize()

print(extended_table[0].keys())            # Notice the "embedding" column is not present in the sample-view of the table
print(extended_table.table_rows[0].keys()) # Notice the "embedding" column is present in the "row-view" of the table

## Reduce the embeddings to 2 dimensions

Finally, we reduce the embedding column to 2 dimensions using UMAP. The result is a table containing the URLs, the embeddings, and the reduced embeddings.

In [None]:
# Reduce the embedding column to 2 components with UMAP, keeping the source
# embedding column in the resulting table.
umap_options = {
    "method": "umap",
    "n_components": 2,
    "metric": "euclidean",
    "retain_source_embedding_column": True,
}
urls = tlc.reduce_embeddings([extended_table], **umap_options)

# The result is indexed by the URL of the input table to get the URL of its reduced table
reduced_table_url = urls[extended_table.url]

In [None]:
# Load the reduced table from its URL
reduced_table = tlc.Table.from_url(reduced_table_url)

# The row-view of the reduced table contains both the embeddings and the reduced embeddings
first_row = reduced_table.table_rows[0]
print(first_row.keys())