# Embedding space visualization | FiftyOne experimental

In [None]:
import fiftyone as fo
import fiftyone.zoo as foz

In [None]:
# Load the MNIST dataset from the FiftyOne Dataset Zoo
dataset = foz.load_zoo_dataset("mnist")

In [None]:
# Build a view containing only the samples tagged "test"
test_split = dataset.match_tags("test")

In [None]:
# Print the test-split view
print(test_split)

## Computing image embeddings

In [None]:
import cv2
import numpy as np

import fiftyone.brain as fob

def _read_flat_image(path):
    """Read the image at ``path`` unchanged and return its pixels as a 1D array.

    ``cv2.imread`` reports failure by returning ``None`` rather than raising,
    so fail here with the offending path instead of an opaque
    ``AttributeError`` on ``.ravel()``.

    Raises:
        OSError: if the image could not be read
    """
    img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    if img is None:
        raise OSError(f"Unable to read image: {path}")
    return img.ravel()


# Construct a ``num_samples x num_pixels`` array of images
embeddings = np.array(
    [_read_flat_image(f) for f in test_split.values("filepath")]
)

# Reduce the embeddings of the test split to 2 dimensions with UMAP, using a
# fixed seed
results = fob.compute_visualization(
    test_split,
    brain_key="mnist_test",
    embeddings=embeddings,
    method="umap",
    num_dims=2,
    seed=51,
    verbose=True,
)

In [None]:
# Show the type of the results object and the shape of its `points` array,
# one per line
print(type(results), results.points.shape, sep="\n")

## Visualizing embeddings

In [None]:
# Launch an App instance showing the test-split view
session = fo.launch_app(view=test_split)

In [None]:
# Plot the 2D embeddings, coloring each point by the sample's
# `ground_truth.label` value
plot = results.visualize(labels="ground_truth.label")
plot.show(height=720)

# Attach the plot to the App session launched earlier
# NOTE(review): presumably this links selections between plot and App; confirm
# against the FiftyOne interactive plots docs
session.plots.attach(plot)

## Pre-annotation of samples

In [None]:
def _read_flat_image(path):
    """Read the image at ``path`` unchanged and return its pixels as a 1D array.

    ``cv2.imread`` reports failure by returning ``None`` rather than raising,
    so fail here with the offending path instead of an opaque
    ``AttributeError`` on ``.ravel()``.

    Raises:
        OSError: if the image could not be read
    """
    img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    if img is None:
        raise OSError(f"Unable to read image: {path}")
    return img.ravel()


# Construct a ``num_samples x num_pixels`` array of images
embeddings = np.array(
    [_read_flat_image(f) for f in dataset.values("filepath")]
)

# Reduce the embeddings of the full dataset to 2 dimensions with UMAP, using
# a fixed seed
results = fob.compute_visualization(
    dataset,
    brain_key="mnist",
    embeddings=embeddings,
    method="umap",
    num_dims=2,
    seed=51,
    verbose=True,
)

In [None]:
from fiftyone import ViewField as F

# Samples tagged `test` keep their ground truth label; every other sample
# (i.e., the `train` split) is marked as `unlabeled`
in_test_split = F("$tags").contains("test")
expr = in_test_split.if_else(F("label"), "unlabeled")
labels = dataset.values("ground_truth", expr=expr)

In [None]:
# Launch a new App instance on the full dataset (rebinds `session`)
session = fo.launch_app(dataset)

In [None]:
# Plot the 2D embeddings, colored by the per-sample `labels` list computed
# above (ground truth label for `test` samples, "unlabeled" otherwise)
plot = results.visualize(labels=labels)
plot.show(height=720)

# Attach the plot to the current App session
session.plots.attach(plot)

# EXPLORING IMAGE UNIQUENESS WITH FIFTYONE

## Part 1: Finding duplicate and near-duplicate images


In [None]:
import fiftyone as fo
import fiftyone.zoo as foz

# Load only the test split of CIFAR-10 from the FiftyOne Dataset Zoo
# Downloads the dataset from the web if necessary
# NOTE: this rebinds `dataset`, replacing the MNIST dataset used above
dataset = foz.load_zoo_dataset("cifar10", split="test")

In [None]:
# Print the dataset, then open it in the App
print(dataset)
session = fo.launch_app(dataset)

## Compute uniqueness

In [None]:
import fiftyone.brain as fob

# Compute uniqueness for the dataset; the later cells read the resulting
# "uniqueness" field on each sample
fob.compute_uniqueness(dataset)

In [None]:
# Now the samples have a "uniqueness" field on them; print the dataset and
# its first sample to inspect it
print(dataset)
print(dataset.first())

## Visualize to find duplicate and near-duplicate images

In [None]:
# Sort in increasing order of uniqueness (least unique first), so that likely
# duplicates and near-duplicates appear at the top
dups_view = dataset.sort_by("uniqueness")

# Show the sorted view in the already-running App session
session.view = dups_view

In [None]:
# Get the IDs of the samples currently selected in the App
# (select the duplicate images in the App before running this cell)
dup_ids = session.selected

# Mark the selected samples as duplicates by tagging them "dups"
dups_view = dataset.select(dup_ids)
dups_view.tag_samples("dups")

# Visualize duplicates-only in App
session.view = dups_view

## Part 2: Bootstrapping a dataset of unique samples

In [None]:
import fiftyone as fo

# NOTE(review): machine-specific path; point this at your local copy of the
# images before running
data_basedir = '/home/lcondados/workspace/data/greiburg_groceries/freiburg_groceries_dataset/images'
dataset_name = "groceries"

# Reuse the dataset if a previous run already created it. This notebook later
# marks the dataset persistent, and creating a second dataset with the same
# name raises an error, so re-running this cell would otherwise fail.
if fo.dataset_exists(dataset_name):
    dataset = fo.load_dataset(dataset_name)
else:
    # Recursively collect the images under `data_basedir` into a new dataset
    dataset = fo.Dataset.from_images_dir(
        data_basedir, recursive=True, name=dataset_name
    )

print(dataset)
print(dataset.first())

In [None]:
# Open the dataset in the App
session = fo.launch_app(dataset)

## Compute uniqueness and analyze

In [None]:
import fiftyone.brain as fob

# Compute uniqueness for the current dataset
fob.compute_uniqueness(dataset)

# Now the samples have a "uniqueness" field on them
print(dataset)

In [None]:
# Print the first sample of the dataset to inspect its fields
print(dataset.first())

In [None]:
# Sort by uniqueness in descending order (most unique first)
rank_view = dataset.sort_by("uniqueness", reverse=True)

# Show the ranked view in the already-running App session
session.view = rank_view

Now, just visualizing the samples is interesting, but we want more. We want to get the most unique samples from our dataset so that we can use them in our work. Let’s do just that. In the same Python session, execute the following code.

In [None]:
import os

# Verify that the most unique sample has the maximal uniqueness of 1.0
print(rank_view.first())

# Extract paths to 10 most unique samples. `values()` pulls just the
# `filepath` field, in view order, instead of iterating over full samples.
ten_best = rank_view.limit(10).values("filepath")

for filepath in ten_best:
    # basename() respects the platform's path separator, unlike split('/')
    print(os.path.basename(filepath))

# Then you can do what you want with these.
# Output to csv or json, send images to your annotation team, seek additional
# similar data, etc.

In [None]:
# Tag the 10 most unique samples as "unique"
rank_view.limit(10).tag_samples("unique")
# Mark the dataset as persistent
# NOTE(review): presumably so it is kept after this session ends; confirm
# against the FiftyOne dataset persistence docs
dataset.persistent = True

In [None]:
# Freeze the App output shown in this notebook
session.freeze() # screenshot the active App for sharing

# Using a pre-trained model to compute embeddings

In [None]:
# The dataset-loading code below is left commented out: this cell relies on a
# `dataset` variable already defined by an earlier cell. Uncomment it to
# create the dataset from the image directory instead.
# import fiftyone as fo

# data_basedir = '/home/lcondados/workspace/data/greiburg_groceries/freiburg_groceries_dataset/images'
# dataset = fo.Dataset.from_images_dir(data_basedir, recursive=True, name="groceries")

# Print the dataset and its first sample to confirm what is loaded
print(dataset)
print(dataset.first())

## Computing image embeddings

In [None]:
import fiftyone as fo
import fiftyone.brain as fob
import fiftyone.zoo as foz

# NOTE: the BDD100K load below is commented out, so this cell runs on whatever
# `dataset` is already defined in the session.
# To use BDD100K instead: the BDD dataset must be manually downloaded. See the
# zoo docs for details
# source_dir = "/path/to/dir-with-bdd100k-files"

# dataset = foz.load_zoo_dataset(
#     "bdd100k", split="validation", source_dir=source_dir,
# )

# Compute embeddings with a pre-trained model from the FiftyOne Model Zoo
# You will likely want to run this on a machine with GPU, as this requires
# running model inference on the images in `dataset` (the original comment
# cited 10,000 images, which refers to the BDD100K example above)
model = foz.load_zoo_model("mobilenet-v2-imagenet-torch")
embeddings = dataset.compute_embeddings(model)

# Compute a visualization from the model embeddings, with a fixed seed
results = fob.compute_visualization(
    dataset,
    brain_key="img_viz",
    embeddings=embeddings,
    seed=51,
)

# Open the dataset in the App
session = fo.launch_app(dataset)