# Load Dataset

In [1]:
# import fiftyone as fo
# fo.delete_dataset("example_demo_dataset")

In [2]:
import fiftyone.core.dataset as focd
import fiftyone as fo

def create_fo_dataset(source_directory, name, dataset_type=fo.types.ImageDirectory):
    dataset = fo.load_dataset(name) if focd.dataset_exists(name)\
        else fo.Dataset.from_dir(name=name, dataset_dir=source_directory, dataset_type=dataset_type)
    return dataset

In [3]:
DATASET_NAME = "example_demo_dataset"
DATASET_DIR = "raw_dataset"
fo_dataset = create_fo_dataset(DATASET_DIR, DATASET_NAME)



 100% |███████████████████| 54/54 [41.7ms elapsed, 0s remaining, 1.3K samples/s]   


In [4]:
fo_dataset

Name:        example_demo_dataset
Media type:  image
Num samples: 54
Persistent:  False
Tags:        []
Sample fields:
    id:       fiftyone.core.fields.ObjectIdField
    filepath: fiftyone.core.fields.StringField
    tags:     fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata: fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)

# Run FiftyOne Session

In [5]:
import fiftyone as fo 
session = fo.launch_app(fo_dataset, address="localhost")

In [6]:
session

Dataset:          example_demo_dataset
Media type:       image
Num samples:      54
Selected samples: 0
Selected labels:  0
Session URL:      http://localhost:5151/

# Compute Image MetaData

In [7]:
fo_dataset.first()

<Sample: {
    'id': '653ad4a7a6d8e8c7e4844ec3',
    'media_type': 'image',
    'filepath': '/workspaces/dataops-fiftyone/raw_dataset/20230602_191106.jpg',
    'tags': [],
    'metadata': None,
}>

In [8]:
# Compute metadata
fo_dataset.compute_metadata()

Computing metadata...
 100% |███████████████████| 54/54 [292.7ms elapsed, 0s remaining, 184.5 samples/s]     


In [9]:
import fiftyone.core.utils as fou

# Iterate through sample to add metadata information
for sample in fo_dataset:
    # Compute filehash for remove duplicate filename
    sample["filehash"] = fou.compute_filehash(sample.filepath)
    sample["width"] = sample.metadata.width
    sample["height"] = sample.metadata.height
    sample["size_kb"] = sample.metadata.size_bytes
    sample.save()

In [10]:
# Check first sample
fo_dataset.first()

<Sample: {
    'id': '653ad4a7a6d8e8c7e4844ec3',
    'media_type': 'image',
    'filepath': '/workspaces/dataops-fiftyone/raw_dataset/20230602_191106.jpg',
    'tags': [],
    'metadata': <ImageMetadata: {
        'size_bytes': 1920904,
        'mime_type': 'image/jpeg',
        'width': 2992,
        'height': 2992,
        'num_channels': 3,
    }>,
    'filehash': -8294542046651660730,
    'width': 2992,
    'height': 2992,
    'size_kb': 1920904,
}>

In [11]:
from fiftyone import ViewField as F

plot1 = fo.NumericalHistogram(F("metadata.size_bytes") / (1024*1024), bins=50, xlabel="image size (MB)")
plot2 = fo.NumericalHistogram(F("metadata.width"), bins=50, xlabel="W")
plot3 = fo.NumericalHistogram(F("metadata.height"), bins=50, xlabel="H")
plot = fo.ViewGrid([plot1, plot2, plot3], init_view=fo_dataset)
plot.show()





FigureWidget({
    'data': [{'customdata': array([[0.72292137, 0.77480818],
                                   [0.77480818, 0.82669499],
                                   [0.82669499, 0.87858179],
                                   [0.87858179, 0.9304686 ],
                                   [0.9304686 , 0.98235541],
                                   [0.98235541, 1.03424222],
                                   [1.03424222, 1.08612902],
                                   [1.08612902, 1.13801583],
                                   [1.13801583, 1.18990264],
                                   [1.18990264, 1.24178945],
                                   [1.24178945, 1.29367625],
                                   [1.29367625, 1.34556306],
                                   [1.34556306, 1.39744987],
                                   [1.39744987, 1.44933668],
                                   [1.44933668, 1.50122348],
                                   [1.50122348, 1.55311029],
         

# Remove Duplicate File

In [12]:
from collections import Counter
from fiftyone import ViewField as F

def find_duplicate_file_image(fo_dataset):
    filehash_count = Counter(sample.filehash for sample in fo_dataset)
    dup_filehashes = [k for k, v in filehash_count.items() if v > 1]
    dup_view = (fo_dataset
    # Extract samples with duplicate file hashes
    .match(F("filehash").is_in(dup_filehashes))
    # Sort by file hash so duplicates will be adjacent
    .sort_by("filehash")
    )
    print("Number of images that have a duplicate: %d" % len(dup_view))
    print("Number of duplicates: %d" % (len(dup_view) - len(dup_filehashes)))
    return dup_view

In [13]:
duplicate_view = find_duplicate_file_image(fo_dataset)

Number of images that have a duplicate: 18
Number of duplicates: 9


In [14]:
# Delete duplicate view

print("Length of dataset before delete: %d" % len(fo_dataset))
temp_dup_filehash = set()
for sample in duplicate_view:
    if sample.filehash not in temp_dup_filehash:
        temp_dup_filehash.add(sample.filehash)
        continue
    del fo_dataset[sample.id]

print("Length of dataset after delete: %d" % len(fo_dataset))
# Verify that the dataset no longer contains any duplicates
print("Number of unique file hashes: %d" % len({s.filehash for s in fo_dataset}))

Length of dataset before delete: 54
Length of dataset after delete: 45
Number of unique file hashes: 45


# Create Custom Labels

In [15]:
label_list = list()
for sample in fo_dataset:
    sample["ground_truth"] = "not_labeled"
    label_list.append("not_labeled")
    sample.save()

# Compute Embedding with Pre-Trained Model FiftyOne

In [16]:
# Download model
import fiftyone.zoo as foz
model_name = "resnet50-imagenet-torch"
model = foz.load_zoo_model(model_name)

In [17]:
import numpy as np
# Compute embeddings and save it in numpy array
img_embeddings = fo_dataset.compute_embeddings(model=model)
np.save("full_frame_embeddings.npy", img_embeddings)



 100% |███████████████████| 45/45 [18.8s elapsed, 0s remaining, 2.7 samples/s]      


# Utils Function Visualization

In [18]:
from fiftyone import ViewField as F
import fiftyone.brain as fob
import fiftyone.core.utils as fou

def compute_visualization(
    dataset,
    embeddings,
    method="umap",
    brain_key="umap_embeddings",
    ):
    # Compute 2D representation using pre-computed embeddings
    viz_results = fob.compute_visualization(
        dataset,
        embeddings=embeddings,
        num_dims=2,
        method=method,
        brain_key=brain_key,
        verbose=True,
        seed=51)
    return viz_results

def compute_uniqueness(dataset, embeddings):
    fob.compute_uniqueness(dataset, embeddings=embeddings)
    dataset.sort_by("uniqueness", reverse=True)
    return dataset

def plot_img_embedding(viz_result, labels="ground_truth"):
    plot = viz_result.visualize(labels=labels)
    return plot

def plot_uniqueness(visualization):
    # num classes on each images, can be used for size of the scatter plot
    # num_objects = self.dataset.values("ground_truth", F("classifications").length())
    # Visualize embeddings, colored by ground truth label
    plot = visualization.visualize(
            sizes="metadata.size_bytes",
            labels="uniqueness",
            # classes="ground_truth.classifications.label"
    )
    return plot

# Distribution Feature Embedding

In [19]:
viz_results = compute_visualization(fo_dataset, img_embeddings)

Generating visualization...



n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



UMAP(random_state=51, verbose=True)
Thu Oct 26 21:06:15 2023 Construct fuzzy simplicial set
Thu Oct 26 21:06:16 2023 Finding Nearest Neighbors
Thu Oct 26 21:06:18 2023 Finished Nearest Neighbor Search
Thu Oct 26 21:06:20 2023 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Thu Oct 26 21:06:20 2023 Finished embedding


In [20]:
plot_embedding = plot_img_embedding(viz_results)

In [21]:
plot_embedding.show()





FigureWidget({
    'data': [{'customdata': array(['653ad4a7a6d8e8c7e4844ec3', '653ad4a7a6d8e8c7e4844ec4',
                                   '653ad4a7a6d8e8c7e4844ec6', '653ad4a7a6d8e8c7e4844ec8',
                                   '653ad4a7a6d8e8c7e4844eca', '653ad4a7a6d8e8c7e4844ecb',
                                   '653ad4a7a6d8e8c7e4844ecc', '653ad4a7a6d8e8c7e4844ecd',
                                   '653ad4a7a6d8e8c7e4844ecf', '653ad4a7a6d8e8c7e4844ed1',
                                   '653ad4a7a6d8e8c7e4844ed2', '653ad4a7a6d8e8c7e4844ed3',
                                   '653ad4a7a6d8e8c7e4844ed4', '653ad4a7a6d8e8c7e4844ed5',
                                   '653ad4a7a6d8e8c7e4844ed6', '653ad4a7a6d8e8c7e4844ed7',
                                   '653ad4a7a6d8e8c7e4844ed8', '653ad4a7a6d8e8c7e4844ed9',
                                   '653ad4a7a6d8e8c7e4844eda', '653ad4a7a6d8e8c7e4844edb',
                                   '653ad4a7a6d8e8c7e4844edc', '653ad4a7a6d

# Distribution Uniqueness Feature Embedding

In [22]:
fo_dataset = compute_uniqueness(fo_dataset, img_embeddings)

Computing uniqueness...
Uniqueness computation complete


In [23]:
plot_unique_res = plot_uniqueness(viz_results)

In [24]:
plot_unique_res.show()





FigureWidget({
    'data': [{'customdata': array(['653ad4a7a6d8e8c7e4844ec3', '653ad4a7a6d8e8c7e4844ec4',
                                   '653ad4a7a6d8e8c7e4844ec6', '653ad4a7a6d8e8c7e4844ec8',
                                   '653ad4a7a6d8e8c7e4844eca', '653ad4a7a6d8e8c7e4844ecb',
                                   '653ad4a7a6d8e8c7e4844ecc', '653ad4a7a6d8e8c7e4844ecd',
                                   '653ad4a7a6d8e8c7e4844ecf', '653ad4a7a6d8e8c7e4844ed1',
                                   '653ad4a7a6d8e8c7e4844ed2', '653ad4a7a6d8e8c7e4844ed3',
                                   '653ad4a7a6d8e8c7e4844ed4', '653ad4a7a6d8e8c7e4844ed5',
                                   '653ad4a7a6d8e8c7e4844ed6', '653ad4a7a6d8e8c7e4844ed7',
                                   '653ad4a7a6d8e8c7e4844ed8', '653ad4a7a6d8e8c7e4844ed9',
                                   '653ad4a7a6d8e8c7e4844eda', '653ad4a7a6d8e8c7e4844edb',
                                   '653ad4a7a6d8e8c7e4844edc', '653ad4a7a6d

# Attach Plot in Running Session

In [25]:
fo.close_app()
session = fo.launch_app(fo_dataset, address="localhost")

session.plots.attach(plot_embedding)
session.plots.attach(plot_unique_res)

# Split Dataset with Uniqueness Embedding

In [26]:
# Function to train and test split fiftyone dataset
def train_test_split(dataset, percentage=0.25):
    temp_dataset = dataset.clone()
    test_size = int(len(dataset) * percentage)
    test_samples = temp_dataset.take(test_size)
    temp_dataset.delete_samples(test_samples)
    test_dataset = test_samples.clone()
    test_dataset.persistent = True
    temp_dataset.persistent = True
    return temp_dataset, test_dataset

In [28]:
# Split to two type, common and unique
unique_dataset_samples = fo_dataset.match(F("uniqueness") > 0.6)
common_dataset_samples = fo_dataset.match(F("uniqueness") < 0.6)

In [29]:
train_common, test_common = train_test_split(common_dataset_samples)

In [30]:
train_unique, test_unique = train_test_split(unique_dataset_samples)

In [31]:
train_common.merge_samples(train_unique)

In [32]:
test_common.merge_samples(test_unique)

# Export FiftyOne Dataset

In [33]:
def merge_dataset(list_datasets: list, dataset_name: str):
    merged_dataset = fo.Dataset(name=dataset_name, overwrite=True)
    for dataset in list_datasets:
        for sample in dataset:
            merged_dataset.add_sample(sample)
    return merged_dataset

In [34]:
# Tag for training
for sample in train_common:
    sample.tags.append("train")
    sample.save()

# Tag for testing
for sample in test_common:
    sample.tags.append("test")
    sample.save()

In [35]:
# merged dataset
NEW_DATASET_NAME = "example_demo_dataset_processed"
merged_dataset = merge_dataset([train_common, test_common], NEW_DATASET_NAME)
merged_dataset

Name:        example_demo_dataset_processed
Media type:  image
Num samples: 45
Persistent:  False
Tags:        []
Sample fields:
    id:           fiftyone.core.fields.ObjectIdField
    filepath:     fiftyone.core.fields.StringField
    tags:         fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:     fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    filehash:     fiftyone.core.fields.IntField
    width:        fiftyone.core.fields.IntField
    height:       fiftyone.core.fields.IntField
    size_kb:      fiftyone.core.fields.IntField
    ground_truth: fiftyone.core.fields.StringField
    uniqueness:   fiftyone.core.fields.FloatField

In [37]:
import os

splits = ["train", "test"]
export_dir = "example_demo_dataset_processed"
label_field = "ground_truth"

# Export the splits
for split in splits:
    split_view = merged_dataset.match_tags(split)
    tag_export_dir = os.path.join(export_dir, split)
    split_view.export(
        export_dir=tag_export_dir,
        dataset_type=fo.types.ImageDirectory,
        # label_field=label_field,
        split=split,
        # classes=class_list
    )

Ignoring unsupported parameter 'split'
 100% |███████████████████| 34/34 [70.6ms elapsed, 0s remaining, 481.8 samples/s] 
Ignoring unsupported parameter 'split'
 100% |███████████████████| 11/11 [36.0ms elapsed, 0s remaining, 305.6 samples/s] 


# What's Next?

- [MLOps Concept](https://ml-ops.org/)
- [FiftyOne User Guide](https://docs.voxel51.com/user_guide/index.html)
- [Fiftyone Tutorials](https://docs.voxel51.com/tutorials/index.html)
- [Fiftyone Cheat Sheets](https://docs.voxel51.com/cheat_sheets/index.html)
- [Integrations FiftyOne with Labeling Tools and other MLOps tools](https://docs.voxel51.com/integrations/index.html)