# Load Dataset

In [1]:
# import fo
# fo.delete_dataset("example_demo_dataset")

In [2]:
import fiftyone.core.dataset as focd
import fiftyone as fo

def create_fo_dataset(source_directory, name, dataset_type=fo.types.ImageDirectory):
    dataset = fo.load_dataset(name) if focd.dataset_exists(name)\
        else fo.Dataset.from_dir(name=name, dataset_dir=source_directory, dataset_type=dataset_type)
    return dataset

In [3]:
DATASET_NAME = "example_demo_dataset"
DATASET_DIR = "raw_dataset"
fo_dataset = create_fo_dataset(DATASET_DIR, DATASET_NAME)

In [4]:
fo_dataset

Name:        example_demo_dataset
Media type:  image
Num samples: 45
Persistent:  False
Tags:        []
Sample fields:
    id:           fiftyone.core.fields.ObjectIdField
    filepath:     fiftyone.core.fields.StringField
    tags:         fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:     fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    filehash:     fiftyone.core.fields.IntField
    width:        fiftyone.core.fields.IntField
    height:       fiftyone.core.fields.IntField
    size_kb:      fiftyone.core.fields.IntField
    ground_truth: fiftyone.core.fields.StringField

# Run FiftyOne Session

In [5]:
import fiftyone as fo 
session = fo.launch_app(fo_dataset, address="localhost")

In [6]:
session

Dataset:          example_demo_dataset
Media type:       image
Num samples:      45
Selected samples: 0
Selected labels:  0
Session URL:      http://localhost:5151/

# Compute Image MetaData

In [7]:
fo_dataset.first()

<Sample: {
    'id': '6539a91c9d5b48487915b1a3',
    'media_type': 'image',
    'filepath': '/workspaces/dataops-fiftyone/raw_dataset/20230602_191106.jpg',
    'tags': [],
    'metadata': <ImageMetadata: {
        'size_bytes': 1920904,
        'mime_type': 'image/jpeg',
        'width': 2992,
        'height': 2992,
        'num_channels': 3,
    }>,
    'filehash': -7936277891674897675,
    'width': 2992,
    'height': 2992,
    'size_kb': 1920904,
    'ground_truth': 'not_labeled',
}>

In [8]:
# Compute metadata
fo_dataset.compute_metadata()

In [9]:
import fiftyone.core.utils as fou

# Iterate through sample to add metadata information
for sample in fo_dataset:
    # Compute filehash for remove duplicate filename
    sample["filehash"] = fou.compute_filehash(sample.filepath)
    sample["width"] = sample.metadata.width
    sample["height"] = sample.metadata.height
    sample["size_kb"] = sample.metadata.size_bytes
    sample.save()

In [10]:
# Check first sample
fo_dataset.first()

<Sample: {
    'id': '6539a91c9d5b48487915b1a3',
    'media_type': 'image',
    'filepath': '/workspaces/dataops-fiftyone/raw_dataset/20230602_191106.jpg',
    'tags': [],
    'metadata': <ImageMetadata: {
        'size_bytes': 1920904,
        'mime_type': 'image/jpeg',
        'width': 2992,
        'height': 2992,
        'num_channels': 3,
    }>,
    'filehash': 6413452524206915674,
    'width': 2992,
    'height': 2992,
    'size_kb': 1920904,
    'ground_truth': 'not_labeled',
}>

In [None]:
from fiftyone import ViewField as F

plot1 = fo.NumericalHistogram(F("metadata.size_bytes") / (1024*1024), bins=50, xlabel="image size (MB)")
plot2 = fo.NumericalHistogram(F("metadata.width"), bins=50, xlabel="W")
plot3 = fo.NumericalHistogram(F("metadata.height"), bins=50, xlabel="H")
plot = fo.ViewGrid([plot1, plot2, plot3], init_view=fo_dataset)
plot.show()





FigureWidget({
    'data': [{'customdata': array([[0.72292137, 0.77480818],
                                   [0.77480818, 0.82669499],
                                   [0.82669499, 0.87858179],
                                   [0.87858179, 0.9304686 ],
                                   [0.9304686 , 0.98235541],
                                   [0.98235541, 1.03424222],
                                   [1.03424222, 1.08612902],
                                   [1.08612902, 1.13801583],
                                   [1.13801583, 1.18990264],
                                   [1.18990264, 1.24178945],
                                   [1.24178945, 1.29367625],
                                   [1.29367625, 1.34556306],
                                   [1.34556306, 1.39744987],
                                   [1.39744987, 1.44933668],
                                   [1.44933668, 1.50122348],
                                   [1.50122348, 1.55311029],
         

# Remove Duplicate File

In [None]:
from collections import Counter
from fiftyone import ViewField as F

def find_duplicate_file_image(fo_dataset):
    filehash_count = Counter(sample.filehash for sample in fo_dataset)
    dup_filehashes = [k for k, v in filehash_count.items() if v > 1]
    dup_view = (fo_dataset
    # Extract samples with duplicate file hashes
    .match(F("filehash").is_in(dup_filehashes))
    # Sort by file hash so duplicates will be adjacent
    .sort_by("filehash")
    )
    print("Number of images that have a duplicate: %d" % len(dup_view))
    print("Number of duplicates: %d" % (len(dup_view) - len(dup_filehashes)))
    return dup_view

In [None]:
duplicate_view = find_duplicate_file_image(fo_dataset)

Number of images that have a duplicate: 0
Number of duplicates: 0


In [None]:
# Delete duplicate view

print("Length of dataset before delete: %d" % len(fo_dataset))
temp_dup_filehash = set()
for sample in duplicate_view:
    if sample.filehash not in temp_dup_filehash:
        temp_dup_filehash.add(sample.filehash)
        continue
    del fo_dataset[sample.id]

print("Length of dataset after delete: %d" % len(fo_dataset))
# Verify that the dataset no longer contains any duplicates
print("Number of unique file hashes: %d" % len({s.filehash for s in fo_dataset}))

Length of dataset before delete: 45
Length of dataset after delete: 45
Number of unique file hashes: 45


# Create Custom Labels

In [None]:
label_list = list()
for sample in fo_dataset:
    sample["ground_truth"] = "not_labeled"
    label_list.append("not_labeled")
    sample.save()

# Compute Embedding with Pre-Trained Model FiftyOne

In [None]:
import fiftyone.zoo as foz
model_name = "resnet50-imagenet-torch"
model = foz.load_zoo_model(model_name)


CUDA initialization: The NVIDIA driver on your system is too old (found version 11080). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)



# Distribution Feature Embedding

In [None]:
fo_dataset.compute_embeddings(model=model, num_workers=0)

Model does not support batching


# Distribution Uniqueness Feature Embedding

# Split Dataset with Uniqueness Embedding

# Export FiftyOne Dataset

# What's Next?