In [None]:
import os
# When the kernel was started inside the `notebooks/` folder, step up one level
# so the relative paths used below (e.g. "data/...", "runs/...") resolve from the
# parent directory instead.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")
# `%cd .` does not move anywhere; it is here to echo the resulting working directory.
%cd .
# Reload imported modules automatically before each cell runs, so edits to local
# `.py` files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Data Exploration & Wrangling


In [None]:
# Install the packages imported below into the kernel's environment
# (`%pip` rather than `!pip`, so the running kernel sees them).
# TODO: pin versions (e.g. `fiftyone==x.y.z`) so a fresh run is reproducible.
%pip install fiftyone imagehash
# NOTE(review): presumably swaps the default `fiftyone-db` package for the
# Ubuntu 22.04 build — confirm this matches the host OS before running.
%pip uninstall fiftyone-db -y
%pip install fiftyone-db-ubuntu2204 --force-reinstall

In [5]:
import fiftyone as fo
import fiftyone.utils.yolo as fouy
from fiftyone import ViewField as F

from PIL import Image
import imagehash
from tqdm import tqdm

### Load Dataset


In [10]:
# Identifier under which the dataset is stored in FiftyOne.
name = "til23plush"
# Root folder of the on-disk dataset, relative to the working directory.
dataset_dir = "data/til23plush"
# Every split that gets imported when the dataset is built from scratch.
splits = ("train", "val", "test")

# NOTE: Uncomment to recache dataset
# fo.delete_dataset(name)

In [11]:
# Reuse the persisted FiftyOne dataset when it exists; otherwise import every
# split from disk and compute a per-image hash used later for dupe detection.
if name in fo.list_datasets():
    ds = fo.load_dataset(name)

    # Remove the stale evaluation run *before* the fields it owns. Deleting only
    # the `eval_*` sample fields leaves the "eval" run registered on the dataset,
    # which appears to be why a later `evaluate_detections(..., eval_key="eval")`
    # logs "Sample field 'eval_tp' does not exist" while replacing that run.
    if "eval" in ds.list_evaluations():
        ds.delete_evaluation("eval")

    # Delete any predictions / evaluation counters still attached. Each field is
    # checked individually so a partially cleaned dataset cannot raise here.
    for stale_field in ("predictions", "eval_tp", "eval_fp", "eval_fn"):
        if ds.has_field(stale_field):
            ds.delete_sample_field(stale_field)
else:
    ds = fo.Dataset(name=name, persistent=True)
    for split in splits:
        # Each split is tagged with its own name so later cells can select
        # samples by tag.
        ds.add_dir(
            dataset_dir=dataset_dir,
            dataset_type=fo.types.YOLOv5Dataset,
            include_all_data=True,
            split=split,
            tags=split,
        )

    # Add Perceptual Hashes for Dupe Detection later
    # Due to multiple false positives, perceptual hash chosen is closer to cryptographic
    # (the value stored under "phash" is imagehash's difference hash, `dhash`).
    for sample in tqdm(ds):
        # The context manager closes each image file as soon as it is hashed,
        # instead of leaving one open handle per sample for the GC to collect.
        with Image.open(sample.filepath) as image:
            sample["phash"] = str(imagehash.dhash(image))
        sample.save()

print(ds)

 100% |███████████████| 5664/5664 [8.5s elapsed, 0s remaining, 668.4 samples/s]      
 100% |█████████████████| 800/800 [1.1s elapsed, 0s remaining, 723.3 samples/s]         
 100% |███████████████| 1600/1600 [215.3ms elapsed, 0s remaining, 7.4K samples/s]     


100%|██████████| 8064/8064 [02:45<00:00, 48.77it/s]

Name:        til23plush
Media type:  image
Num samples: 8064
Persistent:  True
Tags:        []
Sample fields:
    id:           fiftyone.core.fields.ObjectIdField
    filepath:     fiftyone.core.fields.StringField
    tags:         fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:     fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    ground_truth: fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.labels.Detections)
    phash:        fiftyone.core.fields.StringField





In [12]:
# Collapse every ground-truth class into the single label "plushie".
# `set_field` yields a view, so the labels stored in `ds` itself stay untouched.
view = ds.set_field(
    "ground_truth.detections",
    F("detections").map(F().set_field("label", "plushie")),
)

### Export


In [15]:
# Exporter options are documented at:
# https://docs.voxel51.com/api/fiftyone.utils.yolo.html#fiftyone.utils.yolo.YOLOv5DatasetExporter
#
# NOTE: this rebinds `splits`, which the dataset-loading cell also defines
# (there it includes "test"). Re-running that cell after this one would
# therefore import only train/val.
splits = ("train", "val")

# Keyword arguments shared by every per-split export call.
config = {
    "export_dir": "data/til23plushonly",
    "dataset_type": fo.types.YOLOv5Dataset,
    "label_field": "ground_truth",
    "export_media": "symlink",
    "include_path": False,
}

In [16]:
# Export each split separately: keep only the samples tagged with the split
# name, then write them out using the shared exporter settings.
for split in splits:
    view.filter_field("tags", F().contains([split])).export(split=split, **config)

Directory 'data/til23plushonly' already exists; export will be merged with existing files
 100% |███████████████| 5664/5664 [5.1s elapsed, 0s remaining, 1.1K samples/s]       
Directory 'data/til23plushonly' already exists; export will be merged with existing files
 100% |█████████████████| 800/800 [691.1ms elapsed, 0s remaining, 1.2K samples/s]       


### Dupe Detection

In [25]:
# Hash values that occur on more than one image.
counts = [
    hash_value
    for hash_value, occurrences in view.count_values("phash").items()
    if occurrences > 1
]

# Restrict the view to samples carrying one of those repeated hashes, and of
# those keep only the ones tagged "train" or "val".
dupes = (
    view
    .filter_field("phash", F().is_in(counts))
    .filter_field("tags", F().contains(["train", "val"]))
)

### Eval Predicted Labels

In [13]:
# Split whose predictions are evaluated. Note that `split` is also the loop
# variable of the import/export loops, so this cell must run after them.
split = "test"
# Folder holding the predicted label files for that split.
# NOTE(review): presumably one YOLO-format .txt per image written by a prior
# detection run — confirm.
label_dir = f"runs/detect/predict/{split}/labels"

In [14]:
# Select the samples tagged with the split under evaluation.
sview = view.filter_field("tags", F().contains([split]))

# Attach the predicted boxes found in `label_dir` under the "predictions" field;
# the single-entry class list names every prediction "plushie".
fouy.add_yolo_labels(
    sample_collection=sview,
    label_field="predictions",
    labels_path=label_dir,
    classes=["plushie"],
)

# Score predictions against the ground truth and store the run under "eval".
# NOTE(review): the recorded run reports a support of 0.0, which suggests this
# split has no ground-truth boxes to compare against — confirm.
results = sview.evaluate_detections(
    "predictions",
    gt_field="ground_truth",
    eval_key="eval",
)
results.print_report()

Sample field 'eval_tp' does not exist
Sample field 'eval_fp' does not exist
Sample field 'eval_fn' does not exist
Evaluating detections...
 100% |███████████████| 1600/1600 [3.7s elapsed, 0s remaining, 461.7 samples/s]      
              precision    recall  f1-score   support

     plushie       0.00      0.00      0.00       0.0

   micro avg       0.00      0.00      0.00       0.0
   macro avg       0.00      0.00      0.00       0.0
weighted avg       0.00      0.00      0.00       0.0



### Preview


In [None]:
# Put whatever view or dataset below
# (uncomment exactly one alternative; each name is defined by an earlier cell)
# v = ds
# v = view
# v = dupes
v = sview
# Open the FiftyOne App on the selected dataset/view for visual inspection.
fo.launch_app(dataset=v)