In [125]:
!pip install --quiet fiftyone

In [126]:
import fiftyone as fo
from fiftyone import ViewField as F

In [127]:
dataset_name = 'lego-classification'
dataset = fo.load_dataset(dataset_name)

In [128]:
count_of_samples = 20
percent_of_training_data = 0.82
count_of_training_samples = int(count_of_samples * percent_of_training_data)
count_of_validation_samples = count_of_samples - count_of_training_samples

print("Count of training samples: ", count_of_training_samples)
print("Count of validation_samples: ", count_of_validation_samples)

Count of training samples:  16
Count of validation_samples:  4


In [129]:
labels = list(dataset.count_values("ground_truth.detections.label").keys())
print("Count of different lego bricks types: ", len(labels))

Count of different lego bricks types:  574


In [130]:
first_brick_type = dataset.filter_labels(
    "ground_truth", F("label").is_in([labels[0]])
).take(count_of_samples)

In [131]:
training_data = first_brick_type[0:count_of_training_samples].clone()
validation_data = first_brick_type[count_of_training_samples:].clone()

In [132]:
len(training_data) + len(validation_data) == count_of_samples


True

In [138]:
with fo.ProgressBar() as pb:
    for label in pb(labels[1:]):
        samples_per_label = dataset.filter_labels(
            "ground_truth", F("label").is_in([label])
        ).take(count_of_samples)

        training_data.merge_samples(samples_per_label[0:count_of_training_samples])
        validation_data.merge_samples(samples_per_label[count_of_training_samples:])


 100% |█████████████████| 573/573 [8.9m elapsed, 0s remaining, 1.0 samples/s]      


In [143]:
if not (len(training_data) == len(labels) * count_of_training_samples):
    print("Wrong size of training data:" )
    print(len(training_data))
    print(len(labels) * count_of_training_samples)

len(training_data) / len(labels)

Wrong size of training data:
9043
9184


15.754355400696864

In [141]:
if not (len(validation_data) == len(labels) * count_of_validation_samples):
    print("Wrong size of training data:" )
    print(len(validation_data))
    print(len(labels) * count_of_validation_samples)

Wrong size of training data:
2301
2296


In [154]:
training_data.export(labels_path="data/train_small.json", dataset_type=fo.types.COCODetectionDataset, label_field='ground_truth')

 100% |███████████████| 9043/9043 [6.4s elapsed, 0s remaining, 1.5K samples/s]        


In [155]:
validation_data.export(labels_path="data/valid_small.json", dataset_type=fo.types.COCODetectionDataset, label_field='ground_truth')

 100% |███████████████| 2301/2301 [1.4s elapsed, 0s remaining, 1.7K samples/s]         
