In [1]:
# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Install FiftyOne
!pip install fiftyone


Mounted at /content/drive
Collecting fiftyone
  Downloading fiftyone-1.7.1-py3-none-any.whl.metadata (21 kB)
Collecting argcomplete (from fiftyone)
  Downloading argcomplete-3.6.2-py3-none-any.whl.metadata (16 kB)
Collecting async_lru>=2 (from fiftyone)
  Downloading async_lru-2.0.5-py3-none-any.whl.metadata (4.5 kB)
Collecting boto3 (from fiftyone)
  Downloading boto3-1.40.0-py3-none-any.whl.metadata (6.7 kB)
Collecting dacite<1.8.0,>=1.6.0 (from fiftyone)
  Downloading dacite-1.7.0-py3-none-any.whl.metadata (14 kB)
Collecting Deprecated (from fiftyone)
  Downloading Deprecated-1.2.18-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting ftfy (from fiftyone)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting hypercorn>=0.13.2 (from fiftyone)
  Downloading hypercorn-0.17.3-py3-none-any.whl.metadata (5.4 kB)
Collecting kaleido!=0.2.1.post1 (from fiftyone)
  Downloading kaleido-1.0.0-py3-none-any.whl.metadata (5.6 kB)
Collecting mongoengine~=0.29.1 (from fiftyone)
  Downl

In [43]:

# Step 3: Load the datasets
import fiftyone as fo
import fiftyone.types as fot


In [44]:

dataset1_dir = "/content/drive/MyDrive/FoodWasteHackathon_BIBI/Dataset/food_waste_part_1"
dataset2_dir = "/content/drive/MyDrive/FoodWasteHackathon_BIBI/Dataset/FiftyOne_dataset_part2"



In [80]:

# List all loaded datasets
all_datasets = fo.list_datasets()
print("Datasets found:", all_datasets)

# Delete each one
for name in all_datasets:
    print(f"Deleting dataset: {name}")
    fo.delete_dataset(name)

Datasets found: ['loaded_dataset_1', 'loaded_dataset_2']
Deleting dataset: loaded_dataset_1
Deleting dataset: loaded_dataset_2


In [81]:
import fiftyone as fo

# Load from exported directories (FiftyOne format)
dataset1 = fo.Dataset.from_dir(
    dataset_dir=dataset1_dir,
    dataset_type=fo.types.FiftyOneDataset,
    name="loaded_dataset_1"
)

dataset2 = fo.Dataset.from_dir(
    dataset_dir=dataset2_dir,
    dataset_type=fo.types.FiftyOneDataset,
    name="loaded_dataset_2"
)
def print_fields_with_types(dataset, name):
    print(f"\n{name} fields:")
    for field_name, field in dataset.get_field_schema().items():
        print(f"  {field_name}: {type(field)}")

# Print schemas
print_fields_with_types(dataset1, "Dataset 1")
print_fields_with_types(dataset2, "Dataset 2")


Importing samples...


INFO:fiftyone.utils.data.importers:Importing samples...


 100% |█████████████████| 375/375 [68.1ms elapsed, 0s remaining, 5.5K samples/s]      


INFO:eta.core.utils: 100% |█████████████████| 375/375 [68.1ms elapsed, 0s remaining, 5.5K samples/s]      


Importing samples...


INFO:fiftyone.utils.data.importers:Importing samples...


 100% |█████████████████| 255/255 [49.7ms elapsed, 0s remaining, 5.1K samples/s]      


INFO:eta.core.utils: 100% |█████████████████| 255/255 [49.7ms elapsed, 0s remaining, 5.1K samples/s]      



Dataset 1 fields:
  id: <class 'fiftyone.core.fields.ObjectIdField'>
  filepath: <class 'fiftyone.core.fields.StringField'>
  tags: <class 'fiftyone.core.fields.ListField'>
  metadata: <class 'fiftyone.core.fields.EmbeddedDocumentField'>
  created_at: <class 'fiftyone.core.fields.DateTimeField'>
  last_modified_at: <class 'fiftyone.core.fields.DateTimeField'>
  split: <class 'fiftyone.core.fields.StringField'>
  bonid: <class 'fiftyone.core.fields.IntField'>
  bon_id: <class 'fiftyone.core.fields.ListField'>
  article_number: <class 'fiftyone.core.fields.ListField'>
  ingredient_name: <class 'fiftyone.core.fields.ListField'>
  piece_article: <class 'fiftyone.core.fields.ListField'>
  number_of_portions: <class 'fiftyone.core.fields.ListField'>
  weight_per_portion: <class 'fiftyone.core.fields.ListField'>
  weight_per_plate: <class 'fiftyone.core.fields.ListField'>
  kcal_per_plate: <class 'fiftyone.core.fields.ListField'>
  kj_per_plate: <class 'fiftyone.core.fields.ListField'>
  fat

In [82]:
for sample in dataset1.select_fields("kj_before").take(9):
    print(repr(sample.kj_before ))


'1.603,440'
'1.603,440'
'1.363,620'
'1.745,200'
'1.374,62'
'1.475,600'
'1.363,620'
'1.603,440'
'1.779,680'


In [83]:
import re

def euro_str_to_float(s):
    """
    Converts strings like '1.592,50' to float 1592.50
    Handles empty/None gracefully.
    """
    if s is None or (isinstance(s, str) and s.strip() == ""):
        return None
    if isinstance(s, (int, float)):
        return float(s)
    s = str(s).strip()
    # Remove thousand separators (dots), replace decimal comma with dot
    s = s.replace('.', '').replace(',', '.')
    try:
        return float(s)
    except Exception:
        return None


In [79]:
# 1. Make sure first assignment is a float (not None)
for sample in dataset1:
    val = sample.kcal_before
    float_val = euro_str_to_float(val)
    if float_val is not None:
        sample["kcal_before_float"] = float_val
        sample.save()
        break  # Only do this for the first convertible sample

# 2. Now do the rest (it's safe to assign None after the type is set)
for sample in dataset1:
    val = sample.kcal_before
    sample["kcal_before_float"] = euro_str_to_float(val)
    sample.save()


In [69]:
dataset1.delete_sample_field("kcal_before")
dataset1.rename_sample_field("kcal_before_float", "kcal_before")


In [84]:
fields_to_convert = {
    "kcal_before": float,
    "kj_before": float,
    "fat_before": float,
    "saturated_fat_before": float,
    "carbohydrates_before": float,
    "sugar_before": float,
    "protein_before": float,
    "salt_before": float,
    "kcal_after": int,
    "kj_after": int,
    "fat_after": int,
    "saturated_fat_after": int,
    "carbohydrates_after": int,
    "sugar_after": int,
    "protein_after": int,
    "salt_after": int,
}

def euro_str_to_float(s):
    """
    Converts European-formatted numbers as strings (e.g., '1.592,50') to float 1592.50.
    Handles None, empty, int, float, etc.
    """
    if s is None or (isinstance(s, str) and s.strip() == ""):
        return None
    if isinstance(s, (int, float)):
        return float(s)
    s = str(s).strip()
    # Remove thousand separators (dots), replace decimal comma with dot
    s = s.replace('.', '').replace(',', '.')
    try:
        return float(s)
    except Exception:
        return None

def safe_cast(val, dtype):
    num = euro_str_to_float(val)
    if num is None:
        return None
    if dtype is int:
        return int(round(num))
    return num  # float

for field, dtype in fields_to_convert.items():
    print(f"Converting {field} to {dtype}")
    temp_field = f"{field}_tmp"

    # 1. Ensure first valid (not None) value is assigned
    for sample in dataset1:
        val = sample[field]
        cast_val = safe_cast(val, dtype)
        if cast_val is not None:
            sample[temp_field] = cast_val
            sample.save()
            break  # Only need the first valid one

    # 2. Now assign to the rest (it's safe now, even for None)
    for sample in dataset1:
        val = sample[field]
        sample[temp_field] = safe_cast(val, dtype)
        sample.save()

    # 3. Delete the original field
    dataset1.delete_sample_field(field)
    # 4. Rename temp to original
    dataset1.rename_sample_field(temp_field, field)

print("All mismatched fields converted and renamed!")


Converting kcal_before to <class 'float'>
Converting kj_before to <class 'float'>
Converting fat_before to <class 'float'>
Converting saturated_fat_before to <class 'float'>
Converting carbohydrates_before to <class 'float'>
Converting sugar_before to <class 'float'>
Converting protein_before to <class 'float'>
Converting salt_before to <class 'float'>
Converting kcal_after to <class 'int'>
Converting kj_after to <class 'int'>
Converting fat_after to <class 'int'>
Converting saturated_fat_after to <class 'int'>
Converting carbohydrates_after to <class 'int'>
Converting sugar_after to <class 'int'>
Converting protein_after to <class 'int'>
Converting salt_after to <class 'int'>
All mismatched fields converted and renamed!


## list fields dtyp

In [89]:
import fiftyone as fo

def print_fields_with_types_and_list_element_types(dataset, name, num_elements=3):
    print(f"\n{name} fields:")
    schema = dataset.get_field_schema()
    sample = next(iter(dataset), None)
    for field_name, field in schema.items():
        field_type = type(field)
        print(f"  {field_name}: {field_type}", end="")
        if field_type.__name__ == "ListField":
            elem_types = set()
            if sample is not None:
                lst = getattr(sample, field_name, None)
                if isinstance(lst, list) and lst:
                    elem_types = {type(x) for x in lst[:num_elements]}
            print(f"  --> element types (sample): {', '.join([t.__name__ for t in elem_types])}")
        else:
            print()

# Print schemas with element types
print_fields_with_types_and_list_element_types(dataset1, "Dataset 1")
print_fields_with_types_and_list_element_types(dataset2, "Dataset 2")



Dataset 1 fields:
  id: <class 'fiftyone.core.fields.ObjectIdField'>
  filepath: <class 'fiftyone.core.fields.StringField'>
  tags: <class 'fiftyone.core.fields.ListField'>  --> element types (sample): 
  metadata: <class 'fiftyone.core.fields.EmbeddedDocumentField'>
  created_at: <class 'fiftyone.core.fields.DateTimeField'>
  last_modified_at: <class 'fiftyone.core.fields.DateTimeField'>
  split: <class 'fiftyone.core.fields.StringField'>
  bonid: <class 'fiftyone.core.fields.IntField'>
  bon_id: <class 'fiftyone.core.fields.ListField'>  --> element types (sample): int
  article_number: <class 'fiftyone.core.fields.ListField'>  --> element types (sample): int
  ingredient_name: <class 'fiftyone.core.fields.ListField'>  --> element types (sample): str
  piece_article: <class 'fiftyone.core.fields.ListField'>  --> element types (sample): str
  number_of_portions: <class 'fiftyone.core.fields.ListField'>  --> element types (sample): int
  weight_per_portion: <class 'fiftyone.core.fields

In [90]:
# Inspect first few 'kcal_per_plate' values
for sample in dataset1.take(5):
    print(sample.kcal_per_plate)


['56,050', '36,400', '18,800', '127,040', '9,150', '29,400', '82,600']
['129,960', '36,400', '18,800', '42,000', '90,090']
['129.96', '28', '3.5', '31.5', '204.3']
['98,4', '18,2', '85', '144,72', '11,1']
['56,050', '36,400', '18,800', '82,600', '90,090']


In [91]:
import re

def euro_or_us_str_to_float(x):
    if x is None or (isinstance(x, str) and x.strip() == ''):
        return None
    if isinstance(x, (int, float)):
        return float(x)
    x = x.strip()
    # If both comma and dot: dot is thousands, comma is decimal
    if ',' in x and '.' in x:
        x = x.replace('.', '').replace(',', '.')
    # Only comma: treat comma as decimal
    elif ',' in x:
        x = x.replace(',', '.')
    # Only dot: treat dot as decimal
    # else: already standard
    try:
        return float(x)
    except Exception:
        return None


In [92]:
field = 'kcal_per_plate'
temp_field = f"{field}_tmp"

# 1. Set first value for schema inference
for sample in dataset1:
    vals = getattr(sample, field, None)
    if isinstance(vals, list) and any(v not in (None, '', []) for v in vals):
        sample[temp_field] = [euro_or_us_str_to_float(v) for v in vals]
        sample.save()
        break

# 2. Assign for all samples
for sample in dataset1:
    vals = getattr(sample, field, None)
    if isinstance(vals, list):
        sample[temp_field] = [euro_or_us_str_to_float(v) for v in vals]
        sample.save()

# 3. Delete old field and rename tmp to original
dataset1.delete_sample_field(field)
dataset1.rename_sample_field(temp_field, field)


In [93]:
for sample in dataset1.take(5):
    print(sample.kcal_per_plate)


[68.4, 64.0, 186.3, 33.6]
[129.96, 24.5, 82.6, 172.92]
[56.05, 64.0, 82.6, 172.92]
[173.6, 251.2]
[129.96, 45.6, 42.0, 109.2]


In [94]:
import fiftyone as fo

# List all fields you want to harmonize to lists of floats
fields_to_convert = [
    "weight_per_portion",
    "weight_per_plate",
    "kcal_per_plate",
    "kj_per_plate",
    "fat_per_plate",
    "saturated_fat_per_plate",
    "carbohydrates_per_plate",
    "sugar_per_plate",
    "protein_per_plate",
    "salt_per_plate",
    "return_quantity",
    "return_percentage",
]

def euro_or_us_str_to_float(x):
    if x is None or (isinstance(x, str) and x.strip() == ''):
        return None
    if isinstance(x, (int, float)):
        return float(x)
    x = x.strip()
    # If both comma and dot: dot is thousands, comma is decimal
    if ',' in x and '.' in x:
        x = x.replace('.', '').replace(',', '.')
    elif ',' in x:
        x = x.replace(',', '.')
    # If only dot: already standard float
    try:
        return float(x)
    except Exception:
        return None

def convert_list_field_to_floats(dataset, field):
    tmp_field = field + "_tmp"
    # 1. Set the first (non-None) value for schema inference
    for sample in dataset:
        vals = getattr(sample, field, None)
        if isinstance(vals, list) and any(v not in (None, '', []) for v in vals):
            sample[tmp_field] = [euro_or_us_str_to_float(v) for v in vals]
            sample.save()
            break
    # 2. Assign for all samples
    for sample in dataset:
        vals = getattr(sample, field, None)
        if isinstance(vals, list):
            sample[tmp_field] = [euro_or_us_str_to_float(v) for v in vals]
            sample.save()
    # 3. Delete old field and rename tmp to original
    dataset.delete_sample_field(field)
    dataset.rename_sample_field(tmp_field, field)

# Run batch conversion on all relevant fields
for field in fields_to_convert:
    print(f"Converting field: {field}")
    convert_list_field_to_floats(dataset1, field)

print("All specified list fields in dataset1 are now lists of floats!")


Converting field: weight_per_portion
Converting field: weight_per_plate
Converting field: kcal_per_plate
Converting field: kj_per_plate
Converting field: fat_per_plate
Converting field: saturated_fat_per_plate
Converting field: carbohydrates_per_plate
Converting field: sugar_per_plate
Converting field: protein_per_plate
Converting field: salt_per_plate
Converting field: return_quantity
Converting field: return_percentage
All specified list fields in dataset1 are now lists of floats!


In [100]:
existing_paths = set(sample.filepath for sample in dataset1)
num_added = 0
for sample in dataset2:
    if sample.filepath in existing_paths:
        continue
    new_sample = sample.copy()
    new_sample.id = None
    dataset1.add_sample(new_sample)
    num_added += 1

print(f"Added {num_added} samples from dataset2 to dataset1.")


Added 255 samples from dataset2 to dataset1.


In [102]:
dataset1.name = "merged_food_dataset"
dataset1.save()
print("Renamed dataset to:", dataset1.name)


Renamed dataset to: merged_food_dataset


In [103]:
output_dir = "/content/drive/MyDrive/FoodWasteHackathon_BIBI/Dataset/food_waste_part_nerged"

dataset1.export(
    export_dir=output_dir,
    dataset_type=fo.types.FiftyOneDataset ,
    export_media=True
)
print(f"Dataset exported to: {output_dir}")


Exporting samples...


INFO:fiftyone.utils.data.exporters:Exporting samples...


 100% |████████████████████| 630/630 [7.9m elapsed, 0s remaining, 1.9 docs/s]      


INFO:eta.core.utils: 100% |████████████████████| 630/630 [7.9m elapsed, 0s remaining, 1.9 docs/s]      


Dataset exported to: /content/drive/MyDrive/FoodWasteHackathon_BIBI/Dataset/food_waste_part_nerged


## Upload to hugging face

In [104]:
# 1. Install requirements (if not already installed)
!pip install huggingface_hub --quiet


In [105]:
from google.colab import userdata
hf_token=userdata.get('HF_token')

In [112]:

# 2. Set up your Hugging Face token (assumes `keys` dict with 'hf_token')
from huggingface_hub import HfApi



# 4. Import FiftyOne and load your dataset
import fiftyone as fo

# (Assuming your dataset is already loaded/merged into `dataset1`)
# If not, load it here:
# dataset1 = fo.Dataset.from_dir(...)

# 5. Push to Hugging Face with push_to_hub
from fiftyone.utils.huggingface import push_to_hub

# Specify your Hugging Face repo: "username/dataset-name"
repo_id = "food-waste-dataset5"  # CHANGE this to your username/repo

push_to_hub(
    dataset1,
    repo_id,
    license="mit",      # or any SPDX license, e.g., "cc-by-4.0"
    exist_ok=True,      # allow updating the dataset if it already exists
    chunk_size=100,     # tune this if you have a very large dataset
    token=hf_token      # use your programmatic token
)

print(f"Dataset uploaded to: https://huggingface.co/datasets/{repo_id}")


Directory '/tmp/tmp13z6qbi9' already exists; export will be merged with existing files




Exporting samples...


INFO:fiftyone.utils.data.exporters:Exporting samples...


 100% |████████████████████| 630/630 [9.7s elapsed, 0s remaining, 78.1 docs/s]       


INFO:eta.core.utils: 100% |████████████████████| 630/630 [9.7s elapsed, 0s remaining, 78.1 docs/s]       
Uploading media files in 14 batches of size 100: 100%|██████████| 14/14 [01:49<00:00,  7.82s/it]


RepositoryNotFoundError: 401 Client Error. (Request ID: Root=1-688cc9c8-20e53e536415628549434e6b;33d05bc6-4ffe-4fd6-ba71-072e99c3b880)

Repository Not Found for url: https://huggingface.co/api/datasets/shahabdaiani/food-waste-dataset5/preupload/main.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated. For more details, see https://huggingface.co/docs/huggingface_hub/authentication
Invalid username or password.
Note: Creating a commit assumes that the repo already exists on the Huggingface Hub. Please use `create_repo` if it's not the case.

In [122]:
from fiftyone.utils.huggingface import load_from_hub
recovered_dataset = load_from_hub('FoodWasteProjectBIBI/food_waste_merged_fiftyOneDS')

Downloading config file fiftyone.yml from FoodWasteProjectBIBI/food_waste_merged_fiftyOneDS


INFO:fiftyone.utils.huggingface:Downloading config file fiftyone.yml from FoodWasteProjectBIBI/food_waste_merged_fiftyOneDS


Loading dataset


INFO:fiftyone.utils.huggingface:Loading dataset


ValueError: Dataset name 'FoodWasteProjectBIBI/food_waste_merged_fiftyOneDS' is not available

In [None]:
recovered_dataset