In [5]:
from datasets import load_dataset

# Load the Open Food Facts product database; the recorded output below shows
# a DatasetDict with a `food` and a `beauty` split.
repo_id = 'openfoodfacts/product-database'
dataset = load_dataset(repo_id)
print(dataset)

Generating food split: 3858359 examples [01:42, 37477.03 examples/s] 
Generating beauty split: 60124 examples [00:00, 122900.39 examples/s]


DatasetDict({
    food: Dataset({
        num_rows: 3858359
    })
    beauty: Dataset({
        num_rows: 60124
    })
})


In [31]:
# Print each column name of the `food` split next to its position.
# Iterating `features` yields the column names, so enumerate() pairs
# a running index with a name (not a key/value pair).
food_feature_names = dataset['food'].features
for position, feature_name in enumerate(food_feature_names):
    print(f"{position}: {feature_name}")

0: additives_n
1: additives_tags
2: allergens_tags
3: brands_tags
4: brands
5: categories
6: categories_tags
7: checkers_tags
8: ciqual_food_name_tags
9: cities_tags
10: code
11: compared_to_category
12: complete
13: completeness
14: correctors_tags
15: countries_tags
16: created_t
17: creator
18: data_quality_errors_tags
19: data_quality_info_tags
21: data_sources_tags
22: ecoscore_data
23: ecoscore_grade
24: ecoscore_score
25: ecoscore_tags
26: editors
27: emb_codes_tags
28: emb_codes
29: entry_dates_tags
30: food_groups_tags
31: generic_name
32: images
33: informers_tags
34: ingredients_analysis_tags
35: ingredients_from_palm_oil_n
36: ingredients_n
37: ingredients_original_tags
38: ingredients_percent_analysis
39: ingredients_tags
40: ingredients_text
41: ingredients_with_specified_percent_n
42: ingredients_with_unspecified_percent_n
43: ingredients_without_ciqual_codes_n
44: ingredients_without_ciqual_codes
45: ingredients
46: known_ingredients_n
47: labels_tags
48: labels
49: lan

In [35]:
# Show the `images` field of the first row of the `food` split; the recorded
# output shows a list of per-image dicts (key, imgid, rev, sizes, ...).
first_food_row = dataset['food'][0]
first_food_row['images']

[{'key': 'front_fr',
  'imgid': 1,
  'rev': 4,
  'sizes': {'100': {'h': 100, 'w': 75},
   '200': {'h': 200, 'w': 150},
   '400': {'h': 400, 'w': 300},
   'full': {'h': 1200, 'w': 901}},
  'uploaded_t': None,
  'uploader': None},
 {'key': '2',
  'imgid': None,
  'rev': None,
  'sizes': {'100': {'h': 75, 'w': 100},
   '200': None,
   '400': {'h': 300, 'w': 400},
   'full': {'h': 1200, 'w': 1599}},
  'uploaded_t': 1519297021,
  'uploader': 'kiliweb'},
 {'key': '1',
  'imgid': None,
  'rev': None,
  'sizes': {'100': {'h': 100, 'w': 75},
   '200': None,
   '400': {'h': 400, 'w': 300},
   'full': {'h': 1200, 'w': 901}},
  'uploaded_t': 1519297019,
  'uploader': 'kiliweb'},
 {'key': 'nutrition_fr',
  'imgid': 3,
  'rev': 15,
  'sizes': {'100': {'h': 94, 'w': 100},
   '200': {'h': 188, 'w': 200},
   '400': {'h': 376, 'w': 400},
   'full': {'h': 1200, 'w': 1278}},
  'uploaded_t': None,
  'uploader': None},
 {'key': 'ingredients_fr',
  'imgid': 2,
  'rev': 10,
  'sizes': {'100': {'h': 50, 'w': 1

In [33]:
import os
import random
from typing import Dict, List, Any, Optional

class MockDataset:
    """
    Thin wrapper that exposes an indexable collection through ``len()`` and
    ``[]`` and advertises a fixed set of feature names.

    The wrapped object is kept as ``self.data``; every access is delegated
    to it unchanged.
    """

    def __init__(self, data):
        self.data = data
        # Each advertised feature maps to its own empty placeholder dict.
        feature_names = (
            "images",
            "nutriments",
            "product_name",
            "nutriscore_grade",
            "ecoscore_grade",
            "code",
            "quantity",
        )
        self.features = {feature_name: {} for feature_name in feature_names}

    def __len__(self):
        """Return the number of items in the wrapped collection."""
        wrapped = self.data
        return len(wrapped)

    def __getitem__(self, idx):
        """Return ``self.data[idx]``; any lookup error propagates as-is."""
        wrapped = self.data
        return wrapped[idx]
    
# Wrap the `food` split in MockDataset; this wrapper is what the example-usage
# cell passes to prepare_simple_nutritional_dataset.
food_split = dataset['food']
food_dataset = MockDataset(food_split)


In [37]:

class SimpleNutritionalConfig:
    """
    Minimal configuration for extracting nutritional facts.

    Attributes:
        output_dir: Directory name stored on the config as given.
        nutritional_fields: Names of the nutritional fields of interest.
        split_ratio: Fraction of entries assigned to the training split.
        seed: Seed used to make shuffling reproducible.
    """

    # Kept as an immutable tuple so the defaults can never be modified through
    # an instance; every config gets its own list copy of these values.
    DEFAULT_NUTRITIONAL_FIELDS = ('energy-kcal_100g', 'fat_100g', 'proteins_100g')

    def __init__(self,
                 output_dir: str = "simple_nutri_data",
                 nutritional_fields: Optional[List[str]] = None,
                 split_ratio: float = 0.8, # Simple train/test split
                 seed: int = 42
                ):
        """
        Args:
            output_dir: Stored unchanged on the instance.
            nutritional_fields: Field names to use. ``None`` (the default)
                selects a fresh list containing DEFAULT_NUTRITIONAL_FIELDS.
                A list passed by the caller is stored as-is (not copied).
            split_ratio: Stored unchanged on the instance.
            seed: Stored unchanged on the instance.
        """
        self.output_dir = output_dir
        # A list literal as the default argument would be created once and
        # shared by every instance, so mutating one config's fields would
        # silently change all others. Build a new list per instance instead.
        if nutritional_fields is None:
            nutritional_fields = list(self.DEFAULT_NUTRITIONAL_FIELDS)
        self.nutritional_fields = nutritional_fields
        self.split_ratio = split_ratio
        self.seed = seed

class NutritionalFactEntry:
    """
    One record for nutritional fact extraction (simplified).

    Holds a product code, a product name, an image reference and the
    ground-truth nutritional values. All four constructor arguments are
    stored unchanged; no validation or conversion is performed.

    NOTE(review): the annotations say ``str`` / ``Dict[str, float]``, but the
    recorded notebook output shows lists being stored for the name, image and
    ground-truth fields; confirm which shape is intended.
    """

    def __init__(self,
                 code: str,
                 product_name: str,
                 image_url: str,
                 ground_truth: Dict[str, float]):
        # Identification of the product.
        self.code, self.product_name = code, product_name
        # Image reference and the expected nutritional values.
        self.image_url, self.ground_truth = image_url, ground_truth

    def __repr__(self):
        """Return a single-line, constructor-like description of the entry."""
        identity_part = f"NutritionalFactEntry(code='{self.code}', product='{self.product_name}', "
        payload_part = f"image_url='{self.image_url}', ground_truth={self.ground_truth})"
        return identity_part + payload_part

def prepare_simple_nutritional_dataset(
    dataset: Any,
    config: SimpleNutritionalConfig,
    max_entries: Optional[int] = 1000
) -> Dict[str, List[NutritionalFactEntry]]:
    """
    Prepares a simplified dataset for nutritional fact extraction.

    Converts the leading rows of ``dataset`` into NutritionalFactEntry
    objects, shuffles them reproducibly and splits them into train/test.

    Args:
        dataset: Sized iterable (supports ``len()`` and iteration) whose rows
            support ``.get(key, default)``.
        config: Provides ``seed`` (shuffle order) and ``split_ratio`` (share
            of entries placed in the training split). The other config
            attributes are not read by this function.
        max_entries: Maximum number of rows to convert, counted from the
            start of ``dataset``. ``None`` converts every row. Defaults to
            1000, the cap that used to be hard-coded in the loop.

    Returns:
        A dict with the keys ``'train'`` and ``'test'``, each mapping to a
        list of NutritionalFactEntry objects. The training list holds the
        first ``int(n * config.split_ratio)`` shuffled entries.

    Raises:
        ValueError: If ``max_entries`` is negative.
    """
    if max_entries is not None and max_entries < 0:
        raise ValueError(f"max_entries must be non-negative or None, got {max_entries}")

    # Use a private generator instead of reseeding the module-level one:
    # the shuffle order for a given seed is the same, but unrelated code that
    # uses `random` is no longer affected by calling this function.
    rng = random.Random(config.seed)
    all_entries: List[NutritionalFactEntry] = []

    # Report the number of rows that will actually be visited, not just the
    # size of the whole dataset.
    total_rows = len(dataset)
    planned_rows = total_rows if max_entries is None else min(max_entries, total_rows)
    print(f"Processing up to {planned_rows} of {total_rows} entries...")

    for i, item in enumerate(dataset):

        if max_entries is not None and i >= max_entries:
            break

        # NOTE(review): this stores whatever is under 'images' (the recorded
        # output shows a list of image-metadata dicts), not a URL string.
        image_url = item.get('images'," sample url ")

        # NOTE(review): the full 'nutriments' value is kept; it is not
        # filtered by config.nutritional_fields. Confirm whether filtering
        # is intended.
        ground_truth_nutrients = item.get('nutriments', {})

        # Rows without a code get a positional placeholder so entries stay
        # distinguishable.
        entry = NutritionalFactEntry(
            code=item.get('code', f"no_code_{i}"),
            product_name=item.get('product_name', 'Unknown Product'),
            image_url=image_url,
            ground_truth=ground_truth_nutrients
        )
        all_entries.append(entry)

    print(f"Found {len(all_entries)} valid entries for processing.")

    # Simple data splitting (train/test): shuffle once, then cut at the ratio.
    rng.shuffle(all_entries)
    train_size = int(len(all_entries) * config.split_ratio)

    splits = {
        'train': all_entries[:train_size],
        'test': all_entries[train_size:]
    }

    print(f"Dataset split: Train={len(splits['train'])}, Test={len(splits['test'])}")

    return splits

# --- Example Usage ---
if __name__ == "__main__":
    # Number of entries echoed per split. The sections below are labelled
    # "Sample", so print a short preview instead of every entry (each repr
    # embeds the full image and nutriment data and is very long).
    preview_count = 3

    # Define the simplified configuration.
    simple_config = SimpleNutritionalConfig(
        output_dir="my_nutri_dataset",
        nutritional_fields=['energy-kcal_100g', 'fat_100g'],
        split_ratio=0.7 # 70% train, 30% test
    )

    # Prepare the dataset splits.
    prepared_data = prepare_simple_nutritional_dataset(food_dataset, simple_config)

    # Access the splits.
    train_set = prepared_data['train']
    test_set = prepared_data['test']

    print("\n--- Sample from Training Data ---")
    for entry in train_set[:preview_count]:
        print(entry)
        print("-" * 20)

    print("\n--- Sample from Test Data ---")
    for entry in test_set[:preview_count]:
        print(entry)
        print("-" * 20)


Processing 3858359 simulated entries...
Found 1000 valid entries for processing.
Dataset split: Train=700, Test=300

--- Sample from Training Data ---
NutritionalFactEntry(code='0009542018276', product='[{'lang': 'main', 'text': 'Milk Chocolate Elf'}, {'lang': 'en', 'text': 'Milk Chocolate Elf'}]', image_url='[{'key': 'front_en', 'imgid': 1, 'rev': 5, 'sizes': {'100': {'h': 36, 'w': 100}, '200': {'h': 71, 'w': 200}, '400': {'h': 143, 'w': 400}, 'full': {'h': 344, 'w': 963}}, 'uploaded_t': None, 'uploader': None}, {'key': '1', 'imgid': None, 'rev': None, 'sizes': {'100': {'h': 36, 'w': 100}, '200': None, '400': {'h': 143, 'w': 400}, 'full': {'h': 344, 'w': 963}}, 'uploaded_t': 1670988405, 'uploader': 'kiliweb'}]', ground_truth=[{'name': 'vitamin-c', 'value': 0.0, '100g': 0.0, 'serving': 0.0, 'unit': 'mg', 'prepared_value': None, 'prepared_100g': None, 'prepared_serving': None, 'prepared_unit': None}, {'name': 'cholesterol', 'value': 20.0, '100g': 0.05000000074505806, 'serving': 0.019999