In [1]:
import os
import pandas as pd
import numpy as np
from tensorflow.keras.utils import load_img, img_to_array
from sklearn.model_selection import train_test_split

In [2]:
# Load the nutritional value CSV
nutritional_data = pd.read_csv("nutritional_values.csv")

# Image filenames are built from the 'image_id' column, so fail early with a
# clear message if the CSV does not provide it.
if "image_id" not in nutritional_data.columns:
    raise KeyError("nutritional_values.csv must contain an 'image_id' column")

# Preview the first rows to sanity-check the schema
print(nutritional_data.head())

   image_id  type                                           taxonomy  \
0  Ajwa_001  Ajwa  Kingdom - Plantae, Order - Arecales, Family - ...   
1  Ajwa_002  Ajwa  Kingdom - Plantae, Order - Arecales, Family - ...   
2  Ajwa_003  Ajwa  Kingdom - Plantae, Order - Arecales, Family - ...   
3  Ajwa_004  Ajwa  Kingdom - Plantae, Order - Arecales, Family - ...   
4  Ajwa_005  Ajwa  Kingdom - Plantae, Order - Arecales, Family - ...   

                                    vernacular_names  calories  carbohydrate  \
0  Arabic-Nakhleh, English-Date friut, French-Dat...        23         74.97   
1  Arabic-Nakhleh, English-Date friut, French-Dat...        25         78.39   
2  Arabic-Nakhleh, English-Date friut, French-Dat...        24         75.41   
3  Arabic-Nakhleh, English-Date friut, French-Dat...        23         75.67   
4  Arabic-Nakhleh, English-Date friut, French-Dat...        25         77.63   

   proteins  total_fat  cholesterol  dietary_fiber  ...  Magnesium (mg)  \
0      1.81

In [3]:
# Image-processing configuration
IMAGE_SIDE = 224  # square edge length, in pixels, that every image is resized to

# Directory that holds the image files
image_folder = "Dates"

# (height, width) handed to the image loader; ResNet/EfficientNet input size
target_size = (IMAGE_SIDE, IMAGE_SIDE)

In [4]:
# Load and preprocess images
# Columns used as regression targets, in the order they appear in each label vector.
LABEL_COLUMNS = ['calories', 'proteins', 'total_fat',
                 'glucose', 'cholesterol', 'water', 'Energy (Kcal)']

images = []
labels = []
missing_images = []  # paths of rows whose image file was not found

# Extract the targets once as a numeric matrix. Going through iterrows() would
# hand back object-dtype rows (the frame mixes text and numeric columns), which
# yields an object-dtype label array downstream. This raises immediately if any
# target column is non-numeric.
label_matrix = nutritional_data[LABEL_COLUMNS].to_numpy(dtype=np.float32)

for image_id, label_row in zip(nutritional_data['image_id'], label_matrix):
    image_path = os.path.join(image_folder, f"{image_id}.jpg")  # Add .jpg extension

    # Rows without a matching image are skipped, but recorded so the loss is visible
    if not os.path.exists(image_path):
        missing_images.append(image_path)
        continue

    # Load image, resize to target_size, and normalize pixel values to [0, 1]
    img = load_img(image_path, target_size=target_size)
    images.append(img_to_array(img) / 255.0)

    # Add labels (nutritional values) for the image just loaded
    labels.append(label_row)

if missing_images:
    print(f"Skipped {len(missing_images)} row(s) with no image file, e.g. {missing_images[:5]}")


In [5]:
# Convert to NumPy arrays
# Guard against a silent failure upstream: an empty or mismatched dataset would
# otherwise only surface later as an obscure error in the split or in training.
if len(images) == 0:
    raise ValueError("No images were loaded; check image_folder and the image_id values")
if len(images) != len(labels):
    raise ValueError(f"Got {len(images)} images but {len(labels)} label rows")

# Force a numeric dtype. Label rows taken from a mixed-type DataFrame row are
# object-dtype, and an object array cannot be fed to a model or saved/loaded
# without pickling.
images = np.asarray(images, dtype=np.float32)
labels = np.asarray(labels, dtype=np.float32)

print(f"Images shape: {images.shape}, Labels shape: {labels.shape}")

Images shape: (2707, 224, 224, 3), Labels shape: (2707, 7)


In [6]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    images,
    labels,
    test_size=0.2,
    random_state=42,
)

# Report the shapes of both partitions
for split_name, split_X, split_y in (
    ("Training", X_train, y_train),
    ("Testing", X_test, y_test),
):
    print(f"{split_name} set: {split_X.shape}, {split_y.shape}")

Training set: (2165, 224, 224, 3), (2165, 7)
Testing set: (542, 224, 224, 3), (542, 7)


In [7]:
# Save to .npy files
# Labels are cast to float32 before saving: if they are still object-dtype,
# np.save would pickle them and a plain np.load(...) would refuse to read the
# file back. allow_pickle=False makes any remaining non-numeric array fail here
# rather than at load time.
np.save("images_preprocessed.npy", images, allow_pickle=False)
np.save("nutritional_values.npy", np.asarray(labels, dtype=np.float32), allow_pickle=False)

In [8]:
# images = np.load("images_preprocessed.npy")
# labels = np.load("nutritional_values.npy")