<a href="https://colab.research.google.com/github/DerekHertz/food-freshness-qc/blob/working-branch/food_freshness.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import kagglehub
from pathlib import Path
import tensorflow as tf
import numpy as np
from sklearn.metrics import f1_score,confusion_matrix

from tensorflow.keras.layers import Rescaling
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall

In [5]:
dataset_path = kagglehub.dataset_download("ulnnproject/food-freshness-dataset")
ROOT_DIR = Path(dataset_path) / "Dataset"

Downloading from https://www.kaggle.com/api/v1/datasets/download/ulnnproject/food-freshness-dataset?dataset_version_number=1...


100%|██████████| 5.89G/5.89G [02:36<00:00, 40.4MB/s]

Extracting files...





In [6]:
# input parameters for MobileNetV2
IMG_SIZE = (224, 224)
BATCH_SIZE = 32


In [20]:
def commit_and_push(message, branch='working-branch'):
    """Commit changes and push to GitHub"""
    !git add .
    !git status
    !git commit -m "{message}"
    !git push https://{os.environ['GITHUB_TOKEN']}@github.com/YOUR_USERNAME/YOUR_REPO.git {branch}
    print("successfully pushed!.")

In [8]:
def inspect_nested_dataset(root_dir):
    """Inspect the nested dataset structure"""
    print(f"Dataset root: {root_dir}\n")

    for class_dir in root_dir.iterdir():
        if class_dir.is_dir():
            print(f"Class: {class_dir.name}")
            subdirs = [d for d in class_dir.iterdir() if d.is_dir()]
            print(f"  Subdirectories: {len(subdirs)}")

            total_images = 0
            for subdir in subdirs:
                image_count = len(list(subdir.glob('*.jpg')) +
                                list(subdir.glob('*.jpeg')) +
                                list(subdir.glob('*.png')))
                total_images += image_count
                print(f"    - {subdir.name}: {image_count} images")

            print(f"  Total images in {class_dir.name}: {total_images}\n")



In [9]:
def load_nested_dataset(root_dir, img_size, batch_size, validation_split=0.2):
    """Load dataset from nested directory structure"""

    # Collect all image paths with labels
    all_files = []
    all_labels = []

    # Note: Using 'Fresh' and 'Rotten' with capital letters as shown in inspection
    class_mapping = {'Fresh': 0, 'Rotten': 1}

    # Valid image extensions
    valid_extensions = {'.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG',
                       '.bmp', '.BMP', '.gif', '.GIF'}

    stats = {}

    for class_name, label_value in class_mapping.items():
        class_dir = root_dir / class_name

        if not class_dir.exists():
            print(f"{class_dir} does not exist")
            continue

        class_files = []

        # Get all images from all subdirectories
        for food_type_dir in sorted(class_dir.iterdir()):
            if not food_type_dir.is_dir():
              continue

            food_files = []

            # Get all files in this subdirectory
            for file_path in food_type_dir.iterdir():
                if file_path.is_file() and file_path.suffix in valid_extensions:
                    # Additional check: ensure file size > 0
                    if file_path.stat().st_size > 0:
                        all_files.append(str(file_path))
                        all_labels.append(label_value)
                        food_files.append(file_path)

            if food_files:
                stats[f"{class_name}/{food_type_dir.name}"] = len(food_files)


    print("\nDataset Statistics:")
    for category, count in sorted(stats.items()):
        print(f"  {category}: {count} images")

    total_images = len(all_files)
    print(f"\nTotal valid images: {total_images}")
    print(f"Fresh: {all_labels.count(0)}, Rotten: {all_labels.count(1)}")

    if total_images == 0:
        raise ValueError("No valid images found in the dataset!")

    # Create TensorFlow dataset
    dataset = tf.data.Dataset.from_tensor_slices((all_files, all_labels))

    # Shuffle the entire dataset
    dataset = dataset.shuffle(buffer_size=total_images, seed=42, reshuffle_each_iteration=False)

    # Calculate split sizes
    train_size = int(total_images * (1 - validation_split))
    val_size = total_images - train_size

    # Split the dataset
    train_dataset = dataset.take(train_size)
    val_dataset = dataset.skip(train_size)

    print(f"\nDataset split:")
    print(f"  Training: {train_size} images")
    print(f"  Validation: {val_size} images")


    def preprocess_image(file_path, label):
        # Read file
        image = tf.io.read_file(file_path)

        # Decode image with error handling
        # Try JPEG first (most common), then PNG
        image = tf.py_function(
            func=decode_image_safe,
            inp=[image, file_path],
            Tout=tf.uint8
        )

        # Set shape (py_function loses shape information)
        image.set_shape([None, None, 3])

        # Convert to float and normalize
        image = tf.cast(image, tf.float32) / 255.0

        # Resize to target size
        image = tf.image.resize(image, img_size, method='bilinear')

        return image, label


    # Helper function for safe image decoding
    def decode_image_safe(image_data, file_path):
        """Safely decode an image, trying different formats"""
        try:
            # Convert tensors to numpy for easier handling
            image_data = image_data.numpy()
            file_path_str = file_path.numpy().decode('utf-8')

            # Try decoding based on file extension
            if file_path_str.lower().endswith(('.jpg', '.jpeg')):
                image = tf.image.decode_jpeg(image_data, channels=3)
            elif file_path_str.lower().endswith('.png'):
                image = tf.image.decode_png(image_data, channels=3)
            elif file_path_str.lower().endswith('.bmp'):
                image = tf.image.decode_bmp(image_data, channels=3)
            elif file_path_str.lower().endswith('.gif'):
                image = tf.image.decode_gif(image_data)
                # Take first frame if animated
                if len(image.shape) == 4:
                    image = image[0]
            else:
                # Try generic decode
                image = tf.image.decode_image(image_data, channels=3, expand_animations=False)

            return image.numpy()

        except Exception as e:
            # Return a placeholder image on error
            print(f"Error decoding {file_path_str}: {str(e)}")
            return np.zeros((224, 224, 3), dtype=np.uint8)


    # Apply processing and batching
    AUTOTUNE = tf.data.AUTOTUNE

    train_ds = (train_dataset
                .map(preprocess_image, num_parallel_calls=AUTOTUNE)
                .batch(batch_size)
                .prefetch(buffer_size=AUTOTUNE))

    val_ds = (val_dataset
              .map(preprocess_image, num_parallel_calls=AUTOTUNE)
              .batch(batch_size)
              .prefetch(buffer_size=AUTOTUNE))

    # Define class names
    class_names = ['Fresh', 'Rotten']

    return train_ds, val_ds, class_names


In [10]:
def load_data_pipeline(root_dir, img_size, batch_size):
    # Load data
    all_data_ds = tf.keras.utils.image_dataset_from_directory(
        directory=root_dir,
        labels='inferred',
        label_mode='binary',
        image_size=img_size,
        batch_size=batch_size,
        validation_split=0.2,
        subset='both',
        seed=42,
        shuffle=True
    )

    # Unpack datasets
    train_ds = all_data_ds[0]
    val_ds = all_data_ds[1]

    class_names = train_ds.class_names
    print(f"Inferred Classes: {class_names}")
    print(f"Training batches: {tf.data.experimental.cardinality(train_ds).numpy()}")
    print(f"Validation batches: {tf.data.experimental.cardinality(val_ds).numpy()}")

    # Rescaling layer
    rescale_layer = Rescaling(1./255)

    # Apply rescaling to datasets
    train_ds = train_ds.map(lambda image, label: (rescale_layer(image), label))
    val_ds = val_ds.map(lambda image, label: (rescale_layer(image), label))

    # Performance optimization
    # .cache() keeps images in memory after first load
    # .prefetch() allows the gpu to process current batch while next batch is being prepared
    train_ds = train_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
    val_ds = val_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

    return train_ds, val_ds, class_names


In [11]:
def build_model(img_size):
    # Load MobileNetV2 as base model
    base_model = MobileNetV2(
        input_shape=(img_size[0], img_size[1], 3),
        include_top=False,
        weights='imagenet'
    )

    # Freeze base layer
    base_model.trainable = False

    # Build custom classification head
    x = base_model.output
    x = GlobalAveragePooling2D()(x) # Reduce 3d feature made to a 1d vector
    x = Dense(128, activation='relu')(x)
    output = Dense(1, activation='sigmoid')(x)

    # Final model
    model = Model(inputs=base_model.input, outputs=output)

    # Compile model
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy', Precision(name='precision'), Recall(name='recall')]
    )
    print("\nModel Summary (Frozen Base): \n")
    print(model.summary())
    return model, base_model


In [12]:
def train_model(model, base_model, train_ds, val_ds, initial_epochs=5, fine_tune_epochs=5):
    # Model training
    history = model.fit(
        train_ds,
        epochs=initial_epochs,
        validation_data=val_ds,
    )

    # Fine-tuning
    base_model.trainable = True # unfreeze all base layers
    # Define layer fine-tuning point
    fine_tune_at = 100

    for layer in base_model.layers[:fine_tune_at]:
        layer.trainable = False

    # recompile with low learning rate
    model.compile(
        optimizer=Adam(learning_rate=1e-5),
        loss='binary_crossentropy',
        metrics=['accuracy', Precision(name='precision'), Recall(name='recall')]
    )

    total_epochs = initial_epochs + fine_tune_epochs

    history_fine_tune = model.fit(
        train_ds,
        epochs=total_epochs,
        initial_epoch=initial_epochs,
        validation_data=val_ds,
    )

    return model, history_fine_tune

In [13]:
inspect_nested_dataset(ROOT_DIR)

Dataset root: /root/.cache/kagglehub/datasets/ulnnproject/food-freshness-dataset/versions/1/Dataset

Class: Fresh
  Subdirectories: 13
    - FreshBanana: 3470 images
    - FreshStrawberry: 603 images
    - FreshOrange: 10830 images
    - FreshCarrot: 9896 images
    - FreshBellpepper: 610 images
    - FreshTomato: 13679 images
    - FreshCucumber: 1098 images
    - FreshBittergroud: 327 images
    - FreshPotato: 1148 images
    - FreshApple: 3431 images
    - FreshCapciscum: 990 images
    - FreshMango: 605 images
    - FreshOkara: 635 images
  Total images in Fresh: 47322

Class: Rotten
  Subdirectories: 13
    - RottenTomato: 4012 images
    - RottenBellpepper: 591 images
    - RottenStrawberry: 596 images
    - RottenOrange: 3292 images
    - RottenBittergroud: 357 images
    - RottenBanana: 4032 images
    - RottenMango: 593 images
    - RottenOkra: 338 images
    - RottenApple: 4431 images
    - RottenCapsicum: 901 images
    - RottenCarrot: 2335 images
    - RottenPotato: 1386 im

In [14]:
# 1. DATASET DOWNLOAD (ONLY NEEDED ONCE BY NEW USERS)
# ----------------------------------------------------------------------
# To run this project, first ensure you have the Kaggle API installed
# and configured. Uncomment the line below to download the dataset
# to your local Kaggle cache directory.

# print("Attempting to download/verify dataset...")
# kagglehub.dataset_download("ulnnproject/food-freshness-dataset")
# print("Dataset verified in cache.")

# Data preprocessing and loading
train_ds, val_ds, class_names = load_nested_dataset(ROOT_DIR, IMG_SIZE, BATCH_SIZE)

# Build the model
model, base_model = build_model(IMG_SIZE)

# Train the model
trained_model, training_history = train_model(model, base_model, train_ds, val_ds)

# Evaluate the model
loss, acc, precision, recall = trained_model.evaluate(val_ds)

print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {acc:.4f}")
print(f"Validation Precision: {precision:.4f}")
print(f"Validation Recall: {recall:.4f}")

# Calculate f1-score
probabilities = trained_model.predict(val_ds)
predictions = np.round(probabilities).flatten()
true_labels = np.concatenate([y.numpy() for x, y in val_ds], axis=0).flatten()
f1 = f1_score(true_labels, predictions)

print(f"F1 Score: {f1:.4f}")
print(f"Confusion Matrix: \n{confusion_matrix(true_labels, predictions)}")



Dataset Statistics:
  Fresh/FreshApple: 3431 images
  Fresh/FreshBanana: 3472 images
  Fresh/FreshBellpepper: 611 images
  Fresh/FreshBittergroud: 327 images
  Fresh/FreshCapciscum: 990 images
  Fresh/FreshCarrot: 9898 images
  Fresh/FreshCucumber: 1104 images
  Fresh/FreshMango: 605 images
  Fresh/FreshOkara: 635 images
  Fresh/FreshOrange: 10835 images
  Fresh/FreshPotato: 1150 images
  Fresh/FreshStrawberry: 603 images
  Fresh/FreshTomato: 13681 images
  Rotten/RottenApple: 4432 images
  Rotten/RottenBanana: 4035 images
  Rotten/RottenBellpepper: 591 images
  Rotten/RottenBittergroud: 357 images
  Rotten/RottenCapsicum: 901 images
  Rotten/RottenCarrot: 2418 images
  Rotten/RottenCucumber: 1014 images
  Rotten/RottenMango: 593 images
  Rotten/RottenOkra: 338 images
  Rotten/RottenOrange: 3292 images
  Rotten/RottenPotato: 1386 images
  Rotten/RottenStrawberry: 596 images
  Rotten/RottenTomato: 4012 images

Total valid images: 71307
Fresh: 47342, Rotten: 23965

Dataset split:
  Trai

None
Epoch 1/5
[1m  64/1783[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:22[0m 118ms/step - accuracy: 0.7740 - loss: 0.4727 - precision: 0.6681 - recall: 0.5791Error decoding /root/.cache/kagglehub/datasets/ulnnproject/food-freshness-dataset/versions/1/Dataset/Rotten/RottenBellpepper/rottenPepper (1).jpg: {{function_node __wrapped__DecodeJpeg_device_/job:localhost/replica:0/task:0/device:CPU:0}} Unknown image file format. One of JPEG, PNG, GIF, BMP required. [Op:DecodeJpeg]
[1m1595/1783[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m18s[0m 99ms/step - accuracy: 0.9176 - loss: 0.2058 - precision: 0.8906 - recall: 0.8534Error decoding /root/.cache/kagglehub/datasets/ulnnproject/food-freshness-dataset/versions/1/Dataset/Fresh/FreshCarrot/freshCarrot (415).jpg: {{function_node __wrapped__DecodeJpeg_device_/job:localhost/replica:0/task:0/device:CPU:0}} Unknown image file format. One of JPEG, PNG, GIF, BMP required. [Op:DecodeJpeg]
[1m1783/1783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [26]:
!git checkout -b "working-branch"

Switched to a new branch 'working-branch'


In [27]:
commit_and_push("Move main.py into a notebook and fragment code into functions, add more preprocessing after running into errors when building/training model, complete outputs for model training: 2, 3, and 4.")

On branch working-branch
nothing to commit, working tree clean
On branch working-branch
nothing to commit, working tree clean
error: src refspec {branch} does not match any
[31merror: failed to push some refs to 'https://github.com/YOUR_USERNAME/YOUR_REPO.git'
[msuccessfully pushed!.
