# Yusuf Dataset Preprocessing

In [12]:
%pip install python-dotenv
%pip install roboflow
%pip install imagehash pillow tqdm

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# loads dataset
from roboflow import Roboflow
from dotenv import load_dotenv
# from google.colab import userdata
import os

load_dotenv()  # loads variables from .env into the environment

# On Colab the key can be injected from the secrets store instead:
# os.environ["YF_API_KEY"] = userdata.get("YF_API_KEY")
api_key = os.getenv("YF_API_KEY")
if not api_key:
    # os.getenv returns None when the variable is absent; stop here with an
    # actionable message instead of handing an empty key to the Roboflow client.
    raise RuntimeError(
        "YF_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this cell."
    )

# Download version 1 of the "food-dataset-uj20h-w2s4m" project from the
# "caretech" workspace in YOLOv8 format; later cells read `dataset.location`.
rf = Roboflow(api_key=api_key)
project = rf.workspace("caretech").project("food-dataset-uj20h-w2s4m")
version = project.version(1)
dataset = version.download("yolov8")

loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in Food-Dataset-1 to yolov8:: 100%|██████████| 149379/149379 [00:07<00:00, 20031.58it/s]





Extracting Dataset Version Zip to Food-Dataset-1 in yolov8:: 100%|██████████| 9832/9832 [00:01<00:00, 5100.76it/s]


In [3]:
# Sanity Check Dataset

import os

DATASET_PATH = dataset.location

# Each split of the download has an images/ folder and a labels/ folder.
TRAIN_IMAGES = os.path.join(DATASET_PATH, "train/images")
TRAIN_LABELS = os.path.join(DATASET_PATH, "train/labels")
VAL_IMAGES = os.path.join(DATASET_PATH, "valid/images")
VAL_LABELS = os.path.join(DATASET_PATH, "valid/labels")
TEST_IMAGES = os.path.join(DATASET_PATH, "test/images")
TEST_LABELS = os.path.join(DATASET_PATH, "test/labels")

# Report how many directory entries each folder holds, split by split.
for split_title, split_images, split_labels in (
    ("Train", TRAIN_IMAGES, TRAIN_LABELS),
    ("Val", VAL_IMAGES, VAL_LABELS),
    ("Test", TEST_IMAGES, TEST_LABELS),
):
    print(f"{split_title} images:", len(os.listdir(split_images)))
    print(f"{split_title} labels:", len(os.listdir(split_labels)))



Train images: 3917
Train labels: 3917
Val images: 982
Val labels: 982
Test images: 11
Test labels: 11


In [4]:
# Check for any missing labels
import glob

train_images = sorted(glob.glob(os.path.join(TRAIN_IMAGES, "*")))

# A training image counts as unlabeled when no "<image stem>.txt" file exists
# in the training labels folder.
image_stems = (
    os.path.splitext(os.path.basename(image_file))[0] for image_file in train_images
)
missing_labels = [
    stem
    for stem in image_stems
    if not os.path.exists(os.path.join(TRAIN_LABELS, stem + ".txt"))
]

print(f"Missing labels in training set: {len(missing_labels)}")


Missing labels in training set: 0


In [29]:
# Check Class Distribution
from collections import Counter
import pandas as pd
import yaml

# Read data.yaml from the dataset root; its "names" entry is indexed by class id below
data_yaml_path = os.path.join(DATASET_PATH, "data.yaml")
with open(data_yaml_path) as manifest:
    data_yaml = yaml.safe_load(manifest)

class_names = data_yaml["names"]

# Folder holding the training label files that the distribution is computed from
label_dir = os.path.join(DATASET_PATH, "train", "labels")

def plot_class_distribution(class_names, label_dir):
    """Count bounding-box annotations per class across the label files in a folder.

    Despite its name, this function draws nothing: it returns a table that the
    caller can display or plot.

    Args:
        class_names: Object indexable by integer class id (``class_names[class_id]``)
            that yields the class name.
        label_dir: Directory containing ``*.txt`` label files. The first
            whitespace-separated field of every non-blank line is read as the
            integer class id.

    Returns:
        pandas.DataFrame with columns ``class_id``, ``bbox_count`` and
        ``class_name``, ordered by ``bbox_count`` descending (ties broken by
        ascending ``class_id``). The frame is empty when no annotations are found.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
        IndexError/KeyError: If a class id has no entry in ``class_names``.
    """
    class_counts = Counter()

    # sorted() makes the scan order, and therefore the row index of the result,
    # independent of the order in which the filesystem lists the files
    for label_file in sorted(glob.glob(os.path.join(label_dir, "*.txt"))):
        with open(label_file) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # blank or whitespace-only line (e.g. a trailing newline)
                    continue
                class_counts[int(fields[0])] += 1

    df_class_dist = pd.DataFrame(
        [(k, v, class_names[k]) for k, v in class_counts.items()],
        columns=["class_id", "bbox_count", "class_name"]
    ).sort_values(["bbox_count", "class_id"], ascending=[False, True])

    return df_class_dist

# Build the per-class annotation table for the training labels and display it.
# NOTE(review): this cell's execution count (29) is later than the
# duplicate-removal cell's (24), so the table saved with the notebook appears to
# reflect the already-deduplicated files. Re-run top to bottom to confirm.
df_class_dist = plot_class_distribution(class_names, label_dir)
df_class_dist


Unnamed: 0,class_id,bbox_count,class_name
12,20,454,rice
2,19,239,ramen-noodle
19,3,188,beef-curry
20,14,173,hamburger
10,30,154,toast
0,12,132,fried-rice
23,22,130,sandwiches
25,17,120,pork-cutlet-on-rice
17,25,119,sushi
18,16,117,pizza


In [None]:
import os
import glob
from collections import defaultdict
from PIL import Image
import imagehash
from tqdm import tqdm

def find_duplicate_images(image_dir, hash_size=8):
    """Group images under ``image_dir`` that share an identical perceptual hash.

    Every ``.jpg``/``.jpeg``/``.png`` file below ``image_dir`` (searched
    recursively, extension matched case-insensitively) is opened with PIL,
    converted to RGB and hashed with ``imagehash.phash``. Files whose hash
    strings are equal end up in the same group.

    Args:
        image_dir: Root directory to scan.
        hash_size: Forwarded to ``imagehash.phash``.

    Returns:
        dict mapping hash string -> list of file paths, containing only hashes
        shared by two or more files. Paths inside a group are in sorted order,
        so "first" and "last" mean the same file on every run. This matters
        because ``remove_duplicates`` decides what to delete by list position.
    """
    hash_dict = defaultdict(list)

    # Walk the tree once and compare lower-cased extensions, so files such as
    # "IMG_1.JPG" are not skipped on case-sensitive filesystems.
    image_extensions = {'.jpg', '.jpeg', '.png'}
    candidates = glob.glob(os.path.join(image_dir, '**', '*'), recursive=True)

    # sorted(set(...)) removes repeated paths and fixes the processing order;
    # a bare set would make the order inside each duplicate group arbitrary
    image_paths = sorted({
        path for path in candidates
        if os.path.splitext(path)[1].lower() in image_extensions and os.path.isfile(path)
    })

    print(f"Found {len(image_paths)} images to process")

    # hash all images
    for img_path in tqdm(image_paths, desc="Calculating hashes"):
        try:
            with Image.open(img_path) as img:
                img_hash = imagehash.phash(img.convert("RGB"), hash_size=hash_size)
                hash_dict[str(img_hash)].append(img_path)  # convert hash to string for dict key
        except Exception as e:
            # best effort: report the unreadable file and continue with the rest
            print(f"Error processing {img_path}: {e}")

    # filter to only groups with duplicates
    duplicates = {h: paths for h, paths in hash_dict.items() if len(paths) > 1}

    print(f"\nFound {len(duplicates)} duplicate groups")
    total_dups = sum(len(paths) - 1 for paths in duplicates.values())
    print(f"Total duplicate images: {total_dups}")

    return duplicates

In [23]:
duplicates = find_duplicate_images(DATASET_PATH)

# preview at most five groups, and at most five file names within each group
preview_groups = list(duplicates.items())[:5]
for group_no, (phash_key, group_paths) in enumerate(preview_groups, start=1):
    print(f"\nGroup {group_no}: {len(group_paths)} duplicates (hash: {phash_key})")
    hidden = len(group_paths) - 5
    for image_path in group_paths[:5]:
        print(f"  - {os.path.basename(image_path)}")
    if hidden > 0:
        print(f"  ... and {hidden} more")

Found 4910 images to process


Calculating hashes: 100%|██████████| 4910/4910 [00:09<00:00, 533.81it/s]


Found 157 duplicate groups
Total duplicate images: 159

Group 1: 2 duplicates (hash: 8eb5396a616a625b)
  - 15002_jpg.rf.c3cf6e7b4c5f6291e920703a5d746955.jpg
  - 14905_jpg.rf.d318089ed65a85e1e14f5269c1c942b0.jpg

Group 2: 2 duplicates (hash: c02a037d7ee46bb1)
  - 10710_jpg.rf.8e3cc1f882cdb9006c5cd70cb3918eab.jpg
  - 9179_jpg.rf.bfe9f5f028f61ab41fb1a32e34fe95be.jpg

Group 3: 2 duplicates (hash: c38f7c3972c76086)
  - 9229_jpg.rf.70e3af1e15f620c4db00bbc5893d6583.jpg
  - 10763_jpg.rf.7e50cf84fac15d60345f5a3fb8f8b9cc.jpg

Group 4: 2 duplicates (hash: d42dd25589f215ce)
  - 11082_jpg.rf.61c75eac95279575c3028f6fed4458b6.jpg
  - 9330_jpg.rf.646bbcb95e169f57d691929f578da19e.jpg

Group 5: 2 duplicates (hash: c2a995566a4ecd33)
  - 11118_jpg.rf.f142d0f4a7dad7d5c08c5b41a18ecd11.jpg
  - 9295_jpg.rf.d93ad7d3ad808b0da311c716d8a73847.jpg





In [24]:
def _label_path_for(image_path):
    """Return the label file path that belongs to ``image_path``.

    The last ``images`` directory component is swapped for ``labels`` and the
    file extension for ``.txt``. Uses the platform's path separator.
    """
    stem, _ext = os.path.splitext(os.path.normpath(image_path))
    images_part = f"{os.sep}images{os.sep}"
    labels_part = f"{os.sep}labels{os.sep}"
    # rsplit(..., 1) touches only the last "/images/" so a parent folder that
    # happens to be called "images" is left alone
    return labels_part.join(stem.rsplit(images_part, 1)) + '.txt'


def _remove_if_present(path):
    """Delete ``path``; return True if a file was deleted, False if it was already gone."""
    try:
        os.remove(path)
    except FileNotFoundError:
        return False
    return True


def remove_duplicates(duplicates, keep='first'):
    """Delete all but one image of every duplicate group, plus the matching label files.

    Args:
        duplicates: Mapping of hash -> list of image paths, as returned by
            ``find_duplicate_images``.
        keep: ``'first'`` keeps the first path of each group; any other value
            keeps the last path.

    Returns:
        List of image paths that were actually deleted by this call. Paths that
        no longer exist (for example when the cell is re-run with a stale
        ``duplicates`` mapping) are skipped instead of raising.
    """
    removed = []
    skipped = 0
    for paths in duplicates.values():
        # keep first image (or last), remove rest
        to_remove = paths[1:] if keep == 'first' else paths[:-1]
        for path in to_remove:
            if _remove_if_present(path):
                removed.append(path)
            else:
                skipped += 1
            # remove associated label file if it exists, so no orphan label is left
            _remove_if_present(_label_path_for(path))

    print(f"Removed {len(removed)} duplicate images")
    if skipped:
        print(f"Skipped {skipped} images that were already missing")
    return removed

# Destructive step: deletes files inside the downloaded dataset in place.
# For every duplicate group found above, the first path is kept and the other
# images (and their label files, when present) are removed from disk.
removed = remove_duplicates(duplicates, keep='first')

Removed 159 duplicate images


In [None]:
# double check duplicate images

duplicates = find_duplicate_images(DATASET_PATH)

# show up to five remaining groups (and up to five file names per group);
# nothing is listed when the rescan finds no duplicate groups
remaining_groups = list(duplicates.items())[:5]
for group_no, (phash_key, group_paths) in enumerate(remaining_groups, start=1):
    print(f"\nGroup {group_no}: {len(group_paths)} duplicates (hash: {phash_key})")
    hidden = len(group_paths) - 5
    for image_path in group_paths[:5]:
        print(f"  - {os.path.basename(image_path)}")
    if hidden > 0:
        print(f"  ... and {hidden} more")

Found 4751 images to process


Calculating hashes: 100%|██████████| 4751/4751 [00:09<00:00, 506.09it/s]


Found 0 duplicate groups
Total duplicate images: 0





In [30]:
# Recompute the per-class annotation counts from the training label files as
# they currently exist on disk; the undersampling cells below read
# `df_class_dist` to get the rice count and the next-largest class count.
df_class_dist = plot_class_distribution(class_names, label_dir)
df_class_dist


Unnamed: 0,class_id,bbox_count,class_name
12,20,454,rice
2,19,239,ramen-noodle
19,3,188,beef-curry
20,14,173,hamburger
10,30,154,toast
0,12,132,fried-rice
23,22,130,sandwiches
25,17,120,pork-cutlet-on-rice
17,25,119,sushi
18,16,117,pizza


In [32]:
# undersampling logic

# Split the distribution table into the rice row and every other class.
is_rice = df_class_dist['class_name'] == 'rice'

# annotation count of rice, and the largest count among all other classes
# (.max() replaces nlargest(2).values[0], which asked for two values but only
# used the largest; this matches how the verification cell computes it)
rice_count = df_class_dist.loc[is_rice, 'bbox_count'].iloc[0]
second_highest = df_class_dist.loc[~is_rice, 'bbox_count'].max()

# how many times larger rice is than the next-largest class
ratio = rice_count / second_highest

print(f"Rice count: {rice_count}")
print(f"Second highest: {second_highest}")
print(f"Undersampling ratio: {ratio:.2f}")


Rice count: 454
Second highest: 239
Undersampling ratio: 1.90


In [33]:
import random

# undersample rice class to TARGET_RATIO of second highest class
TARGET_RATIO = 1.25  # adjust between 1.0 and 1.5 as needed
UNDERSAMPLE_SEED = 42  # fixed seed so the same rice images are selected on every run

rice_class_id = class_names.index("rice")

# find images containing rice and count rice annotations per image
# (sorted() removes the dependence on filesystem listing order; blank lines in
# a label file are skipped instead of raising IndexError)
rice_images = {}  # {base_name: rice_annotation_count}
for label_file in sorted(glob.glob(os.path.join(TRAIN_LABELS, "*.txt"))):
    base = os.path.splitext(os.path.basename(label_file))[0]
    with open(label_file) as f:
        rice_count_in_file = sum(
            1 for line in f
            if line.strip() and int(line.split()[0]) == rice_class_id
        )
    if rice_count_in_file > 0:
        rice_images[base] = rice_count_in_file

# Measure against what is on disk right now rather than the `rice_count`
# computed in an earlier cell: if this cell is re-run after the removal cell,
# the stale value would schedule a second round of deletions.
current_rice_count = sum(rice_images.values())
target_rice_count = int(second_highest * TARGET_RATIO)
annotations_to_remove = max(0, current_rice_count - target_rice_count)

print(f"Target rice count: {target_rice_count}")
print(f"Rice annotations to remove: {annotations_to_remove}")
print(f"Images containing rice: {len(rice_images)}")

# select images to remove (prioritize images with more rice annotations);
# the seeded shuffle followed by a stable sort breaks ties between images with
# the same rice count reproducibly and without favouring any file-name range
candidates = list(rice_images.items())
random.Random(UNDERSAMPLE_SEED).shuffle(candidates)
sorted_by_rice = sorted(candidates, key=lambda x: -x[1])

images_to_remove = []
removed_annotations = 0

# Greedy selection: whole images are dropped, so the last pick may overshoot
# the requested number of annotations when an image holds several rice boxes.
for base, count in sorted_by_rice:
    if removed_annotations >= annotations_to_remove:
        break
    images_to_remove.append(base)
    removed_annotations += count

print(f"Will remove {len(images_to_remove)} images ({removed_annotations} rice annotations)")

Target rice count: 298
Rice annotations to remove: 156
Images containing rice: 454
Will remove 156 images (156 rice annotations)


In [34]:
# execute undersampling - remove selected images and labels
# Destructive step: files are deleted from the training split in place.
removed_count = 0
already_missing = 0
for base in tqdm(images_to_remove, desc="Removing rice images"):
    deleted_something = False

    # remove image file (the first extension that exists wins)
    for ext in ['.jpg', '.jpeg', '.png']:
        img_path = os.path.join(TRAIN_IMAGES, base + ext)
        if os.path.exists(img_path):
            os.remove(img_path)
            deleted_something = True
            break

    # remove label file
    label_path = os.path.join(TRAIN_LABELS, base + ".txt")
    if os.path.exists(label_path):
        os.remove(label_path)
        deleted_something = True

    # count only entries for which a file was really deleted, so re-running
    # this cell does not claim removals that never happened
    if deleted_something:
        removed_count += 1
    else:
        already_missing += 1

print(f"Removed {removed_count} images and labels")
if already_missing:
    print(f"Skipped {already_missing} entries whose files were already gone")

Removing rice images: 100%|██████████| 156/156 [00:00<00:00, 1843.64it/s]

Removed 156 images and labels





In [35]:
# verify new class distribution
# Recount annotations from the training labels left on disk, then compare rice
# with the largest of the remaining classes.
df_new_dist = plot_class_distribution(class_names, TRAIN_LABELS)

rice_rows = df_new_dist['class_name'] == 'rice'
new_rice = df_new_dist.loc[rice_rows, 'bbox_count'].values[0]
new_second = df_new_dist.loc[~rice_rows, 'bbox_count'].max()
new_ratio = new_rice / new_second

print(f"New rice count: {new_rice}")
print(f"Second highest: {new_second}")
print(f"New ratio: {new_ratio:.2f}x")
print("\nNew class distribution:")
df_new_dist

New rice count: 298
Second highest: 239
New ratio: 1.25x

New class distribution:


Unnamed: 0,class_id,bbox_count,class_name
31,20,298,rice
2,19,239,ramen-noodle
18,3,188,beef-curry
19,14,173,hamburger
10,30,154,toast
0,12,132,fried-rice
22,22,130,sandwiches
24,17,120,pork-cutlet-on-rice
16,25,119,sushi
17,16,117,pizza


In [36]:
# sanity check - verify image/label counts after undersampling
# The two numbers are expected to match: every removal deletes an image
# together with its label file.
train_image_total = len(os.listdir(TRAIN_IMAGES))
train_label_total = len(os.listdir(TRAIN_LABELS))

print("After undersampling:")
print(f"Train images: {train_image_total}")
print(f"Train labels: {train_label_total}")

After undersampling:
Train images: 3629
Train labels: 3629
