In [None]:
# --- 📦 SETUP ---
!pip install icrawler pillow tqdm

import os
import random
from icrawler.builtin import GoogleImageCrawler
from PIL import Image
from tqdm import tqdm

# --- 🗂️ Mount Drive ---
# Mount Google Drive at /content/drive; the dataset paths used by later cells
# live under /content/drive/MyDrive, so this must run before them.
from google.colab import drive
drive.mount('/content/drive')



In [None]:
!pip install simple_image_download pillow tqdm

import os
import random
from simple_image_download import simple_image_download as simp
from PIL import Image
from tqdm import tqdm


Detect the ones that didn't do too well.

In [None]:
#!pip install icrawler pillow tqdm --quiet

import os, json, random
from tqdm import tqdm
from PIL import Image
from icrawler.builtin import GoogleImageCrawler, BingImageCrawler

# === Paths ===
# Everything lives on the mounted Google Drive, in a Food-101 style layout
# (<BASE_DIR>/images for pictures, <BASE_DIR>/meta for class/split files).
BASE_DIR = '/content/drive/MyDrive/food-101_data/food-101'
IMAGES_DIR = os.path.join(BASE_DIR, 'images')
META_DIR = os.path.join(BASE_DIR, 'meta')
JSON_PATH = os.path.join(META_DIR, 'foods_by_cuisine.json')

os.makedirs(IMAGES_DIR, exist_ok=True)
os.makedirs(META_DIR, exist_ok=True)

# === Egyptian foods to exclude ===
# Dish names matched as case-insensitive substrings by the download loop below.
egyptian_foods = [
    "koshari", "molokhia", "mahshi", "hawawshi", "bamia", "ful medames",
    "falafel", "feteer meshaltet", "sayadiya", "alexandrian liver sandwich",
    "grilled kofta", "stuffed pigeon", "shawarma", "roz meammar", "fish tagine",
    "okra stew", "egyptian salad", "tahini salad", "pickled eggplant",
    "lamb fattah", "chicken fattah", "bechamel pasta", "sambousek",
    "egyptian lentil soup", "white beans stew", "bessara", "torly", "molokhia with rabbit",
    "chicken pane", "chicken shawarma", "baba ganoush", "chicken liver", "stuffed grape leaves",
    "chicken molokhia", "fried tilapia", "grilled fish egyptian style",
    "sayadiya rice", "macarona bechamel", "basbousa", "kunafa", "baklava", "roz bel laban",
    "om ali", "qatayef", "kahk", "goulash dessert", "rice pudding egyptian"
]

# === Load JSON and keep only the cuisines listed AFTER the marker cuisine ===
# Explicit UTF-8: the file is re-written below with ensure_ascii=False, so dish
# names may contain non-ASCII characters.
with open(JSON_PATH, 'r', encoding='utf-8') as f:
    foods_by_cuisine = json.load(f)
exclude_until = "french"
keys = list(foods_by_cuisine.keys())

# Locate the marker case-insensitively on BOTH sides; the previous lookup only
# lowercased the marker, so a key such as "French" raised a bare ValueError.
lowered_keys = [k.lower() for k in keys]
marker = exclude_until.lower()
if marker not in lowered_keys:
    raise ValueError(
        f"Cuisine {exclude_until!r} not found in {JSON_PATH}; available keys: {keys}"
    )
cut_index = lowered_keys.index(marker) + 1

# Keep everything after the marker (the marker itself is dropped too).
# NOTE(review): this depends on the key order of the JSON file and does not
# filter by cuisine name; presumably the cuisines up to the marker were already
# processed in an earlier run -- confirm. The name-based alternative is below.
non_egyptian = {k: foods_by_cuisine[k] for k in keys[cut_index:]}

#non_egyptian = {k: v for k, v in foods_by_cuisine.items() if k.lower() != ("egyptian")}

# Save filtered version (optional)
filtered_path = os.path.join(META_DIR, 'foods_non_egyptian.json')
with open(filtered_path, 'w', encoding='utf-8') as f:
    json.dump(non_egyptian, f, indent=2, ensure_ascii=False)
print(f"‚úÖ Saved non-Egyptian cuisines to: {filtered_path}")

# === Utility: clean and resize images ===
def clean_and_resize(folder):
    """Convert every image in ``folder`` to 512x512 RGB in place.

    Files that cannot be opened, converted or saved are deleted (best effort),
    so the folder ends up containing only usable images.

    Args:
        folder: Directory to process. Nothing happens if it is not an
            existing directory.

    Returns:
        None.
    """
    if not os.path.isdir(folder):
        return
    for name in os.listdir(folder):
        path = os.path.join(folder, name)
        if not os.path.isfile(path):
            # Sub-directories are not images; previously they fell into the
            # except-branch and os.remove() raised on them, aborting the run.
            continue
        try:
            # The context manager releases the file handle before we write
            # back to the very same path.
            with Image.open(path) as source:
                resized = source.convert('RGB').resize((512, 512))
            resized.save(path)
        except Exception:
            # Deliberately broad: any unreadable/unsavable file is discarded.
            try:
                os.remove(path)
            except OSError as err:
                print(f"Could not remove {path}: {err}")

# === Utility: image downloader ===
def download_images(cuisine, dish, max_num=100, min_existing=50):
    """Crawl Google and Bing for pictures of ``dish`` and store them on disk.

    Images are written to ``IMAGES_DIR/<cuisine>/<dish with underscores>`` and
    then normalized with ``clean_and_resize``.

    Args:
        cuisine: Cuisine name; used in the search queries and as a folder name.
        dish: Dish name; spaces become underscores in the folder name.
        max_num: Stop crawling once the folder holds at least this many files.
        min_existing: Skip the dish entirely when the folder already holds at
            least this many ``.jpg``/``.png`` files (previously hard-coded 50).

    Returns:
        None.
    """
    safe_dish = dish.replace(" ", "_")
    save_dir = os.path.join(IMAGES_DIR, cuisine, safe_dish)
    os.makedirs(save_dir, exist_ok=True)

    # Skip if enough images exist
    existing = sum(1 for f in os.listdir(save_dir) if f.endswith(('.jpg', '.png')))
    if existing >= min_existing:
        return

    print(f"üü¢ {cuisine} ‚Üí {dish}: downloading images...")

    search_terms = [
        f"{dish} {cuisine} food",
        f"{dish} {cuisine} cuisine",
        f"{dish} traditional {cuisine}",
        f"{dish} meal"
    ]
    per_crawl = max(1, max_num // len(search_terms))

    # Every (query, crawler) pair is one attempt; the budget is re-checked
    # before each attempt instead of only between queries.
    attempts = [
        (query, crawler_cls)
        for query in search_terms
        for crawler_cls in (GoogleImageCrawler, BingImageCrawler)
    ]

    total_downloaded = len(os.listdir(save_dir))
    for query, crawler_cls in attempts:
        if total_downloaded >= max_num:
            break
        crawler = crawler_cls(storage={'root_dir': save_dir})
        try:
            # 'auto' continues numbering after the highest existing file index.
            # Using the file count as offset collides with existing names once
            # files have been deleted, and the crawler then skips them.
            crawler.crawl(
                keyword=query,
                max_num=min(per_crawl, max_num - total_downloaded),
                file_idx_offset='auto',
            )
        except Exception as e:
            print(f"‚ö†Ô∏è {crawler_cls.__name__} failed for {query}: {e}")
        # Recount even after a failure: a crawl may have saved some files
        # before raising.
        total_downloaded = len(os.listdir(save_dir))
    clean_and_resize(save_dir)
    print(f"üì∏ Done: {len(os.listdir(save_dir))} images saved for {dish}")

# === Download all non-Egyptian dishes ===
# `classes` keeps first-seen order; `seen_classes` prevents a dish listed twice
# from being split twice (which could put the same image in train AND test).
classes = []
seen_classes = set()
for cuisine, dishes in non_egyptian.items():
    print(f"\nüçΩÔ∏è Processing cuisine: {cuisine}")
    for dish in tqdm(dishes):
        dish_lower = dish.lower()
        # Skip dishes that appear Egyptian by name (substring match)
        if any(e.lower() in dish_lower for e in egyptian_foods):
            continue

        download_images(cuisine, dish)
        class_name = f"{cuisine}/{dish.replace(' ', '_')}"
        if class_name not in seen_classes:
            seen_classes.add(class_name)
            classes.append(class_name)

# === Generate Food-101 meta files ===
print("\nüßæ Generating meta files...")
classes_path = os.path.join(META_DIR, "classes.txt")
train_path = os.path.join(META_DIR, "train.txt")
test_path = os.path.join(META_DIR, "test.txt")

# Write classes.txt (one "<cuisine>/<dish>" entry per line)
with open(classes_path, "w", encoding="utf-8") as f:
    for c in classes:
        f.write(c + "\n")

# Build train/test split
# A dedicated seeded RNG plus sorted file listings make the 80/20 split
# reproducible across runs.
split_rng = random.Random(42)
train_lines, test_lines = [], []
for c in classes:
    # maxsplit=1: only the first "/" separates cuisine from dish, so a dish
    # name that itself contains "/" no longer breaks the unpacking.
    cuisine, dish = c.split("/", 1)
    folder = os.path.join(IMAGES_DIR, cuisine, dish)
    imgs = sorted(f"{c}/{img}" for img in os.listdir(folder) if img.endswith(('.jpg', '.png')))
    if len(imgs) < 5:
        continue  # skip underpopulated classes
    split_rng.shuffle(imgs)
    split = int(len(imgs) * 0.8)
    train_lines += imgs[:split]
    test_lines += imgs[split:]

with open(train_path, "w", encoding="utf-8") as f:
    f.write("\n".join(train_lines))
with open(test_path, "w", encoding="utf-8") as f:
    f.write("\n".join(test_lines))

print("\n‚úÖ All done!")
print(f"Total cuisines: {len(non_egyptian)}")
print(f"Total classes: {len(classes)}")
print(f"Train samples: {len(train_lines)}, Test samples: {len(test_lines)}")


In [None]:
import os
import random
from pathlib import Path

# === Base paths ===
BASE_DIR = '/content/drive/MyDrive/food-101_data/food-101'
IMAGES_DIR = os.path.join(BASE_DIR, 'images')
META_DIR = os.path.join(BASE_DIR, 'meta')
os.makedirs(META_DIR, exist_ok=True)

# === Egyptian foods to add ===
# Folder names (underscored) expected directly under IMAGES_DIR.
# NOTE(review): the download cell stores images under IMAGES_DIR/<cuisine>/<dish>
# and keeps file extensions in train/test entries, while this cell expects
# IMAGES_DIR/<food> and strips extensions -- confirm both layouts are intended.
egyptian_foods = [
    "koshari", "molokhia", "mahshi", "hawawshi", "bamia", "ful_medames",
    "falafel", "feteer_meshaltet", "sayadiya", "alexandrian_liver_sandwich",
    "grilled_kofta", "stuffed_pigeon", "shawarma", "roz_meammar",
    "fish_tagine", "okra_stew", "egyptian_salad", "tahini_salad",
    "pickled_eggplant", "lamb_fattah", "chicken_fattah", "bechamel_pasta",
    "sambousek", "egyptian_lentil_soup", "white_beans_stew", "bessara",
    "torly", "molokhia_with_rabbit", "chicken_pane", "baba_ganoush",
    "stuffed_grape_leaves", "fried_tilapia", "macarona_bechamel",
    "basbousa", "kunafa", "roz_bel_laban", "om_ali", "qatayef", "kahk"
]


def _image_files(folder):
    """Return the sorted ``.jpg``/``.png`` file names found in ``folder``."""
    return sorted(f for f in os.listdir(folder) if f.endswith(('.jpg', '.png')))


# === Make sure each food directory exists ===
# A class is valid only if its folder holds at least one image file; the old
# check accepted any non-empty folder, which could register a class with zero
# samples.
valid_foods = []
for food in egyptian_foods:
    folder = os.path.join(IMAGES_DIR, food)
    if os.path.isdir(folder) and _image_files(folder):
        valid_foods.append(food)
    else:
        print(f"‚ö†Ô∏è Skipping {food} (no images found)")

print(f"\n‚úÖ Found {len(valid_foods)} Egyptian food classes with images.\n")

# === Create train/test splits ===
train_ratio = 0.8  # 80% train, 20% test
train_split = []
test_split = []

# Seeded RNG + sorted ids: re-running the cell yields the same split.
split_rng = random.Random(42)

for food in valid_foods:
    img_dir = os.path.join(IMAGES_DIR, food)
    # Ids are file names without extension; the set() collapses "a.jpg" and
    # "a.png", which would otherwise produce the same id twice.
    imgs = sorted({os.path.splitext(f)[0] for f in _image_files(img_dir)})
    split_rng.shuffle(imgs)

    split_idx = int(len(imgs) * train_ratio)
    train_imgs = imgs[:split_idx]
    test_imgs = imgs[split_idx:]

    # Append paths in the Food-101 format (class_name/image_name)
    for img in train_imgs:
        train_split.append(f"{food}/{img}")
    for img in test_imgs:
        test_split.append(f"{food}/{img}")

# === Save updated classes, train.txt, and test.txt ===
classes_path = os.path.join(META_DIR, 'classes.txt')
train_path = os.path.join(META_DIR, 'train.txt')
test_path = os.path.join(META_DIR, 'test.txt')

# Read existing classes if available (blank lines are ignored)
if os.path.exists(classes_path):
    with open(classes_path, 'r') as f:
        existing_classes = [line.strip() for line in f if line.strip()]
else:
    existing_classes = []

# Add new ones if not already there
for food in valid_foods:
    if food not in existing_classes:
        existing_classes.append(food)

# Save updated class list
with open(classes_path, 'w') as f:
    f.write("\n".join(sorted(existing_classes)))

# Merge with existing train/test if they exist
def append_or_create(path, new_lines):
    """Merge ``new_lines`` into the text file at ``path`` without duplicates.

    The file holds one entry per line and no trailing newline. Entries already
    present in the file (or repeated inside ``new_lines``) are not written
    again, so calling this twice with the same data leaves the file unchanged.

    Args:
        path: Target text file; created if it does not exist.
        new_lines: Iterable of entries (strings without line breaks) to add.

    Returns:
        None.
    """
    file_exists = os.path.exists(path)
    existing = []
    ends_with_newline = True
    if file_exists:
        with open(path, 'r') as f:
            content = f.read()
        existing = [line.strip() for line in content.splitlines() if line.strip()]
        # An empty file or one ending in "\n" needs no separator before the
        # appended entries; anything else does.
        ends_with_newline = content == "" or content.endswith("\n")

    seen = set(existing)
    to_add = []
    for line in new_lines:
        if line not in seen:
            seen.add(line)
            to_add.append(line)

    if not file_exists:
        with open(path, 'w') as f:
            f.write("\n".join(to_add))
        return

    if not to_add:
        # Nothing new: avoid appending a stray blank line.
        return

    with open(path, 'a') as f:
        f.write(("" if ends_with_newline else "\n") + "\n".join(to_add))

# Push the freshly built split lists into their respective meta files.
for target_path, split_lines in ((train_path, train_split), (test_path, test_split)):
    append_or_create(target_path, split_lines)

print("‚úÖ Train/test splits and class list updated successfully!")

# === Optional: print summary ===
summary_lines = [
    f"\nüìÅ Classes total: {len(existing_classes)}",
    f"üß© Egyptian classes added: {len(valid_foods)}",
    f"üìò Train samples added: {len(train_split)}",
    f"üìó Test samples added: {len(test_split)}",
]
for summary_line in summary_lines:
    print(summary_line)


In [None]:
from pathlib import Path

classes_path = os.path.join(META_DIR, 'classes.txt')

# === Read classes.txt ===
# One stripped entry per line; an absent file is treated as "no classes".
all_classes = []
if os.path.exists(classes_path):
    with open(classes_path, 'r') as f:
        all_classes = [raw_line.strip() for raw_line in f]

# === Collect folder names in /images ===
image_folders = []
for entry in os.listdir(IMAGES_DIR):
    if os.path.isdir(os.path.join(IMAGES_DIR, entry)):
        image_folders.append(entry)

# === Comparisons ===
# A single pass over classes.txt fills the four report buckets.
in_classes_not_egyptian = []
in_egyptian_and_classes = []
in_classes_but_missing_folder = []
valid_foods_in_classes = []
for class_name in all_classes:
    if class_name in egyptian_foods:
        in_egyptian_and_classes.append(class_name)
    else:
        in_classes_not_egyptian.append(class_name)
    if class_name not in image_folders:
        in_classes_but_missing_folder.append(class_name)
    if class_name in valid_foods:
        valid_foods_in_classes.append(class_name)


def _print_bullets(items):
    """Print each item as an indented ``- item`` bullet line."""
    for item in items:
        print(f"  - {item}")


# === Print results ===
print("\n================= DATASET CONSISTENCY REPORT =================")

print(f"üìò Total classes in classes.txt: {len(all_classes)}")
print(f"üá™üá¨ Egyptian classes (list): {len(egyptian_foods)}")
print(f"üìÅ Valid Egyptian folders found: {len(valid_foods)}\n")

print("‚úÖ Classes both in classes.txt and Egyptian list:")
_print_bullets(in_egyptian_and_classes)

print("\n‚ö†Ô∏è Classes in classes.txt but NOT in Egyptian list (original Food-101):")
# Only the first 20 are listed; the remainder is summarized as a count.
_print_bullets(in_classes_not_egyptian[:20])
hidden_count = len(in_classes_not_egyptian) - 20
if hidden_count > 0:
    print(f"  ... and {hidden_count} more ...")

print("\n‚ùå Classes listed in classes.txt but have NO image folder:")
_print_bullets(in_classes_but_missing_folder)

print("\n‚úÖ Valid foods currently in both classes.txt and /images/:")
_print_bullets(valid_foods_in_classes)

print("\n==============================================================")


In [None]:
import os
import json
import random
from sklearn.model_selection import train_test_split

# Path to your dataset root
# NOTE(review): the tree is scanned as <dataset_dir>/<cuisine>/<class>/<image>,
# but the other cells store images under <dataset_dir>/images/... -- with this
# root the top-level entries are folders such as 'images' and 'meta'. Confirm
# whether dataset_dir should point at the images folder instead.
dataset_dir = '/content/drive/MyDrive/food-101_data/food-101'

# --------------------------------------
# 1. Collect all image paths and classes
# --------------------------------------
data = []
classes = set()

# Directory listings are sorted so `data` is built in the same order on every
# run; os.listdir() itself returns entries in arbitrary order.
for cuisine in sorted(os.listdir(dataset_dir)):
    cuisine_path = os.path.join(dataset_dir, cuisine)
    if not os.path.isdir(cuisine_path):
        continue

    for food_class in sorted(os.listdir(cuisine_path)):
        class_path = os.path.join(cuisine_path, food_class)
        if not os.path.isdir(class_path):
            continue

        # Register this class (even if it turns out to hold no images)
        classes.add(food_class)

        # Collect all images under this class
        for img in sorted(os.listdir(class_path)):
            if img.lower().endswith((".jpg", ".jpeg", ".png", ".bmp", ".webp")):
                img_path = os.path.join(class_path, img)
                data.append({"path": img_path, "class": food_class, "cuisine": cuisine})

# Sort for consistency
classes = sorted(classes)

# --------------------------------------
# 2. Create Train/Val/Test Splits
# --------------------------------------
# Shuffle with a fixed seed: the previous unseeded random.shuffle() made the
# split differ between runs despite random_state=42 below.
random.Random(42).shuffle(data)

# 80% train, 10% val, 10% test
# train_test_split raises when a resulting subset would be empty, so tiny or
# empty datasets are handled explicitly instead of crashing the cell.
if len(data) >= 2:
    train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)
else:
    train_data, temp_data = list(data), []

if len(temp_data) >= 2:
    val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
else:
    val_data, test_data = list(temp_data), []

# --------------------------------------
# 3. Build metadata summary
# --------------------------------------
first_record = data[0] if data else {}
metadata = {
    "total_images": len(data),
    "num_classes": len(classes),
    "classes": classes,
    "split": {
        "train": len(train_data),
        "val": len(val_data),
        "test": len(test_data)
    },
    # One sample record so readers of metadata.json can see the entry schema.
    "example_structure": {
        "path": first_record.get("path"),
        "class": first_record.get("class"),
        "cuisine": first_record.get("cuisine")
    }
}

# --------------------------------------
# 4. Save metadata and splits
# --------------------------------------
output_dir = os.path.join(dataset_dir, "metadata")
os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(output_dir, "metadata.json"), "w") as f:
    json.dump(metadata, f, indent=4)

with open(os.path.join(output_dir, "train.json"), "w") as f:
    json.dump(train_data, f, indent=4)

with open(os.path.join(output_dir, "val.json"), "w") as f:
    json.dump(val_data, f, indent=4)

with open(os.path.join(output_dir, "test.json"), "w") as f:
    json.dump(test_data, f, indent=4)

print(f"‚úÖ Metadata and splits saved in {output_dir}")
print(f"Classes found: {len(classes)}")
