In [4]:
import os
from pathlib import Path
from PIL import Image
import json

# Report banner: blank line, rule, title, rule.
print("", "=" * 80, sep="\n")
print("üîç COMPLETE DATASET VERIFICATION")
print("=" * 80)

# Dataset locations: one sub-folder per category under the raw-data root.
data_root = Path('C:/Users/USER/fruit_veg_mushroom_identifier/data/raw')
fruits_dir, vegetables_dir, mushrooms_dir = (
    data_root / folder for folder in ('Fruits', 'Vegetables', 'Mushrooms')
)

def _count_image_files(folder, extensions=('.jpg', '.jpeg', '.png')):
    """Return how many entries in *folder* end with one of *extensions* (case-insensitive)."""
    return sum(1 for entry in os.listdir(folder)
               if entry.lower().endswith(extensions))


def count_images_by_class(base_dir, category_name):
    """Print a per-class image report for one category and return its totals.

    Two layouts are recognised:

    * split layout - ``base_dir`` contains at least one of the folders
      ``train``, ``validation`` or ``test``, each holding one folder per class;
    * flat layout - ``base_dir`` directly contains one folder per class.

    Args:
        base_dir: ``pathlib.Path`` of the category folder.
        category_name: Label printed (upper-cased) in the report header.

    Returns:
        A ``(total_images, total_classes, class_info)`` tuple where
        ``class_info`` maps each class folder name to its image count. For the
        split layout the counts are summed over all splits and
        ``total_classes`` is the number of distinct class names seen in any
        split. ``(0, 0, {})`` is returned when ``base_dir`` is not a directory.
    """
    print(f"\n{'‚îÄ'*80}")
    print(f"üìä {category_name.upper()}")
    print(f"{'‚îÄ'*80}")

    # is_dir() (rather than exists()) also rejects a plain file, which would
    # otherwise make os.listdir() raise below.
    if not base_dir.is_dir():
        print(f"‚ùå PATH NOT FOUND: {base_dir}")
        return 0, 0, {}

    total_images = 0
    class_info = {}

    subdirs = sorted(item for item in os.listdir(base_dir)
                     if (base_dir / item).is_dir())
    # Only sub-folders that are real directories count as splits.
    present_splits = [split for split in ('train', 'validation', 'test')
                      if split in subdirs]

    if present_splits:
        print("üìÅ Structure: train/validation/test splits\n")

        for split in present_splits:
            split_path = base_dir / split
            split_total = 0
            for class_name in sorted(os.listdir(split_path)):
                class_path = split_path / class_name
                if not class_path.is_dir():
                    continue
                count = _count_image_files(class_path)
                split_total += count
                # Accumulate per class so a class present in several splits
                # is reported once with its combined image count.
                class_info[class_name] = class_info.get(class_name, 0) + count

            total_images += split_total
            print(f"  {split:12s}: {split_total:4d} images")

        # Distinct classes across every split, not just 'train', so a dataset
        # without a train folder is not reported as having zero classes.
        total_classes = len(class_info)
    else:
        print("üìÅ Structure: Direct class folders\n")

        for class_name in subdirs:
            count = _count_image_files(base_dir / class_name)
            total_images += count
            class_info[class_name] = count

            # Classes with fewer than 30 images get a warning marker.
            status = "‚úÖ" if count >= 30 else "‚ö†Ô∏è "
            bar = "‚ñà" * (count // 10)
            print(f"  {status} {class_name:30s}: {count:3d} images {bar}")

        total_classes = len(class_info)

    print(f"\n  Total classes: {total_classes}")
    print(f"  Total images: {total_images}")

    return total_images, total_classes, class_info


def _report_mushroom_group(base_dir, label, marker):
    """Print the per-class report for one safety group and return its totals.

    Looks for ``base_dir/<label>`` and falls back to the lower-cased folder
    name. Returns ``(image_total, class_count, found)``; the first two are 0
    when the group folder is missing.
    """
    group_path = base_dir / label
    if not group_path.is_dir():
        group_path = base_dir / label.lower()
    found = group_path.is_dir()

    print(f"\n{marker} {label} MUSHROOMS: {'‚úÖ Found' if found else '‚ùå Not found'}")
    if not found:
        return 0, 0, False

    classes = sorted(d for d in os.listdir(group_path)
                     if (group_path / d).is_dir())
    print(f"   Classes: {len(classes)}\n")

    group_total = 0
    for cls in classes:
        count = sum(1 for f in os.listdir(group_path / cls)
                    if f.lower().endswith(('.jpg', '.jpeg', '.png')))
        group_total += count

        # Classes with fewer than 50 images get a warning marker.
        status = "‚úÖ" if count >= 50 else "‚ö†Ô∏è "
        bar = "‚ñà" * (count // 5)
        print(f"   {status} {cls:28s}: {count:3d} images {bar}")

    print(f"\n   Total {label}: {group_total} images")
    return group_total, len(classes), True


def verify_mushroom_split(base_dir):
    """Verify that the mushroom data is split into EDIBLE and TOXIC folders.

    Prints a per-class image report for each group that is present.

    Args:
        base_dir: ``pathlib.Path`` of the mushroom category folder.

    Returns:
        A ``(total_images, total_classes, split_ok)`` tuple. ``split_ok`` is
        True only when both the EDIBLE and the TOXIC folder (upper- or
        lower-case) exist. A missing group contributes zero images and zero
        classes instead of raising, so the caller can report the failed check.
        ``(0, 0, False)`` is returned when ``base_dir`` does not exist.
    """
    print(f"\n{'‚îÄ'*80}")
    print(f"üçÑ MUSHROOM SAFETY CLASSIFICATION")
    print(f"{'‚îÄ'*80}")

    if not base_dir.exists():
        print(f"‚ùå PATH NOT FOUND: {base_dir}")
        return 0, 0, False

    edible_total, edible_count, edible_found = _report_mushroom_group(
        base_dir, 'EDIBLE', 'üü¢')
    toxic_total, toxic_count, toxic_found = _report_mushroom_group(
        base_dir, 'TOXIC', 'üî¥')

    mushroom_total = edible_total + toxic_total
    split_ok = edible_found and toxic_found

    return mushroom_total, edible_count + toxic_count, split_ok


# Collect counts for every category.
fruits_imgs, fruits_classes, fruits_info = count_images_by_class(fruits_dir, 'Fruits')
veg_imgs, veg_classes, veg_info = count_images_by_class(vegetables_dir, 'Vegetables')
mushroom_imgs, mushroom_classes, mushroom_split_ok = verify_mushroom_split(mushrooms_dir)

# Per-category totals, keyed by display name.
summary_data = {
    'Fruits': {'images': fruits_imgs, 'classes': fruits_classes},
    'Vegetables': {'images': veg_imgs, 'classes': veg_classes},
    'Mushrooms': {'images': mushroom_imgs, 'classes': mushroom_classes}
}
total_images = sum(entry['images'] for entry in summary_data.values())
total_classes = sum(entry['classes'] for entry in summary_data.values())

# Summary table: one row per category, then the grand total.
print(f"\n{'='*80}")
print(f"üìä FINAL DATASET SUMMARY")
print(f"{'='*80}")

for category, data in summary_data.items():
    print(f"{category:15s}: {data['images']:5d} images / {data['classes']:2d} classes")

print(f"{'‚îÄ'*80}")
print(f"{'TOTAL':15s}: {total_images:5d} images / {total_classes:2d} classes")
print(f"{'='*80}")

# Requirement checks: (description, passed?, value shown in the report).
print(f"\n‚úÖ REQUIREMENTS CHECK:")
print(f"{'‚îÄ'*80}")

checks = [
    ("Total images ‚â• 5,000", total_images >= 5000, total_images),
    ("Total classes ‚â• 40", total_classes >= 40, total_classes),
    ("Fruits present", fruits_imgs > 0, fruits_imgs),
    ("Vegetables present", veg_imgs > 0, veg_imgs),
    ("Mushrooms present", mushroom_imgs > 0, mushroom_imgs),
    ("Mushroom EDIBLE/TOXIC split", mushroom_split_ok, "‚úì" if mushroom_split_ok else "‚úó"),
]

for check_name, result, value in checks:
    status = "‚úÖ PASS" if result else "‚ùå FAIL"
    print(f"{status:10s} | {check_name:35s} | {value}")

# The run passes only if every individual check passed.
all_pass = all(passed for _, passed, _ in checks)

print(f"{'='*80}")
print("üéâ ALL CHECKS PASSED - READY FOR WEEK 1!" if all_pass
      else "‚ö†Ô∏è  SOME CHECKS FAILED - REVIEW ABOVE")
print(f"{'='*80}")

# Persist the same figures (lower-case keys) plus the overall verdict as JSON.
summary_output = {name.lower(): dict(counts) for name, counts in summary_data.items()}
summary_output['total'] = {
    'images': total_images,
    'classes': total_classes,
    'all_checks_pass': all_pass
}

with open('C:/Users/USER/fruit_veg_mushroom_identifier/data/verification_summary.json', 'w') as f:
    f.write(json.dumps(summary_output, indent=2))

print("\n‚úÖ Verification summary saved to: C:/Users/USER/fruit_veg_mushroom_identifier/data/verification_summary.json")



🔍 COMPLETE DATASET VERIFICATION

────────────────────────────────────────────────────────────────────────────────
📊 FRUITS
────────────────────────────────────────────────────────────────────────────────
📁 Structure: Direct class folders

  ✅ apple fruit                   :  40 images ████
  ✅ banana fruit                  :  40 images ████
  ✅ cherry fruit                  :  40 images ████
  ✅ chickoo fruit                 :  40 images ████
  ✅ grapes fruit                  :  40 images ████
  ✅ kiwi fruit                    :  40 images ████
  ✅ mango fruit                   :  39 images ███
