## Prepare data in the expected format for training (to be run in Kaggle notebooks)

In [None]:
import os
import shutil
import random
from pathlib import Path
import json
from collections import Counter


# Location of the source dataset inside the Kaggle input mount.
dataset_path = Path('/kaggle/input/fashion-product-images-dataset/fashion-dataset')

# Categories to keep (matched against the `articleType` column of styles.csv)
TARGET_CATEGORIES = ['Shirts', 'Watches', 'Casual Shoes', 'Tops', 'Handbags']
# Upper bound on the number of images sampled for each category
IMAGES_PER_CLASS = 500
# Split fractions. Only TRAIN_RATIO and VAL_RATIO drive the actual split; the
# test split receives whatever remains, and TEST_RATIO is only recorded in
# dataset_info.json. The three values therefore have to sum to 1.
TRAIN_RATIO = 0.7
VAL_RATIO = 0.15
TEST_RATIO = 0.15

if abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) > 1e-9:
    raise ValueError(
        "TRAIN_RATIO, VAL_RATIO and TEST_RATIO must sum to 1 "
        f"(got {TRAIN_RATIO + VAL_RATIO + TEST_RATIO})"
    )

# Output directory
output_dir = Path('/kaggle/working/fashion_subset')
# parents=True so the script also works when the parent directory does not
# exist yet (matches how the per-category directories are created later).
output_dir.mkdir(parents=True, exist_ok=True)


# Report what sits directly under the dataset root. Only the first entry
# produced by os.walk (the root itself) is inspected; nothing is recursed into.
print("Exploring dataset structure...")
top_level = next(os.walk(dataset_path), None)
if top_level is not None:
    root, dirs, files = top_level
    print(f"{root}: {len(files)} files, {len(dirs)} dirs")
    if files:
        print(f"Sample files: {files[:3]}")


import pandas as pd
# Build the train/val/test image subset from styles.csv:
#   1. load the metadata and keep only TARGET_CATEGORIES,
#   2. sample up to IMAGES_PER_CLASS rows per category,
#   3. split each category's ids into train/val/test,
#   4. copy the image files into <output_dir>/<split>/<category>/,
#   5. write dataset_info.json and zip the result for download.
styles_path = dataset_path / 'styles.csv'
if styles_path.exists():
    # Rows that pandas cannot parse are dropped instead of aborting the load.
    df = pd.read_csv(styles_path, on_bad_lines='skip')
    print(f"\nDataset columns: {df.columns.tolist()}")
    print(f"Total images: {len(df)}")
    print(f"\nCategory distribution:\n{df['articleType'].value_counts().head(10)}")

    df_filtered = df[df['articleType'].isin(TARGET_CATEGORIES)]
    print(f"\nFiltered to {len(df_filtered)} images across {TARGET_CATEGORIES}")
    print(f"Distribution: {df_filtered['articleType'].value_counts()}")

    # Sample per category with a fixed seed so reruns pick the same rows.
    sampled_images = []
    for category in TARGET_CATEGORIES:
        cat_df = df_filtered[df_filtered['articleType'] == category]
        sample_size = min(IMAGES_PER_CLASS, len(cat_df))
        sampled = cat_df.sample(n=sample_size, random_state=42)
        sampled_images.append(sampled)
        print(f"{category}: sampled {sample_size} images")

    df_sampled = pd.concat(sampled_images)
    print(f"\nTotal sampled: {len(df_sampled)} images")

    # Split into train/val/test
    splits = {'train': [], 'val': [], 'test': []}

    for category in TARGET_CATEGORIES:
        cat_images = df_sampled[df_sampled['articleType'] == category]['id'].tolist()
        # A private generator seeded with 42 gives the same permutation as
        # seeding the module-level RNG, without disturbing global random state.
        random.Random(42).shuffle(cat_images)

        n_train = int(len(cat_images) * TRAIN_RATIO)
        n_val = int(len(cat_images) * VAL_RATIO)

        splits['train'].extend([(img, category) for img in cat_images[:n_train]])
        splits['val'].extend([(img, category) for img in cat_images[n_train:n_train+n_val]])
        # The test split takes the remainder, so rounding never drops an image.
        splits['test'].extend([(img, category) for img in cat_images[n_train+n_val:]])

    print("\nSplit sizes:")
    print(f"Train: {len(splits['train'])}")
    print(f"Val: {len(splits['val'])}")
    print(f"Test: {len(splits['test'])}")

    # Copy images to organized structure
    images_dir = dataset_path / 'images'
    copied = Counter()  # images actually written, keyed by split name
    missing = []        # ids listed in styles.csv with no image file on disk

    for split_name, images in splits.items():
        for img_id, category in images:
            src = images_dir / f"{img_id}.jpg"
            if not src.exists():
                missing.append(img_id)
                continue

            dst_dir = output_dir / split_name / category.lower()
            dst_dir.mkdir(parents=True, exist_ok=True)
            shutil.copy2(src, dst_dir / src.name)
            copied[split_name] += 1

    if missing:
        # Surface skipped files so the counts reported below are not a surprise.
        print(f"\nWarning: {len(missing)} sampled images were not found in {images_dir} and were skipped")
        print(f"Missing ids (first 10): {missing[:10]}")

    # Save dataset info. Split sizes reflect the files that were really copied,
    # so the metadata stays in agreement with the directory contents.
    dataset_info = {
        'categories': TARGET_CATEGORIES,
        'images_per_class': IMAGES_PER_CLASS,
        'splits': {split_name: copied[split_name] for split_name in splits},
        'split_ratios': {
            'train': TRAIN_RATIO,
            'val': VAL_RATIO,
            'test': TEST_RATIO
        },
        'missing_images': len(missing)
    }

    with open(output_dir / 'dataset_info.json', 'w', encoding='utf-8') as f:
        json.dump(dataset_info, f, indent=2)

    print("\n✓ Dataset preparation complete!")
    print(f"Output directory: {output_dir}")

    # Verify structure
    print("\nFinal structure:")
    for split in ['train', 'val', 'test']:
        split_dir = output_dir / split
        if split_dir.exists():
            # Sorted so the listing is stable from run to run.
            for cat_dir in sorted(split_dir.iterdir()):
                if cat_dir.is_dir():
                    count = sum(1 for _ in cat_dir.glob('*.jpg'))
                    print(f"{split}/{cat_dir.name}: {count} images")

    # Create zip for download. The archive is written next to output_dir
    # (<output_dir>.zip); make_archive returns the path it produced.
    archive_path = shutil.make_archive(str(output_dir), 'zip', output_dir)
    print(f"\n✓ Created {Path(archive_path).name} for download")
    print(f"Size: {os.path.getsize(archive_path) / 1024 / 1024:.2f} MB")
else:
    # Without the metadata file nothing can be prepared; say so explicitly
    # instead of finishing silently.
    print(f"\nstyles.csv not found at {styles_path}; no dataset was prepared")