In [1]:
import sys
from pathlib import Path
from collections import Counter, defaultdict
from tqdm import tqdm
import re

In [8]:
# Make the project importable: when the notebook is launched from the
# 'notebooks' folder the root is its parent, otherwise the current working
# directory itself is treated as the root.
notebook_dir = Path.cwd()
project_root = notebook_dir.parent if notebook_dir.name == 'notebooks' else notebook_dir
sys.path.insert(0, str(project_root))

from datasets import load_from_disk

print("=== DEEPFASHION DATA AUDIT & CLEANING ===\n")

# Read the processed dataset from disk and keep its "train" split
print("1. Loading dataset...")
ds = load_from_disk("../data/processed/hf")
train_ds = ds["train"]
print(f"   Dataset loaded with {len(train_ds)} examples")

# Audit state, filled in by the audit pass:
#   * integer counters, all starting at zero
#   * per-sample measurements (text lengths, image sizes)
#   * value distributions for the three category fields
#   * the indices of samples flagged for removal
audit_results = {'total_samples': len(train_ds)}
audit_results.update({
    counter_name: 0
    for counter_name in (
        'corrupted_images',
        'empty_images',
        'tiny_images',
        'missing_text',
        'empty_text',
        'category3_none_count',
        'fabric_mentions',
        'design_attributes',
    )
})
audit_results.update(
    text_lengths=[],
    image_sizes=[],
    category1_distribution=Counter(),
    category2_distribution=Counter(),
    category3_values=Counter(),
    samples_to_remove=[],
)

print("\n2. Auditing images and basic fields...")

# Keywords used for the text-quality checks. They never change between
# samples, so they are built once instead of on every loop iteration.
fabric_keywords = ['cotton', 'silk', 'wool', 'linen', 'polyester', 'nylon', 'spandex', 'rayon', 'denim', 'leather', 'suede']
design_keywords = ['pattern', 'striped', 'checked', 'printed', 'solid', 'neckline', 'sleeve', 'collar', 'crew', 'v-neck', 'hood']

# Audit each sample.
#
# Rows are fetched by index *inside* the try block: when iterating the dataset
# directly, the row (including its image) is produced by the iterator before
# the loop body runs, so a failure while producing it would escape the
# try/except and abort the whole audit instead of being counted.
for idx in tqdm(range(len(train_ds)), desc="Auditing samples"):
    try:
        example = train_ds[idx]

        # Check image
        image = example['image']
        if image is None:
            audit_results['samples_to_remove'].append(idx)
            audit_results['empty_images'] += 1
            continue

        # Check image size (the image object is expected to expose `.size`
        # as a (width, height) pair, as PIL images do)
        width, height = image.size
        audit_results['image_sizes'].append((width, height))

        # Flag tiny images (< 32x32 pixels)
        if width < 32 or height < 32:
            audit_results['samples_to_remove'].append(idx)
            audit_results['tiny_images'] += 1
            continue

        # Check text: a None value counts as "missing", a blank string as "empty"
        text = example.get('text', '')
        if text is None:
            audit_results['missing_text'] += 1
            audit_results['samples_to_remove'].append(idx)
            continue

        text = str(text).strip()
        if not text:
            audit_results['empty_text'] += 1
            audit_results['samples_to_remove'].append(idx)
            continue

        audit_results['text_lengths'].append(len(text))

        # Check category3 (None is tallied under the string key 'None')
        category3 = example.get('category3')
        if category3 is None:
            audit_results['category3_none_count'] += 1
        audit_results['category3_values'][str(category3)] += 1

        # Count distributions
        audit_results['category1_distribution'][example.get('category1', 'unknown')] += 1
        audit_results['category2_distribution'][example.get('category2', 'unknown')] += 1

        # Text quality analysis (case-insensitive substring matching)
        text_lower = text.lower()

        # Check for fabric mentions
        if any(fabric in text_lower for fabric in fabric_keywords):
            audit_results['fabric_mentions'] += 1

        # Check for design attributes
        if any(design in text_lower for design in design_keywords):
            audit_results['design_attributes'] += 1

    except Exception as e:
        # Best-effort audit: any failure on a sample is reported, counted as a
        # corrupted image, and the sample is scheduled for removal.
        print(f"   Error processing sample {idx}: {e}")
        audit_results['corrupted_images'] += 1
        audit_results['samples_to_remove'].append(idx)

print("\n3. Generating audit report...")

# Calculate statistics
total_samples = audit_results['total_samples']
image_sizes = audit_results['image_sizes']
text_lengths = audit_results['text_lengths']


def _pct(count):
    """Return ``count`` as a percentage of ``total_samples`` (0.0 for an empty dataset)."""
    return count / total_samples * 100 if total_samples else 0.0


# Guard the average the same way min/max are guarded in the report: when no
# sample has usable text the list is empty and a plain division would raise
# ZeroDivisionError.
avg_text_length = sum(text_lengths) / len(text_lengths) if text_lengths else 0.0

# Indices are de-duplicated before counting how many samples get dropped
removed_count = len(set(audit_results['samples_to_remove']))

print(f"""
=== DATA AUDIT REPORT ===
Total samples: {total_samples}
{'─' * 40}

IMAGE AUDIT:
• Corrupted images: {audit_results['corrupted_images']} ({_pct(audit_results['corrupted_images']):.1f}%)
• Empty images: {audit_results['empty_images']} ({_pct(audit_results['empty_images']):.1f}%)
• Tiny images (<32x32): {audit_results['tiny_images']} ({_pct(audit_results['tiny_images']):.1f}%)

TEXT AUDIT:
• Missing text: {audit_results['missing_text']} ({_pct(audit_results['missing_text']):.1f}%)
• Empty text: {audit_results['empty_text']} ({_pct(audit_results['empty_text']):.1f}%)
• Average text length: {avg_text_length:.1f} characters
• Min text length: {min(text_lengths) if text_lengths else 0}
• Max text length: {max(text_lengths) if text_lengths else 0}

CATEGORY3 AUDIT:
• None values: {audit_results['category3_none_count']} ({_pct(audit_results['category3_none_count']):.1f}%)
• Unique non-null values: {sum(1 for k in audit_results['category3_values'] if k != 'None')}

TEXT QUALITY ANALYSIS:
• Samples with fabric mentions: {audit_results['fabric_mentions']} ({_pct(audit_results['fabric_mentions']):.1f}%)
• Samples with design attributes: {audit_results['design_attributes']} ({_pct(audit_results['design_attributes']):.1f}%)

CLEANING SUMMARY:
• Total samples to remove: {removed_count}
• Clean samples remaining: {total_samples - removed_count}
""")

# Show top categories
print("\nTOP 10 CATEGORIES (category2):")
for cat, count in audit_results['category2_distribution'].most_common(10):
    print(f"  {cat}: {count} ({_pct(count):.1f}%)")

print("\nCATEGORY3 VALUES:")
for val, count in audit_results['category3_values'].most_common():
    print(f"  {val}: {count} ({_pct(count):.1f}%)")
# Create cleaned dataset
print("\n4. Creating cleaned dataset...")
# Build the removal set once; constructing it inside the comprehension would
# rebuild it for every index in the dataset.
removed_indices = set(audit_results['samples_to_remove'])
clean_indices = [i for i in range(total_samples) if i not in removed_indices]
cleaned_train_ds = train_ds.select(clean_indices)

# Remove category3 column since it's mostly None.
# The ratio is guarded against an empty dataset, and the column is only
# dropped when it actually exists (a missing column also makes every
# `example.get('category3')` lookup come back as None).
category3_none_ratio = (
    audit_results['category3_none_count'] / total_samples if total_samples else 0.0
)
if category3_none_ratio > 0.95 and 'category3' in cleaned_train_ds.column_names:  # If >95% None
    print("   Removing category3 column (mostly None)...")
    cleaned_train_ds = cleaned_train_ds.remove_columns(['category3'])

print(f"   Cleaned dataset: {len(cleaned_train_ds)} samples")

# Save cleaned dataset
print("\n5. Saving cleaned dataset...")
output_path = "../data/processed/cleaned_hf"
cleaned_train_ds.save_to_disk(output_path)
print(f"   Saved to: {output_path}")

print("\n✅ Data audit and cleaning completed!")
print(f"   Original: {total_samples} samples")
print(f"   Cleaned:  {len(cleaned_train_ds)} samples")
print(f"   Removed:  {len(removed_indices)} samples")

  from .autonotebook import tqdm as notebook_tqdm


=== DEEPFASHION DATA AUDIT & CLEANING ===

1. Loading dataset...
   Dataset loaded with 38283 examples

2. Auditing images and basic fields...


Auditing samples: 100%|██████████| 38283/38283 [00:29<00:00, 1303.22it/s]



3. Generating audit report...

=== DATA AUDIT REPORT ===
Total samples: 38283
────────────────────────────────────────

IMAGE AUDIT:
• Corrupted images: 0 (0.0%)
• Empty images: 0 (0.0%)
• Tiny images (<32x32): 0 (0.0%)

TEXT AUDIT:
• Missing text: 0 (0.0%)
• Empty text: 0 (0.0%)
• Average text length: 225.8 characters
• Min text length: 23
• Max text length: 514

CATEGORY3 AUDIT:
• None values: 38283 (100.0%)
• Unique non-null values: 0

TEXT QUALITY ANALYSIS:
• Samples with fabric mentions: 33687 (88.0%)
• Samples with design attributes: 38273 (100.0%)

CLEANING SUMMARY:
• Total samples to remove: 0
• Clean samples remaining: 38283


TOP 10 CATEGORIES (category2):
  tees: 11982 (31.3%)
  blouses: 6742 (17.6%)
  dresses: 6001 (15.7%)
  sweaters: 2980 (7.8%)
  jackets: 1949 (5.1%)
  rompers: 1477 (3.9%)
  shorts: 1356 (3.5%)
  sweatshirts: 1267 (3.3%)
  cardigans: 1258 (3.3%)
  graphic: 1016 (2.7%)

CATEGORY3 VALUES:
  None: 38283 (100.0%)

4. Creating cleaned dataset...
   Removing category3 column (mostly None)...
   Cleaned dataset: 38283 samples

5. Saving cleaned dataset...

Saving the dataset (1/1 shards): 100%|██████████| 38283/38283 [00:00<00:00, 80123.86 examples/s]

   Saved to: ../data/processed/cleaned_hf

✅ Data audit and cleaning completed!
   Original: 38283 samples
   Cleaned:  38283 samples
   Removed:  0 samples



