In [None]:
# Standard imports
import json
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple
from tqdm import tqdm
import hashlib

# GifLab imports
import sys
sys.path.append('../src')
from giflab import meta, config
from giflab.meta import extract_gif_metadata, GifMetadata
from giflab.config import CompressionConfig, PathConfig

# Seed-builder settings shared by the cells below
SEED_CONFIG = dict(
    version='1.0',
    max_files_scan=None,  # None = scan all files
    batch_size=1000,
    validation_enabled=True,
    backup_existing=True,
)

print("🌱 GifLab Seed JSON Builder")
print(f"🔧 Configuration: {SEED_CONFIG}")
print("✅ Setup complete!")

# Working directories, expressed relative to the current working directory
raw_dir = Path("../data/raw")
seed_dir = Path("../seed")
csv_dir = Path("../data/csv")

# Output locations must exist before any later cell writes into them
seed_dir.mkdir(exist_ok=True)
csv_dir.mkdir(parents=True, exist_ok=True)

print(
    "\n".join(
        [
            f"📁 Raw directory: {raw_dir}",
            f"📁 Seed directory: {seed_dir}",
            f"📁 CSV directory: {csv_dir}",
        ]
    )
)


In [None]:
def scan_gif_directory_comprehensive(raw_dir: Path, max_files: Optional[int] = None) -> Dict[str, Any]:
    """
    Scan a directory tree for GIF files and extract metadata for each one.

    Args:
        raw_dir: Path to directory containing raw GIF files
        max_files: Maximum number of files to process (None = all files)

    Returns:
        Dictionary with the keys 'scan_completed', 'scan_timestamp',
        'total_files_found', 'files_processed', 'valid_gifs',
        'corrupted_gifs', 'duplicate_groups' and 'success_rate'. When
        raw_dir does not exist the same keys are returned with empty/zero
        values, 'scan_completed' set to False and an extra 'error' message.
    """
    print(f"🔍 Starting comprehensive scan of: {raw_dir}")

    if not raw_dir.exists():
        print(f"⚠️  Directory not found: {raw_dir}")
        # Same key set as the success path so callers can print a summary
        # without special-casing the failure.
        return {
            'scan_completed': False,
            'error': f"Directory not found: {raw_dir}",
            'scan_timestamp': datetime.now().isoformat(),
            'total_files_found': 0,
            'files_processed': 0,
            'valid_gifs': [],
            'corrupted_gifs': [],
            'duplicate_groups': {},
            'success_rate': 0
        }

    # Find all GIF files recursively. Sorting fixes the processing order, which
    # would otherwise depend on the filesystem; both the seeded sample and the
    # choice of canonical duplicate rely on that order.
    gif_files = sorted(raw_dir.rglob("*.gif"))
    total_files = len(gif_files)

    print(f"📁 Found {total_files} GIF files")

    # Sample files if requested
    if max_files and max_files < total_files:
        import random
        # A private generator keeps the sample reproducible without reseeding
        # the process-wide random state.
        gif_files = random.Random(42).sample(gif_files, max_files)
        print(f"📊 Sampling {max_files} files for analysis")

    # Process files
    valid_gifs = []
    corrupted_gifs = []
    sha_to_paths = {}  # For duplicate detection

    for gif_path in tqdm(gif_files, desc="Processing GIFs"):
        # Files that cannot even be stat'ed are reported and skipped
        try:
            stat = gif_path.stat()
            file_size_bytes = stat.st_size
            file_modified = datetime.fromtimestamp(stat.st_mtime).isoformat()
        except Exception as file_error:
            print(f"⚠️  Error accessing {gif_path}: {file_error}")
            continue

        relative_path = str(gif_path.relative_to(raw_dir))

        # Any failure while extracting or deriving metadata marks the file as corrupted
        try:
            metadata = extract_gif_metadata(gif_path)

            # Calculate derived metrics (0 when the divisor is not positive)
            aspect_ratio = metadata.orig_width / metadata.orig_height if metadata.orig_height > 0 else 0
            duration_seconds = metadata.orig_frames / metadata.orig_fps if metadata.orig_fps > 0 else 0
            pixels_total = metadata.orig_width * metadata.orig_height

            # Calculate complexity score
            complexity_score = calculate_complexity_score(metadata, pixels_total)
            complexity_category = classify_complexity(complexity_score)

            gif_info = {
                'gif_sha': metadata.gif_sha,
                'orig_filename': metadata.orig_filename,
                'file_path': relative_path,
                'absolute_path': str(gif_path),
                'file_size_bytes': file_size_bytes,
                'orig_kilobytes': metadata.orig_kilobytes,
                'orig_width': metadata.orig_width,
                'orig_height': metadata.orig_height,
                'orig_frames': metadata.orig_frames,
                'orig_fps': metadata.orig_fps,
                'orig_n_colors': metadata.orig_n_colors,
                'entropy': metadata.entropy,
                'aspect_ratio': aspect_ratio,
                'duration_seconds': duration_seconds,
                'pixels_total': pixels_total,
                'complexity_score': complexity_score,
                'complexity_category': complexity_category,
                'file_modified': file_modified,
                'processing_priority': get_processing_priority(complexity_category)
            }
        except Exception as meta_error:
            corrupted_gifs.append({
                'file_path': relative_path,
                'absolute_path': str(gif_path),
                'file_size_bytes': file_size_bytes,
                'error': str(meta_error),
                'file_modified': file_modified
            })
            continue

        valid_gifs.append(gif_info)

        # Track for duplicate detection
        sha_to_paths.setdefault(metadata.gif_sha, []).append(gif_path)

    # Identify duplicate groups: the first path seen for a SHA is the canonical one
    duplicate_groups = {}
    for sha, paths in sha_to_paths.items():
        if len(paths) > 1:
            duplicate_groups[f"group_{len(duplicate_groups) + 1}"] = {
                'canonical_sha': sha,
                'duplicate_paths': [str(p.relative_to(raw_dir)) for p in paths[1:]],
                'canonical_path': str(paths[0].relative_to(raw_dir))
            }

    return {
        'scan_completed': True,
        'scan_timestamp': datetime.now().isoformat(),
        'total_files_found': total_files,
        'files_processed': len(gif_files),
        'valid_gifs': valid_gifs,
        'corrupted_gifs': corrupted_gifs,
        'duplicate_groups': duplicate_groups,
        'success_rate': len(valid_gifs) / len(gif_files) if gif_files else 0
    }


def calculate_complexity_score(metadata: GifMetadata, pixels_total: int) -> float:
    """Calculate complexity score for a GIF based on its characteristics.

    The score is the sum of three tiered contributions and therefore ranges
    from 25 to 100.

    Args:
        metadata: Extracted metadata; only ``entropy`` and ``orig_frames`` are read.
        pixels_total: Width times height of the GIF.

    Returns:
        The summed score.
    """
    score = 0

    # Entropy contribution: 10, 25 or 40 points (a missing entropy counts as 0)
    entropy = metadata.entropy or 0
    if entropy > 6:
        score += 40
    elif entropy > 4:
        score += 25
    else:
        score += 10

    # Frame count contribution: 5, 20 or 30 points
    frames = metadata.orig_frames
    if frames > 50:
        score += 30
    elif frames > 20:
        score += 20
    else:
        score += 5

    # Dimension contribution: 10, 20 or 30 points
    if pixels_total > 500000:
        score += 30
    elif pixels_total > 200000:
        score += 20
    else:
        score += 10

    return score


def classify_complexity(score: float) -> str:
    """Classify complexity based on score (>= 80 high, >= 50 medium, otherwise low)."""
    if score >= 80:
        return 'high_complexity'
    elif score >= 50:
        return 'medium_complexity'
    else:
        return 'low_complexity'


def get_processing_priority(complexity_category: str) -> str:
    """Get processing priority based on complexity; unknown categories map to 'medium'."""
    priority_map = {
        'high_complexity': 'high',
        'medium_complexity': 'medium',
        'low_complexity': 'low'
    }
    return priority_map.get(complexity_category, 'medium')


# Perform comprehensive scan
print("🚀 Starting comprehensive GIF directory scan...")
scan_results = scan_gif_directory_comprehensive(raw_dir, max_files=SEED_CONFIG['max_files_scan'])

print(f"\n📊 Scan Results Summary:")
print(f"   • Total files found: {scan_results['total_files_found']}")
print(f"   • Files processed: {scan_results['files_processed']}")
print(f"   • Valid GIFs: {len(scan_results['valid_gifs'])}")
print(f"   • Corrupted files: {len(scan_results['corrupted_gifs'])}")
print(f"   • Duplicate groups: {len(scan_results['duplicate_groups'])}")
print(f"   • Success rate: {scan_results['success_rate']:.1%}")


In [None]:
def generate_metadata_seed(scan_results: Dict[str, Any]) -> Dict[str, Any]:
    """Generate the metadata seed: a per-GIF index keyed by SHA plus summary statistics.

    Args:
        scan_results: Scan output containing at least 'scan_completed' and
            'valid_gifs'; the count fields are read only when GIFs were found.

    Returns:
        Dict with 'version', 'generated_at', 'total_gifs', 'gif_metadata',
        'statistics' and 'scan_info'. When the scan did not complete or found
        no valid GIFs, the index and statistics are empty and 'scan_info'
        carries the raw scan_results.
    """
    print("🔧 Generating metadata seed file...")

    valid_gifs = scan_results.get('valid_gifs') or []
    if not scan_results.get('scan_completed') or not valid_gifs:
        print("⚠️  No valid GIFs found for metadata seed generation")
        return {
            'version': SEED_CONFIG['version'],
            'generated_at': datetime.now().isoformat(),
            'total_gifs': 0,
            'gif_metadata': {},
            'statistics': {},
            'scan_info': scan_results
        }

    # Build metadata index. GIFs sharing a SHA collapse into one entry (last one wins).
    metadata_fields = (
        'orig_filename', 'file_path', 'orig_kilobytes', 'orig_width',
        'orig_height', 'orig_frames', 'orig_fps', 'orig_n_colors', 'entropy',
        'aspect_ratio', 'duration_seconds', 'complexity_score',
        'complexity_category', 'processing_priority', 'file_modified',
        'pixels_total',
    )
    gif_metadata = {
        gif_info['gif_sha']: {field: gif_info[field] for field in metadata_fields}
        for gif_info in valid_gifs
    }

    # Calculate statistics. valid_gifs is non-empty here, so the frame has rows.
    df = pd.DataFrame(valid_gifs)

    def _number_or_none(value: Any) -> Optional[float]:
        # NaN (e.g. the std of a single sample) would be written as the
        # non-standard JSON token NaN; emit null instead.
        return float(value) if pd.notna(value) else None

    # Entropy may be missing for some GIFs; coerce so the aggregates skip them.
    entropy = pd.to_numeric(df['entropy'], errors='coerce')

    dimension_counts = (
        df['orig_width'].astype(str) + 'x' + df['orig_height'].astype(str)
    ).value_counts()
    aspect_ratio_counts = df['aspect_ratio'].round(2).value_counts()
    size_percentiles = df['orig_kilobytes'].quantile([0.25, 0.5, 0.75, 0.9, 0.95])

    statistics = {
        'total_gifs': len(valid_gifs),
        'avg_file_size_kb': float(df['orig_kilobytes'].mean()),
        'avg_frames': float(df['orig_frames'].mean()),
        'avg_fps': float(df['orig_fps'].mean()),
        'avg_duration_seconds': float(df['duration_seconds'].mean()),
        'complexity_distribution': df['complexity_category'].value_counts().to_dict(),
        # Five most frequent "WIDTHxHEIGHT" strings
        'most_common_dimensions': dimension_counts.head(5).index.tolist(),
        # Five most frequent aspect ratios, rounded to two decimals
        'most_common_aspect_ratios': aspect_ratio_counts.head(5).index.tolist(),
        # round() rather than int() so float error cannot truncate a label
        'file_size_percentiles': {
            f'p{int(round(k * 100))}': float(v) for k, v in size_percentiles.items()
        },
        'entropy_stats': {
            'mean': _number_or_none(entropy.mean()),
            'std': _number_or_none(entropy.std()),
            'min': _number_or_none(entropy.min()),
            'max': _number_or_none(entropy.max())
        }
    }

    metadata_seed = {
        'version': SEED_CONFIG['version'],
        'generated_at': datetime.now().isoformat(),
        'total_gifs': len(valid_gifs),
        'gif_metadata': gif_metadata,
        'statistics': statistics,
        'scan_info': {
            'total_files_found': scan_results['total_files_found'],
            'files_processed': scan_results['files_processed'],
            'corrupted_files': len(scan_results['corrupted_gifs']),
            'duplicate_groups': len(scan_results['duplicate_groups']),
            'success_rate': scan_results['success_rate']
        }
    }

    print(f"✅ Generated metadata seed for {len(gif_metadata)} GIFs")
    return metadata_seed

# Build the metadata seed from the scan output
metadata_seed = generate_metadata_seed(scan_results)

# Headline numbers first; the detailed figures only exist when GIFs were indexed
print(f"\n📊 Metadata Seed Summary:")
print(f"   • Total GIFs indexed: {metadata_seed['total_gifs']}")
if metadata_seed['total_gifs'] > 0:
    stats = metadata_seed['statistics']
    print(
        "\n".join(
            [
                f"   • Average file size: {stats['avg_file_size_kb']:.1f} KB",
                f"   • Average frames: {stats['avg_frames']:.1f}",
                f"   • Average FPS: {stats['avg_fps']:.1f}",
                f"   • Complexity distribution: {stats['complexity_distribution']}",
                f"   • Common dimensions: {stats['most_common_dimensions'][:3]}",
            ]
        )
    )


In [None]:
def generate_processing_seed(scan_results: Dict[str, Any]) -> Dict[str, Any]:
    """Build the processing seed: per-complexity batches and per-GIF parameter hints.

    Returns a dict holding 'processing_batches' (only categories that contain
    GIFs), 'parameter_recommendations' keyed by GIF SHA, the scan's
    'duplicate_groups' and a 'batch_summary' of per-category counts. If the
    scan did not complete or produced no valid GIFs, the batches and
    recommendations are empty and no summary is included.
    """
    print("🔧 Generating processing optimization seed file...")

    if not (scan_results['scan_completed'] and scan_results['valid_gifs']):
        print("⚠️  No valid GIFs found for processing seed generation")
        return {
            'version': SEED_CONFIG['version'],
            'generated_at': datetime.now().isoformat(),
            'processing_batches': {},
            'parameter_recommendations': {},
            'duplicate_groups': scan_results.get('duplicate_groups', {})
        }

    # category -> (description, recommended batch size, estimated seconds per GIF)
    batch_profiles = {
        'high_complexity': ("High entropy, many frames, large dimensions", 50, 45.2),
        'medium_complexity': ("Moderate entropy, average frames", 100, 22.8),
        'low_complexity': ("Low entropy, few frames, small dimensions", 200, 8.5),
    }

    # Bucket the SHAs by complexity category
    shas_by_category = {name: [] for name in batch_profiles}
    for entry in scan_results['valid_gifs']:
        shas_by_category[entry['complexity_category']].append(entry['gif_sha'])

    # Empty categories are left out of the batch plan
    processing_batches = {}
    for name, members in shas_by_category.items():
        if not members:
            continue
        description, size, seconds_per_gif = batch_profiles[name]
        processing_batches[name] = {
            'gifs': members,
            'count': len(members),
            'characteristics': description,
            'recommended_batch_size': size,
            'estimated_time_per_gif_seconds': seconds_per_gif
        }

    def _recommendation_for(category_name):
        # Returns a fresh dict (and fresh lists) on every call so entries never share state
        if category_name == 'high_complexity':
            lossy_levels = [0, 40]
            frame_ratios = [1.0, 0.8, 0.6, 0.4]
            skipped = [
                {'engine': 'animately', 'lossy': 120, 'reason': 'poor_quality_on_complex_gifs'}
            ]
            ssim_target = 0.9
        elif category_name == 'medium_complexity':
            lossy_levels = [0, 40, 80]
            frame_ratios = [1.0, 0.8, 0.6]
            skipped = []
            ssim_target = 0.8
        else:  # low_complexity
            lossy_levels = [0, 40, 80, 120]
            frame_ratios = [1.0, 0.8, 0.6, 0.4]
            skipped = []
            ssim_target = 0.7
        return {
            'optimal_engines': ['gifsicle', 'animately'],
            'recommended_lossy': lossy_levels,
            'recommended_frame_ratios': frame_ratios,
            'recommended_colors': [256, 128, 64],
            'skip_combinations': skipped,
            'quality_target_ssim': ssim_target
        }

    # One recommendation per SHA, annotated with the figures it was derived from
    context_fields = ('complexity_score', 'orig_frames', 'orig_fps', 'entropy', 'pixels_total')
    parameter_recommendations = {}
    for entry in scan_results['valid_gifs']:
        recommendation = _recommendation_for(entry['complexity_category'])
        recommendation['gif_context'] = {field: entry[field] for field in context_fields}
        parameter_recommendations[entry['gif_sha']] = recommendation

    batch_summary = {'total_gifs': len(scan_results['valid_gifs'])}
    for level in ('high', 'medium', 'low'):
        batch_summary[f'{level}_complexity_count'] = len(shas_by_category[f'{level}_complexity'])

    processing_seed = {
        'version': SEED_CONFIG['version'],
        'generated_at': datetime.now().isoformat(),
        'processing_batches': processing_batches,
        'parameter_recommendations': parameter_recommendations,
        'duplicate_groups': scan_results.get('duplicate_groups', {}),
        'batch_summary': batch_summary
    }

    print(f"✅ Generated processing seed for {len(parameter_recommendations)} GIFs")
    return processing_seed

# Build the processing seed from the scan output
processing_seed = generate_processing_seed(scan_results)

print(f"\n📊 Processing Seed Summary:")
print(f"   • Total GIFs with recommendations: {len(processing_seed['parameter_recommendations'])}")
print(f"   • Processing batches: {len(processing_seed['processing_batches'])}")
print(f"   • Duplicate groups: {len(processing_seed['duplicate_groups'])}")

# Per-category breakdown, shown only when at least one batch exists
if processing_seed['processing_batches']:
    print(f"   • Batch distribution:")
    for category in processing_seed['processing_batches']:
        batch_info = processing_seed['processing_batches'][category]
        print(f"     - {category}: {batch_info['count']} GIFs (batch size: {batch_info['recommended_batch_size']})")


In [None]:
def generate_resume_state(csv_files: List[Path], scan_results: Dict[str, Any]) -> Dict[str, Any]:
    """Generate resume state from existing CSV files and scan results.

    Args:
        csv_files: Result CSVs to read. Each row is expected to provide
            'gif_sha', 'engine', 'lossy', 'frame_keep_ratio',
            'color_keep_count' and 'kilobytes'; 'timestamp' and 'ssim' are
            optional. Paths that do not exist are skipped.
        scan_results: Scan output; 'valid_gifs' and 'corrupted_gifs' are read.

    Returns:
        Dict with 'version', 'last_updated', 'processing_sessions',
        'completed_jobs' (per SHA and engine: completed and pending variants),
        'failed_jobs' (corrupted files keyed by relative path) and
        'progress_summary'.
    """
    print("🔧 Generating resume state seed file...")

    valid_gifs = scan_results.get('valid_gifs', [])
    corrupted_gifs = scan_results.get('corrupted_gifs', [])

    # Initialize resume state structure
    resume_state = {
        'version': SEED_CONFIG['version'],
        'last_updated': datetime.now().isoformat(),
        'processing_sessions': {},
        'completed_jobs': {},
        'failed_jobs': {},
        'progress_summary': {
            'total_gifs': len(valid_gifs),
            'completed_gifs': 0,
            'failed_gifs': len(corrupted_gifs),
            'pending_gifs': 0,
            'completion_percentage': 0.0
        }
    }

    # Add corrupted files to failed jobs
    for corrupted_gif in corrupted_gifs:
        resume_state['failed_jobs'][corrupted_gif['file_path']] = {
            'error': corrupted_gif['error'],
            'attempts': 1,
            'last_attempt': datetime.now().isoformat(),
            'moved_to_bad_gifs': False,
            'file_size_bytes': corrupted_gif.get('file_size_bytes', 0)
        }

    # Process existing CSV files to determine completion state
    all_completed_jobs = {}

    for csv_file in csv_files:
        if not csv_file.exists():
            continue
        # Best effort: a bad CSV is reported and the remaining files are still read.
        # Rows recorded before the failure are kept.
        try:
            print(f"📄 Processing CSV file: {csv_file}")
            df = pd.read_csv(csv_file)

            for row in df.to_dict('records'):
                engine_jobs = all_completed_jobs.setdefault(row['gif_sha'], {}).setdefault(
                    row['engine'],
                    {'completed_variants': [], 'pending_variants': []},
                )
                engine_jobs['completed_variants'].append({
                    'lossy': int(row['lossy']),
                    'frame_ratio': float(row['frame_keep_ratio']),
                    'colors': int(row['color_keep_count']),
                    'timestamp': row.get('timestamp', datetime.now().isoformat()),
                    'kilobytes': float(row['kilobytes']),
                    'ssim': float(row['ssim']) if pd.notna(row.get('ssim')) else None
                })

        except Exception as e:
            print(f"⚠️  Error processing CSV {csv_file}: {e}")

    # Calculate progress summary. A GIF counts as completed as soon as any CSV
    # row mentions it; this is computed before the pending-variant pass below
    # adds placeholder entries for every scanned GIF.
    total_gifs = len(valid_gifs)
    completed_gifs = len(all_completed_jobs)
    failed_gifs = len(corrupted_gifs)
    pending_gifs = max(0, total_gifs - completed_gifs)

    resume_state['progress_summary'] = {
        'total_gifs': total_gifs,
        'completed_gifs': completed_gifs,
        'failed_gifs': failed_gifs,
        'pending_gifs': pending_gifs,
        'completion_percentage': (completed_gifs / total_gifs * 100) if total_gifs > 0 else 0.0
    }

    # Generate pending variants for incomplete GIFs
    compression_config = CompressionConfig()

    # The parameter grid is the same for every GIF and engine, so build it once.
    # dict.fromkeys drops repeats while keeping the configured order.
    all_combinations = list(dict.fromkeys(
        (lossy, ratio, colors)
        for lossy in compression_config.LOSSY_LEVELS
        for ratio in compression_config.FRAME_KEEP_RATIOS
        for colors in compression_config.COLOR_KEEP_COUNTS
    ))

    for gif_info in valid_gifs:
        gif_jobs = all_completed_jobs.setdefault(gif_info['gif_sha'], {})

        # Check what variants are missing for each engine
        for engine in compression_config.ENGINES:
            engine_jobs = gif_jobs.setdefault(
                engine, {'completed_variants': [], 'pending_variants': []}
            )

            completed_combinations = {
                (variant['lossy'], variant['frame_ratio'], variant['colors'])
                for variant in engine_jobs['completed_variants']
            }

            engine_jobs['pending_variants'] = [
                {'lossy': lossy, 'frame_ratio': ratio, 'colors': colors}
                for lossy, ratio, colors in all_combinations
                if (lossy, ratio, colors) not in completed_combinations
            ]

    resume_state['completed_jobs'] = all_completed_jobs

    print(f"✅ Generated resume state for {total_gifs} GIFs")
    return resume_state


# Find existing CSV files (sorted so they are always read in the same order)
csv_files = sorted(csv_dir.glob("*.csv")) if csv_dir.exists() else []
print(f"📄 Found {len(csv_files)} existing CSV files")

# Generate resume state
resume_seed = generate_resume_state(csv_files, scan_results)

print(f"\n📊 Resume State Summary:")
print(f"   • Total GIFs: {resume_seed['progress_summary']['total_gifs']}")
print(f"   • Completed GIFs: {resume_seed['progress_summary']['completed_gifs']}")
print(f"   • Failed GIFs: {resume_seed['progress_summary']['failed_gifs']}")
print(f"   • Pending GIFs: {resume_seed['progress_summary']['pending_gifs']}")
print(f"   • Completion: {resume_seed['progress_summary']['completion_percentage']:.1f}%")


In [None]:
def _validate_single_seed(seed_name: str, seed_data: Dict[str, Any]) -> Dict[str, Any]:
    """Run the per-file checks for one seed and return its validation record."""
    file_validation = {
        'valid': True,
        'errors': [],
        'warnings': [],
        'stats': {}
    }

    def _fail(message: str) -> None:
        file_validation['errors'].append(message)
        file_validation['valid'] = False

    try:
        # Check required fields. The resume seed is stamped with
        # 'last_updated'; the other seeds carry 'generated_at'.
        timestamp_field = 'last_updated' if seed_name == 'resume' else 'generated_at'
        for field in ('version', timestamp_field):
            if field not in seed_data:
                _fail(f"Missing required field: {field}")

        # Validate version consistency
        if 'version' in seed_data and seed_data['version'] != SEED_CONFIG['version']:
            file_validation['warnings'].append(
                f"Version mismatch: {seed_data['version']} != {SEED_CONFIG['version']}"
            )

        # Seed-specific validations
        if seed_name == 'metadata' and 'gif_metadata' in seed_data:
            gif_count = len(seed_data['gif_metadata'])
            file_validation['stats']['gif_count'] = gif_count

            if gif_count == 0:
                file_validation['warnings'].append("No GIF metadata found")
            else:
                # Spot-check the first entry for the required metadata fields
                sample_gif = next(iter(seed_data['gif_metadata'].values()))
                for field in ('orig_filename', 'file_path', 'orig_kilobytes'):
                    if field not in sample_gif:
                        _fail(f"Missing GIF field: {field}")

        elif seed_name == 'processing' and 'parameter_recommendations' in seed_data:
            rec_count = len(seed_data['parameter_recommendations'])
            file_validation['stats']['recommendation_count'] = rec_count

            if rec_count == 0:
                file_validation['warnings'].append("No parameter recommendations found")

        elif seed_name == 'resume' and 'progress_summary' in seed_data:
            progress = seed_data['progress_summary']
            file_validation['stats']['completion_percentage'] = progress.get('completion_percentage', 0)

            # Check progress consistency. total_gifs counts valid GIFs only;
            # failed_gifs counts corrupted files, which are tracked outside
            # that total, so it is not part of the sum.
            total = progress.get('total_gifs', 0)
            completed = progress.get('completed_gifs', 0)
            pending = progress.get('pending_gifs', 0)

            if total != completed + pending:
                _fail("Progress counts don't add up")

    except Exception as e:
        # A malformed seed (e.g. not a mapping) is reported rather than raised
        _fail(f"Validation error: {str(e)}")

    return file_validation


def validate_seed_data(seed_files: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
    """Validate integrity and consistency of all seed files.

    Args:
        seed_files: Seeds keyed by 'metadata', 'processing' and/or 'resume'.

    Returns:
        Dict with 'overall_valid', 'validation_timestamp', 'file_validations'
        (per seed: 'valid', 'errors', 'warnings', 'stats'), 'cross_validation'
        (GIF-set comparison between the seeds) and 'recommendations'.
        'overall_valid' reflects the per-file checks only; a GIF-set mismatch
        between seeds adds a recommendation but does not clear it.
    """
    print("🔍 Validating seed data integrity...")

    validation_results = {
        'overall_valid': True,
        'validation_timestamp': datetime.now().isoformat(),
        'file_validations': {},
        'cross_validation': {},
        'recommendations': []
    }

    # Validate each seed file individually
    for seed_name, seed_data in seed_files.items():
        print(f"   Validating {seed_name}...")

        file_validation = _validate_single_seed(seed_name, seed_data)
        validation_results['file_validations'][seed_name] = file_validation

        if not file_validation['valid']:
            validation_results['overall_valid'] = False

    # Cross-validation between seed files
    print("   Performing cross-validation...")

    try:
        def _gif_keys(seed_name: str, section: str) -> set:
            # SHAs listed under `section` of the named seed, or an empty set if absent
            if seed_name in seed_files and section in seed_files[seed_name]:
                return set(seed_files[seed_name][section].keys())
            return set()

        metadata_gifs = _gif_keys('metadata', 'gif_metadata')
        processing_gifs = _gif_keys('processing', 'parameter_recommendations')
        resume_gifs = _gif_keys('resume', 'completed_jobs')

        # Check consistency between files (mismatch lists sorted for stable output)
        cross_validation = {
            'metadata_processing_match': metadata_gifs == processing_gifs,
            'metadata_count': len(metadata_gifs),
            'processing_count': len(processing_gifs),
            'resume_count': len(resume_gifs),
            'missing_in_processing': sorted(metadata_gifs - processing_gifs),
            'missing_in_metadata': sorted(processing_gifs - metadata_gifs)
        }

        validation_results['cross_validation'] = cross_validation

        if not cross_validation['metadata_processing_match']:
            validation_results['recommendations'].append("Metadata and processing seed files have different GIF sets")

    except Exception as e:
        validation_results['cross_validation']['error'] = str(e)

    # Generate recommendations
    if validation_results['overall_valid']:
        validation_results['recommendations'].append("All seed files are valid and consistent")
    else:
        validation_results['recommendations'].append("Fix validation errors before using seed files")

    return validation_results

# Collect whichever seeds the earlier cells managed to build
seed_files_for_validation = {}
for seed_key, variable_name in (
    ('metadata', 'metadata_seed'),
    ('processing', 'processing_seed'),
    ('resume', 'resume_seed'),
):
    built_seed = locals().get(variable_name)
    if built_seed:
        seed_files_for_validation[seed_key] = built_seed

# Validate and report, or say why nothing was validated
if not seed_files_for_validation:
    print("⚠️  No seed files available for validation")
else:
    validation_results = validate_seed_data(seed_files_for_validation)

    print(f"\n📋 Validation Results:")
    print(f"   • Overall valid: {validation_results['overall_valid']}")

    # Per-seed verdict followed by its errors and warnings
    for seed_name, file_val in validation_results['file_validations'].items():
        status = "❌" if not file_val['valid'] else "✅"
        print(f"   • {seed_name}: {status}")

        for error in file_val['errors']:
            print(f"     - Error: {error}")

        for warning in file_val['warnings']:
            print(f"     - Warning: {warning}")

    # Cross-validation results (skipped when the section is empty)
    if validation_results['cross_validation']:
        cv = validation_results['cross_validation']
        print(f"\n🔗 Cross-validation:")
        for label, key, fallback in (
            ('Metadata-Processing match', 'metadata_processing_match', 'Unknown'),
            ('Metadata GIFs', 'metadata_count', 0),
            ('Processing GIFs', 'processing_count', 0),
            ('Resume GIFs', 'resume_count', 0),
        ):
            print(f"   • {label}: {cv.get(key, fallback)}")

    # Recommendations
    print(f"\n💡 Recommendations:")
    for rec in validation_results['recommendations']:
        print(f"   • {rec}")


In [None]:
def save_seed_files(seed_files: Dict[str, Dict[str, Any]], seed_dir: Path, backup_existing: bool = True) -> Dict[str, Path]:
    """Save seed dictionaries as JSON files in the seed directory.

    Each seed is first serialized to a temporary sibling file and only moved
    onto its final name once serialization has succeeded. A failed save
    therefore never leaves a truncated seed file behind and never displaces
    the previous (good) file into a backup.

    Args:
        seed_files: Mapping of seed type ('metadata', 'processing', 'resume')
            to the seed data to serialize. Unknown seed types are reported
            and skipped.
        seed_dir: Directory the seed files are written to.
        backup_existing: When True, an existing seed file is renamed to
            ``<stem>_backup_<YYYYmmdd_HHMMSS>.json`` before being replaced.

    Returns:
        Mapping of seed type to the path of every file that was saved.
        Seed types whose save failed are reported on stdout and omitted.
    """

    print("💾 Saving seed files...")

    saved_files = {}

    # Define seed file mappings
    seed_file_names = {
        'metadata': 'lookup_seed_metadata.json',
        'processing': 'lookup_seed_processing.json',
        'resume': 'lookup_seed_resume.json'
    }

    for seed_type, seed_data in seed_files.items():
        if seed_type not in seed_file_names:
            print(f"⚠️  Unknown seed type: {seed_type}")
            continue

        file_name = seed_file_names[seed_type]
        file_path = seed_dir / file_name
        temp_path = seed_dir / f"{file_name}.tmp"

        try:
            # Serialize to a temporary file first: json.dump can fail midway
            # (e.g. non-string dict keys), and that must not touch file_path.
            with open(temp_path, 'w', encoding='utf-8') as f:
                json.dump(seed_data, f, indent=2, ensure_ascii=False, default=str)

            # Backup existing file if requested (only now that the new
            # content is known to be complete).
            if backup_existing and file_path.exists():
                backup_path = seed_dir / f"{file_path.stem}_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
                file_path.rename(backup_path)
                print(f"   📦 Backed up existing {file_name} to {backup_path.name}")

            # Move the finished file into place, overwriting any old file.
            temp_path.replace(file_path)

            saved_files[seed_type] = file_path
            file_size_mb = file_path.stat().st_size / (1024 * 1024)
            print(f"   ✅ Saved {file_name} ({file_size_mb:.2f} MB)")

        except Exception as e:
            # Deliberate best-effort boundary: one failing seed must not
            # prevent the remaining seeds from being saved.
            print(f"   ❌ Error saving {file_name}: {e}")
            try:
                temp_path.unlink()
            except OSError:
                # Temp file was never created or is already gone.
                pass

    return saved_files

def generate_integration_summary(saved_files: Dict[str, Path], validation_results: Dict[str, Any]) -> Dict[str, Any]:
    """Build the hand-off report describing the seed files that were written.

    Args:
        saved_files: Mapping of seed type to the path of its saved file.
            Every path is stat()-ed, so it must exist.
        validation_results: Validation output; only its 'overall_valid'
            entry is read (treated as False when absent).

    Returns:
        Dictionary with the generation timestamp, per-file size/mtime
        details, the readiness flag, static usage instructions and a list
        of next steps chosen by the readiness flag.
    """
    created_at = datetime.now().isoformat()
    is_valid = validation_results.get('overall_valid', False)

    def _describe(path):
        # Size and modification time of one saved seed file.
        info = path.stat()
        return {
            'file_path': str(path),
            'file_size_bytes': info.st_size,
            'file_size_mb': info.st_size / (1024 * 1024),
            'last_modified': datetime.fromtimestamp(info.st_mtime).isoformat(),
        }

    # Follow-up actions differ depending on whether validation passed.
    if is_valid:
        follow_ups = [
            "Integrate seed files with compression pipeline",
            "Test resume functionality with sample GIFs",
            "Monitor processing performance with complexity-based batching",
            "Update seed files as new GIFs are processed",
        ]
    else:
        follow_ups = [
            "Fix validation errors in seed files",
            "Re-run seed generation after corrections",
            "Verify data consistency before pipeline integration",
        ]

    return {
        'generation_timestamp': created_at,
        'seed_files': {kind: _describe(path) for kind, path in saved_files.items()},
        'integration_ready': is_valid,
        'usage_instructions': {
            'metadata_seed': "Use for GIF metadata lookup, deduplication, and complexity classification",
            'processing_seed': "Use for parameter recommendations and batch optimization",
            'resume_seed': "Use for tracking processing progress and resume functionality",
            'pipeline_integration': "Import seed files in pipeline.py for efficient processing",
        },
        'next_steps': follow_ups,
    }

# Prepare seed files for saving: pick up whichever seed dictionaries earlier
# cells defined (and left non-empty) in this notebook session.
seed_files_to_save = {}

if 'metadata_seed' in locals() and metadata_seed:
    seed_files_to_save['metadata'] = metadata_seed

if 'processing_seed' in locals() and processing_seed:
    seed_files_to_save['processing'] = processing_seed

if 'resume_seed' in locals() and resume_seed:
    seed_files_to_save['resume'] = resume_seed

# Save seed files
if seed_files_to_save:
    saved_files = save_seed_files(seed_files_to_save, seed_dir, backup_existing=SEED_CONFIG['backup_existing'])

    # Generate integration summary. When the validation cell was not run,
    # an empty result is passed, which marks the seeds as not integration-ready.
    integration_summary = generate_integration_summary(saved_files, validation_results if 'validation_results' in locals() else {})

    # Save integration summary
    summary_path = seed_dir / 'seed_generation_summary.json'
    with open(summary_path, 'w', encoding='utf-8') as f:
        json.dump(integration_summary, f, indent=2, ensure_ascii=False, default=str)

    print("\n📊 Integration Summary:")
    print(f"   • Seed files generated: {len(saved_files)}")
    print(f"   • Integration ready: {integration_summary['integration_ready']}")
    print(f"   • Summary saved to: {summary_path}")

    # Display file details
    print("\n📁 Generated Files:")
    for seed_type, file_info in integration_summary['seed_files'].items():
        print(f"   • {seed_type}: {file_info['file_path']} ({file_info['file_size_mb']:.2f} MB)")

    # Next steps
    print("\n🚀 Next Steps:")
    for step in integration_summary['next_steps']:
        print(f"   • {step}")

    # Closing status: only claim success and readiness when they actually hold.
    # save_seed_files omits every seed it failed to write, so a shorter result
    # means at least one save failed.
    all_saved = len(saved_files) == len(seed_files_to_save)
    if all_saved:
        print("\n✅ Seed JSON generation complete!")
        print(f"📂 All files saved to: {seed_dir}")
    else:
        print(f"\n⚠️  Seed JSON generation incomplete: saved {len(saved_files)} of {len(seed_files_to_save)} seed files to {seed_dir}")

    if all_saved and integration_summary['integration_ready']:
        print("🔗 Ready for integration with GifLab pipeline")
    else:
        print("⚠️  Not ready for integration yet - resolve the issues reported above")

else:
    print("⚠️  No seed files to save")


In [None]:
import json
from pathlib import Path
from datetime import datetime

# GifLab imports  
import sys
sys.path.append('../src')
from giflab import meta, io


In [None]:
# TODO: Implement seed JSON building
# NOTE(review): this looks like a leftover placeholder cell - earlier cells in
# this notebook already validate and save the seed files, so the message below
# contradicts their output. Confirm and remove this cell.
print("Seed JSON builder notebook - to be implemented in Stage 10")
