In [1]:
import os
import json
import shutil
from datetime import datetime
import cv2
import numpy as np

class DatasetManager:
    def __init__(self, dataset_dir='large_dataset'):
        self.dataset_dir = dataset_dir
        self.expressions = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
    
    def get_dataset_stats(self):
        """Get comprehensive dataset statistics"""
        print("üìä DATASET STATISTICS")
        print("=" * 50)
        
        total_images = 0
        expression_stats = {}
        
        for expression in self.expressions:
            expr_path = os.path.join(self.dataset_dir, expression)
            if os.path.exists(expr_path):
                images = [f for f in os.listdir(expr_path) 
                         if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
                count = len(images)
                expression_stats[expression] = count
                total_images += count
                
                status = "‚úÖ" if count >= 5000 else "‚ö†Ô∏è "
                print(f"   {status} {expression:12}: {count:6,} images")
            else:
                expression_stats[expression] = 0
                print(f"   ‚ùå {expression:12}: 0 images")
        
        print(f"\nüéØ Total images: {total_images:,}")
        print(f"üìà Average per expression: {total_images/7:,.0f}")
        
        # Quality analysis
        self.analyze_quality()
        
        return expression_stats
    
    def analyze_quality(self):
        """Analyze image quality and dimensions"""
        print("\nüîç QUALITY ANALYSIS")
        
        dimensions = {}
        quality_issues = 0
        
        for expression in self.expressions[:1]:  # Sample first expression
            expr_path = os.path.join(self.dataset_dir, expression)
            if not os.path.exists(expr_path):
                continue
                
            images = [f for f in os.listdir(expr_path) 
                     if f.lower().endswith(('.jpg', '.jpeg', '.png'))][:10]  # Sample 10 images
            
            for img_file in images:
                img_path = os.path.join(expr_path, img_file)
                try:
                    img = cv2.imread(img_path)
                    if img is not None:
                        h, w = img.shape[:2]
                        dim_key = f"{w}x{h}"
                        dimensions[dim_key] = dimensions.get(dim_key, 0) + 1
                        
                        # Check for very dark or very bright images
                        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                        avg_brightness = np.mean(gray)
                        if avg_brightness < 30 or avg_brightness > 220:
                            quality_issues += 1
                except:
                    quality_issues += 1
        
        print("   Dimensions found:", dict(list(dimensions.items())[:3]))
        if quality_issues > 0:
            print(f"   ‚ö†Ô∏è  Potential quality issues: {quality_issues} images")
    
    def cleanup_dataset(self):
        """Remove corrupt or invalid images"""
        print("\nüßπ CLEANING DATASET")
        
        removed_count = 0
        
        for expression in self.expressions:
            expr_path = os.path.join(self.dataset_dir, expression)
            if not os.path.exists(expr_path):
                continue
                
            images = [f for f in os.listdir(expr_path) 
                     if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
            
            for img_file in images:
                img_path = os.path.join(expr_path, img_file)
                try:
                    img = cv2.imread(img_path)
                    if img is None:
                        os.remove(img_path)
                        removed_count += 1
                        print(f"   Removed corrupt: {img_file}")
                except:
                    os.remove(img_path)
                    removed_count += 1
        
        print(f"‚úÖ Removed {removed_count} corrupt images")
    
    def create_dataset_report(self):
        """Create a comprehensive dataset report"""
        stats = self.get_dataset_stats()
        
        report = {
            'generated_date': datetime.now().isoformat(),
            'dataset_location': self.dataset_dir,
            'total_images': sum(stats.values()),
            'expression_stats': stats,
            'recommendations': self.get_recommendations(stats)
        }
        
        # Save report
        with open('dataset_report.json', 'w') as f:
            json.dump(report, f, indent=2)
        
        print(f"\nüìÑ Dataset report saved to: dataset_report.json")
        return report
    
    def get_recommendations(self, stats):
        """Get recommendations based on dataset stats"""
        recommendations = []
        total = sum(stats.values())
        
        if total < 1000:
            recommendations.append("üö® Dataset is very small. Aim for at least 1000 images per expression.")
        elif total < 10000:
            recommendations.append("üí° Consider collecting more data for better model performance.")
        
        # Check balance
        avg = total / 7
        for expr, count in stats.items():
            if count < avg * 0.5:
                recommendations.append(f"‚ö†Ô∏è  {expr} has significantly fewer images than average")
        
        if not recommendations:
            recommendations.append("‚úÖ Dataset looks well-balanced and sufficient for training.")
        
        return recommendations

def main():
    print("üé≠ DATASET MANAGEMENT TOOL")
    print("=" * 50)
    
    manager = DatasetManager()
    
    while True:
        print("\nüéÆ OPTIONS:")
        print("1. üìä Show dataset statistics")
        print("2. üßπ Cleanup corrupt images")
        print("3. üìÑ Generate dataset report")
        print("4. üöÄ Start data collection")
        print("0. ‚ùå Exit")
        
        choice = input("\nEnter your choice: ").strip()
        
        if choice == '1':
            manager.get_dataset_stats()
        elif choice == '2':
            manager.cleanup_dataset()
        elif choice == '3':
            manager.create_dataset_report()
        elif choice == '4':
            print("üöÄ Launching data collection...")
            from batch_dataset_collector import AdvancedDatasetCollector
            collector = AdvancedDatasetCollector(target_images=5000)
            collector.capture_dataset()
        elif choice == '0':
            print("üëã Goodbye!")
            break
        else:
            print("‚ùå Invalid choice")

if __name__ == "__main__":
    main()

üé≠ DATASET MANAGEMENT TOOL

üéÆ OPTIONS:
1. üìä Show dataset statistics
2. üßπ Cleanup corrupt images
3. üìÑ Generate dataset report
4. üöÄ Start data collection
0. ‚ùå Exit

Enter your choice: 1
üìä DATASET STATISTICS
   ‚ö†Ô∏è  angry       :      0 images
   ‚ö†Ô∏è  disgust     :      0 images
   ‚ö†Ô∏è  fear        :      0 images
   ‚ö†Ô∏è  happy       :      0 images
   ‚ö†Ô∏è  neutral     :      0 images
   ‚ö†Ô∏è  sad         :      0 images
   ‚ö†Ô∏è  surprise    :      0 images

üéØ Total images: 0
üìà Average per expression: 0

üîç QUALITY ANALYSIS
   Dimensions found: {}

üéÆ OPTIONS:
1. üìä Show dataset statistics
2. üßπ Cleanup corrupt images
3. üìÑ Generate dataset report
4. üöÄ Start data collection
0. ‚ùå Exit

Enter your choice: 2

üßπ CLEANING DATASET
‚úÖ Removed 0 corrupt images

üéÆ OPTIONS:
1. üìä Show dataset statistics
2. üßπ Cleanup corrupt images
3. üìÑ Generate dataset report
4. üöÄ Start data collection
0. ‚ùå Exit

Enter your choice: 3


ModuleNotFoundError: No module named 'batch_dataset_collector'