In [3]:
import cv2
import os
import numpy as np
import time
import threading
from datetime import datetime
import json

class AdvancedDatasetCollector:
    def __init__(self, target_images=5000):
        """Initialise collector state, output directories and Ctrl+C handling.

        Args:
            target_images: Number of images to collect for each expression.

        Raises:
            RuntimeError: If the frontal-face Haar cascade cannot be loaded.
        """
        # Keyboard key -> expression label; the labels are also used as
        # sub-directory names and as keys of ``self.counters``.
        self.expressions = {
            '1': 'angry',
            '2': 'disgust',
            '3': 'fear',
            '4': 'happy',
            '5': 'neutral',
            '6': 'sad',
            '7': 'surprise'
        }
        self.dataset_dir = 'large_dataset'
        self.target_images = target_images

        cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        self.face_cascade = cv2.CascadeClassifier(cascade_path)
        # CascadeClassifier does not raise for a missing/corrupt file; it just
        # stays empty, so fail here with a clear message instead of per frame.
        if self.face_cascade.empty():
            raise RuntimeError(f"Could not load Haar cascade: {cascade_path}")

        self.current_expression = 'neutral'
        self.counters = {expr: 0 for expr in self.expressions.values()}
        self.cap = None
        self.running = True
        self.auto_capture = False
        self.capture_delay = 0.5  # Delay between auto captures
        self.last_capture_time = 0
        self.session_stats = {
            'start_time': None,
            'total_captured': 0,
            'session_duration': 0
        }

        # Create dataset directories
        self.setup_directories()
        self.load_progress()

        # Setup signal handler
        import signal
        try:
            signal.signal(signal.SIGINT, self.signal_handler)
        except ValueError:
            # signal.signal() may only be called from the main thread; when
            # constructed elsewhere, keep the interpreter's existing handler.
            pass
    
    def setup_directories(self):
        """Create directory structure for large dataset"""
        print("üìÅ Setting up directory structure for large dataset...")
        
        directories = [self.dataset_dir, 'models', 'progress']
        
        for directory in directories:
            if not os.path.exists(directory):
                os.makedirs(directory)
                print(f"‚úÖ Created: {directory}/")
        
        # Create expression subdirectories
        for expression in self.expressions.values():
            expr_path = f'{self.dataset_dir}/{expression}'
            if not os.path.exists(expr_path):
                os.makedirs(expr_path)
        
        print(f"üéØ Target: {self.target_images} images per expression")
        print(f"üìä Total target: {self.target_images * 7:,} images")
    
    def load_progress(self):
        """Load progress from previous sessions"""
        progress_file = 'progress/dataset_progress.json'
        if os.path.exists(progress_file):
            try:
                with open(progress_file, 'r') as f:
                    progress = json.load(f)
                    self.counters = progress.get('counters', self.counters)
                    print("üìà Loaded previous progress")
            except:
                print("‚ùå Could not load progress file")
        
        self.print_progress()
    
    def save_progress(self):
        """Save current progress"""
        progress_file = 'progress/dataset_progress.json'
        try:
            with open(progress_file, 'w') as f:
                json.dump({
                    'counters': self.counters,
                    'last_update': datetime.now().isoformat()
                }, f, indent=2)
        except:
            print("‚ùå Could not save progress")
    
    def print_progress(self):
        """Print current collection progress"""
        print("\nüìä CURRENT PROGRESS:")
        total_captured = sum(self.counters.values())
        total_target = self.target_images * 7
        
        for expr in self.expressions.values():
            percent = (self.counters[expr] / self.target_images) * 100
            status = "‚úÖ" if self.counters[expr] >= self.target_images else "üìù"
            print(f"   {status} {expr:12}: {self.counters[expr]:5d}/{self.target_images} ({percent:5.1f}%)")
        
        overall_percent = (total_captured / total_target) * 100
        print(f"\nüéØ Overall: {total_captured:,}/{total_target:,} ({overall_percent:.1f}%)")
    
    def signal_handler(self, sig, frame):
        """Handle Ctrl+C gracefully"""
        print("\n\nüõë Received interrupt signal. Shutting down gracefully...")
        self.running = False
    
    def initialize_camera(self):
        """Open a capture device and apply the preferred camera settings.

        Device index 0 is tried first; if it does not open, indices 1 to 3
        are tried in order. Returns True once a device is open and the
        settings have been requested, False if no device opens or any
        exception occurs along the way.
        """
        try:
            self.cap = cv2.VideoCapture(0)
            if not self.cap.isOpened():
                opened_at = None
                for index in (1, 2, 3):
                    self.cap = cv2.VideoCapture(index)
                    if self.cap.isOpened():
                        opened_at = index
                        break
                if opened_at is None:
                    print("‚ùå No camera found!")
                    return False
                print(f"‚úÖ Camera found at index {opened_at}")

            # Requested capture settings (1280x720 @ 30 FPS, autofocus on).
            requested = (
                (cv2.CAP_PROP_FRAME_WIDTH, 1280),
                (cv2.CAP_PROP_FRAME_HEIGHT, 720),
                (cv2.CAP_PROP_FPS, 30),
                (cv2.CAP_PROP_AUTOFOCUS, 1),
                (cv2.CAP_PROP_BRIGHTNESS, 0.5),
            )
            for prop, value in requested:
                self.cap.set(prop, value)

            return True

        except Exception as e:
            print(f"‚ùå Camera initialization error: {e}")
            return False
    
    def enhance_image_quality(self, image):
        """Return a histogram-equalised, sharpened version of ``image``.

        A 2-D (single-channel) image is equalised directly. Any other input
        is converted BGR -> YUV, only the Y channel is equalised, and the
        result is converted back to BGR. A 3x3 sharpening kernel is applied
        last in both cases.
        """
        sharpen_kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])

        if len(image.shape) != 2:
            # Colour: equalise luminance only, leaving the chroma untouched.
            yuv = cv2.cvtColor(image, cv2.COLOR_BGR2YUV)
            yuv[:, :, 0] = cv2.equalizeHist(yuv[:, :, 0])
            equalised = cv2.cvtColor(yuv, cv2.COLOR_YUV2BGR)
        else:
            equalised = cv2.equalizeHist(image)

        return cv2.filter2D(equalised, -1, sharpen_kernel)
    
    def capture_dataset(self):
        """Run the interactive capture loop until the user quits.

        Opens the camera, shows a live preview with detected faces and an
        info overlay, and saves face crops either on SPACE or automatically
        while auto-capture is enabled. Progress is auto-saved every 30
        seconds; on exit the camera is released and the final statistics are
        printed. Returns immediately if no camera can be opened.

        Auto-capture is driven by ``self.auto_capture``, the same flag the
        ``A`` key toggles in ``handle_keyboard_input``.
        """
        if not self.initialize_camera():
            return

        self.session_stats['start_time'] = datetime.now()

        print("üöÄ ADVANCED DATASET COLLECTOR")
        print("=" * 60)
        print(f"üéØ Target: {self.target_images:,} images per expression")
        print(f"üíæ Location: {self.dataset_dir}/")
        print(f"üìä Current: {self.current_expression}")

        print("\nüéÆ ADVANCED CONTROLS:")
        print("   1-7      - Change expression")
        print("   SPACE    - Manual capture")
        print("   A        - Toggle auto-capture")
        print("   + / -    - Adjust auto-capture delay")
        print("   S        - Save progress")
        print("   P        - Show progress")
        print("   Q        - Quit")
        print("   Ctrl+C   - Emergency quit")

        print("\nüí° TIPS:")
        print("   - Use AUTO mode for bulk capture")
        print("   - Vary lighting, angles, and distances")
        print("   - Take breaks between expressions")
        print("   - Save progress regularly")

        last_progress_save = time.time()

        try:
            while self.running:
                ret, frame = self.cap.read()
                if not ret:
                    print("‚ùå Failed to grab frame")
                    break

                # Enhance frame quality
                frame = self.enhance_image_quality(frame)

                # Grayscale copy is used for detection and is what gets
                # saved, so the overlay drawn on ``frame`` never reaches disk.
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

                faces = self.face_cascade.detectMultiScale(
                    gray,
                    scaleFactor=1.1,
                    minNeighbors=5,
                    minSize=(100, 100),
                    flags=cv2.CASCADE_SCALE_IMAGE
                )

                # Process faces
                for (x, y, w, h) in faces:
                    # Draw enhanced bounding box
                    cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 3)

                    # Rough guide marks placed at fixed fractions of the box
                    cv2.circle(frame, (x + w//2, y + h//3), 5, (255, 0, 0), -1)  # Nose
                    cv2.circle(frame, (x + w//3, y + h//3), 5, (255, 0, 0), -1)  # Left eye
                    cv2.circle(frame, (x + 2*w//3, y + h//3), 5, (255, 0, 0), -1)  # Right eye
                    cv2.ellipse(frame, (x + w//2, y + 2*h//3), (w//4, h//6), 0, 0, 360, (255, 0, 0), 2)  # Mouth

                # Auto-capture logic
                current_time = time.time()
                if self.auto_capture and len(faces) > 0:
                    if current_time - self.last_capture_time >= self.capture_delay:
                        if self.counters[self.current_expression] < self.target_images:
                            self.save_face_image(gray, faces[0])
                            self.last_capture_time = current_time
                        else:
                            print(f"‚úÖ Target reached for {self.current_expression}")
                            # Switch off so this message is printed once.
                            self.auto_capture = False

                # Display information overlay
                self.draw_info_overlay(frame, self.auto_capture, len(faces))

                # Display frame
                cv2.imshow('Advanced Dataset Collector - 5000 Images Target', frame)

                # Handle keyboard input
                key = cv2.waitKey(1) & 0xFF
                if not self.handle_keyboard_input(key, faces, gray):
                    break

                # Auto-save progress every 30 seconds
                if current_time - last_progress_save > 30:
                    self.save_progress()
                    last_progress_save = current_time

        except Exception as e:
            print(f"‚ùå Error during capture: {e}")

        finally:
            self.cleanup()
            self.print_final_stats()
    
    def draw_info_overlay(self, frame, auto_capture_active, face_count):
        """Draw the status text, progress bar and key help onto ``frame``.

        Args:
            frame: Image to draw on; it is modified in place.
            auto_capture_active: Whether auto-capture is shown as ON.
            face_count: Number of faces detected in this frame.

        The percentage label shows the real ratio, while the bar fill is
        clamped to the bar's width so counters above the target cannot draw
        past it. A non-positive target is shown as 100%.
        """
        captured = self.counters[self.current_expression]
        target = self.target_images

        # Main info
        y_offset = 30
        info_lines = [
            f"Expression: {self.current_expression}",
            f"Progress: {captured}/{target}",
            f"Auto-Capture: {'ON' if auto_capture_active else 'OFF'}",
            f"Faces: {face_count}",
            f"Delay: {self.capture_delay:.1f}s"
        ]

        for i, line in enumerate(info_lines):
            # First line (current expression) is highlighted in green.
            color = (0, 255, 0) if i == 0 else (255, 255, 255)
            cv2.putText(frame, line, (10, y_offset + i*25),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

        # Progress bar
        progress = captured / target if target > 0 else 1.0
        fill_ratio = min(max(progress, 0.0), 1.0)
        bar_width = 400
        bar_height = 20
        bar_x, bar_y = 10, 150

        cv2.rectangle(frame, (bar_x, bar_y), (bar_x + bar_width, bar_y + bar_height), (50, 50, 50), -1)
        cv2.rectangle(frame, (bar_x, bar_y), (bar_x + int(bar_width * fill_ratio), bar_y + bar_height), (0, 255, 0), -1)
        cv2.putText(frame, f"{progress*100:.1f}%", (bar_x + bar_width + 10, bar_y + 15),
                   cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

        # Controls help, anchored 100 px above the bottom edge
        controls_y = frame.shape[0] - 100
        controls = [
            "1-7: Change Exp | SPACE: Manual | A: Auto",
            "+/-: Delay | S: Save | P: Progress | Q: Quit"
        ]

        for i, control in enumerate(controls):
            cv2.putText(frame, control, (10, controls_y + i*20),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
    
    def handle_keyboard_input(self, key, faces, gray_frame):
        """Handle all keyboard inputs"""
        if key == ord('q'):
            return False
        
        elif key in [ord(str(i)) for i in range(1, 8)]:
            self.current_expression = self.expressions[chr(key)]
            print(f"üìù Changed expression to: {self.current_expression}")
        
        elif key == ord(' '):  # Space - manual capture
            if len(faces) > 0 and self.counters[self.current_expression] < self.target_images:
                self.save_face_image(gray_frame, faces[0])
            else:
                print("‚ùå No face detected or target reached!")
        
        elif key == ord('a'):  # Toggle auto-capture
            self.auto_capture = not self.auto_capture
            status = "ON" if self.auto_capture else "OFF"
            print(f"ü§ñ Auto-capture: {status}")
            self.last_capture_time = time.time()
        
        elif key == ord('+'):  # Increase delay
            self.capture_delay = min(5.0, self.capture_delay + 0.1)
            print(f"‚è±Ô∏è  Capture delay: {self.capture_delay:.1f}s")
        
        elif key == ord('-'):  # Decrease delay
            self.capture_delay = max(0.1, self.capture_delay - 0.1)
            print(f"‚è±Ô∏è  Capture delay: {self.capture_delay:.1f}s")
        
        elif key == ord('s'):  # Save progress
            self.save_progress()
            print("üíæ Progress saved!")
        
        elif key == ord('p'):  # Show progress
            self.print_progress()
        
        return True
    
    def save_face_image(self, gray_frame, face_rect):
        """Crop one detected face and write it at three resolutions.

        The face box is padded by a third of its shorter side (clipped to the
        frame), resized to 48x48, 96x96 and 64x64, histogram-equalised and
        written as JPEG into the current expression's directory. The 48x48
        file gets the plain name; the others carry a ``_WxH`` suffix. The
        expression and session counters are bumped only after all three
        writes succeed. Boxes smaller than 100 px on either side are skipped.
        Any exception is reported on stdout and swallowed.
        """
        try:
            left, top, box_w, box_h = face_rect

            # Quality gate: reject faces that are too small.
            if box_w < 100 or box_h < 100:
                print("‚ö†Ô∏è  Face too small, skipping...")
                return

            # Pad the box, keeping it inside the frame.
            pad = min(box_w, box_h) // 3
            left = max(0, left - pad)
            top = max(0, top - pad)
            box_w = min(gray_frame.shape[1] - left, box_w + 2 * pad)
            box_h = min(gray_frame.shape[0] - top, box_h + 2 * pad)

            crop = gray_frame[top:top + box_h, left:left + box_w]

            # 48x48 first (plain file name), then the two larger variants.
            sizes = ((48, 48), (96, 96), (64, 64))
            label = self.current_expression

            for position, (width, height) in enumerate(sizes):
                sample = cv2.equalizeHist(cv2.resize(crop, (width, height)))

                suffix = "" if position == 0 else f"_{width}x{height}"
                filename = (
                    f"{self.dataset_dir}/{label}/"
                    f"{label}_{self.counters[label]:06d}{suffix}.jpg"
                )

                if not cv2.imwrite(filename, sample):
                    print(f"‚ùå Failed to save: {filename}")
                    return

            self.counters[label] += 1
            self.session_stats['total_captured'] += 1

            # Report every 100th image of the current expression.
            done = self.counters[label]
            if done % 100 == 0:
                percent = (done / self.target_images) * 100
                print(f"üìà {label}: {done}/{self.target_images} ({percent:.1f}%)")

        except Exception as e:
            print(f"‚ùå Error saving image: {e}")
    
    def print_final_stats(self):
        """Print final collection statistics"""
        print("\n" + "=" * 60)
        print("üìä DATASET COLLECTION COMPLETED")
        print("=" * 60)
        
        total_captured = sum(self.counters.values())
        total_target = self.target_images * 7
        
        print(f"üéØ Target per expression: {self.target_images:,}")
        print(f"üìà Total captured: {total_captured:,}/{total_target:,}")
        print(f"üìÖ Session duration: {self.get_session_duration()}")
        
        print("\nüìã Expression Breakdown:")
        for expr in self.expressions.values():
            percent = (self.counters[expr] / self.target_images) * 100
            status = "‚úÖ COMPLETE" if self.counters[expr] >= self.target_images else "üìù IN PROGRESS"
            print(f"   {expr:12}: {self.counters[expr]:6,} images - {status} ({percent:5.1f}%)")
        
        # Save final progress
        self.save_progress()
        print(f"\nüíæ Progress saved to: progress/dataset_progress.json")
    
    def get_session_duration(self):
        """Calculate session duration"""
        if self.session_stats['start_time']:
            duration = datetime.now() - self.session_stats['start_time']
            hours = duration.seconds // 3600
            minutes = (duration.seconds % 3600) // 60
            return f"{hours}h {minutes}m"
        return "Unknown"
    
    def cleanup(self):
        """Cleanup resources"""
        try:
            if self.cap and self.cap.isOpened():
                self.cap.release()
            cv2.destroyAllWindows()
            # Additional cleanup
            for i in range(5):
                cv2.waitKey(1)
        except Exception as e:
            print(f"‚ö†Ô∏è Cleanup warning: {e}")

def main():
    """Prompt for the per-expression target and run one collection session.

    Empty, non-numeric or non-positive input, as well as a closed stdin,
    falls back to the default of 5000 images per expression. Ctrl+C at the
    prompt is no longer swallowed and aborts the program.
    """
    print("üöÄ ADVANCED DATASET COLLECTOR - 5000 IMAGES TARGET")
    print("This will help you create a comprehensive dataset with 5000+ images per expression")

    default_target = 5000

    # Get target from user
    try:
        target = int(input("Enter target images per expression (default 5000): ") or "5000")
    except (ValueError, EOFError):
        target = default_target

    # A non-positive target would make every progress percentage meaningless
    # (and divide by zero), so treat it like any other invalid entry.
    if target <= 0:
        target = default_target

    collector = AdvancedDatasetCollector(target_images=target)
    collector.capture_dataset()


if __name__ == "__main__":
    main()

üöÄ ADVANCED DATASET COLLECTOR - 5000 IMAGES TARGET
This will help you create a comprehensive dataset with 5000+ images per expression
Enter target images per expression (default 5000): 45000
üìÅ Setting up directory structure for large dataset...
üéØ Target: 45000 images per expression
üìä Total target: 315,000 images
üìà Loaded previous progress

üìä CURRENT PROGRESS:
   üìù angry       :     0/45000 (  0.0%)
   üìù disgust     :     0/45000 (  0.0%)
   üìù fear        :     0/45000 (  0.0%)
   üìù happy       :     0/45000 (  0.0%)
   üìù neutral     :     0/45000 (  0.0%)
   üìù sad         :     0/45000 (  0.0%)
   üìù surprise    :     0/45000 (  0.0%)

üéØ Overall: 0/315,000 (0.0%)
üöÄ ADVANCED DATASET COLLECTOR
üéØ Target: 45,000 images per expression
üíæ Location: large_dataset/
üìä Current: neutral

üéÆ ADVANCED CONTROLS:
   1-7      - Change expression
   SPACE    - Manual capture
   A        - Toggle auto-capture
   + / -    - Adjust auto-capture delay
  