In [None]:
# Import necessary libraries

# Images
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from pathlib import Path
import pandas as pd
from tqdm import tqdm

# PDF's
import fitz

%matplotlib inline

In [None]:
# Input location and the two output folders used by the Haar Cascade run
input_dir = Path("photo/images")
output_dir_haar = Path("face_detected_HaarCascade")
cropped_dir_haar = Path("cropped_face_HaarCascade")

# Make sure both output folders exist before anything is written to them
for _out_dir in (output_dir_haar, cropped_dir_haar):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Haar Cascade pre-trained frontal-face model shipped with OpenCV
cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
face_cascade = cv2.CascadeClassifier(cascade_path)

# An unreadable/missing XML yields an empty classifier rather than an error at
# construction time, so fail here instead of later inside detectMultiScale.
if face_cascade.empty():
    raise IOError(f"Could not load Haar Cascade from {cascade_path}")

# Paths of the images in which no face was found (appended to by detect_faces_haar)
no_face_detected = []

In [None]:
# def detect_faces_haar(img_path, scaleFactor=1.05, minNeighbors=4, minSize=(40, 40)):
#     img = cv2.imread(str(img_path))
#     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

#     faces = face_cascade.detectMultiScale(
#         gray,
#         scaleFactor=scaleFactor,
#         minNeighbors=minNeighbors,
#         minSize=minSize
#     )

#     for idx, (x, y, w, h) in enumerate(faces, start=1):
#         cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2)

#         # Crop and save each face
#         face_crop = img[y:y+h, x:x+w]
#         crop_name = f"{img_path.stem}_face_{idx}.jpg"
#         cv2.imwrite(str(cropped_dir_haar / crop_name), face_crop)

#     # Save image with rectangles
#     output_path = output_dir_haar / img_path.name
#     cv2.imwrite(str(output_path), img)

#     print(f"{img_path.name}: {len(faces)} face(s) detected and cropped (HaarCascade).")

# def detect_faces_haar(img_path, scaleFactor=1.05, minNeighbors=4, minSize=(40, 40)):
#     img = cv2.imread(str(img_path))
#     if img is None:
#         print(f"Error: {img_path}")
#         return
#     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

#     faces = face_cascade.detectMultiScale(
#         gray,
#         scaleFactor=scaleFactor,
#         minNeighbors=minNeighbors,
#         minSize=minSize
#     )

#     if len(faces) == 0:
#         no_face_detected.append(img_path)
#     else:
#         for idx, (x, y, w, h) in enumerate(faces, start=1):
#             cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2)
#             face_crop = img[y:y+h, x:x+w]
#             crop_name = f"{img_path.stem}_face_{idx}.jpg"
#             cv2.imwrite(str(cropped_dir_haar / crop_name), face_crop)

#     cv2.imwrite(str(output_dir_haar / img_path.name), img)
#     print(f"HaarCascade processed: {img_path.name}, faces detected: {len(faces)}")

def detect_faces_haar(img_path, is_pdf=False, scaleFactor=1.05, minNeighbors=4, minSize=(40, 40)):
    """Detect faces in one image file and save a crop of every detection.

    Each detected face is written to ``cropped_dir_haar`` as
    ``<image stem>_face_<n>.jpg``. Unless ``is_pdf`` is true, a copy of the
    image with a green rectangle around every face is also written to
    ``output_dir_haar`` under the original file name. Images with no
    detections are appended to the module-level ``no_face_detected`` list.

    Args:
        img_path: ``pathlib.Path`` of the image to read.
        is_pdf: When true, skip drawing rectangles and skip saving the
            annotated image; only the crops are written.
        scaleFactor: Forwarded to ``detectMultiScale``.
        minNeighbors: Forwarded to ``detectMultiScale``.
        minSize: Forwarded to ``detectMultiScale``.

    Returns:
        None. If the image cannot be read, an error line is printed and the
        function returns without writing anything.
    """
    img = cv2.imread(str(img_path))
    if img is None:
        print(f"Error: {img_path}")
        return

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(
        gray,
        scaleFactor=scaleFactor,
        minNeighbors=minNeighbors,
        minSize=minSize
    )

    # Rectangles are drawn on a separate copy so that crops are always cut from
    # untouched pixels (drawing on `img` first would leave green border pixels,
    # and rectangles of neighbouring faces, inside the saved crops).
    annotated = None if is_pdf else img.copy()

    if len(faces) == 0:
        no_face_detected.append(img_path)
    else:
        for idx, (x, y, w, h) in enumerate(faces, start=1):
            face_crop = img[y:y+h, x:x+w]
            crop_name = f"{img_path.stem}_face_{idx}.jpg"
            cv2.imwrite(str(cropped_dir_haar / crop_name), face_crop)

            # Only draw the rectangle when the image does not come from a PDF
            if annotated is not None:
                cv2.rectangle(annotated, (x, y), (x+w, y+h), (0, 255, 0), 2)

    # Only save the annotated image when it does not come from a PDF
    if annotated is not None:
        cv2.imwrite(str(output_dir_haar / img_path.name), annotated)

    print(f"HaarCascade processed: {img_path.name}, faces detected: {len(faces)}")

In [None]:
# Running count of processed images and the cap applied by the processing loop
image_counter, max_images = 0, 25

In [None]:
# Reset the counter here so this cell can be re-run on its own; otherwise a
# second run finds the cap already reached and processes nothing.
image_counter = 0

# iterdir()/glob() give no ordering guarantee, so sort to make the subset of
# `max_images` images the same on every run.
for folder in sorted(input_dir.iterdir()):
    if not folder.is_dir():
        continue
    for img_path in sorted(folder.glob("*.jpg")):
        if image_counter >= max_images:
            break
        detect_faces_haar(img_path)
        image_counter += 1
    # Stop walking the remaining folders once the cap has been reached
    if image_counter >= max_images:
        break

# for folder in input_dir.iterdir():
#     if not folder.is_dir():
#         continue
#     for img_path in folder.glob("*.jpg"):
#         detect_faces_haar(img_path)

In [None]:
# Preview up to five of the images in which the detector found no face
print(f"\nTotal images without faces detected (HaarCascade): {len(no_face_detected)}")

preview_paths = no_face_detected[:5]
for missed_path in preview_paths:
    bgr_pixels = cv2.imread(str(missed_path))
    # OpenCV loads BGR; convert so matplotlib shows the true colours
    rgb_pixels = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)
    plt.imshow(rgb_pixels)
    plt.title(f"No face detected: {missed_path.name}")
    plt.axis('off')
    plt.show()

---

In [None]:
# PDFs are read from `pdfs`; rendered page images go to `pdf_images_HaarCascade`
pdf_dir, output_img_dir = Path('pdfs'), Path('pdf_images_HaarCascade')

# Create the page-image folder up front (no error if it already exists)
output_img_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Convert each page of a PDF to a PNG file and return the list of file paths.
def pdf_to_images(pdf_path, output_dir, dpi=300):
    """Render every page of a PDF to a PNG file.

    Args:
        pdf_path: ``pathlib.Path`` of the PDF to render.
        output_dir: ``pathlib.Path`` of an existing directory that receives
            the PNG files, named ``<pdf stem>_page_<n>.png`` (1-based).
        dpi: Target rendering resolution.

    Returns:
        List of ``Path`` objects for the written PNG files, in page order.
    """
    # 72 is the native PDF resolution, so dpi / 72 is the zoom factor.
    # The matrix is the same for every page, so build it once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)
    img_paths = []

    # The context manager closes the document even if rendering a page fails
    with fitz.open(str(pdf_path)) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(matrix=mat)

            img_path = output_dir / f"{pdf_path.stem}_page_{page_num+1}.png"
            pix.save(str(img_path))
            img_paths.append(img_path)

    return img_paths

In [None]:
# Render every PDF in `pdf_dir` to page images, then run the Haar detector on
# each rendered page. Requires detect_faces_haar, pdf_to_images and the
# directory variables from the cells above.
pdf_files = list(pdf_dir.glob("*.pdf"))

for pdf_file in pdf_files:
    print(f"Processing PDF: {pdf_file.name}")
    try:
        # Render all pages first; detection only starts if rendering succeeded
        img_paths = pdf_to_images(pdf_file, output_img_dir)
        for img_path in img_paths:
            detect_faces_haar(img_path)
    except Exception as e:
        # A failure in one PDF is reported and the loop moves on to the next
        print(f"Error procesando {pdf_file.name}: {str(e)}")

In [None]:
def process_pdfs_haar():
    """Detect faces on every page of every PDF in ``pdfs`` and save the crops.

    Pages are rendered in memory at 300 DPI (no page image is written to
    disk). Each detected face is saved to ``pdf_cropped_faces_HaarCascade`` as
    ``<pdf stem>_page_<n>_face_<k>.jpg``. An error in one PDF is printed and
    processing continues with the next file.
    """
    pdf_dir = Path('pdfs')
    pdf_cropped_dir = Path('pdf_cropped_faces_HaarCascade')
    pdf_cropped_dir.mkdir(parents=True, exist_ok=True)

    # 300 DPI relative to the 72 DPI native PDF resolution; same for all pages
    zoom = 300 / 72
    mat = fitz.Matrix(zoom, zoom)

    pdf_files = list(pdf_dir.glob("*.pdf"))

    for pdf_file in pdf_files:
        print(f"\nProcessing PDF: {pdf_file.name}")
        try:
            # The context manager closes the document on success and on error
            with fitz.open(str(pdf_file)) as pdf_document:
                # Process each page of the PDF
                for page_num in range(len(pdf_document)):
                    page = pdf_document.load_page(page_num)
                    pix = page.get_pixmap(matrix=mat)

                    # Convert to OpenCV format.
                    # NOTE(review): the reshape assumes 3 channels, i.e. an RGB
                    # pixmap without alpha (get_pixmap's default) - confirm.
                    img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, 3))
                    img = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
                    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

                    # Detect faces
                    faces = face_cascade.detectMultiScale(
                        gray,
                        scaleFactor=1.05,
                        minNeighbors=4,
                        minSize=(40, 40)
                    )

                    if len(faces) == 0:
                        print(f"No faces detected in page {page_num+1}")
                        continue

                    # Save each cropped face
                    for idx, (x, y, w, h) in enumerate(faces, start=1):
                        face_crop = img[y:y+h, x:x+w]
                        crop_name = f"{pdf_file.stem}_page_{page_num+1}_face_{idx}.jpg"
                        cv2.imwrite(str(pdf_cropped_dir / crop_name), face_crop)
                        print(f"Saved: {crop_name}")

        except Exception as e:
            print(f"Error processing {pdf_file.name}: {str(e)}")


In [None]:
# Procesar PDFs
process_pdfs_haar()

---

# HaarCascade

In [1]:
# Import necessary libraries
import os
import time
from pathlib import Path
from typing import List, Tuple, Optional

# Images
import cv2
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image
import pandas as pd
from tqdm import tqdm

# PDF's
import fitz

In [2]:
# Face detection using Haar Cascade
class FaceDetector:
    """Haar Cascade face detector that saves annotated images and face crops.

    Attributes:
        input_dir: Root folder whose sub-folders contain the ``*.jpg`` inputs.
        output_dir_haar: Folder receiving images with rectangles drawn on them.
        cropped_dir_haar: Folder receiving one JPEG per detected face.
        no_face_detected: Paths of processed images with zero detections.
        face_cascade: The loaded OpenCV cascade classifier.
    """

    def __init__(self):
        # Configuration
        self.input_dir = Path("photo/images")
        self.output_dir_haar = Path("face_detected_HaarCascade")
        self.cropped_dir_haar = Path("cropped_face_HaarCascade")
        self.no_face_detected: List[Path] = []

        # Create directories if they don't exist
        self.output_dir_haar.mkdir(parents=True, exist_ok=True)
        self.cropped_dir_haar.mkdir(parents=True, exist_ok=True)

        # Load Haar Cascade classifier
        self.face_cascade = self._load_haar_cascade()

    # Load Haar Cascade classifier with error handling.
    def _load_haar_cascade(self) -> "cv2.CascadeClassifier":
        """Load the frontal-face cascade bundled with OpenCV.

        Raises:
            FileNotFoundError: If the cascade XML file does not exist.
            IOError: If the file exists but the classifier comes back empty.
        """
        try:
            cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
            if not os.path.exists(cascade_path):
                raise FileNotFoundError(f"Haar Cascade file not found at {cascade_path}")
            classifier = cv2.CascadeClassifier(cascade_path)
            # An empty classifier would only fail later inside detectMultiScale
            if classifier.empty():
                raise IOError(f"Haar Cascade at {cascade_path} could not be loaded")
            return classifier
        except Exception as e:
            print(f"Error loading Haar Cascade: {e}")
            raise

    # Detect faces in an image using Haar Cascade classifier.
    def detect_faces_haar(self, img_path: Path, is_pdf: bool = False,
                            scaleFactor: float = 1.05, minNeighbors: int = 4,
                            minSize: Tuple[int, int] = (40, 40)) -> Optional[int]:
        """Detect faces in one image file and save a crop of every detection.

        Crops are written to ``cropped_dir_haar`` as
        ``<image stem>_face_<n>.jpg``. Unless ``is_pdf`` is true, a copy of
        the image with green rectangles is written to ``output_dir_haar``.
        Images with no detections are appended to ``no_face_detected``.

        Args:
            img_path: Path to the image file
            is_pdf: Whether the image comes from a PDF (skips the annotated output)
            scaleFactor: Parameter specifying how much the image size is reduced at each image scale
            minNeighbors: Parameter specifying how many neighbors each candidate rectangle should have
            minSize: Minimum possible object size

        Returns:
            Number of faces detected or None if error occurred
        """
        start_time = time.time()

        try:
            # Read image with error handling
            img = cv2.imread(str(img_path))
            if img is None:
                print(f"Error: Could not read image {img_path}")
                return None

            # Convert to grayscale
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            faces = self.face_cascade.detectMultiScale(
                gray,
                scaleFactor=scaleFactor,
                minNeighbors=minNeighbors,
                minSize=minSize
            )

            # Draw on a separate copy so crops are cut from untouched pixels;
            # drawing on `img` first would leave green border pixels in the crops.
            annotated = None if is_pdf else img.copy()

            if len(faces) == 0:
                self.no_face_detected.append(img_path)
            else:
                for idx, (x, y, w, h) in enumerate(faces, start=1):
                    face_crop = img[y:y+h, x:x+w]
                    crop_name = f"{img_path.stem}_face_{idx}.jpg"
                    cv2.imwrite(str(self.cropped_dir_haar / crop_name), face_crop)

                    if annotated is not None:
                        cv2.rectangle(annotated, (x, y), (x+w, y+h), (0, 255, 0), 2)

            if annotated is not None:
                cv2.imwrite(str(self.output_dir_haar / img_path.name), annotated)

            elapsed_time = time.time() - start_time
            print(f"HaarCascade processed: {img_path.name}, faces: {len(faces)}, time: {elapsed_time:.2f}s")

            return len(faces)

        except Exception as e:
            print(f"Error processing {img_path}: {str(e)}")
            return None

    # Process images from input directory with progress tracking
    def process_images(self, max_images: int = 25) -> None:
        """Run detection on up to ``max_images`` JPEGs found in the sub-folders of ``input_dir``.

        Prints a running average after every image and a statistics block at
        the end. Images that fail to process still count towards the total
        but contribute zero faces.
        """
        image_counter = 0
        total_faces = 0
        total_time = 0.0

        for folder in self.input_dir.iterdir():
            # Stop walking folders once the cap has been reached
            if image_counter >= max_images:
                break
            if not folder.is_dir():
                continue

            for img_path in folder.glob("*.jpg"):
                if image_counter >= max_images:
                    break

                start_time = time.time()
                faces_detected = self.detect_faces_haar(img_path)
                elapsed = time.time() - start_time

                total_time += elapsed
                if faces_detected is not None:
                    total_faces += faces_detected
                image_counter += 1

                # Running average (time is averaged from total_time, not from the face count)
                avg_time = total_time / image_counter
                avg_faces = total_faces / image_counter
                print(f"Running avg: {avg_time:.3f}s/img, {avg_faces:.1f} faces/img")

        # Final statistics
        print("\n" + "="*50)
        print("PROCESSING STATISTICS")
        print(f"Total images processed: {image_counter}")
        print(f"Total faces detected: {total_faces}")
        if image_counter > 0:
            print(f"Average time per image: {total_time/image_counter:.3f} seconds")
            print(f"Average faces per image: {total_faces/image_counter:.1f}")
        else:
            # Nothing was processed, so per-image averages are undefined
            print("No images were processed; averages are not available")
        print("=".center(50, "="))

    # Display sample images where no faces were detected.
    def show_no_face_samples(self, sample_size: int = 5) -> None:
        """Plot up to ``sample_size`` images from ``no_face_detected``.

        An image that can no longer be read or displayed is reported with a
        printed error and skipped.
        """
        print(f"\nTotal images without faces detected: {len(self.no_face_detected)}")

        for img_path in self.no_face_detected[:sample_size]:
            try:
                # OpenCV loads BGR; matplotlib expects RGB
                img = cv2.cvtColor(cv2.imread(str(img_path)), cv2.COLOR_BGR2RGB)
                plt.imshow(img)
                plt.title(f"No face detected: {img_path.name}")
                plt.axis('off')
                plt.show()
            except Exception as e:
                print(f"Error displaying {img_path}: {e}")

In [3]:
# Class for processing PDF files to detect faces.
class PDFProcessor:
    """Render PDF pages to images and run a ``FaceDetector`` on each page.

    Attributes:
        pdf_dir: Folder scanned for ``*.pdf`` files.
        output_img_dir: Folder used for the temporary per-page PNG files.
        pdf_cropped_dir: Folder created for PDF face crops.
        face_detector: Detector used for every rendered page.
    """

    def __init__(self, face_detector: FaceDetector):
        self.pdf_dir = Path('pdfs')
        self.output_img_dir = Path('pdf_images_HaarCascade')
        # NOTE(review): this folder is created but nothing in this class writes
        # to it; the detector saves crops to its own cropped_dir_haar.
        self.pdf_cropped_dir = Path('pdf_cropped_faces_HaarCascade')
        self.face_detector = face_detector

        # Create directories if they don't exist
        self.output_img_dir.mkdir(parents=True, exist_ok=True)
        self.pdf_cropped_dir.mkdir(parents=True, exist_ok=True)

    # Process all PDF files in the directory and return total time
    def process_pdfs(self) -> float:
        """Process every PDF in ``pdf_dir``.

        Returns:
            Elapsed wall-clock seconds, or ``0.0`` when no PDF file was found.
        """
        pdf_files = list(self.pdf_dir.glob("*.pdf"))

        if not pdf_files:
            print("No PDF files found in directory")
            return 0.0

        start_time = time.time()
        print("\nStarting PDF processing...")
        for pdf_file in pdf_files:
            self._process_pdf(pdf_file)

        return time.time() - start_time

    # Process a single PDF file
    def _process_pdf(self, pdf_file: Path) -> None:
        """Render each page at 300 DPI and run face detection on it.

        Each page is written to a temporary PNG, passed to the detector with
        ``is_pdf=True`` and then deleted. Any error is printed and ends the
        processing of this PDF only.
        """
        print(f"\nProcessing PDF: {pdf_file.name}")

        # 300 DPI relative to the 72 DPI native PDF resolution; same for all pages
        zoom = 300 / 72
        mat = fitz.Matrix(zoom, zoom)

        try:
            # The context manager closes the document on success and on error
            with fitz.open(str(pdf_file)) as pdf_document:
                for page_num in range(len(pdf_document)):
                    page_start = time.time()
                    page = pdf_document.load_page(page_num)
                    pix = page.get_pixmap(matrix=mat)

                    # Convert to OpenCV format.
                    # NOTE(review): the reshape assumes 3 channels, i.e. an RGB
                    # pixmap without alpha (get_pixmap's default) - confirm.
                    img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, 3))
                    img = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)

                    # Create temp file path
                    temp_path = self.output_img_dir / f"temp_{pdf_file.stem}_p{page_num}.png"
                    try:
                        cv2.imwrite(str(temp_path), img)

                        # Process with face detector
                        faces_detected = self.face_detector.detect_faces_haar(temp_path, is_pdf=True)
                    finally:
                        # Clean up the temporary file even if writing or detection failed
                        temp_path.unlink(missing_ok=True)

                    page_time = time.time() - page_start
                    # The detector returns None on error; report that as 0 faces
                    print(f"Page {page_num+1}: {faces_detected or 0} faces, processed in {page_time:.2f}s")

        except Exception as e:
            print(f"Error processing {pdf_file.name}: {str(e)}")

In [None]:
# Main execution: time detector start-up, process up to `max_images` images,
# preview a few images without detections, then process the PDFs.
if __name__ == "__main__":
    # Initialize face detector with timing
    print("Initializing face detector...")
    init_start = time.perf_counter()
    face_detector = FaceDetector()
    init_time = time.perf_counter() - init_start
    print(f"Detector initialized in {init_time:.2f} seconds\n")

    # Process images with detailed timing and averages
    print("\n" + "="*50)
    print("Starting image processing...")
    img_start = time.perf_counter()

    # Track processing metrics
    image_counter = 0
    total_faces = 0
    total_time = 0.0
    max_images = 25

    # Process images with progress tracking
    for folder in face_detector.input_dir.iterdir():
        # Stop walking folders once the cap has been reached
        if image_counter >= max_images:
            break
        if not folder.is_dir():
            continue

        for img_path in folder.glob("*.jpg"):
            if image_counter >= max_images:
                break

            start_time = time.time()
            faces_detected = face_detector.detect_faces_haar(img_path)
            elapsed = time.time() - start_time

            total_time += elapsed
            # None means the image failed to process; it still counts as processed
            if faces_detected is not None:
                total_faces += faces_detected
            image_counter += 1

            # Display running averages (image_counter is at least 1 here)
            avg_time = total_time / image_counter
            avg_faces = total_faces / image_counter
            print(f"Progress: {image_counter}/{max_images} | "
                  f"Avg time: {avg_time:.3f}s | "
                  f"Avg faces: {avg_faces:.1f}")

    img_elapsed = time.perf_counter() - img_start

    # Final averages; fall back to 0 when no image was found so the summaries
    # below do not raise ZeroDivisionError.
    final_avg_time = total_time / image_counter if image_counter else 0.0
    final_avg_faces = total_faces / image_counter if image_counter else 0.0

    # Image processing summary
    print("\n" + "="*50)
    print("IMAGE PROCESSING SUMMARY")
    print(f"Total images processed: {image_counter}")
    print(f"Images with faces detected: {image_counter - len(face_detector.no_face_detected)}")
    print(f"Images without faces: {len(face_detector.no_face_detected)}")
    print(f"Total faces detected: {total_faces}")
    print(f"Total processing time: {img_elapsed:.2f} seconds")
    print(f"Average time per image: {final_avg_time:.3f} seconds")
    print(f"Average faces per image: {final_avg_faces:.1f}")
    print("="*50 + "\n")

    # Show samples with no faces detected
    if face_detector.no_face_detected:
        print(f"Showing {min(3, len(face_detector.no_face_detected))} samples without detected faces...")
        face_detector.show_no_face_samples(sample_size=3)

    # Process PDFs with comprehensive timing
    print("\n" + "="*50)
    print("Starting PDF processing...")
    pdf_start = time.perf_counter()

    pdf_processor = PDFProcessor(face_detector)
    pdf_time = pdf_processor.process_pdfs()
    pdf_elapsed = time.perf_counter() - pdf_start

    print("\n" + "="*50)
    print("PDF PROCESSING SUMMARY")
    print(f"Total processing time: {pdf_elapsed:.2f} seconds")
    # process_pdfs returns 0.0 when no PDF was found, so only report real runs
    if pdf_time > 0:
        print(f"PDF-only processing time: {pdf_time:.2f} seconds")
    print("="*50 + "\n")

    # Final summary
    print("\n" + "="*50)
    print("FINAL SUMMARY")
    print(f"Total execution time: {time.perf_counter() - init_start:.2f} seconds")
    print(f"Total images processed: {image_counter}")
    print(f"Total faces detected: {total_faces}")
    print(f"Average time per image: {final_avg_time:.3f} seconds")
    print(f"Average faces per image: {final_avg_faces:.1f}")
    print("="*50)

Initializing face detector...
Detector initialized in 0.02 seconds


Starting image processing...
HaarCascade processed: 00.jpg, faces: 10, time: 0.70s
Progress: 1/25 | Avg time: 0.702s | Avg faces: 10.0
HaarCascade processed: 01.jpg, faces: 5, time: 0.63s
Progress: 2/25 | Avg time: 0.669s | Avg faces: 7.5
HaarCascade processed: 02.jpg, faces: 6, time: 0.64s
Progress: 3/25 | Avg time: 0.659s | Avg faces: 7.0
HaarCascade processed: 03.jpg, faces: 3, time: 0.69s
Progress: 4/25 | Avg time: 0.669s | Avg faces: 6.0
HaarCascade processed: 04.jpg, faces: 10, time: 0.83s
Progress: 5/25 | Avg time: 0.702s | Avg faces: 6.8
HaarCascade processed: 05.jpg, faces: 4, time: 0.74s
Progress: 6/25 | Avg time: 0.708s | Avg faces: 6.3
HaarCascade processed: 06.jpg, faces: 42, time: 1.19s
Progress: 7/25 | Avg time: 0.777s | Avg faces: 11.4
HaarCascade processed: 07.jpg, faces: 101, time: 1.63s
Progress: 8/25 | Avg time: 0.884s | Avg faces: 22.6
HaarCascade processed: 08.jpg, faces: 62, time: 1.20s
Progress