In [None]:
# Import necessary libraries

# Images
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from pathlib import Path
import pandas as pd
from tqdm import tqdm

# PDFs
import fitz

%matplotlib inline

In [None]:
# Define paths
# Source photos live in sub-folders of this directory
input_dir = Path("photo/images")

# Output folders: annotated full images and individual face crops
output_dir_haar = Path("face_detected_HaarCascade")
cropped_dir_haar = Path("cropped_face_HaarCascade")

# Both output folders must exist before any detection cell runs
for _out_dir in (output_dir_haar, cropped_dir_haar):
    _out_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Haar Cascade pre-trained frontal-face model shipped with OpenCV
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

# CascadeClassifier does not raise when the XML cannot be loaded; it simply stays
# empty and the problem would only surface later inside detectMultiScale.
# Fail here instead, where the cause is obvious.
if face_cascade.empty():
    raise IOError(
        "Could not load Haar cascade from "
        f"{cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'}"
    )

# Paths of processed images in which no face was found (appended to by detect_faces_haar)
no_face_detected = []

In [None]:
# def detect_faces_haar(img_path, scaleFactor=1.05, minNeighbors=4, minSize=(40, 40)):
#     img = cv2.imread(str(img_path))
#     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

#     faces = face_cascade.detectMultiScale(
#         gray,
#         scaleFactor=scaleFactor,
#         minNeighbors=minNeighbors,
#         minSize=minSize
#     )

#     for idx, (x, y, w, h) in enumerate(faces, start=1):
#         cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2)

#         # Crop and save each face
#         face_crop = img[y:y+h, x:x+w]
#         crop_name = f"{img_path.stem}_face_{idx}.jpg"
#         cv2.imwrite(str(cropped_dir_haar / crop_name), face_crop)

#     # Save image with rectangles
#     output_path = output_dir_haar / img_path.name
#     cv2.imwrite(str(output_path), img)

#     print(f"{img_path.name}: {len(faces)} face(s) detected and cropped (HaarCascade).")

# def detect_faces_haar(img_path, scaleFactor=1.05, minNeighbors=4, minSize=(40, 40)):
#     img = cv2.imread(str(img_path))
#     if img is None:
#         print(f"Error: {img_path}")
#         return
#     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

#     faces = face_cascade.detectMultiScale(
#         gray,
#         scaleFactor=scaleFactor,
#         minNeighbors=minNeighbors,
#         minSize=minSize
#     )

#     if len(faces) == 0:
#         no_face_detected.append(img_path)
#     else:
#         for idx, (x, y, w, h) in enumerate(faces, start=1):
#             cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2)
#             face_crop = img[y:y+h, x:x+w]
#             crop_name = f"{img_path.stem}_face_{idx}.jpg"
#             cv2.imwrite(str(cropped_dir_haar / crop_name), face_crop)

#     cv2.imwrite(str(output_dir_haar / img_path.name), img)
#     print(f"HaarCascade processed: {img_path.name}, faces detected: {len(faces)}")

def detect_faces_haar(img_path, is_pdf=False, scaleFactor=1.05, minNeighbors=4, minSize=(40, 40)):
    """Detect faces in one image file with the Haar cascade and save a crop of each face.

    Args:
        img_path: pathlib.Path of the image to read.
        is_pdf: True when the image is a rendered PDF page. In that case no
            rectangles are drawn and no annotated copy of the page is written.
        scaleFactor: forwarded unchanged to face_cascade.detectMultiScale.
        minNeighbors: forwarded unchanged to face_cascade.detectMultiScale.
        minSize: forwarded unchanged to face_cascade.detectMultiScale.

    Side effects:
        - Writes "<stem>_face_<n>.jpg" into cropped_dir_haar for every detection.
        - Unless is_pdf is True, writes the annotated image into output_dir_haar
          under the original file name.
        - Appends img_path to no_face_detected when nothing is detected.
        - Prints a one-line summary (or an error line if the image cannot be read).

    Returns:
        None.
    """
    img = cv2.imread(str(img_path))
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None
        print(f"Error: {img_path}")
        return

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(
        gray,
        scaleFactor=scaleFactor,
        minNeighbors=minNeighbors,
        minSize=minSize
    )

    if len(faces) == 0:
        no_face_detected.append(img_path)

    # Rectangles go on a separate copy so the crops below are sliced from
    # untouched pixels (drawing on `img` first would bake the green outline,
    # and the outlines of overlapping faces, into every saved crop).
    annotated = None if is_pdf else img.copy()

    for idx, (x, y, w, h) in enumerate(faces, start=1):
        # Only draw the rectangle when the image does not come from a PDF
        if annotated is not None:
            cv2.rectangle(annotated, (x, y), (x+w, y+h), (0, 255, 0), 2)

        face_crop = img[y:y+h, x:x+w]
        # NOTE: the name is built from the file stem only, so same-named images
        # from different folders overwrite each other's crops.
        crop_name = f"{img_path.stem}_face_{idx}.jpg"
        cv2.imwrite(str(cropped_dir_haar / crop_name), face_crop)

    # Only save the annotated image when it does not come from a PDF
    if annotated is not None:
        cv2.imwrite(str(output_dir_haar / img_path.name), annotated)

    print(f"HaarCascade processed: {img_path.name}, faces detected: {len(faces)}")

In [None]:
# Batch settings: running count of processed images and the cap on that count
image_counter, max_images = 0, 25

In [None]:
# Start from zero every time this cell runs so it can be re-executed on its own;
# a counter left over from a previous run would otherwise skip every image.
image_counter = 0

# iterdir()/glob() yield entries in arbitrary order; sorting makes the subset of
# images selected by max_images the same on every run and every machine.
for folder in sorted(input_dir.iterdir()):
    if not folder.is_dir():
        continue
    for img_path in sorted(folder.glob("*.jpg")):
        if image_counter >= max_images:
            break
        detect_faces_haar(img_path)
        image_counter += 1
    # Propagate the inner break: stop walking folders once the cap is reached
    if image_counter >= max_images:
        break

# for folder in input_dir.iterdir():
#     if not folder.is_dir():
#         continue
#     for img_path in folder.glob("*.jpg"):
#         detect_faces_haar(img_path)

In [None]:
# Show first 5 images without face
print(f"\nTotal images without faces detected (HaarCascade): {len(no_face_detected)}")

# Preview at most five of them. OpenCV loads pixels as BGR, matplotlib expects RGB.
for img_path in no_face_detected[:5]:
    img = cv2.imread(str(img_path))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.imshow(img)
    plt.axis('off')
    plt.title(f"No face detected: {img_path.name}")
    plt.show()

---

In [None]:
# Directory containing PDF files
# Input PDFs, and the folder that receives one rendered PNG per page
pdf_dir, output_img_dir = Path('pdfs'), Path('pdf_images_HaarCascade')

# The render step writes straight into this folder, so create it up front
output_img_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Convert each page of a PDF to a list of images.
def pdf_to_images(pdf_path, output_dir, dpi=300):
    """Render every page of a PDF to a PNG file.

    Args:
        pdf_path: pathlib.Path of the PDF to open.
        output_dir: pathlib.Path of the folder that receives the PNG files
            (it is not created here).
        dpi: target resolution of the rendered pages.

    Returns:
        List of pathlib.Path objects, one per page, named
        "<pdf stem>_page_<n>.png" with n starting at 1.
    """
    # 72 is the native PDF DPI, so dpi / 72 is the scale factor; the matrix is
    # identical for every page and is therefore built once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)
    img_paths = []

    # The context manager closes the document even if rendering a page fails
    with fitz.open(str(pdf_path)) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(matrix=mat)

            img_path = output_dir / f"{pdf_path.stem}_page_{page_num+1}.png"
            pix.save(str(img_path))
            img_paths.append(img_path)

    return img_paths

In [None]:
# Render each PDF in pdf_dir to page images, then run the Haar detector on every page.
# Relies on pdf_dir, output_img_dir, pdf_to_images and detect_faces_haar from earlier cells.
pdf_files = list(pdf_dir.glob("*.pdf"))

for pdf_file in pdf_files:
    print(f"Processing PDF: {pdf_file.name}")
    try:
        img_paths = pdf_to_images(pdf_file, output_img_dir)
        for img_path in img_paths:
            # These are PDF page renders: request crops only. Without is_pdf=True
            # the pages would be annotated and written into the photo output folder.
            detect_faces_haar(img_path, is_pdf=True)
    except Exception as e:
        # Best effort: report the failing PDF and continue with the next one
        print(f"Error procesando {pdf_file.name}: {str(e)}")

In [None]:
def process_pdfs_haar():
    """Detect faces on every page of every PDF in 'pdfs' and save each face crop.

    Pages are rendered in memory at 300 DPI (no intermediate image files) and
    scanned with the module-level face_cascade. Crops are written to
    'pdf_cropped_faces_HaarCascade' as "<pdf stem>_page_<p>_face_<n>.jpg".
    An exception raised while handling one PDF is printed and processing
    continues with the next file.

    Returns:
        None.
    """
    pdf_dir = Path('pdfs')
    pdf_cropped_dir = Path('pdf_cropped_faces_HaarCascade')
    pdf_cropped_dir.mkdir(parents=True, exist_ok=True)

    # 300 DPI over the 72 DPI PDF baseline; the same matrix serves every page
    zoom = 300 / 72
    mat = fitz.Matrix(zoom, zoom)

    pdf_files = list(pdf_dir.glob("*.pdf"))

    for pdf_file in pdf_files:
        print(f"\nProcessing PDF: {pdf_file.name}")
        try:
            # Process each page of the PDF; the context manager guarantees the
            # document is closed even when a page fails.
            with fitz.open(str(pdf_file)) as pdf_document:
                for page_num in range(len(pdf_document)):
                    page = pdf_document.load_page(page_num)
                    pix = page.get_pixmap(matrix=mat)

                    # Convert to OpenCV format.
                    # The reshape assumes a 3-channel RGB pixmap without alpha.
                    img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, 3))
                    img = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
                    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

                    # Detect faces
                    faces = face_cascade.detectMultiScale(
                        gray,
                        scaleFactor=1.05,
                        minNeighbors=4,
                        minSize=(40, 40)
                    )

                    if len(faces) == 0:
                        print(f"No faces detected in page {page_num+1}")
                        continue

                    # Save each cropped face
                    for idx, (x, y, w, h) in enumerate(faces, start=1):
                        face_crop = img[y:y+h, x:x+w]
                        crop_name = f"{pdf_file.stem}_page_{page_num+1}_face_{idx}.jpg"
                        cv2.imwrite(str(pdf_cropped_dir / crop_name), face_crop)
                        print(f"Saved: {crop_name}")

        except Exception as e:
            print(f"Error processing {pdf_file.name}: {str(e)}")


In [None]:
# Process the PDFs: in-memory rendering + face cropping for every file in 'pdfs'
process_pdfs_haar()

---

# HaarCascade

In [1]:
# Import necessary libraries
import os
import time
from pathlib import Path
from typing import List, Tuple, Optional

# Images
import cv2
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image
import pandas as pd
from tqdm import tqdm

# PDFs
import fitz

In [None]:
# Face detection using Haar Cascade
class FaceDetector:
    """Haar-cascade face detector that saves annotated images and per-face crops.

    Attributes:
        input_dir: folder whose sub-folders contain the *.jpg images to scan.
        output_dir_haar: folder receiving the images with rectangles drawn.
        cropped_dir_haar: folder receiving one JPEG per detected face.
        no_face_detected: paths of processed images with zero detections.
        face_cascade: the loaded cv2.CascadeClassifier.
    """

    def __init__(self):
        # Configuration
        self.input_dir = Path("photo/images")
        self.output_dir_haar = Path("face_detected_HaarCascade")
        self.cropped_dir_haar = Path("cropped_face_HaarCascade")
        self.no_face_detected: List[Path] = []

        # Create directories if they don't exist
        self.output_dir_haar.mkdir(parents=True, exist_ok=True)
        self.cropped_dir_haar.mkdir(parents=True, exist_ok=True)

        # Load Haar Cascade classifier
        self.face_cascade = self._load_haar_cascade()

    def _load_haar_cascade(self) -> cv2.CascadeClassifier:
        """Load the frontal-face Haar cascade bundled with OpenCV.

        Returns:
            A ready-to-use cv2.CascadeClassifier.

        Raises:
            FileNotFoundError: the cascade XML is not present on disk.
            IOError: the file exists but OpenCV could not load it.
        """
        try:
            cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
            if not os.path.exists(cascade_path):
                raise FileNotFoundError(f"Haar Cascade file not found at {cascade_path}")

            classifier = cv2.CascadeClassifier(cascade_path)
            # The constructor does not raise on a bad/corrupt file; it just
            # yields an empty classifier, so check explicitly.
            if classifier.empty():
                raise IOError(f"Haar Cascade file could not be loaded from {cascade_path}")
            return classifier
        except Exception as e:
            print(f"Error loading Haar Cascade: {e}")
            raise

    def detect_faces_haar(self, img_path: Path, is_pdf: bool = False,
                            scaleFactor: float = 1.05, minNeighbors: int = 4,
                            minSize: Tuple[int, int] = (40, 40)) -> Optional[int]:
        """Detect faces in an image using the Haar Cascade classifier.

        Every detected face is cropped and saved to cropped_dir_haar as
        "<stem>_face_<n>.jpg". Unless is_pdf is True, a copy of the image with
        green rectangles is also saved to output_dir_haar. Images with no
        detections are appended to no_face_detected.

        Args:
            img_path: Path to the image file
            is_pdf: Whether the image comes from a PDF (affects output)
            scaleFactor: Parameter specifying how much the image size is reduced at each image scale
            minNeighbors: Parameter specifying how many neighbors each candidate rectangle should have
            minSize: Minimum possible object size

        Returns:
            Number of faces detected or None if error occurred
        """
        # perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes
        start_time = time.perf_counter()

        try:
            # Read image with error handling
            img = cv2.imread(str(img_path))
            if img is None:
                print(f"Error: Could not read image {img_path}")
                return None

            # Convert to grayscale
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # Detect faces
            faces = self.face_cascade.detectMultiScale(
                gray,
                scaleFactor=scaleFactor,
                minNeighbors=minNeighbors,
                minSize=minSize
            )

            if len(faces) == 0:
                self.no_face_detected.append(img_path)

            # Rectangles are drawn on a separate copy so that the crops are
            # sliced from untouched pixels (no green outline baked into them).
            annotated = None if is_pdf else img.copy()

            for idx, (x, y, w, h) in enumerate(faces, start=1):
                if annotated is not None:
                    cv2.rectangle(annotated, (x, y), (x+w, y+h), (0, 255, 0), 2)

                face_crop = img[y:y+h, x:x+w]
                crop_name = f"{img_path.stem}_face_{idx}.jpg"
                cv2.imwrite(str(self.cropped_dir_haar / crop_name), face_crop)

            if annotated is not None:
                cv2.imwrite(str(self.output_dir_haar / img_path.name), annotated)

            elapsed_time = time.perf_counter() - start_time
            print(f"HaarCascade processed: {img_path.name}, faces: {len(faces)}, time: {elapsed_time:.2f}s")

            return len(faces)

        except Exception as e:
            # Best effort: report the failing image and let the caller continue
            print(f"Error processing {img_path}: {str(e)}")
            return None

    def process_images(self, max_images: int = 25) -> int:
        """Run detection on the *.jpg files found in each sub-folder of input_dir.

        Folders and files are visited in sorted order so the selected subset is
        reproducible (iterdir/glob give no ordering guarantee).

        Args:
            max_images: stop after this many images have been handed to the detector.

        Returns:
            Number of images handed to the detector (at most max_images).
        """
        image_counter = 0

        for folder in sorted(self.input_dir.iterdir()):
            if not folder.is_dir():
                continue

            for img_path in sorted(folder.glob("*.jpg")):
                if image_counter >= max_images:
                    return image_counter

                self.detect_faces_haar(img_path)
                image_counter += 1

        return image_counter

    def show_no_face_samples(self, sample_size: int = 5) -> None:
        """Display the first sample_size images in which no face was detected.

        Args:
            sample_size: maximum number of images to show.
        """
        print(f"\nTotal images without faces detected: {len(self.no_face_detected)}")

        for img_path in self.no_face_detected[:sample_size]:
            try:
                # OpenCV loads BGR; matplotlib expects RGB
                img = cv2.cvtColor(cv2.imread(str(img_path)), cv2.COLOR_BGR2RGB)
                plt.imshow(img)
                plt.title(f"No face detected: {img_path.name}")
                plt.axis('off')
                plt.show()
            except Exception as e:
                print(f"Error displaying {img_path}: {e}")

In [3]:
# Class for processing PDF files to detect faces.
class PDFProcessor:
    """Runs Haar-cascade face detection on the pages of PDF files.

    Attributes:
        pdf_dir: folder scanned for *.pdf files.
        output_img_dir: folder receiving one rendered PNG per page.
        pdf_cropped_dir: folder receiving crops from _process_pdf_directly.
        face_detector: the FaceDetector whose cascade / detection method is used.
    """

    def __init__(self, face_detector: FaceDetector):
        self.pdf_dir = Path('pdfs')
        self.output_img_dir = Path('pdf_images_HaarCascade')
        self.pdf_cropped_dir = Path('pdf_cropped_faces_HaarCascade')
        self.face_detector = face_detector

        # Create directories if they don't exist
        self.output_img_dir.mkdir(parents=True, exist_ok=True)
        self.pdf_cropped_dir.mkdir(parents=True, exist_ok=True)

    def pdf_to_images(self, pdf_path: Path, output_dir: Path, dpi: int = 300) -> List[Path]:
        """Convert PDF pages to PNG images.

        Args:
            pdf_path: PDF file to render.
            output_dir: folder receiving "<pdf stem>_page_<n>.png" files.
            dpi: target render resolution.

        Returns:
            Paths of the written images, or an empty list if any error occurred.
        """
        img_paths = []

        try:
            # 72 is native PDF DPI; the matrix is the same for every page
            zoom = dpi / 72
            mat = fitz.Matrix(zoom, zoom)

            # The context manager closes the document even if a page fails
            with fitz.open(str(pdf_path)) as pdf_document:
                for page_num in range(len(pdf_document)):
                    page = pdf_document.load_page(page_num)
                    pix = page.get_pixmap(matrix=mat)

                    img_path = output_dir / f"{pdf_path.stem}_page_{page_num+1}.png"
                    pix.save(str(img_path))
                    img_paths.append(img_path)

            return img_paths

        except Exception as e:
            print(f"Error converting PDF {pdf_path}: {str(e)}")
            return []

    def process_pdfs(self) -> None:
        """Process all PDF files in pdf_dir and print the time spent on each.

        Each PDF is rendered to page images, and every page is passed to
        face_detector.detect_faces_haar with is_pdf=True.
        """
        pdf_files = list(self.pdf_dir.glob("*.pdf"))

        for pdf_file in pdf_files:
            print(f"\nProcessing PDF: {pdf_file.name}")
            start_time = time.time()

            try:
                # Convert to images first, then process
                img_paths = self.pdf_to_images(pdf_file, self.output_img_dir)
                for img_path in img_paths:
                    self.face_detector.detect_faces_haar(img_path, is_pdf=True)

                # Direct processing (more efficient)
                # self._process_pdf_directly(pdf_file)

            except Exception as e:
                print(f"Error processing {pdf_file.name}: {str(e)}")

            elapsed_time = time.time() - start_time
            print(f"Completed processing {pdf_file.name} in {elapsed_time:.2f} seconds")

    def _process_pdf_directly(self, pdf_file: Path) -> None:
        """Process a PDF without intermediate image files.

        Pages are rendered in memory at 300 DPI, scanned with the detector's
        cascade, and each face is saved to pdf_cropped_dir as
        "<pdf stem>_page_<p>_face_<n>.jpg". Errors are printed, not raised.
        """
        try:
            # 300 DPI; the matrix is the same for every page
            zoom = 300 / 72
            mat = fitz.Matrix(zoom, zoom)

            # The context manager closes the document even if a page fails
            with fitz.open(str(pdf_file)) as pdf_document:
                for page_num in range(len(pdf_document)):
                    page = pdf_document.load_page(page_num)
                    pix = page.get_pixmap(matrix=mat)

                    # Convert to OpenCV format.
                    # The reshape assumes a 3-channel RGB pixmap without alpha.
                    img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, 3))
                    img = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
                    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

                    # Detect faces
                    faces = self.face_detector.face_cascade.detectMultiScale(
                        gray,
                        scaleFactor=1.05,
                        minNeighbors=4,
                        minSize=(40, 40)
                    )

                    if len(faces) == 0:
                        print(f"No faces detected in page {page_num+1}")
                        continue

                    # Save each cropped face
                    for idx, (x, y, w, h) in enumerate(faces, start=1):
                        face_crop = img[y:y+h, x:x+w]
                        crop_name = f"{pdf_file.stem}_page_{page_num+1}_face_{idx}.jpg"
                        cv2.imwrite(str(self.pdf_cropped_dir / crop_name), face_crop)
                        print(f"Saved: {crop_name}")

        except Exception as e:
            print(f"Error processing {pdf_file.name}: {str(e)}")

In [5]:
# Main execution
if __name__ == "__main__":
    # Upper bound on the number of photos handed to the detector in this run
    MAX_IMAGES = 25

    # Initialize face detector with timing
    print("Initializing face detector...")
    init_start = time.perf_counter()
    face_detector = FaceDetector()
    print(f"Detector initialized in {time.perf_counter() - init_start:.2f} seconds\n")

    # Process images with detailed timing
    print("=".center(50,"="))
    print("Starting image processing...")
    img_start = time.perf_counter()

    face_detector.process_images(max_images=MAX_IMAGES)
    img_elapsed = time.perf_counter() - img_start

    # Report what was really processed instead of assuming the cap was reached:
    # process_images walks the *.jpg files of every sub-folder of input_dir and
    # stops at the cap, so the processed total is the smaller of the two numbers.
    available_images = sum(
        1
        for folder in face_detector.input_dir.iterdir()
        if folder.is_dir()
        for _ in folder.glob("*.jpg")
    )
    processed_images = min(MAX_IMAGES, available_images)

    # Snapshot the photo statistics now: the PDF step below appends face-less
    # pages to the same no_face_detected list.
    # NOTE: an image that could not be read is in neither bucket, so it is
    # counted under "with faces" here.
    images_without_faces = len(face_detector.no_face_detected)
    images_with_faces = processed_images - images_without_faces

    print("\n" + "="*50)
    print("IMAGE PROCESSING SUMMARY")
    print(f"Total images processed: {processed_images}")
    print(f"Images with faces detected: {images_with_faces}")
    print(f"Images without faces: {images_without_faces}")
    print(f"Total processing time: {img_elapsed:.2f} seconds")
    print("="*50 + "\n")

    # Show samples with no faces detected
    if face_detector.no_face_detected:
        print(f"Showing {min(3, len(face_detector.no_face_detected))} samples without detected faces...")
        face_detector.show_no_face_samples(sample_size=3)

    # Process PDFs with comprehensive timing
    print("\n" + "="*50)
    print("Starting PDF processing...")
    pdf_start = time.perf_counter()

    pdf_processor = PDFProcessor(face_detector)
    # Actually run the PDF pipeline; constructing the processor alone does no work
    pdf_processor.process_pdfs()
    pdf_elapsed = time.perf_counter() - pdf_start
    print("\n" + "="*50)
    print("PDF PROCESSING SUMMARY")
    print(f"Total processing time: {pdf_elapsed:.2f} seconds")
    print("="*50 + "\n")

    # Final summary
    print("\n" + "="*50)
    print("FINAL SUMMARY")
    print(f"Total execution time: {time.perf_counter() - init_start:.2f} seconds")
    # The figure is a count of images, not of individual faces, so label it as such
    print(f"Total images with faces detected: {images_with_faces}")
    print("=".center(50,"="))

Initializing face detector...
Detector initialized in 0.04 seconds

Starting image processing...
HaarCascade processed: 00.jpg, faces: 10, time: 0.74s
HaarCascade processed: 01.jpg, faces: 5, time: 0.64s
HaarCascade processed: 02.jpg, faces: 6, time: 0.61s
HaarCascade processed: 03.jpg, faces: 3, time: 0.65s
HaarCascade processed: 04.jpg, faces: 10, time: 0.81s
HaarCascade processed: 05.jpg, faces: 4, time: 0.70s
HaarCascade processed: 06.jpg, faces: 42, time: 1.10s
HaarCascade processed: 07.jpg, faces: 101, time: 1.56s
HaarCascade processed: 08.jpg, faces: 62, time: 1.14s
HaarCascade processed: 09.jpg, faces: 4, time: 0.65s
HaarCascade processed: 10.jpg, faces: 3, time: 0.57s
HaarCascade processed: 11.jpg, faces: 6, time: 0.58s
HaarCascade processed: 12.jpg, faces: 3, time: 0.56s
HaarCascade processed: 13.jpg, faces: 5, time: 0.56s
HaarCascade processed: 14.jpg, faces: 6, time: 0.63s
HaarCascade processed: 15.jpg, faces: 6, time: 0.55s
HaarCascade processed: 16.jpg, faces: 6, time: 1.