In [None]:
import zipfile
from PIL import Image
import pytesseract
import cv2 as cv
import numpy as np

# loading the face detection classifier
face_cascade = cv.CascadeClassifier('haarcascade_frontalface_default.xml')

# A cascade file that cannot be loaded leaves the classifier empty rather than
# stopping the notebook here, and the problem would only surface later inside
# detectMultiScale. Fail right away with a message that names the file.
if face_cascade.empty():
    raise IOError(
        "Could not load the face cascade 'haarcascade_frontalface_default.xml'; "
        "check that the file exists next to this notebook"
    )

### Extracting Images from the Zip File

In [None]:
# Load every image stored in the zip archive into memory.
# They are kept in a dictionary (file name -> PIL image) for use by the later cells.
# Download the dataset before running this cell
iter_images = {}

with zipfile.ZipFile('readonly/images.zip', 'r') as zip_image: # For testing use 'small_img.zip'
    for entry in zip_image.infolist():
        # Directory entries carry no image data, so Image.open cannot decode them
        if entry.is_dir():
            continue

        with zip_image.open(entry) as image_file:
            # convert('RGB') decodes the pixels into a new image, so the result
            # stays usable after image_file is closed
            iter_images[entry.filename] = Image.open(image_file).convert('RGB')

### Extracting Text from Images

In [None]:
# Run OCR on every loaded image: file name -> text returned by
# pytesseract.image_to_string for that image
images_text = {
    name: pytesseract.image_to_string(picture)
    for name, picture in iter_images.items()
}

### Extracting Faces from Images

In [None]:
# Detect faces in every loaded image with the Haar cascade and keep a
# 100x100 thumbnail of each detection: file name -> list of PIL images
face_images = {}

for file, pil_img in iter_images.items():
    # The PIL image is in 'RGB' mode, so the numpy array has its channels in RGB order
    cv_img = np.array(pil_img)

    # Use the RGB conversion code to match that channel order; COLOR_BGR2GRAY
    # would apply the red and blue luminance weights to the wrong channels
    gray = cv.cvtColor(cv_img, cv.COLOR_RGB2GRAY)

    # 1.3 is the scale factor and 5 the minimum number of neighbors
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)

    # Each detection is an (x, y, width, height) box; crop it out of the
    # original colour image and resize it to a 100x100 thumbnail
    face_images[file] = [
        pil_img.crop((x, y, x + w, y + h)).resize((100, 100))
        for x, y, w, h in faces
    ]

### Searching for a face using text

In [None]:
def search_faces(text):
    """Show the faces from every image whose extracted text contains `text`.

    Walks the global `images_text` mapping (file name -> extracted text). For
    each file whose text contains `text` as a plain, case-sensitive substring,
    the file name is printed and the thumbnails stored for it in the global
    `face_images` are pasted onto a contact sheet, five per row in 100-pixel
    steps, which is then displayed. If the file has no faces, a message
    saying so is printed instead.
    """
    for name, page_text in images_text.items():
        if text not in page_text:
            continue

        thumbs = face_images[name]
        if len(thumbs) == 0:
            print("Result found in file {} \nBut there were no faces in that file\n\n".format(name))
            continue

        print("Result found in file {}".format(name))

        # Five thumbnails per row; ceiling division gives the number of rows
        rows = -(-len(thumbs) // 5)
        contact_sheet = Image.new(thumbs[0].mode, (500, 100 * rows))

        # Fill the sheet left to right, top to bottom
        for idx, thumb in enumerate(thumbs):
            row, col = divmod(idx, 5)
            contact_sheet.paste(thumb, (col * 100, row * 100))

        display(contact_sheet)


In [None]:
# Show the faces from every image whose extracted text contains 'Christopher'
search_faces('Christopher')

In [None]:
# Show the faces from every image whose extracted text contains 'Mark'
search_faces('Mark')

In [None]:
# Show the faces from every image whose extracted text contains 'pizza'
search_faces('pizza')