In [1]:
import scipy.io
import numpy as np

from datetime import datetime, timedelta
import time

import tensorflow as tf

import keras
from keras.preprocessing import image
from keras.callbacks import ModelCheckpoint,EarlyStopping
from keras.layers import Dense, Activation, Dropout, Flatten, Input, Convolution2D, ZeroPadding2D, MaxPooling2D, Activation
from keras.layers import Conv2D, AveragePooling2D
from keras.models import Model, Sequential
from keras import metrics
from keras.models import model_from_json

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

import cv2

import pandas as pd
"""import ray
ray.init(plasma_directory="/workspaces/96273/temp")
import modin.pandas as pd"""

'import ray\nray.init(plasma_directory="/workspaces/96273/temp")\nimport modin.pandas as pd'

In [2]:
# If the machine has several GPUs, this cell keeps the notebook from claiming
# all of them and from reserving all GPU memory up front.

import os
# Expose only the CUDA device with index 1 to this process.
# NOTE(review): tensorflow is already imported in the first cell; confirm the
# variable is still honoured when it is set after that import.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

import tensorflow as tf
from keras import backend as K
# TF1 spelling, kept for reference: config = tf.ConfigProto()
config = tf.compat.v1.ConfigProto()
# Grow GPU memory usage on demand instead of allocating everything at start-up.
config.gpu_options.allow_growth = True
# TF1 spelling, kept for reference: session = tf.Session(config=config)
session = tf.compat.v1.Session(config=config)
# Make Keras use the session configured above.
K.set_session(session)

In [3]:
#VGG-Face for face recognition: https://sefiks.com/2018/08/06/deep-face-recognition-with-keras/

def loadVggFaceModel():
    """Build the VGG-Face network and return it as a descriptor extractor.

    The network takes 224x224x3 inputs and consists of five stages of
    zero-padded 3x3 ReLU convolutions, each stage followed by 2x2 max
    pooling, and a convolutional head (7x7 and 1x1 convolutions with
    dropout) ending in Flatten + softmax.

    Returns
    -------
    keras.models.Model
        A model mapping the input image to the output of the second-to-last
        layer (the Flatten layer, i.e. the 2622 values before the softmax).
        No weights are loaded here.
    """
    # (number of filters, number of 3x3 convolutions) for each stage
    stages = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

    model = Sequential()
    for stage_index, (n_filters, n_convs) in enumerate(stages):
        for conv_index in range(n_convs):
            # only the very first layer declares the input shape
            is_first_layer = stage_index == 0 and conv_index == 0
            extra = {'input_shape': (224, 224, 3)} if is_first_layer else {}
            model.add(ZeroPadding2D((1, 1), **extra))
            model.add(Convolution2D(n_filters, (3, 3), activation='relu'))
        model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    # convolutional replacement of the fully connected classifier head
    head_layers = [
        Convolution2D(4096, (7, 7), activation='relu'),
        Dropout(0.5),
        Convolution2D(4096, (1, 1), activation='relu'),
        Dropout(0.5),
        Convolution2D(2622, (1, 1)),
        Flatten(),
        Activation('softmax'),
    ]
    for layer in head_layers:
        model.add(layer)

    # cut the network just before the softmax to obtain the face descriptor
    return Model(inputs=model.layers[0].input, outputs=model.layers[-2].output)

In [4]:
# Build the descriptor network; its weights are loaded in the next cell.
model = loadVggFaceModel()

In [6]:
# Pretrained VGG-Face weights can be downloaded from
# https://drive.google.com/file/d/1CPSeum3HpopfomUEK1gybeuIVoeJT_Eo/view?usp=sharing
# and must be available as 'vgg_face_weights.h5' in the working directory.
# NOTE(review): model_from_json is already imported in the first cell and is
# not used by this cell.
from keras.models import model_from_json
model.load_weights('vgg_face_weights.h5')

In [7]:
# OpenCV Haar-cascade face detector, used later to locate the face in the
# query photo. The XML file is looked up relative to the working directory.
# NOTE(review): presumably a missing XML file does not fail here but only when
# the detector is first used - confirm, e.g. via face_cascade.empty().
face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')

# Read meta data

In [9]:
# IMDB-WIKI data set reference: https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/
# Cropped IMDB faces archive: https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/static/imdb_crop.tar
# The archive has to be extracted into ./imdb_crop before running this cell;
# otherwise loadmat raises FileNotFoundError.
mat = scipy.io.loadmat('imdb_crop/imdb.mat')

FileNotFoundError: [Errno 2] No such file or directory: 'imdb_crop/imdb.mat'

In [None]:
# Column names for the metadata frame. They are matched by position with the
# fields of mat['imdb'][0][0] when the frame is filled in below.
columns = ["dob", "photo_taken", "full_path", "gender", "name", "face_location", "face_score", "second_face_score", "celeb_names", "celeb_id"]

In [None]:
# Length of the second axis of the first metadata field; used as the number of
# rows of the frame. Presumably one entry per image - TODO confirm.
instances = mat['imdb'][0][0][0].shape[1]

In [None]:
# Empty frame with one row per instance; the columns are filled in below.
df = pd.DataFrame(index = range(0,instances), columns = columns)

In [None]:
df.shape

In [None]:
# Copy every field of the 'imdb' record into the column of df that has the
# same position in `columns`.
if "imdb" in mat:
    current_array = mat["imdb"][0][0]
    for j in range(len(current_array)):
        field_values = current_array[j][0]
        #print(j,". ",columns[j],": ",field_values)
        df[columns[j]] = pd.DataFrame(field_values)

In [None]:
df.head()

In [None]:
#remove pictures does not include face
df = df[df['face_score'] != -np.inf]

#some pictures include more than one face, remove them
df = df[df['second_face_score'].isna()]

#check threshold
df = df[df['face_score'] >= 3]

In [None]:
df.shape #95234

In [None]:
def extractNames(name):
    """Return the first element of ``name``.

    Used to unwrap the single-element containers stored in the 'name' column.
    """
    first_entry = name[0]
    return first_entry

In [None]:
# Unwrap every entry of the 'name' column (first element) into a new column.
df['celebrity_name'] = df['name'].apply(extractNames)

In [None]:
df.shape

In [None]:
#df = df.drop(columns = ['dob', 'photo_taken', 'face_location', 'face_score', 'second_face_score'])

# Load data set images

In [None]:
def getImagePixels(image_path):
    """Read an image of the cropped IMDB data set with OpenCV.

    ``image_path`` is a container whose first element is the file path
    relative to the ``imdb_crop`` directory. Returns whatever ``cv2.imread``
    returns for that file.
    """
    relative_path = image_path[0]
    image_file = "imdb_crop/%s" % relative_path
    return cv2.imread(image_file) #pixel values in scale of 0-255

In [None]:
# Read every image referenced in df['full_path'] from disk into a new 'pixels'
# column and report the wall-clock time this took.
tic = time.time()
df['pixels'] = df['full_path'].apply(getImagePixels)
toc = time.time()

print("this block completed in ",toc-tic," seconds...") #562.80 seconds

# Represent images as vectors

In [None]:
def findFaceRepresentation(img):
    """Return the VGG-Face descriptor of an image, or None if it cannot be computed.

    The image is used as-is: no face detection or cropping happens here (a
    Haar-cascade cropping step used to sit in this function but was disabled).
    The image is resized to 224x224, rescaled by ``x / 127.5 - 1`` and passed
    through the global ``model``.

    Parameters
    ----------
    img : array-like or None
        Image pixels as returned by ``cv2.imread``; None when the file could
        not be read.

    Returns
    -------
    numpy.ndarray or None
        First row of ``model.predict`` for the single-image batch, or None
        when ``img`` is None or preprocessing/prediction fails.
    """
    if img is None:
        return None

    try:
        detected_face = cv2.resize(img, (224, 224))

        #normalize detected face in scale of -1, +1
        img_pixels = image.img_to_array(detected_face)
        img_pixels = np.expand_dims(img_pixels, axis = 0)
        img_pixels /= 127.5
        img_pixels -= 1

        representation = model.predict(img_pixels)[0,:]
    except Exception:
        # Best effort: one unusable image must not abort the whole apply().
        # Only Exception is caught so that KeyboardInterrupt (kernel
        # interrupt) still stops a long-running run.
        representation = None

    return representation

In [None]:
# Compute the descriptor of every loaded image (None where that fails) and
# report the wall-clock time this took.
tic = time.time()
df['face_vector_raw'] = df['pixels'].apply(findFaceRepresentation) #vector for raw image
toc = time.time()

print("this block completed in ",toc-tic," seconds...")

# Load Your Photo

In [None]:
# Load the query photo, locate the face and crop it (with a margin) to the
# 224x224 input size of the network. Result: `detected_face`.
img = cv2.imread("sefik.jpg") #pixel values in scale of 0-255
#img = cv2.imread("sefik_2.jpg") #pixel values in scale of 0-255

if img is None:
    # cv2.imread reports a missing/unreadable file by returning None
    raise FileNotFoundError("could not read image file 'sefik.jpg'")

plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

faces = face_cascade.detectMultiScale(img, 1.3, 5)

if len(faces) == 0:
    # without this check `detected_face` would simply stay undefined
    raise ValueError("no face detected in 'sefik.jpg'")

img_height, img_width = img.shape[:2]
margin = 10 #percentage of the detected box that is added on every side

# if several faces are found, the last one wins
for (x,y,w,h) in faces:
    #cv2.rectangle(img,(x,y),(x+w,y+h),(128,128,128),cv2.FILLED)
    #plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    margin_x = int((w * margin)/100); margin_y = int((h * margin)/100)

    # Clamp the enlarged box to the image. Slicing never raises for
    # out-of-range bounds; a negative start index would instead wrap around
    # and give an empty or wrong crop.
    top = max(0, int(y - margin_y))
    bottom = min(img_height, int(y + h + margin_y))
    left = max(0, int(x - margin_x))
    right = min(img_width, int(x + w + margin_x))

    detected_face = img[top:bottom, left:right]
    detected_face = cv2.resize(detected_face, (224, 224))

#plt.imshow(detected_face)
#plt.imshow(cv2.cvtColor(detected_face, cv2.COLOR_BGR2RGB))

In [None]:
# Same preprocessing as in findFaceRepresentation: add a batch axis in front
# and rescale the pixels by x / 127.5 - 1 (0-255 becomes -1..+1).
img_pixels = image.img_to_array(detected_face)
img_pixels = np.expand_dims(img_pixels, axis = 0)
img_pixels /= 127.5
img_pixels -= 1

In [None]:
# Descriptor of the query photo: first (only) row of the prediction batch.
yourself_representation = model.predict(img_pixels)[0,:]

# Find Similarities

In [None]:
def findCosineSimilarity(source_representation, test_representation=None):
    """Return the cosine distance (1 - cosine similarity) of two vectors.

    Small values mean similar vectors: 0 for identical direction, 1 for
    orthogonal vectors, 2 for opposite direction.

    Parameters
    ----------
    source_representation : array-like or None
        Vector to compare, e.g. one entry of ``df['face_vector_raw']``.
    test_representation : array-like, optional
        Vector to compare against. When omitted, the current value of the
        global ``yourself_representation`` is used. It is looked up on every
        call, so recomputing the query vector takes effect without redefining
        this function.

    Returns
    -------
    float or int
        The cosine distance, or the sentinel 10 (larger than any real
        distance) when it cannot be computed: a None vector, incompatible
        shapes, or a zero-length vector.
    """
    if test_representation is None:
        test_representation = yourself_representation

    if source_representation is None:
        return 10 #assign a large value. similar faces will have small value.

    try:
        a = np.matmul(np.transpose(source_representation), test_representation)
        b = np.sum(np.multiply(source_representation, source_representation))
        c = np.sum(np.multiply(test_representation, test_representation))
        denominator = np.sqrt(b) * np.sqrt(c)
        if denominator == 0:
            # the cosine is undefined for a zero vector; avoid returning NaN
            return 10
        return 1 - (a / denominator)
    except Exception:
        # best effort: rows without a usable vector are pushed to the end of the ranking
        return 10

In [None]:
# Cosine distance of every data set face to the query photo (smaller = more
# similar; 10 marks rows whose distance could not be computed).
df['similarity'] = df['face_vector_raw'].apply(findCosineSimilarity)

In [None]:
# Best matches first: ascending order because 'similarity' holds a distance.
df = df.sort_values(by=['similarity'], ascending=True)

In [None]:
"""x = df.iloc[0]['pixels'].reshape(224, 224, 3)/255
plt.imshow(x)"""

In [None]:
df.head(1)

In [None]:
#df = df.reset_index()

In [None]:
#this block might show different pictures of same actors
if True:
    for i in range(0, 7):
        instance = df.iloc[i]
        name = instance['celebrity_name']
        similarity = instance['similarity']
        
        #img = instance['pixels']
        full_path = instance['full_path'][0]
        img = cv2.imread("imdb_crop/%s" % full_path)
        
        print(i,".",name," (",similarity,") - ",full_path)

        plt.axis('off')
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.show()

        print("-------------------------")

In [None]:
# One row per celebrity: df is sorted by distance, so the first (kept) row of
# every celebrity is the closest one.
pivot_df = df.drop_duplicates(subset ="celebrity_name")

# only pictures taken in 2000 or later
is_recent = pivot_df['photo_taken'] >= 2000

#0: woman, 1: man. if you know the gender of the target image, filtering the data set by it speeds things up
is_male = pivot_df['gender'] == 1

pivot_df = pivot_df[is_recent & is_male].reset_index()

In [None]:
# Show the 4 closest celebrities. The range is bounded by the number of rows
# so that a short frame shows fewer pictures instead of raising IndexError.
for i in range(0, min(4, len(pivot_df))):
    instance = pivot_df.iloc[i]
    name = instance['celebrity_name']
    similarity = instance['similarity']

    # turn the cosine distance into a similarity percentage
    similarity = (1 - similarity)*100

    # re-read the picture from disk instead of using instance['pixels']
    full_path = instance['full_path'][0]
    img = cv2.imread("imdb_crop/%s" % full_path)

    print(name," (",similarity,"%) - ",full_path)

    plt.axis('off')
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.show()

    print("-------------------------")

In [None]:
#you can check a specific celebrity
target = pivot_df[pivot_df['celebrity_name'] == 'Jim Parsons']

for index, instance in target.iterrows():
    name = instance['celebrity_name']
    similarity = instance['similarity']
    full_path = instance['full_path'][0]

    similarity = (1 - similarity)*100
    
    print(index,". ", name," (",similarity,") - ",full_path)

    img = cv2.imread("imdb_crop/%s" % full_path)
    
    plt.axis('off')
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.show()