In [None]:
import os
import cv2
import h5py
import zipfile
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
from glob import glob
from random import choice, sample
from tqdm import tqdm
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras.layers import Input, Dense, GlobalMaxPool2D, GlobalAvgPool2D, Concatenate, Multiply, Dropout, Subtract
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing import image

In [None]:
# Show the entries of ../input so the available datasets can be checked by eye.
input_entries = os.listdir("../input")
print(input_entries)

In [None]:
# Install keras-vggface from GitHub; the next cell imports VGGFace and preprocess_input from it.
# NOTE(review): no branch, tag or commit is given in the URL, so the installed code can
# change between runs — consider pinning a specific revision for reproducibility.
!pip install git+https://github.com/rcmalli/keras-vggface.git

In [None]:
from keras_vggface.vggface import VGGFace
from keras_vggface.utils import preprocess_input

In [None]:
# Unpack both competition archives into the working directory so the images
# can be browsed and globbed as ordinary files.
for archive_path, target_dir in (
    ("../input/recognizing-faces-in-the-wild/train.zip", "./train"),
    ("../input/recognizing-faces-in-the-wild/test.zip", "./test"),
):
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(target_dir)

# CSV of labelled person pairs (its p1/p2 columns are read further down).
train_relationships_file_path = "../input/recognizing-faces-in-the-wild/train_relationships.csv"
# Root folder of the extracted training images.
training_data_folders_path = "./train/"
# Paths/keys containing this marker are used as the validation split.
val_families = "F00"

In [None]:
# Every training image; the glob pattern expects <root>/<family>/<person>/<file>.jpg.
all_images = glob(training_data_folders_path + "*/*/*.jpg")


def _person_key(image_path):
    """Return the "<family>/<person>" key taken from the two folders above the image file."""
    parts = image_path.split("/")
    return parts[-3] + "/" + parts[-2]


# Hold the validation families out of the training images. Previously the
# training list was simply `all_images`, so the training generator could draw
# its negative pairs from validation people and the validation score was
# measured on faces the network had already seen.
training_data_images = [x for x in all_images if val_families not in x]
validation_set_images = [x for x in all_images if val_families in x]

# One "<family>/<person>" entry per image (duplicates included); used later to
# drop relationship rows that reference people without any image on disk.
ppl = [_person_key(x) for x in all_images]

# person key -> list of that person's image paths, for the training split.
train_person_to_images_map = defaultdict(list)
for x in training_data_images:
    train_person_to_images_map[_person_key(x)].append(x)

# Same mapping for the validation split.
val_person_to_images_map = defaultdict(list)
for x in validation_set_images:
    val_person_to_images_map[_person_key(x)].append(x)

In [None]:
# Load the labelled pairs as (p1, p2) tuples of "<family>/<person>" keys.
relationships_df = pd.read_csv(train_relationships_file_path)
relationships = list(zip(relationships_df.p1.values, relationships_df.p2.values))

# Drop rows that reference a person with no image on disk. `ppl` holds one
# entry per image, so it is collapsed into a set once to make every membership
# test O(1) instead of a scan over the whole list.
known_people = set(ppl)
relationships = [x for x in relationships if x[0] in known_people and x[1] in known_people]

# Split by family marker: pairs whose first person belongs to a validation
# family go to validation, everything else to training.
train_relationships = [x for x in relationships if val_families not in x[0]]
val_relationships = [x for x in relationships if val_families in x[0]]

In [None]:
def read_img(path):
    """Load one image file and return it as a preprocessed float array.

    Args:
        path: location of the image file.

    Returns:
        The result of ``preprocess_input(..., version=2)`` applied to the
        image loaded at 224x224 and converted to a float64 NumPy array.
    """
    img = image.load_img(path, target_size=(224, 224))
    # np.float was only an alias of the builtin float and has been removed from
    # NumPy (1.24+); np.float64 is the same dtype and works on every version.
    img = np.array(img).astype(np.float64)
    # NOTE(review): version=2 is presumably the keras-vggface preprocessing
    # variant matching the resnet50 backbone used below — confirm in its docs.
    return preprocess_input(img, version=2)

def gen(list_tuples, person_to_images_map, batch_size=16):
    """Endlessly yield batches of image pairs with binary labels.

    Every batch starts with ``batch_size // 2`` pairs sampled from
    ``list_tuples`` (label 1) and is topped up to ``batch_size`` with random
    pairs of two different people that do not appear in ``list_tuples`` in
    either order (label 0). One random image is then picked per person.

    Args:
        list_tuples: sequence of ``(person_a, person_b)`` pairs labelled 1.
        person_to_images_map: mapping from person key to a list of image paths.
        batch_size: number of pairs per batch.

    Yields:
        ``([X1, X2], labels)`` where ``X1``/``X2`` are arrays stacking the
        ``read_img`` output for the first/second person of each pair and
        ``labels`` is a NumPy array of 1s followed by 0s.

    Raises:
        ValueError: if fewer than two people are available, if a chosen person
            has no images, or (from ``random.sample``) if ``list_tuples`` holds
            fewer than ``batch_size // 2`` pairs.
    """
    ppl = list(person_to_images_map.keys())
    # With fewer than two people the negative-sampling loop below could never
    # find a pair of distinct people and would spin forever.
    if len(ppl) < 2:
        raise ValueError("gen() needs at least two people to build unrelated pairs")

    # Set of known pairs: O(1) membership checks inside the sampling loop
    # instead of scanning the whole list for every candidate.
    related_pairs = {tuple(pair) for pair in list_tuples}

    while True:
        batch_tuples = sample(list_tuples, batch_size // 2)
        labels = [1] * len(batch_tuples)
        while len(batch_tuples) < batch_size:
            p1 = choice(ppl)
            p2 = choice(ppl)

            if p1 != p2 and (p1, p2) not in related_pairs and (p2, p1) not in related_pairs:
                batch_tuples.append((p1, p2))
                labels.append(0)

        # Fail with a clear message instead of printing the key and then
        # crashing inside choice() on an empty list.
        for pair in batch_tuples:
            for person in (pair[0], pair[1]):
                if not person_to_images_map.get(person):
                    raise ValueError("no images available for person %r" % (person,))

        X1 = [choice(person_to_images_map[pair[0]]) for pair in batch_tuples]
        X1 = np.array([read_img(path) for path in X1])

        X2 = [choice(person_to_images_map[pair[1]]) for pair in batch_tuples]
        X2 = np.array([read_img(path) for path in X2])

        # Labels go out as an array so the targets have a well-defined shape
        # rather than being a list of Python ints.
        yield [X1, X2], np.array(labels)


def baseline_model():
    """Build, compile and return the two-input pair classifier.

    Both 224x224x3 inputs are passed through the same VGGFace resnet50 base
    (created with ``include_top=False``). Each output is reduced by global max
    and global average pooling, the two embeddings are combined element-wise
    in three ways, and a small dense head with a sigmoid unit produces the
    output. The model is compiled with binary cross-entropy and the ``'acc'``
    metric, and its summary is printed before it is returned.

    Returns:
        The compiled ``Model`` taking ``[input_1, input_2]``.
    """
    input_1 = Input(shape=(224, 224, 3))
    input_2 = Input(shape=(224, 224, 3))

    base_model = VGGFace(model='resnet50', include_top=False)  

    # Explicitly mark every layer except the last three as trainable; the last
    # three keep whatever trainable flag the library gave them.
    for x in base_model.layers[:-3]:
        x.trainable = True

    # The same base_model instance is applied to both inputs.
    x1 = base_model(input_1)
    x2 = base_model(input_2)

    # Per-image embedding: global max pooling and global average pooling,
    # concatenated along the last axis.
    x1 = Concatenate(axis=-1)([GlobalMaxPool2D()(x1), GlobalAvgPool2D()(x1)])
    x2 = Concatenate(axis=-1)([GlobalMaxPool2D()(x2), GlobalAvgPool2D()(x2)])

    # x3 = (x1 - x2)^2, the element-wise squared difference.
    x3 = Subtract()([x1, x2])
    x3 = Multiply()([x3, x3])

    # x4 = x1^2 - x2^2, the element-wise difference of squares.
    x1_ = Multiply()([x1, x1])
    x2_ = Multiply()([x2, x2])
    x4  = Subtract()([x1_, x2_])
    
    # x5 = x1 * x2, the element-wise product.
    x5 = Multiply()([x1, x2])
    
    # All three pairwise features side by side feed the classifier head.
    x = Concatenate(axis=-1)([x4, x3, x5])

    # Head: 100 ReLU units, dropout with rate 0.01, then a single sigmoid unit.
    x = Dense(100, activation="relu")(x)
    x = Dropout(0.01)(x)
    out = Dense(1, activation="sigmoid")(x)

    model = Model([input_1, input_2], out)

    # Adam receives 0.00001 as its first positional argument
    # (presumably the learning rate — confirm against the installed Keras).
    model.compile(loss="binary_crossentropy", metrics=['acc'], optimizer=Adam(0.00001))

    model.summary()

    return model

In [None]:
file_path = "./vgg_face.h5"

# Save to file_path whenever the monitored validation accuracy reaches a new
# maximum (save_best_only + mode='max'); verbose=1 logs each save.
checkpoint = ModelCheckpoint(file_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

# Multiply the learning rate by `factor` when validation accuracy has not
# improved on its best value for `patience` epochs.
reduce_on_plateau = ReduceLROnPlateau(monitor="val_acc", mode="max", factor=0.1, patience=20, verbose=1)

# Stop training when validation loss has not improved (decreased) for
# `patience` epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)

callbacks_list = [checkpoint, reduce_on_plateau, es]

curr_model = baseline_model()  # build and compile the network defined above
# curr_model.load_weights(file_path)

# gen() is a plain Python generator driven by the `random` module. Running it
# with use_multiprocessing=True and several workers gives every worker process
# its own copy of the generator, so the workers can emit duplicated batches
# (Keras warns about exactly this and recommends keras.utils.Sequence instead).
# A single in-process worker keeps every batch an independent draw.
curr_model_hist = curr_model.fit_generator(
    gen(train_relationships, train_person_to_images_map, batch_size=16),
    validation_data=gen(val_relationships, val_person_to_images_map, batch_size=16),
    epochs=120,
    steps_per_epoch=200,
    validation_steps=10,
    callbacks=callbacks_list,
    verbose=2,
    workers=1,
    use_multiprocessing=False,
)

In [None]:
import matplotlib.pyplot as plt
def plot_accuracy(y):
    """Plot training vs validation accuracy per epoch when ``y == True``.

    The 'acc' and 'val_acc' series are read from the module-level
    ``curr_model_hist.history``. For any other value of ``y`` nothing is drawn.
    """
    # Equality rather than truthiness: only values equal to True enable the plot.
    if not (y == True):
        return None

    history = curr_model_hist.history
    for series_name in ('acc', 'val_acc'):
        plt.plot(history[series_name])
    plt.legend(['train', 'validation'], loc='lower right')
    plt.title('accuracy plot - train vs validation')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.show()
    return None

def plot_loss(y):
    """Plot training vs validation loss per epoch when ``y == True``.

    The 'loss' and 'val_loss' series are read from the module-level
    ``curr_model_hist.history``. For any other value of ``y`` nothing is drawn.
    """
    # Equality rather than truthiness: only values equal to True enable the plot.
    if not (y == True):
        return None

    history = curr_model_hist.history
    for series_name in ('loss', 'val_loss'):
        plt.plot(history[series_name])
    plt.legend(['training loss', 'validation loss'], loc = 'upper right')
    # Title previously read "vaidation"; fixed the typo shown on the figure.
    plt.title('loss plot - training vs validation')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
    return None


# Draw the accuracy curve first, then the loss curve, for the finished run.
for draw_curve in (plot_accuracy, plot_loss):
    draw_curve(True)

In [None]:
# test_path = "./test/"

# def chunker(seq, size=32):
#     return (seq[pos:pos + size] for pos in range(0, len(seq), size))

# submission = pd.read_csv( '../input/recognizing-faces-in-the-wild/sample_submission.csv')

# predictions = []

# for batch in tqdm(chunker(submission.img_pair.values)):
#     X1 = [x.split("-")[0] for x in batch]
#     X1 = np.array([read_img(test_path + x) for x in X1])

#     X2 = [x.split("-")[1] for x in batch]
#     X2 = np.array([read_img(test_path + x) for x in X2])
    
#     prediction = 0
#     for i in ['00','01','02','03','04','05','06','07','08','09']:
#         curr_model.load_weights('../input/valaccall/Val_acc_f'+ i +'.h5')
#         prediction = prediction + curr_model.predict([X1, X2])
#     for i in ['00','01','02','03','04','05','06','07','08','09']:
#         curr_model.load_weights('../input/vjmodels/F'+ i +'_val_loss.h5')
#         prediction = prediction + curr_model.predict([X1, X2])
#     prediction = prediction/20
#     pred = prediction.ravel().tolist()
#     predictions += pred

# submission['is_related'] = predictions

# submission.to_csv("CSVFinal.csv", index=False)