# InceptionV3 Feature Extraction
Create "/data/dataset_features.csv"

In [1]:
# imports
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os
import cv2

import keras
import tensorflow as tf

from keras.applications import InceptionV3
from arcface import ArcFace
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# file paths
path = os.getcwd()
path = path[:-4]
data_dir = path + "/data/lfw-deepfunneled"

In [3]:
# model architecture: pre-trained InceptionV3 transfer learning
inceptionv3 = InceptionV3(include_top=True, weights='imagenet')

## Data Preprocessing $\rightarrow$ InceptionV3 Feature Embeddings

In [4]:
def getImg(name, num):
    imgPath = data_dir + '/' + name + '/' + name + '_' +  "{:04d}".format(num) + '.jpg'
    img = cv2.imread(imgPath)
    img = cv2.resize(img, (299, 299))
    return img

def getFeatures(x):
    return inceptionv3(tf.expand_dims(x, axis=0))[0].numpy()

In [5]:
people = pd.read_csv(path + "/data/people.csv")
names = [item for item in people["name"].tolist() if item in os.listdir(data_dir)] # intersection of names
people = people[people["name"].isin(names)] # filter people DataFrame to intersection

N = 6
people = people[people.images > N]

In [6]:
# restructure DataFrame
people = people.rename(columns={'images': 'num_images'})
people = people.set_index('name')

# process 250x250 image data as 299x299 image elements under respective person
image_col = []
for person in people.iterrows():
    person = person[1]
    image_list = []
    for i in range(int(person["num_images"])):
        image_path = data_dir + '/' + person.name + '/' + person.name + '_' +  "{:04d}".format(i+1) + '.jpg'
        image = cv2.imread(image_path)
        image = cv2.resize(image, (299, 299))
        image_list.append(image)
    image_col.append(image_list)
people["images"] = image_col

# num_images == len(images) for each person
assert ([len(x) for x in people["images"]] == people["num_images"].values).all()

# construct dataset DataFrame (precursor to X, y) with each row sample as the individual images
dataset = [] # element: (image, person, train or test)
for person in people.iterrows():
    person = person[1]
    image_list = person["images"]
    num_images = len(image_list)
    num_test_images = int(np.floor(num_images * 0.2))
    test_i = np.random.choice(num_images, num_test_images)
    for i in range(num_images):
        image = image_list[i]
        element = [image, person.name, "test" if i in test_i else "train"]
        dataset.append(element)
dataset = pd.DataFrame(dataset, columns=["image", "person", "split"])

In [8]:
dataset

Unnamed: 0,image,person,split
0,"[[[0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], ...",Ann_Veneman,train
1,"[[[0, 1, 1], [0, 1, 1], [0, 1, 1], [0, 1, 1], ...",Ann_Veneman,train
2,"[[[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 0, 0], ...",Ann_Veneman,train
3,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",Ann_Veneman,train
4,"[[[129, 163, 176], [129, 163, 176], [129, 163,...",Ann_Veneman,test
...,...,...,...
5090,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",Zhu_Rongji,train
5091,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",Zhu_Rongji,train
5092,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",Zhu_Rongji,test
5093,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",Zhu_Rongji,train


In [14]:
features = []
for _ , person in dataset.iterrows():
    features.append(getFeatures(person.image))
dataset["v3_features"] = features

In [15]:
dataset

Unnamed: 0,image,person,split,v3_features
0,"[[[0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], ...",Ann_Veneman,train,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[[[0, 1, 1], [0, 1, 1], [0, 1, 1], [0, 1, 1], ...",Ann_Veneman,train,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[[[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 0, 0], ...",Ann_Veneman,train,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",Ann_Veneman,train,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[[[129, 163, 176], [129, 163, 176], [129, 163,...",Ann_Veneman,test,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
5090,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",Zhu_Rongji,train,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.462..."
5091,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",Zhu_Rongji,train,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5092,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",Zhu_Rongji,test,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5093,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",Zhu_Rongji,train,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.076..."


In [16]:
dataset.to_csv(path + "/data/dataset_features.csv")