In [1]:
import cv2
import dlib

import numpy as np
import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
%matplotlib inline

import base64
import time

In [2]:
# Read just the first record (nrows=1) of the tab-separated dump, supplying the
# column names explicitly.
faces_df = pd.read_csv(
    '../data/FaceImageCroppedWithAlignment.tsv',
    sep='\t',
    names=[
        'FreebaseMID',
        'ImageSearchRank',
        'ImageURL',
        'PageURL',
        'FaceID',
        'FaceRectangle',
        'FaceData',
    ],
    nrows=1,
)

In [3]:
# Display the one-row sample frame.
faces_df

Unnamed: 0,FreebaseMID,ImageSearchRank,ImageURL,PageURL,FaceID,FaceRectangle,FaceData
0,m.0107_f,0,http://getbeatmadrid.files.wordpress.com/2013/...,http://getbeatmadrid.wordpress.com/2013/01/28/...,FaceId-0,KsQsP3Pumj2B6UE/Vj4/Pg==,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAQCAwMDAgQDAw...


In [4]:
# Destination of the stacked (face id, descriptor) array written via np.save.
FACES_DUMP_PATH = '../data/faces.npy'
# Tab-separated dataset that the processing loops read line by line.
DATASET_PATH = '../data/FaceImageCroppedWithAlignment.tsv'

In [4]:
# Locations of the dlib model files.
predictor_path = '../models/shape_predictor_68_face_landmarks.dat'
face_rec_model_path = '../models/dlib_face_recognition_resnet_model_v1.dat'

# Face-recognition model (produces the descriptors) and the landmark shape
# predictor whose output it consumes.
facerec = dlib.face_recognition_model_v1(face_rec_model_path)
shape_predictor = dlib.shape_predictor(predictor_path)

In [5]:
def _dump_descriptors(face_ids, descriptors, dump_path):
    """Save face ids and their descriptors as a single 2-D array via ``np.save``.

    Each output row is ``[face_id, d_0, ..., d_k]``: the id occupies column 0
    and the descriptor values fill the remaining columns. Because everything
    is stored in one array, ids and descriptor values share a common dtype.

    Parameters
    ----------
    face_ids : sequence
        One id per face, in the same order as ``descriptors``.
    descriptors : sequence of 1-D array-likes
        One descriptor vector per face; all vectors must have equal length.
    dump_path : str or path-like
        Destination handed to ``np.save``.

    Raises
    ------
    ValueError
        Propagated from NumPy when the number of ids and descriptors differ
        or the descriptors have inconsistent lengths.

    Notes
    -----
    When no faces have been collected there is nothing to stack
    (``np.vstack`` rejects an empty sequence), so the function returns
    without writing anything to ``dump_path``.
    """
    if len(face_ids) == 0:
        return

    ids_column = np.vstack(face_ids)
    descriptor_rows = np.vstack(descriptors)
    faces = np.hstack([ids_column, descriptor_rows])

    np.save(dump_path, faces)

In [8]:
max_num_lines = 8456240  # upper bound used for the progress-bar total
num_lines = 1000         # number of rows to process in this run
dump_freq = 100          # write a checkpoint every `dump_freq` processed rows

face_ids = []
descriptors = []
times = []  # seconds spent in compute_face_descriptor, one entry per row

with open(DATASET_PATH, 'r') as tsv_f:
    for i, row_str in tqdm(enumerate(tsv_f), total=min(num_lines, max_num_lines)):
        if i >= num_lines:
            break

        # Strip only the line terminator: slicing off the last character would
        # drop a real base64 character on a final line without a newline.
        row = row_str.rstrip('\n').split('\t')
        face_id = int(row[4].split('FaceId-')[-1])

        # Decode the image in memory rather than round-tripping through a
        # temporary file on disk.
        encoded = np.frombuffer(base64.b64decode(row[-1]), dtype=np.uint8)
        bgr = cv2.imdecode(encoded, cv2.IMREAD_COLOR) if encoded.size else None
        if bgr is None:
            # Fail loudly instead of skipping: the rows of the dump are expected
            # to stay aligned with the rows of the dataset.
            raise ValueError(
                'Row {}: could not decode image data for FaceId-{}'.format(i, face_id)
            )
        # OpenCV decodes to BGR; cvtColor yields a contiguous RGB array
        # (the same pixel values as reversing the channel axis).
        img = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

        # The images are already cropped, so the whole frame is used as the face box.
        shape = shape_predictor(img, dlib.rectangle(0, 0, img.shape[1], img.shape[0]))

        # Time only the descriptor computation itself.
        start_time = time.perf_counter()
        face_descriptor = facerec.compute_face_descriptor(img, shape)
        end_time = time.perf_counter()
        times.append(end_time - start_time)

        # Append id and descriptor together so both lists stay the same length.
        face_ids.append(face_id)
        descriptors.append(np.array(face_descriptor))

        # Checkpoint after every `dump_freq` completed rows.
        if (i + 1) % dump_freq == 0:
            _dump_descriptors(face_ids, descriptors, FACES_DUMP_PATH)

# Final dump so rows processed after the last checkpoint are saved as well.
if face_ids:
    _dump_descriptors(face_ids, descriptors, FACES_DUMP_PATH)

 99%|█████████▉| 993/1000 [00:09<00:00, 127.92it/s]


In [12]:
# Rough extrapolation to all 8456240 rows, divided by 3600.
# NOTE(review): 5994 and 52 look like a measured row count and the elapsed
# seconds for it, which would make the result an estimate in hours — confirm.
(8456240 / 5994 * 52) / 3600

20.37799280762244

In [9]:
# Average of the per-row descriptor timings collected in `times`.
np.mean(times)

0.00441951060295105

In [10]:
# Sanity check: shape of the saved dump (rows x columns).
np.load('../data/faces.npy').shape

(1000, 129)

In [None]:
max_num_lines = 8456240
# Far above max_num_lines, so min(...) below resolves to max_num_lines and the
# row limit does not cut the pass short.
num_lines = 10000000000000

face_ids = []
m_ids = []

with open(DATASET_PATH, 'r') as tsv_f:
    for i, row_str in tqdm(enumerate(tsv_f), total=min(num_lines, max_num_lines)):
        if i >= num_lines:
            break

        # Only the two id columns are needed in this pass, so the base64 image
        # payload in the last column is deliberately left undecoded.
        row = row_str.split('\t')
        mid, face_id = row[0], row[4]
        face_ids.append(face_id)
        m_ids.append(mid)

m_ids = np.array(m_ids)
# NOTE(review): a later cell loads '../data/m_ids.npy'. The save is left disabled
# as found; confirm the array shape that cell expects before re-enabling it.
# np.save('../data/m_ids.npy', m_ids)

In [5]:
# Load the previously saved arrays: the ids first, then the descriptor dump.
m_ids = np.load('../data/m_ids.npy')
all_descrs = np.load('../data/faces.npy')

In [6]:
# Replace every id in `m_ids` with its position in the sorted array of unique
# ids. `return_inverse` computes those positions in one vectorised call (no
# per-row Python lambda) and works whether `m_ids` is 1-D or an (N, 1) column.
unique_ids, mid_indices = np.unique(m_ids, return_inverse=True)

# Lookup table from original id to integer index.
uids_map = {uid: i for i, uid in enumerate(unique_ids)}

# Store the indices as an (N, 1) column so they can be stacked next to the
# descriptor columns.
m_ids = mid_indices.reshape(-1, 1)

In [17]:
# Swap the first column of the descriptor dump for the integer ids in `m_ids`,
# keeping all remaining columns unchanged.
descriptor_columns = all_descrs[:, 1:]
descrs = np.concatenate((m_ids, descriptor_columns), axis=1)

In [20]:
# Persist the relabelled array to a separate file from the original dump.
np.save('../data/faces_.npy', descrs)