# Data preparation

In this notebook, we will prepare the data for training our face recognition model.

We split the data into two sets:
* Training set: first 600 images
* Validation set: last 400 images

 ## Loading packages

In [29]:
import sys
import numpy as np

import matplotlib.pyplot as plt 
from skimage import io, util, color, transform

from statistics import mode

sys.path.append('../src')
import data_utils

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Generating the data

### Generating the positive data

We crop the faces from the training-set images and save them.

In [7]:
# Load the face annotations. Later cells read each row as
# (image index, o_h, o_l, h, l): a 1-based image number followed by the
# row offset, column offset, height and width of the face box.
label = np.loadtxt('../data/project_train/label.txt')

In [10]:
# Load every original image as a grayscale float array.
# Files are named 0001.jpg ... <n_total>.jpg.
n_total = 1000
img_raw = []
for number in range(1, n_total + 1):
    path = "../data/project_train/train/" + "%04d" % number + ".jpg"
    gray = color.rgb2gray(io.imread(path))
    img_raw.append(util.img_as_float(gray))

In [49]:
# Crop every annotated face and route it to the training or validation list.
img_target = []
img_train = []
img_validation = []
for row in label:
    # Row layout: image index (1-based), row offset, column offset, height, width
    idx, o_h, o_l, h, l = (int(value) for value in row[:5])
    sub_img = img_raw[idx - 1][o_h:o_h + h, o_l:o_l + l]
    img_target.append(sub_img)
    # Faces from images 1-600 go to training, the remaining ones to validation
    destination = img_train if idx <= 600 else img_validation
    destination.append(sub_img)

In [50]:
# Average height/width ratio over all annotated face boxes
aspect_ratios = label[:, 3] / label[:, 4]
mean_hl = aspect_ratios.mean()
mean_hl

1.519537040270022

In [51]:
# Fixed height/width ratio used for the detector box and the negative crops
# (presumably rounded from the mean ratio of ~1.52 computed above).
ratio_HL = 1.5

In [52]:
# Derive the fixed detector-box size: width from the most frequent
# annotated face width (plus one), height from the fixed aspect ratio.
most_common_l = mode(label[:, 4])
l_fixed = int(most_common_l) + 1
h_fixed = int(l_fixed * ratio_HL)
print('{}, {}'.format(l_fixed, h_fixed))

80, 120


In [53]:
# Resize each training face crop to the fixed detector-box size
detector_shape = (h_fixed, l_fixed)
data_train_pos = [transform.resize(face, detector_shape) for face in img_train]

In [54]:
# Convert the list of resized crops into one array and display its shape
data_train_pos: np.ndarray = np.array(data_train_pos)
data_train_pos.shape

(771, 120, 80)

In [55]:
# Persist the positive training examples (np.save appends the ".npy" extension)
np.save('../data/output/data_set/data_train_pos', data_train_pos)

In [56]:
# Resize each validation face crop to the fixed detector-box size
detector_shape = (h_fixed, l_fixed)
data_validation_pos = [transform.resize(face, detector_shape) for face in img_validation]

In [57]:
# Convert the list of resized crops into one array and display its shape
data_validation_pos: np.ndarray = np.array(data_validation_pos)
data_validation_pos.shape

(513, 120, 80)

In [58]:
# Persist the positive validation examples (np.save appends the ".npy" extension)
np.save('../data/output/data_set/data_validation_pos', data_validation_pos)

### Generating the negative data

We randomly crop windows from each training image and resize them to the size of the detector box. If the overlap between a cropped window and any of the face boxes is greater than 0.1 (where 1 means the face is completely covered), the window is rejected and is not used as a negative example.

In [41]:
# Sample negative (non-face) windows from the training images.
#
# For every training image we draw random windows with the detector aspect
# ratio, keep those whose overlap with every annotated face is at most
# MAX_OVERLAP, and resize them to the detector-box size.
#
# Changes compared with a naive sampler:
#   * the window origin is drawn so that the window lies fully inside the
#     image (otherwise the slice is silently truncated and the stretched
#     crop no longer matches the box used for the overlap test);
#   * a seeded generator makes the saved data set reproducible;
#   * the number of attempts per image is capped so that an image on which
#     no valid window exists raises instead of looping forever.
NEG_SEED = 0            # fixed seed so the negative set is reproducible
N_TRAIN_IMAGES = 600    # images 1..600 form the training split
N_NEG_PER_IMAGE = 10    # negatives kept per image
MAX_OVERLAP = 0.1       # reject windows overlapping a face by more than this
MAX_ATTEMPTS = 10000    # per-image cap on candidate windows

rng = np.random.RandomState(NEG_SEED)
data_train_neg = []
for i, image in enumerate(img_raw[:N_TRAIN_IMAGES]):
    H, L = image.shape[:2]
    # Face boxes annotated on this image (image indices in `label` are 1-based)
    exemples_pos = label[label[:, 0] == i + 1]
    # Window height is drawn between H/5 and H/2 (upper bound exclusive);
    # these bounds still need to be fine-tuned.
    h_min = int(H / 5)
    h_max = max(int(H / 2), h_min + 1)
    nb_img_neg = 0
    nb_attempts = 0
    while nb_img_neg < N_NEG_PER_IMAGE:
        nb_attempts += 1
        if nb_attempts > MAX_ATTEMPTS:
            raise RuntimeError(
                'could not sample {} negative windows from image {} '
                'after {} attempts'.format(N_NEG_PER_IMAGE, i + 1, MAX_ATTEMPTS)
            )
        h = int(rng.randint(h_min, h_max))
        l = int(h / ratio_HL)
        if h < 1 or l < 1 or l > L:
            # Degenerate window, or wider than the image: draw again
            continue
        # Draw the top-left corner so that the window stays inside the image
        position_h = int(rng.randint(0, H - h + 1))
        position_l = int(rng.randint(0, L - l + 1))
        img_atr = np.array([position_h, position_l, h, l])
        is_pos = any(
            data_utils.calcul_aire_recouvrement(face[1:5], img_atr) > MAX_OVERLAP
            for face in exemples_pos
        )
        if is_pos:
            continue
        img_neg = image[position_h: position_h + h, position_l: position_l + l]
        img_neg = transform.resize(img_neg, (h_fixed, l_fixed))
        data_train_neg.append(img_neg)
        nb_img_neg += 1

In [42]:
# Convert the list of resized negative crops into one array
data_train_neg: np.ndarray = np.array(data_train_neg)

In [43]:
# Display the array shape as a sanity check on the number of negatives
data_train_neg.shape

(6000, 120, 80)

In [44]:
# Persist the negative training examples (np.save appends the ".npy" extension)
np.save('../data/output/data_set/data_train_neg', data_train_neg)