In [6]:
import os
import cv2
import dlib
import numpy as np
from tqdm import tqdm

import sys
sys.path.append('../../MFRS')

from utils.config import proj_path, data_path

In [9]:
# Load the three dlib models used by the cells below. Both .dat files are
# looked up relative to the current working directory.
landmark_model_file = "shape_predictor_68_face_landmarks.dat"
recognition_model_file = "dlib_face_recognition_resnet_model_v1.dat"

detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(landmark_model_file)
face_encoder = dlib.face_recognition_model_v1(recognition_model_file)

In [10]:
#utils
def face_distance(face_encodings, face_to_compare):
    """Return the Euclidean norm of ``face_encodings - face_to_compare``.

    ``np.linalg.norm`` is called without an ``axis``, so the difference is
    always reduced to one scalar, whatever its shape.
    """
    difference = face_encodings - face_to_compare
    return np.linalg.norm(difference)


def compare_faces(known_face_encodings, face_encoding_to_check, tolerance=0.6):
    """Tell whether two face encodings are close enough to count as a match.

    Parameters
    ----------
    known_face_encodings, face_encoding_to_check :
        Encodings forwarded unchanged to ``face_distance``.
    tolerance : float, default 0.6
        Largest distance (inclusive) that still counts as a match.

    Returns
    -------
    list
        ``[is_match, distance]``: ``is_match`` is ``distance <= tolerance``
        evaluated on the unrounded distance, ``distance`` is rounded to two
        decimals.
    """
    # Compute the distance once and reuse it for both returned values.
    distance = face_distance(known_face_encodings, face_encoding_to_check)
    return [distance <= tolerance, np.round(distance, 2)]

In [15]:
# Sorted full paths of every entry in the colour-stimuli folder.
sitmuli_colors = os.path.join(data_path, 'STIMULI_colors/')
pictures_pathes = [
    os.path.join(sitmuli_colors, entry) for entry in os.listdir(sitmuli_colors)
]
pictures_pathes.sort()

# Same listing for the 'all/' folder; `all_data` holds the sorted paths.
all_data_folder = os.path.join(data_path, 'all/')
all_data = sorted(
    os.path.join(all_data_folder, entry) for entry in os.listdir(all_data_folder)
)



In [127]:
# Compute dlib face embeddings for every picture in `all_data`.
#
# final_list[k] is the list of embeddings (one NumPy vector per detected face)
# for all_data[k]. The list is empty when the file cannot be read or no face is
# detected, so positions in `final_list` stay aligned with `all_data`. This is
# the layout the comparison cell expects (it checks for an empty entry and
# reads entry[0]).
pictures_loop_generator = tqdm(all_data)
final_list = []
for picture in pictures_loop_generator:
    img = cv2.imread(picture)
    if img is None:
        # cv2.imread returns None instead of raising for unreadable files.
        final_list.append([])
        continue
    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # The encoder needs the colour image itself (RGB order), not the detection
    # rectangle; OpenCV loads images as BGR, so convert explicitly.
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    picture_embeddings = []
    for img_face in detector(img_gray):
        landmarks = predictor(img_gray, img_face)
        face_embedding = np.array(
            face_encoder.compute_face_descriptor(img_rgb, landmarks, num_jitters=1)
        )
        picture_embeddings.append(face_embedding)
    final_list.append(picture_embeddings)

100%|██████████| 49026/49026 [1:36:21<00:00,  8.48it/s]   


In [128]:
# Persist the embeddings. When entries of `final_list` differ in length the
# list is ragged: NumPy >= 1.24 raises ValueError instead of silently building
# an object array, so request the object array explicitly in that case.
# An object array must be read back with np.load(..., allow_pickle=True).
try:
    embeddings_array = np.array(final_list)
except ValueError:
    embeddings_array = np.array(final_list, dtype=object)
np.save("celebA_embeddings.npy", embeddings_array)

  np.save("celebA_embeddings.npy", np.array(final_list))


In [33]:
# Embedding of the reference stimulus picture that the dataset embeddings are
# compared against.
id = 0  # index into pictures_pathes; iterate manually from 0 to 149
img = cv2.imread(pictures_pathes[id])
if img is None:
    # cv2.imread returns None instead of raising for unreadable files.
    raise FileNotFoundError(f"could not read image {pictures_pathes[id]}")
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
faces = detector(gray)
if len(faces) == 0:
    raise ValueError(f"no face detected in {pictures_pathes[id]}")
face = faces[0]
landmarks = predictor(gray, face)
# Encode the picture loaded in this cell (converted from OpenCV's BGR to RGB)
# rather than a leftover `im` variable from another cell.
rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
main_face_embedding = np.array(face_encoder.compute_face_descriptor(rgb, landmarks, num_jitters=1))



In [35]:
# Compare the reference embedding with the first face of every picture.
# list_comp[k] is [k, -1] when final_list[k] holds no embedding, otherwise the
# [is_match, distance] pair returned by compare_faces.
list_comp = []
for i, list_of_face_embedding in enumerate(final_list):
    embeddings = np.asarray(list_of_face_embedding)
    if embeddings.size == 0:
        list_comp.append([i, -1])
        continue
    # Accept both layouts: a list of embeddings per picture, or one bare
    # embedding vector. Indexing a bare vector with [0] would compare a single
    # scalar component against the whole reference embedding.
    first_embedding = np.atleast_2d(embeddings)[0]
    listof = compare_faces(first_embedding, main_face_embedding, tolerance=0.6)
    list_comp.append(listof)

In [37]:
# Keep the positions whose distance lies in [min_score, max_score). Entries
# whose first element is a plain int (the "no embedding" marker) are skipped.
min_score, max_score = 0.0, 0.5
selected = [
    position
    for position, entry in enumerate(list_comp)
    if type(entry[0]) != int and min_score <= entry[1] < max_score
]

In [None]:
# plot pictures and verify
import matplotlib.pyplot as plt

if selected:
    fig, axes = plt.subplots(len(selected), 1, figsize=(15, len(selected) * 5))
    # With a single row plt.subplots returns one Axes object instead of an
    # array; normalise so that indexing works for any number of pictures.
    axes = np.atleast_1d(axes)
    for ax, picture_index in zip(axes, selected):
        im = plt.imread(all_data[picture_index])
        ax.imshow(im)
        # Show the index so it can be copied when recording a verified match.
        ax.set_title(f"all_data[{picture_index}]")
else:
    # plt.subplots(0, 1) would raise; there is simply nothing to show.
    print("No picture fell inside the selected score range.")

In [11]:
# Stimulus number -> positions in `all_data` of pictures matched to it.
# Presumably filled in by hand from the verification plots. The insertion
# order is kept as-is because later cells iterate over this dict.
matched = {
    4: [12951, 33166, 15094],
    5: [9048],
    8: [5364, 3574, 15500],
    11: [16119],
    16: [6203],
    19: [2910, 15246, 31560],
    20: [7522],
    29: [9819, 25086],
    36: [46476],
    101: [34937],
    102: [25683],
    105: [20983],
    107: [31166],
    109: [25198, 10701],
    114: [19497],
    123: [33688],
    125: [6155],
    130: [45260, 48905],
    136: [39948],
    138: [41123],
    140: [28371],
    143: [11257],
    52: [1007],
    53: [44424],
    57: [2926],
    62: [1105],
    65: [6577],
    68: [5539],
    69: [23033],
    80: [27486],
    81: [17062],
}

In [71]:
import pandas as pd
# Read the space-separated identity list. The file has no header row, so the
# columns are labelled 0 (picture file name) and 1 (identity).
identity_file = '../../files/identity_CelebA.txt'
data = pd.read_csv(identity_file, sep=" ", header=None)


In [72]:
# Map every matched stimulus (named 'f' + zero-padded stimulus number) to
# [identity, number of rows in `data` carrying that identity].
matched_ids = {}
for key, value in matched.items():
    # Only the first matched picture is used to look up the identity.
    picture_name = os.path.basename(all_data[value[0]])
    identities = data.loc[data[0] == picture_name, 1]
    if identities.empty:
        raise IndexError(f"{picture_name} is not listed in the identity file")
    # `celeb_id` instead of `id`, so the builtin is no longer shadowed.
    celeb_id = identities.iloc[0]
    occ = int(data.loc[data[1] == celeb_id, 0].count())
    matched_ids[f'f{key:03d}'] = [celeb_id, occ]

In [73]:
# Sorted names of every entry in the web-scraping stimuli folder.
stimuli_folder = os.path.join(data_path, 'web_scrapping_stimuli/')
stimuli_name = sorted(os.listdir(stimuli_folder))


In [74]:
from PIL import Image

In [75]:
# Copy the scraped pictures of every matched stimulus into new_celebA/ as RGB
# JPEGs and register them in `data` under the identity already known for that
# stimulus.
# NOTE(review): new files are numbered starting at len(data); confirm that this
# cannot coincide with a file name already present in column 0.
# NOTE: re-running this cell appends the same pictures again.
counter_images = len(data)
output_folder = os.path.join(data_path, 'new_celebA')
os.makedirs(output_folder, exist_ok=True)  # saving fails if the folder is missing
new_rows = []
for stimuli, values in matched_ids.items():
    # Identities that already have exactly 30 rows are skipped.
    # NOTE(review): confirm whether `>= 30` was intended.
    if values[1] == 30:
        continue
    scrapped_data_folder = os.path.join(data_path, 'web_scrapping_stimuli/', stimuli)
    scrapped_data_pictures = sorted(
        os.path.join(scrapped_data_folder, sname)
        for sname in os.listdir(scrapped_data_folder)
        if sname != ".DS_Store"
    )
    for image in scrapped_data_pictures:
        # The context manager releases the file handle of every source image.
        with Image.open(image) as im:
            im.convert("RGB").save(os.path.join(output_folder, f'{counter_images}.jpg'))
        new_rows.append([f'{counter_images}.jpg', values[0]])
        counter_images += 1

# Append all new rows in one go instead of enlarging the frame row by row.
if new_rows:
    data = pd.concat(
        [data, pd.DataFrame(new_rows, columns=data.columns)],
        ignore_index=True,
    )

In [76]:
# Identity of every matched stimulus (first element of each [identity, count] pair).
stimuli_ids = [celeb_id for celeb_id, _occurrences in matched_ids.values()]

In [77]:
# Register every stimulus that has no match in the dataset as a brand-new
# identity, using at most 30 of its scraped pictures.
# NOTE: re-running this cell appends the same pictures again and extends
# `stimuli_ids` a second time.
counter_images = len(data)
new_id = int(data[1].max()) + 1
stimuli_root = os.path.join(data_path, 'web_scrapping_stimuli/')
output_folder = os.path.join(data_path, 'new_celebA')
os.makedirs(output_folder, exist_ok=True)  # saving fails if the folder is missing
new_rows = []
for stimuli in stimuli_name:
    if stimuli in matched_ids:
        continue
    scrapped_data_folder = os.path.join(stimuli_root, stimuli)
    if not os.path.isdir(scrapped_data_folder):
        # A stray file next to the stimulus folders (e.g. .DS_Store) is not a
        # stimulus: listing it would fail and it must not consume an identity.
        continue
    # Drop .DS_Store *before* slicing: it sorts first, so slicing beforehand
    # would leave only 29 real pictures for that stimulus.
    scrapped_data_pictures = sorted(
        os.path.join(scrapped_data_folder, sname)
        for sname in os.listdir(scrapped_data_folder)
        if sname != ".DS_Store"
    )
    for image in scrapped_data_pictures[:30]:
        # The context manager releases the file handle of every source image.
        with Image.open(image) as im:
            im.convert("RGB").save(os.path.join(output_folder, f'{counter_images}.jpg'))
        new_rows.append([f'{counter_images}.jpg', new_id])
        counter_images += 1
    stimuli_ids.append(new_id)
    new_id += 1

# Append all new rows in one go instead of enlarging the frame row by row.
if new_rows:
    data = pd.concat(
        [data, pd.DataFrame(new_rows, columns=data.columns)],
        ignore_index=True,
    )

In [56]:
# Write the complete extended identity table (no header, no index column).
# The default write mode replaces the file: `data` already holds every row, so
# appending (mode='a') would add a full duplicate copy on each re-run.
data.to_csv('../../files/identity_CelebA_new.txt', header=False, index=False, sep=' ')


In [91]:
# Rows of `data` whose identity is not one of the stimuli identities,
# re-indexed from 0.
is_stimulus_row = data[1].isin(stimuli_ids)
result = data.loc[~is_stimulus_row].reset_index(drop=True)


In [202]:
# proj_path comes from the project's utils.config (presumably the project root
# — confirm there); dir_path is its "files/" sub-folder.
from utils.config import proj_path
dir_path = os.path.join(proj_path, "files/")
# Project helpers from data_processing_scripts.celebA_select_ids; the cells
# visible below call create_csv, correct_gender and generate_new_ids.
from data_processing_scripts.celebA_select_ids import clean_csv, create_csv, correct_gender, generate_new_ids

In [137]:
# Reload the original identity list with named columns, join it with the
# attribute table on the picture name (outer join), and drop every row that
# belongs to a stimulus identity.
old_data = pd.read_csv(
    '../../files/identity_CelebA.txt', sep=" ", header=None, names=["name", "id"]
)
attributes_raw = pd.read_csv('../../files/list_attr_celeba.csv')
attributes_with_ids = old_data.merge(attributes_raw, on='name', how='outer')
belongs_to_stimulus = attributes_with_ids["id"].isin(stimuli_ids)
list_attr_celeba = attributes_with_ids[~belongs_to_stimulus].reset_index(drop=True)

In [192]:
# Count the occurrences of each id in the dataset: one row per id, with the
# count stored in the 'values' column.
id_occurence = (
    list_attr_celeba['id']
    .value_counts()
    .rename_axis('id')
    .reset_index(name='values')
)

# Attach the per-id count to every picture row.
merged = list_attr_celeba.merge(id_occurence, on="id")

# Correct some mistakes in the gender attribute.
merged = correct_gender(merged)

# Split by the 'Male' attribute: 1 goes to df_male, -1 goes to df_female.
is_male = merged['Male'] == 1
is_female = merged['Male'] == -1
df_male = merged[is_male]
df_female = merged[is_female]

In [203]:
# Build the final table: the selection returned by create_csv plus every row
# of `data` that belongs to a stimulus identity, then renumber the identities.
# NOTE(review): the meaning of 30 and 435 is defined by create_csv (presumably
# pictures per identity and number of identities — confirm there).
celeba_selection = create_csv(30, 435, df_male, df_female, merged)

stimulus_rows = data[1].isin(stimuli_ids)
not_included = data[stimulus_rows].reset_index(drop=True)
not_included.columns = ["name", "id"]

final = pd.concat([celeba_selection, not_included])
final = generate_new_ids(final)

In [206]:
# Write the final table to disk; no `index` argument is passed, so pandas'
# default applies and the DataFrame index is written as the first column.
final_csv_path = "../../files/csv_files/final_celebA_with_stimuli.csv"
final.to_csv(final_csv_path)