
> 이 코드는 threshold를 구할 때 필요한 csv file을 생성한다

- csv file에는 같은 사람 3000 pairs, 다른 사람 3000pairs에 대하여 facenet을 이용해 구한 거리 값을 저장
- 결과물(outputs)
    - 동양인 csv file = `pair_with_dist_Asian.csv`
    - 서양인 csv file = `pair_with_dist_lfw.csv`
    


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import itertools
from sklearn.metrics import confusion_matrix
from tqdm import tqdm
tqdm.pandas()

# 동양인 데이터
- **<u>경로만 맞추면 코랩이든 로컬이든 다 돌아감</u>**
- face train data
- 같은 사람 3000 pairs, 다른 사람 3000 pairs 



### Data set

In [3]:
%%time
import os
data_path = '/content/drive/My Drive/기업프로젝트-라젠/data/face_train_data/' # 동양인 데이터 경로로
ids = os.listdir(data_path)

idendities = {}
for id in ids:
    idendities[id] = os.listdir(data_path + id)

CPU times: user 281 ms, sys: 457 ms, total: 739 ms
Wall time: 6min 4s


In [4]:
for k, v in idendities.items():
    print(k, v)
    break

10469465 ['1037182866,2808591370_align.jpg', '968458796,1453049543_align.jpg']


### Positive samples
Find different photos of same people

In [5]:
positives = []

for key, values in idendities.items():
    
    # print(key)
    for i in range(0, len(values)-1):
        for j in range(i+1, len(values)):
            # print(key + '/' + values[i] + " and " + key + '/' + values[j])
            positive = []
            positive.append(key + '/' + values[i])
            positive.append(key + '/' + values[j])
            positives.append(positive)

In [None]:
positives = pd.DataFrame(positives, columns = ["file_x", "file_y"])
positives["decision"] = "Yes"
positives

Unnamed: 0,file_x,file_y,decision
0,"10469465/1037182866,2808591370_align.jpg","10469465/968458796,1453049543_align.jpg",Yes
1,"10471276/992158726,2815268164_align.jpg",10471276/50da81cb39dbb6fdefe8cefe0924ab18962b3...,Yes
2,"10470003/1514959826,590122606_align.jpg","10470003/28251887,1789035904_align.jpg",Yes
3,"10470003/1514959826,590122606_align.jpg","10470003/2105697140,2410667575_align.jpg",Yes
4,"10470003/1514959826,590122606_align.jpg","10470003/4171916877,3039255224_align.jpg",Yes
...,...,...,...
13007022,"10177409/1507152708,823560618_align.jpg","10177409/2824455197,150041884_align.jpg",Yes
13007023,"10177409/1507152708,823560618_align.jpg","10177409/592513219,3340722231_align.jpg",Yes
13007024,"10177409/2539562195,2419004821_align.jpg","10177409/2824455197,150041884_align.jpg",Yes
13007025,"10177409/2539562195,2419004821_align.jpg","10177409/592513219,3340722231_align.jpg",Yes


### Negative samples
Compare photos of different people

In [None]:
ids_keys_list = list(idendities.keys())
samples_list = list(idendities.values())

negatives = []

for i in range(0, len(idendities) - 1): # key 개수
    for j in range(i+1, len(idendities)):
        #print(samples_list[i], " vs ",samples_list[j]) 
        cross_product = itertools.product(samples_list[i], samples_list[j]) 
        cross_product = list(cross_product) # list of tuples
        #print(cross_product)
        
        for cross_sample in cross_product:
            # print(ids_keys_list[i] + '/' + cross_sample[0] + " vs " + ids_keys_list[j] + '/' + cross_sample[1])
            negative = []
            negative.append(ids_keys_list[i] + '/' + cross_sample[0])
            negative.append(ids_keys_list[j] + '/' + cross_sample[1])
            negatives.append(negative)

negatives = pd.DataFrame(negatives, columns = ["file_x", "file_y"])
negatives["decision"] = "No"

### Merge Positives and Negative Samples

In [None]:
df = pd.concat([positives, negatives]).reset_index(drop = True)

In [None]:
df.file_x = "/content/drive/My Drive/기업프로젝트-라젠/data/face train data/"+df.file_x  # 데이터 전체 경로가 입력되게 함
df.file_y = "/content/drive/My Drive/기업프로젝트-라젠/data/face train data/"+df.file_y

In [None]:
df.iloc[0, 0]

In [None]:
# 같은 얼굴 3000개, 다른 얼굴 3000개 pair만 남기기
selected_pos_df = df[df['decision']=='Yes'].sample(n=3000)
selected_neg_df = df[df['decision']=='No'].sample(n=3000)

selected_df = pd.concat([selected_pos_df, selected_neg_df]).reset_index(drop = True)

In [None]:
selected_df

Unnamed: 0,file_x,file_y,decision
0,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,Yes
1,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,Yes
2,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,Yes
3,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,Yes
4,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,Yes
...,...,...,...
5995,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,No
5996,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,No
5997,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,No
5998,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,No


### DeepFace (거리 계산)

In [None]:
!pip install DeepFace
from deepface import DeepFace

In [None]:
instances = selected_df[["file_x", "file_y"]].values.tolist()
print(instances[:10])

[['/content/drive/My Drive/기업프로젝트-라젠/data/face train data/8529/2033024338,2950406043_align.jpg', '/content/drive/My Drive/기업프로젝트-라젠/data/face train data/8529/2996900555,852080785_align.jpg'], ['/content/drive/My Drive/기업프로젝트-라젠/data/face train data/6730/1064443725,81020363_align.jpg', '/content/drive/My Drive/기업프로젝트-라젠/data/face train data/6730/0db52fad7062d5444a36d678.jpg_align.jpg'], ['/content/drive/My Drive/기업프로젝트-라젠/data/face train data/9664/4198803250,2026401963_align.jpg', '/content/drive/My Drive/기업프로젝트-라젠/data/face train data/9664/640802359,234659442_align.jpg'], ['/content/drive/My Drive/기업프로젝트-라젠/data/face train data/11375/3891442477,936584836_align.jpg', '/content/drive/My Drive/기업프로젝트-라젠/data/face train data/11375/1340820667,2772650163_align.jpg'], ['/content/drive/My Drive/기업프로젝트-라젠/data/face train data/1131/813846020,40360654_align.jpg', '/content/drive/My Drive/기업프로제

In [None]:
model_name = "Facenet"
distance_metric = "euclidean_l2"
resp_obj = DeepFace.verify(instances, model_name = model_name, distance_metric = distance_metric, enforce_detection=False)

Using Facenet model backend euclidean_l2 distance.


Verification: 100%|██████████| 6000/6000 [26:30<00:00,  3.77it/s]


In [None]:
distances = []
for i in range(0, len(instances)):
    distance = round(resp_obj["pair_%s" % (i+1)]["distance"], 4)
    distances.append(distance)

selected_df["distance_l2"] = distances
selected_df

Unnamed: 0,file_x,file_y,decision,distance,distance_l2
0,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,Yes,11.1794,1.3843
1,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,Yes,7.3643,0.6335
2,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,Yes,8.3086,1.2123
3,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,Yes,12.0697,1.2579
4,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,Yes,11.8796,1.2326
...,...,...,...,...,...
5995,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,No,10.3889,1.2115
5996,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,No,3.4046,0.6786
5997,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,No,10.9161,1.0488
5998,/content/drive/My Drive/기업프로젝트-라젠/d...,/content/drive/My Drive/기업프로젝트-라젠/d...,No,10.8688,1.0539


In [None]:
# selected_df.to_csv('pair_with_dist_Asian.csv', index=False)

In [None]:
selected_df.describe()