# 1. 이미지 전처리
> 정면 얼굴 찾기

In [None]:
# vision api로 정면 얼굴 찾기
from googleapiclient import  discovery
from oauth2client.client  import GoogleCredentials
import sys
import io
import base64
from PIL import Image
from PIL import ImageDraw
from genericpath import isfile
import os
import hashlib
from oauth2client.service_account import ServiceAccountCredentials


NUM_THREADS = 70000
MAX_FACE = 2
MAX_LABEL = 2
IMAGE_SIZE = 156,156
MAX_ROLL = 15
MAX_TILT = 15
MAX_PAN = 15

# index to transfrom image string label to number
global_label_index = 0 
global_label_number = [0 for x in range(13)]
global_image_hash = []

class FaceDetector():
    def __init__(self):
        # initialize library
        #credentials = GoogleCredentials.get_application_default()
        scopes = ['https://www.googleapis.com/auth/cloud-platform']
        credentials = ServiceAccountCredentials.from_json_keyfile_name(
                        './still-sensor-325313-e544dd4f101f.json', scopes=scopes)
        self.service = discovery.build('vision', 'v1', credentials=credentials)
        #print ("Getting vision API client : %s" ,self.service)

    #def extract_face(selfself,image_file,output_file):
    def skew_angle(self):
        return None
    
    def detect_face(self,image_file):
        try:
            with io.open(image_file,'rb') as fd:
                image = fd.read()
                batch_request = [{
                        'image':{
                            'content':base64.b64encode(image).decode('utf-8')
                            },
                        'features':[
                            {
                            'type':'FACE_DETECTION',
                            'maxResults':MAX_FACE,
                            },
                            {
                            'type':'LABEL_DETECTION',
                            'maxResults':MAX_LABEL,
                            }
                                    ]
                        }]
                fd.close()
        
            request = self.service.images().annotate(body={
                            'requests':batch_request, })
            response = request.execute()
            if 'faceAnnotations' not in response['responses'][0]:
                print('[Error] %s: Cannot find face ' % image_file)
                return None
                
            face = response['responses'][0]['faceAnnotations']
            label = response['responses'][0]['labelAnnotations']
            
            if len(face) > 1 :
                print('[Error] %s: It has more than 2 faces in a file' % image_file)
                return None
            
            roll_angle = face[0]['rollAngle']
            pan_angle = face[0]['panAngle']
            tilt_angle = face[0]['tiltAngle']
            angle = [roll_angle,pan_angle,tilt_angle]
            
            # check angle
            # if face skew angle is greater than > 20, it will skip the data
            if abs(roll_angle) > MAX_ROLL or abs(pan_angle) > MAX_PAN or abs(tilt_angle) > MAX_TILT:
                print('[Error] %s: face skew angle is big' % image_file)
                return None
            
            # check sunglasses
            for l in label:
                if 'sunglasses' in l['description']:
                    print('[Error] %s: sunglass is detected' % image_file)  
                    return None
            
            box = face[0]['fdBoundingPoly']['vertices']
            left = box[0]['x']
            top = box[1]['y']
                
            right = box[2]['x']
            bottom = box[2]['y']
                
            rect = [left,top,right,bottom]
                
            print("[Info] %s: Find face from in position %s and skew angle %s" % (image_file,rect,angle))
            return rect
        except Exception as e:
            print('[Error] %s: cannot process file : %s' %(image_file,str(e)) )
            
    def rect_face(self,image_file,rect,outputfile):
        try:
            fd = io.open(image_file,'rb')
            image = Image.open(fd)
            draw = ImageDraw.Draw(image)
            draw.rectangle(rect,fill=None,outline="green")
            image.save(outputfile)
            fd.close()
            print('[Info] %s: Mark face with Rect %s and write it to file : %s' %(image_file,rect,outputfile) )
        except Exception as e:
            print('[Error] %s: Rect image writing error : %s' %(image_file,str(e)) )
        
    def crop_face(self,image_file,rect,outputfile):
        
        global global_image_hash
        try:
            fd = io.open(image_file,'rb')
            image = Image.open(fd)  

            # extract hash from image to check duplicated image
            m = hashlib.md5()
            with io.BytesIO() as memf:
                image.save(memf, 'PNG')
                data = memf.getvalue()
                m.update(data)
            image_hash = m.hexdigest()
            
            if image_hash in global_image_hash:
                print('[Error] %s: Duplicated image' %(image_file) )
                return None
            global_image_hash.append(image_hash)

            crop = image.crop(rect)
            im = crop.resize(IMAGE_SIZE,Image.ANTIALIAS)
            
            
            im.save(outputfile,"JPEG")
            fd.close()
            print('[Info]  %s: Crop face %s and write it to file : %s' %( image_file,rect,outputfile) )
            return True
        except Exception as e:
            print('[Error] %s: Crop image writing error : %s' %(image_file,str(e)) )
        
    def getfiles(self,src_dir):
        files = []
        for f in os.listdir(src_dir):
            if isfile(os.path.join(src_dir,f)):
                if not f.startswith('.'):
                    files.append(os.path.join(src_dir,f))

        return files
    
    # read files in src_dir and generate image that rectangle in face and write into files in des_dir
    def rect_faces_dir(self,src_dir,des_dir):
        if not os.path.exists(des_dir):
            os.makedirs(des_dir)
            
        files = self.getfiles(src_dir)
        for f in files:
            des_file = os.path.join(des_dir,os.path.basename(f))
            rect = self.detect_face(f)
            if rect != None:
                self.rect_face(f, rect, des_file)
    
    # read files in src_dir and crop face only and write it into des_dir
    def crop_faces_dir(self,src_dir,des_dir,maxnum):
        
        # training data will be written in $des_dir/training
        # validation data will be written in $des_dir/validate
        
        des_dir_training = os.path.join(des_dir,'training')
        des_dir_validate = os.path.join(des_dir,'validate')
        
        if not os.path.exists(des_dir):
            os.makedirs(des_dir)
        if not os.path.exists(des_dir_training):
            os.makedirs(des_dir_training)
        if not os.path.exists(des_dir_validate):
            os.makedirs(des_dir_validate)
        
        path,folder_name = os.path.split(src_dir)
        label = folder_name
        
        # create label file. it will contains file location 
        # and label for each file
        training_file = open(des_dir+'/training_file.txt','a')
        validate_file = open(des_dir+'/validate_file.txt','a')
        
        files = self.getfiles(src_dir)
        global global_label_index
        cnt = 0 
        num = 0 # number of training data
        for f in files:
            rect = self.detect_face(f)

            # replace ',' in file name to '.'
            # because ',' is used for deliminator of image file name and its label
            des_file_name = os.path.basename(f)
            des_file_name = des_file_name.replace(',','_')
            
            if rect != None:
                # 70% of file will be stored in training data directory
                if(cnt < 8):
                    des_file = os.path.join(des_dir_training,des_file_name)
                    # if we already have duplicated image, crop_face will return None
                    if self.crop_face(f, rect, des_file ) != None:
                        training_file.write("%s,%s,%d\n"%(des_file,label,global_label_index) )
                        num = num + 1
                        global_label_number[global_label_index] = num
                        cnt = cnt+1

                    if (num>=maxnum):
                        break
                # 30% of files will be stored in validation data directory
                else: # for validation data
                    des_file = os.path.join(des_dir_validate,des_file_name)
                    if self.crop_face(f, rect, des_file) != None:
                        validate_file.write("%s,%s,%d\n"%(des_file,label,global_label_index) )
                        cnt = cnt+1
                    
                if(cnt>9): 
                    cnt = 0
        #increase index for image label
        
        global_label_index = global_label_index + 1 
        print('## label %s has %s of training data' %(global_label_index,num))
        training_file.close()
        validate_file.close()
        
    def getdirs(self,dir):
        dirs = []
        for f in os.listdir(dir):
            f=os.path.join(dir,f)
            if os.path.isdir(f):
                if not f.startswith('.'):
                    dirs.append(f)

        return dirs
        
    def crop_faces_rootdir(self,src_dir,des_dir,maxnum):
        # crop file from sub-directoris in src_dir
        dirs = self.getdirs(src_dir)
        
        #list sub directory
        for d in dirs:
            print('[INFO] : ### Starting cropping in directory %s ###'%d)
            self.crop_faces_dir(d, des_dir,maxnum)
        #loop and run face crop
        global global_label_number
        print("number of datas per label ", global_label_number)
        
def main(argv):
#   원본 이미지 저장장소
    srcdir= 'C:\\Users\\성수현\\face_image_align\\원본\\'
#   수정 이미지 저장장소 
    desdir = 'C:\\Users\\성수현\\face_image_align\\수정본\\'
#   이미지에서 정면 얼굴 찾는 횟수
    maxnum = int(36000)

    detector = FaceDetector()

    detector.crop_faces_rootdir(srcdir, desdir,maxnum)
    
if __name__ == "__main__":
    main(sys.argv)

# 2. 이미지 수치화

# 2.1 훈련데이터 수치화

In [5]:
import face_recognition as fr
from PIL import Image
import glob
import os
import numpy as np
import pandas as pd

In [171]:
# 남자 학습 데이터 불러오기
train_image_files = glob.glob('C:\\Users\\성수현\\face_image_align\\수정본\\training\\train_man\\'+'*.*')

In [8]:
# 학습 이미지 데이터 -> 벡터 데이터(수치화)
image = []
encodings = []
for file in image_files:
    encodings.append(fr.face_encodings(fr.load_image_file(file)))

In [172]:
train_filelist = os.listdir('C:\\Users\\성수현\\face_image_align\\수정본\\training\\train_man\\')

In [162]:
# 뽑아낸 이미지 데이터와 데이터 경로의 개수가 일치하는지 확인
print(len(encodings),len(train_image_files),len(train_filelist))

6313 6313 6313


In [120]:
# encoding 과정이 너무 오래걸리므로 dataframe 형식으로 저장하여 불러오기
pd.DataFrame(encodings,index=train_filelist).to_csv('./수정본/training/man_train_encodings.txt')

In [173]:
# 뽑아낸 이미지 데이터와 데이터 경로의 개수 일치 시키기(간혹 이미지 데이터 벡터화에 실패하는 이미지 존재 -> 빈 리스트가 존재)
cnt = 0
for idx,val in enumerate(encodings):
    if not val:
        train_filelist.pop(idx-cnt)
        train_image_files.pop(idx-cnt)
        cnt += 1

In [174]:
# 2차원 리스트 -> 1차원 리스트(KMeans에서 입력이 1차원이어야 하므로) + 빈 리스트 제거
t = []
t = encodings.copy()
man_train_encodings = sum(t,[])

In [176]:
# 뽑아낸 이미지 데이터와 데이터 경로의 개수가 일치하는지 확인
print(len(man_train_encodings),len(train_image_files),len(train_filelist))

6039 6039 6039


# 2.2 KMeans

In [63]:
from sklearn.cluster import KMeans

In [64]:
# train set으로 KMeans 학습
clf = KMeans(n_clusters = 13)
clf.fit(man_train_encodings)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=13, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [65]:
# 13개의 class로 나뉘었는지 확인
np.unique(clf.labels_)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

# 2.3 테스트 데이터 수치화

In [155]:
val_image_path = glob.glob('C:\\Users\\성수현\\face_image_align\\수정본\\validate\\val_man\\'+'*.*')

In [24]:
val_image = []
val_man_encodings = []
for val_man_file in val_image_path:
    val_man_encodings.append(fr.face_encodings(fr.load_image_file(val_man_file)))

In [156]:
val_filelist = os.listdir('C:\\Users\\성수현\\face_image_align\\수정본\\validate\\val_man\\')

In [131]:
# encoding 과정이 너무 오래걸리므로 dataframe 형식으로 저장하여 불러오기
pd.DataFrame(val_man_encodings,index=val_filelist).to_csv('./수정본/validate/man_val_encodings.txt')

In [157]:
cnt = 0
for idx,val in enumerate(val_man_encodings):
    if not val:
        print(filelist[idx-cnt])
        val_filelist.pop(idx-cnt)
        val_image_path.pop(idx-cnt)
        
        cnt += 1

2PM 장우영27.jpg
FT아일랜드 최민환31.jpg
HOT 강타46.jpg
NCT 제노42.jpg
NCT 태용44.jpg
NCT 해찬1.jpg
SF9 로운24.jpg
SF9 로운24.jpg
가수 비47.jpg
가수 세븐14.jpg
가수 신승훈12.jpg
가수 에릭남43.jpg
가수 영웅재중48.jpg
가수 은지원44.jpg
가수 이재훈33.jpg
가수 이재훈33.jpg
가수 정세운3.jpg
가수 존박35.jpg
가수 테이43.jpg
개그맨 김태현38.jpg
개그맨 오지환11.jpg
개그맨 이정규39.jpg
개그맨 정종철31.jpg
국회의원 김상훈43.jpg
꽃보다남자 구준표21.jpg
네이버 창업주 이해진_v36.jpg
네이버이사회의장 이해진35.jpg
노홍철42.jpg
농구선수 허훈48.jpg
더보이즈 주학년24.jpg
래퍼 넉살27.jpg
래퍼 로꼬16.jpg
래퍼 원슈타인16.jpg
래퍼 한해2.jpg
래퍼 행주9.jpg
모델 주우재14.jpg
모델 주우재14.jpg
몬스타엑스 민혁17.jpg
몬스타엑스 민혁17.jpg
몬스타엑스 형원12.jpg
방탄소년단 슈가44.jpg
방탄소년단 제이홉37.jpg
방탄소년단 지민10.jpg
배구선수 조재성1.jpg
배우 공명23.jpg
배우 김래원29.jpg
배우 김병세21.jpg
배우 김사권9.jpg
배우 김승수45.jpg
배우 박광현14.jpg
배우 봉태규2.jpg
배우 소지섭1.jpg
배우 신성록48.jpg
배우 양세종32.jpg
배우 양세종43.jpg
배우 원빈43.jpg
배우 윤계상36.jpg
배우 이광수28.jpg
배우 이동욱15.jpg
배우 이상이31.jpg
배우 이상이9.jpg
배우 이준혁38.jpg
배우 이준혁38.jpg
배우 이진혁40.jpg
배우 정건주12.jpg
배우 정건주42.jpg
배우 정건주42.jpg
배우 정해인28.jpg
배우 조병규20.jpg
배우 지성28.jpg
배우 진구24.jpg
배우 최시원14.jpg
배우 현우33.jpg
비스트 용준형45.jpg
빅뱅 탑45.jpg
빅스 혁4

In [158]:
t = []
t = val_man_encodings.copy()
man_val_encodings = sum(t,[])

In [159]:
print(len(val_filelist),len(val_image_path),len(man_val_encodings))

1685 1685 1685


# 2.5 테스트 데이터 클래스 예측

In [263]:
# val set으로 class 예측
result = clf.predict(man_val_encodings)

In [264]:
df_val_face = pd.DataFrame(index=val_filelist,columns=['class','imagepath'])
# ImageDataGenerator.flow_from_dataframe 에서 class가 string형이어야 하므로 변환
df_val_face['class'] = list(map(str,result))
df_val_face['imagepath'] = val_image_path

In [265]:
df_val_face.to_csv('./val_to_class.txt')

# 2.6 훈련 데이터 클래스 재예측

In [259]:
result2 = clf.predict(man_train_encodings)

In [260]:
df_train_face = pd.DataFrame(index=train_filelist,columns=['class','imagepath'])
df_train_face['class'] = list(map(str,result2))
df_train_face['imagepath'] = train_image_files

In [249]:
df_train_face = pd.concat([df_train_face['imagepath'],pd.get_dummies(df_train_face['class'])],axis=1)

In [261]:
df_train_face

Unnamed: 0,class,imagepath
2AM 임슬옹11.jpg,1,C:\Users\성수현\face_image_align\수정본\training\tra...
2AM 임슬옹18.jpg,1,C:\Users\성수현\face_image_align\수정본\training\tra...
2AM 임슬옹19.jpg,1,C:\Users\성수현\face_image_align\수정본\training\tra...
2AM 임슬옹20.jpg,1,C:\Users\성수현\face_image_align\수정본\training\tra...
2AM 임슬옹28.jpg,3,C:\Users\성수현\face_image_align\수정본\training\tra...
...,...,...
호반건설 회장 김상열4.jpg,0,C:\Users\성수현\face_image_align\수정본\training\tra...
호반건설 회장 김상열40.jpg,0,C:\Users\성수현\face_image_align\수정본\training\tra...
호반건설 회장 김상열41.jpg,7,C:\Users\성수현\face_image_align\수정본\training\tra...
호반건설 회장 김상열42.jpg,0,C:\Users\성수현\face_image_align\수정본\training\tra...


In [262]:
df_train_face.to_csv('./train_to_class.txt')

In [190]:
df_face.to_csv('./toclass.txt')

In [193]:
df_face[df_face['class'] == 11]

Unnamed: 0,class,imagepath
B1A4 공찬21.jpg,11,C:\Users\성수현\face_image_align\수정본\validate\val...
BAP 영재38.jpg,11,C:\Users\성수현\face_image_align\수정본\validate\val...
FT 아일랜드 최종훈41.jpg,11,C:\Users\성수현\face_image_align\수정본\validate\val...
HOT 강타21.jpg,11,C:\Users\성수현\face_image_align\수정본\validate\val...
NCT 성찬3.jpg,11,C:\Users\성수현\face_image_align\수정본\validate\val...
...,...,...
프로듀스X101 이한결36.jpg,11,C:\Users\성수현\face_image_align\수정본\training\tra...
플레디스 최민기3.jpg,11,C:\Users\성수현\face_image_align\수정본\training\tra...
플레디스 최민기36.jpg,11,C:\Users\성수현\face_image_align\수정본\training\tra...
헤일로 오운33.jpg,11,C:\Users\성수현\face_image_align\수정본\training\tra...
