In [1]:
# 处理标错的人脸

In [2]:
# 1.清洗文件夹下的数据：找出某个人文件夹下图片中明显与其他图片不一致的图
# 2.选择一张图片A作为target，其他图片B、C、D作为对照，计算该组的平均欧式距离d_A = （d(AB)+d(AC)+d(AD)）/ 3
# 3.如果d_A大于阈值，则删除A
# 4.其他文件夹类似处理


In [3]:
# 计算欧式距离

In [4]:
# Import packages
import cv2
import numpy as np
import matplotlib.pyplot as plt
import dlib
# %matplotlib inline
# Render matplotlib figures at 200 DPI
plt.rcParams['figure.dpi'] = 200

In [5]:
# Facial landmark (key point) predictor; per the weight file name it is the 68-landmark model
shape_detector = dlib.shape_predictor('./weights/shape_predictor_68_face_landmarks.dat')
# ResNet face recognition model; used below to turn landmarks + image into a face descriptor
face_descriptor_extractor = dlib.face_recognition_model_v1('./weights/dlib_face_recognition_resnet_model_v1.dat')

In [6]:
# Extract the face descriptor of a single image
def getFaceFeat(fileName):
    """Compute the dlib face descriptor of one pre-cropped face image.

    The image is assumed to already be a face crop, so no face detection is
    run: the rectangle spanning the whole image is passed to the landmark
    predictor, and the resulting landmarks are passed to the recognition model.

    Args:
        fileName: Path of the image file to read.

    Returns:
        A float64 ``np.ndarray`` of shape ``(1, D)`` holding the descriptor
        (D is 128 for the model loaded in this notebook), or ``None`` when the
        file is empty or cannot be decoded as an image.
    """
    # Read the raw bytes and decode in memory rather than calling cv2.imread
    # (presumably so that non-ASCII file names work on Windows -- TODO confirm).
    raw = np.fromfile(fileName, dtype=np.uint8)
    if raw.size == 0:
        # cv2.imdecode raises on an empty buffer; treat it like any unreadable image
        return None

    img = cv2.imdecode(raw, cv2.IMREAD_UNCHANGED)
    if img is None:
        return None

    # IMREAD_UNCHANGED keeps the stored channel layout, so pick the matching
    # conversion to 3-channel RGB instead of assuming a BGR input.
    if img.ndim == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    elif img.shape[2] == 4:
        img = cv2.cvtColor(img, cv2.COLOR_BGRA2RGB)
    else:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # The whole crop is the face: use the full image extent as the face box
    h, w = img.shape[:2]
    face = dlib.rectangle(0, 0, w, h)

    # Landmarks first, then the descriptor computed from image + landmarks
    points = shape_detector(img, face)
    face_descriptor = face_descriptor_extractor.compute_face_descriptor(img, points)

    # Convert the dlib vector into a single-row numpy array so rows can be stacked
    return np.asarray(list(face_descriptor), dtype=np.float64).reshape(1, -1)

In [7]:
# Smoke test: extract the descriptor of a single image
feat_test = getFaceFeat('./chinese_faces_cleaned/丁冠森/丁冠森_3.jpg')

In [8]:
feat_test.shape

(1, 128)

In [9]:
feat_test

array([[-0.09162728,  0.0269032 ,  0.00419354, -0.03777689, -0.1112555 ,
         0.00155301, -0.04292136, -0.16072276,  0.09233204, -0.13669872,
         0.19544446, -0.0638658 , -0.25811556, -0.09072338, -0.05978008,
         0.16551164, -0.18635371, -0.15972316, -0.01661953,  0.0016304 ,
         0.08407176,  0.02550029, -0.00946881,  0.07499165, -0.08698544,
        -0.36041629, -0.13661049, -0.04493103, -0.02882903, -0.0563573 ,
        -0.07394323,  0.09138102, -0.19533134, -0.06992982,  0.04107945,
         0.08403249,  0.019404  , -0.03721263,  0.17354757,  0.03375476,
        -0.20123416,  0.11073305,  0.08685884,  0.27991101,  0.16776469,
         0.05436033,  0.02277713, -0.12089485,  0.14030497, -0.12234369,
         0.01376736,  0.17563035,  0.11202144,  0.03512541, -0.05397094,
        -0.12582307, -0.00058609,  0.1117458 , -0.12600984,  0.08447472,
         0.11468242, -0.1358276 , -0.02549904, -0.09458837,  0.16046469,
         0.00975216, -0.10300556, -0.24271287,  0.0

In [10]:
import glob,tqdm

In [11]:
import shutil
import os


In [12]:
# Collect every entry directly under the dataset root; each entry is later treated as one person's folder
person_list = glob.glob('./chinese_faces_cleaned/*')

In [13]:
person_list[8]

'./chinese_faces_cleaned\\丛飞'

In [14]:
len(person_list)

10

In [15]:
# Walk each person's folder and quarantine images whose face does not match the rest.
# An image is treated as mislabeled when its mean Euclidean descriptor distance to the
# other images of the same folder exceeds this threshold.
DIST_THRESHOLD = 0.6
# Flagged images are copied here (one sub-folder per person) before being deleted
MISLABELED_ROOT = './chinese_faces_mislabeled'

for person in tqdm.tqdm(person_list):

    # Descriptors and the files they came from, kept index-aligned
    feature_rows = []
    record_file = []
    for img_file in glob.glob(person + '/*.jpg'):
        feat = getFaceFeat(img_file)
        # Skip images that could not be decoded
        if feat is not None:
            feature_rows.append(feat)
            record_file.append(img_file)

    # An image needs at least one other image to be compared against
    if len(feature_rows) < 2:
        continue

    # Stack once here instead of re-concatenating the array for every image
    feature_list = np.concatenate(feature_rows, axis=0)
    num_images = len(feature_list)

    # Person folder name, split on the platform's path separator rather than a
    # hard-coded '\\' (which only matched Windows-style paths)
    person_class = os.path.basename(os.path.normpath(person))
    save_dir = os.path.join(MISLABELED_ROOT, person_class)

    # Distances are measured against every descriptor loaded above, including
    # images that were already moved out earlier in this same pass.
    for i in range(num_images):
        dist_list = np.linalg.norm(feature_list[i] - feature_list, axis=1)
        # dist_list[i] is the zero distance of the image to itself, so divide by
        # the number of *other* images to get the mean distance to the rest
        dist_average = dist_list.sum() / (num_images - 1)

        # Above the threshold: the image differs from the rest of the folder
        if dist_average > DIST_THRESHOLD:
            remove_file = record_file[i]

            # Keep a copy under the mislabeled root, then delete the original
            os.makedirs(save_dir, exist_ok=True)
            shutil.copy(remove_file, save_dir)
            os.remove(remove_file)

            print('删除'+remove_file)
    
    

 10%|████████▎                                                                          | 1/10 [00:03<00:29,  3.26s/it]

删除./chinese_faces_cleaned\Angelababy\丁海峰_2.jpg


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:15<00:00,  1.56s/it]
