In [1]:
import numpy as np
from PIL import Image
import cv2
import matplotlib.pyplot as plt
import imutils
import pandas as pd

import itertools
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, models, transforms
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms.functional as TF

import time
import random
import math



!conda info --env



# conda environments:
#
base                     C:\Users\yongwook\anaconda3
outline               *  C:\Users\yongwook\anaconda3\envs\outline



In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

# Number of landmarks the network predicts (each landmark is an (x, y) pair).
# It must exist before the `Network` class is defined, because the class uses it
# in a default argument. The value 6 matches the checkpoint name loaded further
# down ("..._6_100_...") and the six landmark ids selected by the landmark regex.
landmark_number = 6

Using cuda device


## Network
Based on https://colab.research.google.com/drive/1-28T5nIAevrDo6MwN0Qi_Cgdy9TEiSP_?usp=sharing#scrollTo=XH_bqPXo6YG8

ResNet50을 이용한다. 일단은 Greyscale 입력만 사용한다(컬러로 확장도 가능하나 실익이 크지 않다).

https://towardsdatascience.com/face-landmarks-detection-with-pytorch-4b4852f5e9c4

In [4]:
class Network(nn.Module):
    """ResNet-50 backbone adapted to regress landmark coordinates.

    The stock first convolution is replaced by one that takes a single input
    channel (grayscale), and the final fully connected layer is replaced by a
    linear layer with ``num_classes`` outputs (two values per landmark by
    default).
    """

    def __init__(self,num_classes=(landmark_number*2)):
        # NOTE: the default above is evaluated when this class statement runs,
        # so `landmark_number` must already be defined at that point.
        super().__init__()
        self.model_name = 'resnet50'

        backbone = models.resnet50(pretrained=True)

        # Single-channel stem; for RGB input the first argument would be 3.
        # This layer is newly created, so it does not carry pretrained weights.
        backbone.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)

        # Regression head sized to the requested number of outputs.
        backbone.fc = nn.Linear(backbone.fc.in_features, num_classes)

        self.model = backbone

    def forward(self, x):
        """Run the backbone on ``x`` and return its raw output."""
        return self.model(x)

In [5]:
def pixel_distance(landmark, reference):
    """Euclidean distance between each landmark and its reference point.

    Args:
        landmark: np.array [[x, y], ...] of predicted points.
        reference: np.array [[x, y], ...] of true points, indexed like
            ``landmark``.

    Returns:
        (average, each): ``each`` is an np.array holding one distance per
        landmark, ``average`` is the mean of those distances.
    """
    distances = np.array([
        np.linalg.norm(landmark[idx] - reference[idx])
        for idx in range(len(landmark))
    ])
    return np.average(distances), distances


In [6]:
def pixel_difference(landmark, reference):
    """Per-landmark signed (x, y) offset of ``landmark`` from ``reference``.

    Args:
        landmark: sequence of [x, y] points.
        reference: sequence of [x, y] points, indexed like ``landmark``.

    Returns:
        np.array of shape (len(landmark), 2) holding ``landmark - reference``
        row by row. The array is also printed.
    """
    offsets = np.empty((len(landmark), 2))
    for row, point in enumerate(landmark):
        offsets[row] = point - reference[row]

    print(offsets)

    return offsets

## From Face detection to landmark detection, IRL

https://github.com/timesler/facenet-pytorch

- With pip:
pip install facenet-pytorch

- or clone this repo, removing the '-' to allow python imports:
git clone https://github.com/timesler/facenet-pytorch.git facenet_pytorch

- or use a docker container (see https://github.com/timesler/docker-jupyter-dl-gpu):
docker run -it --rm timesler/jupyter-dl-gpu pip install facenet-pytorch && ipython

In [7]:
#######################################################################
def return_path (common_path):
    """Build the four file paths that share the prefix ``common_path``.

    Example prefix: r'AutoAlign_test\B25776___________000_lat'

    Returns:
        (image_path, tsv_path, film_img_path, film_path), i.e. the prefix
        followed by "_photo.jpg", "_photo.txt", "_film.jpg" and "_film.txt".
    """
    suffixes = ("_photo.jpg", "_photo.txt", "_film.jpg", "_film.txt")
    return tuple(common_path + suffix for suffix in suffixes)
#######################################################################


In [8]:
def padded_resize(im, desired_size=1024):
    """Fit ``im`` into a white square canvas while keeping its aspect ratio.

    The image is scaled so that its longer side equals ``desired_size`` and is
    pasted at the top-left corner of a new ``desired_size`` x ``desired_size``
    RGB image filled with white.

    Args:
        im: PIL image to resize.
        desired_size: side length, in pixels, of the square output.

    Returns:
        A new RGB PIL image of size (desired_size, desired_size).
    """
    old_size = im.size  # (width, height)

    ratio = float(desired_size) / max(old_size)
    # Clamp to 1 px so a very elongated image cannot round down to an empty
    # side, which resize() rejects.
    new_size = tuple(max(1, int(side * ratio)) for side in old_size)

    # Image.ANTIALIAS was an alias of LANCZOS and was removed in Pillow 10;
    # Image.LANCZOS selects the same filter.
    im = im.resize(new_size, Image.LANCZOS)

    # White square canvas with the resized image anchored at the top-left.
    new_im = Image.new("RGB", (desired_size, desired_size), (255, 255, 255))
    new_im.paste(im, (0, 0))

    return new_im

def load_tsv(path):
    """Read a tab-separated landmark file into a DataFrame.

    Only the first 99 data rows and the first three columns are kept; the
    columns are renamed to 'name', 'X' and 'Y'.

    Args:
        path: path of the tab-separated file.

    Returns:
        pandas.DataFrame with columns ['name', 'X', 'Y'].
    """
    table = pd.read_csv(path, sep='\t').iloc[:99, 0:3]
    table.columns = ['name', 'X', 'Y']
    return table

def extract_landmarks(df, landmark_regex, landmark_length):
    """Select the landmarks whose name matches ``landmark_regex``.

    Matching rows are sorted by name so the point order does not depend on the
    row order of the file.

    Args:
        df: DataFrame with columns 'name', 'X' and 'Y'.
        landmark_regex: regular expression applied to the 'name' column.
        landmark_length: currently unused by this function.

    Returns:
        np.array of dtype float32 with one (X, Y) row per matching landmark.
    """
    wanted = df['name'].str.contains(landmark_regex, regex=True)
    coords = (
        df.loc[wanted, :]
        .sort_values(by=['name'])
        .loc[:, ['X', 'Y']]
        .reset_index(drop=True)
    )
    return coords.to_numpy(dtype=np.float32)

def transform_landmarks(matrix, landmarks):
    '''
    Apply a 2x3 affine transformation to a set of 2-D points.

    Args:
        matrix: numpy.array(), a 2x3 affine transformation matrix.
        landmarks: numpy.array(), an (n, 2) shaped array of (x, y) points.

    Steps:
        1. Append a 1 to every point: (x, y) --> (x, y, 1).
        2. Multiply the matrix with the transposed points:
               [[a, b, c],   [[x,         [[x',
                [d, e, f]] X   y,      =    y'], ...]
                               1], ...]
        3. Transpose the product back to one point per row.

    Returns:
        numpy.array(), an (n, 2) shaped array with the transformed points.
    '''
    unit_column = np.ones((len(landmarks), 1))
    homogeneous = np.hstack((landmarks, unit_column))
    return np.dot(matrix, homogeneous.T).T

def find_original_scale(size, original_info):
    """Size of an image after its longer side is scaled to ``size``.

    Args:
        size: target length of the longer side.
        original_info: (width, height) of the original image.

    Returns:
        (new_width, new_height): the longer side equals ``size`` and the
        other side is scaled by the same factor (not rounded).
    """
    width, height = original_info
    if width >= height:
        return size, height / width * size
    return width / height * size, size

In [10]:
# MTCNN is the face detector used below to locate the face in each photo.
from facenet_pytorch import MTCNN

# Sample name prefixes to evaluate. Each one is appended to the data root and
# expanded by return_path() into its photo/film image and landmark file paths.
valid_prefix = ['40374___________000_lat', 'B4867___________000_lat', 'B7524___________000_lat', 'B13616___________000_lat', '25160___________000_lat', 'B25611___________000_lat', 'B4151___________000_lat', '19951816___________000_lat', '48301___________000_lat', 'B16939___________000_lat', 'B17545___________000_lat', 'B5080___________000_lat', '40275___________000_lat', '0101___________000_lat', 'B10243___________000_lat', 'B15809___________000_lat', '17285___________000_lat', '12488___________000_lat', 'B25776___________000_lat', '14644377___________000_lat', 'B22163___________000_lat', 'B15167___________000_lat', '22248198___________000_lat', 'B12007___________000_lat', 'B15955___________000_lat', '01042185___________3618_lat', 'B8753___________000_lat', '48329___________000_lat', 'B17915___________000_lat', 'B23418___________000_lat', 'B19358___________000_lat', 'B19336___________000_lat', 'B13247___________000_lat', 'B7887___________000_lat', 'B22050___________000_lat', '23987___________000_lat', 'B19857___________000_lat', '45325___________000_lat', 'B10606___________000_lat', 'B19011___________000_lat', 'B4667___________000_lat', 'B9871___________000_lat', 'B6863___________000_lat', 'B11375___________000_lat', 'B12007___________001_lat', 'B14138___________000_lat', 'B23013___________000_lat', '28800___________000_lat', '43476___________000_lat', '21706747___________000_lat']

# Face detection pipeline (MTCNN) running on the selected device.
mtcnn = MTCNN(device=device)

# Landmark regression network. It is never moved off the CPU and its inputs are
# CPU tensors, so the checkpoint is mapped to the CPU as well. Without
# map_location a checkpoint saved from a GPU run cannot be loaded on a machine
# that has no CUDA device.
best_network = Network()
# Checkpoint naming pattern used when saving:
# model/{time_str}_{landmark_number}_{num_epochs}_{weights_path}_{best_network.model_name}.tar
checkpoint = torch.load("model/0602_1303_6_100_face_landmarks_transfer__resnet50.tar", map_location='cpu')
best_network.load_state_dict(checkpoint['network_state_dict'])
# Inference mode: fixes batch-norm statistics and disables dropout.
best_network.eval()

# Parallel lists: index i of every list describes the same sample.
names = []
photo_images = []      # photos resized/padded to 320x320
photo_landmarks = []   # reference landmarks on the photo
film_landmarks = []    # reference landmarks on the film image
original_info = []     # (width, height) of the photo before resizing
film_sizes = []        # (width, height) of the film image

root = "AutoAlign_test/"
# Landmark ids used for evaluation (six names match this pattern).
landmark_regex = '29@[2479]|30@[34]'

for name in valid_prefix:
    image_path, tsv_path, film_img_path, film_path = return_path(root + name)
    names.append(name)

    # Image.open keeps the file open until the image is closed; the context
    # manager releases the handle once the resized copy has been made.
    with Image.open(image_path) as source_image:
        original_info.append(source_image.size)
        input_image = padded_resize(source_image, 320)
    photo_images.append(input_image)

    photo_landmark = extract_landmarks(load_tsv(tsv_path), landmark_regex, 6)
    film_landmark = extract_landmarks(load_tsv(film_path), landmark_regex, 6)
    photo_landmarks.append(photo_landmark)
    film_landmarks.append(film_landmark)

    # Only the dimensions of the film image are needed, so close it right away.
    with Image.open(film_img_path) as film_image:
        film_sizes.append(film_image.size)


print(f"input images: {len(photo_images)}")

# grayscale_image = input_image.convert('L')
# height, width = input_image.size[0], input_image.size[1]
# print(height, width)
# Get cropped and prewhitened image tensor
# One detection result per photo; an entry is None when no face was found.
boxes, probs = mtcnn.detect(photo_images)

crop_images = []   # normalised 224x224 grayscale face crops (tensors)
crop_info = []     # (x0, y0, x1, y1) of each crop in the 320x320 photo
detected = []      # indices of the samples in which a face was found

for count, (box, image) in enumerate(zip(boxes, photo_images)):
    if box is None:
        print(f"face not detected: {names[count]}")
        continue

    detected.append(count)
    grayscale_image = image.convert('L')

    # Use the first detected face.
    x0, y0, x1, y1 = box[0]
    face_width = x1 - x0
    # Truncate the box to whole pixels, then shift it by +5% of the face width
    # along x (y is left as is).
    shift = face_width * 0.05
    x0, y0, x1, y1 = int(x0) + shift, int(y0), int(x1) + shift, int(y1)

    crop_image = TF.resized_crop(grayscale_image, y0, x0, y1-y0, x1-x0, size=(224, 224))
    crop_image = TF.to_tensor(crop_image)
    # Single-channel mean/std; presumably the training-set statistics (TODO confirm).
    crop_image = TF.normalize(crop_image, [0.6945], [0.33497])

    crop_images.append(crop_image)
    crop_info.append((x0, y0, x1, y1))

# Keep only the samples with a detected face so every list stays aligned with
# crop_images / crop_info.
names = [names[idx] for idx in detected]
photo_images = [photo_images[idx] for idx in detected]
photo_landmarks = [photo_landmarks[idx] for idx in detected]
film_landmarks = [film_landmarks[idx] for idx in detected]
original_info = [original_info[idx] for idx in detected]
film_sizes = [film_sizes[idx] for idx in detected]

print(f"face detected: {len(crop_images)}")

# Run every face crop through the network in a single batch, without gradients.
batch_input = torch.stack(crop_images)
with torch.no_grad():
    landmarks = best_network(batch_input)

# Map each prediction back into the coordinates of the 320x320 photo.
result = []
for prediction, (left, top, right, bottom) in zip(landmarks, crop_info):
    # The raw output is offset by 0.5 before scaling; presumably the network
    # predicts crop-relative coordinates centred on 0 (TODO confirm against the
    # training code).
    relative = prediction.view(landmark_number,2).detach().numpy() + 0.5
    crop_extent = np.array([[right-left, bottom-top]])
    crop_origin = np.array([[left, top]])
    result.append(relative * crop_extent + crop_origin)

# Uncomment to show image of landmark on photo

# figure, axis = plt.subplots(10, 5)
# plt.figure(figsize=(30,15))
# for count, image in enumerate(photo_images):
#     landmark = result[count]
#     axis[count % 10, count // 10].imshow(image)
#     axis[count % 10, count // 10].scatter(landmark[:,0], landmark[:,1], c = 'c', s = 1)
# plt.show()

# dist[i]: mean absolute landmark error of sample i, per axis, as a fraction of
#          the photo's (width, height).
# rect[i]: mean absolute error of the transformed film-image corners, same units.
dist, rect = [], []
for name, info, photo_landmark, landmark_found, film_landmark, film_size in zip(
        names, original_info, photo_landmarks, result, film_landmarks, film_sizes):

    image_size = np.array(info)

    # Rescale the prediction from the resized photo (longer side 320) back to
    # the pixel coordinates of the original photo.
    fixed_landmark = landmark_found * image_size / np.array(find_original_scale(320, info))
    landmark_dist = np.average(np.abs(fixed_landmark-photo_landmark) / image_size, axis=0)
    dist.append(landmark_dist)

    # Film -> photo transform from our landmarks and from the reference landmarks.
    matrix, _ = cv2.estimateAffinePartial2D(film_landmark, fixed_landmark, method=cv2.LMEDS)
    sol_matrix, _ = cv2.estimateAffinePartial2D(film_landmark, photo_landmark, method=cv2.LMEDS)

    if matrix is None or sol_matrix is None:
        # OpenCV returns no matrix when the transform cannot be estimated.
        # Record NaN so `rect` stays aligned with `names`; NaN fails every
        # threshold comparison, so the sample is not counted as a success.
        print(f"transform estimation failed: {name}")
        rect.append(np.full(2, np.nan))
        continue

    corners = np.array([[0,0], [0, film_size[1]], [film_size[0], 0], [film_size[0], film_size[1]]])

    # Compare where the two transforms place the four corners of the film image.
    ours = transform_landmarks(matrix, corners)
    sol = transform_landmarks(sol_matrix, corners)

    avr = np.average(np.abs(ours-sol) / image_size, axis=0)
    rect.append(avr)

# Collapse each per-axis error pair into one number per sample.
dist_means = [np.average(d) for d in dist]
rect_means = [np.average(avr) for avr in rect]

print("======== landmark avg ==========")
for sample_name, d, mean_error in zip(names, dist, dist_means):
    print(f"{sample_name} average: {d} | {mean_error < 0.01}")

print("==========transform avg==========")
for sample_name, avr, mean_error in zip(names, rect, rect_means):
    print(f"{sample_name} average: {avr} | {mean_error < 0.01}")

# Number of samples under the strict (1%) and loose (3%) error thresholds.
dist_count = sum(1 for mean_error in dist_means if mean_error < 0.01)
dist_count_loose = sum(1 for mean_error in dist_means if mean_error < 0.03)
rect_count = sum(1 for mean_error in rect_means if mean_error < 0.01)
rect_count_loose = sum(1 for mean_error in rect_means if mean_error < 0.03)

print(f"landmark average distance under 1%: {dist_count}")
print(f"landmark average distance under 3%: {dist_count_loose}")
print(f"film image corner distance under 1%: {rect_count}")
print(f"film image corner distance under 3%: {rect_count_loose}")

input images: 50
  batch_boxes, batch_points = np.array(batch_boxes), np.array(batch_points)
  boxes = np.array(boxes)
  points = np.array(points)
face not detected: B19358___________000_lat
face detected: 49
40374___________000_lat average: [0.02205823 0.01780756] | False
B4867___________000_lat average: [0.00303025 0.00416456] | True
B7524___________000_lat average: [0.02662856 0.0100958 ] | False
B13616___________000_lat average: [0.00187316 0.00451644] | True
25160___________000_lat average: [0.01285987 0.01752303] | False
B25611___________000_lat average: [0.0057532  0.00487935] | True
B4151___________000_lat average: [0.0014622  0.00776098] | True
19951816___________000_lat average: [0.00501336 0.01719048] | False
48301___________000_lat average: [0.00286184 0.01379597] | True
B16939___________000_lat average: [0.00191867 0.00531133] | True
B17545___________000_lat average: [0.00467457 0.01364503] | True
B5080___________000_lat average: [0.00269714 0.0039181 ] | True
40275_______