# 顔のキャプチャ画像から目線を推定する（使うところ）
* モデルを読み込む
* 平均画像を作成する

* OpenCVで顔のキャプチャ画像を取得する
* キャプチャ画像から顔の部分を切り取る
* 画像に前処理を行う
* ネットワークに通す
* 7x5の画像で表示する

In [1]:
import os
import pickle
from PIL import Image
import numpy as np
import glob
import chainer
from chainer import cuda, Function, gradient_check, report, training, utils, Variable
from chainer import datasets, iterators, optimizers, serializers
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
from chainer.training import extensions
from chainer.links.caffe import CaffeFunction
from matplotlib import pylab as plt

from chainer.datasets import LabeledImageDataset
from itertools import chain

In [2]:
class GoogLeNet(Chain):
    """GoogLeNet-style (Inception) network producing 1000 output scores.

    The two auxiliary classifier heads (``loss1_*`` / ``loss2_*``) are still
    registered as links so that parameter snapshots containing them keep
    loading, but they are not evaluated in the forward pass: their result
    never contributed to the returned value, so computing them only cost
    time on every call.

    Note that the final ``F.dropout`` follows Chainer's global ``train``
    configuration; callers doing inference should run the network inside
    ``chainer.using_config('train', False)``.
    """

    # Spatial size (height and width, in pixels) the pooling layout is built for.
    insize = 224

    def __init__(self):
        super(GoogLeNet, self).__init__()
        with self.init_scope():
            # Stem
            self.conv1 = L.Convolution2D(3, 64, 7, stride=2, pad=3)
            self.conv2_reduce = L.Convolution2D(64, 64, 1)
            self.conv2 = L.Convolution2D(64, 192, 3, stride=1, pad=1)
            # Inception stages
            self.inception_3a = L.Inception(192, 64, 96, 128, 16, 32, 32)
            self.inception_3b = L.Inception(256, 128, 128, 192, 32, 96, 64)
            self.inception_4a = L.Inception(480, 192, 96, 208, 16, 48, 64)
            self.inception_4b = L.Inception(512, 160, 112, 224, 24, 64, 64)
            self.inception_4c = L.Inception(512, 128, 128, 256, 24, 64, 64)
            self.inception_4d = L.Inception(512, 112, 144, 288, 32, 64, 64)
            self.inception_4e = L.Inception(528, 256, 160, 320, 32, 128, 128)
            self.inception_5a = L.Inception(832, 256, 160, 320, 32, 128, 128)
            self.inception_5b = L.Inception(832, 384, 192, 384, 48, 128, 128)
            # Main classifier head
            self.loss3_fc = L.Linear(1024, 1000)

            # Auxiliary head 1 (kept only so existing snapshots still load)
            self.loss1_conv = L.Convolution2D(512, 128, 1)
            self.loss1_fc1 = L.Linear(2048, 1024)
            self.loss1_fc2 = L.Linear(1024, 1000)

            # Auxiliary head 2 (kept only so existing snapshots still load)
            self.loss2_conv = L.Convolution2D(528, 128, 1)
            self.loss2_fc1 = L.Linear(2048, 1024)
            self.loss2_fc2 = L.Linear(1024, 1000)

    def __call__(self, x):
        """Run the main branch of the network on ``x`` and return its scores.

        Args:
            x: Input batch fed to ``conv1`` (3 input channels).

        Returns:
            Output of ``loss3_fc``: one 1000-element score vector per sample.
        """
        # Stem: conv -> pool -> LRN -> 1x1 conv -> 3x3 conv -> LRN -> pool
        h = F.relu(self.conv1(x))
        h = F.max_pooling_2d(h, 3, stride=2)
        h = F.local_response_normalization(h, n=5, k=1, alpha=2e-05)
        h = F.relu(self.conv2_reduce(h))
        h = F.relu(self.conv2(h))
        h = F.local_response_normalization(h, n=5, k=1, alpha=2e-05)
        h = F.max_pooling_2d(h, 3, stride=2)

        h = self.inception_3a(h)
        h = self.inception_3b(h)
        h = F.max_pooling_2d(h, 3, stride=2)

        # The auxiliary heads used to be evaluated after 4a and 4d and then
        # thrown away; they are skipped here because nothing consumed them.
        h = self.inception_4a(h)
        h = self.inception_4b(h)
        h = self.inception_4c(h)
        h = self.inception_4d(h)
        h = self.inception_4e(h)
        h = F.max_pooling_2d(h, 3, stride=2)

        h = self.inception_5a(h)
        h = self.inception_5b(h)

        # Global average pooling followed by dropout and the final classifier.
        h = F.average_pooling_2d(h, 7, stride=1)
        y = self.loss3_fc(F.dropout(h, 0.4))
        return y

# モデルを読み込む

In [3]:
# Build the network wrapped in a Classifier, then restore the parameters
# stored in the saved snapshot file into it.
model = L.Classifier(predictor=GoogLeNet())
serializers.load_npz('result/2000epoch.npz', model)

# データセットを読み込む

In [4]:
# Root folder of the captured images
IMG_DIR = 'datas'

# One entry per gaze target; entries containing an `ignore` marker are skipped
dnames = [p for p in glob.glob(IMG_DIR + '/*')
          if not os.path.exists(p + '/ignore')]

# Paths of every JPEG capture inside the remaining folders
fnames = list(chain.from_iterable(glob.glob(p + '/*.jpg') for p in dnames))

# From here on only the folder names are needed; a capture's label is the
# position of its parent folder's name in this list.
dnames = [os.path.basename(p) for p in dnames]
labels = [dnames.index(os.path.basename(os.path.dirname(fn))) for fn in fnames]

d = LabeledImageDataset(list(zip(fnames, labels)))

# 平均画像を作成する

In [5]:
width, height = 224, 224

# Build the mean image: the per-pixel average of every capture, stored as a
# float32 array in (channel, height, width) order.
if not fnames:
    # Without this check the failure would surface later as an obscure
    # AttributeError on None.
    raise RuntimeError('no capture images found; cannot build the mean image')

imgArray = np.zeros((3, height, width), dtype=np.float32)
for fname in fnames:
    # The context manager closes the underlying file once the pixels are read.
    with Image.open(fname) as src:
        # Force three channels so grayscale/CMYK JPEGs do not break the
        # (2, 0, 1) transpose below.
        img = src.convert('RGB').resize((width, height), Image.BICUBIC)
    im = np.asarray(img, dtype=np.float32).transpose(2, 0, 1)
    # Each image contributes 1/N of itself, so the running sum is the mean.
    imgArray += im / len(fnames)
mean_image = imgArray.copy()

# OpenCVで顔のキャプチャ画像を取得する

In [6]:
import cv2

In [7]:
# Width and height (in pixels) of the region cropped out of each frame
area_w, area_h = 210, 210
# Open capture device 0
cap = cv2.VideoCapture(0)

In [38]:
# Histories of the last 20 predicted cell coordinates (newest first); their
# means are drawn on the preview to smooth out frame-to-frame jitter.
plist_x = [0] * 20
plist_y = [0] * 20

try:
    while True:
        ret, frame = cap.read()
        if not ret or frame is None:
            # The camera delivered no frame (unplugged, busy, ...): stop
            # instead of crashing on `frame.shape`.
            break
        f_height, f_width = frame.shape[:2]
        # Shrink to half size
        frame = cv2.resize(frame, (int(f_width/2), int(f_height/2)))
        # Crop an area_w x area_h window around the centre of the shrunken
        # frame; the start indices are clamped so a small camera resolution
        # cannot turn them negative (which would yield an empty slice).
        top = max(int(f_height/4 - area_h/2), 0)
        left = max(int(f_width/4 - area_w/2), 0)
        frame = frame[top:int(f_height/4 + area_h/2),
                      left:int(f_width/4 + area_w/2)]
        # Second resize: bring the crop to the 224x224 network input size
        frame = cv2.resize(frame, (224, 224))
        # OpenCV frames are BGR while mean_image was built from PIL-decoded
        # (RGB) files, so reorder the channels before subtracting.
        # NOTE(review): assumes the network was trained on RGB-ordered
        # input as well - confirm against the training notebook.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Reorder axes from (H, W, C) to (C, H, W)
        test_frame = np.asarray(rgb_frame, dtype=np.float32).transpose(2, 0, 1)
        # Subtract the mean image
        test_frame = test_frame - mean_image
        test_frame = test_frame.reshape(1, 3, 224, 224)
        # Feed the trained model. Train mode is switched off so dropout does
        # not randomise the prediction, and no backprop graph is kept.
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            y, = model.predictor(Variable(test_frame))
        # The first 35 scores are laid out as a 5-row x 7-column grid
        view_point = y.data[0:35].reshape(5, 7)
        # Stretch the scores to 0..255 for display; casting the raw scores
        # straight to uint8 would wrap negative / large values.
        lo = float(view_point.min())
        hi = float(view_point.max())
        if hi > lo:
            image = ((view_point - lo) * (255.0 / (hi - lo))).astype(np.uint8)
        else:
            image = np.zeros(view_point.shape, dtype=np.uint8)
        # Best cell of the flattened grid -> (row, column)
        point = int(np.argmax(view_point))
        p_y, p_x = divmod(point, 7)
        plist_x.insert(0, p_x)
        plist_x.pop()
        plist_y.insert(0, p_y)
        plist_y.pop()

        cv2.putText(frame, str(np.mean(plist_x))+','+str(np.mean(plist_y)), (0, 50), cv2.FONT_HERSHEY_PLAIN, 3, (0, 255, 0), 3, cv2.LINE_AA)
        cv2.imshow('camera capture', frame)
        cv2.imshow('View point', image)
        # Esc (key code 27) leaves the loop; mask to the low byte because
        # some platforms set extra high bits in the returned code.
        k = cv2.waitKey(1)
        if k & 0xFF == 27:
            break
finally:
    # Always free the camera and close the windows, including when the cell
    # is interrupted from the notebook (KeyboardInterrupt).
    cap.release()
    cv2.destroyAllWindows()

KeyboardInterrupt: 

In [None]:
# Inspect the shape of the most recent frame left over from the capture loop
frame.shape