# Using convolutional neural nets to detect facial keypoints tutorial
### Implemented in Torch7
---


## See the detailed tutorial: [kaggle-tutorial](http://danielnouri.org/notes/2014/12/17/using-convolutional-neural-nets-to-detect-facial-keypoints-tutorial/)

---

## Prerequisites

I assume you already have torch7, csvigo, and image installed.


In [None]:
-- Dataset configuration shared by the cells below.

-- Side length of the square images, in pixels.
IMG_DIM = 96

-- Divisor applied to raw pixel values when they are loaded.
MAX_PIXEL_VAL = 255.0
-- Number of target columns per image.
MAX_FEATURE = 30
-- Row counts used as loop bounds for training.csv, test.csv and
-- IdLookupTable.csv respectively.
MAX_TRAIN_IMG = 7049
MAX_TEST_IMG = 1783
MAX_TEST_OUTPUT = 27124

-- Every data file lives under this directory.
FILEPATH_DATA_DIR = "../data/"

-- Prefix a file name with the data directory.
local function in_data_dir(file_name)
    return FILEPATH_DATA_DIR .. file_name
end

FILEPATH_TRAIN = in_data_dir("training.csv")
FILEPATH_TEST = in_data_dir("test.csv")
FILEPATH_TEST_FEATURE = in_data_dir("IdLookupTable.csv")
FILEPATH_TEST_OUTPUT = in_data_dir("test_output.csv")

In [None]:
-- Lua Library about csv
require 'csvigo'

In [None]:
-- Read only the header row of the training CSV to recover the column names.
-- assert() turns a missing/unreadable file into an error that names the path
-- instead of a later "attempt to index a nil value".
csvFile = assert(io.open(FILEPATH_TRAIN, 'r'))
header = csvFile:read()
-- The handle is only needed for the header line; close it right away so it
-- is not left open for the rest of the session.
csvFile:close()
assert(header, "no header line found in " .. FILEPATH_TRAIN)
feature_map = header:split(',')

-- original index 31 -> "Image"  but, there is error.
-- So remove and insert.
-- NOTE(review): presumably the parsed last column name carries a stray
-- line-ending character; replacing it with a clean "Image" key avoids that.
table.remove(feature_map, 31)
table.insert(feature_map, "Image")

In [None]:
-- Allocate the training tensors (one row per sample) and parse the CSV.
local pixels_per_image = IMG_DIM * IMG_DIM

-- One flat row of pixels per training image.
image_data = torch.Tensor(MAX_TRAIN_IMG, pixels_per_image)
-- One row of MAX_FEATURE target values per training image.
feature_data = torch.Tensor(MAX_TRAIN_IMG, MAX_FEATURE)

-- Whole training CSV as loaded by csvigo; indexed by column name below.
csvigoFile = csvigo.load(FILEPATH_TRAIN)

In [None]:
-- Fill image_data and feature_data from the parsed training CSV.
--
-- Column lookups do not depend on the row, so they are resolved once up
-- front instead of once per row and per feature. All temporaries are local
-- so the loop does not leave stray globals behind in the session.

-- The pixel column is the entry right after the MAX_FEATURE target columns.
local image_column = csvigoFile[feature_map[MAX_FEATURE + 1]]

local target_columns = {}
for j = 1, MAX_FEATURE do
    target_columns[j] = csvigoFile[feature_map[j]]
end

for i = 1, MAX_TRAIN_IMG do
    -- Pixels are stored as one space-separated string per image.
    local pixels = image_column[i]:split(' ')

    -- Scale pixel values to [0, 1]
    image_data[i] = torch.Tensor(pixels) / MAX_PIXEL_VAL

    -- feature_data[i] is a view on row i, so writes land in feature_data.
    local targets = feature_data[i]
    for j = 1, MAX_FEATURE do
        local value = tonumber(target_columns[j][i])
        if value ~= nil then
            -- Coordinates are stored as a fraction of the image side.
            targets[j] = value / IMG_DIM
        else
            -- values are missing: mark with the -1 sentinel
            targets[j] = -1
        end
    end
end

In [None]:
-- Spot-check: print the parsed target row of one training sample.
local sample_row = 3000
print(feature_data[sample_row])

In [None]:
-- Cache the parsed tensors on disk (binary format) so the CSV does not have
-- to be parsed again.
local cache_entries = {
    { "feature_data.raw", feature_data },
    { "image_data.raw", image_data },
}

for _, entry in ipairs(cache_entries) do
    local file_name, tensor = entry[1], entry[2]
    torch.save(FILEPATH_DATA_DIR .. file_name, tensor, 'binary')
end

In [None]:
-- Restore the cached tensors and rebuild feature_map from the CSV header.

-- Load one binary-serialised object from the data directory.
local function load_cached(file_name)
    return torch.load(FILEPATH_DATA_DIR .. file_name, 'binary')
end

feature_data = load_cached("feature_data.raw")
image_data = load_cached("image_data.raw")

-- Only the first line (the column names) is needed from the CSV.
csvFile = io.open(FILEPATH_TRAIN, 'r')
header = csvFile:read()
csvFile:close()

feature_map = header:split(',')

-- Replace the parsed entry 31 with a clean "Image" key.
table.remove(feature_map, 31)
feature_map[#feature_map + 1] = "Image"

## The data

The training dataset for the Facial Keypoint Detection challenge consists of 7,049 96x96 gray-scale images.

In [None]:
-- Display training sample `index` with its keypoints drawn as small crosses.
--
-- Reads the globals image_data (one flat row of IMG_DIM*IMG_DIM pixels per
-- sample) and feature_data (MAX_FEATURE values per sample, laid out as
-- x, y pairs stored as a fraction of IMG_DIM, with -1 marking a missing
-- value) and renders the result with itorch.image.
--
-- @param index  row of image_data / feature_data to display
showImage = function(index)
    -- 1D => 2D Tensor. The pixels are copied into a fresh canvas so that
    -- drawing the markers never modifies image_data.
    local canvas = torch.Tensor(IMG_DIM, IMG_DIM):copy(image_data[index])

    -- Represent Feature
    local BLACK, WHITE = 0, 1
    local keypoints = feature_data[index]

    -- Write one pixel, silently ignoring positions outside the image so a
    -- keypoint on (or beyond) the border cannot raise an index error.
    local function put(row, col, value)
        if row >= 1 and row <= IMG_DIM and col >= 1 and col <= IMG_DIM then
            canvas[row][col] = value
        end
    end

    for i = 1, MAX_FEATURE, 2 do
        local fx, fy = keypoints[i], keypoints[i + 1]

        -- A missing keypoint (-1 sentinel) would scale to index -IMG_DIM;
        -- there is nothing to draw for it.
        if fx ~= -1 and fy ~= -1 then
            -- Round to the nearest whole pixel so the tensor indices are
            -- integers.
            local px = math.floor(fx * IMG_DIM + 0.5)
            local py = math.floor(fy * IMG_DIM + 0.5)

            -- Black centre with a white cross around it.
            put(py, px, BLACK)
            put(py - 1, px, WHITE)
            put(py + 1, px, WHITE)
            put(py, px - 1, WHITE)
            put(py, px + 1, WHITE)
        end
    end

    itorch.image(canvas)
end

In [None]:
-- Display the first training sample with its keypoints.
local first_sample = 1
showImage(first_sample)

It's not necessary that you go through every single detail of this function. But let's take a look at what the script above outputs:

In [None]:
-- For every target column, print its name together with the number of
-- training rows whose value is not the -1 "missing" sentinel.
for column = 1, MAX_FEATURE do
    local values = feature_data:select(2, column)
    local present_count = torch.sum(torch.ne(values, -1.0))
    print (feature_map[column], present_count)
end

There are only 2,140 images in the dataset that have all 30 target values present. Initially, we'll train with these 2,140 samples only.

In [None]:
-- Collect the row numbers of all training samples that have every one of
-- the MAX_FEATURE target values present (no -1 sentinel in the row).
-- image_id_map[k] is the row number of the k-th complete sample and
-- num_images is how many there are.
image_id_map = {}

for row = 1, MAX_TRAIN_IMG do
    local present = torch.ne(feature_data:select(1, row), -1.0)
    local is_complete = (torch.sum(present) == MAX_FEATURE)
    if is_complete then
        image_id_map[#image_id_map + 1] = row
    end
end

num_images = #image_id_map
print("Num images with all the 30 feature vectors : " .. num_images)

## Data augmentation

image reference : http://danielnouri.org/notes/2014/12/17/using-convolutional-neural-nets-to-detect-facial-keypoints-tutorial/

![original](http://danielnouri.org/notes/2014/12/17/using-convolutional-neural-nets-to-detect-facial-keypoints-tutorial/)
![flipped](http://danielnouri.org/media/kfkd/samples3.png)

Image =>    Original / Flipped 

Since we're flipping the images, we'll have to make sure we also flip the target values.

In [None]:
-- Pairs of 1-based target-column indices that trade places when an image is
-- mirrored left to right. Each entry is {a, b}: after the flip, column a takes
-- the value of column b and vice versa. Columns not listed keep their position.
-- The flip loop treats odd indices as x coordinates and even indices as y
-- coordinates, so the entries come in x/y couples.
-- NOTE(review): presumably each couple swaps a left-side keypoint with its
-- right-side counterpart (eyes, eyebrows, mouth corners) — confirm against
-- the names printed from feature_map.
flip_indices = { 
    {1, 3}, {2, 4},
    {5, 9}, {6, 10}, {7, 11}, {8, 12},
    {13, 17}, {14, 18}, {15, 19}, {16, 20},
    {23, 25}, {24, 26} 
}

In [None]:
-- Print the column names of every swapped pair as a sanity check.
-- The pair members are kept local so the loop does not leave `original` /
-- `flipped` behind as globals in the session.
for _, pair in ipairs(flip_indices) do
    local original, flipped = pair[1], pair[2]
    print("#", feature_map[original], "->" , feature_map[flipped])
end

In [None]:
-- Storage for the mirrored copy of the training set, same layout as
-- image_data / feature_data.
local pixels_per_image = IMG_DIM * IMG_DIM
flip_image_data = torch.Tensor(MAX_TRAIN_IMG, pixels_per_image)
flip_feature_data = torch.Tensor(MAX_TRAIN_IMG, MAX_FEATURE)

In [None]:
-- Build the left-right mirrored copy of every training sample.
--
-- Everything that does not depend on the sample (the reversed column order
-- and the feature swap table) is computed once up front, and all temporaries
-- are local so nothing leaks into the session as a global.

-- Column order IMG_DIM, IMG_DIM-1, ..., 1 used to mirror each pixel row.
local reversed_columns = torch.LongTensor(IMG_DIM)
for k = 1, IMG_DIM do
    reversed_columns[k] = IMG_DIM - k + 1
end

-- source_of[j] is the original column whose value ends up in column j of
-- the flipped target row; columns without a partner map to themselves.
local source_of = {}
for j = 1, MAX_FEATURE do
    source_of[j] = j
end
for _, pair in ipairs(flip_indices) do
    source_of[pair[1]] = pair[2]
    source_of[pair[2]] = pair[1]
end

for i = 1, MAX_TRAIN_IMG do
    -- Flip left to right: view the flat row as a 2D image, reorder its
    -- columns, and copy the result into row i of flip_image_data.
    local original_image = image_data[i]:view(IMG_DIM, IMG_DIM)
    local mirrored_image = original_image:index(2, reversed_columns)
    flip_image_data[i]:view(IMG_DIM, IMG_DIM):copy(mirrored_image)

    local targets = feature_data[i]
    -- View on row i, so writes land in flip_feature_data.
    local flipped_targets = flip_feature_data[i]
    for j = 1, MAX_FEATURE do
        local source = source_of[j]
        local value = targets[source]

        -- Odd columns are x coordinates (stored as a fraction of the image
        -- side) and are mirrored as 1 - x; the -1 "missing" sentinel and the
        -- y coordinates are carried over unchanged.
        if source % 2 == 1 and value ~= -1 then
            value = 1 - value
        end
        flipped_targets[j] = value
    end
end

In [None]:
-- Cache the mirrored tensors on disk (binary format).
local flipped_cache = {
    { "flip_feature_data.raw", flip_feature_data },
    { "flip_image_data.raw", flip_image_data },
}

for _, entry in ipairs(flipped_cache) do
    local file_name, tensor = entry[1], entry[2]
    torch.save(FILEPATH_DATA_DIR .. file_name, tensor, 'binary')
end

In [None]:
-- Restore the mirrored tensors from their on-disk cache.

-- Load one binary-serialised object from the data directory.
local function load_cached(file_name)
    return torch.load(FILEPATH_DATA_DIR .. file_name, 'binary')
end

flip_feature_data = load_cached("flip_feature_data.raw")
flip_image_data = load_cached("flip_image_data.raw")

In [None]:
-- Load Image.
-- Build a 2D image of sample `index` with its keypoints drawn as crosses.
--
-- @param index        row to render
-- @param imagedata    tensor with one flat row of IMG_DIM*IMG_DIM pixels
--                     per sample
-- @param featuredata  tensor with MAX_FEATURE values per sample, laid out as
--                     x, y pairs stored as a fraction of IMG_DIM; -1 marks
--                     a missing value
-- @return a new IMG_DIM x IMG_DIM tensor; the inputs are not modified
loadImageWithFeature = function(index, imagedata, featuredata)
    -- Copy the flat pixel row into a fresh 2D canvas.
    local canvas = torch.Tensor(IMG_DIM, IMG_DIM):copy(imagedata[index])

    -- Represent Feature
    local BLACK, WHITE = 0, 1
    local keypoints = featuredata[index]

    -- Write one pixel, silently ignoring positions outside the image. This
    -- matters for keypoints on the border and for model predictions, which
    -- are not guaranteed to fall inside the image.
    local function put(row, col, value)
        if row >= 1 and row <= IMG_DIM and col >= 1 and col <= IMG_DIM then
            canvas[row][col] = value
        end
    end

    for i = 1, MAX_FEATURE, 2 do
        local fx, fy = keypoints[i], keypoints[i + 1]

        -- A missing keypoint (-1 sentinel) would scale to index -IMG_DIM;
        -- there is nothing to draw for it.
        if fx ~= -1 and fy ~= -1 then
            -- Round to the nearest whole pixel so the tensor indices are
            -- integers.
            local px = math.floor(fx * IMG_DIM + 0.5)
            local py = math.floor(fy * IMG_DIM + 0.5)

            -- Black centre with a white cross around it.
            put(py, px, BLACK)
            put(py - 1, px, WHITE)
            put(py + 1, px, WHITE)
            put(py, px - 1, WHITE)
            put(py, px + 1, WHITE)
        end
    end

    return canvas
end

In [None]:
-- Check flip image: render the same sample from the original and from the
-- mirrored data set and show both together.
local sample = 1
image1 = loadImageWithFeature(sample, image_data, feature_data)
flip_image1 = loadImageWithFeature(sample, flip_image_data, flip_feature_data)

local side_by_side = { image1, flip_image1 }
itorch.image(side_by_side)

In [None]:
-- Reshape 1D -> 2D
-- For every training sample, reshape the flat pixel row to 96x96 and assign
-- it back into the same row of image_data / flip_image_data.
-- NOTE(review): assigning a tensor into a row copies its elements into the
-- existing storage, so this presumably leaves the shape of image_data and
-- flip_image_data unchanged; later cells still call :view(1, 96, 96) on each
-- row. Confirm whether this loop is needed at all.
-- NOTE(review): 96 is hard-coded here; it matches IMG_DIM.
for i=1, MAX_TRAIN_IMG do
    image_data[i] = torch.reshape(image_data[i], 96, 96)
    flip_image_data[i] = torch.reshape(flip_image_data[i], 96, 96)
end

In [None]:
require 'nn'
require 'image'
require 'optim'

In [None]:
-- Convolutional Network
--
-- Three identical conv stages (convolution -> ReLU -> max pooling) followed
-- by a three-layer fully connected regressor producing MAX_FEATURE outputs.

model = nn.Sequential()

nfeats = 1                          -- input planes (one gray-scale channel)
nstates = {32, 64, 128, 500, 500}   -- 3 conv widths, then 2 hidden layer widths
filtsize = 3
padding = (filtsize-1)/2            -- keeps the spatial size through each convolution
poolsize = 2
noutputs = MAX_FEATURE

-- stages 1-3 : filter bank -> ReLU -> max pooling
-- `side` tracks the spatial size after each pooling step so the size of the
-- flattened feature map is derived from IMG_DIM and poolsize instead of
-- being hard-coded.
local in_planes = nfeats
local side = IMG_DIM
for stage = 1, 3 do
    model:add(nn.SpatialConvolutionMM(in_planes, nstates[stage], filtsize, filtsize, 1, 1, padding, padding))
    model:add(nn.ReLU())
    model:add(nn.SpatialMaxPooling(poolsize,poolsize,poolsize,poolsize))

    in_planes = nstates[stage]
    side = math.floor(side / poolsize)
end

-- stage 4 : dropout + standard 3-layer neural network
local flat_size = nstates[3] * side * side
model:add(nn.View(flat_size))
model:add(nn.Dropout(0.5))
model:add(nn.Linear(flat_size, nstates[4]))
model:add(nn.ReLU())
model:add(nn.Linear(nstates[4], nstates[5]))
model:add(nn.ReLU())
model:add(nn.Linear(nstates[5], noutputs))

In [None]:
-- Split the complete samples 80 / 20 into training and validation counts.
local TRAIN_PERCENT = 80
num_images_training = math.floor((TRAIN_PERCENT * num_images) / 100)
-- Whatever is left after the training share is used for validation.
num_images_validating = num_images - num_images_training

local split_summary = "Num Train Images : " .. num_images_training
    .. " Num Validating Images : " .. num_images_validating
print(split_summary)

In [None]:
-- Training configuration.
threadNum = 3
seedNum = 1
batchSize = 64  -- 1 : pure stochastic
epochSize = 1000

torch.setnumthreads(threadNum)
torch.manualSeed(seedNum)

-- Mean squared error between predicted and target keypoints.
criterion = nn.MSECriterion()

-- Flattened views of all model parameters (x) and their gradients (dl_dx).
x, dl_dx = model:getParameters()

-- Optimiser settings; the trailing comments keep the alternative values
-- that were noted next to the originals.
sgd_params = {
   learningRate = 0.01,       -- alternative: 1e-3
   learningRateDecay = 0.001, -- alternative: 1e-4
   weightDecay = 0,
   momentum = 0.9,
}

In [None]:
-- Closure handed to the optimiser: evaluates loss and gradient on the next
-- mini-batch.
--
-- Each call consumes the next batchSize entries of shuffle_idx (wrapping
-- around at num_images_training) and, for every entry, runs both the
-- original image and its mirrored copy through the model.
--
-- @param x_new  parameter vector proposed by the optimiser
-- @return mean loss over the samples of this batch, and dl_dx holding the
--         mean gradient over the same samples
feval = function(x_new)
    if x ~= x_new then
        x:copy(x_new)
    end

    dl_dx:zero()
    local loss_x = 0
    local sample_count = 0

    -- Forward/backward one sample; gradients accumulate into dl_dx.
    local function accumulate(inputs, target)
        local loss = criterion:forward(model:forward(inputs), target)
        model:backward(inputs, criterion:backward(model.output, target))
        sample_count = sample_count + 1
        return loss
    end

    for _ = 1, batchSize do
        -- Global cursor into shuffle_idx that persists across calls. It must
        -- read its own previous value: reading the misspelled `_nidx` (always
        -- nil) pinned the cursor at 1, so every step trained on one image.
        _nidx_ = (_nidx_ or 0) + 1
        if _nidx_ > num_images_training then _nidx_ = 1 end

        if _nidx_ % 50 == 0 then
            collectgarbage()
        end

        local image_id = image_id_map[shuffle_idx[_nidx_]]

        loss_x = loss_x + accumulate(
            image_data[image_id]:view(1, 96, 96),
            feature_data[image_id])

        loss_x = loss_x + accumulate(
            flip_image_data[image_id]:view(1, 96, 96),
            flip_feature_data[image_id])
    end

    -- Average over the number of samples actually processed (two per batch
    -- entry), so the returned loss is a true mean and the gradient matches
    -- it. Dividing by batchSize alone reported twice the mean; note that this
    -- also halves the gradient scale compared to that earlier normalisation.
    loss_x = loss_x / sample_count
    dl_dx:div(sample_count)

    return loss_x, dl_dx
end

In [None]:
-- Main training loop: one pass over the (reshuffled) training samples per
-- epoch, followed by validation on the held-out samples and a checkpoint
-- every 50 epochs.
for epoch = 1, epochSize do
    model:training()
    shuffle_idx = torch.randperm(num_images_training)

    current_loss = 0
    local num_batches = 0

    local train_start = sys.clock()
    for _ = 1, num_images_training, batchSize do
        local _, fs = optim.nag(feval, x, sgd_params)
        current_loss = current_loss + math.sqrt(fs[1])
        num_batches = num_batches + 1
    end
    local train_time = (sys.clock() - train_start) / num_images_training

    -- Mean over the batches that were actually run.
    current_loss = current_loss / num_batches
    print(epoch .. ' current loss = ' .. current_loss)
    print("==> time to learn 1 sample = " .. (train_time * 1000) .. ' ms')

    model:evaluate()
    local validation_loss = 0.0
    local validation_start = sys.clock()
    -- Validation samples are the entries of image_id_map AFTER the training
    -- share; entry num_images_training itself belongs to the training set.
    for i = num_images_training + 1, num_images do
        local image_id = image_id_map[i]
        local inputs = image_data[image_id]:view(1, 96, 96)

        local target = feature_data[image_id]
        local forward_output = model:forward(inputs)

        -- Only targets that are present (not the -1 sentinel) contribute to
        -- the error; missing ones are masked to a zero residual.
        local present = torch.ne(target, -1.0)
        local residual = torch.cmul(
            target:double() - forward_output:double(),
            present:double())

        -- Root mean squared error over the present targets of this sample.
        local rmse = torch.norm(residual) / math.sqrt(torch.sum(present))
        validation_loss = validation_loss + rmse
    end
    print(epoch.. ' current validation loss ' .. validation_loss / num_images_validating)

    -- Per-sample time is relative to the number of validated samples.
    local validation_time = (sys.clock() - validation_start) / num_images_validating
    print("==> time to validate 1 sample = " .. (validation_time * 1000) .. ' ms')

    if(epoch % 50 == 0) then
        local modsav = model:clone('weight', 'bias')
        torch.save(FILEPATH_DATA_DIR .. 'trained_model_' .. epoch .. '.t7' , modsav)
    end
end

In [None]:
-- Prepare everything needed to predict on the test set.
-- From here on, tensors created with torch.Tensor are FloatTensors.
torch.setdefaulttensortype('torch.FloatTensor')

-- Reverse lookup: target column name -> column index.
inv_feature_map = {}
for column = 1, MAX_FEATURE do
    local column_name = feature_map[column]
    inv_feature_map[column_name] = column
end

-- Test images and the table listing which predictions have to be submitted.
testImageFile = csvigo.load(FILEPATH_TEST)
testFeatureFile = csvigo.load(FILEPATH_TEST_FEATURE)

local pixels_per_image = IMG_DIM * IMG_DIM
test_data = torch.Tensor(MAX_TEST_IMG, pixels_per_image)
-- From this point feature_data is reallocated for the test set; it no longer
-- holds the training targets.
feature_data = torch.Tensor(MAX_TEST_IMG, MAX_FEATURE)

-- Checkpoint written by the training loop.
savedModel = torch.load(FILEPATH_DATA_DIR .. "trained_model_400.t7")

In [None]:
-- Run the saved model over every test image.
-- Fills test_data with the scaled pixels and feature_data with the
-- predicted keypoints.

-- The network contains a Dropout layer; make sure it is in evaluation mode
-- regardless of the mode the checkpoint was saved in.
savedModel:evaluate()

for i = 1, MAX_TEST_IMG do
    -- Pixels are stored as one space-separated string per image.
    local pixels = testImageFile["Image"][i]:split(' ')
    local input_1d = torch.Tensor(pixels) / MAX_PIXEL_VAL
    test_data[i] = input_1d

    -- The inputs are converted to double before the forward pass.
    local inputs = input_1d:view(1, IMG_DIM, IMG_DIM):double()
    local prediction = savedModel:forward(inputs)

    -- copy() converts the prediction to feature_data's tensor type.
    feature_data[i]:copy(prediction)
end

In [None]:
-- Write the submission file: one (RowId, Location) line per requested
-- prediction listed in the lookup table.
testOutputFile = csvigo.File(FILEPATH_TEST_OUTPUT, "w")
testOutputFile:write({"RowId", "Location"})

-- Strip leading and trailing whitespace from s.
function trim1(s)
    return (s:gsub("^%s*(.-)%s*$", "%1"))
end

for i = 1, MAX_TEST_OUTPUT do
    -- CSV fields arrive as strings; convert explicitly instead of relying
    -- on implicit string -> number coercion when indexing the tensor.
    local imageId = tonumber(testFeatureFile["ImageId"][i])
    local featureName = trim1(testFeatureFile["FeatureName"][i])
    local featureId = inv_feature_map[featureName]

    -- Fail with a message that names the offending row rather than with a
    -- nil-index error; close the file first so the handle is not left open.
    if imageId == nil or featureId == nil then
        testOutputFile:close()
        error("lookup row " .. i .. ": unknown image id or feature name '"
            .. tostring(featureName) .. "'")
    end

    -- Predictions are fractions of the image side; convert to pixels and
    -- clamp to the valid pixel range [0, IMG_DIM - 1].
    local location = feature_data[imageId][featureId] * IMG_DIM
    location = math.max(0, math.min(IMG_DIM - 1, location))

    print(imageId, featureName, location)

    testOutputFile:write({i, location})
end
testOutputFile:close()

In [None]:
-- Render one test image with its predicted keypoints and display it.
local sample = 1000
predict_image = loadImageWithFeature(sample, test_data, feature_data)

itorch.image(predict_image)