# Style Transfer with Deep Learning using the VGG-19 network

In [None]:
import os
from six.moves import urllib
from scipy.io import loadmat
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from scipy.misc import imresize
%matplotlib inline


def download_hook(count, block_size, total_size):
    """Print a single-line text progress bar for a running download.

    Meant to be passed as ``reporthook`` to ``urllib.request.urlretrieve``.
    Output is refreshed every 20th block and once the download is complete;
    the line ends with a carriage return so the next update overwrites it.

    Args:
        count: Number of blocks transferred so far.
        block_size: Size of one block in bytes.
        total_size: Total size in bytes. Zero or a negative value means the
            size is unknown; then only the transferred amount is printed.
    """
    bytes_per_mb = 1024.0 * 1024.0
    downloaded = count * block_size

    if total_size <= 0:
        # No usable total: a percentage would divide by zero or go negative.
        if count % 20 == 0:
            print('%02.02f MB' % (downloaded / bytes_per_mb), end='\r')
        return

    # The last block usually overshoots the total, so test with >= and not ==.
    finished = downloaded >= total_size
    if count % 20 != 0 and not finished:
        return

    # Clamp so that the bar never reports more than 100 %.
    downloaded = min(downloaded, total_size)
    percentage = 100.0 * downloaded / total_size
    filled = int(percentage / 2.0)
    barstring = "[" + "=" * filled + ">" + "." * (50 - filled) + "]"
    outstring = '%02.02f%% (%02.02f of %02.02f MB)\t\t' + barstring
    print(outstring % (percentage, downloaded / bytes_per_mb, total_size / bytes_per_mb), end='\r')

The VGG-19 model is quite large, so be a little patient.

In [None]:
# Pretrained VGG-19 weights; the file is only fetched when it is not on disk yet.
path = "http://www.vlfeat.org/matconvnet/models/imagenet-vgg-verydeep-19.mat"
fname = "vgg-19.mat"
if not os.path.exists(fname):
    print("Downloading ...")
    filepath, _ = urllib.request.urlretrieve(path, filename=fname, reporthook=download_hook)
    print("Done.")

# Example images. Each file is checked on its own, so a missing style image is
# fetched even when the content image is already present.
# NOTE(review): both URLs are empty here; fill them in or place the two files
# next to the notebook, otherwise urlretrieve fails on the empty URL.
image_urls = {
    "content.jpg": "",  # Attribution: Gage Skidmore
    "style.jpg": "",
}
for image_name, image_url in image_urls.items():
    if not os.path.exists(image_name):
        urllib.request.urlretrieve(image_url, filename=image_name)

Find a description (in the form of python code) of the loaded model at
https://github.com/chiphuyen/stanford-tensorflow-tutorials/blob/master/assignments/style_transfer/vgg_model.py

In [None]:
# Read the downloaded .mat file and keep only the first row of its "layers"
# entry; the accessor helpers below index into this array once per layer.
original_layers = loadmat(fname)["layers"][0]
# Last expression of the cell: displays the shape of the layer array.
original_layers.shape

The downloaded file contains the VGG-19 model, consisting of 43 trained layers. You can access the necessary information as follows:
* The name of layer `i` can be found by accessing `original_layers[i][0][0][0][0]`
* The layer weight matrix can be found by accessing `original_layers[i][0][0][2][0][0]`
* The bias can be found by accessing `original_layers[i][0][0][2][0][1]`

In [None]:
def get_layer_name(i):
    """Return the name stored in record ``i`` of ``original_layers``."""
    layer_record = original_layers[i][0][0]
    name_field = layer_record[0]
    return name_field[0]

def get_layer_weights(i):
    """Return the weight array stored in record ``i`` of ``original_layers``."""
    layer_record = original_layers[i][0][0]
    # Field 2 holds the layer parameters; entry 0 is the weights, entry 1 the bias.
    parameters = layer_record[2][0]
    return parameters[0]

def get_layer_bias(i):
    """Return the bias array stored in record ``i`` of ``original_layers``."""
    layer_record = original_layers[i][0][0]
    # Field 2 holds the layer parameters; entry 0 is the weights, entry 1 the bias.
    parameters = layer_record[2][0]
    return parameters[1]
    
def get_layer_params(i):
    """Return the ``(weights, bias)`` pair of layer ``i``."""
    weights = get_layer_weights(i)
    bias = get_layer_bias(i)
    return weights, bias

# Names of all layers, in the same order as the records in original_layers.
layer_names = list(map(get_layer_name, range(len(original_layers))))


def get_layer_by_name(name):
    """Return the position of ``name`` within ``layer_names``.

    Raises:
        ValueError: If no layer carries that name.
    """
    position = layer_names.index(name)
    return position

Let's get an intuition of what the VGG-19 network looks like:

In [None]:
# Show every layer name on one line.
print(", ".join(layer_names))

# Split the names into convolution and pooling layers by their prefix.
conv_layers = []
pool_layers = []
for layer_name in layer_names:
    if layer_name.startswith("conv"):
        conv_layers.append(layer_name)
    if layer_name.startswith("pool"):
        pool_layers.append(layer_name)

print(conv_layers)

The network consists of a series of convolutional layers that make sense of the input content. Finally, the network decides what is depicted in the input image by passing the extracted features through some fully connected layers followed by a softmax function.

We now have an understanding of what the VGG-19 network consists of, and we also are able to access its weights and biases. What we need to do next is move the model to tensorflow. We only need to rebuild those parts that we're going to use later on--that is, we can ignore the fully connected layers, as they are only needed for making guesses about the kind of object depicted in the input image. Everything that's really interesting happens in the convolutional layers.

This means: We are not trying to rebuild the complete model. We will take the convolutional layers that are needed to reason about the image style and image content, and we will only work with those layers.

In [None]:
def create_activated_convlayer(prev, i):
    """Build the ``i``-th convolution from ``conv_layers`` on top of ``prev``.

    The pretrained weights and bias are embedded as constants, so this layer
    is not trained. The result is passed through a ReLU.

    Args:
        prev: Tensor that feeds the convolution.
        i: Position of the layer within ``conv_layers``.

    Returns:
        The ReLU-activated convolution output.
    """
    weights, bias = get_layer_params(get_layer_by_name(conv_layers[i]))
    kernel = tf.constant(weights)
    # Flatten the stored bias to one dimension before adding it.
    flat_bias = tf.constant(np.reshape(bias, (bias.size)))
    pre_activation = tf.nn.conv2d(prev, filter=kernel, strides=[1, 1, 1, 1], padding='SAME') + flat_bias
    return tf.nn.relu(pre_activation)

def create_pool_layer(prev):
    """Return a 2x2 average pooling of ``prev`` with stride 2 and 'SAME' padding."""
    pool_window = [1, 2, 2, 1]
    pool_stride = [1, 2, 2, 1]
    return tf.nn.avg_pool(prev, ksize=pool_window, strides=pool_stride, padding='SAME')

def get_next_convlayer(i, prev):
    """Create convolution ``i`` on top of ``prev`` and advance the counter.

    Returns:
        A tuple ``(i + 1, layer)`` where ``layer`` is the activated convolution.
    """
    following_index = i + 1
    layer = create_activated_convlayer(prev, i)
    return following_index, layer

def get_next_convlayer_name(i):
    """Return the entry of ``conv_layers`` at position ``i``."""
    layer_name = conv_layers[i]
    return layer_name

def get_last_convlayer_name(i):
    """Return the entry of ``conv_layers`` just before position ``i``."""
    previous_position = i - 1
    return conv_layers[previous_position]

_"For image synthesis we found that replacing the
max-pooling operation by average pooling improves the gradient flow and one obtains slightly
more appealing results, which is why the images shown were generated with average pooling."_

Let's keep this in mind and rebuild the model parts that we're going to need.

In [None]:
# Start from an empty default graph so that re-running this cell does not
# pile up duplicate ops.
tf.reset_default_graph()

# Maps a layer name ('in', the conv layer names, 'avgpool1'..'avgpool5') to its tensor.
model = {}

content = plt.imread("content.jpg")    
style   = plt.imread("style.jpg")   


# Both image dimensions are integer-divided by this factor to keep the graph small.
scale_down = 5


height = content.shape[0] // scale_down
width = content.shape[1] // scale_down
# Position in conv_layers of the next convolution to create.
index = 0

# The input is a Variable, not a placeholder, because the image itself is optimized.
model['in'] = tf.Variable(np.zeros((1, height, width, 3)), dtype=tf.float32)
# How the tuple assignments below work: the right-hand side is evaluated first,
# then `index` is rebound to the incremented counter, and only afterwards is the
# subscript target evaluated. `get_next_convlayer_name(index - 1)` therefore
# yields the name of the layer that was just created.
# Block 1: two convolutions followed by average pooling.
index, model[get_next_convlayer_name(index - 1)] = get_next_convlayer(index, model['in'])
index, model[get_next_convlayer_name(index - 1)] = get_next_convlayer(index, model[get_last_convlayer_name(index)])
model['avgpool1'] = create_pool_layer(model[get_last_convlayer_name(index)])
# Block 2: two convolutions followed by average pooling.
index, model[get_next_convlayer_name(index - 1)] = get_next_convlayer(index, model['avgpool1'])
index, model[get_next_convlayer_name(index - 1)] = get_next_convlayer(index, model[get_last_convlayer_name(index)])
model['avgpool2'] = create_pool_layer(model[get_last_convlayer_name(index)])
# Block 3: one convolution on the pooled output plus three chained ones, then pooling.
index, model[get_next_convlayer_name(index - 1)] = get_next_convlayer(index, model['avgpool2'])
for i in range(3):
    index, model[get_next_convlayer_name(index - 1)] = get_next_convlayer(index, model[get_last_convlayer_name(index)])
model['avgpool3'] = create_pool_layer(model[get_last_convlayer_name(index)])
# Block 4: same layout as block 3.
index, model[get_next_convlayer_name(index - 1)] = get_next_convlayer(index, model['avgpool3'])
for i in range(3):
    index, model[get_next_convlayer_name(index - 1)] = get_next_convlayer(index, model[get_last_convlayer_name(index)])
model['avgpool4'] = create_pool_layer(model[get_last_convlayer_name(index)])
# Block 5: same layout as block 3.
index, model[get_next_convlayer_name(index - 1)] = get_next_convlayer(index, model['avgpool4'])
for i in range(3):
    index, model[get_next_convlayer_name(index - 1)] = get_next_convlayer(index, model[get_last_convlayer_name(index)])
model['avgpool5'] = create_pool_layer(model[get_last_convlayer_name(index)])

In [None]:
# Display the dictionary that maps layer names to their tensors.
model

For working with input images, the mean of each color channel has to be subtracted, so that our images have the same format as the images the VGG-19 model was trained with.

In [None]:
# Per-channel mean pixel values of the data VGG was trained on, in R, G, B
# order to match the channel order of images loaded with plt.imread.
# The fixed training-set means are used instead of the means of the current
# image, because the pretrained weights expect inputs centred the same way
# as during training.
means = np.reshape([123.68, 116.779, 103.939], (1, 1, 3))

def preprocess_image(img_in):
    """Resize an image to the model input size and subtract the channel means.

    Args:
        img_in: Image array as returned by ``plt.imread``.

    Returns:
        A tuple ``(means, img)``: the module-level ``means`` that were
        subtracted, and the mean-centred image with a leading batch axis.
    """
    # NOTE(review): scipy.misc.imresize is no longer shipped by recent SciPy
    # releases - confirm the installed version still provides it.
    resized = imresize(img_in.astype("float32"), (height, width))
    centred = resized - means
    return means, centred[np.newaxis]
    
def unprocess_image(img_in, mean_values=None):
    """Turn a preprocessed image batch back into a displayable image.

    Takes the first image of the batch, adds the channel means back and
    clips the result to the valid 8-bit range.

    Args:
        img_in: Array with a leading batch axis, as produced by
            ``preprocess_image`` or read from the model input.
        mean_values: Channel means to add back. Defaults to the module-level
            ``means``; callers may pass the means returned by
            ``preprocess_image`` explicitly.

    Returns:
        The first image of the batch as a ``uint8`` array.
    """
    if mean_values is None:
        mean_values = means
    restored = img_in[0] + mean_values
    return np.clip(restored, 0, 255).astype('uint8')
    
    
# Bring both images into the format expected by the network.
means_c, processed_content = preprocess_image(content)
means_s, processed_style = preprocess_image(style)

# Round trip back to displayable images to check the preprocessing visually.
unprocessed = unprocess_image(processed_content)
unprocessed_style = unprocess_image(processed_style)

for preview in (unprocessed, unprocessed_style):
    plt.figure(figsize=(10, 10))
    plt.axis('off')
    plt.imshow(preview.astype('uint8'))
    plt.show()

_"For the images […] we matched the content representation on layer ‘conv4 2’ and the
style representations on layers ‘conv1 1’, ‘conv2 1’, ‘conv3 1’, ‘conv4 1’ and ‘conv5 1’"_



Higher convolutional layers are good at capturing the overall content of an image. The lowest layers catch simple features (like horizontal and vertical edges or curves), whereas the layers in between capture features of medium complexity (like noses and eyes or more complex style patterns). Hence, we can take a high-level layer and see how it reacts when receiving the content image. The activation values can be interpreted as a representation of the content we would like to preserve. To get a representation of the style we would like to apply, we look at how different layers react when receiving the style image as an input. 



_You may want to play around with the layers and weights; different choices will lead to differing results._

In [None]:
# Layer whose activations represent the content, and the layers whose
# activations represent the style (one weight per style layer).
content_layer = 'conv4_2'
style_layers = ['conv1_1', 'conv2_1', 'conv3_1', 'conv4_1', 'conv5_1']
style_weights = [0.8] * len(style_layers)


sess = tf.InteractiveSession()
init_op = tf.global_variables_initializer()
sess.run(init_op)

# Record the target activations by feeding each image into the input variable.
content_tensor = model[content_layer]
content_features = sess.run(content_tensor, feed_dict={model['in']: processed_content})

style_tensors = [model[name] for name in style_layers]
style_features = sess.run(style_tensors, feed_dict={model['in']: processed_style})

For the content loss, we will basically compare the current activations of the content layer with the saved values it produced when receiving the actual content image. The style loss is a bit more complicated and involves the calculation of _gram matrices_. Take a look at the original paper if you are more interested in this topic.

https://arxiv.org/pdf/1508.06576.pdf

We also calculate the _variation loss_. This will be added to the total loss and makes sure that neighboring pixels are relatively similar, so as to avoid clutter.

In [None]:
def content_loss(p):
    """Return the scaled squared error between ``p`` and ``content_features``.

    Args:
        p: Tensor to compare against the recorded ``content_features``.

    Returns:
        The sum of squared differences, divided by twice the number of
        elements in one sample of ``content_features``.
    """
    size = np.prod(content_features.shape[1:])
    squared_error = tf.pow((p - content_features), 2)
    scale = 1 / (2.0 * size)
    return scale * tf.reduce_sum(squared_error)
    

def gram_matrix(features, n, m):
    """Return the Gram matrix of a feature map.

    Args:
        features: Activations that can be reshaped to ``(m, n)``.
        n: Number of columns after the reshape (the caller passes the channel count).
        m: Number of rows after the reshape (the caller passes height * width).

    Returns:
        The ``(n, n)`` product of the transposed and the reshaped features.
    """
    # The original reshaped an undefined name `f`; the argument is `features`.
    features_t = tf.reshape(features, (m, n))
    return tf.matmul(tf.transpose(features_t), features_t)

def style_loss(a, x):
    """Return the scaled squared difference between two Gram matrices.

    Args:
        a: Recorded style activations; their shape determines the reshape
            sizes used for both inputs.
        x: Activations of the same layer for the image being optimized.

    Returns:
        The summed squared Gram difference, divided by ``4 * n**2 * m**2``.
    """
    channels = a.shape[3]
    positions = a.shape[1] * a.shape[2]
    target_gram = gram_matrix(a, channels, positions)
    current_gram = gram_matrix(x, channels, positions)
    squared_diff = tf.pow(current_gram - target_gram, 2)
    normaliser = 1 / (4 * channels**2 * positions**2)
    return normaliser * tf.reduce_sum(squared_diff)

def var_loss(x):
    """Return a variation penalty on differences between neighboring pixels.

    Squared differences to the right-hand and lower neighbor are added per
    pixel, raised to the power 1.25 and summed.
    """
    static_shape = x.get_shape().as_list()
    h, w = static_shape[1], static_shape[2]
    # Drop the last row and column so that every pixel has both neighbors.
    base = x[:, :h - 1, :w - 1, :]
    right_neighbor = x[:, :h - 1, 1:, :]
    lower_neighbor = x[:, 1:, :w - 1, :]
    dx = tf.square(base - right_neighbor)
    dy = tf.square(base - lower_neighbor)
    return tf.reduce_sum(tf.pow(dx + dy, 1.25))

The $\alpha$-, $\beta$- and $\gamma$-values are used to balance the content, style and variation loss, respectively. If your results are not satisfying, these are some values you might want to adjust.

In [None]:
# Weights of the content, style and variation terms in the total loss.
alpha = 1
beta = 100
gamma = 0.1

# One style-loss term per style layer, combined using the per-layer weights.
e = [style_loss(target, model[name]) for target, name in zip(style_features, style_layers)]
styleloss = sum(style_weights[k] * e[k] for k, _ in enumerate(style_layers))
contentloss = content_loss(model[content_layer])
varloss = var_loss(model['in'])

total_loss = alpha * contentloss + beta * styleloss + gamma * varloss

The authors of the original paper proposed to assign white noise to the input of our network and let the network transform this noise into a combined representation of the desired content and style. However, we can make the task of restoring the image content easier by inputting a noisy representation of our content image to the network. 

In [None]:
# Blend uniform noise with the preprocessed content image to get the start image.
noise_ratio = 0.7
content_ratio = 1.0 - noise_ratio
noise = np.random.uniform(low=-15, high=15, size=processed_content.shape)
input_image = content_ratio * processed_content + noise_ratio * noise

# Preview the start image.
unp = unprocess_image(input_image)
plt.imshow(unp.astype("uint8"))
plt.show()

It was possible to train the network using a large learning rate, but you might want to adjust these settings.

In [None]:
# The only trainable variable is the input image, so minimizing the loss
# changes the image and not the network.
optimizer = tf.train.AdamOptimizer(1).minimize(total_loss)

# Release the session used for feature extraction before opening a new one;
# leaving it open keeps its resources allocated next to the new session.
sess.close()
sess = tf.InteractiveSession()
# Run the initializer again so the variables created by Adam are initialized
# too, then load the start image into the input variable.
sess.run(tf.global_variables_initializer())
sess.run(model['in'].assign(input_image))
# Keeps the notebook from echoing the result of the assignment above.
pass

## Training the Network

It takes about 100 iterations until the created image looks somewhat like a merge of the content image with the desired style. Then, the output slowly becomes more visually appealing. I stopped training before the loss had converged, as the outputs mostly looked neat enough much earlier and the training would otherwise take quite long. I ran the training steps on a Tesla K80 GPU for about 15-20 minutes for most content/style combinations until I was satisfied with the results.

In [None]:
for i in range(1000):
    # Show the current state of the image every 50 iterations.
    if i % 50 == 0:
        m_in = sess.run(model['in'])
        # unprocess_image adds the channel means back itself, so only the
        # image is passed.
        plt.imshow(unprocess_image(m_in).astype("uint8"))
        plt.show()

    # One optimization step on the input image; `ls` is the total loss.
    _, ls = sess.run([optimizer, total_loss])

    if i % 10 == 0:
        print(i, ls)