##### Copyright 2018 The TensorFlow Authors.



In [0]:
!pip install tensorflow-gpu==2.0.0-alpha0
import tensorflow as tf

import matplotlib.pyplot as plt

import numpy as np
import os
import time
from PIL import Image
import pandas as pd
from sklearn.utils import shuffle

Collecting tensorflow-gpu==2.0.0-alpha0
[?25l  Downloading https://files.pythonhosted.org/packages/1a/66/32cffad095253219d53f6b6c2a436637bbe45ac4e7be0244557210dc3918/tensorflow_gpu-2.0.0a0-cp36-cp36m-manylinux1_x86_64.whl (332.1MB)
[K     |████████████████████████████████| 332.1MB 31kB/s 
Collecting tb-nightly<1.14.0a20190302,>=1.14.0a20190301 (from tensorflow-gpu==2.0.0-alpha0)
[?25l  Downloading https://files.pythonhosted.org/packages/a9/51/aa1d756644bf4624c03844115e4ac4058eff77acd786b26315f051a4b195/tb_nightly-1.14.0a20190301-py3-none-any.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 42.3MB/s 
Collecting google-pasta>=0.1.2 (from tensorflow-gpu==2.0.0-alpha0)
[?25l  Downloading https://files.pythonhosted.org/packages/f9/68/a14620bfb042691f532dcde8576ff82ee82e4c003cdc0a3dbee5f289cee6/google_pasta-0.1.6-py3-none-any.whl (51kB)
[K     |████████████████████████████████| 61kB 31.7MB/s 
Collecting tf-estimator-nightly<1.14.0.dev2019030116,>=1.14.0.dev2019030115 (from t

In [0]:
!ls

sample_data


### Mount google drive with dataset


In [0]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
drive_path = '/content/gdrive/My Drive/SSL/'
!ls '{drive_path}'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive
glove.840B.300d.zip  models  train.csv	train.zip  val.csv  val.zip


In [0]:
drive_models_dir = drive_path + 'models/transformer_victor'
os.makedirs(drive_models_dir, exist_ok=True)

In [0]:
import shutil
def copy_file_to_drive(path):
  shutil.copy(path, Path(drive_models_dir)/ Path(path).name)  

### Paths to important files and directories

In [0]:
drive_path_to_train_csv = 'train.csv'
drive_path_to_val_csv = 'val.csv'
drive_path_to_train_zip = 'train.zip'
drive_path_to_val_zip = 'val.zip'

train_csv_filepath = drive_path + drive_path_to_train_csv
val_csv_filepath = drive_path + drive_path_to_val_csv
train_zip_filepath = drive_path + drive_path_to_train_zip
val_zip_filepath = drive_path + drive_path_to_val_zip
train_set_dir = './train/'
val_set_dir = './val/'

train_cache = './train_cache/'
val_cache = './val_cache/'

!ls

gdrive	sample_data


### Unzip the datasets if they have not been unzipped

In [0]:
if not os.path.exists(val_set_dir):
  !time  unzip -q '{val_zip_filepath}' 


real	0m6.148s
user	0m2.220s
sys	0m1.211s


In [0]:
if not os.path.exists(train_set_dir):
  !time  unzip -q '{train_zip_filepath}'


real	3m38.038s
user	1m14.126s
sys	0m29.605s


### Load csv files with with image ids and captions. Limit the dataset for faster training when testing.

In [0]:
num_train_examples = 200000
train_df = pd.read_csv(train_csv_filepath)[:num_train_examples]
val_df = pd.read_csv(val_csv_filepath)[:128]

train_df[:2]

Unnamed: 0,image_id,caption,url
0,0,a very typical bus station,http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/...
1,1,sierra looked stunning in this top and this sk...,http://78.media.tumblr.com/3b133294bdc7c7784b7...


### Determine paths for images and add start/end tokens to captions for both datasets. Shuffle training set.

In [0]:
def collect_paths_captions(df, set_dir):
  all_captions = []
  all_img_paths = []
  for index, row in df.iterrows():
    caption = '<start> ' + row['caption'] + ' <end>'
    image_id = row['image_id']
    full_image_path = set_dir + str(image_id) + '.jpg'
    all_img_paths.append(full_image_path)
    all_captions.append(caption)
  return all_captions, all_img_paths

train_captions, train_paths = collect_paths_captions(train_df, train_set_dir)
val_captions, val_paths = collect_paths_captions(val_df, val_set_dir)

# shuffle train data
train_captions, train_paths = shuffle(train_captions, train_paths, 
                                      random_state=1)

print(train_captions[:3], len(train_captions))

['<start> funny greeting card with smiling snail . <end>', '<start> photo of happy couple sitting on a branch taken by person <end>', '<start> wine glass at a vineyard <end>'] 200000


## Preprocess and tokenize the captions

* First, we'll tokenize the captions (e.g., by splitting on spaces). This will give us a  vocabulary of all the unique words in the data (e.g., "surfing", "football", etc).
* Next, we'll limit the vocabulary size to the words which appear at least K times. All other words will be replaced with the token "UNK" (for unknown).
* Finally, we create a word --> index mapping and vice-versa.
* We will then pad all sequences to the be same length as the longest one.



In [0]:
min_frequency = 5

tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="<unk>",
                                                  filters='!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
tokenizer.fit_on_texts(train_captions)
top = len([1 for word, count in tokenizer.word_counts.items() if count >= min_frequency]) + 1 # add 1 for <unk>
tokenizer.num_words = top

In [0]:
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

In [0]:
vocabulary_size = top + 1  # added 1 for <pad>

In [0]:
vocabulary_size

12318

In [0]:
# creating the tokenized vectors
train_seqs = tokenizer.texts_to_sequences(train_captions)
val_seqs = tokenizer.texts_to_sequences(val_captions)

In [0]:
# padding each vector to the max_length of the captions
# if the max_length parameter is not provided, pad_sequences calculates that automatically
train_cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')
val_cap_vector = tf.keras.preprocessing.sequence.pad_sequences(val_seqs, padding='post')

## Preprocess the images using InceptionV3
Next, we will use InceptionV3 (pretrained on Imagenet) to classify each image. We will extract features from the last convolutional layer.

First, we will need to convert the images into the format inceptionV3 expects by:
* Resizing the image to (299, 299)
* Using the [preprocess_input](https://www.tensorflow.org/api_docs/python/tf/keras/applications/inception_v3/preprocess_input) method to place the pixels in the range of -1 to 1 (to match the format of the images used to train InceptionV3).

In [0]:
IMAGE_SIZE = (299, 299)

def load_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, IMAGE_SIZE)
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img, image_path

## Initialize InceptionV3 and load the pretrained Imagenet weights

To do so, we'll create a tf.keras model where the output layer is the last convolutional layer in the InceptionV3 architecture.
* Each image is forwarded through the network and the vector that we get at the end is stored in a dictionary (image_name --> feature_vector).
* We use the last convolutional layer because we are using attention in this example. The shape of the output of this layer is ```8x8x2048```.
* We avoid doing this during training so it does not become a bottleneck.
* After all the images are passed through the network, we pickle the dictionary and save it to disk.

In [0]:
image_model = tf.keras.applications.InceptionV3(include_top=False,
                                              weights='imagenet')
new_input = image_model.input
hidden_layer = image_model.layers[-1].output

image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5


## Caching the features extracted from InceptionV3

We will pre-process each image with InceptionV3 and cache the output to disk. Caching the output in RAM would be faster but memory intensive, requiring 8 \* 8 \* 2048 floats per image. 

Performance could be improved with a more sophisticated caching strategy (e.g., by sharding the images to reduce random access disk I/O) at the cost of more code.

### Caching option 1: using TFrecords: cache multiple image feature maps into the same file. 
Advantages to numpy save: 
1.   Less random disk access when reading -> faster data loading -> faster training
2.   Can compress files

First setup the cache folder:



In [0]:
def _bytes_feature(value):
  """Returns a bytes_list from a string / byte."""
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def serialize_row(feature_map, caption):
  feature = {
# seriaizing as strings is much faster then floats/ints
#       'feature_map': tf.train.Feature(float_list=tf.train.FloatList(value=feature_map.numpy().reshape(-1))),
#       'caption_vector': tf.train.Feature(int64_list=tf.train.Int64List(value=caption.numpy())),
      'feature_map': _bytes_feature(tf.io.serialize_tensor(feature_map).numpy()),
      'caption_vector': _bytes_feature(tf.io.serialize_tensor(caption).numpy())
  }
    
  example = tf.train.Example(features=tf.train.Features(feature=feature))
  return example

# set compression
# compression = tf.io.TFRecordCompressionType.NONE
# options = tf.io.TFRecordOptions(compression)
def create_fragment_record(features, captions, tfrecord_path):
  with tf.io.TFRecordWriter(tfrecord_path) as writer:
    for feature, caption in zip(features, captions):
      tf_example = serialize_row(feature, caption)
      writer.write(tf_example.SerializeToString())
    
    
def cache_features(img_paths, captions_vector, cache_folder, batch_size=64, batches_per_file=4):
  start = time.time()
  
  image_ds = tf.data.Dataset.from_tensor_slices(img_paths)
  captions_ds = tf.data.Dataset.from_tensor_slices(captions_vector)

  image_ds = image_ds.map(lambda path: load_image(path)[0], 
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
  ds = tf.data.Dataset.zip((image_ds, captions_ds))
  ds = ds.batch(batch_size=batch_size, drop_remainder=True)
  
  features = []
  captions = []
  fragment_index = 0
  
  for batch_idx, (image_batch, captions_batch) in enumerate(ds):
    features_batch = image_features_extract_model(image_batch)
    features_batch = tf.reshape(features_batch, (features_batch.shape[0], -1, features_batch.shape[3]))
    
    for feature_map, caption in zip(features_batch, captions_batch):
      features.append(feature_map)
      captions.append(caption)
      
    if (batch_idx + 1) % batches_per_file == 0:
      # write to new file
      cache_file = cache_folder + str(fragment_index) + ".tfrecords"
      create_fragment_record(features, captions, cache_file)
      features = []
      captions = []
      fragment_index += 1
      
  if features:
      cache_file = cache_folder + str(fragment_index) + ".tfrecords"
      create_fragment_record(features, captions, cache_file)
      
  print(f'Caching took: {time.time() - start:.4f} seconds')

In [0]:
if not os.path.exists(val_cache):
  os.makedirs(val_cache, exist_ok=True)
  cache_features(val_paths, val_cap_vector, val_cache)

Caching took: 7.4591 seconds


In [0]:
if not os.path.exists(train_cache):
  os.makedirs(train_cache, exist_ok=True)
  cache_features(train_paths, train_cap_vector, train_cache)

Caching took: 2130.7068 seconds


### Caching option 2: using numpy save: cache each feature map in separate file.

In [0]:
# def cache_feature_maps(image_dataset):
#   start = time.time()
#   for img, path in image_dataset:
#       batch_features = image_features_extract_model(img)
#       batch_features = tf.reshape(batch_features,
#                                 (batch_features.shape[0], -1, batch_features.shape[3]))

#       for bf, p in zip(batch_features, path):
#           path_of_feature = p.numpy().decode("utf-8")
#           np.save(path_of_feature, bf.numpy())
#   print('Caching took: ', time.time() - start, 'seconds')

In [0]:
# image_ds = tf.data.Dataset.from_tensor_slices(train_paths + val_paths)
# image_ds = image_ds.map(
#   load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(64)
# print('Train + validation set caching...')
# cache_feature_maps(image_ds)

### Compute training sequence max length. Used only at evaluation.

In [0]:
# This will find the maximum length of any caption in our dataset
def calc_max_length(tensor):
    return max(len(t) for t in tensor)

In [0]:
# calculating the max_length
# used to store the attention weights
max_length = calc_max_length(train_seqs)

## Split the data into training and testing (no need, already have a split in the csv files)


# Create training and validation sets using 80-20 split
img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector,
                                                                    cap_vector,
                                                                    test_size=0.2,
                                                                    random_state=0)

## Our images and captions are ready! Next, let's create a tf.data dataset to use for training our model.



In [0]:
BATCH_SIZE = 256
BUFFER_SIZE = 4000
# embedding_dim = 256
# units = 512
vocab_size = vocabulary_size
# shape of the vector extracted from InceptionV3 is (64, 2048)
# # these two variables represent that
# features_shape = 2048
# attention_features_shape = 64

### 1) Loading dataset cached with option 1.

In [0]:
from pathlib import Path
tfrecords = [str(path) for path in Path(train_cache).glob("*.tfrecords")]
raw_image_dataset = tf.data.Dataset.from_tensor_slices(tfrecords)
raw_image_dataset = raw_image_dataset.shuffle(len(tfrecords))
raw_image_dataset = raw_image_dataset.apply(
    tf.data.experimental.parallel_interleave(
        lambda filename: tf.data.TFRecordDataset(filename, compression_type=''),
        cycle_length=4,
        sloppy=True,
        buffer_output_elements=4, 
        prefetch_input_elements=4,

    ))
  
# raw_image_dataset = tf.data.TFRecordDataset(tfrecords, compression_type='')
# raw_image_dataset = raw_image_dataset.shuffle(len(tfrecords))

In [0]:
# Create a dictionary describing the features.  
image_feature_description = {
    'feature_map': tf.io.FixedLenFeature([], tf.string),
    'caption_vector': tf.io.FixedLenFeature([], tf.string)
}


def _parse_image_function(example_proto):
  # Parse the input tf.Example proto using the dictionary above.
  dict_features = tf.io.parse_single_example(example_proto, image_feature_description)
  
  feature_map = tf.io.parse_tensor(dict_features['feature_map'], out_type=tf.float32)
  caption = tf.io.parse_tensor(dict_features['caption_vector'], out_type=tf.int32)

  return feature_map, caption

dataset = raw_image_dataset.map(_parse_image_function)
dataset

for i, record in enumerate(dataset.take(3)):
  print(i, repr(record[0].shape), repr(record[1].shape))

0 TensorShape([64, 2048]) TensorShape([59])
1 TensorShape([64, 2048]) TensorShape([59])
2 TensorShape([64, 2048]) TensorShape([59])


### 2) Loading dataset cached with option 2.

def map_func(img_path, cap):
    img_tensor = np.load(img_path.decode('utf-8') + '.npy')
    return img_tensor, cap

dataset = tf.data.Dataset.from_tensor_slices((train_paths, train_cap_vector))

dataset = dataset.map(lambda item1, item2: tf.numpy_function(
          map_func, [item1, item2], [tf.float32, tf.int32]),
          num_parallel_calls=tf.data.experimental.AUTOTUNE)

### Shuffle, batch dataset

In [0]:
# shuffling and batching
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

for i, record in enumerate(dataset.take(3)):
  print(i, repr(record[0].shape), repr(record[1].shape))

0 TensorShape([256, 64, 2048]) TensorShape([256, 59])
1 TensorShape([256, 64, 2048]) TensorShape([256, 59])
2 TensorShape([256, 64, 2048]) TensorShape([256, 59])


## Model
https://www.tensorflow.org/alpha/tutorials/text/transformer

In [0]:
def get_angles(pos, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
  return pos * angle_rates

In [0]:
def positional_encoding(position, d_model):
  angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)
  
  # apply sin to even indices in the array; 2i
  sines = np.sin(angle_rads[:, 0::2])
  
  # apply cos to odd indices in the array; 2i+1
  cosines = np.cos(angle_rads[:, 1::2])
  
  pos_encoding = np.concatenate([sines, cosines], axis=-1)
  
  pos_encoding = pos_encoding[np.newaxis, ...]
    
  return tf.cast(pos_encoding, dtype=tf.float32)

In [0]:
def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  
  # add extra dimensions so that we can add the padding
  # to the attention logits.
  return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [0]:
def create_look_ahead_mask(size):
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return mask  # (seq_len, seq_len)

In [0]:
def scaled_dot_product_attention(q, k, v, mask):
  """Calculate the attention weights.
  q, k, v must have matching leading dimensions.
  The mask has different shapes depending on its type(padding or look ahead) 
  but it must be broadcastable for addition.
  
  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth)
    mask: Float tensor with shape broadcastable 
          to (..., seq_len_q, seq_len_k). Defaults to None.
    
  Returns:
    output, attention_weights
  """

  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
  
  # scale matmul_qk
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # add the mask to the scaled tensor.
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)

  # softmax is normalized on the last axis (seq_len_k) so that the scores
  # add up to 1.
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_v, depth)

  return output, attention_weights

In [0]:
# class LayerNorm(tf.keras.layers.Layer):
#     "Construct a layernorm module (See citation for details)."
#     def __init__(self, features, epsilon=1e-6):
#         super(LayerNorm, self).__init__()
#         self.a_2 = tf.Variable(initial_value=tf.zeros_initializer()(
#                                shape=(features,), dtype='float32'), trainable=True) + 1
#         self.b_2 = tf.Variable(initial_value=tf.zeros_initializer()(
#                                shape=(features,), dtype='float32'), trainable=True)
#         self.eps = epsilon

#     def forward(self, x):
#         mean = tf.math.reduce_mean(x, axis=-1, keep_dims=True)
#         std = tf.math.reduce_std(x, axis=-1, keep_dims=True)
#         return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [0]:
class MultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    
    assert d_model % self.num_heads == 0
    
    self.depth = d_model // self.num_heads
    
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)
    
    self.dense = tf.keras.layers.Dense(d_model)
        
  def split_heads(self, x, batch_size):
    """Split the last dimension into (num_heads, depth).
    Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])
    
  def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]
    
    q = self.wq(q)  # (batch_size, seq_len, d_model)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    v = self.wv(v)  # (batch_size, seq_len, d_model)
    
    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
    
    # scaled_attention.shape == (batch_size, num_heads, seq_len_v, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    scaled_attention, attention_weights = scaled_dot_product_attention(
        q, k, v, mask)
    
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_v, num_heads, depth)

    concat_attention = tf.reshape(scaled_attention, 
                                  (batch_size, -1, self.d_model))  # (batch_size, seq_len_v, d_model)

    output = self.dense(concat_attention)  # (batch_size, seq_len_v, d_model)
        
    return output, attention_weights

In [0]:
def point_wise_feed_forward_network(d_model, dff):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  ])

In [0]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.experimental.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.experimental.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    
  def call(self, x, training, mask):

    attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
    
    ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
    
    return out2

In [0]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)
 
    self.layernorm1 = tf.keras.layers.experimental.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.experimental.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.experimental.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
    
    
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):
    # enc_output.shape == (batch_size, input_seq_len, d_model)

    attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
    attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + x)
    
    attn2, attn_weights_block2 = self.mha2(
        enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
    
    ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
    
    return out3, attn_weights_block1, attn_weights_block2

In [0]:
class CNN_Encoder(tf.keras.Model):
    # Since we have already extracted the features and dumped it using pickle
    # This encoder passes those features through a Fully connected layer
    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        # shape after fc == (batch_size, 64, embedding_dim)
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        x = self.fc(x)
        x = tf.nn.relu(x)
        return x

In [0]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, max_len, 
               rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    self.embedding = CNN_Encoder(d_model)
    self.pos_encoding = positional_encoding(max_len, self.d_model)
    
    
    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
  
    self.dropout = tf.keras.layers.Dropout(rate)
        
  def call(self, x, training, mask):
    
    seq_len = tf.shape(x)[1]

    # adding embedding and position encoding.
    x = self.embedding(x)  # (batch_size, input_seq_len, d_model)

    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))

    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)

    return x  # (batch_size, input_seq_len, d_model)

In [0]:
sample_encoder = Encoder(num_layers=6, d_model=512, num_heads=8, 
                         dff=2048, max_len=64)

sample_encoder_output = sample_encoder(tf.random.uniform((64, 64, 2048)), 
                                       training=False, mask=None)

print (sample_encoder_output.shape) 

(64, 64, 512)


In [0]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, 
               rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = positional_encoding(target_vocab_size, self.d_model)
    
    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)
    
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):

    seq_len = tf.shape(x)[1]
    
    attention_weights = {}
    
    x = self.embedding(x)  # (batch_size, target_seq_len, d_model)

    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))

    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                             look_ahead_mask, padding_mask)  
      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
    
    # x.shape == (batch_size, target_seq_len, d_model)
    return x, attention_weights

In [0]:
sample_decoder = Decoder(num_layers=6, d_model=512, num_heads=8, 
                         dff=2048, target_vocab_size=8000)

output, attn = sample_decoder(tf.random.uniform((64, 26)), 
                              enc_output=sample_encoder_output, 
                              training=False, look_ahead_mask=None, 
                              padding_mask=None)

output.shape, attn['decoder_layer2_block2'].shape

(TensorShape([64, 26, 512]), TensorShape([64, 8, 26, 64]))

In [0]:
class Transformer(tf.keras.Model):
  def __init__(self, num_layers, d_model, num_heads, dff, max_len, 
               target_vocab_size, rate=0.1):
    super(Transformer, self).__init__()

    self.encoder = Encoder(num_layers, d_model, num_heads, dff, 
                           max_len, rate)

    self.decoder = Decoder(num_layers, d_model, num_heads, dff, 
                           target_vocab_size, rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)
    
  def call(self, inp, tar, training, enc_padding_mask, 
           look_ahead_mask, dec_padding_mask):

    enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
    
    # dec_output.shape == (batch_size, tar_seq_len, d_model)
    dec_output, attention_weights = self.decoder(
        tar, enc_output, training, look_ahead_mask, dec_padding_mask)
    
    final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
    
    return final_output, attention_weights

In [0]:
sample_transformer = Transformer(
    num_layers=10, d_model=512, num_heads=8, dff=2048, 
    max_len=64, target_vocab_size=vocab_size)


temp_input = tf.random.uniform((64, 64, 2048))
temp_target = tf.random.uniform((64, 59))

fn_out, _ = sample_transformer(temp_input, temp_target, training=False, 
                               enc_padding_mask=None, 
                               look_ahead_mask=None,
                               dec_padding_mask=None)
fn_out.shape  # (batch_size, tar_seq_len, target_vocab_size)


TensorShape([64, 59, 12318])

In [0]:
num_layers = 6
d_model = 512
dff = 2048
num_heads = 8

target_vocab_size = vocab_size
dropout_rate = 0.1

In [0]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()
    
    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps
    
  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [0]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

In [0]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [0]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

In [0]:
MAX_LEN_INPUT = 64
transformer = Transformer(num_layers, d_model, num_heads, dff,
                          MAX_LEN_INPUT, target_vocab_size, dropout_rate)

In [0]:
def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask
  
  return tf.reduce_mean(loss_)

In [0]:
@tf.function
def train_step(inp, tar):
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
  
  with tf.GradientTape() as tape:
    predictions, _ = transformer(inp, tar_inp, 
                                 True, 
                                 None, 
                                 combined_mask, 
                                 None)
    loss = loss_function(tar_real, predictions)

  gradients = tape.gradient(loss, transformer.trainable_variables)    
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
  
  train_loss(loss)
  train_accuracy(tar_real, predictions)

In [0]:
# checkpoint_path = "./checkpoints/train/transformer"
checkpoint_path = drive_models_dir
ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

Latest checkpoint restored!!


In [0]:
def create_masks(inp, tar):
  # Encoder padding mask
  enc_padding_mask = create_padding_mask(inp)
  
  # Used in the 2nd attention block in the decoder.
  # This padding mask is used to mask the encoder outputs.
  dec_padding_mask = create_padding_mask(inp)
  
  # Used in the 1st attention block in the decoder.
  # It is used to pad and mask future tokens in the input received by 
  # the decoder.
  look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
  dec_target_padding_mask = create_padding_mask(tar)
  combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
  
  return enc_padding_mask, combined_mask, dec_padding_mask

In [0]:
EPOCHS = 20

In [0]:
start_epoch = 0
if ckpt_manager.latest_checkpoint:
  start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
  
for epoch in range(16, EPOCHS):
  start = time.time()
  
  train_loss.reset_states()
  train_accuracy.reset_states()
  
  for (batch, (img_tensor, tar)) in enumerate(dataset):
    train_step(img_tensor, tar)
    
    if batch % 500 == 0:
      print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
          epoch + 1, batch, train_loss.result(), train_accuracy.result()))
      
  if epoch % 5 == 0:
    ckpt_save_path = ckpt_manager.save(epoch)
    print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                         ckpt_save_path))
    
  print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result()))

  print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 17 Batch 0 Loss 0.0657 Accuracy 0.1633
Epoch 17 Batch 500 Loss 0.0752 Accuracy 0.1628
Epoch 17 Loss 0.0815 Accuracy 0.1615
Time taken for 1 epoch: 1485.6080477237701 secs

Epoch 18 Batch 0 Loss 0.0625 Accuracy 0.1617
Epoch 18 Batch 500 Loss 0.0720 Accuracy 0.1637
Epoch 18 Loss 0.0756 Accuracy 0.1628
Time taken for 1 epoch: 1487.27600979805 secs

Epoch 19 Batch 0 Loss 0.0713 Accuracy 0.1716


In [0]:
!ls ./checkpoints/train/transformer


In [0]:
plt.plot(loss_plot)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss Plot')
plt.show()

## Caption!

* The evaluate function is similar to the training loop, except we don't use teacher forcing here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.
* Stop predicting when the model predicts the end token.
* And store the attention weights for every time step.

In [0]:
def evaluate(image):
    attention_plot = np.zeros((max_length, attention_features_shape))

    hidden = decoder.reset_state(batch_size=1)

    temp_input = tf.expand_dims(load_image(image)[0], 0)
    img_tensor_val = image_features_extract_model(temp_input)
    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))

    features = encoder(img_tensor_val)

    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
    result = []

    for i in range(max_length):
        predictions, hidden, attention_weights = decoder(dec_input, features, hidden)

        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()
        result.append(tokenizer.index_word[predicted_id])

        if tokenizer.index_word[predicted_id] == '<end>':
            return result, attention_plot

        dec_input = tf.expand_dims([predicted_id], 0)

    attention_plot = attention_plot[:len(result), :]
    return result, attention_plot

In [0]:
def plot_attention(image, result, attention_plot):
    temp_image = np.array(Image.open(image))

    fig = plt.figure(figsize=(10, 10))

    len_result = len(result)
    for l in range(len_result):
        temp_att = np.resize(attention_plot[l], (8, 8))
        ax = fig.add_subplot(len_result//2, len_result//2, l+1)
        ax.set_title(result[l])
        img = ax.imshow(temp_image)
        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())

    plt.tight_layout()
    plt.show()

In [0]:
# captions on the validation set
rid = np.random.randint(0, len(val_paths))
print(rid)
image = val_paths[rid]
real_caption = ' '.join([tokenizer.index_word[i] for i in val_cap_vector[rid] if i not in [0]])
result, attention_plot = evaluate(image)

print ('Real Caption:', real_caption)
print ('Prediction Caption:', ' '.join(result))
plot_attention(image, result, attention_plot)
# opening the image
Image.open(val_paths[rid])

## Try it on your own images
For fun, below we've provided a method you can use to caption your own images with the model we've just trained. Keep in mind, it was trained on a relatively small amount of data, and your images may be different from the training data (so be prepared for weird results!)


In [0]:
image_url = 'https://tensorflow.org/images/surf.jpg'
image_extension = image_url[-4:]
image_path = tf.keras.utils.get_file('image'+image_extension,
                                     origin=image_url)

result, attention_plot = evaluate(image_path)
print ('Prediction Caption:', ' '.join(result))
plot_attention(image_path, result, attention_plot)
# opening the image
Image.open(image_path)

# Next steps

Congrats! You've just trained an image captioning model with attention. Next, we recommend taking a look at this example [Neural Machine Translation with Attention](../sequences/nmt_with_attention.ipynb). It uses a similar architecture to translate between Spanish and English sentences. You can also experiment with training the code in this notebook on a different dataset.