# Lab13-2 Image Captioning
110065508 李丞恩

In [1]:
# HYPER_PARAMETERS
# NOTE(review): every name in this cell is assigned again in the later
# HYPER_PARAMETERS cell (which uses BATCH_SIZE = 100), so the values here are
# overridden before the dataset and the models are built.

# Image size, passed to tf.image.resize as (IMG_SIZE_W, IMG_SIZE_H).
IMG_SIZE_W = 224
IMG_SIZE_H = 448

BATCH_SIZE = 50
BUFFER_SIZE = 5000  # shuffle buffer size

embedding_dim = 128
units = 64

vocab_size = 29 # 26 English letters + <start>, <end>, <pad>
max_length = 7

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import re
import numpy as np
import os
import time
import json
import pickle
import tensorflow as tf
import matplotlib.pyplot as plt
from glob import glob
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tqdm.notebook import tqdm
from tensorflow.keras.layers import Conv2D, MaxPool2D, Reshape, Dense

In [2]:
# List the GPUs TensorFlow can see; nothing is configured when there are none.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Make only the first GPU in the list (index 0) visible to TensorFlow
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


# 讀取spec
---
- 讀取圖片的Caption
- 讀取圖片的路徑

In [3]:
# Read the spec file: every line holds "<image name> <caption>".
captions = []
img_path = []

# The context manager closes the file even when a malformed line raises.
with open("./spec_train_val.txt", "r") as f:
    for line in f:
        # split() without an argument also drops a trailing "\r" (CRLF files)
        # and tolerates repeated spaces between the two fields.
        ss = line.split()
        if len(ss) < 2:
            # Blank or incomplete line: skip it instead of raising IndexError.
            continue
        img_path.append("words_captcha" + "/" + ss[0] + ".png")
        captions.append(ss[1])

# 分割dataset為100000-20000

In [4]:
# Hold out 20000 samples for validation and train on the rest; the fixed
# random_state keeps the split reproducible between runs.
(img_path_train,
 img_path_val,
 captions_train,
 captions_val) = train_test_split(img_path, captions, test_size=20000, random_state=0)

# Show the size of each split as the cell output.
tuple(len(part) for part in (img_path_train, img_path_val, captions_train, captions_val))

(100000, 20000, 100000, 20000)

In [5]:
# HYPER_PARAMETERS
# Image size, passed to tf.image.resize as (IMG_SIZE_W, IMG_SIZE_H).
# NOTE(review): tf.image.resize documents its size argument as
# (new_height, new_width), so IMG_SIZE_W ends up being the image height and
# IMG_SIZE_H the width; the encoder's "(batch_size, 224, 448, 3)" comment
# agrees with that order.
IMG_SIZE_W = 224
IMG_SIZE_H = 448

BATCH_SIZE = 100
BUFFER_SIZE = 5000  # shuffle buffer size used when building the dataset

embedding_dim = 128
units = 64

vocab_size = 29 # 26 English letters + <start>, <end>, <pad>
max_length = 7

# Number of full batches per epoch; the training loop divides the summed
# batch losses by this value.
num_steps = len(img_path_train) // BATCH_SIZE

# define map_func
---
- 把剛剛讀的caption換成數字，a = 1，b = 2，依此類推。
- pad   = 0
- start = 27
- end   = 28

In [6]:
# Load one (image, caption) training example
def map_func(img_path, captions):
    """Load one image and encode its caption as a fixed-length id sequence.

    Meant to be wrapped in ``tf.numpy_function``, which hands both arguments
    over as ``bytes``; plain ``str`` captions are accepted as well.

    Args:
        img_path: Path of the PNG file to load.
        captions: The word shown in the image, lowercase ``a``-``z``
            (``bytes`` or ``str``).

    Returns:
        ``(img, cap)``. ``img`` is the float32 image scaled to [0, 1] and
        resized with size ``(IMG_SIZE_W, IMG_SIZE_H)``. ``cap`` is an int64
        array of length ``max_length``: ``<start>``, the letter ids
        (a = 1 ... z = 26), ``<end>``, then ``<pad>`` up to the fixed length.
        A caption that fills every letter slot gets no ``<end>`` token, and a
        longer one is truncated.
    """
    # Token ids outside the 1..26 letter range.
    pad_id, start_id, end_id = 0, 27, 28

    # load image
    img = tf.io.read_file(img_path)
    img = tf.image.decode_png(img, channels=3)
    img = tf.dtypes.cast(img, tf.float32)
    img = img / 255.0
    img = tf.image.resize(img, (IMG_SIZE_W, IMG_SIZE_H))

    # process caption
    if isinstance(captions, str):
        # Iterating bytes yields the integer code of every character.
        captions = captions.encode("ascii")

    # Slots after <start>; tied to max_length instead of a hard-coded 6.
    body_len = max_length - 1
    # ord('a') == 97, so subtracting 96 maps a..z onto 1..26.
    body = [code - 96 for code in captions[:body_len]]
    if len(body) < body_len:
        body.append(end_id)
    body.extend([pad_id] * (body_len - len(body)))

    # An explicit int64 array matches the tf.int64 output declared for
    # tf.numpy_function.
    cap = np.asarray([start_id] + body, dtype=np.int64)

    return img, cap

In [7]:
dataset = tf.data.Dataset.from_tensor_slices((img_path_train, captions_train))

# Shuffle while the elements are still (path, caption) strings. Shuffling after
# the map would hold BUFFER_SIZE decoded float32 images in the shuffle buffer
# (5000 * 224 * 448 * 3 * 4 bytes, roughly 6 GB with the current settings).
dataset = dataset.shuffle(BUFFER_SIZE)

# Use map to load and decode the images in parallel
dataset = dataset.map(lambda item1, item2: tf.numpy_function(
          map_func, [item1, item2], [tf.float32, tf.int64]),
          num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Batch, then prefetch so that loading overlaps with training
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [8]:
class BahdanauAttention(tf.keras.Model):
    """Additive attention over the locations of the encoder output."""

    def __init__(self, units):
        """Create the three projection layers.

        Args:
            units: Width of the hidden projection used to score locations.
        """
        super().__init__()
        # W1 projects the encoder features, W2 the decoder state, and V
        # collapses every scored location to a single value.
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Weight the encoder features by their relevance to ``hidden``.

        Args:
            features: Encoder output; the softmax and the sum below run over
                axis 1, so this is expected to be
                (batch_size, locations, feature_dim).
            hidden: Decoder state, expected to be (batch_size, hidden_size).

        Returns:
            ``(context_vector, attention_weights)``: the weighted sum of
            ``features`` over axis 1, and the weights themselves with a
            trailing axis of size 1.
        """
        # Insert an axis at position 1 so the state broadcasts over locations.
        query = tf.expand_dims(hidden, 1)

        # Additive scoring: tanh(W1 * features + W2 * state).
        projected = self.W1(features) + self.W2(query)
        energy = tf.nn.tanh(projected)

        # V yields one score per location; normalise them across locations.
        attention_weights = tf.nn.softmax(self.V(energy), axis=1)

        # Weighted sum over the location axis.
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)

        return context_vector, attention_weights

In [9]:
class CNN_Encoder(tf.keras.Model):
    """Convolutional image encoder.

    Five conv/max-pool stages shrink the image spatially, the resulting grid
    is flattened into a sequence of locations, and two Dense layers map every
    location to an ``embedding_dim``-sized feature vector.
    """
    # The shape comments below assume a (batch_size, 224, 448, 3) input.
    def __init__(self, embedding_dim):
        """Create the layers.

        Args:
            embedding_dim: Size of the feature vector produced per location.
        """
        super(CNN_Encoder, self).__init__()

        # input = (batch_size, 224, 448, 3)

        self.conv1_1 = Conv2D(filters=32, kernel_size=(3, 3), padding='same', activation='relu')
        self.conv1_2 = Conv2D(filters=32, kernel_size=(3, 3), padding='same', activation='relu')
        self.pool1   = MaxPool2D(pool_size=(2, 2), strides=(2, 2))
        # pool1 = (batch_size, 112, 224, 32)

        self.conv2_1 = Conv2D(filters=64, kernel_size=(3, 3), padding='same', activation='relu')
        self.conv2_2 = Conv2D(filters=64, kernel_size=(3, 3), padding='same', activation='relu')
        self.pool2   = MaxPool2D(pool_size=(2, 2), strides=(2, 2))
        # pool2 = (batch_size, 56, 112, 64)

        self.conv3_1 = Conv2D(filters=128, kernel_size=(3, 3), padding='same', activation='relu')
        self.conv3_2 = Conv2D(filters=128, kernel_size=(3, 3), padding='same', activation='relu')
        self.pool3   = MaxPool2D(pool_size=(2, 2), strides=(2, 2))
        # pool3 = (batch_size, 28, 56, 128)

        self.conv4_1 = Conv2D(filters=256, kernel_size=(3, 3), padding='same', activation='relu')
        self.conv4_2 = Conv2D(filters=256, kernel_size=(3, 3), padding='same', activation='relu')
        self.pool4   = MaxPool2D(pool_size=(2, 2), strides=(2, 2))
        # pool4 = (batch_size, 14, 28, 256)

        # The last stage has a single convolution and keeps 256 channels.
        self.conv5_1 = Conv2D(filters=256, kernel_size=(3, 3), padding='same', activation='relu')
        self.pool5   = MaxPool2D(pool_size=(2, 2), strides=(2, 2))
        # pool5 = (batch_size, 7, 14, 256)

        # Flatten the 7 x 14 grid into 98 locations of 256 channels each.
        self.R = Reshape((-1, 256))
        # reshape = (batch_size, 98, 256)

        # Dense layers act on the last axis, i.e. on every location separately.
        self.E_1 = Dense(192          , activation='relu')
        self.E_2 = Dense(embedding_dim, activation='relu')
        # embedding = (batch_size, 98, embedding_dim)

    def call(self, x):
        """Encode a batch of images.

        Args:
            x: Image batch; the layer comments in ``__init__`` assume
                (batch_size, 224, 448, 3).

        Returns:
            Per-location features, (batch_size, 98, embedding_dim) for that
            input size.
        """
        # Stage 1
        x = self.conv1_1(x)
        x = self.conv1_2(x)
        x = self.pool1(x)

        # Stage 2
        x = self.conv2_1(x)
        x = self.conv2_2(x)
        x = self.pool2(x)

        # Stage 3
        x = self.conv3_1(x)
        x = self.conv3_2(x)
        x = self.pool3(x)

        # Stage 4
        x = self.conv4_1(x)
        x = self.conv4_2(x)
        x = self.pool4(x)

        # Stage 5
        x = self.conv5_1(x)
        x = self.pool5(x)

        # Spatial grid -> sequence of locations
        x = self.R(x)

        # Per-location embedding
        x = self.E_1(x)
        x = self.E_2(x)

        return x

In [10]:
class RNN_Decoder(tf.keras.Model):
    """Attention-based GRU decoder that predicts one token per call."""

    def __init__(self, embedding_dim, units, vocab_size):
        """Create the decoder layers.

        Args:
            embedding_dim: Size of the token embedding.
            units: Width of the GRU, of ``fc1`` and of the attention scorer.
            vocab_size: Number of token ids; also the size of the output.
        """
        super().__init__()
        self.units = units

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc1 = tf.keras.layers.Dense(self.units)
        self.fc2 = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.units)

    def call(self, x, features, hidden):
        """Run one decoding step.

        Args:
            x: Input token ids; train_step and evaluate pass (batch_size, 1).
            features: Encoder output that the attention layer attends over.
            hidden: State used as the attention query.

        Returns:
            ``(logits, state, attention_weights)``: ``logits`` is reshaped to
            (batch_size * steps, vocab_size), ``state`` is the final GRU state,
            and ``attention_weights`` comes from the attention layer.
        """
        # Attention lives in its own model.
        context_vector, attention_weights = self.attention(features, hidden)

        # Token ids -> embedding vectors (one extra trailing axis).
        token_embedding = self.embedding(x)

        # Put the context vector in front of the embedding on the last axis.
        gru_input = tf.concat(
            [tf.expand_dims(context_vector, 1), token_embedding], axis=-1)

        # NOTE(review): the GRU is called without initial_state, so `hidden`
        # only influences the step through the attention query - confirm that
        # this is intended.
        output, state = self.gru(gru_input)

        # Project the GRU output, then merge the batch and step axes.
        projected = self.fc1(output)
        flattened = tf.reshape(projected, (-1, projected.shape[2]))

        # One row of vocabulary logits per (batch, step) pair.
        logits = self.fc2(flattened)

        return logits, state, attention_weights

    def reset_state(self, batch_size):
        """Return an all-zero state of shape (batch_size, units)."""
        state_shape = (batch_size, self.units)
        return tf.zeros(state_shape)

In [11]:
# Build the encoder/decoder pair from the hyperparameters defined above.
encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

# Adam with its default settings.
optimizer = tf.keras.optimizers.Adam()

# from_logits=True: the loss is fed the raw output of the decoder's final
# Dense layer. reduction='none' keeps one loss value per example so that
# loss_function can zero out padded positions before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [12]:
def loss_function(real, pred):
    """Masked sparse cross-entropy for one decoding step.

    Args:
        real: Target token ids; id 0 marks padding.
        pred: Logits predicted for those targets.

    Returns:
        The mean of the per-example losses after padded positions have been
        set to zero. Padded positions still count in the mean's denominator.
    """
    per_example = loss_object(real, pred)

    # 1.0 where the target is a real token, 0.0 where it is padding.
    is_token = tf.math.logical_not(tf.math.equal(real, 0))
    weights = tf.cast(is_token, dtype=per_example.dtype)

    return tf.reduce_mean(per_example * weights)

In [13]:
checkpoint_path = "./checkpoints/train"

# Track the encoder, the decoder and the optimizer state together.
ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder, optimizer = optimizer)

# Keep only the five most recent checkpoints on disk.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

start_epoch = 0
if ckpt_manager.latest_checkpoint:
    # The numeric suffix of the checkpoint name is used as the epoch to resume
    # from; this assumes the number written by the training loop matches the
    # number of completed epochs.
    start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
    # Load the saved weights and optimizer state. Without this call a resumed
    # run would skip epochs while training freshly initialised models.
    ckpt.restore(ckpt_manager.latest_checkpoint)

In [14]:
@tf.function
def train_step(img, target):
    """Run one teacher-forced optimisation step on a batch.

    Args:
        img: Batch of images fed to the encoder.
        target: Batch of caption id sequences; column 0 holds the <start>
            token and columns 1.. are the tokens to predict.

    Returns:
        ``(loss, total_loss)``: the sum of the per-step losses, and that sum
        divided by ``target.shape[1]``.
    """
    loss = 0

    # Size everything from the batch actually received, so a batch whose size
    # differs from BATCH_SIZE (e.g. a final partial batch) still works.
    batch_size = target.shape[0]

    # initializing the hidden state for each batch
    # because the captions are not related from image to image
    hidden = decoder.reset_state(batch_size=batch_size)

    # Every sequence starts with the <start> token (id 27).
    dec_input = tf.expand_dims([27] * batch_size, 1)

    with tf.GradientTape() as tape:

        features = encoder(img)

        for i in range(1, target.shape[1]):
            # passing the features through the decoder
            predictions, hidden, _ = decoder(dec_input, features, hidden)

            loss += loss_function(target[:, i], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(target[:, i], 1)

    # Average cross-entropy per caption position. The divisor is the full
    # sequence length, which includes the <start> position.
    total_loss = (loss / int(target.shape[1]))

    trainable_variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, trainable_variables)

    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return loss, total_loss

# 開始train

---
- 燒顯卡的時候到了

In [None]:
# Mean loss of every epoch trained in this run, for the plot below.
loss_plot = []

EPOCHS = 50

for epoch in range(start_epoch, EPOCHS):
    start = time.time()
    total_loss = 0

    for img, target in dataset:
        batch_loss, t_loss = train_step(img, target)
        total_loss += t_loss

    loss_plot.append(total_loss / num_steps)

    # Save every fifth epoch and after the last one, so the final weights are
    # never lost. The checkpoint is numbered with the count of completed
    # epochs: the resume logic parses that number as the epoch to continue
    # from, whereas the default numbering counts saves (1, 2, 3, ...).
    if epoch % 5 == 0 or epoch == EPOCHS - 1:
        ckpt_manager.save(checkpoint_number=epoch + 1)

    print ('Epoch {} Loss {:.6f}'.format(epoch + 1, total_loss/num_steps))
    print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Loss 1.723897
Time taken for 1 epoch 693.6380391120911 sec

Epoch 2 Loss 1.452572
Time taken for 1 epoch 615.5227978229523 sec

Epoch 3 Loss 0.343950
Time taken for 1 epoch 618.6903860569 sec

Epoch 4 Loss 0.120944
Time taken for 1 epoch 604.7163732051849 sec

Epoch 5 Loss 0.040951
Time taken for 1 epoch 606.2124140262604 sec

Epoch 6 Loss 0.018616
Time taken for 1 epoch 630.2027697563171 sec

Epoch 7 Loss 0.016430
Time taken for 1 epoch 588.1956033706665 sec

Epoch 8 Loss 0.013663
Time taken for 1 epoch 585.7850868701935 sec

Epoch 9 Loss 0.015161
Time taken for 1 epoch 589.6416227817535 sec

Epoch 10 Loss 0.012006
Time taken for 1 epoch 619.8796455860138 sec

Epoch 11 Loss 0.008165
Time taken for 1 epoch 619.4291632175446 sec

Epoch 12 Loss 0.010550
Time taken for 1 epoch 620.2097067832947 sec

Epoch 13 Loss 0.009158
Time taken for 1 epoch 616.1064252853394 sec

Epoch 14 Loss 0.007547
Time taken for 1 epoch 616.0037415027618 sec

Epoch 15 Loss 0.016336
Time taken for 1 epoch 

In [None]:
# Plot the mean training loss recorded for every epoch.
fig, ax = plt.subplots()
ax.plot(loss_plot)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Loss Plot')
plt.show()

In [None]:
def evaluate(img_path):
    """Greedily decode the word shown in one image.

    Args:
        img_path: Path of the PNG file to read.

    Returns:
        The predicted lowercase word. Decoding stops at the <end> token
        (id 28) or after ``max_length - 1`` steps; ids outside 1..26 add no
        character.
    """
    # Same preprocessing as for training, plus a leading batch axis of size 1.
    raw = tf.io.read_file(img_path)
    pixels = tf.dtypes.cast(tf.image.decode_png(raw, channels=3), tf.float32)
    pixels = pixels / 255.0
    batch = tf.expand_dims(tf.image.resize(pixels, (IMG_SIZE_W, IMG_SIZE_H)), 0)

    hidden = decoder.reset_state(batch_size = 1)
    features = encoder(batch)

    # Decoding starts from the <start> token (id 27).
    dec_input = tf.expand_dims([27], 0)
    letters = []

    for _step in range(1, max_length):
        predictions, hidden, attention_weights = decoder(dec_input, features, hidden)
        predicted_id = tf.argmax(predictions[0]).numpy()

        if predicted_id == 28:
            break

        if 1 <= predicted_id <= 26:
            # ids 1..26 stand for 'a'..'z' (chr(97) == 'a')
            letters.append(chr(predicted_id + 96))

        # Feed the prediction back in as the next input token.
        dec_input = tf.expand_dims([predicted_id], 0)

    return "".join(letters)

# Testing
---
- Accuracy > 99%

In [None]:
# Count exact matches on the validation split and print every miss as
# "<prediction> <ground truth>".
match = 0
# The loop variables deliberately avoid the name `img_path`: reusing it would
# overwrite the module-level list of all image paths and break a re-run of
# the train/validation split cell.
for val_path, val_caption in zip(img_path_val, captions_val):
    predict = evaluate(val_path)
    if predict == val_caption:
        match = match + 1
    else:
        print(predict, val_caption)

In [None]:
# Fraction of validation captions that were predicted exactly.
print("Accuracy: ", (match / len(captions_val)))

# 輸出檔案

In [None]:
# Write one "<image name> <predicted word>" line for each of the images
# a120000 ... a139999.
with open('Lab13-2_108062648.txt', 'w') as f:
    for i in range(120000, 140000):
        name = 'a{}'.format(i)
        path = './words_captcha/{}.png'.format(name)
        result = evaluate(path)

        f.write('{} {}\n'.format(name, result))