# Assignment

## Requirement:

### 1. should design your own model architecture. In other words, do not load the model or any pre-trained weights directly from other sources.
### 2. use the first 100,000 images as training data, the next 20,000 as validation data, and the rest as testing data.
### 3. Only if the whole word matches exactly does it count as correct
### 4. predict the answer to the testing data and write them in a file.
### 5. testing accuracy should be at least 90%.

## Procedure:

### 0. Load Data:
- load and split data into training and validation set

### 1. Preprocessing:
- create dictionary: index2char, char2index.(for annotations <-> indices)
- find maximum length in annotations and convert annotations to index and pad to max_length
- resize images into sizes "based on what feature extractor you use".
- normalize images pixels into -1~1

### 2. Design Feature Extractor:
- 
- 

### 3. Design Encoder and Decoder
- encoder, decoder, attention-based的設計和lab一樣

### 4. Design Model and Training
- 把input images丟到feature extractor得到features 
- 再把features, hidden_states, decoder_input丟到decoder
- loss function, optimizer和lab一樣

### 5. Do validation
- 
- 

### 6. predict on Testing data

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

In [2]:
import tensorflow as tf

# You'll generate plots of attention in order to see which parts of an image
# our model focuses on during captioning
import matplotlib.pyplot as plt

# Scikit-learn includes many helpful utilities
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import re
import numpy as np
import os
import time
import json
from glob import glob
from PIL import Image
import pickle

2021-12-16 01:25:12.128181: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [3]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


2021-12-16 01:25:13.008498: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-12-16 01:25:13.088481: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-16 01:25:13.088967: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 3060 computeCapability: 8.6
coreClock: 1.852GHz coreCount: 28 deviceMemorySize: 11.77GiB deviceMemoryBandwidth: 335.32GiB/s
2021-12-16 01:25:13.088985: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-12-16 01:25:13.090625: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2021-12-16 01:25:13.090654: I tensorflow/stream_executor/pl

In [4]:
BATCH_SIZE = 20 # 40
SHUFFLE_BUFFER_SIZE = 5000
embedding_dim = 256
units = 512
IMG_HEIGHT = 448
IMG_WIDTH = 224
LEARNING_RATE = 5e-5


## 0. Load Data

In [5]:
IMAGE_DIR = './words_captcha/'
annotation_file = './words_captcha/spec_train_val.txt'

with open(annotation_file, 'r') as f:
    lines = f.readlines()
f.close()

train_img_name = []
val_img_name = []
train_annotation = []
val_annotation = []
num = 0

for line in lines:
    line = line.strip('\n')
    line = line.split(' ')
    if num < 100000:
        train_img_name.append(line[0])
        train_annotation.append(line[1])
    else:
        val_img_name.append(line[0])
        val_annotation.append(line[1])
    num+=1

## 1. Preprocessing

### 1-(1): create dictionary: index2char, char2index

In [6]:
char2idx = {}
idx2char = {}

# add token <pad> to dictionary
char2idx['<pad>'] = 0
idx2char[0] = '<pad>'

# only a~z appears in capcha
for i in range(1, 27):
    char2idx[chr(ord('a') + i - 1)] = i
    idx2char[i] = chr(ord('a') + i - 1)
    
# add token <start>, <end> to dictionary
char2idx['<start>'] = 27
idx2char[27] = '<start>'
char2idx['<end>'] = 28
idx2char[28] = '<end>'



In [7]:
print(char2idx)
print(idx2char)

{'<pad>': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '<start>': 27, '<end>': 28}
{0: '<pad>', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 27: '<start>', 28: '<end>'}


### 1-(2): find max_length of annotations and convert annotations to indcies

In [8]:
def max_length(annotations):
    max_len = 0
    for annotation in annotations:
        if len(annotation) > max_len:
            max_len = len(annotation)
    return max_len

In [9]:
max_len_train = max_length(train_annotation)
max_len_val = max_length(val_annotation)

max_len = max(max_len_train, max_len_val) + 2 ## '+2' for <start> and <end> indices!
print(max_len)

7


In [10]:
train_annotation_idx = []
val_annotation_idx = []

for annotation in train_annotation:
    # pad <start> index
    annotation_idx = [27]
    for character in annotation:
        annotation_idx.append(char2idx[character])
        
    # pad <end> index
    annotation_idx.append(28)
    
    while len(annotation_idx) < max_len:
        # pad <pad> index
        annotation_idx.append(0)
    train_annotation_idx.append(annotation_idx)
    
for annotation in val_annotation:
    annotation_idx = [27]
    for character in annotation:
        annotation_idx.append(char2idx[character])
    annotation_idx.append(28)
    while len(annotation_idx) < max_len:
        annotation_idx.append(0)
    val_annotation_idx.append(annotation_idx)

In [11]:
print(train_annotation[0])
print(train_annotation_idx[0])

thus
[27, 20, 8, 21, 19, 28, 0]


### 1-(3) resize images into sizes based on VGG19 architecture, and do normalize to 1~-1

In [12]:
#IMG_SIZE = 224

In [13]:
def load_image(image_name, annotation):
    img = tf.io.read_file(IMAGE_DIR + image_name + '.png')
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (IMG_HEIGHT, IMG_WIDTH)) # resize image(為的是要放到feature extractor裡面)
    img = img/255 - 1. # normalize to 1~-1
    return img, annotation

### create tf.dataset and make batches

In [14]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_img_name,train_annotation_idx))
val_dataset = tf.data.Dataset.from_tensor_slices((val_img_name,val_annotation_idx))

train_dataset = train_dataset.map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
val_dataset = val_dataset.map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)

train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(200)

val_dataset = val_dataset.shuffle(SHUFFLE_BUFFER_SIZE)
val_dataset = val_dataset.batch(BATCH_SIZE)
val_dataset = val_dataset.prefetch(200)

## 2. Design Feature Extractor: (based on VGG19)

VGG19 architecture: 

https://www.google.com/search?q=vgg19+architecture&tbm=isch&source=iu&ictx=1&fir=OiS_Va5y9f7f_M%252CGfR54ZAyOPXJ0M%252C_%253BgWpOWCUibA2HHM%252C-fAqcRc2_0E69M%252C_%253BBmAVVY3fiaEWSM%252Cr5RcjSElbvC9zM%252C_%253BIi4dkHHJCPBHhM%252C6SpiHVnsTHGfqM%252C_%253BN0Ee1uegeQS2MM%252CtmYWlqyrDnMDKM%252C_%253BDAYKPdDc3L8LWM%252CGfR54ZAyOPXJ0M%252C_%253Bulz740AMBJwELM%252CAQWccUmcjehYAM%252C_%253B7TyTAnvvZQlh2M%252Cz2PiMjqBWglM2M%252C_%253BfnUCv4NSRiu4sM%252CJ0XOR7dnwxmcfM%252C_%253BJCwII581R6z1lM%252CJVUs7QkfNRs0vM%252C_%253BetpWE5UJ8thKGM%252CG9LNfrtX5LmIQM%252C_%253BVZClRgHVIQzRDM%252CPLQfnY1iNdmakM%252C_%253B_r1pSVOfWcucoM%252CAzBUl5wwOfzzaM%252C_%253BMGWJupu_SmhpCM%252CJV6PdoPibOBEKM%252C_%253Bo3UI0f48t9XjTM%252CigQ3rReKeD7THM%252C_&vet=1&usg=AI4_-kRgiOWpbYylrLF6gz4Uqc-Symtxfw&sa=X&ved=2ahUKEwjY48WH0uP0AhVcs1YBHV_AC78Q9QF6BAgkEAE#imgrc=gWpOWCUibA2HHM

Input: 448x224x3

Output: 7x7x1024

In [15]:
class conv_relu(tf.keras.layers.Layer):
    def __init__(self, filters, size, stride):
        super(conv_relu, self).__init__()
        self.conv = tf.keras.layers.Conv2D(filters, size, stride, padding="same",
                      kernel_initializer=tf.keras.initializers.TruncatedNormal())
        self.batchnorm = tf.keras.layers.BatchNormalization()
        self.lkrelu = tf.keras.layers.LeakyReLU(0.1)

    def call(self, inputs, training):
        x = self.conv(inputs)
        x = self.batchnorm(x,training = training) ## batch normalization!
        x = self.lkrelu(x) ## leaky relu!
        return x

In [16]:
class Feature_Extracter(tf.keras.Model):

    def __init__(self):
        super(Feature_Extracter, self).__init__()
        self.cr1_1 = conv_relu(64,3,1)
        self.cr1_2 = conv_relu(64,3,1)
        self.max_pooling1 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))
        self.cr2_1 = conv_relu(128,3,1)
        self.cr2_2 = conv_relu(128,3,1)
        self.max_pooling2 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))
        self.cr3_1 = conv_relu(256,3,1)
        self.cr3_2 = conv_relu(256,3,1)
        self.cr3_3 = conv_relu(256,3,1)
        self.cr3_4 = conv_relu(256,3,1)
        self.max_pooling3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))
        self.cr4_1 = conv_relu(512,3,1)
        self.cr4_2 = conv_relu(512,3,1)
        self.cr4_3 = conv_relu(512,3,1)
        self.cr4_4 = conv_relu(512,3,1)
        self.max_pooling4 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))
        self.cr5_1 = conv_relu(512,3,1)
        self.cr5_2 = conv_relu(512,3,1)
        self.cr5_3 = conv_relu(512,3,1)
        self.cr5_4 = conv_relu(512,3,1)
        self.max_pooling5 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))
        self.cr6_1 = conv_relu(1024,3,1)
        self.cr6_2 = conv_relu(1024,3,1)
        self.cr6_3 = conv_relu(1024,3,1)
        self.cr6_4 = conv_relu(1024,3,1)

    def call(self, inputs, training):
        x = self.cr1_1(inputs,training)
        x = self.cr1_2(x,training)
        x = self.max_pooling1(x)
        x = self.cr2_1(x,training)
        x = self.cr2_2(x,training)
        x = self.max_pooling2(x)
        x = self.cr3_1(x,training)
        x = self.cr3_2(x,training)
        x = self.cr3_3(x,training)
        x = self.cr3_4(x,training)
        x = self.max_pooling3(x)
        x = self.cr4_1(x,training)
        x = self.cr4_2(x,training)
        x = self.cr4_3(x,training)
        x = self.cr4_4(x,training)        
        x = self.max_pooling4(x)
        x = self.cr5_1(x,training)
        x = self.cr5_2(x,training)
        x = self.cr5_3(x,training)
        x = self.cr5_4(x,training)
        x = self.max_pooling5(x)
        x = self.cr6_1(x,training)
        x = self.cr6_2(x,training)
        x = self.cr6_3(x,training)
        x = self.cr6_4(x,training)
        return x

In [17]:
feature_extracter = Feature_Extracter()
feature_extracter.build((None, IMG_HEIGHT, IMG_WIDTH, 3))
feature_extracter.summary()

Model: "feature__extracter"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv_relu (conv_relu)        multiple                  2048      
_________________________________________________________________
conv_relu_1 (conv_relu)      multiple                  37184     
_________________________________________________________________
max_pooling2d (MaxPooling2D) multiple                  0         
_________________________________________________________________
conv_relu_2 (conv_relu)      multiple                  74368     
_________________________________________________________________
conv_relu_3 (conv_relu)      multiple                  148096    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 multiple                  0         
_________________________________________________________________
conv_relu_4 (conv_relu)      multiple           

## 3. Design Encoder and Decoder

和lab一樣

In [18]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)

        # hidden shape == (batch_size, hidden_size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden_size)
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        # score shape == (batch_size, 64, hidden_size)
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))

        # attention_weights shape == (batch_size, 64, 1)
        # you get 1 at the last axis because you are applying score to self.V
        attention_weights = tf.nn.softmax(self.V(score), axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [19]:
class CNN_Encoder(tf.keras.Model):
    # Since you have already extracted the features and dumped it using pickle
    # This encoder passes those features through a Fully connected layer
    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        # shape after fc == (batch_size, 64, embedding_dim)
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        x = self.fc(x)
        x = tf.nn.relu(x)
        return x

In [20]:
class RNN_Decoder(tf.keras.Model):
    def __init__(self, embedding_dim, units, vocab_size):
        super(RNN_Decoder, self).__init__()
        self.units = units

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc1 = tf.keras.layers.Dense(self.units)
        self.fc2 = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.units)

    def call(self, x, features, hidden):
        # defining attention as a separate model
        context_vector, attention_weights = self.attention(features, hidden)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # shape == (batch_size, max_length, hidden_size)
        x = self.fc1(output)

        # x shape == (batch_size * max_length, hidden_size)
        x = tf.reshape(x, (-1, x.shape[2]))

        # output shape == (batch_size * max_length, vocab)
        x = self.fc2(x)

        return x, state, attention_weights

    def reset_state(self, batch_size):
        return tf.zeros((batch_size, self.units))

In [21]:
vocab_size = len(char2idx)
num_steps = len(train_img_name) // BATCH_SIZE 

# Shape of the vector extracted from VGG19 is (64, 2048)
# These two variables represent that vector shape
features_shape = 1024
attention_features_shape = 49 ## 因為出來是7x7x1024, reshape後變49x1024

In [22]:
encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

## 4. Design Model and Training

In [23]:
optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_mean(loss_)

### Checkpoint

In [24]:
checkpoint_path = "./checkpoints/assignment/train_vgg19"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

In [25]:
start_epoch = 0
if ckpt_manager.latest_checkpoint:
    start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])

### Training

In [26]:
# adding this in a separate cell because if you run the training cell
# many times, the loss_plot array will be reset
loss_plot = []

In [27]:
@tf.function
def train_step(img_tensor, target):
    loss = 0

    # initializing the hidden state for each batch
    # because the captions are not related from image to image
    hidden = decoder.reset_state(batch_size=target.shape[0])

    dec_input = tf.expand_dims([char2idx['<start>']] * BATCH_SIZE, 1)

    with tf.GradientTape() as tape:
        
        features = feature_extracter(img_tensor,True) ## 必須真的去extract features
        features = tf.reshape(features,(features.shape[0], -1, features.shape[3])) ## reshape 成 7x7x1024
        features = encoder(features)

        for i in range(1, target.shape[1]):
            # passing the features through the decoder
            predictions, hidden, _ = decoder(dec_input, features, hidden)

            loss += loss_function(target[:, i], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(target[:, i], 1) ## training 時可以用teacher forcing（也就是把已知的training label當成input，就可以平行）

    total_loss = (loss / int(target.shape[1]))

    trainable_variables = feature_extracter.trainable_variables + encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, trainable_variables)

    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return loss, total_loss

In [28]:
EPOCHS = 10

for epoch in range(start_epoch, EPOCHS):
    start = time.time()
    
    ''' training part '''
    total_train_loss = 0

    for (batch, (img_tensor, target)) in enumerate(train_dataset):
        batch_loss, t_loss = train_step(img_tensor, target)
        total_train_loss += t_loss
        print ('Epoch {} {}/{} Train Loss {:.6f}'.format(epoch + 1,batch+1,num_steps,total_train_loss/(batch+1)),end='\r')
    print('')
    
    ''' validation part '''
    equal_num = 0
    total_val_loss = 0
    
    for (batch, (img_tensor, target)) in enumerate(val_dataset):
        val_loss = 0
        
        hidden = decoder.reset_state(batch_size=target.shape[0])
        dec_input = tf.expand_dims([character_to_idx['<start>']]*BATCH_SIZE, 1)
        features = feature_extracter(img_tensor,False)
        features = tf.reshape(features,(features.shape[0], -1, features.shape[3]))
        features = encoder(features)
        
        ''' create batch大的result，第一個都是index = 27代表<start>，每decoder predict後就concat到裡面，最後在比對 '''
        result = np.full((BATCH_SIZE, 1), 27) 
        for i in range(1, target.shape[1]):
            # passing the features through the decoder
            predictions, hidden, _ = decoder(dec_input, features, hidden)
            predicted_id = tf.argmax(predictions,axis=1).numpy() ## 機率最高的word對應的index
            val_loss += loss_function(target[:, i], predictions)
            result = np.concatenate((result, predicted_id.reshape((BATCH_SIZE,1))), axis=1)
            dec_input = tf.expand_dims(predicted_id, 1) ## 這裡不是teacher forcing
        target_array = target.numpy()
        total_val_loss += (val_loss / int(target.shape[1]))
        
        ''' 對predict出來的結果，如果完全match ground truth 就++ '''
        for i in range(BATCH_SIZE):
            for j in range(max_len):
                if result[i][j] == 28 and target_array[i][j] == 28:
                    if (result[i][1:j] == target_array[i][1:j]).all():
                        equal_num+=1
                    break
        print ('Validation Accuracy {:.6f}, Validation Loss {:.6f}'.format(float(equal_num)/((batch+1)*BATCH_SIZE),total_val_loss/(batch+1)),end='\r')
    
    print('')


    # storing the epoch end loss value to plot later
    loss_plot.append(total_loss / num_steps)

    #if epoch % 5 == 0:
    ckpt_manager.save()
        
    output_string = 'Epoch {} Train Loss {:.6f} Validation Accuracy {:.6f} Validation Loss {:.6f}\n'.format(epoch + 1,
                                                             total_loss/num_steps,float(equal_num)/20000.,total_val_loss/val_num_steps)
    with open('./lab13-2_v4.log','a') as f:
        f.write(output_string)
    f.close()
    print ('Epoch {} Train Loss {:.6f} Validation Accuracy {:.6f}'.format(epoch + 1,
                                                             total_loss/num_steps,float(equal_num)/20000.))
    print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
    

2021-12-16 01:25:14.913092: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-12-16 01:25:14.936258: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2592000000 Hz


NotImplementedError: in user code:

    /tmp/ipykernel_17734/1481518333.py:19 train_step  *
        predictions, hidden, _ = decoder(dec_input, features, hidden)
    /tmp/ipykernel_17734/2338771246.py:27 call  *
        output, state = self.gru(x)
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py:668 __call__  **
        return super(RNN, self).__call__(inputs, **kwargs)
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:1030 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent_v2.py:426 call
        inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py:868 _process_inputs
        initial_state = self.get_initial_state(inputs)
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py:650 get_initial_state
        init_state = get_initial_state_fn(
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py:1963 get_initial_state
        return _generate_zero_filled_state_for_cell(self, inputs, batch_size, dtype)
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py:2998 _generate_zero_filled_state_for_cell
        return _generate_zero_filled_state(batch_size, cell.state_size, dtype)
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py:3016 _generate_zero_filled_state
        return create_zeros(state_size)
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py:3011 create_zeros
        return array_ops.zeros(init_state_size, dtype=dtype)
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:206 wrapper
        return target(*args, **kwargs)
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/ops/array_ops.py:2911 wrapped
        tensor = fun(*args, **kwargs)
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/ops/array_ops.py:2960 zeros
        output = _constant_if_small(zero, shape, dtype, name)
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/ops/array_ops.py:2896 _constant_if_small
        if np.prod(shape) < 1000:
    <__array_function__ internals>:5 prod
        
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3030 prod
        return _wrapreduction(a, np.multiply, 'prod', axis, dtype, out,
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/numpy/core/fromnumeric.py:87 _wrapreduction
        return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
    /home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:867 __array__
        raise NotImplementedError(

    NotImplementedError: Cannot convert a symbolic Tensor (rnn__decoder/gru/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported


## 5. Do validation

In [None]:
ckpt.restore('./checkpoints/assignment/ckpt-10')

In [None]:
equal_num = 0
total_val_loss = 0
for (batch, (img_tensor, target)) in enumerate(val_dataset):
    val_loss = 0
    hidden = decoder.reset_state(batch_size=target.shape[0])
    dec_input = tf.expand_dims([character_to_idx['<start>']]*BATCH_SIZE, 1)
    features = feature_extracter(img_tensor,False)
    features = tf.reshape(features,(features.shape[0], -1, features.shape[3]))
    features = encoder(features)
    result = np.full((BATCH_SIZE, 1), 27)
    for i in range(1, target.shape[1]):
        # passing the features through the decoder
        predictions, hidden, _ = decoder(dec_input, features, hidden)
        predicted_id = tf.argmax(predictions,axis=1).numpy()
        val_loss += loss_function(target[:, i], predictions)
        result = np.concatenate((result, predicted_id.reshape((BATCH_SIZE,1))), axis=1)
        dec_input = tf.expand_dims(predicted_id, 1)
    target_array = target.numpy()
    total_val_loss += (val_loss / int(target.shape[1]))
    for i in range(BATCH_SIZE):
        for j in range(max_len):
            if result[i][j] == 28 and target_array[i][j] == 28:
                if (result[i][1:j] == target_array[i][1:j]).all():
                    equal_num+=1
                break
    print ('Validation Accuracy {:.6f}, Validation Loss {:.6f}'.format(float(equal_num)/((batch+1)*BATCH_SIZE),total_val_loss/(batch+1)),end='\r')

## 6. Predict Testing data

In [None]:
test_img_name = []

for i in range(120000,140000):
    test_img_name.append('a'+str(i))

print(len(test_img_name))

In [None]:
def load_test_image(image_name):
    img = tf.io.read_file(IMAGE_DIR + image_name + '.png')
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (IMG_HEIGHT, IMG_WIDTH, 3))
    img = img/255 - 1.
    return img

In [None]:
test_dataset = tf.data.Dataset.from_tensor_slices(test_img_name)
test_dataset = test_dataset.map(load_test_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.prefetch(200)

In [None]:
num=0
for batch, img_tensor in enumerate(test_dataset):
    hidden = decoder.reset_state(batch_size=BATCH_SIZE)
    dec_input = tf.expand_dims([character_to_idx['<start>']]*BATCH_SIZE, 1)
    features = feature_extracter(img_tensor,False)
    features = tf.reshape(features,(features.shape[0], -1, features.shape[3]))
    features = encoder(features)
    result = np.full((BATCH_SIZE, 1), 27)
    for i in range(1, max_len):
        # passing the features through the decoder
        predictions, hidden, _ = decoder(dec_input, features, hidden)
        predicted_id = tf.argmax(predictions,axis=1).numpy()
        result = np.concatenate((result, predicted_id.reshape((BATCH_SIZE,1))), axis=1)
        dec_input = tf.expand_dims(predicted_id, 1)
    for i in range(BATCH_SIZE):
        output_str = ''
        num = num+1
        hit = False
        for j in range(1,max_len):
            if result[i][j] == 28:
                hit = True
                break
            else:
                output_str = output_str + idx_to_character[result[i][j]]
        if hit != True:
            print(num)
        with open('./Lab13-2_110062539.txt','a') as f:
            f.write('a' + str(119999 + num) + ' ' + output_str+'\n')
        f.close()
print(num)