In [79]:
import tensorflow as tf
import cv2
import os
import numpy as np
from matplotlib import pyplot as plt
import imageio
import gdown
from statistics import mean, stdev
from tensorflow.keras import datasets, layers, models
import re
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.optimizers import Adam

In [2]:
mpg = []
align = []

for folders, _, filenames in os.walk(os.getcwd()):
    for file in filenames:
        file_path = os.path.join(folders, file)
        if file_path.endswith('mpg'):
            mpg.append(file_path)
        elif file_path.endswith('align'):
            align.append(file_path)

In [11]:
def load_mpg(mpg_file):
    
    cam = cv2.VideoCapture(mpg_file)
    frames = []
    
    for frame in range(int(cam.get(cv2.CAP_PROP_FRAME_COUNT))):
        ret, frame = cam.read()
        frame = tf.image.rgb_to_grayscale(frame)
        frame = tf.cast(frame, tf.float32)
        frame = frame[190:236,80:220,:]
        frames.append(frame)
        
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    
    return tf.cast((frames - mean), tf.float32) / std



In [19]:
vocabulary = [word for word in "abcdefghijklmnopqrstuvwxyz'?!123456789"]

char_to_num = tf.keras.layers.StringLookup(vocabulary=vocabulary, oov_token=" ")
num_to_char = tf.keras.layers.StringLookup(vocabulary=char_to_num.get_vocabulary(), oov_token=" ", invert=True)

In [77]:
def load_align(align_file):
    words = []
    words_list = []
    with open(align_file) as file:
        for file in file:
            words.append(file.strip().split(' ')[2])
    
    words = list(map(lambda x: x.replace('sil', ' '), words))
    
    for word in words:
        if word == ' ':
            words_list.append(word)
        else:
            words_list.extend((word, " "))
    
    words_list = words_list[:-2]
    
    return char_to_num(tf.reshape(tf.strings.unicode_split(words_list, 'UTF-8'), (-1)))[1:]

load_align(align[0])

<tf.Tensor: shape=(21,), dtype=int64, numpy=
array([ 2,  9, 14,  0,  2, 12, 21,  5,  0,  1, 20,  0,  6,  0, 20, 23, 15,
        0, 14, 15, 23], dtype=int64)>

In [54]:
def load_data(mpg_file):
    mpg = tf.convert_to_tensor(mpg_file).numpy().decode('utf-8')
    root = tf.strings.split(mpg, sep='\\')[:7]
    constant = tf.constant(['alignments'])
    align = tf.concat([root, constant], axis=0)
    align_end = tf.strings.split(mpg, sep='\\')[7:]
    align = tf.concat([align, align_end], axis=0)
    align = tf.strings.reduce_join((tf.strings.regex_replace(align, 'mpg', 'align')), separator='\\').numpy().decode('utf-8')
    
    return load_mpg(mpg), load_align(align)

In [55]:
load_data(mpg[0])

(<tf.Tensor: shape=(75, 46, 140, 1), dtype=float32, numpy=
 array([[[[ 1.4685191 ],
          [ 1.4685191 ],
          [ 1.4310399 ],
          ...,
          [ 0.38162336],
          [ 0.38162336],
          [ 0.38162336]],
 
         [[ 1.4685191 ],
          [ 1.4685191 ],
          [ 1.4310399 ],
          ...,
          [ 0.38162336],
          [ 0.38162336],
          [ 0.38162336]],
 
         [[ 1.4310399 ],
          [ 1.4310399 ],
          [ 1.4685191 ],
          ...,
          [ 0.30666503],
          [ 0.30666503],
          [ 0.30666503]],
 
         ...,
 
         [[ 1.0187691 ],
          [ 1.0187691 ],
          [ 0.9812899 ],
          ...,
          [ 0.08179008],
          [ 0.08179008],
          [ 0.04431092]],
 
         [[ 1.0187691 ],
          [ 1.0187691 ],
          [ 0.9812899 ],
          ...,
          [ 0.08179008],
          [ 0.04431092],
          [ 0.04431092]],
 
         [[ 1.0187691 ],
          [ 1.0187691 ],
          [ 0.9812899 ],
          

In [49]:
def mappable_function(file):
    return tf.py_function(load_data, [file], (tf.float32, tf.int64))

In [86]:
data = tf.data.Dataset.list_files(mpg)

data = data.map(mappable_function)
data = data.padded_batch(2, padded_shapes=([75, None, None, None], [40]))
data = data.prefetch(500)
data = data.cache()

train = data.take(450)
test = data.take(450)

In [103]:
model = tf.keras.Sequential()

model.add(tf.keras.layers.Conv3D(128, input_shape=(75, 46, 140, 1), kernel_size=(3,3,3), padding='same'))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.MaxPooling3D((1,2,2)))

model.add(tf.keras.layers.Conv3D(256, input_shape=(75, 46, 140, 1), kernel_size=(3,3,3), padding='same'))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.MaxPooling3D((1,2,2)))

model.add(tf.keras.layers.Conv3D(75, input_shape=(75, 46, 140, 1), kernel_size=(3,3,3), padding='same'))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.MaxPooling3D((1,2,2)))

model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Flatten()))

model.add(tf.keras.layers.Bidirectional(LSTM(128, recurrent_initializer="orthogonal" ,return_sequences=True)))
model.add(tf.keras.layers.Dropout(0.5))

model.add(tf.keras.layers.Bidirectional(LSTM(128, recurrent_initializer="orthogonal" ,return_sequences=True)))
model.add(tf.keras.layers.Dropout(0.5))

model.add(tf.keras.layers.Dense(char_to_num.vocabulary_size()+1, kernel_initializer="he_normal", activation="softmax"))

In [104]:
def loss_function(y_pred, y_true):
    
    batch_length = tf.shape(y_true)[0]
    input_length = tf.shape(y_pred)[1]
    label_length = tf.shape(y_true)[1]
    
    input_length = tf.expand_dims(tf.ones((batch_length,), tf.int32) * input_length, axis=-1)
    label_length = tf.expand_dims(tf.ones((batch_length,), tf.int32) * label_length, axis=-1)
    
    return tf.keras.backend.ctc_batch_cost(y_pred, y_true, input_length, label_length)

In [105]:
class Callback(tf.keras.callbacks.Callback):
    def __init__(self, data):
        self.data = data.as_numpy_iterator()
        
    def on_epoch_end(self, model):
        next_data = self.data.next()
        prediction = self.model.predict(next_data[0])
        decode = tf.keras.backend.ctc_decode(prediction, [75,75], greedy=False)[0][0].numpy()
        
        for batch in range(len(prediction)):
            print('Prediction: {}'.format(tf.strings.reduce_join(decode[batch]))).numpy().decode('utf-8')
            print('Actual: {}'.format(tf.strings.reduce_join(next_data[1][batch]))).numpy().decode('utf-8')
            print('~' * 100)

In [106]:
def lr_scheduler(epochs, lr):
    
    if epochs < 30:
        return lr
    else:
        return lr * tf.math.exp(-0.1)

In [107]:
model.compile(optimizer=Adam(learning_rate=0.0001), loss=loss_function)

In [108]:
schedule_callback = LearningRateScheduler(lr_scheduler)
example_callback = Callback(test)
model.fit(train, validation_data=test, epochs=100, callbacks=[schedule_callback, example_callback])

Epoch 1/100


KeyboardInterrupt: 

In [110]:
test_data = test.as_numpy_iterator()

In [111]:
samples = test_data.next()

In [113]:
predict = model.predict(samples[0])



In [128]:
[tf.strings.reduce_join([num_to_char(word) for word in sentence]) for sentence in samples[1]]

[<tf.Tensor: shape=(), dtype=string, numpy=b'place red with d six please             '>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'place green in x six now                '>]

In [130]:
decoded_value = tf.keras.backend.ctc_decode(predict, [75,75], greedy=False)[0][0].numpy()
[tf.strings.reduce_join([num_to_char(word) for word in sentence]) for sentence in decoded_value]

[<tf.Tensor: shape=(), dtype=string, numpy=b'x8x8b8m8m8b8b8b8bmb8b8b8mbkbkbt                                            '>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'xw8wkwkwmwkwkwkwkwkwkwkwkwmwmkmt                                           '>]