In [15]:
''' This testing is done on Python version == 3.8.10'''
!pip freeze>requirements.txt

In [1]:
import cv2
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import time
from typing import List
import numpy as np
import imageio

In [16]:
# This cell is meant for machines without a GPU: only CPU devices are queried.
physical_devices = tf.config.list_physical_devices('CPU')
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], False)
except (IndexError, ValueError, RuntimeError):
    # Best-effort configuration. Only the expected failures are ignored:
    #   IndexError   - no device was listed,
    #   ValueError   - the device does not accept a memory-growth setting,
    #   RuntimeError - the TensorFlow runtime was already initialized.
    # Anything else (including KeyboardInterrupt) now propagates instead of
    # being silently swallowed by a bare `except`.
    pass

In [3]:
def load_video(path:str) -> "tf.Tensor":
    """Read a video file, crop a fixed window from each frame and standardize it.

    Every frame is converted to a single channel with
    ``tf.image.rgb_to_grayscale`` and cropped to rows 230:266 and columns
    130:220. The stacked frames are cast to float32 and standardized with the
    mean and standard deviation computed over axes 0, 1 and 2.

    Args:
        path: Location of the video file, passed to ``cv2.VideoCapture``.

    Returns:
        A float32 tensor holding the standardized, cropped frames.

    Raises:
        ValueError: If no frame could be read from ``path``.
    """
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                # The reported frame count can exceed what is actually
                # decodable; stop at the first failed read instead of handing
                # `None` to TensorFlow.
                break
            # NOTE(review): OpenCV usually returns BGR frames while
            # rgb_to_grayscale assumes RGB ordering. Left unchanged because the
            # loaded checkpoint was presumably trained with this exact
            # preprocessing - confirm before altering.
            frame = tf.image.rgb_to_grayscale(frame)
            frames.append(frame[230:266,130:220,:])
    finally:
        # Release the capture handle even when decoding or cropping raises.
        cap.release()

    if not frames:
        raise ValueError(f"No frames could be read from video: {path!r}")

    frames = tf.cast(frames, tf.float32)
    mean = tf.math.reduce_mean(frames, axis=[0,1,2], keepdims=True)
    std = tf.math.reduce_std(frames, axis=[0,1,2], keepdims=True)
    normalized_frames = (frames - mean) / std

    return normalized_frames

In [4]:
# Characters the lookup layers know about: lowercase letters, three
# punctuation marks, the digits 1-9 and the space character.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")

In [5]:
# Forward lookup: single characters -> integer ids, with "" as the
# out-of-vocabulary token.
char_to_num = tf.keras.layers.StringLookup(
    vocabulary=vocab,
    oov_token="",
)

# Inverse lookup: integer ids -> characters. It is built from the forward
# layer's own vocabulary so both directions use the same id assignment.
num_to_char = tf.keras.layers.StringLookup(
    invert=True,
    oov_token="",
    vocabulary=char_to_num.get_vocabulary(),
)

In [6]:
def load_alignments(path:str) -> "tf.Tensor":
    """Read an alignment file and encode its words as character ids.

    Each line is split on whitespace and its third field is taken as the
    word. Words equal to ``'sil'`` are skipped. The remaining words are
    joined with single spaces, split into characters and mapped through
    ``char_to_num``.

    Args:
        path: Location of the alignment text file.

    Returns:
        A 1-D tensor of character ids for the space-separated words.
    """
    tokens = []
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            if len(fields) < 3:
                # Blank or truncated line (e.g. a trailing newline at EOF):
                # there is no word field to read, so skip it instead of
                # failing with IndexError.
                continue
            if fields[2] != 'sil':
                # Append in place; rebuilding the list on every word was
                # quadratic in the number of words.
                tokens.extend((' ', fields[2]))
    # Every word is preceded by a ' ' token, so the flattened sequence starts
    # with a space; the trailing [1:] drops that first id.
    return char_to_num(tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), (-1)))[1:]

In [7]:
def load_data(path: "tf.Tensor"):
    """Load the frames and encoded alignment for one sample.

    Only the base file name (text before the first ``.`` of the last path
    component) is taken from ``path``. The video is then read from
    ``s22.mpg_vcd/s22/<name>.mpg`` and the alignment from
    ``s22_transcripts/align/<name>.align``.

    Args:
        path: Object whose ``.numpy()`` returns the sample path as bytes
            (for example a scalar string tensor). Both ``\\`` and ``/`` are
            accepted as path separators.

    Returns:
        A ``(frames, alignments)`` tuple as produced by ``load_video`` and
        ``load_alignments``.
    """
    path = bytes.decode(path.numpy())
    # Normalize Windows separators first so the same code extracts the file
    # name from both "a\\b\\c.mpg" and "a/b/c.mpg" style paths.
    file_name = path.replace('\\', '/').split('/')[-1].split('.')[0]
    video_path = os.path.join('s22.mpg_vcd','s22',f'{file_name}.mpg')
    alignment_path = os.path.join('s22_transcripts','align',f'{file_name}.align')
    frames = load_video(video_path)
    alignments = load_alignments(alignment_path)

    return frames, alignments

In [8]:
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, TimeDistributed, Flatten # type: ignore
from tensorflow.keras.optimizers import Adam # type: ignore
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler # type: ignore
from tensorflow.keras.initializers import Orthogonal # type: ignore

In [9]:
# Network definition, written as one declarative layer list.
# Input shape (75, 36, 90, 1): the 36x90x1 part matches the crop produced by
# load_video; 75 is presumably the number of frames per clip - confirm.
model = Sequential([
    # Three Conv3D -> ReLU -> MaxPool3D stages. The (1,2,2) pool size leaves
    # the first axis untouched and halves the next two.
    Conv3D(128, 3, input_shape=(75,36,90,1), padding='same'),
    Activation('relu'),
    MaxPool3D((1,2,2)),

    Conv3D(256, 3, padding='same'),
    Activation('relu'),
    MaxPool3D((1,2,2)),

    Conv3D(75, 3, padding='same'),
    Activation('relu'),
    MaxPool3D((1,2,2)),

    # Flatten each step along the first axis into a feature vector.
    TimeDistributed(Flatten()),

    # Two bidirectional LSTM layers that keep the full sequence, each
    # followed by 50% dropout.
    Bidirectional(LSTM(128, kernel_initializer=Orthogonal(), return_sequences=True)),
    Dropout(.5),

    Bidirectional(LSTM(128, kernel_initializer=Orthogonal(), return_sequences=True)),
    Dropout(.5),

    # Per-step softmax over the vocabulary plus one extra class
    # (presumably the CTC blank - confirm against the training code).
    Dense(char_to_num.vocabulary_size()+1, kernel_initializer='he_normal', activation='softmax'),
])

# Restore the trained weights from the checkpoint stored under models/.
model.load_weights(os.path.join('models', 'checkpoint'))

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x17d7d8a7790>

In [10]:
def mappable_function(path:str) ->List[str]:
    """Wrap ``load_data`` in ``tf.py_function``.

    The wrapped call is declared to return a float32 tensor and an int64
    tensor, in that order.
    """
    return tf.py_function(
        func=load_data,
        inp=[path],
        Tout=(tf.float32, tf.int64),
    )

In [11]:
# Load one sample clip: a[0] holds the preprocessed frames, a[1] the encoded
# alignment.
sample_path = tf.convert_to_tensor('.\\s22.mpg_vcd\\s22\\bbaj7p.mpg')
a = load_data(sample_path)

In [12]:
# Add a leading batch axis to the frames of the single sample, then run the
# model on that one-element batch.
sample_batch = tf.expand_dims(a[0], axis=0)
smpl = model.predict(sample_batch)



In [13]:
# Greedy CTC decoding of the prediction. ctc_decode returns the decoded paths
# together with their log-probabilities; only the first path is kept.
decoded_paths, _ = tf.keras.backend.ctc_decode(smpl, input_length=[75], greedy=True)
decoded = decoded_paths[0].numpy()

In [14]:
# Map every decoded id back to its character and join the characters of each
# sentence into one string tensor.
[
    tf.strings.reduce_join(list(map(num_to_char, sentence)))
    for sentence in decoded
]

[<tf.Tensor: shape=(), dtype=string, numpy=b'bin blue at seven please'>]