In [57]:
import numpy as np
import tensorflow as tf
import cv2 

In [58]:

# Load TFLite model and allocate tensors.
interpreter = tf.lite.Interpreter(model_path="finalModelCUSTOM.tflite")
interpreter.allocate_tensors()

# Get input and output tensors.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
print(input_details)
print(output_details)

# Test model on random input data.
input_shape = input_details[0]['shape']
# 
# input_data = np.array(np.random.random_sample(input_shape), dtype=np.float32)
# interpreter.set_tensor(input_details[0]['index'], input_data)
# 
# interpreter.invoke()
# 
# # The function `get_tensor()` returns a copy of the tensor data.
# # Use `tensor()` in order to get a pointer to the tensor.
# output_data = interpreter.get_tensor(output_details[0]['index'])
# print(output_data)

[{'name': 'serving_default_conv1_input:0', 'index': 0, 'shape': array([  1, 290,  40, 120,   1]), 'shape_signature': array([ -1, 290,  40, 120,   1]), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]
[{'name': 'StatefulPartitionedCall:0', 'index': 115, 'shape': array([ 1,  1, 29]), 'shape_signature': array([-1, -1, 29]), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]
[  1 290  40 120   1]


In [59]:
def padTensor(tensor, target_shape, padding_value=0):
  """Pads a NumPy tensor to the specified target shape with a given padding value.

  Args:
      tensor: The NumPy tensor to be padded.
      target_shape: The desired padded shape (tuple).
      padding_value: The value to use for padding (default: 0).

  Returns:
      The padded NumPy tensor.
  """

  pad_width = [(0, max(target_shape[i] - tensor.shape[i], 0)) for i in range(len(target_shape))]
  return np.pad(tensor, pad_width, mode='constant', constant_values=padding_value)

In [60]:
faceCascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')

def faceDetection(img):
    # TROUBLESHOOTING
    # print("max size:",img.shape, img.shape[0] - 3 * padding, img.shape[1] - 3 * padding)
    return faceCascade.detectMultiScale(
        img,
        scaleFactor=1.3,
        minNeighbors=5,
        minSize=(30, 30),
    )

def cropForMouth(img):
    global lastKnownCrop
    rects = faceDetection(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
    
    # finding the largest face in a given image 
    largestFace = (0,0,0,0)
    for (x, y, w, l) in rects:
        if (w * l) > largestFace[2] * largestFace[3]:
            largestFace = (x, y,w,l)
        
    if largestFace == (0,0,0,0):
        largestFace =lastKnownCrop
    # cropping for face 
    lastKnownCrop = largestFace
    y1 = lastKnownCrop[1] 
    x1 = lastKnownCrop[0]
    y2 = y1 + lastKnownCrop[3] 
    x2 = x1 + lastKnownCrop[2]
    return img[y1 + int(0.65 * lastKnownCrop[3]): y2, x1 + int(0.05 * lastKnownCrop[2]): int(0.95 * x2)]

def loadVideo(path): 
    cap = cv2.VideoCapture(path)
    global lastKnownCrop, frameSizeOld, frameSize, newFrameSize, grayFrame
    global errorNums
    processedFrames = []
    isFirstFrame = True
    frameShape = None
    # for each frame 
    for n in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))): 
        ret, frame = cap.read()
        
        # in case a frame is missing, just continue
        if frame is None or frame.shape[0] == 0: 
            continue
        
        if isFirstFrame: 
            frameShape  = frame.shape
            isFirstFrame = False
        
        if frame.shape != frameShape: 
            continue
        # crop only the mouth like we'll do on the RPI 
        frameSizeOld = frame.shape
        frame = cropForMouth(frame)
        frameSize = frame.shape
        frame = cv2.resize(frame, (120, 40))
        newFrameSize = frame.shape
        grayFrame = tf.image.rgb_to_grayscale(frame)
        processedFrames.append(grayFrame)
        # processedFrames = [*processedFrames, grayFrame]
    mean = tf.math.reduce_mean(processedFrames, keepdims=True)
    cap.release()    

   
    std = tf.math.reduce_std(tf.cast(processedFrames, tf.float32), keepdims=True)
    frames = tf.cast(processedFrames, tf.float32)
    normalizedFrames = (tf.cast(frames, tf.float32) - tf.cast(mean, tf.float32)) / tf.cast(std, tf.float32)
    return normalizedFrames

In [61]:
path = "A:\Lip Reading\Potential Datasets\BBC LRS2\\allFiles\\5570920046221178499_00015.mp4"
inputVid = loadVideo(path)
inputVid.shape

TensorShape([130, 40, 120, 1])

In [62]:
paddedVid  = padTensor(inputVid, target_shape = (290, 40, 120, 1))
paddedVid.shape

(290, 40, 120, 1)

In [63]:
batchedPaddedVid = paddedVid.reshape((1,) + paddedVid.shape)  # Add a dimension of size 1
batchedPaddedVid.shape

(1, 290, 40, 120, 1)

In [64]:

input_data = np.array(batchedPaddedVid, dtype=np.float32)
interpreter.set_tensor(0, input_data)

interpreter.invoke()

# The function `get_tensor()` returns a copy of the tensor data.
# Use `tensor()` in order to get a pointer to the tensor.
output_data = interpreter.get_tensor(output_details[0]['index'])
print(output_data)
print(output_data.shape)

[[[7.23460573e-07 6.93142829e-06 2.65667361e-04 ... 2.46738011e-07
   6.97800851e-06 1.11549625e-05]
  [7.90351635e-07 4.23869824e-05 1.43869022e-06 ... 8.09406853e-09
   5.78710024e-06 1.69129824e-04]
  [1.64023356e-06 2.26260186e-03 5.22224695e-07 ... 2.80639085e-08
   5.45448256e-06 3.76769225e-04]
  ...
  [9.60396826e-01 7.50445892e-08 7.06979648e-08 ... 1.45323797e-09
   2.31208290e-07 3.95952016e-02]
  [5.99314459e-03 1.26904990e-06 1.10831252e-07 ... 1.26646305e-09
   4.98851068e-06 9.93992627e-01]
  [6.48846269e-01 5.35934578e-06 2.41780367e-06 ... 3.59239216e-08
   3.72095710e-05 3.50981325e-01]]]
(1, 290, 29)


In [65]:
vocab = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ "]
charToNum = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
numToChar = tf.keras.layers.StringLookup(vocabulary=charToNum.get_vocabulary(), oov_token="", invert=True)

In [66]:
decoded = tf.keras.backend.ctc_decode(output_data, input_length=[145*2], greedy=True)[0][0].numpy()

prediction = tf.strings.reduce_join(numToChar(decoded[0])).numpy().decode('utf-8')
prediction

'WHICH OF OUR CONTESTANTS TODAY IS GOING TO MAKE A THUMPING GREAT PROFIT'

In [67]:
def _remove_repeats(inds):
    is_not_repeat = np.insert(np.diff(inds).astype(bool), 0, True)
    return inds[is_not_repeat]

def _remove_blanks(inds, n_classes):
    return inds[inds < (n_classes - 1)]

def ctc_decode_np(y_pred, input_length):
    # Note:
    # Last element in alphabet treated as blank character
    # decoded_dense padded with -1

    n_samples = y_pred.shape[0]
    n_classes = y_pred.shape[-1]
    log_prob = np.zeros((n_samples, 1))
    decoded_dense = -np.ones_like(y_pred[..., 0])
    decoded_length = np.zeros((n_samples,), dtype=np.int32)

    for i in range(n_samples):
        print('-'*60)
        # [n_time_steps, alphabet_size]
        prob = y_pred[i]

        length = input_length[i]

        decoded = np.argmax(prob[:length], axis=-1)

        # print('decoded:', decoded)

        log_prob[i] = -np.sum(np.log(prob[np.arange(length), decoded]))

        decoded = _remove_repeats(decoded)

        # print('decoded remove_repeats:', decoded)

        decoded = _remove_blanks(decoded, n_classes)

        # print('decoded remove_blanks:', decoded)

        decoded_length[i] = len(decoded)
        decoded_dense[i, :len(decoded)] = decoded



    return decoded_dense[:, :np.max(decoded_length)], log_prob

# [batch_size, ]
input_length = np.array([290], dtype=np.int32)
# print('input_length.shape', input_length.shape)
ctc_decode_np(output_data, input_length=input_length)


------------------------------------------------------------


(array([[23.,  8.,  9.,  3.,  8., 27., 15.,  6., 27., 15., 21., 18., 27.,
          3., 15., 14., 20.,  5., 19., 20.,  1., 14., 20., 19., 27., 20.,
         15.,  4.,  1., 25., 27.,  9., 19., 27.,  7., 15.,  9., 14.,  7.,
         27., 20., 15., 27., 13.,  1., 11.,  5., 27.,  1., 27., 20.,  8.,
         21., 13., 16.,  9., 14.,  7., 27.,  7., 18.,  5.,  1., 20., 27.,
         16., 18., 15.,  6.,  9., 20.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]], dtype=float32),
 array([[14.22221565]]))

In [68]:
scuffedDecode = ctc_decode_np(output_data, input_length=input_length)

prediction = tf.strings.reduce_join(numToChar(scuffedDecode[0])).numpy().decode('utf-8')
prediction

------------------------------------------------------------


'WHICH OF OUR CONTESTANTS TODAY IS GOING TO MAKE A THUMPING GREAT PROFIT'

In [137]:
import numpy as np

vocab = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ "]
charToIdx = {char: i for i, char in enumerate(vocab)}
IdxToChar = {i: char for i, char in enumerate(vocab)}
def encode_string(text):
  """Encodes a string into a vector of character indices.

  Args:
      text: The string to be encoded.

  Returns:
      A NumPy array of integer indices representing the encoded string.
  """
  encoded_vector = [charToIdx.get(char, charToIdx[" "]) for char in text]
  return np.array(encoded_vector, dtype=np.int32)

def decode_vector(encoded_vector):
  """Decodes a vector of character indices back to a string.

  Args:
      encoded_vector: A NumPy array of integer indices representing the encoded string.

  Returns:
      The decoded string.
  """
  
  # print(index_to_char.get(0))

  decoded_string = "".join([IdxToChar.get(idx-1, " ") for idx in encoded_vector])
  # - 1 because everything is offset by one for some reason
  return decoded_string

In [143]:
d = scuffedDecode[0][0]
finalMsg = ''.join(decode_vector(d))
    