In [1]:
# imports 
import numpy as np
import tensorflow as tf
import cv2 

In [12]:
# Build the TFLite interpreter from the converted model file and reserve its tensor buffers.
interpreter = tf.lite.Interpreter(model_path="finalModelCUSTOM.tflite")
interpreter.allocate_tensors()

# Keep the tensor metadata around; later cells read the output tensor index from outputDetails.
inputDetails = interpreter.get_input_details()
outputDetails = interpreter.get_output_details()
inputShape = inputDetails[0]['shape']

# Show the metadata so the layout the model expects is visible in the cell output.
print(inputDetails)
print('\n', outputDetails)
print("\n input dims", inputShape)


[{'name': 'serving_default_conv1_input:0', 'index': 0, 'shape': array([  1, 290,  40, 120,   1]), 'shape_signature': array([ -1, 290,  40, 120,   1]), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]

 [{'name': 'StatefulPartitionedCall:0', 'index': 115, 'shape': array([ 1,  1, 29]), 'shape_signature': array([-1, -1, 29]), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]

 input dims [  1 290  40 120   1]


## VIDEO PROCESSING FUNCTIONS

In [4]:
# Haar-cascade face detector shared by faceDetection(); the XML path is relative, so the
# file is looked up in the current working directory.
faceCascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
# CascadeClassifier does not raise when the file cannot be loaded; it just stays empty and
# detectMultiScale() fails much later. Fail here with an actionable message instead.
if faceCascade.empty():
    raise FileNotFoundError(
        "could not load 'haarcascade_frontalface_default.xml'; "
        "make sure the cascade file is in the working directory"
    )

def faceDetection(img):
    """Run the module-level Haar cascade on ``img`` and return the detections.

    Args:
        img: Image handed straight to ``faceCascade.detectMultiScale``; ``cropForMouth``
            calls this with a grayscale frame.

    Returns:
        Whatever ``detectMultiScale`` returns; ``cropForMouth`` iterates it as
        ``(x, y, w, h)`` boxes.
    """
    detectorArgs = dict(scaleFactor=1.3, minNeighbors=5, minSize=(30, 30))
    faces = faceCascade.detectMultiScale(img, **detectorArgs)
    return faces

def cropForMouth(img):
    """Crop a BGR frame down to the mouth region of the largest detected face.

    The face box comes from ``faceDetection``. When no face is found in this frame, the
    most recent box stored in the global ``lastKnownCrop`` is reused. If no box has been
    stored yet, the whole frame is treated as the face box (instead of failing with a
    ``NameError``).

    Args:
        img: Frame indexed as ``img[rows, cols]`` and convertible with
            ``cv2.COLOR_BGR2GRAY``.

    Returns:
        The slice of ``img`` covering the bottom 35% of the face box's height and the
        middle 90% of its width.
    """
    global lastKnownCrop
    rects = faceDetection(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))

    # keep the detection with the largest area
    largestFace = None
    largestArea = 0
    for (x, y, w, h) in rects:
        if w * h > largestArea:
            largestFace = (int(x), int(y), int(w), int(h))
            largestArea = w * h

    if largestFace is None:
        # nothing detected in this frame: reuse the previous box when a usable one exists
        previous = globals().get("lastKnownCrop")
        if previous is not None and previous[2] * previous[3] > 0:
            largestFace = tuple(previous)

    if largestFace is None:
        # No face has been seen so far. Use the full frame for this call only; it is not
        # remembered, so the first real detection is what later frames fall back to.
        largestFace = (0, 0, img.shape[1], img.shape[0])
    else:
        lastKnownCrop = largestFace

    x1, y1, w, h = largestFace
    top = y1 + int(0.65 * h)
    bottom = y1 + h
    left = x1 + int(0.05 * w)
    # Inset the right edge relative to the box. Scaling the absolute coordinate
    # (0.95 * (x1 + w)) made the crop width depend on where the face sits in the frame
    # and produced an empty slice for faces far from the left edge.
    right = x1 + int(0.95 * w)
    return img[top:bottom, left:right]

def loadVideo(path):
    """Read a video, crop every frame to the mouth, and return normalised grayscale frames.

    Each usable frame is cropped with ``cropForMouth``, resized with
    ``cv2.resize(frame, (120, 40))`` and converted with ``tf.image.rgb_to_grayscale``.
    The stacked frames are then shifted by their mean and divided by their standard
    deviation (both reduced over the whole clip).

    Frames that fail to decode, or whose shape differs from the first decoded frame,
    are skipped.

    Args:
        path: Path of the video file, passed to ``cv2.VideoCapture``.

    Returns:
        A float32 tensor with one entry per kept frame (the sample clip in this notebook
        yields shape ``(130, 40, 120, 1)``).

    Raises:
        IOError: If the video cannot be opened.
        ValueError: If no frame could be decoded.
    """
    # Module-level debug values (frame shapes before/after crop and resize, and the last
    # grayscale frame); still written so they can be inspected from other cells.
    global frameSizeOld, frameSize, newFrameSize, grayFrame

    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"could not open video: {path!r}")

    processedFrames = []
    frameShape = None  # shape of the first decoded frame; later frames must match it
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()

            # in case a frame is missing, just continue
            if not ret or frame is None or frame.shape[0] == 0:
                continue

            if frameShape is None:
                frameShape = frame.shape
            if frame.shape != frameShape:
                continue

            # crop only the mouth like we'll do on the RPI
            frameSizeOld = frame.shape
            frame = cropForMouth(frame)
            frameSize = frame.shape
            frame = cv2.resize(frame, (120, 40))
            newFrameSize = frame.shape
            # NOTE(review): cropForMouth treats these frames as BGR (COLOR_BGR2GRAY), so
            # rgb_to_grayscale sees BGR channel order here. Left unchanged so the input
            # matches what produced the outputs recorded in this notebook; confirm
            # against the training pipeline before changing.
            grayFrame = tf.image.rgb_to_grayscale(frame)
            processedFrames.append(grayFrame)
    finally:
        # always hand the capture back, even if cropping or resizing raised
        cap.release()

    if not processedFrames:
        raise ValueError(f"no frames could be read from video: {path!r}")

    # NOTE(review): the mean is reduced over the frames in their original dtype
    # (presumably uint8), while the std is reduced over a float32 cast. Kept as-is so the
    # normalisation is numerically identical to before; confirm against training first.
    mean = tf.math.reduce_mean(processedFrames, keepdims=True)
    frames = tf.cast(processedFrames, tf.float32)
    std = tf.math.reduce_std(frames, keepdims=True)
    normalizedFrames = (frames - tf.cast(mean, tf.float32)) / std
    return normalizedFrames

In [9]:
def padTensor(tensor, target_shape, padding_value=0):
  """Pads an array at the end of each axis until it reaches the target shape.

  Axes that are already at least as long as the target are left untouched; nothing is
  ever cropped.

  Args:
      tensor: The array-like to be padded (anything ``np.asarray`` accepts).
      target_shape: The desired minimum size per axis (tuple); must have exactly one
          entry per axis of ``tensor``.
      padding_value: The value to use for padding (default: 0).

  Returns:
      The padded NumPy array.

  Raises:
      ValueError: If ``target_shape`` and ``tensor`` have a different number of axes.
  """
  data = np.asarray(tensor)

  # A length mismatch used to surface as a bare IndexError, or (for a one-entry
  # target_shape) np.pad silently broadcast the padding to every axis.
  if data.ndim != len(target_shape):
    raise ValueError(
        "target_shape has %d dims but tensor has %d" % (len(target_shape), data.ndim)
    )

  pad_width = [(0, max(int(want) - have, 0)) for want, have in zip(target_shape, data.shape)]
  return np.pad(data, pad_width, mode='constant', constant_values=padding_value)

## LOADING SAMPLE DATA 

In [11]:
# Raw string so the Windows backslashes are taken literally; sequences such as "\L" or
# "\P" are not valid escapes in a normal string literal and trigger a warning.
# NOTE: this is a machine-specific absolute path; point it at a local copy of the clip.
path = r"A:\Lip Reading\Potential Datasets\BBC LRS2\allFiles\5570920046221178499_00015.mp4"
inputVid = loadVideo(path)
print("input video shape:", inputVid.shape)

# padding the tensor to the appropriate dims
paddedVid  = padTensor(inputVid, target_shape = (290, 40, 120, 1))
print("padded video shape:", paddedVid.shape)

# since model was trained in batches, the dataset needs to be reshaped with 1 more dim
batchedPaddedVid = paddedVid.reshape((1,) + paddedVid.shape)  # Add a dimension of size 1
print("batched padded video shape:", batchedPaddedVid.shape)

input video shape: (130, 40, 120, 1)
padded video shape: (290, 40, 120, 1)
batched padded video shape: (1, 290, 40, 120, 1)


In [13]:
inputData = np.array(batchedPaddedVid, dtype=np.float32) # make data into a np arr

# Address the input tensor through the index the interpreter reports rather than a
# hard-coded 0, the same way the output tensor is addressed below.
interpreter.set_tensor(inputDetails[0]['index'], inputData) # set tensor to input data

interpreter.invoke() # "prediction"

# The function `get_tensor()` returns a copy of the tensor data.
# Use `tensor()` in order to get a pointer to the tensor.
outputData = interpreter.get_tensor(outputDetails[0]['index'])
print("output data shape:", outputData.shape)

output data shape: (1, 290, 29)


## CTC DECODING

In [20]:
# Characters in index order: A-Z followed by a space.
# Presumably this must stay identical to the vocabulary the model was trained with.
vocab = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ ")

# Lookup tables standing in for the model's charToNum / numToChar layers:
# character -> position in vocab, and position in vocab -> character.
charToIdx = dict(zip(vocab, range(len(vocab))))
IdxToChar = dict(enumerate(vocab))

In [25]:
def removeRepeats(inds):
    """Collapse runs of consecutive equal values down to a single occurrence.

    Args:
        inds: 1-D array-like of class indices.

    Returns:
        NumPy array with every element that differs from its predecessor (the first
        element is always kept). Empty input is returned as an empty array.
    """
    inds = np.asarray(inds)
    if inds.size == 0:
        # a length-1 "keep" mask cannot index an empty array, so return early
        return inds
    # keep the first element, then every element that differs from the one before it
    keep = np.concatenate(([True], inds[1:] != inds[:-1]))
    return inds[keep]

def removeBlanks(inds, numClasses):
    """Drop CTC blanks, i.e. keep only the entries below the last class index.

    Args:
        inds: NumPy array of class indices.
        numClasses: Total number of classes; ``numClasses - 1`` is treated as the blank.

    Returns:
        The entries of ``inds`` that are smaller than ``numClasses - 1``.
    """
    blankIdx = numClasses - 1
    keepMask = np.less(inds, blankIdx)
    return inds[keepMask]

def ctcDecode(yPred, inputLen):
    """Greedy (best-path) CTC decoding of a batch of per-step class probabilities.

    For every sample the most likely class is taken at each time step, consecutive
    repeats are collapsed and blanks are removed. The blank is the LAST class, i.e.
    index ``yPred.shape[-1] - 1`` (not the last entry of ``vocab``).

    Args:
        yPred: Array indexed as ``[sample, time, class]`` holding class probabilities.
        inputLen: One length per sample; only the first ``inputLen[i]`` time steps of
            sample ``i`` are decoded. Lengths beyond the time axis are clamped to it.

    Returns:
        A tuple ``(decodedDense, logProb)``. ``decodedDense`` is an int32 array of class
        indices, one row per sample, right-padded with -1 and trimmed to the longest
        decoded sequence. ``logProb`` has shape ``(numSamples, 1)`` and holds the
        negative sum of the log-probabilities of the chosen classes.
    """
    yPred = np.asarray(yPred)
    numSamples, maxTime, numClasses = yPred.shape

    logProb = np.zeros((numSamples, 1))
    # class indices are integers; the padding value -1 can never be a real class
    decodedDense = np.full((numSamples, maxTime), -1, dtype=np.int32)
    decodedLen = np.zeros((numSamples,), dtype=np.int32)

    for i in range(numSamples):
        # never read past the time axis, even if the caller passes a longer length
        length = min(int(inputLen[i]), maxTime)
        prob = yPred[i, :length]

        # best class per time step and the probability it was chosen with
        decoded = np.argmax(prob, axis=-1)
        logProb[i] = -np.sum(np.log(prob[np.arange(length), decoded]))

        # collapse repeats first, then drop blanks (standard CTC collapsing order)
        decoded = removeBlanks(removeRepeats(decoded), numClasses)

        decodedLen[i] = len(decoded)
        decodedDense[i, :len(decoded)] = decoded

    # np.max raises on an empty array, so handle an empty batch explicitly
    longest = int(decodedLen.max()) if numSamples else 0
    return decodedDense[:, :longest], logProb

def encodeString(text):
  """Encode ``text`` as class indices in the numbering that ``decodeVector`` expects.

  ``decodeVector`` maps class index ``k`` to ``vocab[k - 1]``, so the character at vocab
  position ``p`` is emitted as ``p + 1``; this makes ``decodeVector(encodeString(s))``
  return ``s`` for text made of vocab characters. Characters that are not in ``vocab``
  are encoded as the space character.

  Args:
      text: String to encode.

  Returns:
      int32 NumPy array with one class index per character of ``text``.
  """
  spaceIdx = charToIdx[" "]
  encodedVec = [charToIdx.get(char, spaceIdx) + 1 for char in text]
  return np.array(encodedVec, dtype=np.int32)

def decodeVector(encodedVector):
  """Turn a sequence of class indices into text.

  Class index ``k`` is looked up as ``IdxToChar[k - 1]``. Indices without an entry
  (for example the -1 padding produced by ``ctcDecode``) contribute nothing.
  NOTE(review): the one-off shift presumably exists because the model's character
  lookup reserves index 0 -- confirm against the training code.

  Args:
      encodedVector: Iterable of class indices.

  Returns:
      The decoded string.
  """
  pieces = []
  for classIdx in encodedVector:
    pieces.append(IdxToChar.get(classIdx - 1, ""))
  return "".join(pieces)

# Decode every sample over the model's full output length: one entry per sample, each
# equal to the size of the time axis of outputData (290 for this model) instead of a
# hard-coded frame count.
inputLen = np.full((outputData.shape[0],), outputData.shape[1], dtype=np.int32)
decoded = ctcDecode(outputData, inputLen=inputLen)
# decoded is (decodedDense, logProb); print the text of the first sample
print(decodeVector(decoded[0][0]))

WHICH OF OUR CONTESTANTS TODAY IS GOING TO MAKE A THUMPING GREAT PROFIT


# CONDENSED FUNCTION   

In [28]:
def makePrediction(interpreter, vidPath):
    """Run the whole pipeline on one video and return the decoded text.

    The video is loaded and normalised with ``loadVideo``, padded to the interpreter's
    input shape, fed through the interpreter, and the output is decoded with
    ``ctcDecode`` / ``decodeVector``.

    All tensor metadata (input/output indices, input shape, output length) is read from
    the ``interpreter`` argument, so the function does not depend on notebook globals
    that describe some other interpreter.

    Args:
        interpreter: A ``tf.lite.Interpreter`` whose tensors are already allocated.
        vidPath: Path of the video file to transcribe.

    Returns:
        The decoded string for the video.
    """
    inputInfo = interpreter.get_input_details()[0]
    outputInfo = interpreter.get_output_details()[0]

    # vid processing: pad to the model's per-sample input shape (batch axis dropped)
    inputVid = loadVideo(vidPath)
    targetShape = tuple(int(dim) for dim in inputInfo['shape'][1:])
    paddedVid = padTensor(inputVid, target_shape=targetShape)
    # NOTE(review): padTensor never crops, so a clip with more frames than the model
    # accepts is presumably rejected by set_tensor below -- confirm desired handling.

    # making a prediction; the model expects a leading batch axis of size 1
    inputData = np.asarray(paddedVid, dtype=np.float32)[np.newaxis, ...]
    interpreter.set_tensor(inputInfo['index'], inputData)
    interpreter.invoke() # "prediction"

    # decode over the full output length of every sample in the batch
    outputData = interpreter.get_tensor(outputInfo['index'])
    seqLens = np.full((outputData.shape[0],), outputData.shape[1], dtype=np.int32)
    decodedDense, _ = ctcDecode(outputData, inputLen=seqLens)
    return decodeVector(decodedDense[0])


In [29]:
# End-to-end check: should reproduce the sentence decoded step by step above.
makePrediction(interpreter, vidPath=path)

'WHICH OF OUR CONTESTANTS TODAY IS GOING TO MAKE A THUMPING GREAT PROFIT'