In [19]:
# imports 
import os 
import tensorflow as tf 
import cv2 
import numpy
from matplotlib import pyplot as plt

# setting GPU to be the device 
gpus = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpus[0], True)

## basic functions

In [17]:
# setting up the functions to convert from chars to num and vice versa
vocab = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ'?!123456789 "]
charToNum = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
numToChar = tf.keras.layers.StringLookup(vocabulary=charToNum.get_vocabulary(), oov_token="", invert=True)

# facial detection vars 
faceCascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
lastKnownCrop = (0, 0, 256, 250)

In [111]:
faceCascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')

def faceDetection(img):
    # TROUBLESHOOTING
    # print("max size:",img.shape, img.shape[0] - 3 * padding, img.shape[1] - 3 * padding)
    return faceCascade.detectMultiScale(
        img,
        scaleFactor=1.3,
        minNeighbors=5,
        minSize=(30, 30),
    )

def cropForMouth(img) -> numpy.ndarray:
    global lastKnownCrop
    rects = faceDetection(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
    
    # finding the largest face in a given image 
    largestFace = (0,0,0,0)
    for (x, y, w, l) in rects:
        if (w * l) > largestFace[2] * largestFace[3]:
            largestFace = (x, y,w,l)
        
    if largestFace == (0,0,0,0):
        largestFace =lastKnownCrop
    # cropping for face 
    lastKnownCrop = largestFace
    y1 = lastKnownCrop[1] 
    x1 = lastKnownCrop[0]
    y2 = y1 + lastKnownCrop[3] 
    x2 = x1 + lastKnownCrop[2]
    print("lastKnownCrop", lastKnownCrop)
    print(y1 + int(0.65 * lastKnownCrop[3]), y2, x1 + int(0.05 * lastKnownCrop[2]), int(0.95 * x2))
    return img[y1 + int(0.65 * lastKnownCrop[3]): y2, x1 + int(0.05 * lastKnownCrop[2]): int(0.95 * x2)]

In [112]:
def loadData(path): 
    # tf has the paths as bytes so decode that
    path = bytes.decode(path.numpy())
    
    # extract just the file names
    fileName = path.split('\\')[-1].split('.')[0]
    
    # generate the respective paths of the data
    videoPath = os.path.join('data','s1',f'{fileName}.mpg')
    alignmentPath = os.path.join('data','alignments','s1',f'{fileName}.align')
    
    # return the frames and alignments
    frames = loadVideo(videoPath) 
    alignments = loadText(alignmentPath)
    return frames, alignments

def loadVideo(path): 
    cap = cv2.VideoCapture(path)
    global lastKnownCrop
    processedFrames = []
    
    # for each frame 
    videoWriter = cv2.VideoWriter('currentVideo.avi', cv2.VideoWriter_fourcc(*'MJPG'), 30, (150, 50)) 
    for n in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))): 
        ret, frame = cap.read()
        
        # in case a frame is missing, just continue
        if frame is None or frame.shape[0] == 0: 
            continue
        
        # crop only the mout like we'll do on the RPI 
        frame = cropForMouth(frame)
        frame = cv2.resize(frame, (150, 50))
        
        grayFrame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        videoWriter.write(grayFrame)
        try: frame = tf.image.rgb_to_grayscale(frame)
        except: continue
    
    
        processedFrames.append(frame)
    cap.release()

    # generate the normalized frames (deviation from the average) 
    mean = tf.math.reduce_mean(processedFrames, keepdims=True)
    std = tf.math.reduce_std(tf.cast(processedFrames, tf.float32), axis=[0, 1, 2], keepdims=True)
    frames = tf.cast(processedFrames, tf.float32)
    normalizedFrames = (tf.cast(frames, tf.float32) - tf.cast(mean, tf.float32)) / tf.cast(std, tf.float32)

    return normalizedFrames

def loadText(path): 
    # open and parse the file 
    with open(path, 'r') as file: 
        lines = file.readlines()
    file.close()
    
    # return the number equivalent of each of the characters of the word 
    tokens = []
    words = lines[0].split()
    # print(words)
    del words[0]
    print(words)
    for word in words:
         tokens = [*tokens,' ', word]
    return charToNum(tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), (-1)))[1:]    

In [113]:
rootDir = 'A:\Lip Reading\Potential Datasets\BBC LRS2\mvlrs_v1\main'
rpath = "A:\\Lip Reading\\Potential Datasets\\BBC LRS2\\mvlrs_v1\\main\\5535415699068794046\\00001.mp4"
path = tf.convert_to_tensor(rpath)
path = bytes.decode(path.numpy())
fileName = path.split('\\')[-1].split('.')[0]
fileFolder = path.split('\\')[-2]

videoPath = os.path.join(rootDir,fileFolder,f'{fileName}.mp4')
alignmentPath = os.path.join(rootDir,fileFolder,f'{fileName}.txt')
# print(videoPath)
print(videoPath)
print(alignmentPath)
loadText(alignmentPath)
loadVideo(videoPath)

# for subdir, dirs, files in os.walk(rootdir):
#     for file in files:
#         print(os.path.join(subdir, file))

A:\Lip Reading\Potential Datasets\BBC LRS2\mvlrs_v1\main\5535415699068794046\00001.mp4
A:\Lip Reading\Potential Datasets\BBC LRS2\mvlrs_v1\main\5535415699068794046\00001.txt
['WHEN', "YOU'RE", 'COOKING', 'CHIPS', 'AT', 'HOME']
lets 160
lastKnownCrop (32, 4, 99, 99)
68 103 36 124
35
(35, 88, 3)
lets 160
lastKnownCrop (32, 4, 99, 99)
68 103 36 124
35
(35, 88, 3)
lets 160
lastKnownCrop (32, 4, 99, 99)
68 103 36 124
35
(35, 88, 3)
lets 160
lastKnownCrop (32, 4, 99, 99)
68 103 36 124
35
(35, 88, 3)
lets 160
lastKnownCrop (30, 4, 89, 89)
61 93 34 113
32
(32, 79, 3)
lets 160
lastKnownCrop (29, 4, 89, 89)
61 93 33 112
32
(32, 79, 3)
lets 160
lastKnownCrop (26, 4, 96, 96)
66 100 30 115
34
(34, 85, 3)
lets 160
lastKnownCrop (22, 3, 101, 101)
68 104 27 116
36
(36, 89, 3)
lets 160
lastKnownCrop (24, 3, 97, 97)
66 100 28 114
34
(34, 86, 3)
lets 160
lastKnownCrop (29, 4, 89, 89)
61 93 33 112
32
(32, 79, 3)
lets 160
lastKnownCrop (24, 4, 99, 99)
68 103 28 116
35
(35, 88, 3)
lets 160
lastKnownCrop (28

<tf.Tensor: shape=(35, 50, 150, 1), dtype=float32, numpy=
array([[[[ 0.10424688],
         [ 0.10424688],
         [ 0.13899586],
         ...,
         [ 1.3204606 ],
         [ 1.2857116 ],
         [ 1.2509626 ]],

        [[ 0.17374481],
         [ 0.17374481],
         [ 0.13899586],
         ...,
         [ 1.2509626 ],
         [ 1.2162137 ],
         [ 1.2162137 ]],

        [[ 0.31274065],
         [ 0.2779917 ],
         [ 0.24324274],
         ...,
         [ 1.1814647 ],
         [ 1.1814647 ],
         [ 1.1814647 ]],

        ...,

        [[ 3.4401472 ],
         [ 3.4401472 ],
         [ 3.4401472 ],
         ...,
         [-1.1119668 ],
         [-1.5289544 ],
         [-1.806946  ]],

        [[ 3.4401472 ],
         [ 3.4401472 ],
         [ 3.4401472 ],
         ...,
         [-1.2162137 ],
         [-1.6679502 ],
         [-1.9806908 ]],

        [[ 3.4401472 ],
         [ 3.4401472 ],
         [ 3.4401472 ],
         ...,
         [-1.3552096 ],
         [-1.80694