In [None]:
!pip install tensorflow opencv-python librosa
!pip install moviepy
!pip install SpeechRecognition

In [2]:
import cv2
import numpy as np
import tensorflow as tf
import librosa
import speech_recognition as sr
import moviepy.editor as mp

In [3]:
# Load video
cap = cv2.VideoCapture('/content/anchor-intro_DMoaE15J.mp4')
# VideoCapture does not raise on a missing or unreadable file; fail fast here instead
# of silently ending up with an empty frame list in the next cell.
if not cap.isOpened():
    raise IOError("Could not open video file: /content/anchor-intro_DMoaE15J.mp4")
fps = int(cap.get(cv2.CAP_PROP_FPS))  # reported frame rate, truncated to an integer

In [4]:
# Decode the clip frame by frame, shrinking each frame to 128x64 and dropping
# colour so that every stored frame is a single-channel (64, 128) image.
frames = []
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        # No more frames (or a read error): stop decoding.
        break
    frames.append(cv2.cvtColor(cv2.resize(frame, (128, 64)), cv2.COLOR_BGR2GRAY))

# The capture handle is no longer needed once every frame has been read.
cap.release()
frames = np.array(frames)

In [5]:
# Load the same clip with MoviePy to get at its audio track
video = mp.VideoFileClip("/content/anchor-intro_DMoaE15J.mp4")

# Extract audio. MoviePy leaves `audio` as None when the clip has no audio stream,
# so check explicitly rather than failing with an opaque AttributeError.
audio = video.audio
if audio is None:
    raise ValueError("Video has no audio track; nothing to transcribe.")
audio.write_audiofile("audio.wav")

MoviePy - Writing audio in audio.wav


                                                        

MoviePy - Done.




In [6]:
# Speech-to-text: capture the extracted WAV as one AudioData object, then
# transcribe it with the recognizer's Google backend.
recognizer = sr.Recognizer()
audio_file = sr.AudioFile('audio.wav')

with audio_file as source:
    audio_data = recognizer.record(source)

# audio_data already holds the recorded audio, so the transcription request
# does not need the file to remain open.
text_data = recognizer.recognize_google(audio_data)

print("Transcribed Text: ", text_data)

Transcribed Text:  good morning Colorado it's 8:32 a.m. Saturday August 27th I'm Angelica Lombardi Bonnie silkman has the morning off


In [7]:
# Number of frames to label. Use the frames that were actually decoded rather than
# an estimate from fps * duration, so the labels built here always have exactly one
# entry per frame fed to the model.
total_frames = len(frames)
if total_frames == 0:
    raise ValueError("No video frames were decoded; cannot align text to frames.")

# Split the text into words
words = text_data.split()

# Approximate alignment of words with frames.
# This simple alignment assumes each word spans an equal number of frames.
words_per_frame = len(words) / total_frames  # kept for inspection / later cells

# Integer arithmetic keeps the index exact and strictly below len(words), so no
# out-of-range fallback is needed. An empty transcript yields a blank label per frame.
aligned_text = np.array(
    [words[i * len(words) // total_frames] if words else "" for i in range(total_frames)]
)

In [8]:
# Normalize frames to the [0, 1] range.
# Guarded on dtype so that re-running this cell does not divide by 255 a second time;
# float32 halves the memory of the default float64 result.
if frames.dtype == np.uint8:
    frames = frames.astype(np.float32) / 255.0

In [22]:
# Turn the per-frame words into one-hot label vectors.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Build the vocabulary from the aligned words; num_words=20 caps the ids the
# tokenizer will emit, matching the 20-way one-hot encoding below.
tokenizer = Tokenizer(num_words=20)
tokenizer.fit_on_texts(aligned_text)
sequences = tokenizer.texts_to_sequences(aligned_text)

# Collapse each frame's token list to a single id: the first token if there
# is one, otherwise 0 (used for blank / unknown frames).
first_token_ids = []
for seq in sequences:
    first_token_ids.append(seq[0] if seq else 0)
padded_sequences = np.array(first_token_ids)

# One-hot encode the ids into 20 classes.
categorical_labels = to_categorical(padded_sequences, num_classes=20)

In [23]:
# Prepare the data
# Add the trailing channel axis only once, so re-running this cell does not keep
# appending extra dimensions to `frames`.
if frames.ndim == 3:
    frames = np.expand_dims(frames, -1)  # (timesteps, height, width, 1)
X_train = np.expand_dims(frames, axis=0)  # (1, timesteps, height, width, channels)
y_train = np.expand_dims(categorical_labels, axis=0)  # (1, timesteps, num_classes)

In [25]:
# Force the (batch, timesteps, height, width, channels) layout the model expects.
# -1 lets NumPy infer the number of frames instead of hard-coding 224 for one clip;
# a plain reshape also makes the separate squeeze step unnecessary.
X_train = np.reshape(X_train, (1, -1, 64, 128, 1))

In [26]:
# Sanity check: inputs and labels must cover the same number of timesteps,
# otherwise training fails later with a much less readable shape error.
assert X_train.shape[1] == y_train.shape[1], (
    f"frame/label count mismatch: {X_train.shape[1]} frames vs {y_train.shape[1]} labels"
)
print(X_train.shape, y_train.shape)

(1, 224, 64, 128, 1) (1, 224, 20)


In [28]:
from tensorflow.keras.utils import to_categorical

# Class count derived from the fitted tokenizer: word ids start at 1, so one
# extra slot is reserved for id 0.
vocab_size = 1 + len(tokenizer.word_index)

# One integer id per frame: the first token of the frame's sequence, or 0
# when the sequence is empty.
flattened_sequences = np.array([next(iter(seq), 0) for seq in sequences])

# One-hot encode the ids using the tokenizer-derived class count.
# NOTE(review): this reassigns `categorical_labels` only; `y_train` is not rebuilt
# in this cell, so it keeps whatever labels it was built from earlier.
categorical_labels = to_categorical(flattened_sequences, num_classes=vocab_size)

In [29]:
# Show the label tensor's dimensions.
print(np.shape(y_train))

(1, 224, 20)


In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, LSTM, Dense, TimeDistributed, Flatten, Input

# The output width must equal the one-hot label width, so read it from the labels
# instead of repeating the literal 20 here.
num_classes = y_train.shape[-1]

# Per-frame CNN features (TimeDistributed) -> LSTM over time -> per-frame word class.
model = Sequential([
    # Explicit Input layer: passing `input_shape=` to the first layer is discouraged
    # for Sequential models and triggers a Keras warning.
    # Shape is (timesteps, height, width, channels) with a variable number of timesteps.
    Input(shape=(None, 64, 128, 1)),
    TimeDistributed(Conv2D(8, (3, 3), activation='relu')),  # Fewer filters
    TimeDistributed(MaxPooling2D((2, 2))),
    TimeDistributed(Flatten()),
    LSTM(64, return_sequences=True),  # Fewer units in LSTM; one output per timestep
    Dense(32, activation='relu'),  # Fewer units in Dense layer
    Dense(num_classes, activation='softmax')  # One probability per word class
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

  super().__init__(**kwargs)



In [38]:
# Keep the History object so loss/accuracy curves can be inspected after training,
# rather than discarding it and leaving only its repr as the cell output.
history = model.fit(X_train, y_train, batch_size=16, epochs=100)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 249ms/step - accuracy: 0.7277 - loss: 1.2778
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 281ms/step - accuracy: 0.6652 - loss: 1.2419
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 295ms/step - accuracy: 0.7366 - loss: 1.1951
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 242ms/step - accuracy: 0.7679 - loss: 1.1582
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241ms/step - accuracy: 0.7232 - loss: 1.1296
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 233ms/step - accuracy: 0.7679 - loss: 1.0990
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 298ms/step - accuracy: 0.8125 - loss: 1.0373
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 309ms/step - accuracy: 0.8348 - loss: 0.9984
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7b244ef0a6e0>

Test Model


In [39]:
# Attach Google Drive to the Colab filesystem so the model can be stored there.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [40]:
# Write the trained model to the mounted Drive folder.
model_save_path = '/content/drive/MyDrive/LipReader_model.h5'
model.save(model_save_path)



In [44]:
from tensorflow.keras.models import load_model

In [45]:
# Read the saved model back from Drive.
saved_model_path = '/content/drive/MyDrive/LipReader_model.h5'
model = load_model(saved_model_path)



In [46]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy

# Load your trained model.
# compile=False skips restoring the saved optimizer/loss state, which would be
# overwritten by the explicit compile() call below anyway.
model = load_model('/content/drive/MyDrive/LipReader_model.h5', compile=False)

# Recompile the model with a fresh optimizer and the training loss/metric
model.compile(optimizer=Adam(), loss=CategoricalCrossentropy(), metrics=['accuracy'])



In [None]:
# Annotate a video with the model's per-frame word predictions.
#
# The model is a sequence model over grayscale 64x128 frames, so the whole clip is
# preprocessed exactly as in training, predicted in one call, and then the original
# frames are re-read and written out with the predicted word drawn on each one.
video_path = 'path_to_your_video.mp4'  # TODO: point this at the clip to annotate
output_path = 'output_video.mp4'
input_width, input_height = 128, 64  # must match the (64, 128, 1) frames used for training


def _preprocess_frame(frame):
    """Convert one BGR frame to the normalised grayscale (height, width, 1) training layout."""
    gray = cv2.cvtColor(cv2.resize(frame, (input_width, input_height)), cv2.COLOR_BGR2GRAY)
    return (gray.astype('float32') / 255.0)[..., np.newaxis]


# Pass 1: read the clip once, keeping only the small preprocessed frames in memory.
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise IOError(f"Could not open video: {video_path}")
try:
    # Reuse the source frame rate and size for the output; a VideoWriter whose size
    # differs from the frames it is given does not produce a valid file.
    src_fps = cap.get(cv2.CAP_PROP_FPS) or 20.0  # fall back to 20 fps if unreported
    frame_size = (
        int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
        int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
    )
    model_frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        model_frames.append(_preprocess_frame(frame))
finally:
    cap.release()

if not model_frames:
    raise ValueError(f"No frames could be read from: {video_path}")

# Run the model prediction once over the full sequence: (1, timesteps, 64, 128, 1).
prediction = model.predict(np.expand_dims(np.stack(model_frames), axis=0))

# Post-process: most likely class per timestep, mapped back to its word.
# Class 0 has no entry in the tokenizer vocabulary and is rendered as an empty string.
predicted_ids = prediction[0].argmax(axis=-1)
predicted_words = [tokenizer.index_word.get(int(idx), "") for idx in predicted_ids]

# Pass 2: re-read the original frames and write them out with the prediction overlaid.
# No cv2.imshow preview here: Colab cannot open OpenCV GUI windows.
cap = cv2.VideoCapture(video_path)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, src_fps, frame_size)
try:
    for word in predicted_words:
        ret, frame = cap.read()
        if not ret:
            break
        cv2.putText(frame, f'Prediction: {word}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
        out.write(frame)
finally:
    # Release everything when done, even if annotation fails part-way through.
    cap.release()
    out.release()