In [2]:
import numpy as np
import os
import cv2
from keras.models import Sequential
from keras.layers import Conv3D, MaxPooling3D, TimeDistributed, Bidirectional, LSTM, Dense, Dropout, Flatten
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
# Function to load frames
def load_frames(folder_path, scale_percent=70):
    frames = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".png"):
            frame = cv2.imread(os.path.join(folder_path, filename), cv2.IMREAD_GRAYSCALE)
            width = int(frame.shape[1] * scale_percent / 100)
            height = int(frame.shape[0] * scale_percent / 100)
            dim = (width, height)
            resized_frame = cv2.resize(frame, dim, interpolation=cv2.INTER_AREA)
            frames.append(resized_frame)
    frames = np.array(frames)
    frames = np.expand_dims(frames, axis=-1)
    return frames






In [3]:
# Function to load alignments
def load_alignments(file_path):
    alignments = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            start, end, text = line.strip().split(' ', 2)
            alignments.append((int(start), int(end), text))
    return alignments

# Function to map timestamps to frame indices
def map_timestamps_to_frames(alignments, frame_rate):
    frame_timestamps = []
    for align in alignments:
        start, end, _ = align
        start_frame = int(start / 1000 * frame_rate)
        end_frame = int(end / 1000 * frame_rate)
        frame_timestamps.append((start_frame, end_frame))
    return frame_timestamps

# Function to split data based on frame indices
def split_data_by_frame_indices(frames, sequences, frame_timestamps, split_point_frame):
    train_frames = []
    train_sequences = []
    test_frames = []
    test_sequences = []
    
    for i, (start, end) in enumerate(frame_timestamps):
        group_frames = frames[start:end]
        if len(group_frames) > 0: # Ensure there are frames in the group
            if end < split_point_frame:
                train_frames.append(group_frames)
                train_sequences.append(sequences[i])
            elif start >= split_point_frame:
                test_frames.append(group_frames)
                test_sequences.append(sequences[i])
    
    return train_frames, test_frames, train_sequences, test_sequences

# Function to adjust the length of each group of frames
def adjust_frames_length(frames_group, max_length):
    if len(frames_group) > max_length:
        # Truncate the group to the maximum length
        return frames_group[:max_length]
    else:
        # Pad the group with zeros to the maximum length
        padding = np.zeros((max_length - len(frames_group), frames_group[0].shape[0], frames_group[0].shape[1], 1), dtype=np.uint8)
        return np.concatenate((frames_group, padding), axis=0)

# Function to create the LipNet model
def create_lipnet_model(input_shape, num_classes):
    model = Sequential()
    model.add(Conv3D(128, (3, 3, 3), activation='relu', padding='same', input_shape=input_shape))
    model.add(MaxPooling3D(pool_size=(1, 1, 1)))
    model.add(Conv3D(256, (3, 3, 3), activation='relu', padding='same'))
    model.add(MaxPooling3D(pool_size=(1, 1, 1)))
    model.add(Conv3D(75, (3, 3, 3), activation='relu', padding='same'))
    model.add(MaxPooling3D(pool_size=(1, 1, 1)))
    model.add(TimeDistributed(Flatten()))
    model.add(Bidirectional(LSTM(128, return_sequences=True, kernel_initializer='Orthogonal')))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(128, return_sequences=True, kernel_initializer='Orthogonal')))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    model.summary()
    return model

In [4]:
# Load frames and alignments
frames_folder = 'D:/Voices-AWS/reading/24fa'
alignments_file = 'D:/Voices-AWS/reading/24fa.align'
frames = load_frames(frames_folder)
alignments = load_alignments(alignments_file)

In [5]:
# Calculate frame rate based on the total number of frames and the video duration
video_duration = 101934 # This is the end timestamp of the last alignment in your .align file (in milliseconds)
total_frames = 3089
frame_rate = total_frames / (video_duration / 1000) # Convert video_duration to seconds

frame_timestamps = map_timestamps_to_frames(alignments, frame_rate)

In [6]:
def text_to_sequence(text):
    return list(text)

sequences = [text_to_sequence(align[2]) for align in alignments]

# Split data
split_point = 81207 # This should be in milliseconds
split_point_frame = int(split_point / 1000 * frame_rate) # Convert split_point to frame index
train_frames, test_frames, train_sequences, test_sequences = split_data_by_frame_indices(frames, sequences, frame_timestamps, split_point_frame)


In [7]:
video_duration = 101934 # This is the end timestamp of the last alignment in your .align file (in milliseconds)
total_frames = frames.shape[0]
frame_rate = total_frames / (video_duration / 1000) # Convert video_duration to seconds

frame_timestamps = map_timestamps_to_frames(alignments, frame_rate)

In [8]:
# Determine the maximum number of frames in any group
max_frames_per_group = max(max(len(group) for group in train_frames), max(len(group) for group in test_frames))

# Adjust the length of each group of frames
train_frames_adjusted = [adjust_frames_length(group, max_frames_per_group) for group in train_frames]
test_frames_adjusted = [adjust_frames_length(group, max_frames_per_group) for group in test_frames]
print(len(train_frames_adjusted))
print(len(train_sequences))

11
11


In [9]:
# Extend the mapping to include all unique characters found in the text
char_to_int = {char: i for i, char in enumerate('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.,!?;:\'\"()[]{}<>_+=@#$%^&*-=|\\/ ')} # Include space character
char_to_int.update({symbol: i + len(char_to_int) for i, symbol in enumerate(['â†«', 'â†«Fâ†«', '[/]', '‰'])}) # Extended mapping for special symbols
# Convert sequences to lists of integers, handling spaces correctly
def convert_sequence_to_int(seq):
    return [char_to_int[char] for char in seq if char in char_to_int]

# Convert sequences to lists of integers using the extended mapping
train_sequences_int = [convert_sequence_to_int(seq) for seq in train_sequences]
test_sequences_int = [convert_sequence_to_int(seq) for seq in test_sequences]

# Pad the sequences
max_length = max(max(len(seq) for seq in train_sequences_int), max(len(seq) for seq in test_sequences_int))
train_sequences_padded = pad_sequences(train_sequences_int, maxlen=652, padding='post')
test_sequences_padded = pad_sequences(test_sequences_int, maxlen=652, padding='post')

# # Convert the padded sequences to one-hot encoded vectors
train_labels = to_categorical(train_sequences_padded)
test_labels = to_categorical(test_sequences_padded)
print(len(train_labels[0]))

652


In [10]:
train_frames_single = np.concatenate(train_frames_adjusted, axis=0)
# Convert to tensor
train_frames_adjusted_tensor = tf.convert_to_tensor(train_frames_single, dtype=tf.float32)
train_labels_tensor = tf.convert_to_tensor(train_labels, dtype=tf.float32)

# Ensure the number of samples in train_frames_adjusted_tensor matches the number of samples in train_labels_tensor
#assert train_frames_adjusted_tensor.shape[0] == train_labels_tensor.shape[0], "Data cardinality mismatch"
# Define input shape based on your frames
input_shape = (frames.shape[1], frames.shape[2], 1, 1) # Assuming grayscale frames with depth of 1
num_classes = len(train_sequences[0]) # Number of characters in the sequences

MemoryError: Unable to allocate 27.1 GiB for an array with shape (7172, 756, 1344, 1) and data type float32

In [None]:
# Create and compile the model
model = create_lipnet_model(input_shape, num_classes)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model
model.fit(train_frames_adjusted_tensor, train_labels_tensor, epochs=10, batch_size=32)

In [None]:
# # Train the model
# epochs = 10 # Number of epochs to train
# model.fit(train_dataset, epochs=epochs)