In [86]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.nn import CTCLoss

import cv2
import os
import numpy as np
import json

In [87]:
def preprocess_video(video_path, img_w=100, img_h=50, frames_n=75):
    """
    Read up to `frames_n` frames from a video file and return them as one array.

    Every decoded frame is resized to (img_w, img_h), converted from BGR to
    RGB and divided by 255.0. If fewer than `frames_n` frames can be read,
    the remainder is filled with all-zero frames of shape (img_h, img_w, 3).

    Args:
        video_path (str): Path to the video file.
        img_w (int): Target frame width.
        img_h (int): Target frame height.
        frames_n (int): Number of frames in the returned array.

    Returns:
        np.ndarray: Stacked frames; (frames_n, img_h, img_w, 3) for
        3-channel frames.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []

    # The capture handle is released even if decoding/resizing raises.
    try:
        if not cap.isOpened():
            # Best effort is kept (the result is all padding), but make the
            # problem visible instead of silently emitting black clips.
            print(f"Warning: Could not open video {video_path}")

        while len(frames) < frames_n:
            ret, frame = cap.read()
            if not ret:
                break
            # Resize, convert BGR -> RGB, then normalize pixel values
            frame = cv2.resize(frame, (img_w, img_h))
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame / 255.0)
    finally:
        cap.release()

    # Pad with black frames if the video has fewer than `frames_n` frames
    missing = frames_n - len(frames)
    if missing > 0:
        frames.extend(np.zeros((img_h, img_w, 3)) for _ in range(missing))

    return np.array(frames)

In [88]:
def text_to_labels(text_path, char_to_idx):
    """
    Load a transcript from a JSON file and encode it as a list of indices.

    The file must contain a JSON object with a "text" entry. Each character
    is looked up in `char_to_idx`; if the character itself is missing but its
    lowercase form is present, the lowercase index is used (so capital letters
    are not silently dropped with a lowercase-only vocabulary). Characters
    that cannot be mapped either way are skipped.

    Args:
        text_path (str): Path to the JSON transcript file.
        char_to_idx (dict): Character-to-index mapping.

    Returns:
        tuple: (labels, text) where `labels` is the list of indices and
        `text` is the original, unmodified transcript string.
    """
    # JSON is read as UTF-8 instead of the platform default encoding.
    with open(text_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    text = data['text']

    labels = []
    for char in text:
        if char in char_to_idx:
            labels.append(char_to_idx[char])
            continue
        folded = char.lower()
        if folded in char_to_idx:
            labels.append(char_to_idx[folded])

    return labels, text

# Vocabulary lookup: every supported character maps to its position in the
# alphabet string below (space first, then a-z, then punctuation).
# NOTE(review): the space character receives index 0, while train_lipnet
# builds CTCLoss(blank=0); CTC targets are not supposed to contain the blank
# index, so this mapping and the model output size likely need a reserved
# blank slot - confirm and fix both together.
char_to_idx = dict(
    (char, idx) for idx, char in enumerate(" abcdefghijklmnopqrstuvwxyz'.?!")
)

In [89]:
def preprocess_chunked_data(video_dir, text_dir, output_dir, char_to_idx, img_w=100, img_h=50, frames_n=75):
    """
    Preprocess all video and text chunks in the specified directories.

    Video chunks are expected to be named `<video_id>__video__<chunk>.mp4`
    and are matched with `<video_id>__text__<chunk>.json` in `text_dir`.
    For every matched pair the frames are written to
    `<output_dir>/videos/<video_id>__chunk__<chunk>.npy` and the transcript
    plus its label sequence to
    `<output_dir>/labels/<video_id>__chunk__<chunk>.json`.
    Videos without a matching label file, or whose filename does not follow
    the naming scheme, are skipped with a warning.

    Args:
        video_dir (str): Directory containing video chunks.
        text_dir (str): Directory containing text chunks.
        output_dir (str): Directory to save preprocessed data.
        char_to_idx (dict): Character-to-index mapping.
        img_w (int): Width to resize video frames.
        img_h (int): Height to resize video frames.
        frames_n (int): Number of frames to extract per video chunk.
    """
    # Create the sub-directories that are actually written to; creating only
    # `output_dir` makes the first np.save fail on a fresh output location.
    os.makedirs(os.path.join(output_dir, "videos"), exist_ok=True)
    os.makedirs(os.path.join(output_dir, "labels"), exist_ok=True)

    # Get all video and label files (labels as a set for O(1) membership tests)
    video_files = sorted(f for f in os.listdir(video_dir) if f.endswith(".mp4"))
    label_files = {f for f in os.listdir(text_dir) if f.endswith(".json")}

    for video_file in video_files:
        # Extract video ID and chunk number from the filename
        parts = video_file.split("__video__")
        if len(parts) != 2:
            print(f"Warning: Unexpected video filename, skipping: {video_file}")
            continue
        video_id, chunk_number = parts
        chunk_number = chunk_number.split(".")[0]

        # Find the corresponding label file
        label_file = f"{video_id}__text__{chunk_number}.json"
        if label_file not in label_files:
            print(f"Warning: No matching label file for {video_file}")
            continue

        # Full paths for video and label files
        video_path = os.path.join(video_dir, video_file)
        label_path = os.path.join(text_dir, label_file)

        # Preprocess video
        frames = preprocess_video(video_path, img_w, img_h, frames_n)

        # Convert text to labels
        labels, text = text_to_labels(label_path, char_to_idx)

        # Save preprocessed video as .npy file
        chunk_id = f"{video_id}__chunk__{chunk_number}"
        video_output_path = os.path.join(output_dir, f"videos/{chunk_id}.npy")
        np.save(video_output_path, frames)

        # Save labels as .json file
        label_output_path = os.path.join(output_dir, f"labels/{chunk_id}.json")
        with open(label_output_path, "w") as f:
            json.dump({"text": text, "labels": labels}, f)

        print(f"Processed and saved: {video_output_path} and {label_output_path}")

In [90]:
class LipNetDataset(Dataset):
    """
    Dataset of preprocessed video clips (`.npy`) and their label sequences (`.json`).

    A video file `<name>.npy` in `video_dir` is paired with `<name>.json` in
    `label_dir`; pairing is done by file name, not by position in a listing.
    """

    def __init__(self, video_dir, label_dir):
        """
        Initialize the LipNetDataset.

        Args:
            video_dir (str): Directory containing preprocessed video `.npy` files.
            label_dir (str): Directory containing preprocessed label `.json` files.

        Raises:
            AssertionError: If the number of video and label files differs, or
                if some video has no label file with the same base name.
        """
        self.video_dir = video_dir
        self.label_dir = label_dir
        self.video_files = sorted(f for f in os.listdir(video_dir) if f.endswith(".npy"))
        found_labels = sorted(f for f in os.listdir(label_dir) if f.endswith(".json"))

        # Ensure the number of video and label files match
        assert len(self.video_files) == len(found_labels), "Mismatch between video and label files."

        # Derive each label name from its video name so sample `idx` always
        # refers to the same clip in both lists; equal counts alone would let
        # differently named files be paired silently.
        expected_labels = [os.path.splitext(f)[0] + ".json" for f in self.video_files]
        assert set(expected_labels) == set(found_labels), "Video and label files do not pair up by name."
        self.label_files = expected_labels

    def __len__(self):
        """
        Return the number of samples in the dataset.
        """
        return len(self.video_files)

    def __getitem__(self, idx):
        """
        Get a single sample from the dataset.

        Args:
            idx (int): Index of the sample to retrieve.

        Returns:
            tuple: (frames, labels), where:
                - frames (torch.Tensor): float32 video tensor; the stored array's
                  axes are reordered with permute(3, 0, 1, 2), i.e. (T, H, W, C)
                  on disk becomes (C, T, H, W).
                - labels (torch.Tensor): Corresponding label sequence as a tensor of integers.
        """
        # Load video frames. Pickle support is disabled: the arrays are plain
        # numeric data, and object arrays could not be converted to a tensor
        # on the next line anyway.
        video_path = os.path.join(self.video_dir, self.video_files[idx])
        frames = np.load(video_path, allow_pickle=False)
        frames = torch.tensor(frames, dtype=torch.float32).permute(3, 0, 1, 2)  # (C, T, H, W)

        # Load labels
        label_path = os.path.join(self.label_dir, self.label_files[idx])
        with open(label_path, "r") as f:
            label_data = json.load(f)
        labels = torch.tensor(label_data["labels"], dtype=torch.long)

        return frames, labels

    def save(self, save_path):
        """
        Save the dataset object for later use.

        Only the object itself (directory paths and file name lists) is
        pickled via `torch.save`; the underlying data files are not copied.

        Args:
            save_path (str): Path to save the dataset object.
        """
        torch.save(self, save_path)
        print(f"Dataset saved to {save_path}")

    @staticmethod
    def load(load_path):
        """
        Load a saved dataset object.

        Args:
            load_path (str): Path to the saved dataset object.

        Returns:
            LipNetDataset: Loaded dataset object.
        """
        # NOTE(security): this unpickles an arbitrary Python object; only load
        # files you created yourself.
        # `weights_only=False` is requested explicitly because newer PyTorch
        # releases default to weights-only loading, which rejects a pickled
        # dataset object.
        try:
            dataset = torch.load(load_path, weights_only=False)
        except TypeError:
            # Older PyTorch releases do not accept `weights_only`.
            dataset = torch.load(load_path)
        print(f"Dataset loaded from {load_path}")
        return dataset

In [91]:
class LipNet(nn.Module):
    """
    Three 3D-convolution blocks followed by two bidirectional GRUs and a
    per-frame linear classifier that outputs log-probabilities.
    """

    def __init__(self, img_c=3, img_w=100, img_h=50, frames_n=75, absolute_max_string_len=32, output_size=28):
        """
        Args:
            img_c (int): Number of input channels.
            img_w (int): Input frame width; determines the GRU input size.
            img_h (int): Input frame height; determines the GRU input size.
            frames_n (int): Stored on the module; not used by the layers.
            absolute_max_string_len (int): Stored on the module; not used by the layers.
            output_size (int): Number of output classes per frame.

        Raises:
            ValueError: If `img_w`/`img_h` are too small to survive the
                spatial down-sampling of the convolutional stack.
        """
        super(LipNet, self).__init__()
        self.img_c = img_c
        self.img_w = img_w
        self.img_h = img_h
        self.frames_n = frames_n
        self.absolute_max_string_len = absolute_max_string_len
        self.output_size = output_size

        # First 3D Convolutional Block
        self.conv1 = nn.Conv3d(img_c, 32, kernel_size=(3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2))
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))
        self.dropout1 = nn.Dropout(0.5)

        # Second 3D Convolutional Block
        self.conv2 = nn.Conv3d(32, 64, kernel_size=(3, 5, 5), stride=(1, 1, 1), padding=(1, 2, 2))
        self.pool2 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))
        self.dropout2 = nn.Dropout(0.5)

        # Third 3D Convolutional Block
        self.conv3 = nn.Conv3d(64, 96, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
        self.pool3 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))
        self.dropout3 = nn.Dropout(0.5)

        # GRU layers. The flattened per-frame feature size is derived from the
        # requested image size (96 * 6 * 3 = 1728 for the 100x50 default)
        # instead of being hard-coded for one resolution.
        out_w = self._spatial_out(img_w)
        out_h = self._spatial_out(img_h)
        if out_w < 1 or out_h < 1:
            raise ValueError(
                f"img_w={img_w}, img_h={img_h} is too small for the convolutional stack"
            )
        feature_map_size = 96 * out_w * out_h
        self.gru1 = nn.GRU(feature_map_size, 256, batch_first=True, bidirectional=True)
        self.gru2 = nn.GRU(512, 256, batch_first=True, bidirectional=True)

        # Dense layer for character predictions
        self.fc = nn.Linear(512, output_size)

    @staticmethod
    def _spatial_out(size):
        """Return one spatial dimension after conv1 and the three pooling layers."""
        # conv1: kernel 5, padding 2, stride 2
        size = (size + 2 * 2 - 5) // 2 + 1
        # conv2/conv3 keep the size; each pooling layer halves it (floor)
        for _ in range(3):
            size //= 2
        return size

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Video batch of shape (batch, img_c, T, H, W).

        Returns:
            torch.Tensor: Log-probabilities of shape (batch, T, output_size).
        """
        # First 3D Convolutional Block
        x = F.relu(self.conv1(x))
        x = self.pool1(x)
        x = self.dropout1(x)

        # Second 3D Convolutional Block
        x = F.relu(self.conv2(x))
        x = self.pool2(x)
        x = self.dropout2(x)

        # Third 3D Convolutional Block
        x = F.relu(self.conv3(x))
        x = self.pool3(x)
        x = self.dropout3(x)

        # Reshape for RNN
        batch_size, channels, frames, height, width = x.size()
        x = x.permute(0, 2, 1, 3, 4).contiguous()  # (batch, frames, channels, height, width)
        x = x.view(batch_size, frames, -1)  # Flatten channels, height and width

        # GRU layers
        x, _ = self.gru1(x)
        x, _ = self.gru2(x)

        # Dense layer for character predictions
        x = self.fc(x)
        x = F.log_softmax(x, dim=-1)

        return x

In [92]:
def collate_fn(batch):
    """
    Merge a list of (frames, labels) samples into the tensors CTC training needs.

    Args:
        batch (list): List of tuples (frames, labels).

    Returns:
        tuple: (frames, labels, input_lengths, label_lengths) where
            - frames: all clips stacked along a new leading batch dimension,
            - labels: every label sequence concatenated into one 1-D tensor,
            - input_lengths: size of dimension 2 of the stacked clips, repeated per sample,
            - label_lengths: length of each individual label sequence.
    """
    clips, targets = zip(*batch)

    # (batch_size, C, T, H, W)
    stacked = torch.stack(clips)
    n_frames = stacked.size(2)

    # Every clip in the stacked batch shares the same frame count
    input_lengths = torch.tensor([n_frames for _ in targets], dtype=torch.long)
    label_lengths = torch.tensor([len(target) for target in targets], dtype=torch.long)

    # CTC accepts the targets as one flat tensor plus per-sample lengths
    flat_targets = torch.cat(targets)

    return stacked, flat_targets, input_lengths, label_lengths

In [93]:
# Define the training loop
def train_lipnet(model, dataset, epochs=10, batch_size=8, learning_rate=1e-4, device='cuda'):
    """
    Train the LipNet model using CTC loss.

    Batches are built with `collate_fn`, optimized with Adam, and the mean
    batch loss is printed after every epoch. The model is trained in place;
    nothing is returned.

    Args:
        model (nn.Module): The LipNet model; must return log-probabilities
            of shape (batch_size, T, output_size).
        dataset (Dataset): The dataset containing preprocessed video and label data.
        epochs (int): Number of training epochs.
        batch_size (int): Batch size for training.
        learning_rate (float): Learning rate for the optimizer.
        device (str): Device to train on ('cuda' or 'cpu').
    """
    # Move model to the specified device
    model.to(device)

    # Create DataLoader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    num_batches = len(dataloader)

    # Define optimizer and loss function. The optimizer uses the
    # `learning_rate` argument rather than a hard-coded value.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # NOTE(review): blank=0 reserves class index 0 for the CTC blank, but the
    # notebook's char_to_idx assigns index 0 to the space character, so space
    # targets collide with the blank symbol. Fixing this needs the vocabulary
    # shifted by one and the model output enlarged by one - confirm and change
    # both together.
    ctc_loss = CTCLoss(blank=0)

    # Training loop
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0

        for frames, labels, input_lengths, label_lengths in dataloader:
            # Move data to the specified device
            frames = frames.to(device)  # (batch_size, C, T, H, W)
            labels = labels.to(device)  # (total_label_length)
            input_lengths = input_lengths.to(device)  # (batch_size)
            label_lengths = label_lengths.to(device)  # (batch_size)

            # Forward pass
            optimizer.zero_grad()
            outputs = model(frames)  # (batch_size, T, output_size)

            # CTCLoss expects time-major input: (T, batch_size, output_size)
            outputs = outputs.permute(1, 0, 2)

            # Compute CTC loss
            loss = ctc_loss(outputs, labels, input_lengths, label_lengths)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # Accumulate loss
            epoch_loss += loss.item()

        # Print epoch loss (guarding against a loader that yields no batches)
        mean_loss = epoch_loss / num_batches if num_batches else float("nan")
        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {mean_loss:.4f}")

    print("Training complete!")

In [94]:
# Pick the compute backend in order of preference: CUDA, then Apple MPS,
# then plain CPU. The MPS check only runs when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: mps


In [95]:
# Number of frames kept per video chunk; passed below to both
# preprocess_chunked_data and the LipNet constructor.
frames_n = 200

In [96]:
# Raw inputs and preprocessed outputs are located relative to the current
# working directory.
input_video_dir = f"{os.getcwd()}/input_data/videos/"
input_text_dir = f"{os.getcwd()}/input_data/labels/"
output_dir = f"{os.getcwd()}/preprocessed_data/"

# Convert every matched video/transcript pair into .npy frames and .json labels.
preprocess_chunked_data(
    video_dir=input_video_dir,
    text_dir=input_text_dir,
    output_dir=output_dir,
    char_to_idx=char_to_idx,
    img_w=100,
    img_h=50,
    frames_n=frames_n,
)

Processed and saved: /Users/emmettstorts/Documents/slip-ml/preprocessed_data/videos/KjB6r-HDDI0__chunk__0.npy and /Users/emmettstorts/Documents/slip-ml/preprocessed_data/labels/KjB6r-HDDI0__chunk__0.json
Processed and saved: /Users/emmettstorts/Documents/slip-ml/preprocessed_data/videos/KjB6r-HDDI0__chunk__1.npy and /Users/emmettstorts/Documents/slip-ml/preprocessed_data/labels/KjB6r-HDDI0__chunk__1.json
Processed and saved: /Users/emmettstorts/Documents/slip-ml/preprocessed_data/videos/KjB6r-HDDI0__chunk__2.npy and /Users/emmettstorts/Documents/slip-ml/preprocessed_data/labels/KjB6r-HDDI0__chunk__2.json
Processed and saved: /Users/emmettstorts/Documents/slip-ml/preprocessed_data/videos/KjB6r-HDDI0__chunk__3.npy and /Users/emmettstorts/Documents/slip-ml/preprocessed_data/labels/KjB6r-HDDI0__chunk__3.json
Processed and saved: /Users/emmettstorts/Documents/slip-ml/preprocessed_data/videos/KjB6r-HDDI0__chunk__4.npy and /Users/emmettstorts/Documents/slip-ml/preprocessed_data/labels/KjB6r-H

In [97]:
# Build the network with one output class per vocabulary entry.
model = LipNet(
    img_c=3,
    img_w=100,
    img_h=50,
    frames_n=frames_n,
    output_size=len(char_to_idx),
)

# Point the dataset at the preprocessed clips and labels.
dataset = LipNetDataset(
    video_dir=f"{os.getcwd()}/preprocessed_data/videos/",
    label_dir=f"{os.getcwd()}/preprocessed_data/labels/",
)

# Run training.
# NOTE(review): training is pinned to 'cpu' although a `device` was selected
# in an earlier cell - confirm this is intentional.
train_lipnet(model=model, dataset=dataset, epochs=10, batch_size=8, device='cpu')

Epoch [1/10], Loss: 69.4857
Epoch [2/10], Loss: 34.4837
Epoch [3/10], Loss: 29.8299
Epoch [4/10], Loss: 49.0106
Epoch [5/10], Loss: 23.8238
Epoch [6/10], Loss: 37.7125
Epoch [7/10], Loss: 15.2390
Epoch [8/10], Loss: 15.4510
Epoch [9/10], Loss: 13.9788
Epoch [10/10], Loss: 13.6406
Training complete!
