In [9]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.model_selection import train_test_split

In [10]:
class LipReadingDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.samples = []
        self.labels = []
        self.transform = transform
        self.classes = sorted(os.listdir(root_dir))  # each folder = a class

        for label, cls in enumerate(self.classes):
            cls_folder = os.path.join(root_dir, cls)
            for file in os.listdir(cls_folder):
                if file.endswith(".npy"):
                    self.samples.append(os.path.join(cls_folder, file))
                    self.labels.append(label)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sequence = np.load(self.samples[idx])  # shape = (T, 112, 112) with variable T
        label = self.labels[idx]

        # Add channel dimension: (T, 1, 112, 112)
        sequence = np.expand_dims(sequence, axis=1).astype(np.float32)

        # Convert to torch tensor
        sequence = torch.tensor(sequence)   # shape (T,1,112,112)
        label = torch.tensor(label)

        return sequence, label

In [11]:
def collate_fn(batch):
    # batch is a list of (sequence, label)
    sequences, labels = zip(*batch)
    lengths = [seq.shape[0] for seq in sequences]  # store true lengths

    # Pad sequences to max length in batch
    max_len = max(lengths)
    padded_sequences = []
    for seq in sequences:
        pad_len = max_len - seq.shape[0]
        if pad_len > 0:
            pad = torch.zeros((pad_len, 1, 112, 112), dtype=torch.float32)
            seq = torch.cat([seq, pad], dim=0)
        padded_sequences.append(seq)

    padded_sequences = torch.stack(padded_sequences)  # (B, T, 1, 112, 112)
    labels = torch.stack(labels)

    return padded_sequences, lengths, labels  # keep lengths for LSTM packing

In [12]:

class LipReadingModel(nn.Module):
    def __init__(self, num_classes):
        super(LipReadingModel, self).__init__()

        # CNN for spatial features
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1),  # (1,112,112) -> (32,112,112)
            nn.ReLU(),
            nn.MaxPool2d(2),                # (32,56,56)

            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),                # (64,28,28)

            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),                # (128,14,14)
        )

        self.flatten_dim = 128 * 14 * 14

        # LSTM for temporal sequence
        self.lstm = nn.LSTM(
            input_size=self.flatten_dim,
            hidden_size=256,
            num_layers=2,
            batch_first=True
        )

        # Classifier
        self.fc = nn.Linear(256, num_classes)

    def forward(self, x, lengths):
        """
        x: (batch, seq_len, 1, 112, 112)
        lengths: list/array of actual sequence lengths before padding
        """
        batch_size, seq_len, C, H, W = x.shape
        x = x.view(batch_size * seq_len, C, H, W)

        # CNN
        features = self.cnn(x)  # (batch*seq_len, 128, 14, 14)
        features = features.view(batch_size, seq_len, -1)  # (batch, seq_len, flatten_dim)

        # Pack sequence for variable lengths
        packed = pack_padded_sequence(features, lengths, batch_first=True, enforce_sorted=False)

        # LSTM
        packed_out, (hn, cn) = self.lstm(packed)

        # hn: (num_layers, batch, hidden_size)
        # take the last layer's hidden state
        final_out = hn[-1]  # (batch, hidden_size)

        # Classifier
        out = self.fc(final_out)
        return out

In [13]:
root_dir = r"C:\Users\HP\Desktop\MLProject\face_dataset"

dataset = LipReadingDataset(root_dir=root_dir)
train_idx, val_idx = train_test_split(list(range(len(dataset))), test_size=0.2, random_state=42)

train_set = torch.utils.data.Subset(dataset, train_idx)
val_set = torch.utils.data.Subset(dataset, val_idx)

train_loader = DataLoader(train_set, batch_size=4, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_set, batch_size=4, shuffle=False, collate_fn=collate_fn)

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

num_classes = len(dataset.classes)
model = LipReadingModel(num_classes=num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


Using device: cpu


In [15]:
EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    train_loss, correct, total = 0, 0, 0

    for sequences, lengths, labels in train_loader:   # unpack lengths too
        sequences, labels = sequences.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(sequences, lengths)  # pass lengths
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    train_acc = 100. * correct / total

    # Validation
    model.eval()
    val_loss, val_correct, val_total = 0, 0, 0
    with torch.no_grad():
        for sequences, lengths, labels in val_loader:
            sequences, labels = sequences.to(device), labels.to(device)
            outputs = model(sequences, lengths)  # pass lengths
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            _, predicted = outputs.max(1)
            val_total += labels.size(0)
            val_correct += predicted.eq(labels).sum().item()

    val_acc = 100. * val_correct / val_total

    print(f"Epoch [{epoch+1}/{EPOCHS}] "
          f"Train Loss: {train_loss/len(train_loader):.4f}, Train Acc: {train_acc:.2f}% "
          f"Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {val_acc:.2f}%")


Epoch [1/10] Train Loss: 1.1394, Train Acc: 23.61% Val Loss: 1.1041, Val Acc: 38.89%
Epoch [2/10] Train Loss: 1.1012, Train Acc: 30.56% Val Loss: 1.0843, Val Acc: 33.33%
Epoch [3/10] Train Loss: 1.0807, Train Acc: 41.67% Val Loss: 1.0081, Val Acc: 44.44%
Epoch [4/10] Train Loss: 0.9258, Train Acc: 54.17% Val Loss: 0.8341, Val Acc: 50.00%
Epoch [5/10] Train Loss: 0.8015, Train Acc: 65.28% Val Loss: 0.7872, Val Acc: 72.22%
Epoch [6/10] Train Loss: 0.7004, Train Acc: 61.11% Val Loss: 0.6866, Val Acc: 72.22%
Epoch [7/10] Train Loss: 0.4553, Train Acc: 76.39% Val Loss: 0.6855, Val Acc: 77.78%
Epoch [8/10] Train Loss: 0.4576, Train Acc: 83.33% Val Loss: 0.5921, Val Acc: 66.67%
Epoch [9/10] Train Loss: 0.4968, Train Acc: 81.94% Val Loss: 0.6930, Val Acc: 66.67%
Epoch [10/10] Train Loss: 0.3219, Train Acc: 87.50% Val Loss: 0.5079, Val Acc: 83.33%


In [16]:
torch.save(model.state_dict(), "lip_reading_model.pth")
print("Model saved as lip_reading_model.pth")

Model saved as lip_reading_model.pth
