In [35]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [36]:
class LipReadingDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.samples = []
        self.labels = []
        self.transform = transform
        self.classes = sorted(os.listdir(root_dir))  # each folder = a class

        for label, cls in enumerate(self.classes):
            cls_folder = os.path.join(root_dir, cls)
            for file in os.listdir(cls_folder):
                if file.endswith(".npy"):
                    self.samples.append(os.path.join(cls_folder, file))
                    self.labels.append(label)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sequence = np.load(self.samples[idx])  # shape (30, 112, 112)
        label = self.labels[idx]

        # Add channel dimension: (30, 1, 112, 112)
        sequence = np.expand_dims(sequence, axis=1).astype(np.float32)

        # Convert to torch tensor
        sequence = torch.tensor(sequence)
        label = torch.tensor(label)

        return sequence, label

In [37]:
class LipReadingModel(nn.Module):
    def __init__(self, num_classes):
        super(LipReadingModel, self).__init__()

        # CNN for spatial features
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1),  # (1,112,112) -> (32,112,112)
            nn.ReLU(),
            nn.MaxPool2d(2),                # (32,56,56)

            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),                # (64,28,28)

            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),                # (128,14,14)
        )

        self.flatten_dim = 128 * 14 * 14

        # LSTM for temporal sequence
        self.lstm = nn.LSTM(input_size=self.flatten_dim, hidden_size=256, num_layers=2, batch_first=True)

        # Classifier
        self.fc = nn.Linear(256, num_classes)

    def forward(self, x):
        # x: (batch, seq_len, 1, 112, 112)
        batch_size, seq_len, C, H, W = x.shape
        x = x.view(batch_size * seq_len, C, H, W)

        # CNN
        features = self.cnn(x)  # (batch*seq_len, 128, 14, 14)
        features = features.view(batch_size, seq_len, -1)  # (batch, seq_len, flatten_dim)

        # LSTM
        lstm_out, _ = self.lstm(features)  # (batch, seq_len, 256)
        final_out = lstm_out[:, -1, :]     # last frame's hidden state

        # Classifier
        out = self.fc(final_out)
        return out



In [38]:
root_dir = r"C:\Users\HP\Desktop\MLProject\face_dataset"

dataset = LipReadingDataset(root_dir=root_dir)
train_idx, val_idx = train_test_split(list(range(len(dataset))), test_size=0.2, random_state=42)

train_set = torch.utils.data.Subset(dataset, train_idx)
val_set = torch.utils.data.Subset(dataset, val_idx)

train_loader = DataLoader(train_set, batch_size=4, shuffle=True)
val_loader = DataLoader(val_set, batch_size=4, shuffle=False)

In [39]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

num_classes = len(dataset.classes)
model = LipReadingModel(num_classes=num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

Using device: cpu


In [40]:
EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    train_loss, correct, total = 0, 0, 0

    for sequences, labels in train_loader:
        sequences, labels = sequences.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(sequences)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    train_acc = 100. * correct / total

    # Validation
    model.eval()
    val_loss, val_correct, val_total = 0, 0, 0
    with torch.no_grad():
        for sequences, labels in val_loader:
            sequences, labels = sequences.to(device), labels.to(device)
            outputs = model(sequences)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            _, predicted = outputs.max(1)
            val_total += labels.size(0)
            val_correct += predicted.eq(labels).sum().item()

    val_acc = 100. * val_correct / val_total

    print(f"Epoch [{epoch+1}/{EPOCHS}] "
          f"Train Loss: {train_loss/len(train_loader):.4f}, Train Acc: {train_acc:.2f}% "
          f"Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {val_acc:.2f}%")

RuntimeError: stack expects each tensor to be equal size, but got [30, 1, 112, 112] at entry 0 and [18, 1, 112, 112] at entry 1

In [34]:
torch.save(model.state_dict(), "lip_reading_model.pth")
print("Model saved as lip_reading_model.pth")

Model saved as lip_reading_model.pth
