# Sign Language Recognition with CNN+LSTM (PyTorch)

This notebook implements a CNN+LSTM model in PyTorch to classify sign language gestures from a dataset of hand and pose landmarks. The dataset contains 30 videos per sign, each with 30 frames, for 9 signs: Hello, Happy, Great, Young, Open, Friend, Strong, Water, Who. Each frame is represented by a vector of 225 landmark values (126 from the hands, 99 from the pose).

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Constants
# Sign vocabulary; the position of a sign in this list is its integer class label.
SIGNS = ['Hello', 'Happy', 'Great', 'Young', 'Open', 'Friend', 'Strong', 'Water', 'Who']
NUM_CLASSES = len(SIGNS)  # one class per sign (9)

# Dataset geometry
FRAMES_PER_VIDEO = 30
LANDMARKS_PER_FRAME = 126 + 99  # hand values + pose values per frame (225)
DATASET_PATH = 'dataset'

# Training hyper-parameters
BATCH_SIZE = 32
EPOCHS = 50

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

## Data Loading

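Load the landmark data from the dataset directory. Landmarks are expected as `.npy` files at `dataset/<sign>/video_<video_idx>/landmarks_<frame_idx>.npy` (for example `dataset/Hello/video_0/landmarks_0.npy`). A frame whose file is missing is reported and replaced by a zero vector. If landmarks are not pre-saved, use the `extract_landmarks` function from the original script to process frames.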

In [None]:
def load_dataset(dataset_path, videos_per_sign=30):
    """Load landmark sequences from disk and build train/test DataLoaders.

    Files are read from
    ``<dataset_path>/<sign>/video_<i>/landmarks_<j>.npy`` for every sign in
    ``SIGNS``, ``i`` in ``range(videos_per_sign)`` and ``j`` in
    ``range(FRAMES_PER_VIDEO)``. The class label of a video is the index of
    its sign in ``SIGNS``.

    A frame whose file is missing is reported and replaced by a zero vector.
    A video for which *no* frame file exists is skipped entirely, so that an
    absent recording does not become an all-zero training sample.

    Args:
        dataset_path: Root directory of the dataset.
        videos_per_sign: Number of ``video_<i>`` folders to look for per sign.

    Returns:
        A ``(train_loader, test_loader)`` tuple of ``DataLoader`` objects
        over an 80/20 split (``random_state=42``). Each batch is a pair
        ``(data, labels)`` with ``data`` of shape
        ``(batch, FRAMES_PER_VIDEO, LANDMARKS_PER_FRAME)`` and dtype float32,
        and ``labels`` of dtype int64.

    Raises:
        ValueError: If a landmark file does not hold exactly
            ``LANDMARKS_PER_FRAME`` values, or if no video could be loaded.
    """
    data = []
    labels = []
    for sign_idx, sign in enumerate(SIGNS):
        sign_path = os.path.join(dataset_path, sign)
        for video_idx in range(videos_per_sign):
            video_path = os.path.join(sign_path, f'video_{video_idx}')
            video_data = []
            missing_files = []
            for frame_idx in range(FRAMES_PER_VIDEO):
                landmark_file = os.path.join(video_path, f'landmarks_{frame_idx}.npy')
                if os.path.exists(landmark_file):
                    # Flatten so that e.g. a (1, 225) array is accepted too.
                    landmarks = np.asarray(np.load(landmark_file), dtype=np.float32).reshape(-1)
                    if landmarks.size != LANDMARKS_PER_FRAME:
                        raise ValueError(
                            f'{landmark_file} holds {landmarks.size} values, '
                            f'expected {LANDMARKS_PER_FRAME}'
                        )
                else:
                    missing_files.append(landmark_file)
                    landmarks = np.zeros(LANDMARKS_PER_FRAME, dtype=np.float32)
                video_data.append(landmarks)

            if len(missing_files) == FRAMES_PER_VIDEO:
                # Nothing was recorded for this video: do not train on zeros.
                print(f'Skipping {video_path}: no landmark files found')
                continue
            for landmark_file in missing_files:
                print(f'Missing landmarks for {landmark_file}')

            data.append(np.stack(video_data))
            labels.append(sign_idx)

    if not data:
        raise ValueError(f'No landmark data found under {dataset_path!r}')

    data = np.stack(data)  # Shape: (num_videos, FRAMES_PER_VIDEO, LANDMARKS_PER_FRAME)
    labels = np.array(labels, dtype=np.int64)  # Shape: (num_videos,)

    # Stratify so every sign keeps the same share in both splits. This is only
    # possible when each present class has at least two videos and both splits
    # are large enough to hold one sample per class; otherwise fall back to a
    # plain random split.
    test_size = 0.2
    class_counts = np.bincount(labels, minlength=len(SIGNS))
    present_counts = class_counts[class_counts > 0]
    n_test = int(np.ceil(test_size * len(labels)))
    n_train = len(labels) - n_test
    can_stratify = (
        present_counts.min() >= 2
        and min(n_train, n_test) >= len(present_counts)
    )

    # Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(
        data,
        labels,
        test_size=test_size,
        random_state=42,
        stratify=labels if can_stratify else None,
    )

    # Convert to PyTorch tensors and wrap them in datasets
    train_dataset = TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.long),
    )
    test_dataset = TensorDataset(
        torch.tensor(X_test, dtype=torch.float32),
        torch.tensor(y_test, dtype=torch.long),
    )

    # Only the training data is shuffled; evaluation order stays fixed.
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    return train_loader, test_loader

# Build the train/test DataLoaders from the landmark files on disk
train_loader, test_loader = load_dataset(dataset_path=DATASET_PATH)

# Quick sanity check: number of mini-batches in each split
print(f'Train batches: {len(train_loader)}')
print(f'Test batches: {len(test_loader)}')

## Model Creation

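Define a CNN+LSTM model. A per-frame feature extractor (the `cnn` block, implemented here with fully connected layers and batch normalization rather than convolutions) encodes the landmarks of each frame, and the LSTM captures temporal dependencies across frames.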

In [None]:
class CNNLSTMModel(nn.Module):
    """Per-frame feature extractor followed by an LSTM sequence classifier.

    Every frame's landmark vector is encoded independently by ``self.cnn``
    (two Linear + BatchNorm1d + ReLU stages and a dropout layer), the encoded
    frames are fed to a single-layer LSTM, and the LSTM output at the last
    time step is classified by a linear layer.

    Args:
        input_size: Number of landmark values per frame. Defaults to
            ``LANDMARKS_PER_FRAME``.
        num_classes: Number of output classes. Defaults to ``NUM_CLASSES``.
    """

    def __init__(self, input_size=None, num_classes=None):
        super().__init__()
        # Resolve the defaults lazily so the notebook-level constants are
        # only needed when the caller does not pass explicit sizes.
        self.input_size = LANDMARKS_PER_FRAME if input_size is None else input_size
        self.num_classes = NUM_CLASSES if num_classes is None else num_classes

        # Per-frame encoder. The attribute keeps its original name ``cnn`` so
        # previously saved state_dicts still load, although it is built from
        # fully connected layers.
        self.cnn = nn.Sequential(
            nn.Linear(self.input_size, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        # LSTM layer
        self.lstm = nn.LSTM(64, 128, batch_first=True)
        # Output layer
        self.fc = nn.Linear(128, self.num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Classify a batch of landmark sequences.

        Args:
            x: Tensor of shape ``(batch, frames, input_size)``. The number of
                frames is read from the input, so sequences of any length
                are accepted.

        Returns:
            Unnormalized class scores of shape ``(batch, num_classes)``.

        Raises:
            ValueError: If ``x`` is not 3-dimensional or its last dimension
                differs from ``input_size``.
        """
        if x.dim() != 3 or x.size(-1) != self.input_size:
            raise ValueError(
                f'Expected input of shape (batch, frames, {self.input_size}), '
                f'got {tuple(x.shape)}'
            )
        batch_size, num_frames, _ = x.shape

        # Fold time into the batch axis so the encoder sees one frame per row.
        # reshape (unlike view) also handles non-contiguous inputs.
        x = x.reshape(batch_size * num_frames, self.input_size)
        x = self.cnn(x)  # (batch * frames, 64)
        x = x.reshape(batch_size, num_frames, -1)  # (batch, frames, 64)

        # Apply LSTM
        x, _ = self.lstm(x)  # (batch, frames, 128)
        x = x[:, -1, :]  # Take last time step: (batch, 128)
        x = self.dropout(x)
        return self.fc(x)  # (batch, num_classes)

# Instantiate the network and move it to the selected device
model = CNNLSTMModel()
model = model.to(DEVICE)

# Cross-entropy loss on the class scores, optimized with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Show the layer structure
print(model)

## Model Training

Train the model on the training data, evaluating it on the test data after every epoch (the test split doubles as the validation set here).

In [None]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``: train if ``optimizer`` is given, else evaluate.

    Returns:
        ``(mean_batch_loss, accuracy)``. Both are ``nan`` when the loader
        yields no batches.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    num_batches = 0
    correct = 0
    total = 0
    # Gradients are only tracked while training.
    with torch.set_grad_enabled(training):
        for data, labels in loader:
            data, labels = data.to(device), labels.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            num_batches += 1
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    mean_loss = loss_sum / num_batches if num_batches else float('nan')
    accuracy = correct / total if total else float('nan')
    return mean_loss, accuracy


def train_model(model, train_loader, test_loader, criterion, optimizer, epochs):
    """Train ``model`` for ``epochs`` epochs, evaluating after each one.

    Batches are moved to the device the model's parameters live on (CPU if
    the model has no parameters), so model and data always agree.

    Args:
        model: Network to train; modified in place.
        train_loader: Iterable of ``(data, labels)`` training batches.
        test_loader: Iterable of ``(data, labels)`` evaluation batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepping the model's parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        ``(train_losses, test_losses, train_accuracies, test_accuracies)``,
        four lists with one entry per epoch. Losses are the mean of the
        per-batch loss values; accuracies are fractions in ``[0, 1]``. An
        entry is ``nan`` when the corresponding loader was empty.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    train_losses = []
    test_losses = []
    train_accuracies = []
    test_accuracies = []

    for epoch in range(epochs):
        train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
        test_loss, test_acc = _run_epoch(model, test_loader, criterion, device)

        # Store metrics
        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)

        print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')

    return train_losses, test_losses, train_accuracies, test_accuracies

# Run the full training schedule and keep the per-epoch metrics for plotting
train_losses, test_losses, train_accuracies, test_accuracies = train_model(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=EPOCHS,
)

## Model Evaluation

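Evaluate the model by plotting its training history: the per-epoch accuracy and loss on the training and test sets recorded during training. The figure is also saved as `training_history_pytorch.png`.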

In [None]:
# Draw the accuracy and loss curves side by side in a single figure
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: accuracy per epoch
acc_ax.plot(train_accuracies, label='Train Accuracy')
acc_ax.plot(test_accuracies, label='Test Accuracy')
acc_ax.set_title('Model Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()
acc_ax.grid(True)

# Right panel: loss per epoch
loss_ax.plot(train_losses, label='Train Loss')
loss_ax.plot(test_losses, label='Test Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
loss_ax.grid(True)

# Tighten the layout, write the figure to disk, then display it
fig.tight_layout()
fig.savefig('training_history_pytorch.png')
plt.show()

## Save Model

Save the trained model's weights (its `state_dict`) for future use.

In [None]:
# Persist only the learned parameters (the state_dict), not the module object
weights = model.state_dict()
torch.save(weights, 'sign_language_cnn_lstm_pytorch.pt')
print('Model saved as sign_language_cnn_lstm_pytorch.pt')