In [None]:
# Cell 1
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import cv2
import numpy as np


In [None]:
# Cell 2
class SignLanguageDataset(Dataset):
    """Dataset pairing one image file with one keypoint file per sample.

    Args:
        image_paths: Sized, indexable collection of image file paths.
        keypoint_paths: Sized, indexable collection of ``.npy`` keypoint file
            paths; entry ``i`` belongs to ``image_paths[i]``.
        transform: Optional callable applied to the RGB image array after
            loading. When omitted the raw array from OpenCV is returned.

    Raises:
        ValueError: If the two path collections differ in length.
    """

    def __init__(self, image_paths, keypoint_paths, transform=None):
        # Fail fast: a mismatch would otherwise only surface as an IndexError
        # (or silently ignored files) in the middle of an epoch.
        if len(image_paths) != len(keypoint_paths):
            raise ValueError(
                "image_paths and keypoint_paths must have the same length "
                f"(got {len(image_paths)} and {len(keypoint_paths)})"
            )
        self.image_paths = image_paths  # List of image file paths
        self.keypoint_paths = keypoint_paths  # List of skeletal data file paths
        self.transform = transform  # Any transformations for images

    def __len__(self):
        """Return the number of (image, keypoints) samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, keypoints)``.

        ``keypoints`` is a float32 tensor built from the stored numpy array.

        Raises:
            FileNotFoundError: If the image cannot be read by OpenCV.
        """
        # Load image
        image_path = self.image_paths[idx]
        image = cv2.imread(image_path)
        # cv2.imread signals failure by returning None instead of raising, which
        # would otherwise turn into an opaque assertion inside cvtColor.
        if image is None:
            raise FileNotFoundError(f"Could not read image file: {image_path!r}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads as BGR
        if self.transform:
            image = self.transform(image)

        # Load skeletal data (assumed to be stored as a numpy array -- the
        # array's shape is passed through untouched)
        keypoints = np.load(self.keypoint_paths[idx])
        keypoints = torch.tensor(keypoints, dtype=torch.float32)

        return image, keypoints


In [None]:
# Cell 3
# Image preprocessing pipeline: array -> PIL image -> 64x64 -> tensor ->
# per-channel normalisation with the given mean / std.
transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize(size=(64, 64)),
        transforms.ToTensor(),
        transforms.Normalize(
            [0.485, 0.456, 0.406],  # mean
            [0.229, 0.224, 0.225],  # std
        ),
    ]
)


In [None]:
# Cell 4
class MultimodalSignLanguageModel(nn.Module):
    """Two-branch classifier fusing image features with keypoint features.

    The image branch is a small CNN (two conv + max-pool stages followed by a
    linear projection); the keypoint branch is a single-layer LSTM whose final
    hidden state is used as the feature vector. Both feature vectors are
    concatenated and passed through a two-layer MLP that emits class logits.

    Args:
        cnn_output_dim: Size of the image feature vector produced by the CNN.
        lstm_hidden_dim: Hidden size of the LSTM (size of the keypoint features).
        final_dim: Width of the hidden layer in the fusion MLP.
        num_classes: Number of output logits.
        keypoint_dim: Number of features per keypoint time step (default 42,
            the previously hard-coded value).
        image_size: Spatial size of the input images, either an int for square
            images or a ``(height, width)`` pair (default 64, matching the
            previously hard-coded ``64 * 16 * 16`` flatten size).

    Raises:
        ValueError: If ``image_size`` is too small to survive the two pooling
            stages.
    """

    def __init__(self, cnn_output_dim, lstm_hidden_dim, final_dim, num_classes,
                 keypoint_dim=42, image_size=64):
        super(MultimodalSignLanguageModel, self).__init__()

        if isinstance(image_size, int):
            height, width = image_size, image_size
        else:
            height, width = image_size
        # The 3x3 / stride 1 / padding 1 convolutions keep the spatial size;
        # each of the two MaxPool2d(2, 2) layers halves it (rounding down).
        pooled_h, pooled_w = height // 4, width // 4
        if pooled_h < 1 or pooled_w < 1:
            raise ValueError(
                f"image_size {image_size!r} is too small for two 2x2 pooling stages"
            )

        # Image branch: CNN for image feature extraction
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Flatten(),
            nn.Linear(64 * pooled_h * pooled_w, cnn_output_dim)
        )

        # Keypoint branch: LSTM for keypoint sequence processing
        self.lstm = nn.LSTM(input_size=keypoint_dim, hidden_size=lstm_hidden_dim,
                            batch_first=True)

        # Combined output
        self.fc = nn.Sequential(
            nn.Linear(cnn_output_dim + lstm_hidden_dim, final_dim),
            nn.ReLU(),
            nn.Linear(final_dim, num_classes)
        )

    def forward(self, image, keypoints):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            image: Batch of 3-channel images, ``(batch, 3, height, width)``,
                with the spatial size given at construction.
            keypoints: Either ``(batch, seq_len, keypoint_dim)`` sequences or
                ``(batch, keypoint_dim)`` single-frame keypoints, which are
                treated as sequences of length one.
        """
        # Process image
        image_features = self.cnn(image)

        # A 2-D tensor would be interpreted by nn.LSTM as ONE unbatched
        # sequence, yielding a 1-D hidden state that cannot be concatenated
        # with the batched image features. Lift it to (batch, 1, keypoint_dim).
        if keypoints.dim() == 2:
            keypoints = keypoints.unsqueeze(1)

        # Process keypoints; keep only the last layer's final hidden state
        _, (hn, _) = self.lstm(keypoints)
        keypoint_features = hn[-1]

        # Concatenate both feature vectors along the feature axis
        combined = torch.cat((image_features, keypoint_features), dim=1)
        output = self.fc(combined)
        return output


In [None]:
# Cell 5
# Stand-in sample files -- swap in the real image / keypoint paths before use.
image_paths = [
    "path_to_image_1.jpg",
    "path_to_image_2.jpg",
]
keypoint_paths = [
    "path_to_keypoints_1.npy",
    "path_to_keypoints_2.npy",
]

# Wrap the paired files in a Dataset, then serve them in shuffled batches of 16.
dataset = SignLanguageDataset(
    image_paths=image_paths,
    keypoint_paths=keypoint_paths,
    transform=transform,
)
data_loader = DataLoader(dataset=dataset, batch_size=16, shuffle=True)


In [None]:
# Cell 6
# Hyper-parameters of the two-branch model
cnn_output_dim = 128   # size of the image feature vector
lstm_hidden_dim = 64   # size of the keypoint feature vector
final_dim = 256        # hidden width of the fusion MLP
num_classes = 29       # adjust to the number of sign classes

# Build the model together with its loss function and optimizer
model = MultimodalSignLanguageModel(
    cnn_output_dim=cnn_output_dim,
    lstm_hidden_dim=lstm_hidden_dim,
    final_dim=final_dim,
    num_classes=num_classes,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)


In [None]:
# Cell 7
num_epochs = 5  # Adjust based on your needs

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0  # sum of per-sample losses over this epoch

    for images, keypoints in data_loader:
        # Make sure both modalities are float tensors (no device transfer is done here)
        images = images.float()
        keypoints = keypoints.float()

        # Start every step from clean gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(images, keypoints)
        # Random labels (replace with actual labels)
        labels = torch.randint(0, num_classes, (images.size(0),))
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # The criterion returns a batch mean, so weight it by the batch size
        running_loss += loss.item() * images.size(0)

    # Average loss per sample over the whole dataset
    epoch_loss = running_loss / len(dataset)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')


In [None]:
# Cell 8
# Placeholder for evaluation function
def evaluate(model, data_loader):
    """Run ``model`` over ``data_loader`` and print the classification accuracy.

    The model is switched to eval mode and gradients are disabled for the
    duration of the pass. Ground-truth labels are still PLACEHOLDERS drawn at
    random, so the printed accuracy is not meaningful until real labels are
    supplied.

    Args:
        model: Callable module taking ``(images, keypoints)`` and returning
            logits of shape ``(batch, num_classes)``.
        data_loader: Iterable yielding ``(images, keypoints)`` tensor batches.

    Returns:
        None. The accuracy is printed; if the loader yields no samples a
        "no samples" message is printed instead of dividing by zero.
    """
    model.eval()
    total, correct = 0, 0

    with torch.no_grad():
        for images, keypoints in data_loader:
            images, keypoints = images.float(), keypoints.float()
            outputs = model(images, keypoints)
            _, predicted = torch.max(outputs, 1)
            # Replace with actual labels. The class count is read from the
            # logits so the function does not depend on a notebook global.
            labels = torch.randint(0, outputs.size(1), (images.size(0),))
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # An empty loader would otherwise raise ZeroDivisionError below.
    if total == 0:
        print('Accuracy: n/a (no samples evaluated)')
        return

    accuracy = 100 * correct / total
    print(f'Accuracy: {accuracy:.2f}%')

# Score the model. NOTE: this reuses the training loader as a stand-in --
# replace it with an actual validation DataLoader.
evaluate(model=model, data_loader=data_loader)
