Steps to Process Videos
Traverse Directories: Recursively explore all subdirectories to locate video files.

Preprocess Each Video:

Normalize FPS.
Resize frames.
Convert to tensor format.
Save Output: Save the preprocessed video tensors to .pt files in a structured output directory mirroring the input hierarchy.



In [1]:
import os
import cv2
import torch
import numpy as np

# Base directory paths
base_path = r"D:\ASL\Dataset"
sub_dirs = ["train", "test", "val"]

# Target properties for preprocessing
TARGET_FPS = 30
TARGET_SIZE = (224, 224)
OUTPUT_DIR = r"D:\ASL\ProcessedDataset"

# Ensure the output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Function to normalize FPS and resize frames
def preprocess_video(video_path, target_fps=TARGET_FPS, target_size=TARGET_SIZE):
    cap = cv2.VideoCapture(video_path)
    original_fps = int(cap.get(cv2.CAP_PROP_FPS))
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    duration = frame_count / original_fps

    # Calculate target frame count
    target_frame_count = int(duration * target_fps)
    frames = []

    for i in range(target_frame_count):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i * original_fps / target_fps)
        ret, frame = cap.read()
        if ret:
            # Resize frame
            resized_frame = cv2.resize(frame, target_size)
            frames.append(resized_frame)
    cap.release()

    # Convert frames to tensor
    frames_tensor = torch.tensor(np.array(frames), dtype=torch.float32).permute(0, 3, 1, 2) / 255.0
    return frames_tensor

# Function to process and save videos
def process_videos(base_path, sub_dirs, output_dir):
    for sub_dir in sub_dirs:
        sub_dir_path = os.path.join(base_path, sub_dir)
        for root, _, files in os.walk(sub_dir_path):
            for file in files:
                if file.endswith((".mp4", ".avi", ".mov")):  # Add other formats as needed
                    video_path = os.path.join(root, file)
                    print(f"Processing: {video_path}")

                    # Preprocess video
                    try:
                        frames_tensor = preprocess_video(video_path)
                        # Save processed tensor
                        relative_path = os.path.relpath(root, base_path)
                        output_path = os.path.join(output_dir, relative_path)
                        os.makedirs(output_path, exist_ok=True)

                        tensor_path = os.path.join(output_path, f"{os.path.splitext(file)[0]}.pt")
                        torch.save(frames_tensor, tensor_path)
                        print(f"Saved processed video to: {tensor_path}")
                    except Exception as e:
                        print(f"Error processing {video_path}: {e}")

# Run the processing
process_videos(base_path, sub_dirs, OUTPUT_DIR)


Processing: D:\ASL\Dataset\train\#asl\#asl.mp4
Saved processed video to: D:\ASL\ProcessedDataset\train\#asl\#asl.pt
Processing: D:\ASL\Dataset\train\#asl\#asl_blurred.mp4
Error processing D:\ASL\Dataset\train\#asl\#asl_blurred.mp4: division by zero
Processing: D:\ASL\Dataset\train\#asl\#asl_brightness_contrast.mp4
Error processing D:\ASL\Dataset\train\#asl\#asl_brightness_contrast.mp4: division by zero
Processing: D:\ASL\Dataset\train\#asl\#asl_flipped.mp4
Error processing D:\ASL\Dataset\train\#asl\#asl_flipped.mp4: division by zero
Processing: D:\ASL\Dataset\train\#asl\#asl_gaussian_noise.mp4
Error processing D:\ASL\Dataset\train\#asl\#asl_gaussian_noise.mp4: division by zero
Processing: D:\ASL\Dataset\train\#asl\#asl_hue_saturation_shift.mp4
Error processing D:\ASL\Dataset\train\#asl\#asl_hue_saturation_shift.mp4: division by zero
Processing: D:\ASL\Dataset\train\#asl\#asl_perspective_warp.mp4
Error processing D:\ASL\Dataset\train\#asl\#asl_perspective_warp.mp4: division by zero
Proc

Dataloader


In [2]:
import os
import torch
from torch.utils.data import Dataset

class ASLVideoDataset(Dataset):
    def __init__(self, base_path, sub_dirs, transform=None):
        """
        Custom Dataset for loading preprocessed video tensors.

        Args:
            base_path (str): Path to the processed dataset folder.
            sub_dirs (list): List of subdirectories (e.g., ["train", "test", "val"]).
            transform (callable, optional): Optional transformations on video tensors.
        """
        self.data = []
        self.labels = []
        self.transform = transform

        for sub_dir in sub_dirs:
            sub_dir_path = os.path.join(base_path, sub_dir)
            for root, _, files in os.walk(sub_dir_path):
                for file in files:
                    if file.endswith(".pt"):
                        file_path = os.path.join(root, file)
                        # Label is derived from the file name (e.g., "SIMILE" from "SIMILE.pt")
                        label = os.path.splitext(file)[0]
                        self.data.append(file_path)
                        self.labels.append(label)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Load tensor
        video_tensor = torch.load(self.data[idx])
        label = self.labels[idx]

        # Apply transformation if provided
        if self.transform:
            video_tensor = self.transform(video_tensor)

        return video_tensor, label


In [4]:
from torchvision.transforms import Compose, Normalize

# Example normalization (adjust mean and std based on your dataset)
transform = Compose([
    Normalize(mean=[0.5], std=[0.5])  # Example: normalize pixel values
])


In [5]:
from torch.utils.data import DataLoader

# Paths
base_path = r"D:\ASL\ProcessedDataset"
sub_dirs = ["train", "test", "val"]

# Instantiate Dataset for training, testing, and validation
train_dataset = ASLVideoDataset(base_path, ["train"], transform=transform)
test_dataset = ASLVideoDataset(base_path, ["test"])
val_dataset = ASLVideoDataset(base_path, ["val"])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=4)


# 3D cnn


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class C3DModel(nn.Module):
    def __init__(self, num_classes):
        super(C3DModel, self).__init__()
        self.conv1 = nn.Conv3d(3, 64, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool3d(kernel_size=2, stride=2)

        self.conv2 = nn.Conv3d(64, 128, kernel_size=3, stride=1, padding=1)
        self.pool2 = nn.MaxPool3d(kernel_size=2, stride=2)

        self.conv3a = nn.Conv3d(128, 256, kernel_size=3, stride=1, padding=1)
        self.conv3b = nn.Conv3d(256, 256, kernel_size=3, stride=1, padding=1)
        self.pool3 = nn.MaxPool3d(kernel_size=2, stride=2)

        self.fc1 = nn.Linear(256 * 4 * 4 * 2, 4096)  # Adjust dimensions based on your input size
        self.fc2 = nn.Linear(4096, 4096)
        self.fc3 = nn.Linear(4096, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool1(x)

        x = F.relu(self.conv2(x))
        x = self.pool2(x)

        x = F.relu(self.conv3a(x))
        x = F.relu(self.conv3b(x))
        x = self.pool3(x)

        x = x.view(x.size(0), -1)  # Flatten
        x = F.relu(self.fc1(x))
        x = F.dropout(x, 0.5)
        x = F.relu(self.fc2(x))
        x = F.dropout(x, 0.5)
        x = self.fc3(x)

        return x


BI DIRECTONAL LSTM

In [8]:
class BiLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, num_layers=2):
        super(BiLSTMModel, self).__init__()
        self.lstm = nn.LSTM(
            input_size, hidden_size, num_layers, batch_first=True, bidirectional=True
        )
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        # x shape: (batch_size, seq_len, input_size)
        out, _ = self.lstm(x)  # LSTM output
        out = out[:, -1, :]  # Take the output of the last time step
        out = self.fc(out)  # Fully connected layer
        return out


In [9]:
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class VideoTransformer(nn.Module):
    def __init__(self, num_patches, embed_dim, num_heads, num_classes, num_layers=4):
        super(VideoTransformer, self).__init__()
        self.embedding = nn.Linear(num_patches, embed_dim)
        encoder_layer = TransformerEncoderLayer(embed_dim, num_heads)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        # x shape: (batch_size, num_frames, num_patches)
        x = self.embedding(x)  # Convert to embeddings
        x = self.transformer_encoder(x)  # Transformer Encoder
        x = x.mean(dim=1)  # Global average pooling
        x = self.fc(x)  # Fully connected layer
        return x


In [11]:
output = F.softmax(model_output, dim=1)


NameError: name 'model_output' is not defined

In [12]:
import torch.nn.functional as F

class EnsembleModel:
    def __init__(self, models):
        self.models = models  # List of trained models

    def predict(self, inputs):
        predictions = []
        for model in self.models:
            model.eval()
            with torch.no_grad():
                output = model(inputs)  # Get model output
                probabilities = F.softmax(output, dim=1)  # Convert to probabilities
                predictions.append(probabilities)
        
        # Average predictions
        ensemble_output = torch.mean(torch.stack(predictions), dim=0)
        return ensemble_output


In [13]:
# Instantiate the models
c3d_model = C3DModel(num_classes=num_classes).to(device)
bilstm_model = BiLSTMModel(input_size=input_size, hidden_size=hidden_size, num_classes=num_classes).to(device)
transformer_model = VideoTransformer(num_patches=num_patches, embed_dim=embed_dim, num_heads=num_heads, num_classes=num_classes).to(device)

# Load trained model weights
c3d_model.load_state_dict(torch.load("c3d_model.pth"))
bilstm_model.load_state_dict(torch.load("bilstm_model.pth"))
transformer_model.load_state_dict(torch.load("transformer_model.pth"))

# Create the ensemble
ensemble = EnsembleModel(models=[c3d_model, bilstm_model, transformer_model])

# Predict on test data
for inputs, labels in test_loader:
    inputs = inputs.to(device)
    ensemble_output = ensemble.predict(inputs)
    predicted_labels = torch.argmax(ensemble_output, dim=1)
    print(f"Predictions: {predicted_labels}")


NameError: name 'num_classes' is not defined