In [30]:
# Cell 1: Import Necessary Libraries
import cv2
import torch
import numpy as np
import mediapipe as mp
import matplotlib
matplotlib.use('Agg')
from torchvision import models, transforms, datasets
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, SubsetRandomSampler
from tqdm import tqdm


In [31]:
# Cell 2: Load YOLO Model
def load_yolo_model(cfg_path, weights_path):
    net = cv2.dnn.readNet(weights_path, cfg_path)
    return net

# Paths to the YOLO config and weights
cfg_path = r"D:\ASL_Alphabet_Dataset\Sign_language\SignLanguageToText\cross-hands.cfg"
weights_path = r"D:\ASL_Alphabet_Dataset\Sign_language\SignLanguageToText\cross-hands.weights"

# Initialize YOLO model
yolo_net = load_yolo_model(cfg_path, weights_path)

if torch.cuda.is_available():
    yolo_net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
    yolo_net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)
else:
    yolo_net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
    yolo_net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)


In [32]:
def detect_hands_yolo(frame):
    blob = cv2.dnn.blobFromImage(frame, scalefactor=1/255.0, size=(416, 416), swapRB=True, crop=False)
    yolo_net.setInput(blob)

    layer_names = yolo_net.getLayerNames()
    output_layers = [layer_names[i - 1] for i in yolo_net.getUnconnectedOutLayers().flatten()]

    outputs = yolo_net.forward(output_layers)
    boxes, confidences, class_ids = [], [], []

    h, w = frame.shape[:2]
    for output in outputs:
        for detection in output:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.5:  # Filter for high-confidence detections
                box = detection[0:4] * np.array([w, h, w, h])
                (center_x, center_y, width, height) = box.astype("int")
                x = int(center_x - (width / 2))
                y = int(center_y - (height / 2))
                boxes.append([x, y, int(width), int(height)])
                confidences.append(float(confidence))
                class_ids.append(class_id)

    indices = cv2.dnn.NMSBoxes(boxes, confidences, score_threshold=0.3, nms_threshold=0.4)
    if len(indices) > 0:  # Check if any detections were returned
        indices = indices[0]  # Access the first item in the tuple for the indices

    results = [(boxes[i[0]], confidences[i[0]]) for i in indices] if len(indices) > 0 else []
    return results


In [33]:
# Cell 4: Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.5)

def process_frame_with_mediapipe(frame):
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb_frame)
    return results


In [34]:
# Cell 5: Define Random Sampling Function
def get_random_indices(dataset, fraction=0.2):
    """Return a random subset of indices from the dataset."""
    num_samples = len(dataset)
    num_selected = int(num_samples * fraction)
    return np.random.choice(num_samples, num_selected, replace=False).tolist()


In [35]:
# Cell 6: Define Data Transformations
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


In [36]:
# Cell 7: Load Datasets
train_data = datasets.ImageFolder(root='D:/ASL_Alphabet_Dataset/asl_alphabet_train_transformed', transform=transform)
val_data = datasets.ImageFolder(root='D:/ASL_Alphabet_Dataset/asl_alphabet_test_transformed', transform=transform)

# Validation loader
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)

# Initialize train_loader with a random sampler
train_sampler = SubsetRandomSampler(get_random_indices(train_data, fraction=0.2))
train_loader = DataLoader(train_data, batch_size=32, sampler=train_sampler, pin_memory=True)


In [37]:
# Cell 8: Load MobileNetV2 Model
model = models.mobilenet_v2(pretrained=True)
model.classifier[1] = nn.Linear(model.last_channel, len(train_data.classes))

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.00005)


In [38]:
# Cell 9: Training Loop
num_epochs = 10
best_val_acc = 0.0

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")

    # Update sampler indices
    train_sampler_indices = get_random_indices(train_data, fraction=0.2)
    train_loader = DataLoader(train_data, batch_size=8, sampler=SubsetRandomSampler(train_sampler_indices))

    # Training phase
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False):
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    train_loss = running_loss / len(train_sampler_indices)
    train_acc = 100 * correct / total
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}, Accuracy: {train_acc:.2f}%')

    # Validation phase
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for val_inputs, val_labels in tqdm(val_loader, desc="Validating", leave=False):
            val_inputs, val_labels = val_inputs.to(device), val_labels.to(device)
            val_outputs = model(val_inputs)
            val_loss += criterion(val_outputs, val_labels).item() * val_inputs.size(0)
            _, val_predicted = val_outputs.max(1)
            val_total += val_labels.size(0)
            val_correct += val_predicted.eq(val_labels).sum().item()

    val_loss /= len(val_loader.dataset)
    val_acc = 100 * val_correct / val_total
    print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.2f}%')

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'asl_to_text_model.pth')
        print(f"New best model saved with Validation Accuracy: {val_acc:.2f}%")

print("Training complete.")



Epoch 1/10


                                                               

Epoch 1/10, Loss: 0.6655, Accuracy: 84.89%


                                                           

Validation Loss: 1.8624, Validation Accuracy: 52.65%
New best model saved with Validation Accuracy: 52.65%

Epoch 2/10


                                                               

Epoch 2/10, Loss: 0.1143, Accuracy: 96.85%


                                                           

Validation Loss: 2.0788, Validation Accuracy: 53.96%
New best model saved with Validation Accuracy: 53.96%

Epoch 3/10


                                                               

KeyboardInterrupt: 

In [29]:
# Cell 10: Test Hand Detection and Landmark Extraction
# Load a sample image for testing
test_image = cv2.imread("D:\ASL_Alphabet_Dataset\s.jpg")  # Adjust path

# Detect hands using YOLO
yolo_detections = detect_hands_yolo(test_image)

# Process and visualize detected hands
for (box, confidence) in yolo_detections:
    x, y, w, h = box
    cv2.rectangle(test_image, (x, y), (x + w, y + h), (0, 255, 0), 2)  # Draw bounding box

# Process detected hand regions with MediaPipe
for (box, _) in yolo_detections:
    x, y, w, h = box
    hand_region = test_image[y:y + h, x:x + w]
    landmarks_result = process_frame_with_mediapipe(hand_region)

    # Visualization of landmarks
    if landmarks_result.multi_hand_landmarks:
        for hand_landmarks in landmarks_result.multi_hand_landmarks:
            for lm in hand_landmarks.landmark:
                cx, cy = int(lm.x * w), int(lm.y * h)
                cv2.circle(hand_region, (cx, cy), 5, (0, 0, 255), -1)

# Show the result
cv2.imshow('Detected Hands', test_image)
cv2.waitKey(0)
cv2.destroyAllWindows()


TypeError: object of type 'numpy.int32' has no len()