In [None]:
import os
from PIL import Image, ImageOps

def resize_images(input_folder, output_folder, target_size=(384, 512)):
    """
    Resize every image in input_folder to target_size and save it to output_folder.

    Each image is converted to RGB and letterboxed with ImageOps.pad: the aspect
    ratio is preserved and the remaining area is filled with black.

    Args:
        input_folder (str): Folder scanned (non-recursively) for images. Only names
            ending in .jpg, .jpeg, .png, .bmp or .webp (case-insensitive) are used.
        output_folder (str): Destination folder, created if missing. Each output
            keeps its input file name.
        target_size (tuple): Output size handed to ImageOps.pad, as (width, height).

    Returns:
        int: Number of images that were resized and saved successfully.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    image_exts = ('.jpg', '.jpeg', '.png', '.bmp', '.webp')
    count = 0
    # Sorted so that progress and error messages come out in a stable order.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.lower().endswith(image_exts):
            continue

        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)

        try:
            # The context manager closes the source file as soon as the pixels
            # have been copied; otherwise one handle per image stays open.
            with Image.open(input_path) as src:
                img = src.convert("RGB")
            # Resize with padding to keep the aspect ratio.
            img_resized = ImageOps.pad(img, target_size, method=Image.BICUBIC, color=(0, 0, 0))
            img_resized.save(output_path)
            count += 1
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"Errore con {filename}: {e}")

    print(f"Completed: {count} resized images saved in '{output_folder}'")
    return count

if __name__ == "__main__":
    # Set the image folder paths here.
    input_dir = r"hagrid-sample-30k-384p/hagrid_30k/none"   # folder with the original images
    # NOTE(review): this output folder sits beside the class folders under
    # hagrid_30k; confirm it is not left where it would be read as an extra class.
    output_dir = r"hagrid-sample-30k-384p/hagrid_30k/none_resized"  # folder receiving the resized copies

    resize_images(input_dir, output_dir)


✔️ Completato: 2164 immagini ridimensionate salvate in 'hagrid-sample-30k-384p/hagrid_30k/none_resized'


In [None]:
import os
import glob
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import mediapipe as mp
import cv2
from tqdm import tqdm, trange

# ----------------------------
# Configuration
# ----------------------------
dataset_path    = 'hagrid-sample-30k-384p/hagrid_30k'  # dataset root: one sub-folder of images per class
cache_file      = 'hand_landmarks.npz'  # landmark cache written by preprocess_and_cache
batch_size      = 32  # samples per DataLoader batch
learning_rate   = 1e-3  # learning rate passed to Adam
num_epochs      = 20  # number of passes over the training split
num_classes     = 19  # total output classes: the gesture classes plus the 'none' class
val_split       = 0.1  # fraction of the dataset held out for validation
test_split      = 0.1  # fraction of the dataset held out for the final test
device          = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # GPU when CUDA is available, else CPU

# ----------------------------
# Mediapipe Setup
# ----------------------------
mp_hands = mp.solutions.hands
# Keyword arguments unpacked into mp_hands.Hands(...) when the detector is created.
mp_cfg = dict(
    static_image_mode=True,
    max_num_hands=2,
    min_detection_confidence=0.5,
)

# ----------------------------
# Preprocessing: extract and cache landmarks
# ----------------------------
def preprocess_and_cache(root_dir: str, out_file: str) -> None:
    """
    Extracts hand landmarks from images in root_dir and saves them with labels.

    Every ``<root_dir>/<class>/*.jpg`` image is passed through MediaPipe Hands.
    The first detected hand contributes 21 landmarks x (x, y, z) = 63 float32
    values; an image with no detected hand gets an all-zero vector. Images that
    OpenCV cannot read are skipped and counted.

    The output .npz holds three arrays: ``features`` (N, 63), ``labels`` (N,)
    and ``classes`` (the class-folder names, whose position is the label index).

    Args:
        root_dir (str): Path to dataset root, with subfolders per class containing JPEGs.
        out_file (str): Path to output .npz cache file.

    Raises:
        ValueError: If no readable ``*/*.jpg`` image is found under root_dir.
    """
    # Only visible sub-directories are classes. A stray file or a hidden folder
    # (e.g. a notebook checkpoint directory) would otherwise shift label indices.
    class_names = sorted(
        name for name in os.listdir(root_dir)
        if not name.startswith('.') and os.path.isdir(os.path.join(root_dir, name))
    )
    labels_map = {name: idx for idx, name in enumerate(class_names)}
    # Sorted so the cache content does not depend on directory listing order.
    files = sorted(glob.glob(os.path.join(root_dir, '*/*.jpg')))

    all_feats, all_labels = [], []
    skipped = 0
    hands = mp_hands.Hands(**mp_cfg)
    try:
        for img_path in tqdm(files, desc='Extracting landmarks'):
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread returns None instead of raising on unreadable files.
                skipped += 1
                continue
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            res = hands.process(img_rgb)
            if res.multi_hand_landmarks:
                # Only the first detected hand is used, even if more are found.
                lm = res.multi_hand_landmarks[0]
                coords = [coord for pt in lm.landmark for coord in (pt.x, pt.y, pt.z)]
                feats = np.array(coords, dtype=np.float32)
            else:
                feats = np.zeros(21 * 3, dtype=np.float32)
            all_feats.append(feats)
            # The class is the name of the folder that directly contains the image.
            label = labels_map[os.path.basename(os.path.dirname(img_path))]
            all_labels.append(label)
    finally:
        # Release the detector even if an image makes processing fail.
        hands.close()

    if skipped:
        print(f"Skipped {skipped} unreadable image(s)")
    if not all_feats:
        raise ValueError(f"No readable '*/*.jpg' images found under {root_dir!r}")

    np.savez(out_file,
             features=np.stack(all_feats),
             labels=np.array(all_labels),
             classes=class_names)
    print(f"Saved cached landmarks and labels to {out_file}")

# ----------------------------
# Cached Dataset
# ----------------------------
class CachedHandDataset(Dataset):
    """
    PyTorch Dataset loading pre-extracted hand landmarks from a .npz file.

    The cache must contain the arrays 'features', 'labels' and 'classes'. All
    three are read into memory in __init__, so the file is not kept open.
    """
    def __init__(self, cache_path: str):
        """
        Args:
            cache_path (str): Path to the .npz cache file.
        """
        # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
        # which can run arbitrary code. Only load caches from a trusted source.
        # The context manager closes the archive's file handle once the arrays
        # have been materialised.
        with np.load(cache_path, allow_pickle=True) as data:
            self.features = data['features']
            self.labels   = data['labels']
            self.classes  = data['classes']

    def __len__(self) -> int:
        """Returns the number of cached samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx: int):
        """
        Returns:
            tuple: (feature tensor built from row idx of 'features', label as a Python int).
        """
        feat = torch.from_numpy(self.features[idx])
        lbl  = int(self.labels[idx])
        return feat, lbl

# ----------------------------
# Model Definition
# ----------------------------
class HandNet(nn.Module):
    """
    Multi-layer perceptron that turns a landmark feature vector into class logits.

    Layout: Linear -> ReLU -> Linear -> ReLU -> Linear.

    Args:
        input_size (int): Length of each input feature vector (default 63).
        hidden_size (int): Width of both hidden layers (default 128).
        num_classes (int): Number of logits produced per sample (default 19).
    """
    def __init__(self, input_size: int = 63, hidden_size: int = 128, num_classes: int = 19):
        super().__init__()
        # Layers are created in forward order and stored under `net`, which
        # fixes the state_dict keys to net.0.*, net.2.* and net.4.*.
        stack = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_classes),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Compute the class logits for a batch.

        Args:
            x (torch.Tensor): Batch of feature vectors, shape (batch_size, input_size).
        Returns:
            torch.Tensor: Logits of shape (batch_size, num_classes).
        """
        logits = self.net(x)
        return logits

# ----------------------------
# Training and Validation
# ----------------------------
def train_one_epoch(model: nn.Module, loader: DataLoader, criterion: nn.Module,
                    optimizer: torch.optim.Optimizer) -> tuple:
    """
    Trains the model for one pass over the loader.

    Args:
        model (nn.Module): Network to optimise; switched to training mode.
        loader (DataLoader): Yields (features, labels) batches.
        criterion (nn.Module): Loss applied to (logits, labels).
        optimizer (Optimizer): Stepped once per batch.
    Returns:
        tuple: (average_loss: float, accuracy: float), both divided by
        len(loader.dataset).
    """
    model.train()
    loss_sum = 0.0
    hits = 0
    for batch_x, batch_y in tqdm(loader, desc='Training Batches', leave=False):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(batch_x)
        batch_loss = criterion(outputs, batch_y)
        batch_loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so the final division by the
        # dataset size yields a per-sample average.
        n_batch = batch_x.size(0)
        loss_sum += batch_loss.item() * n_batch
        predictions = outputs.argmax(dim=1)
        hits += (predictions == batch_y).sum().item()

    n_samples = len(loader.dataset)
    return loss_sum / n_samples, hits / n_samples


def validate(model: nn.Module, loader: DataLoader, criterion: nn.Module) -> tuple:
    """
    Measures loss and accuracy of the model on a loader without updating it.

    Args:
        model (nn.Module): Network to evaluate; switched to evaluation mode.
        loader (DataLoader): Yields (features, labels) batches.
        criterion (nn.Module): Loss applied to (logits, labels).
    Returns:
        tuple: (average_loss: float, accuracy: float), both divided by
        len(loader.dataset).
    """
    model.eval()
    loss_sum = 0.0
    hits = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch_x, batch_y in tqdm(loader, desc='Validation Batches', leave=False):
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            outputs = model(batch_x)
            batch_loss = criterion(outputs, batch_y).item()
            # Weight by batch size so the division below is a per-sample mean.
            loss_sum += batch_loss * batch_x.size(0)
            predictions = outputs.argmax(dim=1)
            hits += (predictions == batch_y).sum().item()

    n_samples = len(loader.dataset)
    return loss_sum / n_samples, hits / n_samples

# ----------------------------
# Main
# ----------------------------
def main() -> None:
    """
    Main entry: preprocess data if needed, split into train/val/test, train and evaluate.

    Steps:
        1. Build the landmark cache with preprocess_and_cache if cache_file is missing.
        2. Split the cached dataset into train/validation/test using val_split and
           test_split. The split is seeded so the same samples land in the same
           subset on every run.
        3. Train HandNet for num_epochs, printing train and validation metrics.
        4. Evaluate on the test split and save the weights to 'handnet_cached.pth'.

    Raises:
        ValueError: If the cache contains a label index >= num_classes.

    Returns:
        None
    """
    # Preprocess landmarks if cache not found
    if not os.path.exists(cache_file):
        preprocess_and_cache(dataset_path, cache_file)

    # Load dataset and split
    dataset = CachedHandDataset(cache_file)
    total = len(dataset)

    # Fail early with a readable message. Otherwise the loss raises an opaque
    # out-of-range error on the first batch holding such a label (for example
    # when an extra folder in the dataset root was counted as a class).
    if total and int(dataset.labels.max()) >= num_classes:
        raise ValueError(
            f"Cache '{cache_file}' contains label {int(dataset.labels.max())} "
            f"but num_classes is {num_classes}"
        )

    val_size  = int(total * val_split)
    test_size = int(total * test_split)
    # The training split takes the remainder so the three sizes always sum to total.
    train_size = total - val_size - test_size
    # A dedicated, seeded generator keeps the split identical across runs, so
    # validation and test numbers are comparable between experiments.
    split_rng = torch.Generator().manual_seed(42)
    train_ds, val_ds, test_ds = random_split(
        dataset, [train_size, val_size, test_size], generator=split_rng
    )

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)
    test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False)

    # Model setup
    model = HandNet(input_size=63, hidden_size=128, num_classes=num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training + Validation
    for epoch in trange(num_epochs, desc='Epochs'):
        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer)
        val_loss, val_acc     = validate(model, val_loader, criterion)
        print(f"Epoch {epoch+1}/{num_epochs} | "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    # Final Test
    test_loss, test_acc = validate(model, test_loader, criterion)
    print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")

    torch.save(model.state_dict(), 'handnet_cached.pth')
    print("Training complete; model saved as handnet_cached.pth")

if __name__ == '__main__':
    main()


In [None]:
import torch
import torch.nn as nn
import mediapipe as mp
import cv2
import numpy as np

# ==== Parameters ====
# Checkpoint loaded for inference.
# NOTE(review): the training cell in this notebook saves 'handnet_cached.pth' for
# a 63-input model; this file is expected to come from a separate two-hand
# (126-input) training run. Confirm that such a checkpoint exists.
model_path   = 'handnet_two_hands.pth'
num_classes  = 19  # number of logits produced by the model; must equal len(class_names)
# Display name for each predicted class index.
# NOTE(review): the order must match the label indices used at training time,
# presumably the alphabetical class-folder order. Verify against the 'classes'
# array stored in the training cache.
class_names  = [
    'Call', 'Dislike', 'Fist', 'Four', 'Like', 'Mute', 'None',
    'Ok', 'One', 'Palm', 'Peace', 'Peace_Inverted', 'Rock',
    'Stop', 'Stop_Inverted', 'Three', 'Three2', 'Two_Up',
    'Two_Up_Inverted'
]

# Two hands: 21 landmarks x 3 coords x 2 hands = 126 inputs
input_size   = 21 * 3 * 2
hidden_size  = 128  # width of both hidden layers

# ==== Model Definition ====
class HandNet(nn.Module):
    """
    Multi-layer perceptron used for inference: Linear -> ReLU -> Linear -> ReLU -> Linear.

    The constructor defaults are taken from the module-level parameters
    input_size, hidden_size and num_classes at class-definition time.
    """
    def __init__(self, input_size=input_size, hidden_size=hidden_size, num_classes=num_classes):
        super().__init__()
        # Layers are created in forward order and stored under `net`, so the
        # parameter names are net.0.*, net.2.* and net.4.*.
        layers = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_classes),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits for a batch of feature vectors."""
        logits = self.net(x)
        return logits

# ==== Load the trained model ====
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HandNet().to(device)
# map_location loads the stored tensors onto `device`.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
model.load_state_dict(torch.load(model_path, map_location=device))
# Switch to evaluation mode; the model is only used for prediction below.
model.eval()

# ==== MediaPipe Hands Setup ====
mp_hands = mp.solutions.hands
# Detector used on the webcam frames: up to two hands per frame.
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.5
)
# Helper used to draw the detected landmarks on each frame.
mp_draw = mp.solutions.drawing_utils

# ==== Open Webcam ====
# Camera 0 through OpenCV's V4L2 (Video4Linux2) backend, MJPG at 640x480.
# NOTE(review): CAP_V4L2 is Linux-specific; on other systems the capture will
# presumably fail to open. Confirm the target platform.
cap = cv2.VideoCapture(0, cv2.CAP_V4L2)
cap.set(cv2.CAP_PROP_FOURCC,  cv2.VideoWriter_fourcc(*'MJPG'))
cap.set(cv2.CAP_PROP_FRAME_WIDTH,  640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

# try/finally releases the camera, the detector and the window even when the
# loop ends through an exception or a notebook interrupt.
try:
    if not cap.isOpened():
        # Report and fall through to the cleanup below. Calling exit() here
        # would shut down the notebook kernel.
        print("[INFO] Cannot open webcam.")
    else:
        while True:
            ret, frame = cap.read()
            if not ret:
                # No frame delivered (camera unplugged or stream ended).
                break

            # Convert BGR to RGB for MediaPipe
            img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            result  = hands.process(img_rgb)

            # Build input feature vector of length 126
            feats = []

            if result.multi_hand_landmarks:
                # Collect landmarks for up to two hands
                for hand_landmarks in result.multi_hand_landmarks[:2]:
                    for lm in hand_landmarks.landmark:
                        feats.extend([lm.x, lm.y, lm.z])
                # If only one hand is detected, pad the second hand with zeros
                if len(result.multi_hand_landmarks) == 1:
                    feats.extend([0.0] * (21 * 3))
                # Draw landmarks on the frame
                for hand_landmarks in result.multi_hand_landmarks:
                    mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            else:
                # No hands detected: use all zeros
                feats = [0.0] * input_size

            # Perform inference on a batch of one sample
            x = torch.tensor(feats, dtype=torch.float32).unsqueeze(0).to(device)
            with torch.no_grad():
                logits     = model(x)
                pred_class = logits.argmax(dim=1).item()
                pred_name  = class_names[pred_class]

            # Display predicted class on the frame
            cv2.putText(
                frame, pred_name, (10, 50),
                cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 0), 2, cv2.LINE_AA
            )

            cv2.imshow("Hand Gesture Prediction", frame)
            # Press 'q' in the preview window to stop.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    cap.release()
    hands.close()
    cv2.destroyAllWindows()


Epochs:   5%|▌         | 1/20 [00:02<00:39,  2.10s/it]

Epoch 1/20 | Train Loss: 2.6369, Train Acc: 0.1668 | Val Loss: 2.1965, Val Acc: 0.2860


Epochs:  10%|█         | 2/20 [00:03<00:30,  1.70s/it]

Epoch 2/20 | Train Loss: 1.8981, Train Acc: 0.3812 | Val Loss: 1.6854, Val Acc: 0.4607


Epochs:  15%|█▌        | 3/20 [00:05<00:31,  1.82s/it]

Epoch 3/20 | Train Loss: 1.5770, Train Acc: 0.4694 | Val Loss: 1.4238, Val Acc: 0.5634


Epochs:  20%|██        | 4/20 [00:07<00:28,  1.77s/it]

Epoch 4/20 | Train Loss: 1.3902, Train Acc: 0.5367 | Val Loss: 1.3244, Val Acc: 0.5749


Epochs:  25%|██▌       | 5/20 [00:09<00:27,  1.84s/it]

Epoch 5/20 | Train Loss: 1.2846, Train Acc: 0.5681 | Val Loss: 1.2272, Val Acc: 0.5881


Epochs:  25%|██▌       | 5/20 [00:09<00:29,  1.95s/it]


KeyboardInterrupt: 