In [1]:
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d grassknoted/asl-alphabet
!unzip asl-alphabet.zip

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Dataset URL: https://www.kaggle.com/datasets/grassknoted/asl-alphabet
License(s): GPL-2.0
User cancelled operation
Archive:  asl-alphabet.zip
replace asl_alphabet_test/asl_alphabet_test/A_test.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [2]:
import os
import random
import string
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image
import tensorflow as tf

class CFG:
    TRAIN_PATH = "asl_alphabet_train/asl_alphabet_train"
    LABELS = list(string.ascii_uppercase) + ["del", "nothing", "space"]
    NUM_CLASSES = len(LABELS)
    IMG_SIZE = 224
    BATCH_SIZE = 96
    EPOCHS = 30
    LR = 1e-4
    MOMENTUM = 0.9
    SEED = 42

    @staticmethod
    def seed_everything():
        random.seed(CFG.SEED)
        os.environ["PYTHONHASHSEED"] = str(CFG.SEED)
        np.random.seed(CFG.SEED)
        tf.random.set_seed(CFG.SEED)
        torch.manual_seed(CFG.SEED)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(CFG.SEED)

class ASLNetVGG(nn.Module):
    def __init__(self, feature_dim=512, freeze_vgg=True):
        super().__init__()
        # Backbone VGG16 pretrained
        vgg = models.vgg16(pretrained=True)
        # Convolutional features
        self.vgg_feats = vgg.features
        if freeze_vgg:
            for p in self.vgg_feats.parameters():
                p.requires_grad = False
        # Pooling for static feature (1x1)
        self.pool1 = nn.AdaptiveAvgPool2d((1,1))
        # Pooling for classifier branch (7x7)
        self.pool2 = vgg.avgpool
        # Classifier branch (penultimate layers)
        orig_cls = list(vgg.classifier.children())[:-1]
        self.asl_feats = nn.Sequential(*orig_cls)
        # Projection
        self.proj = nn.Linear(512 + 4096, feature_dim)
        self.act  = nn.ReLU()
        # Final head
        self.classifier = nn.Linear(feature_dim, CFG.NUM_CLASSES)

    def forward(self, x):
        # x: [B, C, H, W]
        feats = self.vgg_feats(x)         # [B,512,7,7]
        # static path
        f1 = self.pool1(feats)           # [B,512,1,1]
        f1 = torch.flatten(f1,1)         # [B,512]
        # classifier path
        f2 = self.pool2(feats)           # [B,512,7,7]
        f2 = torch.flatten(f2,1)         # [B,25088]
        f2 = self.asl_feats(f2)          # [B,4096]
        # concat + proj
        f  = torch.cat([f1, f2], dim=1)  # [B,4608]
        feat = self.act(self.proj(f))     # [B,feature_dim]
        return self.classifier(feat)      # [B,NUM_CLASSES]

class LibrasDataset(Dataset):
    def __init__(self, split='train', transform=None, val_ratio=0.2):
        super().__init__()
        self.transform = transform
        samples = []
        for idx, label in enumerate(CFG.LABELS):
            label_dir = os.path.join(CFG.TRAIN_PATH, label)
            if not os.path.isdir(label_dir): continue
            for fname in os.listdir(label_dir):
                if fname.lower().endswith(('.png','.jpg','.jpeg')):
                    samples.append((os.path.join(label_dir, fname), idx))
        random.shuffle(samples)
        split_idx = int(len(samples) * (1 - val_ratio))
        self.data = samples[:split_idx] if split == 'train' else samples[split_idx:]
        print(f"{split}: {len(self.data)} samples loaded")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path, label = self.data[idx]
        img = Image.open(img_path).convert('RGB')
        if self.transform:
            img = self.transform(img)
        return img, label

# Training and evaluation

def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    loss_accum = 0.0
    correct = total = 0
    for imgs, labels in loader:
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(imgs)
        loss = criterion(logits, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        loss_accum += loss.item()
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
    return loss_accum / len(loader), correct / total if total > 0 else 0.0

@torch.no_grad()
def eval_epoch(model, loader, criterion, device):
    model.eval()
    loss_accum = 0.0
    correct = total = 0
    for imgs, labels in loader:
        imgs, labels = imgs.to(device), labels.to(device)
        logits = model(imgs)
        loss = criterion(logits, labels)
        loss_accum += loss.item()
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
    return loss_accum / len(loader), correct / total if total > 0 else 0.0

# Main script

def main():
    CFG.seed_everything()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.Resize((CFG.IMG_SIZE, CFG.IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
    ])
    train_ds = LibrasDataset('train', transform)
    val_ds   = LibrasDataset('val',   transform)
    train_dl = DataLoader(train_ds, batch_size=CFG.BATCH_SIZE, shuffle=True, num_workers=4)
    val_dl   = DataLoader(val_ds,   batch_size=CFG.BATCH_SIZE, shuffle=False, num_workers=4)

    model = ASLNetVGG().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=CFG.LR, momentum=CFG.MOMENTUM)

    best_val = float('inf')
    for epoch in range(1, CFG.EPOCHS + 1):
        tr_loss, tr_acc = train_epoch(model, train_dl, optimizer, criterion, device)
        vl_loss, vl_acc = eval_epoch(model, val_dl, criterion, device)
        print(f"[Epoch {epoch:02d}/{CFG.EPOCHS}] "
              f"Train L: {tr_loss:.4f}, A: {tr_acc:.4%} | "
              f"Val L: {vl_loss:.4f}, A: {vl_acc:.4%}")
        if vl_loss < best_val:
            best_val = vl_loss
            torch.save(model.state_dict(), "v2_static_best.pt")
            print("👉 Best static model saved")

    torch.save(model.state_dict(), "v2_static_final.pt")
    print("👉 Static final model saved")

if __name__ == '__main__':
    main()

train: 69600 samples loaded
val: 17400 samples loaded




[Epoch 01/30] Train L: 3.3490, A: 4.7141% | Val L: 3.3117, A: 11.3908%
👉 Best static model saved
[Epoch 02/30] Train L: 3.2963, A: 9.8736% | Val L: 3.2467, A: 20.3506%
👉 Best static model saved
[Epoch 03/30] Train L: 3.2331, A: 15.9626% | Val L: 3.1619, A: 27.8851%
👉 Best static model saved
[Epoch 04/30] Train L: 3.1459, A: 21.3261% | Val L: 3.0486, A: 34.0345%
👉 Best static model saved
[Epoch 05/30] Train L: 3.0403, A: 24.8966% | Val L: 2.9185, A: 37.8103%
👉 Best static model saved
[Epoch 06/30] Train L: 2.9198, A: 27.6580% | Val L: 2.7764, A: 41.7011%
👉 Best static model saved
[Epoch 07/30] Train L: 2.7932, A: 30.0963% | Val L: 2.6354, A: 43.4540%
👉 Best static model saved
[Epoch 08/30] Train L: 2.6664, A: 32.5043% | Val L: 2.5065, A: 45.4885%
👉 Best static model saved


KeyboardInterrupt: 