# Comparing 3D CNN and Vision Transformer feature extractors for Isolated Sign Language Recognition

In [1]:
import torch
import test
import configs
from torchvision.transforms import v2
from video_dataset import VideoDataset
from pathlib import Path
from torch.utils.data import DataLoader
from models import pytorch_mvit, pytorch_r3d, pytorch_s3d, pytorch_swin3d
import json
import utils
import torch
import gc
import torch.nn.functional as F
import numpy as np
import random

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and configure cuDNN flags.

    Args:
        seed: Value handed to every generator seeded here (defaults to 42).
    """
    # Python's and NumPy's global generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: the default generator, then every CUDA device.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Turn off cuDNN benchmarking and request deterministic algorithms.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True
  
# Seed all RNGs (default seed) before anything else in the notebook runs.
set_seed()

# Load the implementation metadata. JSON is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform's locale default.
with open('wlasl_implemented_info.json', 'r', encoding='utf-8') as f:
    imp_info = json.load(f)


In [2]:
# Keep a handle on the per-model metadata for later use.
model_info = imp_info['models']

# Show the available splits and the model names.
print(imp_info['splits'])
print(model_info.keys())

['asl100', 'asl300']
dict_keys(['MViT_V1_B', 'MViT_V2_S', 'Swin3D_B', 'Swin3D_S', 'Swin3D_T', 'Resnet2D_1D_18', 'Resnet3D_18', 'S3D'])


In [3]:
# Dataset root, relative to the notebook's working directory.
root = Path('../data/WLASL/WLASL2000')

# Split used in this notebook, and the label directory named after it.
split = 'asl100'
labels = Path(f'./preprocessed/labels/{split}')

## Main results

### Load testing dictionary

In [4]:
# Load the dictionary of runs to test. JSON is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform's locale default.
with open('results/wlasl_runs_to_test.json', 'r', encoding='utf-8') as f:
    to_test = json.load(f)

# Display the loaded dictionary.
utils.print_dict(to_test)

{
asl100 : {
Resnet3D_18 : ['004', '005']
Resnet2D_1D_18 : ['004', '006']
S3D : ['010', '012']
MViT_V1_B : ['000']
MViT_V2_S : ['004']
Swin3D_B : ['002']
Swin3D_S : ['002']
Swin3D_T : ['002']
}

}



### Run test experiments

In [None]:
# Results and error logs are both written under this directory.
output = Path('results/')

test.test_all(
    runs_dict=to_test,
    # Output files.
    res_output=output / 'wlasl_same_setup.json',
    err_output=output / 'wlasl_same_setup_errors.json',
    # Behaviour switches (notes carried over from the original cell).
    skip_done=False,  # rerun all the tests the first time
    test_last=True,  # verify that best weights are working
    plot=True,
    disp=True,  # let's see some graphs
)