# Comparing 3D CNN and Vision Transformer feature extractors for Isolated Sign Language Recognition

In [1]:
import torch
import test
import configs
from torchvision.transforms import v2
from video_dataset import VideoDataset
from pathlib import Path
from torch.utils.data import DataLoader
from models import pytorch_mvit, pytorch_r3d, pytorch_s3d, pytorch_swin3d
import json
import utils
import torch
import gc
import torch.nn.functional as F
import numpy as np
import random

def set_seed(seed=42):
  """Seed the Python, NumPy and PyTorch RNGs and set the cuDNN determinism flags.

  Args:
    seed: value passed to every seeding call (defaults to 42).
  """
  # Python and NumPy generators.
  random.seed(seed)
  np.random.seed(seed)

  # PyTorch generators: the default one and all CUDA devices.
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

  # Turn off cuDNN benchmarking and request deterministic cuDNN behaviour.
  torch.backends.cudnn.benchmark = False
  torch.backends.cudnn.deterministic = True
  
# Seed the RNGs with the default seed (42) before anything else runs.
set_seed()

# Parse the JSON info file; later cells read its 'splits' and 'models' entries.
imp_info = json.loads(Path('wlasl_implemented_info.json').read_text())


In [2]:
# Keep the per-model section under its own name, then show which splits and
# which model architectures the info file lists.
model_info = imp_info['models']
print(imp_info['splits'])
print(model_info.keys())

['asl100', 'asl300']
dict_keys(['MViT_V1_B', 'MViT_V2_S', 'Swin3D_B', 'Swin3D_S', 'Swin3D_T', 'Resnet2D_1D_18', 'Resnet3D_18', 'S3D'])


In [3]:
# Dataset root, the split being evaluated, and the label directory for that
# split. Both paths are relative to the notebook's working directory.
root = Path('../data/WLASL/WLASL2000')
split = 'asl100'
labels = Path(f'./preprocessed/labels/{split}')

## Main results

### load testing dictionary

In [4]:
# Mapping of split -> architecture -> experiment ids selected for testing.
to_test = json.loads(Path('results/wlasl_runs_to_test.json').read_text())

utils.print_dict(to_test)

{
asl100 : {
Resnet3D_18 : ['004', '005']
Resnet2D_1D_18 : ['004', '006']
S3D : ['010', '012']
MViT_V1_B : ['000']
MViT_V2_S : ['004']
Swin3D_B : ['002']
Swin3D_S : ['002']
Swin3D_T : ['002']
}

}



### test experiments

In [5]:
# Directory that the result and error files below are written under.
output = Path('results/')

# The full test run is deliberately left commented out so that re-executing
# the notebook does not repeat every experiment by accident. The output of
# this cell was cleared because of its size.
# test.test_all(
#     runs_dict = to_test,
#     test_last = True, #verify that best weights are working
#     plot = True, 
#     disp = True, #lets see some graphs
#     res_output = output / 'wlasl_same_setup.json',
#     skip_done = False, #rerun all the tests the first time 
#     err_output = output / 'wlasl_same_setup_errors.json'
# )

In [6]:
# Read back the results file; this is the path given as res_output in the
# commented-out test.test_all call.
result_dict = json.loads(Path('results/wlasl_same_setup.json').read_text())


## Comparing all with 16 frames

In [7]:
# Selection of runs for the 16-frame comparison across all architectures.
satnac_16_only = json.loads(Path('results/wlasl_satnac_16_only.json').read_text())
utils.print_dict(satnac_16_only)

# Pull the 'top_k_average_per_class_acc' metric of each selected run out of
# the full results.
summary = test.summarise(
    result_dict,
    to_summarise=satnac_16_only,
    metric='top_k_average_per_class_acc',
)

# Saving the summary is currently disabled:
# with open('results/wlasl_satnac_16_only_summary.json', 'w') as f:
#     json.dump(summary, f, indent=4)

utils.print_dict(summary)


{
asl100 : {
Resnet3D_18 : ['005']
Resnet2D_1D_18 : ['006']
S3D : ['012']
MViT_V1_B : ['000']
MViT_V2_S : ['004']
Swin3D_B : ['002']
Swin3D_S : ['002']
Swin3D_T : ['002']
}

}

['asl100']
[[('Resnet3D_18', '005'), ('Resnet2D_1D_18', '006'), ('S3D', '012'), ('MViT_V1_B', '000'), ('MViT_V2_S', '004'), ('Swin3D_B', '002'), ('Swin3D_S', '002'), ('Swin3D_T', '002')]]
{
asl100 : {
Resnet3D_18 : {
exp : 005
top_k_average_per_class_acc : {
top1 : 0.45699999999999996
top5 : 0.7583333333333333
top10 : 0.8658333333333332
}

}

Resnet2D_1D_18 : {
exp : 006
top_k_average_per_class_acc : {
top1 : 0.5025
top5 : 0.8204999999999998
top10 : 0.8913333333333333
}

}

S3D : {
exp : 012
top_k_average_per_class_acc : {
top1 : 0.29866666666666664
top5 : 0.5998333333333333
top10 : 0.7313333333333333
}

}

MViT_V1_B : {
exp : 000
top_k_average_per_class_acc : {
top1 : 0.6656666666666666
top5 : 0.8783333333333333
top10 : 0.93
}

}

MViT_V2_S : {
exp : 004
top_k_average_per_class_acc : {
top1 : 0.6798333333333332

## Comparing 32 frames to 16 frames for 3D CNNs

In [8]:
# test.summarise keys its result by architecture, so the 16- and 32-frame runs
# of one architecture cannot share a summary. Build a separate list of
# (architecture, experiment) pairs per frame count; each becomes its own table.
splits = ['asl100']
archs = ['Resnet3D_18', 'Resnet2D_1D_18', 'S3D']
c3d16s = ["005", '006', '012']
c3d32s = ["004", "004", "010"]

# Pair every architecture with its experiment id, position by position.
model_exps_16f = list(zip(archs, c3d16s))
model_exps_32f = list(zip(archs, c3d32s))

for heading, pairs in (
    ("16 frame experiments:", model_exps_16f),
    ("32 frame experiments:", model_exps_32f),
):
    print(heading)
    print(pairs)

16 frame experiments:
[('Resnet3D_18', '005'), ('Resnet2D_1D_18', '006'), ('S3D', '012')]
32 frame experiments:
[('Resnet3D_18', '004'), ('Resnet2D_1D_18', '004'), ('S3D', '010')]


### 16 frame summary

In [9]:
# Summarise the 16-frame 3D CNN runs. model_exps is a list of per-split lists,
# presumably aligned with `splits` (verify against test.summarise).
sum16 = test.summarise(result_dict, splits=splits, model_exps=[model_exps_16f])

['asl100']
[[('Resnet3D_18', '005'), ('Resnet2D_1D_18', '006'), ('S3D', '012')]]


### 32 frame summary

In [10]:
# Summarise the 32-frame 3D CNN runs. model_exps is a list of per-split lists,
# presumably aligned with `splits` (verify against test.summarise).
sum32 = test.summarise(result_dict, splits=splits, model_exps=[model_exps_32f])

['asl100']
[[('Resnet3D_18', '004'), ('Resnet2D_1D_18', '004'), ('S3D', '010')]]


In [11]:
from tables import gen_single_split_table

# Table for the 16-frame summary of the asl100 split.
table_16f = gen_single_split_table(
    split_dict=sum16['asl100'],
    caption='16 frame 3D CNN results on WLASL100',
    label='tab:wlasl_16f',
    footnotes=['All models use 16 frames as input.'],
)
print(table_16f)

# Blank line, a row of '%' characters, blank line: separates the two tables.
print('', '%'*10, '', sep='\n')

# Table for the 32-frame summary of the asl100 split.
table_32f = gen_single_split_table(
    split_dict=sum32['asl100'],
    caption='32 frame 3D CNN results on WLASL100',
    label='tab:wlasl_32f',
    footnotes=['All models use 32 frames as input.'],
)
print(table_32f)

\begin{table}[t]
\begin{center} 
\caption{16 frame 3D CNN results on WLASL100}
\label{tab:wlasl_16f}
\begin{tabular}{|l|ccc|}
\hline\textbf{Model} & \textbf{Acc@1} & \textbf{Acc@5} & \textbf{Acc@10} \\ 
\hline 
Resnet3D\_18 & 45.7 & 75.83 & 86.58 \\ 
Resnet2D\_1D\_18 & 50.25 & 82.05 & 89.13 \\ 
S3D & 29.87 & 59.98 & 73.13 \\ 
\hline 
\multicolumn{4}{l}{All models use 16 frames as input.} \\ 
\end{tabular} 
\end{center} 
\end{table} 


%%%%%%%%%%

\begin{table}[t]
\begin{center} 
\caption{32 frame 3D CNN results on WLASL100}
\label{tab:wlasl_32f}
\begin{tabular}{|l|ccc|}
\hline\textbf{Model} & \textbf{Acc@1} & \textbf{Acc@5} & \textbf{Acc@10} \\ 
\hline 
Resnet3D\_18 & 53.43 & 82.0 & 89.5 \\ 
Resnet2D\_1D\_18 & 51.32 & 76.35 & 85.02 \\ 
S3D & 41.22 & 68.0 & 78.58 \\ 
\hline 
\multicolumn{4}{l}{All models use 32 frames as input.} \\ 
\end{tabular} 
\end{center} 
\end{table} 



# now we can create a differences table

In [12]:
from tables import gen_compare_table

# Print a table comparing the 16-frame and 32-frame summaries of the asl100
# split (presumably LaTeX, like gen_single_split_table's output).
# NOTE(review): per this cell's recorded output, the call currently fails with
# "TypeError: string indices must be integers". gen_compare_table is not
# visible here; presumably it expects a different argument structure than the
# per-split dicts passed below (e.g. the full summaries keyed by split).
# TODO: confirm against tables.gen_compare_table and fix the call.
print(gen_compare_table(
    sum16['asl100'],
    sum32['asl100'],
    caption='Comparison of 16 and 32 frame 3D CNN results on WLASL100', 
    label='tab:wlasl_16f_vs_32f',
    footnotes=[
        '32 - 16 frame difference',
        'All models use 16 or 32 frames as input respectively.'
    ]       
))

TypeError: string indices must be integers