<a href="https://www.kaggle.com/code/patrickstarrrr/dvs128gesture-cnn-0-96?scriptVersionId=161758699" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

The IBM DVS128 Gesture Dataset (http://research.ibm.com/dvsgesture/) contains event-based recordings of 11 gestures made by 29 subjects under 3 different lighting conditions. A series of 11 gestures was recorded for each subject. Each gesture lasts 6 seconds. This work uses a preprocessed version of the original dataset (https://tonic.readthedocs.io/), where recordings that originally contained multiple labels have already been cut into respective samples. Also, the temporal precision is reduced to ms.

To avoid downloading the dataset every session, this kernel uses the output of another kernel ([https://www.kaggle.com/code/dlarionov/create-dvs128gesture-tonic-dataset](https://www.kaggle.com/code/dlarionov/create-dvs128gesture-tonic-dataset)).

This is the second part of the DVS128 Gesture Dataset exploration. It contains a straightforward solution using a 2-layer CNN implemented with PyTorch. The event trail for each gesture is divided into dense frames (similar to the first part). Each frame is classified as a separate image. The final class is determined by the most represented class among all frames in the trail.

The first part [https://www.kaggle.com/code/dlarionov/dvs128gesture-snntorch](https://www.kaggle.com/code/dlarionov/dvs128gesture-snntorch) uses a spiking neural network implemented with snntorch. It also contains details about dataset properties and preprocessing steps.

In [None]:
!pip install tonic --quiet # https://tonic.readthedocs.io/

In [None]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from IPython.display import HTML
from dataclasses import dataclass
import tonic

In [None]:
@dataclass(frozen=True)
class ToOneHotTimeCoding:
    """Target transform: one-hot encode a label and repeat it for every frame.

    The same one-hot encoding is copied ``n_frames`` times along a new leading
    axis, so each frame of a recording carries its own copy of the target.

    Shapes: a scalar label gives ``[n_frames, n_classes]``; an array of ``n``
    labels gives ``[n_frames, n, n_classes]``. Values are float64.

    Example:
        encoder = ToOneHotTimeCoding(n_classes=5, n_frames=4)
        a = encoder(np.array([3, 2, 1]))  # a.shape == (4, 3, 5)
    """
    n_classes: int
    n_frames: int

    def __call__(self, target):
        # Row i of the identity matrix is the one-hot vector of class i.
        one_hot = np.identity(self.n_classes)[target]
        # Repeat the encoding once per frame along a new first axis.
        return np.array([one_hot] * self.n_frames)

In [None]:
# Dataset preparation: build the event -> frame pipeline, load the train/test
# splits and (optionally) wrap them in an on-disk cache.
debug=False # when True the DiskCachedDataset wrappers below are skipped and the raw datasets are used

dataset_path = '/kaggle/input/create-dvs128gesture-tonic-dataset'
w,h=64,64 # spatial size of the downsampled frames
n_frames=32 # number of time bins (frames) produced per recording

transforms = tonic.transforms.Compose([
    tonic.transforms.Denoise(filter_time=10000), # removes outlier events with inactive surrounding pixels for 10ms
    tonic.transforms.Downsample(sensor_size=tonic.datasets.DVSGesture.sensor_size, target_size=(w,h)), # downsampling image
    tonic.transforms.ToFrame(sensor_size=(w,h,2), n_time_bins=n_frames), # n_frames frames per trail
])

# Labels are expanded to one one-hot vector per frame so that they line up with the per-frame model output.
target_transform = ToOneHotTimeCoding(n_classes=11, n_frames=n_frames)

train = tonic.datasets.DVSGesture(save_to=dataset_path, transform=transforms, target_transform=target_transform, train=True)
test = tonic.datasets.DVSGesture(save_to=dataset_path, transform=transforms, target_transform=target_transform, train=False)

# Unless debugging, wrap both splits in tonic's disk cache (cache directories are given by cache_path).
cached_train = train if debug else tonic.DiskCachedDataset(train, cache_path='/temp/dvsgesture/train')
cached_test = test if debug else tonic.DiskCachedDataset(test, cache_path='/temp/dvsgesture/test')

# Sanity check on the first training sample (read from the uncached dataset):
# print the sensor size and the frame/label shapes, then render the frames as an animation.
frames, labels = train[0]
print (tonic.datasets.DVSGesture.sensor_size, frames.shape, labels.shape)
ani = tonic.utils.plot_animation(frames)
HTML(ani.to_jshtml())

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class DVS128GestureCNN(nn.Module):
    """Frame-wise 2-layer CNN classifier for sequences of event frames.

    Every time bin of a recording is classified independently by the same
    convolutional network.

    The network returns raw, unnormalised class scores (logits).
    ``nn.CrossEntropyLoss`` applies log-softmax internally, so a trailing
    ``nn.Softmax`` layer would normalise the scores twice and weaken the
    gradients. Per-frame ``argmax`` decisions are the same with or without
    the softmax. The removed layer had no parameters and was the last entry
    of ``self.net``, so ``state_dict`` keys are unchanged.
    """

    def __init__(self):
        super().__init__()

        self.net = nn.Sequential(
            nn.Conv2d(in_channels=2, out_channels=64, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(4),
            nn.Dropout(0.4),

            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(4),
            nn.Dropout(0.4),

            nn.Flatten(),
            # For 64x64 frames: conv3 -> 62, pool4 -> 15, conv3 -> 13, pool4 -> 3,
            # hence 128 channels * 3 * 3 = 1152 input features.
            nn.Linear(1152, 1024),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(1024, 11),  # logits; no softmax (see class docstring)
        )

    def forward(self, X): # X is [batch time polarity x-pos y-pos]
        """Classify every frame of every sequence.

        Args:
            X: tensor of shape [batch, n_frames, polarity, x-pos, y-pos].

        Returns:
            Tensor of shape [batch, n_frames, n_classes] holding class logits.
        """
        batch_size, n_steps = X.shape[0], X.shape[1]
        # Frames are classified independently, so fold the time axis into the
        # batch axis and run the network once instead of looping in Python.
        frames = X.flatten(0, 1)  # [batch * n_frames, polarity, x-pos, y-pos]
        scores = self.net(frames)  # [batch * n_frames, n_classes]
        return scores.reshape(batch_size, n_steps, scores.shape[-1])
    
# Instantiate the network on the selected device, together with its loss and optimizer.
model = DVS128GestureCNN().to(device)

# NOTE(review): nn.CrossEntropyLoss applies log-softmax internally and therefore expects raw logits from the model.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters()) # default hyperparameters; explicit lr=0.002, betas=(0.9, 0.999) left disabled

# log traces
loss_hist = [] # training loss, appended once per training iteration
acc_hist = [] # training-batch accuracy, appended once per training iteration
test_acc_hist = [] # test accuracy, appended each time validation runs in the training loop

print(model)

In [None]:
def accuracy(outputs, targets): # both tensors [batch n_frames n_classes]
    """Return the fraction of sequences whose majority-vote class is correct.

    Each frame votes for its highest-scoring class and the most frequent vote
    along the frame axis becomes the prediction for the sequence. The true
    class is read from the first frame of the one-hot ``targets``.
    """
    frame_votes = outputs.argmax(dim=2)  # [batch, n_frames] class index per frame
    predicted = torch.mode(frame_votes, dim=1).values  # most popular class per sequence
    expected = targets.argmax(dim=2)[:, 0]  # one-hot -> int, first frame only
    n_correct = (predicted == expected).sum().item()
    return n_correct / predicted.shape[0]

def validate_model():
    """Return the majority-vote accuracy of ``model`` over ``test_loader``.

    The model is switched to evaluation mode for the duration of the call so
    that dropout is disabled, and gradient tracking is turned off so that no
    autograd graph is built. The previous training/evaluation mode is restored
    on exit, even if evaluation raises.

    Returns:
        Fraction of correctly classified samples, weighted by batch size.
        Returns 0.0 when the loader yields no samples.
    """
    was_training = model.training
    model.eval()  # dropout must be inactive while measuring accuracy
    correct, total = 0, 0
    try:
        with torch.no_grad():
            for data, targets in test_loader:
                # data: [batch, n_frames, polarity, x-pos, y-pos]; targets: [batch, n_frames, n_classes]
                data, targets = data.to(device), targets.to(device)
                outputs = model(data)  # [batch, n_frames, n_classes]
                # accuracy() is a per-batch fraction; weight it by batch size
                # so that a smaller final batch is counted correctly.
                correct += accuracy(outputs, targets) * data.shape[0]
                total += data.shape[0]
    finally:
        model.train(was_training)
    return correct / total if total else 0.0

In [None]:
num_epochs = 500
cnt = 1 # 1-based global iteration counter across all epochs

train_loader = torch.utils.data.DataLoader(cached_train, batch_size=64, shuffle=True, drop_last=True,
                                           collate_fn=tonic.collation.PadTensors(batch_first=True))
# Evaluate on the complete test set in a fixed order. With drop_last=True and
# shuffle=True a random part of the test set would be excluded from every
# evaluation, so the reported test accuracy would vary between runs.
test_loader = torch.utils.data.DataLoader(cached_test, batch_size=32, shuffle=False, drop_last=False,
                                          collate_fn=tonic.collation.PadTensors(batch_first=True))

model.train() # make sure dropout is active while training

for epoch in range(num_epochs):
    for batch, (data, targets) in enumerate(train_loader): # [batch, time, polarity, x-pos, y-pos] [batch, time, oh]
        data, targets = data.to(device), targets.to(device)

        outputs = model(data) # [batch, time, n_classes]

        # CrossEntropyLoss requires (N,C,d1,..dn), so move the class axis to position 1
        loss = loss_fn(outputs.permute(0,2,1), targets.permute(0,2,1))
        loss_hist.append(loss.item())

        # Clear gradients before backward so that leftovers from an interrupted
        # run of this cell cannot leak into the next update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        acc = accuracy(outputs, targets)
        acc_hist.append(acc)

        # Every 100 iterations report progress and log the test accuracy.
        if cnt % 100 == 0:
            print(f"Epoch {epoch}, Iteration {batch} \nTrain Loss: {loss.item():.2f}")
            print(f"Train Accuracy: {acc * 100:.2f}%")
            test_acc = validate_model()
            test_acc_hist.append(test_acc)
            print(f"Test Accuracy: {test_acc * 100:.2f}%\n")
            model.train() # back to training mode in case validation changed it

        cnt+=1

In [None]:
# Plot the training traces: train accuracy, test accuracy and loss.
fig, axes = plt.subplots(1, 3, figsize=(18,4))

axes[0].plot(acc_hist)
axes[0].set_title("Train Set Accuracy")
axes[0].set_xlabel("Iteration")
axes[0].set_ylabel("Accuracy")

# Test accuracy is logged once every 100 training iterations (the
# `cnt % 100 == 0` check in the training loop), so plot it against the
# iteration at which it was measured rather than against its list index.
# That keeps the "Iteration" axis comparable across all three panels.
eval_every = 100
test_iterations = [eval_every * (i + 1) for i in range(len(test_acc_hist))]
axes[1].plot(test_iterations, test_acc_hist)
axes[1].set_title("Test Set Accuracy")
axes[1].set_xlabel("Iteration")
axes[1].set_ylabel("Accuracy")

axes[2].plot(loss_hist)
axes[2].set_title("Loss History")
axes[2].set_xlabel("Iteration")
axes[2].set_ylabel("Loss")

plt.show()

In [None]:
# Test accuracy of the model in its current state, and the best test accuracy logged during training.
validate_model(), np.max(test_acc_hist)