In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset archive, split CSVs and model directory used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
! unzip /content/drive/MyDrive/'Colab Notebooks'/HandGestures/imgs.zip

Archive:  /content/drive/MyDrive/Colab Notebooks/HandGestures/imgs.zip
replace pictures/Ok/frame_04_07_0106.png? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [3]:
import cv2
import numpy as np
import pandas as pd

import torch 
import torch.nn as nn
import torch.optim
from torchvision import datasets, transforms

In [4]:
# Pick the compute device once; later cells read `device` (and `train_on_gpu`).
train_on_gpu = torch.cuda.is_available()
device = 'cuda' if train_on_gpu else 'cpu'

if train_on_gpu:
    print('CUDA is available!  Training on GPU ...')
else:
    print('CUDA is not available.  Training on CPU ...')

CUDA is available!  Training on GPU ...


In [5]:
# Echo the installed PyTorch version so it is recorded in the cell output.
torch.__version__

'1.8.1+cu101'

In [6]:
# Preprocessing pipeline shared by every dataset split: resize, then normalise
# each of the three channels with the given per-channel mean / std.
#
# The torchvision module is imported under a private alias because this cell
# rebinds the name `transforms` to the pipeline object (later cells pass it as
# `transforms=transforms`). Without the alias the cell only works once: on a
# second run `transforms.Compose` would be looked up on the pipeline itself.
from torchvision import transforms as _tv_transforms

transforms = _tv_transforms.Compose([
    _tv_transforms.Resize(224),
    _tv_transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

In [7]:
# Maps the text label stored in the split CSVs to the integer class index.
labels = {
    "ThumbUp": 0,
    "ThumbDown": 1,
    "Ok": 2,
    "Palm": 3
}

class HandGesturesDataset(torch.utils.data.Dataset):
  """Dataset of gesture images listed in a CSV file.

  Each CSV row names an image file and its text label. Items are returned as
  ``{"img": tensor, "label": int}`` where the tensor is channel-first float64
  and the label is looked up in ``text2lbl``.

  Args:
    csv_file: Path of the CSV file to read with ``pandas.read_csv``.
    file_col: Name of the column holding the image path.
    lbl_col: Name of the column holding the text label.
    text2lbl: Mapping from text label to integer class index.
    transforms: Optional callable applied to the image tensor.
  """

  def __init__(self, csv_file, file_col, lbl_col, text2lbl, transforms=None):
    self.csv = pd.read_csv(csv_file)
    self.file_col = file_col
    self.lbl_col = lbl_col
    self.labels = text2lbl
    self.transforms = transforms

  def __len__(self):
    """Number of rows (samples) in the CSV."""
    return len(self.csv)

  def __getitem__(self, idx):
    """Load, reorder and transform the image of row ``idx``.

    Raises:
      FileNotFoundError: if the image file cannot be read.
    """
    # Fetch the row once instead of re-indexing the frame for every field.
    row = self.csv.loc[idx]
    path = row[self.file_col]
    label = row[self.lbl_col]

    img = cv2.imread(path)
    if img is None:
      # Fail with the offending path instead of an AttributeError on None.
      raise FileNotFoundError(f"Could not read image file: {path!r}")

    # Channels always move to the front. For labels containing "Thumb" the two
    # spatial axes are additionally swapped.
    # NOTE(review): presumably the Thumb images are stored transposed; the
    # label-dependent orientation could also leak class information — confirm.
    axes = (2, 1, 0) if "Thumb" in label else (2, 0, 1)

    # np.float64 is what the removed `np.float` alias used to resolve to.
    img = torch.from_numpy(img.astype(np.float64).transpose(*axes))

    # NOTE(review): cv2 images are normally BGR with 0-255 values, while the
    # Normalize statistics used in this notebook look like the ImageNet
    # RGB / [0, 1] ones — confirm whether BGR->RGB and /255 scaling are wanted.
    if self.transforms:
      img = self.transforms(img)

    return {"img": img, "label": self.labels[label]}

In [8]:
# One dataset per pre-computed split. All three share the same column names,
# label mapping and preprocessing pipeline; only the CSV path differs.
train_ds = HandGesturesDataset(
    csv_file="/content/drive/MyDrive/Colab Notebooks/HandGestures/train_split.csv",
    file_col="file", lbl_col="label", text2lbl=labels, transforms=transforms,
)
valid_ds = HandGesturesDataset(
    csv_file="/content/drive/MyDrive/Colab Notebooks/HandGestures/valid_split.csv",
    file_col="file", lbl_col="label", text2lbl=labels, transforms=transforms,
)
test_ds = HandGesturesDataset(
    csv_file="/content/drive/MyDrive/Colab Notebooks/HandGestures/test_split.csv",
    file_col="file", lbl_col="label", text2lbl=labels, transforms=transforms,
)

In [9]:
# Batches of 16 for every split. Only the training loader is shuffled: the
# validation and test metrics do not depend on sample order, and a fixed order
# keeps evaluation runs reproducible and comparable.
trainloader = torch.utils.data.DataLoader(train_ds, batch_size=16, shuffle=True)
validloader = torch.utils.data.DataLoader(valid_ds, batch_size=16, shuffle=False)
testloader = torch.utils.data.DataLoader(test_ds, batch_size=16, shuffle=False)

In [10]:
def set_parameter_requires_grad(model, extract_features):
    """Freeze every parameter of ``model`` when ``extract_features`` is truthy.

    With a falsy ``extract_features`` the model is left untouched. The change
    is made in place and nothing is returned.
    """
    if not extract_features:
        return
    for weight in model.parameters():
        weight.requires_grad_(False)

In [11]:
from torchvision.models import resnet18, vgg16, inception_v3, densenet121, resnet50

def load_and_initalize(model_name, num_classes, extract_features=True):
  """Load a pretrained torchvision model and give it a new classifier head.

  Args:
    model_name: One of "resnet18", "resnet50" or "inception_v3".
    num_classes: Number of outputs of the replacement ``fc`` layer.
    extract_features: When truthy, all pretrained parameters are frozen
      before the head is replaced, so only the new head is trainable.

  Returns:
    ``(model, input_size)`` where ``input_size`` is the square input
    resolution associated with the architecture.

  Raises:
    ValueError: if ``model_name`` is not one of the supported names.
  """
  # Input resolution per architecture (previously only set for resnet18, so
  # the other models reported 0).
  input_sizes = {"resnet18": 224, "resnet50": 224, "inception_v3": 299}
  if model_name not in input_sizes:
    # Fail here rather than handing back a `None` model that crashes later.
    raise ValueError(
        f"Unsupported model_name {model_name!r}; expected one of {sorted(input_sizes)}")

  builders = {"resnet18": resnet18, "resnet50": resnet50, "inception_v3": inception_v3}
  model = builders[model_name](pretrained=True)
  set_parameter_requires_grad(model, extract_features)

  # The new head is created after freezing, so it keeps requires_grad=True.
  model.fc = nn.Linear(model.fc.in_features, num_classes)

  # Inception v3 also carries an auxiliary classifier; resize it as well so
  # both outputs agree on the number of classes.
  aux_head = getattr(model, "AuxLogits", None)
  if aux_head is not None:
    aux_head.fc = nn.Linear(aux_head.fc.in_features, num_classes)

  return model, input_sizes[model_name]


In [12]:
# ResNet-50 with a fresh 4-way head. extract_features keeps its default, so
# the pretrained backbone is frozen.
model, img_size = load_and_initalize(model_name="resnet50", num_classes=4)
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [13]:
def train_model(model, train_dl, optimizer, loss_fn, valid_dl = None, epochs=10):
  """Train ``model`` for ``epochs`` epochs, printing the mean loss per epoch.

  Args:
    model: Module to train; moved to the module-level ``device``.
    train_dl: Iterable of batches, each a dict with an "img" tensor and a
      "label" tensor.
    optimizer: Optimizer already bound to the parameters to update.
    loss_fn: Callable ``loss_fn(output, target)`` returning a scalar loss.
    valid_dl: Optional iterable of batches in the same format; when given,
      the mean validation loss is printed after every epoch.
    epochs: Number of passes over ``train_dl``.

  Returns:
    None. When ``valid_dl`` is given the model is left in eval mode.
  """
  model.to(device)

  for ep in range(1, epochs + 1):
    train_loss, val_loss = [], []
    print("---" * 3, "Epoch", ep, "---" * 3)

    model.train()
    for batch in train_dl:
      optimizer.zero_grad()
      img = batch["img"].float().to(device)
      out = model(img)
      loss = loss_fn(out, batch["label"].to(device))
      train_loss.append(loss.item())
      loss.backward()
      optimizer.step()
    print("Training Loss: ", np.mean(train_loss))

    if valid_dl is not None:
      model.eval()
      # No gradients are needed for validation; disabling autograd avoids
      # building a graph for every validation batch.
      with torch.no_grad():
        for batch in valid_dl:
          img = batch["img"].float().to(device)
          out = model(img)
          loss = loss_fn(out, batch["label"].to(device))
          val_loss.append(loss.item())
      print("Validation Loss: ", np.mean(val_loss))

In [14]:
# Echo the model so its architecture is shown as this cell's output.
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [15]:
# Cross-entropy over the class logits, optimised with plain SGD at lr=1e-4.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-04)

# 15 epochs, reporting the validation loss after each one.
train_model(model, trainloader, optimizer, loss_fn, valid_dl=validloader, epochs=15)

--------- Epoch 1 ---------
Training Loss:  1.337630818198035
Validation Loss:  1.3130923980890319
--------- Epoch 2 ---------
Training Loss:  1.287784727844032
Validation Loss:  1.2653177671654279
--------- Epoch 3 ---------
Training Loss:  1.245514762652171
Validation Loss:  1.2196502103361972
--------- Epoch 4 ---------
Training Loss:  1.2045076970939521
Validation Loss:  1.1822512260703153
--------- Epoch 5 ---------
Training Loss:  1.1677550005841184
Validation Loss:  1.1434894351996192
--------- Epoch 6 ---------
Training Loss:  1.1334437443329408
Validation Loss:  1.1051288796949756
--------- Epoch 7 ---------
Training Loss:  1.096497000337721
Validation Loss:  1.0686312410258507
--------- Epoch 8 ---------
Training Loss:  1.0680145172027495
Validation Loss:  1.0360372126564499
--------- Epoch 9 ---------
Training Loss:  1.0376985993113246
Validation Loss:  1.0079535329064657
--------- Epoch 10 ---------
Training Loss:  1.008194731162475
Validation Loss:  0.975762590881466
-----

In [16]:
def test_model(model, testloader):
  model.eval()
  y_true, y_pred = [], []
  for batch in testloader:
    img = batch["img"].float().to(device)
    out = model(img)
    y_true.extend(batch["label"])
    probabilities = torch.softmax(model(img), dim=1)
    predicted_class = torch.argmax(probabilities, dim=1)
    y_pred.extend(predicted_class.cpu())
  return y_true, y_pred

In [17]:
# Evaluate on the test loader and print per-class precision / recall / F1.
# NOTE(review): this cell carried "TODO: We need to split the data into a
# train/validation/test set instead of just a training set". Separate train,
# valid and test split CSVs are loaded earlier in the notebook, so the TODO
# looks stale — confirm and drop.
from sklearn.metrics import classification_report

y_true, y_pred = test_model(model, testloader)
report_text = classification_report(y_true, y_pred, digits=3)
print(report_text)

              precision    recall  f1-score   support

           0      0.897     0.984     0.938        62
           1      0.981     1.000     0.990        52
           2      1.000     0.642     0.782        53
           3      0.808     0.967     0.881        61

    accuracy                          0.904       228
   macro avg      0.922     0.898     0.898       228
weighted avg      0.916     0.904     0.898       228



In [18]:
# Rows are true classes, columns are predicted classes; the matrix is shown
# as the cell's output.
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_true, y_pred=y_pred)

array([[61,  0,  0,  1],
       [ 0, 52,  0,  0],
       [ 5,  1, 34, 13],
       [ 2,  0,  0, 59]])

In [19]:
# Persist only the learned weights (state_dict), not the module object itself.
weights_path = "/content/drive/MyDrive/Colab Notebooks/HandGestures/models/resnet50_leapmotion.pkl"
torch.save(model.state_dict(), weights_path)