# Apply prediction on skeleton

The goal here is to use the trained model on some skeletons Idemia gave us.
Based on demo_from_3D.py.

In [2]:
import copy
import cv2
import numpy as np
import os

from torch.utils.data import Dataset

import torch
import torch.nn as nn

In [3]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"device: {device}")

device: cuda


## Constants

In [4]:
# (action id, action name) pairs for the actions the model distinguishes.
# The ids are zero-based: the action number found in the data file names is
# this value plus one (e.g. id 5 <-> action 6, "pickup"). The position of a
# pair in this table is the class label (0..8).
_class_table = [
    (5, 'pickup'),
    (6, 'throw'),
    (7, 'sitting down'),
    (8, 'standing up (from sitting position)'),
    (14, 'take off jacket'),
    (24, 'reach into pocket'),
    (30, 'pointing to something with finger'),
    (32, 'check time (from watch)'),
    (42, 'falling'),
]
classes = [action_id for action_id, _ in _class_table]
classes_names = [action_name for _, action_name in _class_table]
print(classes_names)
print(len(classes_names))

# Project layout: the data and the saved models live next to the folder the
# notebook is run from, i.e. in the parent of the current working directory.
root_dir = os.path.dirname(os.getcwd())
# Both directory strings keep their trailing "/" because file names are
# appended to them by plain string concatenation.
data_dir = f"{root_dir}/data/"
data_skeleton_dir = f"{data_dir}mmpose_ntu_3d_sample/"
# os.listdir returns names in arbitrary order, so positions in this list are
# not guaranteed to be the same from one machine to another.
data_skeleton_files = os.listdir(data_skeleton_dir)

models_saved = os.path.join(root_dir, 'models_saved')


['pickup', 'throw', 'sitting down', 'standing up (from sitting position)', 'take off jacket', 'reach into pocket', 'pointing to something with finger', 'check time (from watch)', 'falling']
9


## Dataset loader for pytorch

In [5]:
class HumanActionDataset(Dataset):
    """Skeleton sequences stored as ``.npy`` files, restricted to a set of actions.

    The action number is read from characters ``[17:-4]`` of each file name
    (presumably names such as ``S001C001P001R001A006.npy`` -- confirm against
    the data folder). It is 1-based in the file name and converted to the
    0-based ids used in ``classes``.

    Args:
        data_dir: Folder containing the ``.npy`` files, with or without a
            trailing path separator.
        data_files: File names inside ``data_dir``. Names whose action number
            cannot be parsed, or whose action is not in ``classes``, are
            skipped instead of raising.
        classes: 0-based action ids to keep. The position of an id in this
            list is the label returned by ``__getitem__``.
        with_depth: Stored on the instance; currently not used.
    """

    def __init__(self, data_dir, data_files, classes, with_depth=True):
        self.data_dir = data_dir
        self.classes = classes
        self.with_depth = with_depth
        self.data_files = [
            data_file for data_file in data_files
            if self._action_id(data_file) in classes
        ]

    @staticmethod
    def _action_id(file_name):
        """Return the 0-based action id encoded in ``file_name``, or None."""
        try:
            return int(file_name[17:-4]) - 1
        except ValueError:
            # Not a skeleton file (e.g. a checkpoint folder or hidden file).
            return None

    def __len__(self):
        """Return the number of files kept after filtering by action."""
        return len(self.data_files)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(tensor, label)``.

        ``tensor`` holds the content of the ``.npy`` file as a float tensor and
        ``label`` is the index of the sample's action id within ``classes``.
        """
        file_name = self.data_files[idx]
        # os.path.join works whether or not data_dir ends with a separator.
        tensor = torch.Tensor(np.load(os.path.join(self.data_dir, file_name)))
        label = self.classes.index(self._action_id(file_name))
        return (tensor, label)

In [6]:
# Dataset over the sample skeleton files, limited to the selected actions.
HAD = HumanActionDataset(
    data_dir=data_skeleton_dir,
    data_files=data_skeleton_files,
    classes=classes,
)

## Model Loading

In [7]:
# Softmax over dimension 1 of a score tensor (not used in the cells shown here).
sm = nn.Softmax(dim=1).to(device)
# Placeholders for the recurrent state; None makes the model create its own.
h_n = c_n = None

class LSTMHA(nn.Module):
    """LSTM encoder followed by a small fully-connected classifier.

    The classifier is applied to the final hidden state of the last LSTM
    layer, so one score vector is produced per sequence in the batch.

    Args:
        nb_classes: Number of output classes.
        input_size: Number of features per time step.
        hidden_size_lstm: Size of the LSTM hidden state.
        hidden_size_classifier: Width of the classifier's hidden layer.
        num_layers: Number of stacked LSTM layers.
        device: Device the model is meant to run on (stored as ``self.device``).
    """

    def __init__(self, nb_classes, input_size, hidden_size_lstm, hidden_size_classifier, num_layers, device):
        super().__init__()

        self.num_classes = nb_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size_lstm
        self.device = device

        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size_lstm, num_layers=num_layers, batch_first=True)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size_lstm, hidden_size_classifier),
            nn.ReLU(),
            nn.Linear(hidden_size_classifier, nb_classes)
        )

    def _random_state(self, x):
        """Return a uniform-random (num_layers, batch, hidden) state matching ``x``."""
        return torch.rand(
            self.num_layers, x.size(0), self.hidden_size,
            device=x.device, dtype=x.dtype,
        )

    def forward(self, x, h_0=None, c_0=None):
        """Run the LSTM over ``x`` and classify its final hidden state.

        Args:
            x: Input of shape (batch, seq_len, input_size).
            h_0: Optional initial hidden state (short-term memory) of shape
                (num_layers, batch, hidden_size). Drawn uniformly at random
                when omitted.
            c_0: Optional initial cell state (long-term memory), same shape
                and default as ``h_0``. Each state is defaulted on its own,
                so supplying only one of them is valid.

        Returns:
            Tuple ``(results, h_n, c_n)``: class scores of shape
            (batch, nb_classes) and the final hidden and cell states.
        """
        if h_0 is None:
            h_0 = self._random_state(x)
        if c_0 is None:
            c_0 = self._random_state(x)
        _, (h_n, c_n) = self.lstm(x, (h_0, c_0))
        # h_n[-1] is the last layer's final hidden state: (batch, hidden_size).
        results = self.classifier(h_n[-1])
        return results, h_n, c_n

In [8]:
# Rebuild the network with the same layer sizes as the saved checkpoint
# (load_state_dict requires matching shapes), then restore its weights.
model_LSTMHA = LSTMHA(
    nb_classes=len(classes),
    input_size=3*17,  # 17 keypoints x 3 values per frame, flattened
    hidden_size_lstm=256,
    hidden_size_classifier=128,
    num_layers=1,
    device=device,
)
model_LSTMHA.to(device)
# map_location lets a checkpoint saved from a GPU load on a CPU-only machine,
# which the device fallback above explicitly allows.
state_dict = torch.load(os.path.join(models_saved, "LSTM03D_mmpose.pt"), map_location=device)
model_LSTMHA.load_state_dict(state_dict)
# Switch to inference mode; as the last expression this also displays the model.
model_LSTMHA.eval()

LSTMHA(
  (lstm): LSTM(51, 256, batch_first=True)
  (classifier): Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=9, bias=True)
  )
)

What input size does the model expect again?

In [9]:
# Number of features the LSTM expects per time step (3*17 = 51, as passed to the constructor).
model_LSTMHA.input_size

51

### Applying the model

In demo_from_3D.py there is already a window that displays the results.
Here we are not interested in that: we want the final output of the model for the skeleton data extracted from a video.

In [10]:
# Fetch the first sample of the dataset (skeleton sequence tensor and class label)
# and display the sequence shape.
sequence, label = HAD[0]
sequence.shape

torch.Size([74, 17, 3])

In [11]:
# Flatten each frame's (17, 3) values into one 51-value feature vector.
# The frame count is read from the sample instead of being hard-coded, so the
# cell works for sequences of any length.
# NOTE(review): the LSTM is built with batch_first=True, so this layout is
# (batch=n_frames, seq_len=1, features): every frame is fed as its own
# one-step sequence and the model returns one prediction per frame. For a
# single prediction over the whole clip, use
# sequence.reshape(1, n_frames, 3*17) instead.
n_frames = sequence.shape[0]
x = sequence.reshape(n_frames, 1, 3*17).to(device)
h_n, c_n = None, None
with torch.no_grad():  # inference only: no need to track gradients
    output, h_n, c_n = model_LSTMHA(x, h_n, c_n)

In [31]:
# Shape of the class scores returned by the model.
output.shape

torch.Size([74, 9])

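We notice that we get one prediction for each frame fed into the model.\
But why is that? Does the prediction take into account what happened so far?\
Because the LSTM is created with `batch_first=True`, an input of shape (74, 1, 51) is read as a batch of 74 independent sequences of length 1. Each prediction therefore only sees its own frame (plus a random initial state), not the preceding frames. Reshaping the input to (1, 74, 51) would feed the clip as a single 74-step sequence and give one prediction for the whole video.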

In [12]:
# argmax() without a dim argument indexes into the flattened tensor, so this is
# the position of the single largest score across all rows and all classes.
prediction_arg = output.argmax().item()
output.shape

torch.Size([74, 9])

In [30]:
# prediction_arg indexes the flattened (rows, classes) score tensor; taking the
# remainder by the number of classes recovers the class column. The class count
# comes from classes_names rather than a hard-coded 9.
predicted_class = prediction_arg % len(classes_names)
predicted_class, classes_names[predicted_class], label, classes_names[label]

(8, 'falling', 0, 'pickup')

The "highest" (most confident) prediction often does not match the true label.