In [35]:
import cv2
import mediapipe as mp
import numpy as np
import time 
import pyrealsense2 as rs
import math as m
import matplotlib.pyplot as plt
from PIL import Image
import time
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn.functional as F

In [36]:
def convert_landmark_to_array(landmarks):
    landmarks_array = np.array(
        [[landmark.x, landmark.y, landmark.z] for landmark in landmarks])
    return landmarks_array

## Load the trained model

In [37]:
class ConvNet(nn.Module):
    def __init__(self, n_input_channels=3, n_output=4):
        super().__init__()
    
        # input = 75x20x3
        self.conv1 = nn.Conv2d(n_input_channels, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        
        # input = 37x10x32
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        
        # input = 18x5x64
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        
        # input = 9x2x128
        self.fc1 = nn.Linear(128*9*2, 512)
        self.fc2 = nn.Linear(512, n_output)

    
    def forward(self, x):
        
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.max_pool2d(x, 2) #maxpool of kernel size 2 to reduce the size of the images by a factor 2
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.max_pool2d(x, 2)
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.max_pool2d(x, 2)
        x = x.view(-1, 128*9*2) #flatten
        x = F.relu(self.fc1(x))
        x = self.fc2(x)

        return x
    
    def predict(self, x):
        logits = self.forward(x)
        return F.softmax(logits)

In [38]:
torch.cuda.is_available()

True

In [46]:
CHECKPOINT = "trained_model.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = ConvNet().to(device)
model.load_state_dict(torch.load(CHECKPOINT))
model.eval()

# useful for normalization
dataset_mean = np.load("dataset_mean.npy")
dataset_std = np.load("dataset_std.npy")

cuda


In [47]:
dataset_mean

array([ 0.50957664,  0.58962419, -0.06745016])

In [48]:
def normalize_data(data):
    new_data = np.zeros(data.shape)
    new_data[:,:,0] = (data[:,:,0] - dataset_mean[0])/dataset_std[0]
    new_data[:,:,1] = (data[:,:,1] - dataset_mean[1])/dataset_std[1]
    new_data[:,:,2] = (data[:,:,2] - dataset_mean[2])/dataset_std[2]
    return new_data


In [49]:
label_dict = {"come":0,"lie down":1,"sit":2,"stay":3}
inv_label_dict = {v: k for k, v in label_dict.items()}

In [55]:
new_frame_time = 0
prev_frame_time = 0

predicted_class_name = "hee"


# could find a way of finding this value automatically
frames_by_sec = 20 - 1
n_frames = 0
landmark_image = np.empty((75,3))

#hollistic detects posiiton, face and hand landmarks
mp_holistic = mp.solutions.holistic
holistic_model = mp_holistic.Holistic(
    model_complexity=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

moves = ["sit", "lie down", "stay", "come"]


n_samples = 30 # number of times we will record each move
current_sample = 0 # current sample we are recording
current_move = 0

cap = cv2.VideoCapture(0)
cv2.namedWindow("image", cv2.WINDOW_NORMAL)
while cap.isOpened():
    success, image = cap.read()
    if not success:
      print("Ignoring empty camera frame.")
      # If loading a video, use 'break' instead of 'continue'.
      continue

    n_frames += 1

    if n_frames > 20:
      if n_frames % 20 == 0:
        # convert to tensor
        data = landmark_image[:,:,-20:]
        data = np.transpose(data, (0, 2, 1))
        data = np.transpose(data, (2, 0, 1))
        data = np.nan_to_num(data)
        data = np.clip(data, -10, 10)
        # print("before normalization")
        # print(data)
        data = normalize_data(data)
        # print("after normalization")
        # print(data)
        data = torch.tensor(data).float() 
        # print(data.shape)
        # data = data.transpose(1,0,2)
        data = torch.unsqueeze(data, 0)
        # print(data.shape)
        data = data.to(device)

        # predict
        output = model.predict(data)
        print(f"frame {n_frames}")
        print(output)
        # get the predicted class
        if torch.max(output) > 0.6:
          predicted_class = torch.argmax(output, dim=1)
          predicted_class_name = inv_label_dict[int(predicted_class)]

    # compute fps
    new_frame_time = time.time()
    fps = 1/(new_frame_time-prev_frame_time)
    prev_frame_time = new_frame_time

    # compute keypoints etc...
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    image.flags.writeable = False
    results = holistic_model.process(image)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # convert landmarks to array with exception handling (zero array if not detected)
    try:
      pose_landmarks = convert_landmark_to_array(results.pose_landmarks.landmark)
    except:
      #  print("no pose landmarks detected")
       pose_landmarks = np.zeros((33,3))
    try:
      left_hand_landmarks = convert_landmark_to_array(results.left_hand_landmarks.landmark)
    except:
      # print("no left hand landmarks detected")
      left_hand_landmarks = np.zeros((21,3))
    try:
      right_hand_landmarks = convert_landmark_to_array(results.right_hand_landmarks.landmark)
    except:
      # print("no right hand landmarks detected")
      right_hand_landmarks = np.zeros((21,3))

    hands_and_pose_array = np.concatenate((pose_landmarks, left_hand_landmarks, right_hand_landmarks), axis=0)
    landmark_image = np.dstack((landmark_image, hands_and_pose_array))

    # Drawing the annotiations
    image.flags.writeable = True
    
    # Drawing Pose Landmarks
    mp_drawing.draw_landmarks(
        image,
        results.pose_landmarks,
        mp_holistic.POSE_CONNECTIONS,
        landmark_drawing_spec=mp_drawing_styles.get_default_pose_landmarks_style())
    
    # Drawing Right hand Landmarks
    mp_drawing.draw_landmarks(
      image,
      results.right_hand_landmarks,
      mp_holistic.HAND_CONNECTIONS
    )
 
    # Drawing Left hand Landmarks
    mp_drawing.draw_landmarks(
      image,
      results.left_hand_landmarks,
      mp_holistic.HAND_CONNECTIONS
    )

    fps = str(int(fps))
    image = cv2.flip(image,1)
    cv2.putText(image, fps, (7, 70), 1, 3, (100, 255, 0), 3, cv2.LINE_AA)
    cv2.putText(image, predicted_class_name, (7, 100), 1, 3, (100, 255, 0), 3, cv2.LINE_AA)
    cv2.imshow('image', image)
    c = cv2.waitKey(1)
    if c == 27: # press escape to quit
        break
cap.release()
cv2.destroyAllWindows()

RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [25]:
print(landmark_image[:,:,40:60])

[[[ 0.59088641  0.59006107  0.5900647  ...  0.52256835  0.51002502
    0.50510854]
  [ 0.21145545  0.21115653  0.21099502 ... -0.20798609 -0.14849935
   -0.12320651]
  [-2.040483   -2.04355097 -2.01860261 ... -0.75773156 -0.56069994
   -0.59763104]]

 [[ 0.64257628  0.64151841  0.64150804 ...  0.54083884  0.52911341
    0.52235234]
  [ 0.11997752  0.11982448  0.11897407 ... -0.24453324 -0.18421206
   -0.16392471]
  [-2.00935268 -2.01289988 -1.98904061 ... -0.77320743 -0.55757856
   -0.58639228]]

 [[ 0.67216563  0.67118281  0.67122722 ...  0.55443913  0.54195845
    0.53246504]
  [ 0.1141978   0.11400199  0.11259723 ... -0.23271029 -0.17502123
   -0.15437298]
  [-2.00990438 -2.0134387  -1.98953438 ... -0.77323598 -0.55775183
   -0.58658379]]

 ...

 [[ 0.          0.          0.         ...  0.          0.
    0.1895048 ]
  [ 0.          0.          0.         ...  0.          0.
    1.04024053]
  [ 0.          0.          0.         ...  0.          0.
    0.00818705]]

 [[ 0.        

In [16]:

torch.set_printoptions(threshold=10_000)

In [17]:
cap.release()
cv2.destroyAllWindows()