# Tutorial 4 (part 2)

This tutorial classifies images from the RGBD sensor. The first step is to read the labels.txt file. **Please download that file from LEARN and upload it to the robot before running this code.** You also need to download the pretrained model ('alexnet-owt-4df8aa71.pth') and upload it to the robot. The model can be downloaded from the following link: https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth

In [1]:
# Open the txt file and read the object categories, one per line.
labels = []
with open("labels.txt", "r") as f:  # the context manager closes the file, even on error
    for x in f:  # read the txt file line by line
        if not x.strip():
            # A blank line (e.g. a trailing newline at the end of the file) has no
            # ':' separator and would otherwise raise IndexError below.
            continue
        # Keep the text that follows the first ':' on the line. The entry is stored
        # exactly as it appears in the file, including any trailing newline.
        labels.append(x.split(':')[1])  # add it to the list
print(len(labels))

1000


Run the following code to capture image data from the RGBD sensor 

In [2]:
#use traitlets and widgets to display the image in Jupyter Notebook
import traitlets
from traitlets.config.configurable import SingletonConfigurable

#use opencv to convert the depth image to an RGB image for display purposes
import cv2
import numpy as np

#using realsense to capture the color and depth image
import pyrealsense2 as rs

#multi-threading is used so that images can be captured in real time
import threading

predict_id = 0


class Camera(SingletonConfigurable):
    """RGBD camera wrapper that publishes frames from a background thread.

    The constructor configures and starts the RealSense pipeline and stores
    the first color and depth images. ``start()`` launches a thread that keeps
    reading frames, draws rectangles around detected faces and bodies on the
    color image, and stores the results in ``color_value`` / ``depth_value``.
    ``stop()`` ends that thread.

    NOTE: this must be a single class definition. Declaring ``Camera`` a second
    time rebinds the name, so the constructor and the ``color_value`` traitlet
    of the first declaration would be lost.
    """

    # traitlets notifies registered observers whenever this value is reassigned
    color_value = traitlets.Any()

    def __init__(self):
        super(Camera, self).__init__()

        #configure the color and depth sensor
        self.pipeline = rs.pipeline()
        self.configuration = rs.config()

        #set resolution for the color camera
        self.color_width = 640
        self.color_height = 480
        self.color_fps = 30
        self.configuration.enable_stream(rs.stream.color, self.color_width, self.color_height, rs.format.bgr8, self.color_fps)

        #set resolution for the depth camera
        self.depth_width = 640
        self.depth_height = 480
        self.depth_fps = 30
        self.configuration.enable_stream(rs.stream.depth, self.depth_width, self.depth_height, rs.format.z16, self.depth_fps)

        #flag to control the thread
        self.thread_runnning_flag = False

        #start the RGBD sensor
        self.pipeline.start(self.configuration)
        self.pipeline_started = True
        frames = self.pipeline.wait_for_frames()

        #capture the first color image
        color_frame = frames.get_color_frame()
        image = np.asanyarray(color_frame.get_data())
        self.color_value = image

        #capture the first depth image
        self.depth_value = self._depth_to_colormap(frames.get_depth_frame())

    @staticmethod
    def _depth_to_colormap(depth_frame):
        """Turn a depth frame into a JET color-mapped image for display."""
        depth_image = np.asanyarray(depth_frame.get_data())
        return cv2.applyColorMap(cv2.convertScaleAbs(depth_image, alpha=0.03), cv2.COLORMAP_JET)

    def _capture_frames(self):
        """Thread body: read frames until ``thread_runnning_flag`` is cleared.

        Each iteration runs the face and full-body Haar cascades on the color
        image, draws the detections onto it, and publishes the depth image
        followed by the color image.
        """
        # Initialize human detection cascade classifiers
        # NOTE(review): relies on cv2.data.haarcascades being available in the
        # installed OpenCV build - confirm on the robot.
        face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
        fullbody_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_fullbody.xml')

        while self.thread_runnning_flag:
            frames = self.pipeline.wait_for_frames()
            color_frame = frames.get_color_frame()
            depth_frame = frames.get_depth_frame()
            if not color_frame or not depth_frame:
                # skip incomplete frame sets instead of failing on get_data()
                continue
            image = np.asanyarray(color_frame.get_data())

            # Perform human detection
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
            bodies = fullbody_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

            # Draw bounding boxes around detected humans
            for (x, y, w, h) in faces:
                cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 0), 2)
            for (x, y, w, h) in bodies:
                cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

            # Update the depth image first: assigning color_value triggers the
            # traitlets observers, and they may read depth_value.
            self.depth_value = self._depth_to_colormap(depth_frame)
            self.color_value = image

    def start(self): #start the data capture thread
        """Start the capture thread; does nothing if one is already running."""
        if not self.thread_runnning_flag: #only process if no thread is running yet
            self.thread_runnning_flag = True #flag to control the operation of the _capture_frames function
            self.thread = threading.Thread(target=self._capture_frames) #link thread with the function
            self.thread.start() #start the thread

    def stop(self): #stop the data capture thread
        """Ask the capture thread to exit and wait for it to finish."""
        if self.thread_runnning_flag:
            self.thread_runnning_flag = False #exit the while loop in the _capture_frames
            self.thread.join() #wait the exiting of the thread

def bgr8_to_jpeg(value):
    """Encode a numpy image array as JPEG and return the encoded data as bytes.

    The result is what the image widgets expect for display.
    """
    # cv2.imencode returns a (status, buffer) pair; only the buffer is used here
    _status, encoded = cv2.imencode('.jpg', value)
    return bytes(encoded)

# Create the camera object through the singleton accessor, then launch the
# background capture thread so that camera.color_value keeps being refreshed.
camera = Camera.instance()
camera.start() # start capturing the data


Run the following code to perform classification and display the images

In [None]:
#The pytorch platform is used in this tutorial
import torch
import torchvision
import torch.nn as nn
import cv2
import numpy as np

# the following AlexNet model is defined by torchvision
class AlexNet(nn.Module):
    """AlexNet classifier: five conv layers, adaptive pooling, three linear layers.

    The layers are created in a fixed order inside ``nn.Sequential`` containers
    named ``features`` and ``classifier``, so parameter names such as
    ``features.0.weight`` and ``classifier.6.bias`` line up with a pretrained
    state dict.

    Args:
        num_classes: size of the final linear layer's output.
    """

    def __init__(self, num_classes=1000):
        super().__init__()

        # One entry per convolution stage:
        # (in_channels, out_channels, Conv2d keyword arguments, max-pool afterwards?)
        conv_plan = [
            (3, 64, dict(kernel_size=11, stride=4, padding=2), True),
            (64, 192, dict(kernel_size=5, padding=2), True),
            (192, 384, dict(kernel_size=3, padding=1), False),
            (384, 256, dict(kernel_size=3, padding=1), False),
            (256, 256, dict(kernel_size=3, padding=1), True),
        ]
        feature_layers = []
        for in_channels, out_channels, conv_kwargs, pooled in conv_plan:
            feature_layers.append(nn.Conv2d(in_channels, out_channels, **conv_kwargs))
            feature_layers.append(nn.ReLU(inplace=True))
            if pooled:
                feature_layers.append(nn.MaxPool2d(kernel_size=3, stride=2))
        self.features = nn.Sequential(*feature_layers)

        # Pool to a fixed 6x6 map so the classifier input size is always 256*6*6
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))

        hidden_units = 4096
        classifier_layers = [
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, hidden_units),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_units, num_classes),
        ]
        self.classifier = nn.Sequential(*classifier_layers)

    def forward(self, x):
        """Return the class scores for a batch of images ``x``."""
        pooled = self.avgpool(self.features(x))
        # flatten everything except the batch dimension before the linear layers
        return self.classifier(pooled.flatten(1))

# Build the network and load the pretrained weights.
# 'alexnet-owt-4df8aa71.pth' is the pretrained model; it can be downloaded from
# https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth
# and must be placed in the same folder as this file.
model = AlexNet()
# map_location='cpu' lets the checkpoint load regardless of the device it was
# saved from; the model is moved to the target device below.
model.load_state_dict(torch.load('alexnet-owt-4df8aa71.pth', map_location='cpu'))

# Switch to inference mode. Without eval() the nn.Dropout layers in the
# classifier stay active and randomly zero activations, so the predictions
# for the same image would vary from run to run.
model.eval()

#We use the GPU for classification when one is available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


import ipywidgets.widgets as widgets
from IPython.display import display, HTML
import sys

#create the widgets used to display the color and depth images
display_color = widgets.Image(format='jpeg', width='45%') #JPEG image widget for the color image, 45% wide
display_depth = widgets.Image(format='jpeg', width='45%')  #JPEG image widget for the depth image, 45% wide
layout=widgets.Layout(width='100%')

sidebyside = widgets.HBox([display_color, display_depth],layout=layout) #place both widgets in one horizontal box
display(sidebyside) #display the widget

#callback function, invoked when traitlets detects the changing of the color image
def process(change):
    """Refresh both preview widgets from a traitlets change notification.

    ``change['new']`` holds the new color image; the depth image is read from
    ``camera.depth_value``. Both are shrunk to 160x120 and JPEG-encoded.
    """
    preview_size = (160, 120)
    color_image = change['new'] #retrieve data from the input dict
    color_preview = cv2.resize(color_image, preview_size)
    depth_preview = cv2.resize(camera.depth_value, preview_size)
    display_color.value = bgr8_to_jpeg(color_preview)
    display_depth.value = bgr8_to_jpeg(depth_preview)

#to refresh the widgets once by hand: process({'new': camera.color_value})
#camera.observe monitors the color_value variable. Whenever this value changes, the process function is executed.
camera.observe(process, names='color_value')


#the following code will classify the image and measure the time

# The normalization constants are the same for every frame, so the transform is
# built once, outside the loop. They are scaled by 255 because the pixel values
# are fed in without being rescaled first.
mean = 255.0 * np.array([0.485, 0.456, 0.406]) #mean value
stdev = 255.0 * np.array([0.229, 0.224, 0.225]) #for the normalization of the input image
normalize = torchvision.transforms.Normalize(mean, stdev)

while True:
    t1 = cv2.getTickCount()
    imgsized = cv2.resize(camera.color_value, (224, 224)) #resize the image
    x = cv2.cvtColor(imgsized, cv2.COLOR_BGR2RGB) #convert to RGB as required by the model
    x = x.transpose((2, 0, 1)) #switch the image channels to channel-first order
    x = torch.from_numpy(x).float() #convert to type float
    x = normalize(x)
    x = x.to(device) #send the data to the device the model lives on
    x = x[None, ...] #add a batch dimension; the model takes a batch of images and the batch size is 1
    with torch.no_grad(): #inference only: do not record operations for autograd
        output = model(x) #classify the image
    predict_id = output.max(1, keepdim=True)[1].item() #get the label
    print('id ', predict_id," prediction time ", (cv2.getTickCount()-t1)/cv2.getTickFrequency())


HBox(children=(Image(value=b'', format='jpeg', width='45%'), Image(value=b'', format='jpeg', width='45%')), la…

id  256  prediction time  12.681661722
id  527  prediction time  0.105745472
id  747  prediction time  0.112847599
id  527  prediction time  0.068712513
id  747  prediction time  0.073490875
id  799  prediction time  0.069631508
id  422  prediction time  0.067404565
id  527  prediction time  0.071383843
id  598  prediction time  0.064851126
id  598  prediction time  0.065635022
id  799  prediction time  0.054757317
id  527  prediction time  0.068927838
id  799  prediction time  0.091001991
id  527  prediction time  0.044868245
id  799  prediction time  0.048860686
id  799  prediction time  0.070471982
id  799  prediction time  0.067426506
id  799  prediction time  0.065014041
id  527  prediction time  0.053237913
id  553  prediction time  0.041823289
id  747  prediction time  0.070320837
id  569  prediction time  0.051445387
id  527  prediction time  0.039274926
id  527  prediction time  0.050431282
id  707  prediction time  0.048493548
id  799  prediction time  0.06003266
id  799  pre

id  705  prediction time  0.047175201
id  414  prediction time  0.089334288
id  414  prediction time  0.069367081
id  920  prediction time  0.068102975
id  199  prediction time  0.061888121
id  747  prediction time  0.057045601
id  834  prediction time  0.045853907
id  583  prediction time  0.057991689
id  799  prediction time  0.061817496
id  505  prediction time  0.04793582
id  799  prediction time  0.053143337
id  569  prediction time  0.057900127
id  882  prediction time  0.050394345
id  747  prediction time  0.062547101
id  882  prediction time  0.06633791
id  598  prediction time  0.055553604
id  422  prediction time  0.059240247
id  747  prediction time  0.041050468
id  799  prediction time  0.054511475
id  598  prediction time  0.037564656
id  527  prediction time  0.056835784
id  553  prediction time  0.059212331
id  527  prediction time  0.060766591
id  527  prediction time  0.053448357
id  747  prediction time  0.053159244
id  799  prediction time  0.05614459
id  747  predic

id  508  prediction time  0.049663584
id  508  prediction time  0.041151997
id  508  prediction time  0.051775244
id  508  prediction time  0.047528024
id  508  prediction time  0.054996077
id  508  prediction time  0.042996636
id  508  prediction time  0.037824101
id  673  prediction time  0.05260046
id  508  prediction time  0.034986036
id  508  prediction time  0.041566068
id  508  prediction time  0.042361742
id  508  prediction time  0.049907135
id  508  prediction time  0.040441904
id  508  prediction time  0.046891781
id  508  prediction time  0.044038456
id  508  prediction time  0.052491658
id  508  prediction time  0.039414095
id  508  prediction time  0.038431182
id  508  prediction time  0.042943459
id  508  prediction time  0.047409435
id  508  prediction time  0.036138793
id  508  prediction time  0.035377181
id  508  prediction time  0.042220857
id  508  prediction time  0.044339183
id  508  prediction time  0.042306274
id  508  prediction time  0.039461126
id  508  pred

id  508  prediction time  0.045071171
id  508  prediction time  0.043742852
id  508  prediction time  0.041251778
id  508  prediction time  0.046835075
id  508  prediction time  0.038600036
id  508  prediction time  0.041245265
id  508  prediction time  0.042200152
id  508  prediction time  0.049412996
id  508  prediction time  0.042492427
id  508  prediction time  0.032444741
id  508  prediction time  0.037021438
id  508  prediction time  0.038676811
id  508  prediction time  0.044647356
id  508  prediction time  0.040642577
id  508  prediction time  0.040353179
id  508  prediction time  0.045926174
id  508  prediction time  0.037701144
id  508  prediction time  0.044179527
id  508  prediction time  0.043054082
id  508  prediction time  0.047824109
id  508  prediction time  0.036557569
id  508  prediction time  0.041660546
id  508  prediction time  0.044277156
id  508  prediction time  0.041232832
id  508  prediction time  0.045199164
id  508  prediction time  0.043335143
id  508  pre

[1 1 1]
[[1 1 1]]
