# Google Colab: Access Webcam for Images and Video
This notebook will go through how to access and run code on images and video taken using your webcam.  

For this purpose of this tutorial we will be using OpenCV's Haar Cascade to do face detection on our Webcam image and video.

In [11]:
# import dependencies
from IPython.display import display, Javascript, Image
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import cv2
import numpy as np
import PIL
import io
import html
import time
!pip install face_recognition
import face_recognition

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, \
Dropout, BatchNormalization

import pathlib
from pathlib import Path
from google.colab import drive
drive.mount('/content/drive')
from tensorflow.keras.applications.vgg19 import preprocess_input

!pip install ultralytics
from ultralytics import YOLO



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Helper Functions
Below are a few helper function to make converting between different image data types and formats. 

In [12]:
# function to convert the JavaScript object into an OpenCV image
def js_to_image(js_reply):
  """
  Params:
          js_reply: JavaScript object containing image from webcam
  Returns:
          img: OpenCV BGR image
  """
  # decode base64 image
  image_bytes = b64decode(js_reply.split(',')[1])
  # convert bytes to numpy array
  jpg_as_np = np.frombuffer(image_bytes, dtype=np.uint8)
  # decode numpy array into OpenCV BGR image
  img = cv2.imdecode(jpg_as_np, flags=1)

  return img

# function to convert OpenCV Rectangle bounding box image into base64 byte string to be overlayed on video stream
def bbox_to_bytes(bbox_array):
  """
  Params:
          bbox_array: Numpy array (pixels) containing rectangle to overlay on video stream.
  Returns:
        bytes: Base64 image byte string
  """
  # convert array into PIL image
  bbox_PIL = PIL.Image.fromarray(bbox_array, 'RGBA')
  iobuf = io.BytesIO()
  # format bbox into png for return
  bbox_PIL.save(iobuf, format='png')
  # format return string
  bbox_bytes = 'data:image/png;base64,{}'.format((str(b64encode(iobuf.getvalue()), 'utf-8')))

  return bbox_bytes

## Haar Cascade Classifier
For this tutorial we will run a simple object detection algorithm called Haar Cascade on our images and video fetched from our webcam. OpenCV has a pre-trained Haar Cascade face detection model. 

In [13]:
# initialize the Haar Cascade face detection model
# face_cascade = cv2.CascadeClassifier(cv2.samples.findFile(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'))

In [14]:
# JavaScript to properly create our live video stream using our webcam as input
def video_stream():
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;
    
    var pendingResolve = null;
    var shutdown = false;
    
    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }
    
    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }
    
    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);
      
      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);
           
      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);
      
      const instruction = document.createElement('div');
      instruction.innerHTML = 
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };
      
      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640; //video.videoWidth;
      captureCanvas.height = 480; //video.videoHeight;
      window.requestAnimationFrame(onAnimationFrame);
      
      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();
      
      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }
            
      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }
      
      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;
      
      return {'create': preShow - preCreate, 
              'show': preCapture - preShow, 
              'capture': Date.now() - preCapture,
              'img': result};
    }
    ''')

  display(js)
  
def video_frame(label, bbox):
  data = eval_js('stream_frame("{}", "{}")'.format(label, bbox))
  return data

In [15]:
path = Path("/content/drive/My Drive/diplom/model_save_ocv/checkpoint_best128_new_DS_24-02.h5")

In [16]:
%%capture --no-display

class em_model:
    def __init__(self, config:str):
        self.config = config
        self.model = tf.keras.models.load_model(self.config)
        # warm-up model on a random sample
        self.img_size = 128
        sample = tf.random.uniform((1, self.img_size, self.img_size, 3), 0, 1)
        self.model(sample)
        self.class_names = {0: 'anger',
                      1: 'contempt',
                      2: 'disgust',
                      3: 'fear',
                      4: 'happy',
                      5: 'neutral',
                      6: 'sad',
                      7: 'surprise',
                      8: 'uncertain'}
        
    def predict_model(self, image):
        input_arr = cv2.resize(image, (128,128), interpolation=cv2.INTER_AREA)
        input_arr = preprocess_input(input_arr)
        input_arr = np.array([input_arr])  # Convert single image to a batch.
          
        predictions = self.model.predict(input_arr)

        emotion = self.class_names[np.argmax(predictions)]
        return emotion

In [17]:
model_emotion = em_model(path)

In [18]:
!cp /content/drive/MyDrive/diplom/yolov8n-face.pt /content/

In [19]:
model_path = '/content/yolov8n-face.pt'
model = YOLO(model_path)

In [21]:
%%capture --no-display
# start streaming video from webcam
video_stream()
# label for video

label_html = 'Capturing...'
# initialze bounding box to empty
bbox = ''
count = 0 
frameSize = (640, 480) 
out = cv2.VideoWriter('output_video.mp4',cv2.VideoWriter_fourcc(*'DIVX'), 25, frameSize) 
  # for filename in glob.glob('D:/images/*.jpg'): 
  #   img = cv2.imread(filename) 
  #   out.write(img) 
  # out.release()

while True:
    js_reply = video_frame(label_html, bbox)
    if not js_reply:
        break
    # convert JS response to OpenCV Image
    img = js_to_image(js_reply["img"])
    # create transparent overlay for bounding box
    bbox_array = np.zeros([480,640,4], dtype=np.uint8)
    
    results = model.predict(source=img, show=False, save=False, conf=0.25, imgsz=640, line_thickness=3, max_det=1000)
    for result in results:
      img = result[0].orig_img

      bxes = result[0].boxes
      if len(bxes.xywh) >= 1:
        for x, y, w, h in bxes.xyxy.tolist():
          x, y, w, h, = int(x), int(y), int(w), int(h)
          # print(int(y), int(h), int(x), int(w))
          img_box = img[int(y):int(h), int(x):int(w), :]

          label = model_emotion.predict_model(img_box);
          box_array = cv2.rectangle(bbox_array,(x, y),(w, h),(255,0,0),2)
          # result[0].names[0] = label
          cv2.putText(bbox_array, label, (x, y), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)
          # out.write(img) 

    bbox_array[:,:,3] = (bbox_array.max(axis = 2) > 0 ).astype(int) * 255
    
    # convert overlay of bbox into bytes
    bbox_bytes = bbox_to_bytes(bbox_array)
    # update bbox so next frame gets new overlay
    bbox = bbox_bytes

    out.write(img) 
out.release()

<IPython.core.display.Javascript object>


0: 480x640 1 face, 16.5ms
Speed: 0.7ms preprocess, 16.5ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 480x640 1 face, 13.2ms
Speed: 0.6ms preprocess, 13.2ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 640)

0: 480x640 1 face, 16.9ms
Speed: 3.9ms preprocess, 16.9ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 640)

0: 480x640 1 face, 15.2ms
Speed: 0.6ms preprocess, 15.2ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 480x640 1 face, 25.2ms
Speed: 0.6ms preprocess, 25.2ms inference, 2.4ms postprocess per image at shape (1, 3, 640, 640)

0: 480x640 1 face, 17.6ms
Speed: 0.6ms preprocess, 17.6ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 640)


In [None]:
# Здесь все на камеру выводится, но не удалось вывести видео в отдельный экран или отключить вывод логов