In [None]:
"""
    Student:        Byron Dowling
    Class:          Computer Vision (CSE 60535)
    Term:           University of Notre Dame, Fall 2023
    Assignment:     Practical #4 : Deep Learning-based Object Detection
"""

# The two lines below install and load ipython-autotime, which reports the time spent on each cell; comment them out if you do not need the timing
!pip install ipython-autotime
%load_ext autotime

import cv2
import os
import sys
import numpy as np
from sklearn import svm
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.models import Model
import torchvision.datasets as datasets
from torchvision.transforms import Compose, Resize
import tensorflow as tf

## Answers to tasks 1-4:

### Task 1
- The dollar bill was recognized fairly quickly at different angles.
- My cellphone was recognized fairly well, but this was a bit ambiguous because the camera also thought that I was a cell phone
- My watch took the longest to identify, and it was only recognized at close range. This could be because it is a smart watch, so it shares some characteristics with a smartphone.

### Task 2
- The linear SVM performed considerably worse with fc1 features than with fc2 features.
- My cellphone was recognized fine, my watch was not recognized at all, and the dollar bill was only recognized some of the time, often it was misclassified as a cellphone
- The combination of fc1 and a polynomial kernel is the worst performer so far. It was only able to correctly identify a watch, and even that result is questionable: it classified me as a watch, and nothing presented to the camera (including my phone or a dollar bill) made it change its prediction.
- I experienced the same poor results above when I tried the combination of fc2 and a polynomial kernel, only a watch was detected, but this was flawed because the background/me was identified as a watch and it never changed off of this
- When testing fc1 and fc2 with an RBF kernel, I did not see a significant performance increase. They did recognize my cellphone if I presented it with the back camera facing the webcam.

### Task 3
- When adding in extra objects on the configuration from task 2, the addition of more objects decreased performance.
- I suspect this is from having more options to choose from and the ability for more items to share similar features and characteristics

### Task 4 (for 60000-level section)
- Ran this in a different Colab: https://colab.research.google.com/drive/14JskQTdA-TMOirTmLXFH45-asuKXP9LH?usp=sharing 
- So far my best performing set up was:
    - Two Dense Layers with 128 nodes and relu activation
    - Two dropout layers with a rate of 0.25 (i.e., 25%)
    - A final Dense layer of 101 neurons and softmax
    - 10 Epochs
    - Accuracy was around 92%

- I experimented with a few more setups but performance was slightly worse and stagnated
    - Particularly when I used my current best setup and extended the Epochs from 10 to 15, minimal increase was observed.


In [None]:
# Pretrained VGG16 (ImageNet weights) from the Keras API serves as the feature extractor for the SVM
model = VGG16(weights='imagenet')

# *** TASK 2 ***
# Name of the VGG layer whose activations are used as features (= "network embeddings")
cnn_codes = 'fc2'

# Truncated model: same input as VGG16, but it outputs the embeddings of the layer chosen above
# (instead of a classification decision)
embedding_layer = model.get_layer(cnn_codes)
features_model = Model(inputs=model.input, outputs=embedding_layer.output)

# And here is the function that will give us VGG-based features for an image "img"
def extract_vgg_features(img):
    """Return the VGG-based features of a single image.

    The image is resized to 224x224 with bilinear interpolation, given a leading
    batch axis, and passed through `features_model`.

    Params:
            img: image array accepted by cv2.resize; must have three axes after resizing
    Returns:
            the output of `features_model.predict` for the one-image batch
    """
    resized = cv2.resize(img, (224, 224), interpolation=cv2.INTER_LINEAR)
    # np.newaxis is None: prepend the batch axis expected by the model
    single_image_batch = resized[None, :, :, :]
    return features_model.predict(single_image_batch, verbose=0)

In [None]:
# Let's download and extract the Caltech101 dataset.
!wget -c https://notredame.box.com/shared/static/o5hw6ljq7x00smui4ixo9akxwlq2dkib.gz -O caltech101.tar.gz
%mkdir ./caltech101/
!tar -zxf caltech101.tar.gz -C ./caltech101/

# We need to convert PIL images in to OpenCV images
class ToCV2(object):
    """Transform that turns a PIL image into an OpenCV-style array."""

    def __call__(self, sample):
        """Convert `sample` to RGB, then return a copy with the channel axis reversed (RGB -> BGR)."""
        rgb_pixels = np.array(sample.convert('RGB'))
        # .copy() materialises the reversed view as a fresh array
        return rgb_pixels[:, :, ::-1].copy()

# Resize each PIL image to 224x224, then convert it to an OpenCV-style array
caltech101_transform = Compose([Resize((224, 224)), ToCV2()])
caltech101_dataset = datasets.Caltech101(root='./', transform=caltech101_transform)

In [None]:
# First, let us find the label numbers for our objects

# *** TASK 1 *** and *** TASK 3 ***
# We are going to use this list to restrict the objects our classifier will recognize
# my_object_list = ['watch','cellphone','dollar_bill']
my_object_list = ['watch','cellphone','dollar_bill','lotus','camera','stapler','ceiling_fan','headphone','scissors','lobster']

known_categories = list(caltech101_dataset.categories)

# A misspelled name would otherwise be dropped silently and the classifier would
# train on fewer classes than requested, so fail loudly instead.
unknown_objects = [name for name in my_object_list if name not in known_categories]
if unknown_objects:
    raise ValueError("Not CALTECH101 categories: {}".format(unknown_objects))

# Class index of each requested object, in the same order as my_object_list
my_object_label_list = [known_categories.index(name) for name in my_object_list]
print("{} -> {} (class indicies in CALTECH101)".format(my_object_list,my_object_label_list))

In [None]:
# Next, let's extract the VGG features required for our selected object categories.
# It should take no longer than 5 minutes.

from tqdm import tqdm

batch_size = 128

c101_images = []
c101_labels = []
c101_label_to_i = {}  # not referenced by the cells shown here; kept so this cell still defines the name

# Set membership keeps the per-image filter cheap
selected_labels = set(my_object_label_list)

# We can load the entire dataset on RAM as the dataset is pretty small
print("Loading dataset to RAM....")
for image, label in tqdm(caltech101_dataset):
    label = int(label)
    if label in selected_labels:
        c101_images.append(image)
        c101_labels.append(label)

# Stop with a clear message instead of handing an empty array to the network
if not c101_images:
    raise ValueError("No images matched my_object_label_list; check the category names in my_object_list.")

print('Extracting features.....')
# Now we convert the images into vgg features.
# `workers` / `use_multiprocessing` are not passed: they only apply to generator/Sequence
# input, and newer Keras versions no longer accept them in predict().
c101_vgg_features = features_model.predict(np.array(c101_images), batch_size=batch_size, verbose=1)

assert len(c101_vgg_features) == len(c101_labels)

In [None]:
# *** TASK 2 ***
# Build the classifier first, then train it on the VGG features; fit() returns the fitted estimator
linear_svm = svm.SVC(kernel='linear')
clf = linear_svm.fit(c101_vgg_features, c101_labels)
# Alternative kernels to try:
# clf = svm.SVC(kernel='poly',degree=3).fit(c101_vgg_features, c101_labels)
# clf = svm.SVC(kernel='rbf',gamma='auto').fit(c101_vgg_features, c101_labels)

In [None]:
from IPython.display import display, Javascript, Image
from google.colab.output import eval_js
from google.colab.patches import cv2_imshow
from base64 import b64decode, b64encode
import PIL
import io

# function to convert the JavaScript object into an OpenCV image
def js_to_image(js_reply):
  """
  Decode a webcam frame received from JavaScript into an OpenCV image.

  Params:
          js_reply: string of the form "<header>,<base64 payload>" containing the image from the webcam
  Returns:
          img: image decoded by cv2.imdecode with flags=1 (OpenCV BGR image)
  """
  # everything after the first comma is the base64-encoded image
  encoded_payload = js_reply.split(',')[1]
  # raw bytes viewed as a uint8 buffer, which is what cv2.imdecode expects
  raw_buffer = np.frombuffer(b64decode(encoded_payload), dtype=np.uint8)
  return cv2.imdecode(raw_buffer, flags=1)

# function to convert OpenCV Rectangle bounding box image into base64 byte string to be overlayed on video stream
def to_bytes(bbox_array):
  """
  Encode an RGBA overlay as a PNG data URL that can be placed over the video stream.

  Params:
          bbox_array: Numpy array (pixels) interpreted as an 'RGBA' image.
  Returns:
        bytes: string 'data:image/png;base64,<base64 of the PNG>'
  """
  png_buffer = io.BytesIO()
  # render the array as a PNG into the in-memory buffer
  PIL.Image.fromarray(bbox_array, 'RGBA').save(png_buffer, format='png')
  encoded_png = b64encode(png_buffer.getvalue()).decode('utf-8')
  return 'data:image/png;base64,{}'.format(encoded_png)

#@title Webcam live streaming code (just run it)
# JavaScript to properly create our live video stream using our webcam as input
def video_stream():
  """Inject the JavaScript that drives the live webcam stream into the cell output.

  The script defines four functions:
    - createDom(): builds the container (status label, <video>, overlay <img>,
      stop instruction), requests the camera via getUserMedia, and starts the
      animation-frame loop. It returns early if the container already exists.
    - onAnimationFrame(): when a frame request is pending, draws the current
      video frame onto a 640x480 canvas and resolves the request with a JPEG
      data URL (or with "" if shutdown was requested).
    - stream_frame(label, imgData): updates the status label and the overlay
      image, then waits for the next captured frame. It returns '' if shutdown
      was requested before the call; otherwise an object with the timing fields
      'create', 'show', 'capture' and the captured frame under 'img'.
    - removeDom(): stops the video track and removes the created elements.

  Clicking the video, the overlay image, or the instruction text sets the
  shutdown flag.
  """
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;

    var pendingResolve = null;
    var shutdown = false;

    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }

    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }

    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);

      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);

      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);

      const instruction = document.createElement('div');
      instruction.innerHTML =
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };

      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640; //video.videoWidth;
      captureCanvas.height = 480; //video.videoHeight;
      window.requestAnimationFrame(onAnimationFrame);

      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();

      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }

      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }

      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;

      return {'create': preShow - preCreate,
              'show': preCapture - preShow,
              'capture': Date.now() - preCapture,
              'img': result};
    }
    ''')

  # Rendering the Javascript object runs the script in the notebook output frame
  display(js)


def video_frame(label, bbox):
  """Send the status label and overlay to the browser and fetch the next webcam frame.

  Params:
          label: text shown in the status line of the video widget
          bbox: overlay image as a data URL, or '' for no overlay
  Returns:
          whatever the JavaScript `stream_frame` call evaluates to
  """
  import json

  # json.dumps emits a quoted, escaped string literal that is also valid JavaScript,
  # so quotes, backslashes or newlines in the arguments cannot break the call.
  call = 'stream_frame({}, {})'.format(json.dumps(str(label)), json.dumps(str(bbox)))
  data = eval_js(call)
  return data


In [None]:
# Now, let's see how our trained SVM processes the webcam stream

video_stream()

# label for video
label_html = 'Capturing...'

# initialize the overlay to an empty string: the first request carries no overlay
svm_overlay = ''
count = 0  # not referenced in this cell; kept so the cell still defines the name

# loop-invariant drawing settings
font = cv2.FONT_HERSHEY_SIMPLEX

while True:
    js_reply = video_frame(label_html, svm_overlay)
    # '' means the stream was already shut down; an empty 'img' means the user
    # clicked stop while this request was waiting for a frame. Either way there
    # is nothing to decode, so leave the loop instead of failing in js_to_image.
    if not js_reply or not js_reply.get("img"):
        break

    # convert JS response to OpenCV image
    frame = js_to_image(js_reply["img"])
    if frame is None:
        # cv2.imdecode could not decode the payload; ask for the next frame
        continue

    # create transparent overlay for svm classification info
    overlay_pixels = np.zeros([480,640,4], dtype=np.uint8)

    features = extract_vgg_features(frame)
    pred = clf.predict(features)

    # show the classification result
    cv2.putText(overlay_pixels, '{}'.format(caltech101_dataset.categories[pred[0]]), (15, 25), font, 1, (0, 255, 0), 2, cv2.LINE_AA)

    # make only the drawn pixels opaque; everything else stays fully transparent
    overlay_pixels[:,:,3] = np.where(overlay_pixels.max(axis=2) > 0, 255, 0)
    # convert the overlay into a data URL for the next request
    svm_overlay = to_bytes(overlay_pixels)

