In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import cv2
from scipy.ndimage import zoom
from scipy.special import logsumexp
import deepgaze_pytorch
import os
import imageio

In [2]:
DEVICE = 'cuda'
model3 = deepgaze_pytorch.DeepGazeIII(pretrained=True).to(DEVICE)
model3.eval()

Using cache found in C:\Users\ronki/.cache\torch\hub\pytorch_vision_v0.6.0


DeepGazeIII(
  (features): FeatureExtractor(
    (features): RGBDenseNet201(
      (0): Normalizer()
      (1): DenseNet(
        (features): Sequential(
          (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
          (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu0): ReLU(inplace=True)
          (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
          (denseblock1): _DenseBlock(
            (denselayer1): _DenseLayer(
              (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (relu1): ReLU(inplace=True)
              (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (relu2): ReLU(inplace=True)
              (conv2): Conv2d(128, 32, kernel_size=(3, 3), strid

In [16]:
video_path = 'C:/Users/ronki/OneDrive/Documents/GitHub/BionicVision/data/kitchen.mp4'
centerbias = np.load('C:/Users/ronki/OneDrive/Documents/GitHub/BionicVision/DeepGaze III/centerbias_mit1003.npy')

cap = cv2.VideoCapture(video_path)
width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps    = cap.get(cv2.CAP_PROP_FPS)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

print(f"Video resolution: {width}x{height}, FPS: {fps}, Frames: {frame_count}")

Video resolution: 1920x1440, FPS: 20.0, Frames: 201


In [4]:
def get_resized_centerbias(h, w):
    resized_cb = zoom(centerbias, (h / centerbias.shape[0], w / centerbias.shape[1]), order=0, mode='nearest')
    resized_cb -= logsumexp(resized_cb)
    return resized_cb

def create_uniform_bias(width, height):
    bias = np.ones((height, width))
    bias /= np.sum(bias)
    return bias

In [5]:
cb = get_resized_centerbias(height, width)
cb_tensor = torch.tensor([cb]).to(DEVICE)
unif = create_uniform_bias(width, height)
unif_tensor = torch.tensor([unif]).to(DEVICE)

  cb_tensor = torch.tensor([cb]).to(DEVICE)


In [6]:
def get_center_of_mass(saliency_map, threshold=0.7):
    if saliency_map.max() > 1:
        normalized_map = saliency_map / 255.0
    else:
        normalized_map = saliency_map.copy()
    
    max_val = normalized_map.max()
    binary_map = (normalized_map > (threshold * max_val)).astype(np.uint8)
    
    moments = cv2.moments(binary_map)    
    center_x = moments["m10"] / moments["m00"]
    center_y = moments["m01"] / moments["m00"]
    return center_x, center_y

In [17]:
gaze_output = 'C:/Users/ronki/OneDrive/Documents/GitHub/BionicVision/data/gaze_output'
saliency_output = 'C:/Users/ronki/OneDrive/Documents/GitHub/BionicVision/data/saliency_output'
gaze_map = np.load(os.path.join(gaze_output, 'out_00000_img.npy'))
gaze_map_w, gaze_map_h = gaze_map.shape
start_x, start_y = get_center_of_mass(gaze_map)
curr_fixations_x = np.repeat(start_x / gaze_map_w * width, 4)
curr_fixations_y = np.repeat(start_y / gaze_map_h * height, 4)
for i in range(frame_count):
  ret, frame = cap.read()
  if not ret: continue
  try:
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame_tensor = torch.tensor(frame.transpose(2, 0, 1))[None].to('cuda:0')
    x_hist_tensor = torch.tensor([curr_fixations_x[model3.included_fixations]]).to(DEVICE)
    y_hist_tensor = torch.tensor([curr_fixations_y[model3.included_fixations]]).to(DEVICE)
    with torch.no_grad(): saliency = model3(frame_tensor, unif_tensor, x_hist_tensor, y_hist_tensor)
    saliency = saliency.detach().cpu().numpy()[0][0]
    saliency_norm = (saliency - saliency.min()) / (saliency.max() - saliency.min()) * 255
    np.save(os.path.join(saliency_output, f'saliency_{i:05d}_img.npy'), saliency_norm)
    saliency_heatmap = cv2.applyColorMap(saliency_norm.astype(np.uint8), cv2.COLORMAP_JET)
    cv2.imwrite(os.path.join(saliency_output, f'saliency_{i:05d}_img.png'), saliency_heatmap)
    
    gaze_map = np.load(os.path.join(gaze_output, f'out_{i:05d}_img.npy'))
    x, y = get_center_of_mass(gaze_map)
    gaze_map_w, gaze_map_h = gaze_map.shape
    curr_fixations_x = np.roll(curr_fixations_x, -1)
    curr_fixations_y = np.roll(curr_fixations_y, -1)
    curr_fixations_x[-1] = x / gaze_map_w * width 
    curr_fixations_y[-1] = y / gaze_map_h * height
  finally:
    print(i)
    print(curr_fixations_x, curr_fixations_y)

cap.release()


181
[1082.03494347 1082.03494347 1082.03494347  989.83625071] [940.92350609 940.92350609 940.92350609 876.35234331]
182
[1082.03494347 1082.03494347  989.83625071  981.40056022] [940.92350609 940.92350609 876.35234331 875.08403361]
183
[1082.03494347  989.83625071  981.40056022 1019.64803313] [940.92350609 876.35234331 875.08403361 886.50621118]
184
[ 989.83625071  981.40056022 1019.64803313 1044.82993197] [876.35234331 875.08403361 886.50621118 874.48979592]
185
[ 981.40056022 1019.64803313 1044.82993197 1049.08264074] [875.08403361 886.50621118 874.48979592 887.2557629 ]
186
[1019.64803313 1044.82993197 1049.08264074 1027.2861868 ] [886.50621118 874.48979592 887.2557629  889.9354229 ]
187
[1044.82993197 1049.08264074 1027.2861868  1009.54749035] [874.48979592 887.2557629  889.9354229  895.26023166]
188
[1049.08264074 1027.2861868  1009.54749035 1069.94923858] [887.2557629  889.9354229  895.26023166 878.29405366]
189
[1027.2861868  1009.54749035 1069.94923858 1005.57617609] [889.93542