# UE Computer Vision | Project : Visual tracking of video objects

February 2022

pierre-henri.conze@imt-atlantique.fr, ronan.fablet@imt-atlantique.fr

aurelien.colin@imt-atlantique.fr, quentin.febvre@imt-atlantique.fr, matteo.zambra@imt-atlantique.fr

## Purpose of the project

Implement the methodology described in one of the two articles and apply it in the context of **visual tracking of video objects** within image sequences. Specifically, the targeted application consists of estimating, for the whole sequence, the location of a region of interest that is manually defined in the first image, called the **reference image**. Development will be done in the `python` programming language and can use functions from libraries such as `openCV`, `scikit-image`, `scikit-learn`, `keras`, `pytorch`... 

## Groupe 9 - Deep Learning

**Authors :**


*   BERCY Victor
*   COSTE Paul
*   KALLALA Aymen


**Goal of the notebook :**



## Install requirements

In [None]:
from google.colab import drive
from os import chdir

drive.mount("/content/drive")
chdir("/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main")

Mounted at /content/drive


In [None]:
# Each `!command` runs in its own short-lived subshell, so `!export VAR=...`
# is lost as soon as that line finishes. Writing to os.environ instead makes
# the CUDA settings visible to the kernel and to every later `!` command.
import os

cuda_home = "/usr/local/cuda-11.0"
os.environ["CUDA_HOME"] = cuda_home
os.environ["CUDA_PATH"] = cuda_home

# Prepend the CUDA binaries; skip the previous value when it is empty so that
# no stray trailing ':' ends up in the variable.
os.environ["PATH"] = ":".join(
    entry for entry in (cuda_home + "/bin", os.environ.get("PATH", "")) if entry
)
os.environ["LD_LIBRARY_PATH"] = ":".join(
    entry
    for entry in (
        cuda_home + "/lib64",
        cuda_home + "/lib",
        cuda_home + "/extras/CUPTI/lib64",
        cuda_home + "/efa/lib",
        os.environ.get("LD_LIBRARY_PATH", ""),
    )
    if entry
)

In [None]:
!pip install decord
!pip install ffmpeg-python
!pip install addict

In [None]:
!tar -xjvf ./x264-master.tar.bz2

In [None]:
!pip3 install torch==1.7.1+cu110 torchvision==0.8.2+cu110 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
!pip3 install -r requirements.txt

In [None]:
# !pip uninstall datascience
# !pip uninstall albumentations

In [None]:
chdir("/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main/x264-master/")
!./configure --enable-shared --disable-asm

In [None]:
!make

In [None]:
!make install

In [None]:
!ldconfig

In [None]:
# Clone and build three source dependencies inside the project folder.
# Each `chdir` changes the kernel's working directory, which is also the
# directory in which the following `!` shell commands run.

# 1) COCO API: clone, then build and install its Python extension.
!git clone https://github.com/cocodataset/cocoapi.git
chdir("/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main/cocoapi/PythonAPI")
!python setup.py build_ext install
chdir("/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main")
# 2) NVIDIA apex, installed with its CUDA and C++ extensions enabled.
!git clone https://github.com/NVIDIA/apex.git
chdir("/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main/apex")
!python setup.py install --cuda_ext --cpp_ext
chdir("/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main")
# 3) maskrcnn-benchmark (Idolized22 fork), built in `develop` mode.
!git clone https://github.com/Idolized22/maskrcnn-benchmark.git
chdir("/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main/maskrcnn-benchmark")
!python setup.py build develop
# List every miscellaneous.py on the filesystem (informational output only).
!find / -name 'miscellaneous.py'
# NOTE(review): the two commands below target /opt/conda/lib/python3.6 and use a
# source path that is relative to the current directory, which is already
# .../maskrcnn-benchmark at this point — confirm they apply to this environment.
!rm -rf /opt/conda/lib/python3.6/site-packages/maskrcnn_benchmark
!cp -r maskrcnn-benchmark/build/lib.linux-x86_64-3.6/maskrcnn_benchmark /opt/conda/lib/python3.6/site-packages/

In [None]:
# Folder holding the CUDA sources, relative to the maskrcnn-benchmark checkout
# (the working directory left by the previous cell).
cuda_dir="maskrcnn_benchmark/csrc/cuda"
# Rewrite AT_CHECK to TORCH_CHECK in place (first match on each line) in the
# two deformable-op CUDA files.
# NOTE(review): presumably needed because the installed PyTorch no longer
# provides AT_CHECK — confirm.
!perl -i -pe 's/AT_CHECK/TORCH_CHECK/' $cuda_dir/deform_pool_cuda.cu $cuda_dir/deform_conv_cuda.cu
# You can then run the regular setup command
!python3 setup.py build develop

In [None]:
chdir("/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main")

## Import libraries

In [None]:
import os
import sys
import tqdm
import argparse
import pathlib

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import cv2

import torch
from torchvision import transforms

from skimage.exposure import rescale_intensity
import numpy as np
from skimage.segmentation import mark_boundaries
from skimage.exposure import rescale_intensity

## Training

In [None]:
#To access the libraries downloaded

sys.path.append('/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main/apex')
sys.path.append('/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main/maskrcnn-benchmark')

In [None]:
# Information about our dataset

# Root folder of the untouched training sequences.
dataset_dir_path = '/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main/sequences-train-untouched/'

# Frame rate, used to turn frame indices into timestamps during ingestion.
fps = 15

# Resolution stored in the sample metadata.
# NOTE(review): 360 wide by 480 high — confirm width and height are not swapped.
img_width = 360
img_height = 480

# Number of frames available for each tracked object.
nbFrames = dict(bag=196, bear=26, book=51, camel=90, rhino=90, swan=50)

In [None]:
def get_bbs(obj_name, path, nb_frames=None):
    """
    Collect the ground-truth bounding box of every mask image of one object.

    Mask files are expected to be named `<obj_name>-NNN.png`, where NNN is the
    1-based, three-digit frame number. Each mask is read with OpenCV and
    reduced to a box with `get_bb_coos`.

    Parameters
    ----------
    obj_name : str
        Object / sequence name, e.g. 'swan'.
    path : str
        Directory containing the mask files. The working directory is left
        untouched.
    nb_frames : int, optional
        Number of rows of the result. When omitted, the highest frame number
        found in `path` is used.

    Returns
    -------
    np.ndarray
        Integer array of shape (nb_frames, 4); row `k` holds
        (x_min, x_max, y_min, y_max) for frame `k + 1`. Frames without a mask
        file keep the placeholder row [0, 0, 0, 0].

    Raises
    ------
    IndexError
        If a mask file carries a frame number outside 1..nb_frames.
    """
    # Local import: the notebook only imports `re` in a later cell.
    import re

    mask_pattern = re.compile(re.escape(obj_name) + r'-([0-9]{3})\.png$')

    boxes_by_frame = {}
    # Build full paths instead of calling os.chdir(): changing the working
    # directory would silently break every relative path used afterwards.
    for filename in os.listdir(path):
        match = mask_pattern.search(filename)
        if match is None:
            continue
        img = cv2.imread(os.path.join(path, filename))
        boxes_by_frame[int(match.group(1))] = list(get_bb_coos(img))

    if nb_frames is None:
        nb_frames = max(boxes_by_frame, default=0)

    out = np.zeros((nb_frames, 4), dtype=int)
    for frame_number, bbox in boxes_by_frame.items():
        # Frame 000 would otherwise wrap around and overwrite the last row.
        if not 1 <= frame_number <= nb_frames:
            raise IndexError(
                f"frame number {frame_number} of '{obj_name}' is outside 1..{nb_frames}"
            )
        out[frame_number - 1] = bbox

    return out

In [None]:
def get_bb_coos(img):
    """
    Compute the tight bounding box of the non-zero pixels of a mask.

    Only the first channel is inspected when `img` has three dimensions
    (height, width, channels). `x` refers to the first array axis (rows) and
    `y` to the second axis (columns).

    Parameters
    ----------
    img : array-like
        Mask as a 2-D (H, W) or 3-D (H, W, C) array, or anything
        `np.asarray` can convert to one.

    Returns
    -------
    tuple of int
        (x_min, x_max, y_min, y_max), all bounds inclusive. For an all-zero
        mask the result is (c, 0, c, 0) with c = max(H, W).
    """
    mask = np.asarray(img)
    if mask.ndim == 3:
        mask = mask[:, :, 0]

    n_rows, n_cols = mask.shape
    # Vectorised: no Python-level loop over every non-zero pixel.
    rows, cols = np.nonzero(mask)

    if rows.size == 0:
        largest_side = int(max(n_rows, n_cols))
        return largest_side, 0, largest_side, 0

    return int(rows.min()), int(rows.max()), int(cols.min()), int(cols.max())

def convert_chw_bb(x, y, height, width):
    """
    Build a corner-format box [x_min, x_max, y_min, y_max] around (x, y).

    `width` is subtracted from and added to `x`, and `height` likewise to `y`,
    so both arguments act as half-extents of the box.
    """
    return np.array([x - width, x + width, y - height, y + height])

def convert_corner_bb(bbox):
    """
    Convert a corner-format box (x_min, x_max, y_min, y_max) to centre format.

    Returns the list [x_center, y_center, half_height, half_width], where
    half_height = (y_max - y_min) / 2 and half_width = (x_max - x_min) / 2.
    Note the order: the y extent comes before the x extent.
    """
    x_lo, x_hi, y_lo, y_hi = bbox[0], bbox[1], bbox[2], bbox[3]

    center = [1/2*(x_lo + x_hi), 1/2*(y_lo + y_hi)]
    half_extents = [(y_hi - y_lo)/2, (x_hi - x_lo)/2]

    return center + half_extents

In [None]:
# Ingestion script for our dataset

from datetime import datetime

from gluoncv.torch.data.gluoncv_motion_dataset.dataset import (
    AnnoEntity, DataSample, FieldNames, GluonCVMotionDataset, SplitNames
)
from gluoncv.torch.data.gluoncv_motion_dataset.utils.ingestion_utils import \
    process_dataset_splits

import gluoncv.torch.data.gluoncv_motion_dataset.dataset as gcvdataset
gcvdataset.__version__ = "1.0"

_OBJECT_TYPES = ('bag', 'bear', 'book', 'camel', 'rhino', 'swan', 'others')

_CLASS_LABELS = dict((vt, i) for i, vt in enumerate(_OBJECT_TYPES, start=1))

def sample_from_raw(dataset_dir_path, fps, img_width, img_height, nbFrames):
    """
    Build the single "Train" DataSample holding the annotations of every object.

    For each object type except the trailing 'others' entry, the ground-truth
    boxes are read from `<dataset_dir_path>raw_data/` with `get_bbs` and one
    AnnoEntity is added per frame.

    Parameters
    ----------
    dataset_dir_path : str
        Dataset root; expected to end with a path separator because
        'raw_data/' is appended by plain string concatenation.
    fps : number
        Frame rate used to convert a frame index into a timestamp in ms.
    img_width, img_height : int
        Resolution written to the sample metadata.
    nbFrames : dict
        Maps each object type to its number of frames.

    Returns
    -------
    DataSample
        The populated sample, including its metadata.
    """
    seq_name = "Train"
    sample = DataSample(id=seq_name)

    # NOTE(review): every object type is stored in the same sample with the
    # same entity id, although their timestamps overlap — confirm this is
    # what the downstream training code expects.
    obj_id = 1
    annotated_types = _OBJECT_TYPES[:-1]  # 'others' has no data of its own

    for object_type in annotated_types:
        bboxes = get_bbs(
            object_type, dataset_dir_path + 'raw_data/', nbFrames[object_type]
        )

        for frame_idx, bbox in enumerate(bboxes):
            time_ms = int(round((frame_idx / fps) * 1000))
            entity = AnnoEntity(time=time_ms, id=obj_id)
            entity.confidence = 1.0
            entity.bbox = convert_corner_bb(bbox)

            # Only the frame numbers and the type come from the data; the
            # other fields are fixed placeholder values.
            entity.blob = {
                'frame_xml':         frame_idx + 1,
                'frame_idx':         frame_idx,
                'color':             None,
                'orientation':       0.,
                'speed':             1.,
                'trajectory_length': 10.,
                'truncation_ratio':  1.,
                'vehicle_type':      object_type,
            }
            entity.labels = {object_type: _CLASS_LABELS[object_type]}

            sample.add_entity(entity)

    # Need to replace the Windows path separator by UNIX-like to make the path
    # working across different platforms. Linux struggles with mixing path
    # separators whereas Windows does not.
    rel_data_path = os.path.join(dataset_dir_path, seq_name).replace('\\', '/')
    sample.metadata = {
        FieldNames.DATA_PATH:  rel_data_path,
        FieldNames.FPS:        fps,
        # Use the longest sequence rather than the length of whichever object
        # happened to be processed last (and avoid a NameError when no frame
        # was processed at all).
        FieldNames.NUM_FRAMES: max(
            (nbFrames[object_type] for object_type in annotated_types), default=0
        ),
        FieldNames.RESOLUTION: {
            'width': img_width, 'height': img_height
        }
    }

    return sample

def ingest(dataset_dir_path, fps, img_width, img_height, nbFrames):
    """
    Create the GluonCVMotionDataset for the course data and dump it.

    A dataset rooted at `dataset_dir_path` is created without loading any
    existing 'anno.json', its description and modification date are set, the
    single sample produced by `sample_from_raw` is added, and the dataset is
    dumped.

    Parameters
    ----------
    dataset_dir_path, fps, img_width, img_height, nbFrames
        Forwarded unchanged to `sample_from_raw`.

    Returns
    -------
    GluonCVMotionDataset
        The dataset after `dump()` has been called.
    """
    dataset = GluonCVMotionDataset(
        annotation_file='anno.json', root_path=dataset_dir_path,
        load_anno=False
    )
    dataset.metadata = {
        FieldNames.DESCRIPTION:   "IMT Atlantique Computer vision course dataset ingestion",
        FieldNames.DATE_MODIFIED: str(datetime.now()),
    }

    sample = sample_from_raw(
        dataset_dir_path, fps, img_width, img_height, nbFrames
    )
    dataset.add_sample(sample)

    dataset.dump()

    return dataset


def write_data_split(dataset):
    """
    Assign every sample of `dataset` to a split and save the result.

    A sample goes to the TRAIN split when its relative data path contains
    'Train' and to the TEST split when it contains 'Test' ('Train' is checked
    first). Any other path raises RuntimeError.
    """
    def split_func(sample):
        relative_path = sample.data_relative_path

        if 'Train' in relative_path:
            return SplitNames.TRAIN
        if 'Test' in relative_path:
            return SplitNames.TEST

        raise RuntimeError("unrecognized data split")

    process_dataset_splits(dataset, split_func, save=True)


def main_ingest_data(dataset_dir_path, fps, img_width, img_height, nbFrames):
    """
    Run the whole ingestion: build and dump the dataset, then write its splits.

    All arguments are forwarded to `ingest`. Returns None.
    """
    write_data_split(
        ingest(dataset_dir_path, fps, img_width, img_height, nbFrames)
    )



In [None]:
import re
main_ingest_data(dataset_dir_path, fps, img_width, img_height, nbFrames)

INFO:root:Skipping loading for annotation file /content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main/sequences-train-untouched/annotation/anno.json
INFO:root:Split subpath: annotation/splits.json
	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  return bound(*args, **kwds)


Processed 1 samples
Found 1 samples for train
Saving data splits to: /content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main/sequences-train-untouched/annotation/splits.json


In [None]:
chdir("/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main")

In [None]:
!python tools/train_net.py --config_file '/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main/siammot/configs/defaults.py' --train-dir '/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main/sequences-train-untouched/annotation'

Traceback (most recent call last):
  File "tools/train_net.py", line 12, in <module>
    from maskrcnn_benchmark.utils.miscellaneous import mkdir, save_config
ImportError: cannot import name 'save_config' from 'maskrcnn_benchmark.utils.miscellaneous' (/content/drive/.shortcut-targets-by-id/1nmICa9-r33D-HVcB3J8mLCKC1esIf7t3/Project/siam-mot-main/maskrcnn-benchmark/maskrcnn_benchmark/utils/miscellaneous.py)


## Inference

In [None]:
import cv2
import numpy as np
from os import listdir
import re

def save_video(name: str):
    """
    Assemble the frames of one object into an .mp4 video.

    Frames named `<name>-NNN.bmp` are read from ./sequences-train-untouched/
    and written, in ascending frame order, to
    ./sequences-train/<name>/project-<name>.mp4 at the global `fps`.

    NOTE: the authors reported that this does not work on Colab.

    Parameters
    ----------
    name : str
        Object / sequence name, e.g. 'rhino'.

    Raises
    ------
    FileNotFoundError
        If the first frame `<name>-001.bmp` cannot be read.
    """
    # Local import: this cell only brings `listdir` into scope.
    import os

    frame_pattern = re.compile(re.escape(name) + r"-[0-9]{3}\.bmp")
    path = "./sequences-train-untouched/"
    export_path = "./sequences-train/" + name + '/'
    print('source_path :', path)
    print('export_path :', export_path)

    first_frame_path = path + name + "-001.bmp"
    img0 = cv2.imread(first_frame_path)
    if img0 is None:  # cv2.imread signals failure by returning None
        raise FileNotFoundError("cannot read first frame: " + first_frame_path)
    height, width, _ = img0.shape
    size = (width, height)

    # Make sure the output directory exists before opening the writer.
    os.makedirs(export_path, exist_ok=True)
    out = cv2.VideoWriter(export_path + 'project-' + name + '.mp4', cv2.VideoWriter_fourcc(*'avc1'), fps, size)

    try:
        # listdir() returns entries in arbitrary order; sort them so that the
        # zero-padded frame numbers are written chronologically.
        for filename in sorted(listdir(path)):
            if frame_pattern.fullmatch(filename):
                img = cv2.imread(path + filename)
                if img is not None:
                    out.write(img)
    finally:
        out.release()

    return None

In [None]:
missing_videos = False #videos already saved
name_data = 'rhino' #Directory to work on

# Export one video per object when the videos have not been generated yet.
if missing_videos:
    for data in nbFrames:  # keys of nbFrames are the object names
        save_video(data)

In [None]:
from demos.demo_inference import DemoInference
from demos.utils import vis_writer as vw, vis_generator as vg
from demos.video_iterator import build_video_iterator
import os

# Which split to run the tracker on: "train" or "test".
# `step` was not defined anywhere else in the notebook.
step = "train"

dump_video = True
vis_resolution = 360

if step not in ("train", "test"):
    raise ValueError("unknown step " + repr(step) + "; expected 'train' or 'test'")

# Both splits use the same tracker class and folder layout; only the
# 'sequences-<step>' root differs.
track_class = "person_vehicle"
video_root = "sequences-" + step + "/Videos/"
demo_video = video_root + "input/project-" + name_data + ".mp4"
output_path = video_root + "output/"

vis_generator = vg.VisGenerator(vis_height=vis_resolution)
vis_writer = vw.VisWriter(dump_video=dump_video,
                        out_path=output_path,
                        file_name=os.path.basename(demo_video))

# Build demo inference
tracker = DemoInference(track_class=track_class,
                        vis_generator=vis_generator,
                        vis_writer=vis_writer)

# Build video iterator for inference
video_reader = build_video_iterator(demo_video)

# One entry per processed frame; the visualisation cell reads `results[i][1].bbox`.
results = list(tracker.process_frame_sequence(video_reader()))

if dump_video:
    vis_writer.close_video_writer()

Downloading: "http://dl.yf.io/dla/models/imagenet/dla34-ba72cf86.pth" to /root/.cache/torch/hub/checkpoints/dla34-ba72cf86.pth


  0%|          | 0.00/60.3M [00:00<?, ?B/s]

RuntimeError: ignored

In [None]:
from skimage.segmentation import mark_boundaries
#from skimage.exposure import rescale_intensity

def visualize_bbox(img, bbox):
    """
    Draw the outline of a bounding box on an image.

    Parameters
    ----------
    img : np.ndarray
        Image with three dimensions; the box mask takes the shape of
        `img[:, :, 0]`.
    bbox : sequence
        Box as [x_min, x_max, y_min, y_max] with inclusive bounds, where `x`
        indexes the first image axis and `y` the second. Values are truncated
        to int.

    Returns
    -------
    np.ndarray
        Output of `mark_boundaries`, with the box outline drawn in green.
    """
    xmin, xmax, ymin, ymax = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])

    mask = np.zeros(shape=img[:,:,0].shape, dtype=np.uint8)
    # Clamp at 0: a negative slice bound would be read as "from the end" and
    # mark the wrong region (or nothing at all).
    mask[max(xmin, 0):max(xmax + 1, 0), max(ymin, 0):max(ymax + 1, 0)] = 1
    # mark_boundaries expects RGB components in the [0, 1] range.
    out = mark_boundaries(img, mask, color=(0, 1, 0))

    return out

def convert_chw_bb(bbox):
    """
    Convert a centre-format box to corners [x_min, x_max, y_min, y_max].

    `bbox` is read as [x_center, y_center, width, height]; `width` and
    `height` are added to and subtracted from the centre as they are, i.e.
    they are treated as half-extents.
    """
    center_x, center_y = bbox[0], bbox[1]
    half_w, half_h = bbox[2], bbox[3]

    corners = [center_x - half_w, center_x + half_w,
               center_y - half_h, center_y + half_h]

    return np.array(corners)

In [None]:
import matplotlib.pyplot as plt
from tqdm import tqdm

path = '/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main/sequences-train/' + name_data + '/'

# First predicted box of every frame in [x_min, x_max, y_min, y_max] form
# (None when a frame has no box); kept for the evaluation cells.
mon_results = []
n = len(results)

for i in range(n):
    # Frames are numbered from 001 with three digits.
    frame_file = path + name_data + '-' + str(i + 1).zfill(3) + '.bmp'
    img = cv2.imread(frame_file)
    if img is None:  # cv2.imread signals failure by returning None
        raise FileNotFoundError("cannot read frame: " + frame_file)
    # OpenCV loads images as BGR while matplotlib displays RGB.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    bboxes = results[i][1].bbox.numpy()
    # Round the coordinates to the nearest integer, then convert to corners.
    # NOTE(review): assumes the tracker boxes are [x_center, y_center, w, h] —
    # confirm the box format of `results`.
    corner_boxes = [convert_chw_bb(np.around(bb, decimals=0)) for bb in bboxes]

    mon_results.append(corner_boxes[0] if corner_boxes else None)

    # Draw every box exactly once.
    for bb in corner_boxes:
        img = visualize_bbox(img, bb)

    plt.subplot(n, 1, i+1)
    plt.imshow(img)

In [None]:
# results[2][1].bbox.numpy()

In [None]:
from os.path import isdir

# `outpath` only exists in a commented-out line; `output_path` (set in the
# inference cell) is the output directory actually in use.
isdir(output_path)

In [None]:
# results[0][1]

In [None]:
# len(results[0][1])

In [None]:
plt.imshow(img, cmap='gray')

In [None]:
import torch

from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer
from maskrcnn_benchmark.utils.miscellaneous import mkdir

from siammot.configs.defaults import cfg
from siammot.modelling.rcnn import build_siammot
from siammot.engine.inferencer import DatasetInference
from siammot.utils.get_model_name import get_model_name
from siammot.data.adapters.utils.data_utils import load_dataset_anno, load_public_detection
from siammot.data.adapters.handler.data_filtering import build_data_filter_fn

def test(cfg, test_dataset, output_dir, model_file=None, set="test"):
    """
    Run SiamMOT inference on a dataset.

    Parameters
    ----------
    cfg
        Configuration object; `cfg.MODEL.DEVICE` and
        `cfg.INFERENCE.USE_GIVEN_DETECTIONS` are read here.
    test_dataset : str
        Dataset key passed to the annotation, data-filter and
        public-detection loaders.
    output_dir : str
        Directory handed to `DatasetInference`.
    model_file : str
        Path to a checkpoint file, or to a directory from which the latest
        checkpoint is loaded.
    set : str
        Split name forwarded to `load_dataset_anno` (the name shadows the
        builtin but is kept for interface compatibility).

    Raises
    ------
    KeyError
        If `model_file` is None or is neither an existing file nor directory.
    """
    # Validate the checkpoint path before building the model: the default
    # None would otherwise fail with a TypeError inside os.path.isfile().
    if model_file is None or not (os.path.isfile(model_file) or os.path.isdir(model_file)):
        raise KeyError("No checkpoint is found")

    torch.cuda.empty_cache()

    # Construct model graph
    model = build_siammot(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    # Load model params
    checkpointer = DetectronCheckpointer(cfg, model, save_dir=model_file)
    if os.path.isfile(model_file):
        _ = checkpointer.load(model_file)
    else:
        _ = checkpointer.load(use_latest=True)

    # Load testing dataset
    dataset_key = test_dataset
    dataset, modality = load_dataset_anno(cfg, dataset_key, set)
    dataset = sorted(dataset)

    # do inference on dataset
    data_filter_fn = build_data_filter_fn(dataset_key)

    # load public detection
    public_detection = None
    if cfg.INFERENCE.USE_GIVEN_DETECTIONS:
        public_detection = load_public_detection(cfg, dataset_key)

    dataset_inference = DatasetInference(cfg, model, dataset, output_dir, data_filter_fn, public_detection)
    dataset_inference()


def main(output_dir, config_file, test_dataset="MOT17_DPM", model_file=None):
    """
    Load a configuration file and run `test` on the given dataset.

    Parameters
    ----------
    output_dir : str
        Base output directory; the model name derived from the configuration
        is appended to it and the resulting directory is created if missing.
    config_file : str
        Configuration file merged into the global `cfg`, which is then frozen.
    test_dataset : str
        Dataset key forwarded to `test`.
    model_file : str, optional
        Checkpoint file or directory forwarded to `test`.
    """
    cfg.merge_from_file(config_file)
    cfg.freeze()

    model_name = get_model_name(cfg)
    output_dir = os.path.join(output_dir, model_name)
    if not os.path.exists(output_dir):
        mkdir(output_dir)

    # Argument order must match test(cfg, test_dataset, output_dir, ...).
    # The split is left at its default; the previous call passed the builtin
    # `set` type in its place.
    test(cfg, test_dataset, output_dir, model_file)

In [None]:


# main(outpath, config_file)

## Evaluation

In [None]:
def metrics_centroid_distance(pred, GT):
    """
    Distance between the centres of a predicted and a ground-truth box.

    Both boxes are given as (x_min, x_max, y_min, y_max). The value returned
    is the mean of the absolute x and y offsets between the two centres,
    (|dx| + |dy|) / 2, not the Euclidean distance.
    """
    xmin, xmax, ymin, ymax = pred
    Xmin, Xmax, Ymin, Ymax = GT

    # Centres computed directly: convert_corner_bb takes one box argument,
    # so the previous four-argument calls raised a TypeError.
    x, y = (xmin + xmax) / 2, (ymin + ymax) / 2
    X, Y = (Xmin + Xmax) / 2, (Ymin + Ymax) / 2

    return (abs(X - x) + abs(Y - y)) / 2

In [None]:
def visualize_predicted_box(img, xmin, xmax, ymin, ymax):
    """
    Draw one bounding box outline (green) on an intensity-rescaled image.

    The image is first rescaled from its own [min, max] range to [0, 1].
    Bounds are inclusive; `x` indexes the first image axis and `y` the second.
    Coordinates are truncated to int, so float boxes are accepted.
    """
    img = rescale_intensity(img, in_range=(np.min(img),np.max(img)), out_range=(0,1))
    xmin, xmax, ymin, ymax = int(xmin), int(xmax), int(ymin), int(ymax)

    mask = np.zeros(shape=img[:,:,0].shape, dtype=np.uint8)
    # Clamp at 0 so that negative bounds are not read as "from the end".
    mask[max(xmin, 0):max(xmax + 1, 0), max(ymin, 0):max(ymax + 1, 0)] = 1
    # mark_boundaries expects RGB components in the [0, 1] range.
    out = mark_boundaries(img, mask, color=(0, 1, 0))
    return out
    
def visualize_predicted_boxes(img, xmin, xmax, ymin, ymax, GTxmin, GTxmax, GTymin, GTymax):
    """
    Draw a predicted box (red) and a ground-truth box (green) on an image.

    The image is first rescaled from its own [min, max] range to [0, 1].
    Bounds are inclusive; `x` indexes the first image axis and `y` the second.
    Coordinates are truncated to int, so float boxes are accepted.
    """
    img = rescale_intensity(img, in_range=(np.min(img),np.max(img)), out_range=(0,1))
    mask_shape = img[:,:,0].shape

    def box_mask(x0, x1, y0, y1):
        # Binary mask of one box; clamp at 0 so that negative bounds are not
        # read as "from the end".
        x0, x1, y0, y1 = int(x0), int(x1), int(y0), int(y1)
        mask = np.zeros(shape=mask_shape, dtype=np.uint8)
        mask[max(x0, 0):max(x1 + 1, 0), max(y0, 0):max(y1 + 1, 0)] = 1
        return mask

    # mark_boundaries expects RGB components in the [0, 1] range.
    out = mark_boundaries(img, box_mask(xmin, xmax, ymin, ymax),
                          color=(1, 0, 0), background_label=2)
    out = mark_boundaries(out, box_mask(GTxmin, GTxmax, GTymin, GTymax),
                          color=(0, 1, 0), background_label=4)
    return out

In [None]:
directory = 'swan/'
path = '/content/drive/MyDrive/collab_2A_IMT/Project/siam-mot-main/sequences-train/'
# NOTE(review): this Drive root differs from the one used in the other cells
# (.../MCE/CompVi/...) — confirm which one is current.

# The frames (.bmp) and masks (.png) of a sequence sit in its own sub-folder,
# so point `path` at it before reading anything.
path += directory

# Total number of files in the folder. NOTE: this rebinds `nbFrames`, which
# is the per-object dict in the ingestion cells.
nbFrames = len(os.listdir(path))
print(nbFrames)

# Get GT bbs: one row per mask file found in the folder.
nb_swan_masks = sum(1 for filename in os.listdir(path) if filename.endswith('.png'))
bb_swan = get_bbs('swan', path, nb_swan_masks)

In [None]:
nbFrames=0 # to comment out when result is defined
met = [] #metrics values

# nbFrames counts all files of the folder, hence the halving.
for i in range(nbFrames//2):
    # Row i of bb_swan and entry i of mon_results both describe frame i + 1.
    GT = [int(v) for v in bb_swan[i]]
    pred = [int(v) for v in mon_results[i]] ### /!\ format [x,x,y,y]

    # Read the frame; files are numbered from 001 with three digits.
    name = 'swan-' + str(i + 1).zfill(3)
    img = plt.imread(path + name + '.bmp')

    curr_met = metrics_centroid_distance(pred, GT)
    met.append(curr_met)

    xmin, xmax, ymin, ymax = pred
    Xmin, Xmax, Ymin, Ymax = GT
    vis_swan = visualize_predicted_boxes(img, xmin, xmax, ymin, ymax, Xmin, Xmax, Ymin, Ymax)

    # One figure per frame (subplot indices start at 1, so the previous
    # plt.subplot(nbFrames, 1, 0) call was invalid).
    plt.figure(figsize=(15,7))
    plt.imshow(vis_swan)
    print('centroid distance :', curr_met)
    plt.axis('off')

In [None]:
plt.plot(met)
plt.show()

In [None]:
# img = Image.open('/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main/sequences-train/swan/swan-001.bmp')
# img_mask = Image.open('/content/drive/MyDrive/MCE/CompVi/Project/siam-mot-main/sequences-train/swan/swan-001.png')
# bb_swan_1 = get_bb_coos(img_mask)
# xmin, xmax, ymin, ymax = bb_swan_1
# print(bb_swan_1)
# vis_swan_1 = visualize_predicted_box(img, xmin, xmax, ymin, ymax)
# vis2_swan_1 = visualize_predicted_boxes(img, xmin, xmax, ymin, ymax, xmin+1, xmax+1, ymin+1, ymax+1)
# # plt.imshow(vis_swan_1)
# plt.imshow(vis2_swan_1)
# plt.axis('off')
# plt.show()