Copyright &copy; CAMMA, ICube, University of Strasbourg. All Rights Reserved.

<div>
<a href="https://cholectriplet2022.grand-challenge.org/">
<img src="https://rumc-gcorg-p-public.s3.amazonaws.com/b/649/banner2022.x10.jpeg" align="left"/>
</a>
</div>

<h1><center>Getting Started</center></h1>


# Introduction

In this notebook, we provide some sample code to help you familiarize yourself with the challenge, the dataset, and the metrics. These are minimal examples that illustrate a simple deep learning pipeline applied to a small subset of the Action Triplet dataset, **CholecT50**.

# Data Loading and Visualization

In [None]:
# Import necessary libraries for this module

from PIL import Image

import csv
import matplotlib.pyplot as plt
import numpy as np
import os
from glob import glob
import json
import random
from matplotlib import patches


print("Libraries successfully imported!")

Here, we use a small subset of the CholecT50 dataset available at this link: https://seafile.unistra.fr/f/ba1427a82ecc4ce18566/?dl=1. If you are running this notebook on Colab, you can run the cell below to download and unzip the sample dataset to the current directory.

In [None]:
# Ignore this cell if you have already downloaded and extracted the dataset

!wget -O CholecT50_sample.zip https://seafile.unistra.fr/f/ba1427a82ecc4ce18566/?dl=1
!unzip -o CholecT50_sample.zip

print("Dataset successfully extracted!")

In [None]:
# Change the dataset_path below appropriately if you have extracted the data to a different directory

dataset_path = './CholecT50_sample/'

data_path = os.path.join(dataset_path, 'data')
triplet_path = os.path.join(dataset_path, 'triplet')
dict_path = os.path.join(dataset_path, 'dict')
# os.listdir returns entries in arbitrary order; sort them so that the videos are
# always visited in the same order from one run (or machine) to the next.
video_names = sorted(os.listdir(data_path))

print("Dataset paths successfully defined!")

In [None]:
# Build a lookup table from integer triplet id to its human readable label.
# Every line of triplet.txt is parsed as "<id>:<label>".

with open(os.path.join(dict_path, 'triplet.txt'), 'r') as f:
  triplet_info = f.readlines()

triplet_dict = {}
for entry in triplet_info:
  raw_id, raw_label = entry.split(':')
  triplet_dict[int(raw_id)] = raw_label.rstrip()

# Show one randomly picked entry of the table
print('Random triplet id and its human readable label\n')
random_triplet_id = np.random.choice(list(triplet_dict.keys()))
print('Triplet id: ', random_triplet_id, '\nReadable label: ', triplet_dict[random_triplet_id])

In [None]:
def generator(data_path, triplet_path, video_names, batch_size, shuffle_videos=False):
  """ Defines a simple generator that yields sequential batches of input images and their
      corresponding triplet labels, video names, and frame ids.

      Frames are visited in the order in which they are listed in each video's label file.
      Batches are not reset between videos, so a batch may contain frames from more than
      one video. Frames left over at the very end that do not fill a complete batch are
      not yielded.
        Args:
          data_path:     Path to directory containing a folder for each video
          triplet_path:  Path to folder containing a txt file for each video
                         listing the frame id, and binary label for all of the 100 considered
                         triplet classes
          video_names:   Names of the videos that will be returned by this generator. These names
                         should correspond to a folder in data_path and a txt file in triplet_path.
                         The sequence passed in is never modified.
          batch_size:    Batch size of outputs yielded by the generator
          shuffle_videos:To perform a shuffling of videos (Note: frames will be returned sequentially!)
        Yields:
          image batch     : List of images (float32 arrays, pixel values divided by 255)
          triplet batch   : List of triplet label vectors (int64 arrays)
          video_name_batch: List of video name strings
          frame_id_batch  : List of integer frame ids
    """

  # Work on a private copy: np.random.shuffle shuffles in place and returns None, so its
  # result must not be assigned, and the caller's list should keep its original order.
  video_names = list(video_names)
  if shuffle_videos:
    np.random.shuffle(video_names)

  image_batch, triplet_batch, video_name_batch, frame_id_batch = [], [], [], []

  for video_name in video_names:
    with open(os.path.join(triplet_path, video_name + '.txt'), mode='r') as infile:
      reader = csv.reader(infile)

      for line in reader:
        # First column is the frame id, the remaining columns are the triplet labels
        line = np.array(line, np.int64)
        frame_id, triplet_label = line[0], line[1:]
        image_path = os.path.join(data_path, video_name, "%06d.png" % frame_id)
        image = np.array(Image.open(image_path), np.float32) / 255.0

        image_batch.append(image)
        triplet_batch.append(triplet_label)
        video_name_batch.append(video_name)
        frame_id_batch.append(int(frame_id))

        if len(frame_id_batch) == batch_size:
          yield image_batch, triplet_batch, video_name_batch, frame_id_batch
          # Start fresh lists so that the batch just handed out is not mutated
          image_batch, triplet_batch, video_name_batch, frame_id_batch = [], [], [], []
            

In [None]:
batch_size = 8
gen = generator(data_path, triplet_path, video_names, batch_size)

# Only the first batch is visualized. The video names of the batch get their own variable
# so that the global `video_names` list (the input of the generator) is not overwritten;
# otherwise re-running this cell would feed a batch of repeated names to the generator.
# `images` and `triplet_labels` are reused by the model cells further down.
for images, triplet_labels, batch_video_names, frame_ids in gen:
  for idx in range(len(images)):
    print('\nVisualizing image...\n')
    print('Video name: ', batch_video_names[idx], ' Frame_id', frame_ids[idx])
    plt.imshow(images[idx])
    plt.show()
    print('\nEncoding showing which of the 100 considered action triplets are represented in the image\n')
    print(triplet_labels[idx])
    print('\nReadable labels\n')
    # Indices of the non-zero entries are the ids of the triplets present in the frame
    for triplet in np.where(triplet_labels[idx])[0]:
      print(triplet_dict[triplet])

  break

#  Building and Running Models 

We build a simple convolutional network with a few layers and perform a forward pass of an image through it to predict the probability that each of the considered triplets is represented in the input image.

Note: Please run the cells in the previous module Data Loading and Visualization before running this module

In [None]:
# Import necessary libraries for this module. You can skip ahead if you prefer a PyTorch based example.

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

print("Libraries successfully imported!")

Defining a simple neural network using tf.keras. You can skip ahead if you prefer to use torch.nn

In [None]:
# Defining the neural network architecture: five strided 3x3 convolutions followed by
# three fully connected layers, the last one producing 100 sigmoid outputs.
conv_layers = [
    tf.keras.layers.Conv2D(
        filters=16, kernel_size=3, strides=2, activation="relu", input_shape=(480, 854, 3)
    )
]
conv_layers += [
    tf.keras.layers.Conv2D(filters=32, kernel_size=3, strides=2, activation="relu")
    for _ in range(4)
]
dense_layers = [
    tf.keras.layers.Dense(units=4096, activation="relu"),
    tf.keras.layers.Dense(units=2048, activation="relu"),
    tf.keras.layers.Dense(units=100, activation="sigmoid"),
]
model = tf.keras.Sequential(conv_layers + [tf.keras.layers.Flatten()] + dense_layers)

print("Neural network architecture successfully defined!")

In [None]:
model.build([1, 480, 854, 3])
model.summary()

Performing a simple forward pass

In [None]:
# Use the first image (and its label) of the batch loaded in the previous module
sample_image, sample_label = images[0], triplet_labels[0]
input_4d = np.expand_dims(sample_image, axis=0)  # add the leading batch axis

print('Performing a simple forward pass on our untrained network for a test image')
plt.imshow(sample_image)
plt.show()

print('\nPrediction\n')
print(model.predict(input_4d)[0])

print('\nLabel\n')
print(sample_label)

print('\nReadable label\n')
for triplet_id in np.where(sample_label)[0]:
  print(triplet_dict[triplet_id])

[**OPTIONAL**] Using PyTorch to make a prediction

In [None]:
# Import necessary libraries for this module.

import torch
import numpy as np
from torch import nn
from torch.nn import Module

In [None]:
class MyModel(Module):
    """ A small convolutional network that maps a batch of RGB images (N, 3, h, w) to
        100 sigmoid outputs.

        Layer sequence: conv1 -> ReLU -> pool1 -> conv2 -> ReLU -> pool2 -> conv3 -> ReLU
        -> pool3 -> conv4 -> flatten -> linear -> sigmoid.
        Args:
            h: height (in pixels) of the images that will be fed to the model
            w: width (in pixels) of the images that will be fed to the model
        Raises:
            ValueError: if h or w is too small to survive the convolution/pooling stack
    """

    @staticmethod
    def _feature_size(size):
        """ Return the length of one spatial axis after conv1..conv4 and pool1..pool3. """
        size = size - 2               # conv1: 3x3 kernel, stride 1, no padding
        size = (size - 3) // 2 + 1    # pool1: 3x3 window, stride 2
        size = size - 2               # conv2
        size = (size - 2) // 2 + 1    # pool2: 2x2 window, stride 2
        size = size - 2               # conv3
        size = (size - 2) // 2 + 1    # pool3: 2x2 window, stride 2
        size = size - 2               # conv4
        return size

    # define model elements
    def __init__(self, h, w):
        super(MyModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, (3,3))
        self.conv2 = nn.Conv2d(32, 64, (3,3))
        self.conv3 = nn.Conv2d(64, 128, (3,3))
        self.conv4 = nn.Conv2d(128, 256, (3,3))
        self.pool1 = nn.MaxPool2d((3,3), stride=(2,2))
        self.pool2 = nn.MaxPool2d((2,2), stride=(2,2))
        self.pool3 = nn.MaxPool2d((2,2), stride=(2,2))
        # Exact size of the conv4 feature map. The shortcut int(h/8 - 4) is only an
        # approximation and is off by one for some sizes (e.g. 63 -> 3 instead of 4),
        # which makes the linear layer below reject the flattened features.
        self.h   = self._feature_size(int(h))
        self.w   = self._feature_size(int(w))
        if self.h <= 0 or self.w <= 0:
            raise ValueError(
                "Input of size {}x{} is too small for this network".format(h, w)
            )
        self.mlp = nn.Linear(self.h*self.w*256, 100)
        self.act1 = nn.ReLU()
        self.act2 = nn.ReLU()
        self.act3 = nn.ReLU()
        self.act4 = nn.Sigmoid()

    # forward propagate input
    def forward(self, X):
        # first convolutional stage
        X = self.conv1(X)
        X = self.act1(X)
        X = self.pool1(X)
        # second convolutional stage
        X = self.conv2(X)
        X = self.act2(X)
        X = self.pool2(X)
        # third convolutional stage
        X = self.conv3(X)
        X = self.act3(X)
        X = self.pool3(X)
        # fourth convolution (no activation or pooling is applied after it)
        X = self.conv4(X)
        # flatten each sample to a vector of h*w*256 features
        X = X.view(-1, self.h*self.w*256)
        # output layer
        X = self.mlp(X)
        X = self.act4(X)
        return X

In [None]:
# Use the first image (and its label) of the batch loaded in the first module
sample_image, sample_label = images[0], triplet_labels[0]

# Add the batch axis, then convert to channel first (NHWC --> NCHW) before wrapping in a tensor
input_4d = torch.from_numpy(
  np.transpose(np.expand_dims(sample_image, axis=0), [0, 3, 1, 2])
)

print('Performing a simple forward pass on our untrained network for a test image')
plt.imshow(sample_image)
plt.show()

print('\nPrediction\n')
model  = MyModel(480, 854)
print(model(input_4d)[0])

print('\nLabel\n')
print(sample_label)

print('\nReadable label\n')
for triplet_id in np.where(sample_label)[0]:
  print(triplet_dict[triplet_id])

#  Spatial detection

In [None]:
# Ignore if you have already downloaded and extracted the spatial detection samples

!wget -O sample-spatial.zip https://seafile.unistra.fr/f/912c37e649a249aaa604/?dl=1
!unzip -o sample-spatial.zip

print("Dataset successfully extracted!")

In [None]:
# Loading images, annotations, instrument lookup table

# Map integer frame id (file name without extension) to the image path
imgs = sorted(glob("sample-spatial/video/*.png"))
img_dict = {
  int(os.path.basename(path).split(".")[0]): path
  for path in imgs
}

# The JSON keys are strings; convert them to integer frame ids
with open("sample-spatial/label.json") as f:
  annotations = {
    int(k): v
    for k, v in json.load(f).items()
  }

# Read the instrument names from the dictionary folder configured in `dict_path`, so that
# this cell keeps working when the dataset was extracted to a non-default location.
# Blank lines are skipped. The table maps instrument name -> integer instrument id.
with open(os.path.join(dict_path, "instrument.txt")) as f:
  instrument_buf = [
    lin.strip().split(":") for lin in f.readlines() if lin.strip()
  ]
  instrument_table = {
    u[1]: int(u[0])
    for u in instrument_buf
  }

In [None]:
# One random detection

def one_random_detection():
  """ Draw one random detection.
      Returns:
        A list [triplet_id, tool_id, tool_prob, bbox_x, bbox_y, bbox_w, bbox_h], where
        tool_id is looked up from the instrument name (the part of the triplet label
        before the first comma) and the remaining values are sampled uniformly.
  """
  triplet_id = np.random.randint(0, 100)
  instrument_name = triplet_dict[triplet_id].split(",")[0]
  tool_id = instrument_table[instrument_name]
  tool_prob = np.random.uniform(0, 0.6)
  bbox_xy = np.random.uniform(0, 0.6, size=2)
  bbox_w = np.random.uniform(0.15, 0.4)
  bbox_h = np.random.uniform(0.15, 0.4)
  return [triplet_id, tool_id, tool_prob, bbox_xy[0], bbox_xy[1], bbox_w, bbox_h]

# Random detections for one image.
# This function is a placeholder for an object detector, hence the unused "img" argument.

def generate_random_detections(img):
  """ Return a list of one or two random detections; `img` is not used. """
  n_detect = np.random.randint(1, 3)  # upper bound is exclusive
  return [one_random_detection() for _ in range(n_detect)]

In [None]:
# Pick one annotated frame at random and display it
img_id = random.choice(list(annotations.keys()))

im = Image.open(img_dict[img_id])
true_w, true_h = im.size
fig, ax = plt.subplots(figsize=(16, 9))
ax.set_axis_off()
ax.imshow(im)

im_gt = annotations[img_id]
im_pd = generate_random_detections(im)

# Visualize ground truth bboxes in green.
# Entries 2..5 of "instrument" are x, y, width, height, scaled here by the image size.
for gt_box in im_gt:
  rel_x, rel_y, rel_w, rel_h = (gt_box["instrument"][k] for k in range(2, 6))
  left, top = rel_x * true_w, rel_y * true_h
  gt_rect = patches.Rectangle(
    (left, top),
    rel_w * true_w,
    rel_h * true_h,
    fill=False,
    edgecolor="green",
    lw=3
  )
  ax.add_patch(gt_rect)
  gt_caption = "#{}: {}".format(gt_box["triplet"], triplet_dict[gt_box["triplet"]])
  ax.text(
    left,
    top + 12,
    gt_caption,
    color="white",
    fontsize=12,
    bbox={"facecolor": "green", "alpha": 0.4}
  )

# Visualize predicted bboxes in blue.
# Each prediction is [triplet_id, tool_id, tool_prob, x, y, width, height].
for pd_box in im_pd:
  pred_id, pred_prob = pd_box[0], pd_box[2]
  left, top = pd_box[3] * true_w, pd_box[4] * true_h
  pd_rect = patches.Rectangle(
    (left, top),
    pd_box[5] * true_w,
    pd_box[6] * true_h,
    fill=False,
    edgecolor="blue",
    lw=3
  )
  ax.add_patch(pd_rect)
  pd_caption = "#{}: {} - {:.01f}%".format(pred_id, triplet_dict[pred_id], 100 * pred_prob)
  ax.text(
    left,
    top + 12,
    pd_caption,
    color="white",
    fontsize=12,
    bbox={"facecolor": "blue", "alpha": 0.4}
  )

#  Metrics and Evaluation 


Models will be evaluated using the `ivtmetrics` module, which is specifically designed for surgical action triplet recognition and spatial detection. More information is available at: [https://github.com/CAMMA-public/ivtmetrics](https://github.com/CAMMA-public/ivtmetrics).

In [None]:
# Installation. If already installed you may skip this cell.

!pip install ivtmetrics

In [None]:
# Import necessary libraries for this module

import numpy as np
import ivtmetrics

In [None]:
# Example usage. The ground truth below is synthetic (blocks of all-zero and all-one
# labels) and the predictions are random scores.

half_0s = np.zeros((20, 100), dtype=np.int64)
half_1s = np.ones((20, 100), dtype=np.int64)

# Ground truth: 40 frames x 100 classes for each of the two videos
vid230_gt = np.vstack([half_0s, half_1s])
vid231_gt = np.vstack([half_1s, half_0s])

# Predictions: one random score per frame and class
vid230_pd = np.random.random((40, 100))
vid231_pd = np.random.random((40, 100))

# One (ground truth, prediction) pair per video
videos = [(vid230_gt, vid230_pd), (vid231_gt, vid231_pd)]

# Initialize metric accumulators
recognize = ivtmetrics.Recognition(num_class=100)


In [None]:
# loop over videos
for video_gt, video_pd in videos:
  # loop over the frames of the video, fed as batches of size 1
  for frame_gt, frame_pd in zip(video_gt, video_pd):
    # accumulate results
    recognize.update(
      np.expand_dims(frame_gt, axis=0),
      np.expand_dims(frame_pd, axis=0)
    )
  # signal end of video to accumulators
  recognize.video_end()

# One report line per component: (output template, component key)
report_rows = [
  ("Instrument mAP            : {}", "i"),
  ("Verb mAP                  : {}", "v"),
  ("Target mAP                : {}", "t"),
  ("Instrument-Verb mAP       : {}", "iv"),
  ("Instrument-Target mAP     : {}", "it"),
  ("Instrument-Verb-Target mAP: {}", "ivt"),
]
for template, component in report_rows:
  print(template.format(
    recognize.compute_video_AP(component, ignore_null=True)["mAP"])
  )

In [None]:
# Per-class AP results are also available

recognize.compute_video_AP("ivt", ignore_null=True)["AP"]

In [None]:
# To try ivtmetrics's spatial detection metrics, we reuse the content from the "Spatial detection" section.

# Correct format for labels
def format_labels(label_dict):
  """ Convert one annotation dictionary into a flat 7-element list.
      Args:
        label_dict: dictionary whose "instrument" entry is a sequence of at least 6 values
      Returns:
        [int(v0), int(v1), 1.0, v2, v3, v4, v5], where v0..v5 are the first six values of
        label_dict["instrument"]. v2..v5 are the values used as box x, y, width and height
        in the visualization; the constant 1.0 sits in the slot that holds the probability
        in a predicted detection.
        NOTE(review): v0 and v1 are presumably class ids - confirm against the label format.
  """
  instrument = label_dict["instrument"]
  head = [int(instrument[0]), int(instrument[1]), 1.0]
  return head + [instrument[k] for k in range(2, 6)]

# Frames of the spatial detection sample, in the same order as the annotations.
# They are kept in a dedicated variable so that `images` (the image batch used by the
# forward-pass cells of the "Building and Running Models" section) is not overwritten.
spatial_images = [
  Image.open(img_dict[iid])
  for iid in annotations.keys()
]

# Ground truth: one list of formatted boxes per frame
vid_gt = [
  [format_labels(lb) for lb in frame_annotations]
  for frame_annotations in annotations.values()
]

# Predictions: random detections, one list per frame
vid_pd = [generate_random_detections(im) for im in spatial_images]

In [None]:
# Initialize detection buffers
detect = ivtmetrics.Detection(num_class=100)

# Only one video available in this example.
# detect.update takes frames in batches, so each frame is wrapped in a list (batch size 1).
for frame_gt, frame_pd in zip(vid_gt, vid_pd):
  detect.update([frame_gt], [frame_pd])
# Signal end of video to accumulators
detect.video_end()

# One report line per metric: (output template, key of the result dictionary)
detection_rows = (
  ("Instrument-Verb-Target mAP: {}", "mAP"),
  ("Instrument-Verb-Target mean recall: {}", "mRec"),
  ("Instrument-Verb-Target mean precision: {}", "mPre"),
)
for template, metric_key in detection_rows:
  print(template.format(
    detect.compute_video_AP("ivt")[metric_key])
  )
# Since detections are randomly generated in this example, performance is most likely 0

#  Saving Results


A minimal working example to save your model results using a docker image is provided here: https://seafile.unistra.fr/f/a495966e56e84bf0a834/?dl=1

# Supplementary

Optionally, if you want to incorporate labels for instruments, verbs and targets into your modelling, you can use the following code to decompose a triplet id into its corresponding instrument, verb and target ids, and vice versa.

In [None]:
# Import necessary libraries for this module

import numpy as np
import os

In [None]:
# Ignore this cell if you have already downloaded and extracted the dataset

!wget -O CholecT50_sample.zip https://seafile.unistra.fr/f/ba1427a82ecc4ce18566/?dl=1
!unzip -o CholecT50_sample.zip

print("Dataset successfully extracted!")

In [None]:
# Number of classes for each label type (used as array dimensions by the projection functions below)
num_instrument = 6
num_verb     = 10
num_target   = 15
num_triplet  = 100
# Comma-separated lookup table, loaded as an integer array; lines starting with '#' are ignored.
# NOTE(review): presumably one row per triplet id, with the columns laid out as indexed by
# map_selector() (ivt, i, v, t, iv, it, vt) - confirm against maps.txt.
map_dict_url = './CholecT50_sample/dict/maps.txt'
maps_dict    = np.genfromtxt(map_dict_url, dtype=int, comments='#', delimiter=',', skip_header=0)

In [None]:
def map_selector(component='iv'):
    """ Return the column index used for a component name.
        Args:
            component: one of 'ivt', 'i', 'v', 't', 'iv', 'it', 'vt'
        Returns:
            the position of the component in the list above; 0 for any other value.
    """
    column_order = ('ivt', 'i', 'v', 't', 'iv', 'it', 'vt')
    column_of = {name: position for position, name in enumerate(column_order)}
    return column_of.get(component, 0)

def decompose(inputs, component='i'):
    """ Extract the component labels from the triplets. E.g.: from a triplet vector [100], get the vector [6] of the used instruments (i), or the vector [15] of the targets acted upon.
        Each output entry is the maximum of the input values of all triplets that share the same component class.
        Args:
            inputs: a 1D array of dimension (n), where n = number of triplet classes;
                    with values int(0 or 1) for target labels and float[0, 1] for predicted labels.
            component: a string for the component to extract;
                    (e.g.: i for instrument, v for verb, t for target, iv for instrument-verb pair, it for instrument-target pair and vt (unused) for verb-target pair)
        Returns:
            output: a list with one value per distinct class of the component, ordered by increasing class id.
    """
    column = maps_dict[:, map_selector(component)]
    output = []
    for class_id in np.unique(column):  # np.unique returns the ids in ascending order
        members = np.flatnonzero(column == class_id).tolist()
        output.append(np.max(inputs[members]))
    return output


In [None]:
# Create dictionary mapping triplet ids, instrument ids, verb ids and target ids to readable label

def load_label_dict(path):
  """ Parse a text file with one "<id>:<label>" entry per line.
      Args:
        path: path of the text file to read
      Returns:
        dictionary mapping each integer id to its label string
  """
  labels = {}
  with open(path, 'r') as f:
    for line in f:
      if not line.strip():
        continue  # tolerate blank lines, e.g. at the end of the file
      # Split on the first ':' only, so that a label may itself contain a colon
      label_id, label = line.split(':', 1)
      labels[int(label_id)] = label.rstrip()
  return labels


dict_path = './CholecT50_sample/dict/'

triplet_dict = load_label_dict(os.path.join(dict_path, 'triplet.txt'))
instrument_dict = load_label_dict(os.path.join(dict_path, 'instrument.txt'))
verb_dict = load_label_dict(os.path.join(dict_path, 'verb.txt'))
target_dict = load_label_dict(os.path.join(dict_path, 'target.txt'))

In [None]:
print('Random triplet id and its human readable label\n')
random_triplet_id = np.random.choice(list(triplet_dict.keys()))
print('Triplet id: ', random_triplet_id, '\nReadable label: ', triplet_dict[random_triplet_id])

# One-hot encode the chosen triplet
one_hot_triplet = np.zeros(100)
one_hot_triplet[random_triplet_id] = 1

# For each component, the id is the position of the first non-zero entry of the decomposed vector
instrument_id, verb_id, target_id = (
  np.where(decompose(one_hot_triplet, part))[0][0]
  for part in ('i', 'v', 't')
)

print('Instrument id: ', instrument_id, '\nReadable label: ', instrument_dict[instrument_id])
print('Verb id: ', verb_id, '\nReadable label: ', verb_dict[verb_id])
print('Target id: ', target_id, '\nReadable label: ', target_dict[target_id])

Additionally, if you want to learn the triplet as a 3D matrix of interaction as done in [Nwoye C.I. et al., Recognition of Instrument-Tissue Interactions in Endoscopic Videos via Action Triplets, MICCAI 2020](https://arxiv.org/abs/2007.05405), we provide code below to map a vector of triplet ids to its corresponding 3D matrix and vice versa.

In [None]:

def project_1d_to_3d(inputs):
    """ Convert triplet labels from a 1D vector to a 3D matrix of interaction.
        Args:
            inputs: a 1D vector of dimension (n), where n = number of triplet classes;
            with values int(0 or 1) for target labels and float[0, 1] for predicted labels.
        Returns:
            output: float32 3D matrix of dimension (nI, nV, nT);
            where nI = number of instrument classes,  nV = number of verb classes,  nT = number of target classes.
            Cells that correspond to no triplet stay 0.
    """
    volume = np.zeros([num_instrument, num_verb, num_target], dtype=np.float32)
    for triplet_idx, score in enumerate(inputs):
        # columns 1, 2 and 3 of the map hold the instrument, verb and target ids of the triplet
        i_idx, v_idx, t_idx = maps_dict[triplet_idx, 1:4]
        volume[i_idx, v_idx, t_idx] = score
    return volume
    
def project_3d_to_1d(self, inputs=None):
    """ Convert triplet labels from a 3D matrix of interaction to a 1D vector.

        This is a plain function, not a method: the first parameter is only named ``self``
        for backward compatibility. Call it as ``project_3d_to_1d(matrix)``; the legacy
        two-argument form ``project_3d_to_1d(anything, matrix)`` is still accepted, in which
        case the first argument is ignored.
        Args:
            self: the 3D matrix when called with a single argument (ignored otherwise)
            inputs: a 3D matrix of dimension (nI, nV, nT);
            where nI = number of instrument classes,  nV = number of verb classes,  nT = number of target classes
        Returns:
            output: float32 1D vector of dimension (n), where n = number of triplet classes;
            with values int(0 or 1) for target labels and float[0, 1] for predicted labels.
    """
    if inputs is None:
        # Single-argument call: the matrix was received through the first parameter
        inputs = self
    d1   = np.zeros([num_triplet], dtype=np.float32)
    for idx in range(num_triplet):
        # columns 1, 2 and 3 of the map hold the instrument, verb and target ids of the triplet
        d1[idx] = inputs[maps_dict[idx,1], maps_dict[idx,2], maps_dict[idx,3]]
    return d1