# Semantic Segmentation with OpenVINO and PySDK

In this notebook, the Segmenter OpenVINO and PySDK models are compared for inference.


### Prerequisites

In [None]:
import sys
from pathlib import Path

# Clone the Segmenter repo only when the "segmenter" directory is missing,
# so re-running this cell does not attempt a second clone
if not Path("segmenter").exists():
    !git clone https://github.com/rstrudel/segmenter
else:
    print("Segmenter repo already cloned")

# include path to Segmenter repo to use its functions
# (note: every run of this cell appends the same entry to sys.path again)
sys.path.append("./segmenter")

In [None]:
# Installing requirements
%pip install -q "openvino>=2023.1.0"
%pip install -r segmenter/requirements.txt

In [None]:
import numpy as np
import yaml

# Fetch the notebook utils script from the openvino_notebooks repo
# (the file is downloaded into the working directory each time this cell runs)
import urllib.request
urllib.request.urlretrieve(
    url='https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/master/notebooks/utils/notebook_utils.py',
    filename='notebook_utils.py'
)
# This import has to follow the download: notebook_utils.py is the file saved just above
from notebook_utils import download_file, load_image

In [None]:
# download config and pretrained model weights
# here we use tiny model, there are also better but larger models available in repository
WEIGHTS_LINK = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/segmenter/checkpoints/ade20k/seg_tiny_mask/checkpoint.pth"
CONFIG_LINK = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/segmenter/checkpoints/ade20k/seg_tiny_mask/variant.yml"

# all model artifacts (checkpoint, config, converted IR) live in this directory
MODEL_DIR = Path("model/")
MODEL_DIR.mkdir(exist_ok=True)

download_file(WEIGHTS_LINK, directory=MODEL_DIR, show_progress=True)
download_file(CONFIG_LINK, directory=MODEL_DIR, show_progress=True)

WEIGHT_PATH = MODEL_DIR / "checkpoint.pth"
# The config is published as "variant.yml" (see CONFIG_LINK), and the wrapper classes
# below also read "variant.yml"; the previous "variant.yaml" pointed to a non-existent file.
CONFIG_PATH = MODEL_DIR / "variant.yml"

### Loading PyTorch model
[back to top ⬆️](#Table-of-contents:)

PyTorch models are usually an instance of [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html) class, initialized by a state dictionary containing model weights.
Typical steps to get the model are therefore:

1. Create an instance of the model class
2. Load checkpoint state dict, which contains pre-trained model weights
3. Turn the model to evaluation mode, to switch some operations to inference mode

We will now use already provided helper functions from repository to initialize the model.

In [None]:
from segmenter.segm.model.factory import load_model

# load_model returns the network together with a config object;
# `config` is indexed by later cells (normalization name, image size)
pytorch_model, config = load_model(WEIGHT_PATH)
# put model into eval mode, to set it for inference
pytorch_model.eval()
print("PyTorch model loaded and ready for inference.")

Load normalization settings from config file.

In [None]:
from segmenter.segm.data.utils import STATS
# name of the normalization scheme stored in the model config
# (expected to be "vit" here since we are using a transformer)
normalization_name = config["dataset_kwargs"]["normalization"]
# look up that scheme in STATS; `preprocess` below reads its "mean" and "std" entries
normalization = STATS[normalization_name]

## Preparing preprocessing, inference support functions, and visualization functions
[back to top ⬆️](#Table-of-contents:)

Now we will define utility functions for preprocessing, inferencing, and visualizing the results.

### Preprocessing
[back to top ⬆️](#Table-of-contents:)

The inference input is a tensor with shape `[1, 3, H, W]` in `B, C, H, W` format, where:

* `B` - batch size (in our case 1, as we just add a leading axis with `np.expand_dims`)
* `C` - image channels (in our case RGB - 3)
* `H` - image height
* `W` - image width

Splitting to batches is done inside inference, so we don't need to split the image in preprocessing.

Model expects images in RGB channels format, scaled to [0, 1] range and normalized with given mean and standard deviation.

In [None]:
from PIL import Image
import cv2
import numpy as np
from scipy.special import softmax
import math

def resize(im, smaller_size):
    """
    Bilinearly upscale an image batch so that its shorter side equals `smaller_size`.

    Batches whose shorter side is already at least `smaller_size` are returned
    unchanged (the very same array object). The aspect ratio is preserved.

    :param im: image batch as a NumPy array in [B, C, H, W] layout
    :param smaller_size: minimum length required for the shorter spatial side
    :return: array in [B, C, H', W'] layout
    """
    h, w = im.shape[2:]
    if min(h, w) >= smaller_size:
        return im

    # keep the aspect ratio: the shorter side becomes exactly `smaller_size`
    if h < w:
        h_res, w_res = int(smaller_size), int(w / h * smaller_size)
    else:
        h_res, w_res = int(h / w * smaller_size), int(smaller_size)

    resized = []
    for single in im:
        # cv2.resize works on [H, W, C] arrays, so leave the CHW layout temporarily
        hwc = np.ascontiguousarray(np.transpose(single, (1, 2, 0)))
        # cv2 expects dsize as (width, height), not (height, width)
        out = cv2.resize(hwc, (w_res, h_res), interpolation=cv2.INTER_LINEAR)
        if out.ndim == 2:
            # cv2 drops the channel axis for single-channel input; restore it
            out = out[:, :, None]
        resized.append(np.transpose(out, (2, 0, 1)))
    return np.stack(resized)

def preprocess(im: Image, normalization: dict, window_size: int) -> np.ndarray:
    """
    Turn a PIL image into a normalized network input.

    The image is scaled to [0, 1], normalized per channel, rearranged from
    HWC into a single-element batch in CHW layout and finally passed through `resize`.

    :param im: input image
    :param normalization: dictionary containing normalization data ("mean" and "std")
    :param window_size: size handed to `resize` as the minimum shorter side
    :return:
            im: processed (scaled and normalized) image
    """
    # float32 copy of the pixel data, scaled to [0, 1]
    pixels = np.asarray(im, dtype=np.float32) / 255.0

    # per-channel statistics, broadcast over the two spatial axes;
    # the in-place operators keep the array in float32
    mean = np.array(normalization["mean"])[np.newaxis, np.newaxis, :]
    std = np.array(normalization["std"])[np.newaxis, np.newaxis, :]
    pixels -= mean
    pixels /= std

    # [H, W, C] -> [C, H, W] -> [1, C, H, W]
    batch = np.expand_dims(pixels.transpose(2, 0, 1), axis=0)

    # make sure the shorter side is at least as long as the window
    return resize(batch, window_size)

### Inference Support
The functions below implement a sliding window methodology for model inference.

In [None]:
def sliding_window(im, flip, window_size, window_stride):
    """
    Cut the input batch into square patches of side `window_size`.

    Patch origins are laid out every `window_stride` pixels; one extra origin
    per axis is always placed so that the final patch ends at the image border.

    :param im: array in [B, C, H, W] layout
    :param flip: flag stored unchanged in the result under "flip"
    :param window_size: side length of each square patch
    :param window_stride: step between consecutive patch origins
    :return: dict with keys "crop" (list of patches), "anchors" (list of
             (row, col) origins), "flip" and "shape" ((H, W) of the input)
    """
    _, _, height, width = im.shape
    last_top = height - window_size
    last_left = width - window_size

    # regular grid origins strictly before the last valid one, then the last one itself
    tops = [top for top in np.arange(0, height, window_stride) if top < last_top]
    tops.append(last_top)
    lefts = [left for left in np.arange(0, width, window_stride) if left < last_left]
    lefts.append(last_left)

    anchors = [(top, left) for top in tops for left in lefts]
    crops = [
        im[:, :, top : top + window_size, left : left + window_size]
        for top, left in anchors
    ]
    return {"crop": crops, "anchors": anchors, "flip": flip, "shape": (height, width)}

def bilinear_interpolation(original_img, new_shape):
    """
    Bilinear interpolation for masks in CHW format.

    Every output pixel is mapped back to a (possibly fractional) source
    coordinate and blended from the surrounding source pixels. Neighbor
    indices are clamped to the last row/column of the source.

    :param original_img: array in [C, H, W] layout
    :param new_shape: (new_h, new_w) of the output
    :return: array in [C, new_h, new_w] layout with the dtype of the input
    """
    # work in HWC so that indexing one pixel yields its full channel vector
    source = np.transpose(original_img, (1, 2, 0))
    src_h, src_w, channels = source.shape
    dst_h, dst_w = new_shape
    output = np.zeros((dst_h, dst_w, channels), dtype=source.dtype)

    # source pixels covered by one output pixel along each axis
    col_step = src_w / dst_w
    row_step = src_h / dst_h

    for out_row in range(dst_h):
        # the row coordinate is the same for every pixel in this output row
        src_row = out_row * row_step
        row_lo = math.floor(src_row)
        row_hi = min(src_h - 1, math.ceil(src_row))
        same_row = row_hi == row_lo

        for out_col in range(dst_w):
            src_col = out_col * col_step
            col_lo = math.floor(src_col)
            col_hi = min(src_w - 1, math.ceil(src_col))
            same_col = col_hi == col_lo

            if same_row and same_col:
                # lands exactly on a source pixel (or on the clamped corner)
                pixel = source[row_lo, col_lo, :]
            elif same_row:
                # only the column is fractional: blend left and right neighbors
                left = source[row_lo, col_lo, :]
                right = source[row_lo, col_hi, :]
                pixel = left * (col_hi - src_col) + right * (src_col - col_lo)
            elif same_col:
                # only the row is fractional: blend upper and lower neighbors
                upper = source[row_lo, col_lo, :]
                lower = source[row_hi, col_lo, :]
                pixel = (upper * (row_hi - src_row)) + (lower * (src_row - row_lo))
            else:
                # general case: blend along rows first, then along columns
                top_left = source[row_lo, col_lo, :]
                bottom_left = source[row_hi, col_lo, :]
                top_right = source[row_lo, col_hi, :]
                bottom_right = source[row_hi, col_hi, :]

                left = top_left * (row_hi - src_row) + bottom_left * (src_row - row_lo)
                right = top_right * (row_hi - src_row) + bottom_right * (src_row - row_lo)
                pixel = left * (col_hi - src_col) + right * (src_col - col_lo)

            output[out_row, out_col, :] = pixel

    # back to CHW
    return np.transpose(output, (2, 0, 1))

def merge_windows(windows, window_size, ori_shape):
    """
    Stitch per-patch model outputs back into one class-probability map.

    Patch outputs are summed at their anchor positions and divided by the
    number of patches covering each pixel. The averaged map is then resized
    to `ori_shape`, optionally flipped along the last axis, and passed
    through a softmax over the class axis.

    :param windows: dict with "seg_maps", "anchors", "shape" and "flip" entries
    :param window_size: side length of each square patch
    :param ori_shape: (height, width) of the returned map
    :return: array in [C, ori_shape[0], ori_shape[1]] layout
    """
    patches = windows["seg_maps"]
    origins = windows["anchors"]
    n_classes = patches[0].shape[0]
    height, width = windows["shape"]
    flipped = windows["flip"]

    summed = np.zeros((n_classes, height, width), dtype=np.float32)
    # how many patches contributed to each pixel
    hits = np.zeros((1, height, width), dtype=np.float32)
    for patch, (top, left) in zip(patches, origins):
        rows = slice(top, top + window_size)
        cols = slice(left, left + window_size)
        summed[:, rows, cols] += patch
        hits[:, rows, cols] += 1

    averaged = bilinear_interpolation(summed / hits, ori_shape)
    if flipped:
        averaged = np.flip(averaged, axis=2)
    return softmax(averaged, axis=0)

### Inference Function

The following code implements the main inference function, utilizing the inference support functions defined above to split the input image into patches, run the model inference on each patch, and merge together all outputs.

In [None]:
def inference(
    model,
    ims,
    ims_metas,
    ori_shape,
    window_size,
    window_stride,
    batch_size,
):
    """
    Run sliding-window inference over a list of images and average the results.

    :param model: object exposing `n_cls` and `forward(batch)`
    :param ims: list of input arrays in [B, C, H, W] layout
    :param ims_metas: list of dicts, one per image, each with a "flip" entry
    :param ori_shape: (height, width) of the returned map
    :param window_size: side length of the square patches fed to the model
    :param window_stride: step between consecutive patch origins
    :param batch_size: number of patches passed to `model.forward` per call
    :return: array in [n_cls, ori_shape[0], ori_shape[1]] layout, averaged over `ims`
    """
    n_classes = model.n_cls
    accumulated = np.zeros((n_classes, ori_shape[0], ori_shape[1]))
    for im, meta in zip(ims, ims_metas):
        windows = sliding_window(im, meta["flip"], window_size, window_stride)
        # stack the patches and keep only the first batch element of each
        crops = np.stack(windows.pop("crop"))[:, 0]
        n_crops = len(crops)
        patch_outputs = np.zeros((n_crops, n_classes, window_size, window_size))
        for start in range(0, n_crops, batch_size):
            stop = start + batch_size
            patch_outputs[start:stop] = model.forward(crops[start:stop])
        windows["seg_maps"] = patch_outputs
        accumulated += merge_windows(windows, window_size, ori_shape)
    accumulated /= len(ims)
    return accumulated

### Visualization
[back to top ⬆️](#Table-of-contents:)

Inference output contains labels assigned to each pixel, so the output in our case is `[150, H, W]` in `CL, H, W` format where:

* `CL` - number of classes for labels (in our case 150)
* `H` - image height
* `W` - image width

Since we want to visualize this output, we reduce dimensions to `[1, H, W]` where we keep only class with the highest value as that is the predicted label.
We then combine original image with colors corresponding to the inferred labels.

In [None]:
import sys
import yaml
sys.path.append("./segmenter")

from segmenter.segm.data.utils import IGNORE_LABEL
from segmenter.segm.data.ade20k import ADE20K_CATS_PATH

def seg_to_rgb(seg, colors):
    """
    Paint a 3-D label map with the color assigned to each label.

    :param seg: label array with three dimensions
    :param colors: mapping from integer label to a color array; when a color
                   array has more than one dimension only its first row is used
    :return: float32 array with the shape of `seg` plus a trailing axis of size 3
    """
    canvas_shape = (seg.shape[0], seg.shape[1], seg.shape[2], 3)
    canvas = np.zeros(canvas_shape, dtype=np.float32)
    for label in np.unique(seg):
        rgb = colors[int(label)]
        rgb = rgb[0] if len(rgb.shape) > 1 else rgb
        canvas[seg == label] = rgb
    return canvas

def dataset_cat_description(path, cmap=None):
    """
    Read category names and colors from a YAML category description file.

    :param path: path to a YAML file holding a sequence of category entries,
                 each with "name" and "id" keys and optionally a "color" key
    :param cmap: fallback color lookup indexed by category id, used for
                 entries without a "color" key
    :return: tuple (names, colors) where `names` is the list of category names
             in file order and `colors` maps category id to a float32 color
             array; IGNORE_LABEL is mapped to black
    """
    # context manager so the file handle is closed once parsing is done
    with open(path, "r") as f:
        desc = yaml.load(f, Loader=yaml.FullLoader)
    colors = {}
    names = []
    for cat in desc:
        names.append(cat["name"])
        if "color" in cat:
            # colors in the file are 0-255; scale them to [0, 1]
            colors[cat["id"]] = np.array(cat["color"], dtype=np.float32) / 255.0
        else:
            # cmap values are used as given, without rescaling
            colors[cat["id"]] = np.array(cmap[cat["id"]], dtype=np.float32)
    colors[IGNORE_LABEL] = np.array([0.0, 0.0, 0.0], dtype=np.float32)
    return names, colors

def apply_segmentation_mask(pil_im: Image, results: np.ndarray) -> Image:
    """
    Overlay the predicted segmentation on top of the input image.

    :param pil_im: original input image
    :param results: per-class scores for each pixel, class axis first
    :return:
            pil_blend: image with colored segmentation masks overlay
    """
    _, palette = dataset_cat_description(ADE20K_CATS_PATH)

    # winning class per pixel; keepdims leaves a leading axis so that
    # seg_to_rgb receives a 3-D label map
    labels = np.argmax(results, axis=0, keepdims=True)

    # label ids -> colors in [0, 1] -> 8-bit RGB
    colored = (255 * seg_to_rgb(labels, palette)).astype(np.uint8)
    overlay = Image.fromarray(colored[0])

    # equal-weight blend of the photo and the colored label map
    return Image.blend(pil_im, overlay, 0.5).convert("RGB")

## Convert PyTorch model to OpenVINO Intermediate Representation (IR)
[back to top ⬆️](#Table-of-contents:)

Now that the PyTorch model is loaded, we will convert it to OpenVINO IR format.

To do this, we first get input dimensions from the model configuration file and create torch dummy input.
Input dimensions are in our case `[1, 3, 512, 512]` in `[B, C, H, W]` format, where:

* `B` - batch size
* `C` - image channels (in our case RGB - 3)
* `H` - model input image height
* `W` - model input image width

> Note that H and W are here fixed to 512, as this is required by the model. Resizing is done inside the inference function from the original repository.

After that, we use the `ov.convert_model` function to convert the PyTorch model to an OpenVINO model, which is ready to use in the Python interface but can also be serialized to OpenVINO IR format for future execution using `ov.save_model`.
The process can generate some warnings, but they are not a problem.

In [None]:
import openvino as ov
import torch

# batch size and channel count are fixed here; only the spatial size is read from the config
batch_size = 1
channels = 3
image_size = config["dataset_kwargs"]["image_size"]

# dummy input with the static shape [batch_size, channels, image_size, image_size]
dummy_input = torch.randn(batch_size, channels, image_size, image_size)

# the same shape is passed as `input`, alongside the example tensor
model = ov.convert_model(pytorch_model, example_input=dummy_input, input=([batch_size, channels, image_size, image_size], ))
# serialize model for saving IR
ov.save_model(model, MODEL_DIR / "segmenter.xml")

## Verify converted model inference
[back to top ⬆️](#Table-of-contents:)

To test that the model was successfully converted, we can use the same inference function from the original repository, but we need to make a custom class.

`SegmenterOV` class contains OpenVINO model, with all attributes and methods required by inference function.
This way we don't need to write custom code required to process input when comparing to the PySDK model.

In [None]:
# download the test image with the notebook_utils helper
image = load_image("https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_hollywood.jpg")
# load_image reads the image in BGR format; [:,:,::-1] reverses the channel axis to get RGB
pil_image = Image.fromarray(image[:,:,::-1])

# preprocess image with normalization params loaded in previous steps
# (note: `image` is rebound here from the raw array to the preprocessed [1, C, H, W] batch)
image = preprocess(pil_image, normalization, 512)

# inference function needs some meta parameters, where we specify that we don't flip images in inference mode
im_meta = dict(flip=False)

In [None]:
class SegmenterOV:
    """
    Wrapper that exposes a compiled OpenVINO model through the interface used
    by the inference function (an `n_cls` attribute and a `forward` method).

    Attributes set by the constructor:

    * ``model`` - compiled OpenVINO model
    * ``output_blob`` - first output of the compiled model, used to pick the result
    * ``config`` - dict loaded from the ``variant.yml`` next to the model file
    * ``normalization`` - STATS entry named by the config
    * ``n_cls`` - number of classes to be predicted
    """

    def __init__(self, model_path: Path, device: str = "CPU"):
        """
        Read and compile the model, then load the accompanying configuration.

        :param model_path: path to model's .xml file; its directory must also contain variant.yml
        :param device: device string for selecting inference device
        """
        # read the IR and compile it for the requested device
        runtime = ov.Core()
        self.model = runtime.compile_model(runtime.read_model(model_path), device)
        self.output_blob = self.model.output(0)

        # the model config sits next to the .xml file
        with open(Path(model_path).parent / "variant.yml", "r") as config_file:
            self.config = yaml.load(config_file, Loader=yaml.FullLoader)

        # normalization specs and class count both come from the config
        dataset_kwargs = self.config["dataset_kwargs"]
        self.normalization = STATS[dataset_kwargs["normalization"]]
        self.n_cls = self.config["net_kwargs"]["n_cls"]

    def forward(self, data: np.ndarray) -> np.ndarray:
        """
        Run the compiled model on `data` and return its first output.

        :param data: input data to model
        :return: data inferred by model
        """
        outputs = self.model(data)
        return outputs[self.output_blob]

Now that we have created `SegmenterOV` helper class, we can use it in the inference function.

In [None]:
# select device from dropdown list for running inference using OpenVINO
import ipywidgets as widgets
import openvino as ov

# list the devices OpenVINO reports on this machine, plus the "AUTO" option
core = ov.Core()
device = widgets.Dropdown(
    options=core.available_devices + ["AUTO"],
    value='AUTO',
    description='Device:',
    disabled=False,
)

# bare expression so the notebook renders the widget; the choice is read later via device.value
device

In [None]:
# load the saved IR into the SegmenterOV wrapper, compiled for the device selected in the dropdown
model_ov = SegmenterOV(MODEL_DIR / "segmenter.xml", device.value)

In [None]:
# run the sliding-window inference function defined above with the OpenVINO wrapper;
# ori_shape is the spatial size of the preprocessed image, window settings come from the model config
results_ov = inference(model=model_ov,
                    ims=[image],
                    ims_metas=[im_meta],
                    ori_shape=image.shape[2:4],
                    window_size=model_ov.config["inference_kwargs"]["window_size"],
                    window_stride=model_ov.config["inference_kwargs"]["window_stride"],
                    batch_size=1)

In [None]:
# combine the OpenVINO segmentation mask with the original image
converted_blend = apply_segmentation_mask(pil_image, results_ov)

# show image with segmentation mask overlay
converted_blend

## Run PySDK model inference
[back to top ⬆️](#Table-of-contents:)

We can use the same inference function from the original repository, but for that we need to make a custom class similar to `SegmenterOV`.

`SegmenterPySDK` class contains PySDK model, with all attributes and methods required by the inference function.

In [None]:
import os

import degirum as dg

target = "@cloud"
cloud_zoo_url = "https://cs.degirum.com/degirum/openvino_demos"
# Prefer the DEGIRUM_CLOUD_TOKEN environment variable so the token never has to be
# saved inside the notebook; the placeholder remains as a fallback for manual editing.
cloud_token = os.environ.get("DEGIRUM_CLOUD_TOKEN", "<your DeGirum Cloud token here>")

zoo = dg.connect(target, cloud_zoo_url, cloud_token)
model_loaded = zoo.load_model("segmenter--512x512_float_openvino_cpu_1")
# SegmenterPySDK.forward passes raw tensor bytes to the model, hence the "RAW" input format
model_loaded.input_image_format = "RAW"
model_loaded.image_backend = "pil"

In [None]:
from pathlib import Path
import yaml

class SegmenterPySDK:
    """
    Class containing PySDK model with all attributes required to work with inference function.

    :param model: PySDK model object
    :param config: config file containing data about model and its requirements
    :type config: dict
    :param n_cls: number of classes to be predicted
    :type n_cls: int
    :param normalization: STATS entry named by the config
    :type normalization: dict

    """

    def __init__(self, model, variant_path=Path("model") / "variant.yml"):
        """
        Constructor method.
        Stores the PySDK model and sets all required attributes

        :param model: PySDK model object
        :param variant_path: path to the model's variant.yml config file; defaults to
            the copy in the notebook's "model" directory instead of a
            machine-specific absolute path
        """
        self.model = model

        # load model configs
        with open(Path(variant_path), "r") as f:
            self.config = yaml.load(f, Loader=yaml.FullLoader)

        # load normalization specs from config
        normalization_name = self.config["dataset_kwargs"]["normalization"]
        self.normalization = STATS[normalization_name]

        # load number of classes from config
        self.n_cls = self.config["net_kwargs"]["n_cls"]

    def forward(self, data: np.ndarray) -> np.ndarray:
        """
        Perform inference on data and return the model's result data

        :param data: input batch in [B, C, H, W] layout
        :return: the 'data' field of the first entry in the model's results
        """
        # reorder to [B, H, W, C] and hand the tensor to the model as raw bytes
        raw_input = np.transpose(data, (0, 2, 3, 1)).tobytes()
        return self.model(raw_input).results[0]['data']

In [None]:
# wrap the loaded PySDK model so it offers the interface used by the inference function
model_pysdk = SegmenterPySDK(model_loaded)

In [None]:
# run the same sliding-window inference function, this time with the PySDK wrapper,
# on the same preprocessed image and with the same arguments as the OpenVINO run
results_pysdk = inference(model=model_pysdk,
                    ims=[image],
                    ims_metas=[im_meta],
                    ori_shape=image.shape[2:4],
                    window_size=model_pysdk.config["inference_kwargs"]["window_size"],
                    window_stride=model_pysdk.config["inference_kwargs"]["window_stride"],
                    batch_size=1)

In [None]:
# combine the PySDK segmentation mask with the original image
# (note: this rebinds `converted_blend`, replacing the OpenVINO overlay created earlier)
converted_blend = apply_segmentation_mask(pil_image, results_pysdk)

# show image with segmentation mask overlay
converted_blend

### Compare OpenVINO and PySDK results

In [52]:
# element-wise comparison of the two probability maps within the given relative/absolute tolerances
np.allclose(results_pysdk, results_ov, rtol=1e-05, atol=1e-05)

True

As we can see, we get effectively the same results for both the OpenVINO and PySDK models.