# Putting the Object Back into Video Object Segmentation
# Colab Demo

[[arXiv]](https://arxiv.org/abs/2310.12982) [[PDF]](https://arxiv.org/pdf/2310.12982.pdf) [[Code]](https://github.com/hkchengrex/Cutie) [[Project Page]](https://hkchengrex.github.io/Cutie/)

![title](https://camo.githubusercontent.com/84482c6f65f93339699387c6880640bf5213583ceca2f5658c423dc1d68ab8a9/68747470733a2f2f696d6775722e636f6d2f364b3742675a372e706e67)

![overview](https://camo.githubusercontent.com/53c8662cecfbd61e1e06d08cfe086333cbcb365170ad56f6a62e9d55aa7a918b/68747470733a2f2f696d6775722e636f6d2f707835673433372e6a7067)

You can make a copy of this notebook to change the input video or mask.

In [None]:
!nvidia-smi

import torch

if torch.cuda.is_available():
  print('Using GPU')
  device = 'cuda'
else:
  print('CUDA not available. Please connect to a GPU instance if possible.')
  device = 'cpu'

NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

# Get our code and install prerequisites

In [None]:
# Clone the Cutie repository, change into the checkout, and install it
# with pip in editable mode.
!git clone https://github.com/hkchengrex/Cutie.git
%cd Cutie
!pip install -e .

NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

# Then restart the runtime.

# Download the pretrained model



In [None]:
# Change to the repository checkout and run its model download script.
%cd /content/Cutie
!python cutie/utils/download_models.py

In [None]:
# Mount Google Drive into the runtime; later cells read the video and mask
# from paths below /content/drive/MyDrive.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Upgrade hydra-core via pip before the setup cell imports hydra.
!pip install hydra-core --upgrade


# Basic setup

In [None]:
# Work from the repository checkout; this cell later uses the relative
# paths "cutie/config" and "./weights/cutie-base-mega.pth".
%cd /content/Cutie/

from os import path
import logging
from omegaconf import DictConfig
import hydra
from hydra.core.hydra_config import HydraConfig
from omegaconf import open_dict
from hydra import compose, initialize

import torch
from torch.utils.data import DataLoader
import numpy as np
from PIL import Image

from cutie.inference.data.vos_test_dataset import VOSTestDataset
from cutie.inference.data.burst_test_dataset import BURSTTestDataset
from cutie.model.cutie import CUTIE
from cutie.inference.inference_core import InferenceCore
from cutie.inference.utils.results_utils import ResultSaver, make_zip
from cutie.inference.utils.burst_utils import BURSTResultHandler
from cutie.inference.utils.args_utils import get_dataset_cfg

from tqdm import tqdm


from hydra.core.global_hydra import GlobalHydra

# Reset Hydra's global state so this cell can be re-run: initialize() refuses
# to run a second time while a previous initialization is still registered.
GlobalHydra.instance().clear()

# Resolve the device here instead of relying on a variable from an earlier
# cell: the runtime is restarted after installation, which discards it.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

with torch.inference_mode():
  initialize(version_base='1.3.2', config_path="cutie/config", job_name="eval_config")
  cfg = compose(config_name="eval_config")

  # Point the composed config at the downloaded checkpoint.
  with open_dict(cfg):
    cfg['weights'] = './weights/cutie-base-mega.pth'

  data_cfg = get_dataset_cfg(cfg)

  # Load the network weights. `.to(device)` and `map_location=device` keep the
  # model and checkpoint on the same device and avoid a hard failure in
  # `.cuda()` / `torch.load` when no GPU is attached.
  cutie = CUTIE(cfg).to(device).eval()
  model_weights = torch.load(cfg.weights, map_location=device)
  cutie.load_weights(model_weights)

# Load some data

(Source: https://www.youtube.com/watch?v=FTcjzaqL0pE)

In [None]:

# File names of the input video and of its first-frame mask.
video_name = 'rabbit1.mp4'
mask_name = 'rabbit1_mask.png'
# Reference path noted by the author:
#   /content/Cutie/examples/masks/rabbit/rabbit1_mask.png

# Full paths on the mounted Google Drive.
# NOTE: despite its name, image_path points at the mask file.
video_path = f'/content/drive/MyDrive/{video_name}'
image_path = f'/content/drive/MyDrive/{mask_name}'

# Alternative configuration (same video, different mask). It used to live in a
# bare string literal, which the notebook echoed as cell output; it is kept
# here as a comment instead. To use it, set:
#   mask_name = 'rabbit_1_mask2.jpg'

# Preview the video and first-frame annotation

The first frame mask is a PNG with a color palette.

In [None]:
from IPython.display import HTML
from base64 import b64encode

# Read the video inside a context manager so the file handle is closed as
# soon as the bytes are in memory (the original left it open).
with open(video_path, 'rb') as video_file:
    video_bytes = video_file.read()

# Embed the whole file as a base64 data URL so the player needs no file server.
data_url = "data:video/mp4;base64," + b64encode(video_bytes).decode()

# Keep the HTML object as the cell's last expression so the notebook renders it.
HTML(f"""
<video width=400 controls>
      <source src="{data_url}" type="video/mp4">
</video>
""")

In [None]:
import IPython.display
# NOTE: this rebinds image_path (set earlier from mask_name) to a frame image.
image_path = '/content/drive/MyDrive/rabbit1_frame.jpg'
# Last expression of the cell; constructs the image display object at width 400.
IPython.display.Image(image_path, width=400)

## Convert the mask to a numpy array

In [None]:
from PIL import Image
import numpy as np

mask_path = '/content/drive/MyDrive/rabbit1_mask.jpg'

# Decode the image inside a context manager so the underlying file handle is
# released once the pixels have been copied into the array.
with Image.open(mask_path) as mask_image:
    mask = np.array(mask_image)

# Binarise: pixels whose value is 1, 2 or 3 become 1, everything else 0.
# The array keeps the shape of the decoded image.
# NOTE(review): the source is a JPEG; lossy compression can shift pixel values,
# so exact matching against [1, 2, 3] may be fragile - confirm the mask format.
valid_values = [1, 2, 3]
mask = np.isin(mask, valid_values).astype(np.uint8)

In [None]:
import os

# From here on video_name holds the full path to the video on Drive.
video_name = '/content/drive/MyDrive/rabbit1.mp4'

# Report whether the video is present; this only prints, it never raises.
if os.path.exists(video_name):
    print(f"Video file '{video_name}' found.")
else:
    print(f"Error: Video file '{video_name}' does not exist.")


# Propagate frame-by-frame

In [None]:
import cv2
from gui.interactive_utils import image_to_torch, torch_prob_to_numpy_mask, index_numpy_to_one_hot_torch, overlay_davis
import numpy as np
import torch

# Use the GPU when one is attached instead of assuming it, and only touch the
# CUDA allocator cache in that case.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    torch.cuda.empty_cache()

# Number of objects = number of distinct non-zero labels in the mask.
# Counting non-zero labels (rather than "unique values minus one") stays
# correct when the mask contains no background (0) pixels at all.
num_objects = int(np.count_nonzero(np.unique(mask)))

# TODO: allow num_objects to be entered manually.
print('num_objects :',num_objects)

processor = InferenceCore(cutie, cfg=cfg)
cap = cv2.VideoCapture(video_name)
# Fail loudly here: otherwise the propagation loop (`while cap.isOpened()`)
# would silently do nothing when the video cannot be opened.
if not cap.isOpened():
    raise RuntimeError(f"Could not open video file '{video_name}'.")

# You can change these two numbers
frames_to_propagate = 200
visualize_every = 20

# Index of the next frame to be read; the propagation loop advances it.
current_frame_index = 0


In [None]:
# Propagation loop (variant that dilates the initial mask and plots it).
# Reads frames from `cap` until the stream ends or current_frame_index exceeds
# frames_to_propagate (indices 0..frames_to_propagate inclusive are processed).
# Frame 0 seeds `processor` with the mask; every later frame is propagate-only.
# `mask`, `cap` and `current_frame_index` are notebook-level state that this
# cell modifies.
import torch
import numpy as np
from PIL import Image
from scipy.ndimage import binary_dilation  # used to dilate the mask
import matplotlib.pyplot as plt

with torch.inference_mode():
    with torch.amp.autocast("cuda", enabled=True):  # switched to the newer autocast API
        while cap.isOpened():
            # Read one frame at a time
            _, frame = cap.read()
            if frame is None or current_frame_index > frames_to_propagate:
                break

            # Convert the NumPy frame to a PyTorch tensor
            frame_torch = image_to_torch(frame, device=device)

            if current_frame_index == 0:
                # Inspect and plot the initial mask
                print("Initial Mask Shape:", mask.shape)  # (H, W, C) or (H, W)
                print("Unique Values in Mask:", np.unique(mask))  # e.g. [0, 1]
                plt.imshow(mask, cmap="gray")
                plt.title("Initial Mask")
                plt.show()

                # Reduce the mask to a single channel (the first channel is used)
                if mask.ndim == 3 and mask.shape[2] == 3:  # three-channel (RGB) mask
                    mask = mask[:, :, 0]  # keep only the first channel
                elif mask.ndim != 2:  # reject anything that is not 2D
                    # Message reads: "mask must be a 2D single-channel array."
                    raise ValueError("mask는 2D 단일 채널 배열이어야 합니다.")

                # Dilate the mask; always applied in this cell
                mask = binary_dilation(mask, iterations=3).astype(mask.dtype)  # the amount of dilation is adjustable
                print("Mask Shape After Dilation:", mask.shape)

                # Print the shape after conversion
                print("Converted Mask Shape (Single Channel):", mask.shape)

                # One-hot encode the mask (num_objects + 1 classes, including background)
                mask_torch = index_numpy_to_one_hot_torch(mask, num_objects + 1).to(device)

                # Inspect and plot the one-hot tensor
                print("Mask Tensor Shape (One-Hot):", mask_torch.shape)  # (C, H, W)
                for i in range(mask_torch.shape[0]):  # plot every channel
                    plt.imshow(mask_torch[i].cpu().numpy(), cmap="gray")
                    plt.title(f"One-Hot Channel {i}")
                    plt.show()

                # Verify the channel count before dropping the background
                if mask_torch.shape[0] != num_objects + 1:
                    # Message reads: "The one-hot encoded mask_torch has the wrong number of channels."
                    raise ValueError("One-Hot 인코딩된 mask_torch의 채널 수가 올바르지 않습니다.")

                # Pass the mask without the background channel (index 0)
                prediction = processor.step(frame_torch, mask_torch[1:], idx_mask=False)
            else:
                # Later frames: propagate only
                prediction = processor.step(frame_torch)

            # Convert the prediction to a NumPy mask
            prediction = torch_prob_to_numpy_mask(prediction)

            # Show the overlay every `visualize_every` frames
            if current_frame_index % visualize_every == 0:
                visualization = overlay_davis(frame, prediction)
                display(Image.fromarray(visualization))

            # Advance the frame index
            current_frame_index += 1


In [None]:
# Propagation loop (variant without dilation or matplotlib plots).
# It does not reset `cap` or `current_frame_index`, so it continues from
# whatever state earlier cells left them in.
import torch
import numpy as np
from PIL import Image

# For when the mask is three-channel rather than a single-channel image
with torch.inference_mode():
    with torch.amp.autocast("cuda", enabled=True):  # uses the newer API instead of torch.cuda.amp.autocast
        while cap.isOpened():
            # Read one frame at a time
            _, frame = cap.read()
            if frame is None or current_frame_index > frames_to_propagate:
                break

            # Convert the NumPy frame to a PyTorch tensor
            frame_torch = image_to_torch(frame, device=device)

            if current_frame_index == 0:
                # Inspect the initial mask for debugging
                print("Initial Mask Shape:", mask.shape)  # e.g. (720, 1280, 3)
                print("Unique Values in Mask:", np.unique(mask))  # e.g. [0, 1]

                # Reduce the mask to a single channel (the first channel is used)
                if mask.ndim == 3 and mask.shape[2] == 3:  # three-channel (RGB) mask
                    mask = mask[:, :, 0]  # keep only the first channel
                elif mask.ndim != 2:  # reject anything that is not 2D
                    # Message reads: "mask must be a 2D single-channel array."
                    raise ValueError("mask는 2D 단일 채널 배열이어야 합니다.")

                # Print the shape after conversion
                print("Converted Mask Shape (Single Channel):", mask.shape)

                # One-hot encode the mask (num_objects + 1 classes, including background)
                mask_torch = index_numpy_to_one_hot_torch(mask, num_objects + 1).to(device)

                # Inspect the one-hot tensor
                print("Mask Tensor Shape (One-Hot):", mask_torch.shape)  # (C, H, W)

                # Verify the channel count before dropping the background
                if mask_torch.shape[0] != num_objects + 1:
                    # Message reads: "The one-hot encoded mask_torch has the wrong number of channels."
                    raise ValueError("One-Hot 인코딩된 mask_torch의 채널 수가 올바르지 않습니다.")

                # Pass the mask without the background channel (index 0)
                prediction = processor.step(frame_torch, mask_torch[1:], idx_mask=False)
            else:
                # Later frames: propagate only
                prediction = processor.step(frame_torch)

            # Convert the prediction to a NumPy mask
            prediction = torch_prob_to_numpy_mask(prediction)

            # Show the overlay every `visualize_every` frames
            if current_frame_index % visualize_every == 0:
                visualization = overlay_davis(frame, prediction)
                display(Image.fromarray(visualization))

            # Advance the frame index
            current_frame_index += 1
