In [1]:
import sys

# Make the parent directory importable so the project packages used below
# (config, modules, models) resolve. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:

from omegaconf import OmegaConf

from config.modifier import dynamically_modify_train_config
from modules.utils.fetch import fetch_data_module, fetch_model_module
from models.detection.yolox.utils.boxes import postprocess

# Experiment configuration; the relative path is resolved against the
# notebook's current working directory.
yaml_path = './gen4_dt_20.yaml'
config = OmegaConf.load(yaml_path)
# The return value is discarded, so this presumably adjusts `config` in place — TODO confirm.
dynamically_modify_train_config(config)

# Build the data and model modules from the config.
# NOTE(review): `data` is not used by any cell visible in this notebook.
data = fetch_data_module(config)
model = fetch_model_module(config)
# Run the module's setup hook with the "test" stage name.
model.setup("test")

Using python-based detection evaluation
Set MaxViTRNN backbone (height, width) to (512, 640)
Set partition sizes: (8, 10)
Set num_classes=3 for detection head
inchannels: (64, 128, 256)
strides: (8, 16, 32)


In [3]:
# import cv2
# import torch
# import torchvision.transforms as transforms

# from modules.utils.detection import RNNStates
# # 動画のパス
# video_path = "output.avi"

# # 動画を開く
# cap = cv2.VideoCapture(video_path)

# # 画像をTensorに変換するためのTransform定義
# transform = transforms.Compose([
#     transforms.ToPILImage(),  # NumPy画像をPIL画像に変換
#     transforms.Resize((384, 640)),  # 画像サイズをリサイズ
#     transforms.ToTensor(),  # Tensorに変換
# ])

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.eval()  # 推論モード
# model.to(device)  # モデルをGPUに転送

# ckpt_path = config.ckpt_path
# if ckpt_path != "":
#     print(f"Loading checkpoint from {ckpt_path}, device: {device}")
#     ckpt = torch.load(ckpt_path, map_location=device)  # デバイスに合わせてチェックポイントをロード
#     model.load_state_dict(ckpt['state_dict'])

# rnn_state = RNNStates()
# rnn_state.reset(worker_id=0, indices_or_bool_tensor=True)
# prev_states = rnn_state.get_states(worker_id=0)

# # フレームごとに処理
# while cap.isOpened():
#     ret, frame = cap.read()  # フレームを取得
#     if not ret:
#         break  # 動画が終了したらループを抜ける

#     # BGR(OpenCV) → RGB
#     frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

#     # NumPy → Tensor
#     frame_tensor = transform(frame)  # shape: (3, 384, 640) after Resize((384, 640)) + ToTensor
#     frame_tensor = frame_tensor.unsqueeze(0)  # shape: (1, 3, 384, 640) after adding the batch dimension
#     frame_tensor = frame_tensor.to(device)  # move the tensor to the selected device

#     # モデルに入力（推論実行）
#     with torch.no_grad():
#         backbone_features, states = model.mdl.forward_backbone(x=frame_tensor, previous_states=prev_states)
#         prev_states = states
#         rnn_state.save_states_and_detach(worker_id=0, states=prev_states)

#         predictions, _ = model.mdl.forward_detect(backbone_features=backbone_features)
#         pred_processed = postprocess(prediction=predictions, num_classes=3, conf_thre=0.1, nms_thre=0.45)

#     # Print the post-processed detections for this frame
#     print(pred_processed)  # output of postprocess(), not a fixed-shape feature map

# # 動画を閉じる
# cap.release()
# cv2.destroyAllWindows()


In [4]:
import cv2
import torch
import torchvision.transforms as transforms
import numpy as np

from modules.utils.detection import RNNStates
from models.detection.yolox.utils.boxes import postprocess

class VideoVisualizer:
    """Write frames annotated with label and prediction boxes to a video file.

    The underlying ``cv2.VideoWriter`` is created lazily on the first call to
    :meth:`visualize`, using the size of that first frame.

    Args:
        output_path: Destination path passed to ``cv2.VideoWriter``.
        fps: Frame rate of the written video.
        mode: Selects what is drawn. ``2`` draws the labels, ``3`` draws the
            predictions, ``4`` draws both; any other value draws no boxes.
        show_debug: When True, the frame (before boxes are drawn) is also
            shown in an OpenCV window named "Debug Frame". Set to False on
            machines without a display.
    """

    # Box colours, in the BGR channel order of the frames that are drawn on.
    LABEL_COLOR = (0, 255, 0)         # green
    PREDICTION_COLOR = (0, 255, 255)  # yellow

    def __init__(self, output_path, fps=30, mode=4, show_debug=True):
        self.video_writer = None
        self.output_path = output_path
        self.fps = fps
        self.mode = mode
        self.show_debug = show_debug

    def create_video_writer(self, frame_shape):
        """Open the XVID-encoded output writer.

        Args:
            frame_shape: ``(width, height)`` of the frames to be written.
        """
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        self.video_writer = cv2.VideoWriter(self.output_path, fourcc, self.fps, frame_shape)

    @staticmethod
    def _to_bgr_frame(ev_tensor):
        """Convert a single-image tensor into a contiguous uint8 BGR frame."""
        # Floating-point input is assumed to be normalised to [0, 1] (as
        # produced by torchvision's ToTensor) and is scaled to [0, 255].
        # Integer input is taken to be in [0, 255] already. The scaling must
        # happen BEFORE the uint8 cast: casting first truncates every value
        # below 1.0 to 0 and yields an almost entirely black frame.
        scale = 255.0 if ev_tensor.is_floating_point() else 1.0
        image = ev_tensor.squeeze(0).detach().cpu().float().numpy()

        if image.shape[0] == 3:
            image = np.transpose(image, (1, 2, 0))  # (C, H, W) -> (H, W, C)

        image = np.ascontiguousarray((image * scale).clip(0, 255).astype('uint8'))
        return cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

    @staticmethod
    def _rows(batch_item):
        """Return the rows of one batch entry as plain Python values.

        Tensors are moved to the CPU once, instead of synchronising on every
        scalar read. Other containers are returned unchanged.
        """
        if torch.is_tensor(batch_item):
            return batch_item.detach().cpu().tolist()
        return batch_item

    @staticmethod
    def _draw_box(frame, x1, y1, x2, y2, color, label):
        """Draw a rectangle clamped to the frame, with a text label above it."""
        height, width = frame.shape[:2]
        left, top = max(0, int(x1)), max(0, int(y1))
        right, bottom = min(width - 1, int(x2)), min(height - 1, int(y2))
        cv2.rectangle(frame, (left, top), (right, bottom), color, 2)
        # Keep the text inside the image when the box touches the top edge.
        text_origin = (left, max(top - 10, 10))
        cv2.putText(frame, label, text_origin, cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)

    def visualize(self, ev_tensor, labels_yolox, predictions):
        """Annotate one frame and append it to the output video.

        Args:
            ev_tensor: Image tensor with a leading batch dimension of size 1.
                A 3-channel channels-first image is transposed to
                channels-last. Floating-point tensors are treated as [0, 1].
            labels_yolox: Optional batch of labels. Only entry ``0`` is used;
                each row unpacks to ``(cls, cx, cy, w, h)``. Rows containing
                ``None`` or NaN coordinates are skipped.
            predictions: Optional batch of detections. Only entry ``0`` is
                used; each row unpacks to
                ``(x1, y1, x2, y2, obj_conf, class_conf, class_id)``.
        """
        frame = self._to_bgr_frame(ev_tensor)

        if self.video_writer is None:
            self.create_video_writer(frame_shape=(frame.shape[1], frame.shape[0]))

        if self.show_debug:
            cv2.imshow("Debug Frame", frame)
            cv2.waitKey(1)

        # Labels, given as centre/size boxes.
        if self.mode in (2, 4) and labels_yolox is not None:
            for cls, cx, cy, w, h in self._rows(labels_yolox[0]):
                if any(val is None or np.isnan(val) for val in (cx, cy, w, h)):
                    continue
                self._draw_box(
                    frame,
                    cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2,
                    self.LABEL_COLOR, f"{cls}",
                )

        # Predicted boxes, given as corner coordinates.
        has_predictions = (
            predictions is not None
            and len(predictions) > 0
            and predictions[0] is not None
        )
        if self.mode in (3, 4) and has_predictions:
            for x1, y1, x2, y2, obj_conf, class_conf, class_id in self._rows(predictions[0]):
                # Show the integer class id together with obj_conf * class_conf.
                score = float(obj_conf) * float(class_conf)
                self._draw_box(
                    frame, x1, y1, x2, y2,
                    self.PREDICTION_COLOR, f"{int(class_id)} {score:.2f}",
                )

        # Append the annotated frame to the output video.
        self.video_writer.write(frame)

    def close(self):
        """Release the video writer if one was created."""
        if self.video_writer is not None:
            self.video_writer.release()

# Input clip and destination for the annotated copy.
video_path = "output_100.avi"
output_path = "output_visualized_100.avi"

# Post-processing settings passed to postprocess().
NUM_CLASSES = 3
CONF_THRESHOLD = 0.1
NMS_THRESHOLD = 0.45

# Transform that turns a decoded RGB frame (NumPy array) into a float tensor,
# resized to 512 x 640.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((512, 640)),
    transforms.ToTensor(),
])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval()
model.to(device)

# Load weights when a checkpoint path is configured (skips both "" and None).
ckpt_path = config.ckpt_path
if ckpt_path:
    print(f"Loading checkpoint from {ckpt_path}, device: {device}")
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    ckpt = torch.load(ckpt_path, map_location=device)
    model.load_state_dict(ckpt['state_dict'])

# Recurrent state for the single stream processed here (worker 0).
rnn_state = RNNStates()
rnn_state.reset(worker_id=0, indices_or_bool_tensor=True)
prev_states = rnn_state.get_states(worker_id=0)

# Open the input video and fail loudly if that is not possible; otherwise the
# loop below would silently process nothing.
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open video: {video_path}")

visualizer = VideoVisualizer(output_path=output_path, fps=30)

# Process the video frame by frame. The try/finally guarantees the capture,
# the writer and any OpenCV windows are released even if inference fails.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break  # end of the video

        # OpenCV decodes to BGR; convert to RGB before the transform.
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_tensor = transform(frame)
        frame_tensor = frame_tensor.unsqueeze(0).to(device)  # add the batch dimension

        # Model inference
        with torch.no_grad():
            backbone_features, states = model.mdl.forward_backbone(x=frame_tensor, previous_states=prev_states)
            # Carry the recurrent state over to the next frame.
            prev_states = states
            rnn_state.save_states_and_detach(worker_id=0, states=prev_states)

            predictions, _ = model.mdl.forward_detect(backbone_features=backbone_features)
            pred_processed = postprocess(
                prediction=predictions,
                num_classes=NUM_CLASSES,
                conf_thre=CONF_THRESHOLD,
                nms_thre=NMS_THRESHOLD,
            )

        # Draw the detections and append the frame to the output video.
        visualizer.visualize(frame_tensor, labels_yolox=None, predictions=pred_processed)
finally:
    cap.release()
    visualizer.close()
    cv2.destroyAllWindows()


Loading checkpoint from ./ckpt/gen4_20.ckpt, device: cuda


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
