In [1]:
%%capture
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'

In [2]:
%%capture
import os
import sys

!git clone https://github.com/14790897/sort-detectron2.git
os.chdir('/kaggle/working/sort-detectron2')
!pip install -r requirements.txt
sys.path.append('/kaggle/working/sort-detectron2')
os.chdir('..')

In [None]:
import cv2
import torch
import detectron2
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog

import numpy as np
import matplotlib.pyplot as plt
from sort import *

# --- Detectron2 configuration ---
# Load the model-zoo Mask R-CNN (R50-FPN, 3x) config as a baseline, then apply
# the overrides needed for the custom weights file used in this notebook.
config_name = "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(config_name))

cfg.MODEL.WEIGHTS = "/kaggle/input/detectron2_coincide_separation/pytorch/default/1/model_final.pth"
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 2  # number of classes in your dataset
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # score threshold for detections
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    cfg.MODEL.DEVICE = "cuda"
else:
    cfg.MODEL.DEVICE = "cpu"

predictor = DefaultPredictor(cfg)

# --- SORT tracker (constructed with its default arguments) ---
sort_tracker = Sort()

# Open the input video.
input_video_path = "/kaggle/input/particle-video/output_video.mp4"
cap = cv2.VideoCapture(input_video_path)
if not cap.isOpened():
    # Fail loudly: otherwise the frame loop never runs and the notebook
    # "succeeds" with an empty result file and no frames.
    raise IOError(f"Could not open input video: {input_video_path}")

# Read the frame size and frame rate of the input video.
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the frame rate as a float: truncating with int() turns e.g. 29.97 into
# 29 and makes the written video play at the wrong speed.
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # Some containers report 0 (or NaN) for the frame rate, which VideoWriter
    # cannot use.  NOTE(review): 30 fps is an arbitrary fallback — adjust if the
    # true rate of the source is known.
    fps = 30.0

# Output video: XVID-encoded AVI with the same frame size as the input.
out_video = cv2.VideoWriter('output_video.avi', cv2.VideoWriter_fourcc(*'XVID'), fps, (frame_width, frame_height))
# Directory that receives the per-frame images and the text results.
output_image_dir = "result_ini"
os.makedirs(output_image_dir, exist_ok=True)
# 打开 result_ini.txt 文件用于写入检测结果
# Run detection + tracking on every frame.  For each frame the tracked boxes are
# drawn onto the image, logged to result_ini/result_ini.txt, printed, appended to
# the output video, saved as a JPEG and displayed inline.
with open(os.path.join(output_image_dir, "result_ini.txt"), "w") as result_file:
    frame_count = 0

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # End of stream (or a read error): stop processing.
                break

            # Run the Detectron2 predictor on the current frame.
            outputs = predictor(frame)

            # Pull the predicted boxes and their confidence scores to the CPU.
            instances = outputs["instances"]
            boxes = instances.pred_boxes.tensor.cpu().numpy()
            scores = instances.scores.cpu().numpy()

            # Build the (N, 5) array [x1, y1, x2, y2, score] passed to the tracker
            # in one step instead of growing it row by row with vstack.  With
            # zero detections this yields an empty (0, 5) array, and the float64
            # cast keeps the dtype the row-by-row version produced.
            detections = np.column_stack((boxes, scores)).astype(np.float64)

            # Feed the detections to the SORT tracker.
            tracks = sort_tracker.update(detections)

            # Draw every track and record it.
            for track in tracks:
                x1, y1, x2, y2, track_id = track[:5]
                cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
                cv2.putText(frame, f'ID: {int(track_id)}', (int(x1), int(y1)-10), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 255, 0), 2)
                # Append the tracked box to result_ini.txt.
                result_file.write(f"Frame {frame_count}: ID {int(track_id)}, Box [{x1}, {y1}, {x2}, {y2}]\n")
                print('predict result:', f"Frame {frame_count}: ID {int(track_id)}, Box [{x1}, {y1}, {x2}, {y2}]\n")

            # Append the annotated frame to the output video.
            out_video.write(frame)
            # Also save the annotated frame as an individual image.
            frame_filename = os.path.join(output_image_dir, f'frame_{frame_count:04d}.jpg')
            cv2.imwrite(frame_filename, frame)
            frame_count += 1

            # Show the frame inline with Matplotlib (optional).
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            plt.imshow(frame_rgb)
            plt.axis('off')  # hide the axes
            plt.show()
    finally:
        # Release both handles even if a frame fails part-way through.  The
        # writer in particular must be released, otherwise output_video.avi may
        # be left unfinalised.
        cap.release()
        out_video.release()


predict result: Frame 0: ID 7, Box [205.3253631591797, 55.76040267944336, 213.38815307617188, 63.45770263671875]

predict result: Frame 0: ID 6, Box [296.6557922363281, 573.275634765625, 303.1761474609375, 580.2236328125]

predict result: Frame 0: ID 5, Box [271.63275146484375, 571.0874633789062, 277.84625244140625, 577.228271484375]

predict result: Frame 0: ID 4, Box [355.59185791015625, 541.5277709960938, 363.7428283691406, 550.0439453125]

predict result: Frame 0: ID 3, Box [353.7958068847656, 609.1271362304688, 361.76190185546875, 616.9922485351562]

predict result: Frame 0: ID 2, Box [310.398681640625, 859.0647583007812, 319.7506103515625, 868.0664672851562]

predict result: Frame 0: ID 1, Box [400.90447998046875, 925.8240356445312, 410.32379150390625, 934.81787109375]

predict result: Frame 1: ID 12, Box [292.0013122558594, 573.89013671875, 299.0155029296875, 581.4900512695312]

predict result: Frame 1: ID 11, Box [292.81134033203125, 574.4649658203125, 298.966552734375, 581.103

In [4]:
%%capture captured_output

# Recursively pack the result_ini directory into result_ini.zip.
# The cell's output (zip's file listing) is stored in `captured_output`
# by the %%capture magic instead of being displayed.
!zip -r result_ini.zip result_ini