In [14]:
import cv2
import numpy as np
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from mtcnn import MTCNN
from ultralytics import YOLO
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image


In [15]:
# 얼굴 특징 추출 모델 (OpenFace)
class OpenFaceModel(nn.Module):
    def __init__(self):
        super(OpenFaceModel, self).__init__()
        self.model = models.resnet18(pretrained=True)
        self.model.fc = nn.Linear(self.model.fc.in_features, 128)

    def forward(self, x):
        return self.model(x)

# 신발 특징 추출 모델 (ResNet50)
shoe_model_resnet = models.resnet50(pretrained=True)
shoe_model_resnet.fc = nn.Identity()  # 최종 레이어를 제거하여 특징 벡터 추출

# 얼굴 특징 추출 모델 로드
face_feature_model = OpenFaceModel()
face_feature_model.eval()

# 신발 특징 추출 모델 로드
shoe_feature_model = shoe_model_resnet
shoe_feature_model.eval()


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [16]:
# 변환 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def extract_features(image, model):
    # numpy.ndarray 이미지를 PIL.Image로 변환
    image = Image.fromarray(image)
    image = transform(image).unsqueeze(0)  # 배치 차원 추가
    with torch.no_grad():
        features = model(image).numpy().flatten()
    return features


In [17]:
# 사람 탐지 모델 로드 (이미 학습된 모델 경로 사용)
person_model = YOLO(r'C:\Users\dhshs\Documents\CCTV(2)\1. human_detection\runs\my_yolov8_training\weights\best.pt')

# 얼굴 인식 모델 로드 (MTCNN 사용)
face_detector = MTCNN()

# 신발 탐지 모델 로드 (이미 학습된 모델 경로 사용)
shoe_model = YOLO(r'C:\Users\dhshs\Documents\CCTV(2)\2. shoes_detection\runs\my_yolov8_training\weights\best.pt')


In [32]:
# 입력된 전신 이미지 로드 및 처리
input_full_body_img = cv2.imread(r'C:\Users\dhshs\Documents\CCTV(2)\source\suspect_image\2.PNG')
input_full_body_img_rgb = cv2.cvtColor(input_full_body_img, cv2.COLOR_BGR2RGB)

# 얼굴 탐지
faces = face_detector.detect_faces(input_full_body_img_rgb)
if len(faces) > 0:
    fx, fy, fwidth, fheight = faces[0]['box']
    face_img = input_full_body_img_rgb[fy:fy+fheight, fx:fx+fwidth]
    input_face_features = extract_features(face_img, face_feature_model)
else:
    raise ValueError("얼굴을 찾을 수 없습니다.")

# 신발 탐지
shoe_results = shoe_model(input_full_body_img_rgb)
shoes = shoe_results[0].boxes.xyxy.cpu().numpy()
if len(shoes) > 0:
    sx1, sy1, sx2, sy2 = map(int, shoes[0][:4])
    shoe_img = input_full_body_img_rgb[sy1:sy2, sx1:sx2]
    input_shoe_features = extract_features(shoe_img, shoe_feature_model)
else:
    raise ValueError("신발을 찾을 수 없습니다.")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step

0: 640x256 1 shoes, 24.5ms
Speed: 3.0ms preprocess, 24.5ms inference, 2.5ms postprocess per image at shape (1, 3, 640, 256)


In [33]:
# 얼굴 이미지 저장
cv2.imwrite('detected_face.png', cv2.cvtColor(face_img, cv2.COLOR_RGB2BGR))

# 신발 이미지 저장
cv2.imwrite('detected_shoe.png', cv2.cvtColor(shoe_img, cv2.COLOR_RGB2BGR))


True

In [None]:
# CCTV 영상 처리
cctv_video = cv2.VideoCapture(r'C:\Users\dhshs\Documents\CCTV(2)\source\cctv_video\demo.mp4')

# 비디오 저장 설정
frame_width = int(cctv_video.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cctv_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
output_video = cv2.VideoWriter('output_video.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 30, (frame_width, frame_height))

while cctv_video.isOpened():
    ret, frame = cctv_video.read()
    if not ret:
        break

    # 사람 탐지
    results = person_model(frame)
    persons = results[0].boxes.xyxy.cpu().numpy()  # 모든 감지된 객체

    for person in persons:
        x1, y1, x2, y2 = map(int, person[:4])
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)

        # 바운딩 박스 내 얼굴 인식
        person_img = frame[y1:y2, x1:x2]
        faces = face_detector.detect_faces(person_img)
        face_detected = False
        for face in faces:
            fx, fy, fwidth, fheight = face['box']
            fx1, fy1, fx2, fy2 = int(x1 + fx), int(y1 + fy), int(x1 + fx + fwidth), int(y1 + fy + fheight)
            face_img = frame[fy1:fy2, fx1:fx2]
            face_img_rgb = cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB)
            detected_face_features = extract_features(face_img_rgb, face_feature_model)

            # 얼굴 특징 매칭
            face_similarity = cosine_similarity([input_face_features], [detected_face_features])[0][0]
            if face_similarity > 0.8:  # 유사도 임계값
                cv2.rectangle(frame, (fx1, fy1), (fx2, fy2), (0, 255, 0), 2)
                face_detected = True

        # 바운딩 박스 내 신발 인식
        shoe_results = shoe_model(frame)
        shoes = shoe_results[0].boxes.xyxy.cpu().numpy()
        shoe_detected = False
        for shoe in shoes:
            sx1, sy1, sx2, sy2 = map(int, shoe[:4])
            if sx1 > x1 and sx2 < x2 and sy1 > y1 and sy2 < y2:  # 신발이 사람 바운딩 박스 내에 있는지 확인
                shoe_img = frame[sy1:sy2, sx1:sx2]
                shoe_img_rgb = cv2.cvtColor(shoe_img, cv2.COLOR_BGR2RGB)
                detected_shoe_features = extract_features(shoe_img_rgb, shoe_feature_model)

                # 신발 특징 매칭
                shoe_similarity = cosine_similarity([input_shoe_features], [detected_shoe_features])[0][0]
                if shoe_similarity > 0.8:  # 유사도 임계값
                    cv2.rectangle(frame, (sx1, sy1), (sx2, sy2), (0, 0, 255), 2)
                    shoe_detected = True

        # 얼굴과 신발이 모두 유사한 경우 용의자로 식별
        if face_detected and shoe_detected:
            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 0), 2)
            cv2.putText(frame, 'Suspect', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 0), 2)

    output_video.write(frame)  # 처리된 프레임 저장
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cctv_video.release()
output_video.release()
cv2.destroyAllWindows()
