<a href="https://colab.research.google.com/github/AfifaMasood/AfifaMasood/blob/main/testing4_bundingbox%2Bresnet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the model files and videos used below
# are read from (and results written to) paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
pip install ultralytics pandas opencv-python deep_sort_realtime


Collecting ultralytics
  Downloading ultralytics-8.3.75-py3-none-any.whl.metadata (35 kB)
Collecting deep_sort_realtime
  Downloading deep_sort_realtime-1.3.2-py3-none-any.whl.metadata (12 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.meta

In [None]:
import tensorflow as tf
import torch
import torch.nn as nn
import cv2
import numpy as np
from PIL import Image
from torchvision import transforms, models
from ultralytics import YOLO

# Constants
# Size preprocess_frame resizes each video frame to before frame_model sees it.
IMAGE_HEIGHT, IMAGE_WIDTH = 256, 256
# Label names, indexed by the argmax of a model's output (frame model and ResNet).
CLASSES_LIST = ["Normal", "Shoplifting"]

def preprocess_frame(frame):
    """Resize a frame to IMAGE_HEIGHT x IMAGE_WIDTH and divide its pixels by 255.

    Args:
        frame: Image array, e.g. a frame returned by ``cv2.VideoCapture.read``.

    Returns:
        The resized frame as a float array (values in [0, 1] for 8-bit input).
    """
    # cv2.resize expects dsize as (width, height), not (height, width).
    resized_frame = cv2.resize(frame, (IMAGE_WIDTH, IMAGE_HEIGHT))
    return resized_frame / 255.0

# Pick the inference device first so the checkpoint can be mapped onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load frame-level model (TensorFlow)
frame_model = tf.keras.models.load_model('/content/drive/MyDrive/60_model_bs_256_frame2.h5')

# Load YOLOv8 for person detection
yolov8_model = YOLO("yolov8n.pt")

# Load fine-tuned PyTorch model (ResNet34) for action classification.
# weights=None: the strict load_state_dict below overwrites every parameter,
# so downloading the ImageNet weights first would be wasted work.
resnet_model = models.resnet34(weights=None)
num_ftrs = resnet_model.fc.in_features
resnet_model.fc = nn.Linear(num_ftrs, 2)  # two outputs, looked up in CLASSES_LIST
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
resnet_model.load_state_dict(
    torch.load("/content/drive/MyDrive/resnet34_shoplifting.pth", map_location=device)
)
resnet_model.eval()
resnet_model.to(device)

# Define transforms for ResNet
transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def process_video(video_path, output_video_path):
    """Write a copy of a video annotated with per-person shoplifting predictions.

    Each frame is classified by the frame-level Keras model (``frame_model``).
    Only when that model predicts "Shoplifting" are people detected with
    YOLOv8 and each person crop classified by the ResNet34 model; the person
    is then boxed in red ("Shoplifting") or green ("Normal"). Every frame is
    written to the output, annotated or not.

    Args:
        video_path: Path of the input video.
        output_video_path: Path of the ``mp4v``-encoded video to create.

    Raises:
        IOError: If ``video_path`` cannot be opened.
    """
    video_capture = cv2.VideoCapture(video_path)
    if not video_capture.isOpened():
        video_capture.release()
        raise IOError(f"Could not open video: {video_path}")

    # Keep the fractional frame rate (e.g. 29.97): truncating it to an int
    # makes the output play at a different speed than the input.
    frame_rate = video_capture.get(cv2.CAP_PROP_FPS)
    if not frame_rate > 0:
        frame_rate = 30.0  # FPS not reported by the container; use a common default
    width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, frame_rate, (width, height))

    try:
        while True:
            ret, frame = video_capture.read()
            if not ret:
                break

            # Frame-level prediction
            input_data = np.expand_dims(preprocess_frame(frame), axis=0)
            predictions = frame_model.predict(input_data, verbose=0)[0]
            predicted_label = CLASSES_LIST[int(np.argmax(predictions))]

            # The models read the untouched `frame`; drawing happens on a copy
            # so earlier boxes/labels never leak into later person crops.
            annotated = frame.copy()

            if predicted_label == "Shoplifting":
                # classes=[0] keeps only COCO "person" detections.
                results = yolov8_model(frame, classes=[0], verbose=False)

                for box in results[0].boxes.xyxy.cpu().numpy():
                    x1, y1, x2, y2 = map(int, box)
                    person_image = frame[y1:y2, x1:x2]
                    if person_image.size == 0:
                        continue

                    person_rgb = cv2.cvtColor(person_image, cv2.COLOR_BGR2RGB)
                    person_tensor = transform(Image.fromarray(person_rgb)).unsqueeze(0).to(device)

                    with torch.no_grad():
                        output = resnet_model(person_tensor)
                    person_label = CLASSES_LIST[output.argmax(dim=1).item()]

                    color = (0, 0, 255) if person_label == "Shoplifting" else (0, 255, 0)
                    cv2.rectangle(annotated, (x1, y1), (x2, y2), color, 2)
                    # Clamp the label's baseline so it stays inside the frame.
                    cv2.putText(annotated, person_label, (x1, max(y1 - 5, 12)),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

            out.write(annotated)
    finally:
        # Always finalise the files, even if a model call raises mid-video.
        video_capture.release()
        out.release()

    print(f"Processing complete. Output saved as {output_video_path}")

# Run the hybrid model (frame classifier -> YOLO person boxes -> ResNet per person)
# on Shoplifting-1.mp4 and save the annotated video to bounding_box_folder on Drive.
process_video("/content/drive/MyDrive/FYP Dataset/Shoplifting/Shoplifting-1.mp4", "/content/drive/MyDrive/bounding_box_folder/Shoplifting-1_output_video.mp4")




Predicted: Normal (Normal: 0.86, Shoplifting: 0.14)
Predicted: Normal (Normal: 0.85, Shoplifting: 0.15)
Predicted: Normal (Normal: 0.87, Shoplifting: 0.13)
Predicted: Normal (Normal: 0.85, Shoplifting: 0.15)
Predicted: Normal (Normal: 0.84, Shoplifting: 0.16)
Predicted: Normal (Normal: 0.86, Shoplifting: 0.14)
Predicted: Normal (Normal: 0.86, Shoplifting: 0.14)
Predicted: Normal (Normal: 0.86, Shoplifting: 0.14)
Predicted: Normal (Normal: 0.85, Shoplifting: 0.15)
Predicted: Normal (Normal: 0.88, Shoplifting: 0.12)
Predicted: Normal (Normal: 0.85, Shoplifting: 0.15)
Predicted: Normal (Normal: 0.85, Shoplifting: 0.15)
Predicted: Normal (Normal: 0.85, Shoplifting: 0.15)
Predicted: Normal (Normal: 0.85, Shoplifting: 0.15)
Predicted: Normal (Normal: 0.80, Shoplifting: 0.20)
Predicted: Normal (Normal: 0.76, Shoplifting: 0.24)
Predicted: Normal (Normal: 0.78, Shoplifting: 0.22)
Predicted: Normal (Normal: 0.79, Shoplifting: 0.21)
Predicted: Normal (Normal: 0.85, Shoplifting: 0.15)
Predicted: N

In [None]:
import tensorflow as tf
import torch
import torch.nn as nn
import cv2
import numpy as np
from PIL import Image
from torchvision import transforms, models
from ultralytics import YOLO

# Constants
# Size preprocess_frame resizes each video frame to before frame_model sees it.
IMAGE_HEIGHT, IMAGE_WIDTH = 256, 256
# Label names, indexed by the argmax of a model's output (frame model and ResNet).
CLASSES_LIST = ["Normal", "Shoplifting"]

def preprocess_frame(frame):
    """Resize a frame to IMAGE_HEIGHT x IMAGE_WIDTH and divide its pixels by 255.

    Args:
        frame: Image array, e.g. a frame returned by ``cv2.VideoCapture.read``.

    Returns:
        The resized frame as a float array (values in [0, 1] for 8-bit input).
    """
    # cv2.resize expects dsize as (width, height), not (height, width).
    resized_frame = cv2.resize(frame, (IMAGE_WIDTH, IMAGE_HEIGHT))
    return resized_frame / 255.0

# Pick the inference device first so the checkpoint can be mapped onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load frame-level model (TensorFlow)
frame_model = tf.keras.models.load_model('/content/drive/MyDrive/60_model_bs_256_frame2.h5')

# Load YOLOv8 for person detection
yolov8_model = YOLO("yolov8n.pt")

# Load fine-tuned PyTorch model (ResNet34) for action classification.
# weights=None: the strict load_state_dict below overwrites every parameter,
# so downloading the ImageNet weights first would be wasted work.
resnet_model = models.resnet34(weights=None)
num_ftrs = resnet_model.fc.in_features
resnet_model.fc = nn.Linear(num_ftrs, 2)  # two outputs, looked up in CLASSES_LIST
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
resnet_model.load_state_dict(
    torch.load("/content/drive/MyDrive/resnet34_shoplifting.pth", map_location=device)
)
resnet_model.eval()
resnet_model.to(device)

# Define transforms for ResNet
transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def process_video(video_path, output_video_path):
    """Write a copy of a video with a frame-level banner and per-person boxes.

    Each frame is classified by ``frame_model`` and the result is drawn as a
    text banner in the top-left corner. When the frame is predicted
    "Shoplifting", people are detected with YOLOv8 and each crop is
    classified by the ResNet34 model, then boxed in red ("Shoplifting") or
    green ("Normal").

    Args:
        video_path: Path of the input video.
        output_video_path: Path of the ``mp4v``-encoded video to create.

    Raises:
        IOError: If ``video_path`` cannot be opened.
    """
    video_capture = cv2.VideoCapture(video_path)
    if not video_capture.isOpened():
        video_capture.release()
        raise IOError(f"Could not open video: {video_path}")

    # Keep the fractional frame rate (e.g. 29.97): truncating it to an int
    # makes the output play at a different speed than the input.
    frame_rate = video_capture.get(cv2.CAP_PROP_FPS)
    if not frame_rate > 0:
        frame_rate = 30.0  # FPS not reported by the container; use a common default
    width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, frame_rate, (width, height))

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1.5
    font_thickness = 2
    text_x, text_y = 10, 40

    try:
        while True:
            ret, frame = video_capture.read()
            if not ret:
                break

            # Frame-level prediction
            input_data = np.expand_dims(preprocess_frame(frame), axis=0)
            predictions = frame_model.predict(input_data, verbose=0)[0]
            shoplifting_probability = predictions[1]
            predicted_label = CLASSES_LIST[int(np.argmax(predictions))]

            # The models read the untouched `frame`; the banner and boxes are
            # drawn on a copy so they never end up in the detector input or
            # in the person crops.
            annotated = frame.copy()

            text = f"Prediction: {predicted_label} ({shoplifting_probability:.2f})"
            (text_width, text_height), _ = cv2.getTextSize(text, font, font_scale, font_thickness)

            # Draw a black rectangle as the background for the text
            cv2.rectangle(annotated, (text_x - 5, text_y - text_height - 5),
                          (text_x + text_width + 5, text_y + 5), (0, 0, 0), -1)
            cv2.putText(annotated, text, (text_x, text_y), font, font_scale,
                        (255, 255, 255), font_thickness)

            if predicted_label == "Shoplifting":
                # classes=[0] keeps only COCO "person" detections.
                results = yolov8_model(frame, classes=[0], verbose=False)

                for box in results[0].boxes.xyxy.cpu().numpy():
                    x1, y1, x2, y2 = map(int, box)
                    person_image = frame[y1:y2, x1:x2]
                    if person_image.size == 0:
                        continue

                    person_rgb = cv2.cvtColor(person_image, cv2.COLOR_BGR2RGB)
                    person_tensor = transform(Image.fromarray(person_rgb)).unsqueeze(0).to(device)

                    with torch.no_grad():
                        output = resnet_model(person_tensor)
                    person_label = CLASSES_LIST[output.argmax(dim=1).item()]

                    color = (0, 0, 255) if person_label == "Shoplifting" else (0, 255, 0)
                    cv2.rectangle(annotated, (x1, y1), (x2, y2), color, 2)
                    # Clamp the label's baseline so it stays inside the frame.
                    cv2.putText(annotated, person_label, (x1, max(y1 - 5, 12)),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

            out.write(annotated)
    finally:
        # Always finalise the files, even if a model call raises mid-video.
        video_capture.release()
        out.release()

    print(f"Processing complete. Output saved as {output_video_path}")

# Run the hybrid model (frame classifier -> YOLO person boxes -> ResNet per person)
# on Shoplifting-1.mp4 and save the annotated video to bounding_box_folder on Drive.
process_video("/content/drive/MyDrive/FYP Dataset/Shoplifting/Shoplifting-1.mp4", "/content/drive/MyDrive/bounding_box_folder/Shoplifting-1_output_video3.mp4")





0: 384x640 (no detections), 157.4ms
Speed: 6.1ms preprocess, 157.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 137.5ms
Speed: 3.8ms preprocess, 137.5ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 137.7ms
Speed: 5.4ms preprocess, 137.7ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 134.1ms
Speed: 4.9ms preprocess, 134.1ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 224.2ms
Speed: 7.1ms preprocess, 224.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 214.4ms
Speed: 4.6ms preprocess, 214.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 159.5ms
Speed: 5.9ms preprocess, 159.5ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 140.1ms
Speed: 4.8ms preprocess, 140.1ms inference, 1.1ms postproces

In [None]:
import tensorflow as tf
import torch
import torch.nn as nn
import cv2
import numpy as np
from PIL import Image
from torchvision import transforms, models
from ultralytics import YOLO

# Constants
# Size preprocess_frame resizes each video frame to before frame_model sees it.
IMAGE_HEIGHT, IMAGE_WIDTH = 256, 256
# Label names, indexed by the argmax of a model's output (frame model and ResNet).
CLASSES_LIST = ["Normal", "Shoplifting"]

def preprocess_frame(frame):
    """Resize a frame to IMAGE_HEIGHT x IMAGE_WIDTH and divide its pixels by 255.

    Args:
        frame: Image array, e.g. a frame returned by ``cv2.VideoCapture.read``.

    Returns:
        The resized frame as a float array (values in [0, 1] for 8-bit input).
    """
    # cv2.resize expects dsize as (width, height), not (height, width).
    resized_frame = cv2.resize(frame, (IMAGE_WIDTH, IMAGE_HEIGHT))
    return resized_frame / 255.0

# Pick the inference device first so the checkpoint can be mapped onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load frame-level model (TensorFlow)
frame_model = tf.keras.models.load_model('/content/drive/MyDrive/60_model_bs_256_frame2.h5')

# Load YOLOv8 for person detection
yolov8_model = YOLO("yolov8n.pt")

# Load fine-tuned PyTorch model (ResNet34) for action classification.
# weights=None: the strict load_state_dict below overwrites every parameter,
# so downloading the ImageNet weights first would be wasted work.
resnet_model = models.resnet34(weights=None)
num_ftrs = resnet_model.fc.in_features
resnet_model.fc = nn.Linear(num_ftrs, 2)  # two outputs, looked up in CLASSES_LIST
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
resnet_model.load_state_dict(
    torch.load("/content/drive/MyDrive/resnet34_shoplifting.pth", map_location=device)
)
resnet_model.eval()
resnet_model.to(device)

# Define transforms for ResNet
transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def process_video(video_path, output_video_path):
    """Write a copy of a video with a frame-level banner and red shoplifter boxes.

    Each frame is classified by ``frame_model``; the prediction is printed
    and drawn as a text banner in the top-left corner. When the frame is
    predicted "Shoplifting", people are detected with YOLOv8 and each crop is
    classified by the ResNet34 model. Only persons classified "Shoplifting"
    are boxed (in red); other persons are left unmarked.

    Args:
        video_path: Path of the input video.
        output_video_path: Path of the ``mp4v``-encoded video to create.

    Raises:
        IOError: If ``video_path`` cannot be opened.
    """
    video_capture = cv2.VideoCapture(video_path)
    if not video_capture.isOpened():
        video_capture.release()
        raise IOError(f"Could not open video: {video_path}")

    # Keep the fractional frame rate (e.g. 29.97): truncating it to an int
    # makes the output play at a different speed than the input.
    frame_rate = video_capture.get(cv2.CAP_PROP_FPS)
    if not frame_rate > 0:
        frame_rate = 30.0  # FPS not reported by the container; use a common default
    width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, frame_rate, (width, height))

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1.5
    font_thickness = 2
    text_x, text_y = 10, 40

    try:
        while True:
            ret, frame = video_capture.read()
            if not ret:
                break

            # Frame-level prediction
            input_data = np.expand_dims(preprocess_frame(frame), axis=0)
            predictions = frame_model.predict(input_data, verbose=0)[0]
            normal_probability = predictions[0]
            shoplifting_probability = predictions[1]
            predicted_label = CLASSES_LIST[int(np.argmax(predictions))]

            print(f"Predicted: {predicted_label} (Normal: {normal_probability:.2f}, Shoplifting: {shoplifting_probability:.2f})")

            # The models read the untouched `frame`; the banner and boxes are
            # drawn on a copy so they never end up in the detector input or
            # in the person crops.
            annotated = frame.copy()

            text = f"Prediction: {predicted_label} ({shoplifting_probability:.2f})"
            (text_width, text_height), _ = cv2.getTextSize(text, font, font_scale, font_thickness)

            # Draw a black rectangle as the background for the text
            cv2.rectangle(annotated, (text_x - 5, text_y - text_height - 5),
                          (text_x + text_width + 5, text_y + 5), (0, 0, 0), -1)
            cv2.putText(annotated, text, (text_x, text_y), font, font_scale,
                        (255, 255, 255), font_thickness)

            if predicted_label == "Shoplifting":
                # classes=[0] keeps only COCO "person" detections.
                results = yolov8_model(frame, classes=[0], verbose=False)

                for box in results[0].boxes.xyxy.cpu().numpy():
                    x1, y1, x2, y2 = map(int, box)
                    person_image = frame[y1:y2, x1:x2]
                    if person_image.size == 0:
                        continue

                    person_rgb = cv2.cvtColor(person_image, cv2.COLOR_BGR2RGB)
                    person_tensor = transform(Image.fromarray(person_rgb)).unsqueeze(0).to(device)

                    with torch.no_grad():
                        output = resnet_model(person_tensor)
                    person_label = CLASSES_LIST[output.argmax(dim=1).item()]

                    # Only suspected shoplifters are marked in this variant.
                    if person_label == "Shoplifting":
                        color = (0, 0, 255)
                        cv2.rectangle(annotated, (x1, y1), (x2, y2), color, 2)
                        # Clamp the label's baseline so it stays inside the frame.
                        cv2.putText(annotated, person_label, (x1, max(y1 - 5, 12)),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

            out.write(annotated)
    finally:
        # Always finalise the files, even if a model call raises mid-video.
        video_capture.release()
        out.release()

    print(f"Processing complete. Output saved as {output_video_path}")

# Run the hybrid model (frame classifier -> YOLO person boxes -> ResNet per person)
# on Shoplifting-32.mp4 and save the annotated video to bounding_box_folder on Drive.
process_video("/content/drive/MyDrive/FYP Dataset/Shoplifting/Shoplifting-32.mp4", "/content/drive/MyDrive/bounding_box_folder/Shoplifting-32_output_video2.mp4")




Predicted: Normal (Normal: 0.83, Shoplifting: 0.17)
Predicted: Normal (Normal: 0.81, Shoplifting: 0.19)
Predicted: Normal (Normal: 0.83, Shoplifting: 0.17)
Predicted: Normal (Normal: 0.84, Shoplifting: 0.16)
Predicted: Normal (Normal: 0.83, Shoplifting: 0.17)
Predicted: Normal (Normal: 0.83, Shoplifting: 0.17)
Predicted: Normal (Normal: 0.82, Shoplifting: 0.18)
Predicted: Normal (Normal: 0.83, Shoplifting: 0.17)
Predicted: Normal (Normal: 0.79, Shoplifting: 0.21)
Predicted: Normal (Normal: 0.80, Shoplifting: 0.20)
Predicted: Normal (Normal: 0.81, Shoplifting: 0.19)
Predicted: Normal (Normal: 0.81, Shoplifting: 0.19)
Predicted: Normal (Normal: 0.79, Shoplifting: 0.21)
Predicted: Normal (Normal: 0.85, Shoplifting: 0.15)
Predicted: Normal (Normal: 0.85, Shoplifting: 0.15)
Predicted: Normal (Normal: 0.91, Shoplifting: 0.09)
Predicted: Normal (Normal: 0.95, Shoplifting: 0.05)
Predicted: Normal (Normal: 0.95, Shoplifting: 0.05)
Predicted: Normal (Normal: 0.96, Shoplifting: 0.04)
Predicted: N

In [None]:
import tensorflow as tf
import torch
import torch.nn as nn
import cv2
import numpy as np
from PIL import Image
from torchvision import transforms, models
from ultralytics import YOLO

# Constants
# Size preprocess_frame resizes each video frame to before frame_model sees it.
IMAGE_HEIGHT, IMAGE_WIDTH = 256, 256
# Label names, indexed by the argmax of a model's output (frame model and ResNet).
CLASSES_LIST = ["Normal", "Shoplifting"]

def preprocess_frame(frame):
    """Resize a frame to IMAGE_HEIGHT x IMAGE_WIDTH and divide its pixels by 255.

    Args:
        frame: Image array, e.g. a frame returned by ``cv2.VideoCapture.read``.

    Returns:
        The resized frame as a float array (values in [0, 1] for 8-bit input).
    """
    # cv2.resize expects dsize as (width, height), not (height, width).
    resized_frame = cv2.resize(frame, (IMAGE_WIDTH, IMAGE_HEIGHT))
    return resized_frame / 255.0

# Pick the inference device first so the checkpoint can be mapped onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load frame-level model (TensorFlow)
frame_model = tf.keras.models.load_model('/content/drive/MyDrive/60_model_bs_256_frame2.h5')

# Load YOLOv8 for person detection
yolov8_model = YOLO("yolov8n.pt")

# Load fine-tuned PyTorch model (ResNet34) for action classification.
# weights=None: the strict load_state_dict below overwrites every parameter,
# so downloading the ImageNet weights first would be wasted work.
resnet_model = models.resnet34(weights=None)
num_ftrs = resnet_model.fc.in_features
resnet_model.fc = nn.Linear(num_ftrs, 2)  # two outputs, looked up in CLASSES_LIST
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
resnet_model.load_state_dict(
    torch.load("/content/drive/MyDrive/resnet34_shoplifting.pth", map_location=device)
)
resnet_model.eval()
resnet_model.to(device)

# Define transforms for ResNet
transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def process_video(video_path, output_video_path):
    """Write a copy of a video with a frame-level banner and a box on every person.

    Each frame is classified by ``frame_model``; the prediction is printed
    and drawn as a text banner in the top-left corner. People are detected
    with YOLOv8 on every frame. If the frame is predicted "Normal", every
    person is boxed in green as "Normal" without running the ResNet;
    otherwise each crop is classified by the ResNet34 model and boxed in red
    ("Shoplifting") or green ("Normal").

    Args:
        video_path: Path of the input video.
        output_video_path: Path of the ``mp4v``-encoded video to create.

    Raises:
        IOError: If ``video_path`` cannot be opened.
    """
    video_capture = cv2.VideoCapture(video_path)
    if not video_capture.isOpened():
        video_capture.release()
        raise IOError(f"Could not open video: {video_path}")

    # Keep the fractional frame rate (e.g. 29.97): truncating it to an int
    # makes the output play at a different speed than the input.
    frame_rate = video_capture.get(cv2.CAP_PROP_FPS)
    if not frame_rate > 0:
        frame_rate = 30.0  # FPS not reported by the container; use a common default
    width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, frame_rate, (width, height))

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1.5
    font_thickness = 2
    text_x, text_y = 10, 40

    try:
        while True:
            ret, frame = video_capture.read()
            if not ret:
                break

            # Frame-level prediction
            input_data = np.expand_dims(preprocess_frame(frame), axis=0)
            predictions = frame_model.predict(input_data, verbose=0)[0]
            predicted_label = CLASSES_LIST[int(np.argmax(predictions))]

            print(f"Frame Prediction: {predicted_label} (Normal: {predictions[0]:.2f}, Shoplifting: {predictions[1]:.2f})")

            # The models read the untouched `frame`; the banner and boxes are
            # drawn on a copy so they never end up in the detector input or
            # in the person crops.
            annotated = frame.copy()

            text = f"Frame Prediction: {predicted_label} ({predictions[1]:.2f})"
            text_size, _ = cv2.getTextSize(text, font, font_scale, font_thickness)

            # Draw a black rectangle as the background for the text
            cv2.rectangle(annotated, (text_x - 5, text_y - text_size[1] - 5),
                          (text_x + text_size[0] + 5, text_y + 5), (0, 0, 0), -1)
            cv2.putText(annotated, text, (text_x, text_y), font, font_scale,
                        (255, 255, 255), font_thickness)

            # Person detection using YOLOv8 (classes=[0] keeps only COCO "person").
            results = yolov8_model(frame, classes=[0], verbose=False)

            for box in results[0].boxes.xyxy.cpu().numpy():
                x1, y1, x2, y2 = map(int, box)
                person_image = frame[y1:y2, x1:x2]
                if person_image.size == 0:
                    continue

                if predicted_label == "Normal":
                    # If frame is Normal, mark all persons as Normal (Green)
                    color = (0, 255, 0)
                    person_label = "Normal"
                else:
                    # Convert image and classify using ResNet
                    person_rgb = cv2.cvtColor(person_image, cv2.COLOR_BGR2RGB)
                    person_tensor = transform(Image.fromarray(person_rgb)).unsqueeze(0).to(device)

                    with torch.no_grad():
                        output = resnet_model(person_tensor)
                    person_label = CLASSES_LIST[output.argmax(dim=1).item()]

                    color = (0, 0, 255) if person_label == "Shoplifting" else (0, 255, 0)

                # Draw bounding box and label (baseline clamped to stay in frame)
                cv2.rectangle(annotated, (x1, y1), (x2, y2), color, 2)
                cv2.putText(annotated, person_label, (x1, max(y1 - 5, 12)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

            out.write(annotated)
    finally:
        # Always finalise the files, even if a model call raises mid-video.
        video_capture.release()
        out.release()

    print(f"Processing complete. Output saved as {output_video_path}")

# Run the hybrid model (frame classifier + YOLO person boxes + ResNet per person)
# on Shoplifting-32.mp4 and save the annotated video to bounding_box_folder on Drive.
process_video("/content/drive/MyDrive/FYP Dataset/Shoplifting/Shoplifting-32.mp4", "/content/drive/MyDrive/bounding_box_folder/Shoplifting-32_output_video.mp4")




Frame Prediction: Normal (Normal: 0.83, Shoplifting: 0.17)

0: 384x640 4 persons, 1 truck, 160.6ms
Speed: 5.4ms preprocess, 160.6ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)
Frame Prediction: Normal (Normal: 0.81, Shoplifting: 0.19)

0: 384x640 4 persons, 1 truck, 146.6ms
Speed: 5.3ms preprocess, 146.6ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)
Frame Prediction: Normal (Normal: 0.83, Shoplifting: 0.17)

0: 384x640 4 persons, 1 truck, 144.5ms
Speed: 5.1ms preprocess, 144.5ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)
Frame Prediction: Normal (Normal: 0.84, Shoplifting: 0.16)

0: 384x640 4 persons, 156.1ms
Speed: 5.2ms preprocess, 156.1ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)
Frame Prediction: Normal (Normal: 0.83, Shoplifting: 0.17)

0: 384x640 4 persons, 143.1ms
Speed: 4.8ms preprocess, 143.1ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)
Frame Prediction: Normal (Normal:

In [None]:
import matplotlib.pyplot as plt
import tensorflow as tf
import torch
import torch.nn as nn
import cv2
import numpy as np
from PIL import Image
from torchvision import transforms, models
from ultralytics import YOLO

# Constants
# Size preprocess_frame resizes each video frame to before frame_model sees it.
IMAGE_HEIGHT, IMAGE_WIDTH = 256, 256
# Label names, indexed by the argmax of a model's output (frame model and ResNet).
CLASSES_LIST = ["Normal", "Shoplifting"]

def preprocess_frame(frame):
    """Resize a frame to IMAGE_HEIGHT x IMAGE_WIDTH and divide its pixels by 255.

    Args:
        frame: Image array, e.g. a frame returned by ``cv2.VideoCapture.read``.

    Returns:
        The resized frame as a float array (values in [0, 1] for 8-bit input).
    """
    # cv2.resize expects dsize as (width, height), not (height, width).
    resized_frame = cv2.resize(frame, (IMAGE_WIDTH, IMAGE_HEIGHT))
    return resized_frame / 255.0

# Pick the inference device first so the checkpoint can be mapped onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load frame-level model (TensorFlow)
frame_model = tf.keras.models.load_model('/content/drive/MyDrive/60_model_bs_256_frame2.h5')

# Load YOLOv8 for person detection
yolov8_model = YOLO("yolov8n.pt")

# Load fine-tuned PyTorch model (ResNet34) for action classification.
# weights=None: the strict load_state_dict below overwrites every parameter,
# so downloading the ImageNet weights first would be wasted work.
resnet_model = models.resnet34(weights=None)
num_ftrs = resnet_model.fc.in_features
resnet_model.fc = nn.Linear(num_ftrs, 2)  # two outputs, looked up in CLASSES_LIST
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
resnet_model.load_state_dict(
    torch.load("/content/drive/MyDrive/resnet34_shoplifting.pth", map_location=device)
)
resnet_model.eval()
resnet_model.to(device)

# Define transforms for ResNet
transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def process_video(video_path, output_video_path, shoplifting_threshold=0.995):
    """Write a copy of a video with per-person boxes labelled with probabilities.

    Each frame is classified by ``frame_model`` and the prediction is
    printed. When the frame is predicted "Shoplifting", people are detected
    with YOLOv8 and each crop is classified by the ResNet34 model. Every
    person is boxed and labelled with the predicted class and the softmax
    shoplifting probability; the box is red only when that probability
    reaches ``shoplifting_threshold``, green otherwise.

    Args:
        video_path: Path of the input video.
        output_video_path: Path of the ``mp4v``-encoded video to create.
        shoplifting_threshold: Minimum softmax shoplifting probability for a
            red box. The default 0.995 is the smallest value that is shown as
            "1.00" by the two-decimal label, replacing an exact float
            comparison with 1.00.

    Raises:
        IOError: If ``video_path`` cannot be opened.
    """
    video_capture = cv2.VideoCapture(video_path)
    if not video_capture.isOpened():
        video_capture.release()
        raise IOError(f"Could not open video: {video_path}")

    # Keep the fractional frame rate (e.g. 29.97): truncating it to an int
    # makes the output play at a different speed than the input.
    frame_rate = video_capture.get(cv2.CAP_PROP_FPS)
    if not frame_rate > 0:
        frame_rate = 30.0  # FPS not reported by the container; use a common default
    width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, frame_rate, (width, height))

    try:
        while True:
            ret, frame = video_capture.read()
            if not ret:
                break

            # Frame-level prediction
            input_data = np.expand_dims(preprocess_frame(frame), axis=0)
            predictions = frame_model.predict(input_data, verbose=0)[0]
            predicted_label = CLASSES_LIST[int(np.argmax(predictions))]

            print(f"Predicted: {predicted_label} (Normal: {predictions[0]:.2f}, Shoplifting: {predictions[1]:.2f})")

            # The models read the untouched `frame`; drawing happens on a copy
            # so earlier boxes/labels never leak into later person crops.
            annotated = frame.copy()

            if predicted_label == "Shoplifting":
                # classes=[0] keeps only COCO "person" detections.
                results = yolov8_model(frame, classes=[0], verbose=False)

                for box in results[0].boxes.xyxy.cpu().numpy():
                    x1, y1, x2, y2 = map(int, box)
                    person_image = frame[y1:y2, x1:x2]
                    if person_image.size == 0:
                        continue

                    person_rgb = cv2.cvtColor(person_image, cv2.COLOR_BGR2RGB)
                    person_tensor = transform(Image.fromarray(person_rgb)).unsqueeze(0).to(device)

                    with torch.no_grad():
                        output = resnet_model(person_tensor)
                        probabilities = torch.nn.functional.softmax(output, dim=1)[0]
                    normal_prob = probabilities[0].item()
                    shoplifting_prob = probabilities[1].item()
                    person_label = CLASSES_LIST[output.argmax(dim=1).item()]

                    print(f"Person detected: {person_label} (Normal: {normal_prob:.2f}, Shoplifting: {shoplifting_prob:.2f})")

                    # Red only for near-certain shoplifting; exact equality
                    # with 1.00 is unreliable for floating-point softmax output.
                    if shoplifting_prob >= shoplifting_threshold:
                        color = (0, 0, 255)
                    else:
                        color = (0, 255, 0)

                    cv2.rectangle(annotated, (x1, y1), (x2, y2), color, 2)
                    # Clamp the label's baseline so it stays inside the frame.
                    cv2.putText(annotated, f"{person_label} ({shoplifting_prob:.2f})",
                                (x1, max(y1 - 5, 12)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

            out.write(annotated)
    finally:
        # Always finalise the files, even if a model call raises mid-video.
        video_capture.release()
        out.release()

    print(f"Processing complete. Output saved as {output_video_path}")
# Run the hybrid model (frame classifier -> YOLO person boxes -> ResNet per person)
# on Shoplifting-32.mp4 and save the annotated video to bounding_box_folder on Drive.
process_video("/content/drive/MyDrive/FYP Dataset/Shoplifting/Shoplifting-32.mp4", "/content/drive/MyDrive/bounding_box_folder/Shoplifting-32_output_video_prb3.mp4")





Predicted: Normal (Normal: 0.83, Shoplifting: 0.17)
Predicted: Normal (Normal: 0.81, Shoplifting: 0.19)
Predicted: Normal (Normal: 0.83, Shoplifting: 0.17)
Predicted: Normal (Normal: 0.84, Shoplifting: 0.16)
Predicted: Normal (Normal: 0.83, Shoplifting: 0.17)
Predicted: Normal (Normal: 0.83, Shoplifting: 0.17)
Predicted: Normal (Normal: 0.82, Shoplifting: 0.18)
Predicted: Normal (Normal: 0.83, Shoplifting: 0.17)
Predicted: Normal (Normal: 0.79, Shoplifting: 0.21)
Predicted: Normal (Normal: 0.80, Shoplifting: 0.20)
Predicted: Normal (Normal: 0.81, Shoplifting: 0.19)
Predicted: Normal (Normal: 0.81, Shoplifting: 0.19)
Predicted: Normal (Normal: 0.79, Shoplifting: 0.21)
Predicted: Normal (Normal: 0.85, Shoplifting: 0.15)
Predicted: Normal (Normal: 0.85, Shoplifting: 0.15)
Predicted: Normal (Normal: 0.91, Shoplifting: 0.09)
Predicted: Normal (Normal: 0.95, Shoplifting: 0.05)
Predicted: Normal (Normal: 0.95, Shoplifting: 0.05)
Predicted: Normal (Normal: 0.96, Shoplifting: 0.04)
Predicted: N

In [None]:
import matplotlib.pyplot as plt
import tensorflow as tf
import torch
import torch.nn as nn
import cv2
import numpy as np
from PIL import Image
from torchvision import transforms, models
from ultralytics import YOLO

# Constants
# Size preprocess_frame resizes a frame to.
IMAGE_HEIGHT, IMAGE_WIDTH = 256, 256
# Label names, indexed by the argmax of the ResNet output in process_video.
CLASSES_LIST = ["Normal", "Shoplifting"]

def preprocess_frame(frame):
    """Resize a frame to IMAGE_HEIGHT x IMAGE_WIDTH and divide its pixels by 255.

    Args:
        frame: Image array, e.g. a frame returned by ``cv2.VideoCapture.read``.

    Returns:
        The resized frame as a float array (values in [0, 1] for 8-bit input).
    """
    # cv2.resize expects dsize as (width, height), not (height, width).
    resized_frame = cv2.resize(frame, (IMAGE_WIDTH, IMAGE_HEIGHT))
    return resized_frame / 255.0

# Pick the inference device first so the checkpoint can be mapped onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load frame-level model (TensorFlow)
# NOTE(review): this cell's process_video never calls frame_model; the load is
# kept in case other cells rely on the name -- confirm and drop it if unused.
frame_model = tf.keras.models.load_model('/content/drive/MyDrive/60_model_bs_256_frame2.h5')

# Load YOLOv8 for person detection
yolov8_model = YOLO("yolov8n.pt")

# Load fine-tuned PyTorch model (ResNet34) for action classification.
# weights=None: the strict load_state_dict below overwrites every parameter,
# so downloading the ImageNet weights first would be wasted work.
resnet_model = models.resnet34(weights=None)
num_ftrs = resnet_model.fc.in_features
resnet_model.fc = nn.Linear(num_ftrs, 2)  # two outputs, looked up in CLASSES_LIST
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
resnet_model.load_state_dict(
    torch.load("/content/drive/MyDrive/resnet34_shoplifting.pth", map_location=device)
)
resnet_model.eval()
resnet_model.to(device)

# Define transforms for ResNet
transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def is_blurry(image, threshold=100):
    """Report whether ``image`` counts as blurry.

    The image is converted from BGR to grayscale and the variance of its
    Laplacian is used as the sharpness score; a score below ``threshold``
    means the image is treated as blurry.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    laplacian = cv2.Laplacian(grayscale, cv2.CV_64F)
    sharpness_score = laplacian.var()
    # Low variance -> few strong edges -> blurry.
    return sharpness_score < threshold

def process_video(video_path, output_video_path, min_bbox_size=100):
    """Write a copy of a video with boxes on large, sharp person detections.

    People are detected with YOLOv8 on every frame. Detections narrower or
    shorter than ``min_bbox_size`` pixels, or whose crop ``is_blurry``
    rejects, are skipped. Each remaining crop is classified by the ResNet34
    model and boxed in red ("Shoplifting") or green ("Normal").

    Args:
        video_path: Path of the input video.
        output_video_path: Path of the ``mp4v``-encoded video to create.
        min_bbox_size: Minimum box width and height, in pixels, for a person
            to be classified (smaller boxes are likely far away).

    Raises:
        IOError: If ``video_path`` cannot be opened.
    """
    video_capture = cv2.VideoCapture(video_path)
    if not video_capture.isOpened():
        video_capture.release()
        raise IOError(f"Could not open video: {video_path}")

    # Keep the fractional frame rate (e.g. 29.97): truncating it to an int
    # makes the output play at a different speed than the input.
    frame_rate = video_capture.get(cv2.CAP_PROP_FPS)
    if not frame_rate > 0:
        frame_rate = 30.0  # FPS not reported by the container; use a common default
    width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, frame_rate, (width, height))

    try:
        while True:
            ret, frame = video_capture.read()
            if not ret:
                break

            # The models and the blur check read the untouched `frame`;
            # drawing happens on a copy so boxes drawn for one person never
            # add edges or colour to another person's crop.
            annotated = frame.copy()

            # classes=[0] keeps only COCO "person" detections.
            results = yolov8_model(frame, classes=[0], verbose=False)

            for box in results[0].boxes.xyxy.cpu().numpy():
                x1, y1, x2, y2 = map(int, box)

                # Ignore people who are too small (likely far away)
                if (x2 - x1) < min_bbox_size or (y2 - y1) < min_bbox_size:
                    continue

                person_image = frame[y1:y2, x1:x2]
                if person_image.size == 0:
                    continue

                # Check if the person image is blurry
                if is_blurry(person_image):
                    continue

                # ResNet Processing
                person_rgb = cv2.cvtColor(person_image, cv2.COLOR_BGR2RGB)
                person_tensor = transform(Image.fromarray(person_rgb)).unsqueeze(0).to(device)

                with torch.no_grad():
                    output = resnet_model(person_tensor)
                person_label = CLASSES_LIST[output.argmax(dim=1).item()]

                color = (0, 0, 255) if person_label == "Shoplifting" else (0, 255, 0)
                cv2.rectangle(annotated, (x1, y1), (x2, y2), color, 2)
                # Clamp the label's baseline so it stays inside the frame.
                cv2.putText(annotated, person_label, (x1, max(y1 - 5, 12)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

            out.write(annotated)
    finally:
        # Always finalise the files, even if a model call raises mid-video.
        video_capture.release()
        out.release()

    print(f"Processing complete. Output saved as {output_video_path}")
# Run the YOLO + ResNet pipeline (this cell's process_video does not call frame_model)
# on Shoplifting-2.mp4 and save the annotated video to bounding_box_folder on Drive.
process_video("/content/drive/MyDrive/FYP Dataset/Shoplifting/Shoplifting-2.mp4", "/content/drive/MyDrive/bounding_box_folder/Shoplifting-2_output_video_dim2.mp4")



Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.




Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.25M/6.25M [00:00<00:00, 80.2MB/s]
Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to /root/.cache/torch/hub/checkpoints/resnet34-b627a593.pth
100%|██████████| 83.3M/83.3M [00:01<00:00, 74.5MB/s]



0: 384x640 3 persons, 1 chair, 1 potted plant, 320.3ms
Speed: 19.1ms preprocess, 320.3ms inference, 33.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 potted plant, 122.8ms
Speed: 6.2ms preprocess, 122.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 130.8ms
Speed: 4.4ms preprocess, 130.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 131.0ms
Speed: 4.3ms preprocess, 131.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 125.0ms
Speed: 4.4ms preprocess, 125.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 129.0ms
Speed: 6.7ms preprocess, 129.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 131.8ms
Speed: 5.9ms preprocess, 131.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 124.2ms
Speed: 4.8ms preprocess

In [None]:
import cv2
import torch
import torch.nn as nn
import numpy as np
from PIL import Image
from torchvision import transforms, models
from ultralytics import YOLO

def is_clear_image(image, sharpness_threshold=50, brightness_threshold=40):
    """Return True when an image is both sharp enough and bright enough.

    Sharpness is measured as the variance of the Laplacian of the grayscale
    image, brightness as the mean grayscale intensity. Both values must be
    strictly greater than their thresholds.

    Args:
        image: Image as a NumPy array. A 2-D array is used directly as the
            grayscale image; anything else is converted with
            ``cv2.COLOR_BGR2GRAY`` (i.e. it is treated as BGR).
        sharpness_threshold: Minimum (exclusive) Laplacian variance.
        brightness_threshold: Minimum (exclusive) mean gray level.

    Returns:
        bool: True if the image passes both checks. ``None`` or an array
        with no pixels is reported as not clear.
    """
    # cv2.cvtColor raises on None / zero-sized input, so answer directly.
    if image is None or image.size == 0:
        return False

    # A 2-D array is already single-channel; BGR2GRAY would reject it.
    gray = image if image.ndim == 2 else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()
    mean_intensity = np.mean(gray)

    # bool(...) turns the NumPy boolean into a plain Python bool.
    return bool(
        laplacian_var > sharpness_threshold
        and mean_intensity > brightness_threshold
    )

def is_camera_angle_correct(frame, hough_threshold=200, min_angle=80, max_angle=100):
    """Heuristic camera-orientation check based on straight lines in a frame.

    The frame is converted to grayscale, passed through Canny edge detection
    and the standard Hough line transform. The check passes when at least one
    detected line has a Hough ``theta`` between ``min_angle`` and
    ``max_angle`` degrees (inclusive).

    Note on geometry: in ``cv2.HoughLines`` ``theta`` is the angle of the
    line's *normal*, so a theta close to 90 degrees corresponds to a
    near-horizontal line in the image (theta close to 0 is a vertical line).

    Args:
        frame: BGR image as a NumPy array.
        hough_threshold: Accumulator threshold passed to ``cv2.HoughLines``.
        min_angle: Lower bound (degrees, inclusive) of the accepted theta band.
        max_angle: Upper bound (degrees, inclusive) of the accepted theta band.

    Returns:
        bool: True if at least one line falls in the accepted band; False if
        no such line exists, no lines are detected at all, or the frame is
        ``None`` / has no pixels.
    """
    # cv2.cvtColor raises on None / zero-sized input, so answer directly.
    if frame is None or frame.size == 0:
        return False

    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = cv2.HoughLines(edges, 1, np.pi / 180, hough_threshold)

    # HoughLines returns None (not an empty array) when nothing is found.
    if lines is None:
        return False

    # Each row of lines[:, 0] is (rho, theta); only theta is needed here.
    thetas_deg = np.degrees(lines[:, 0, 1])
    in_band = (thetas_deg >= min_angle) & (thetas_deg <= max_angle)
    return bool(np.any(in_band))

def process_video(
    video_path,
    output_video_path,
    classifier_weights_path="/content/drive/MyDrive/resnet34_shoplifting.pth",
    detector_weights="yolov8n.pt",
):
    """Annotate every person in a video as "Shoplifting" or "Normal".

    Each frame that passes ``is_camera_angle_correct`` is run through a YOLO
    detector; every detection with class id 0 is cropped, filtered with
    ``is_clear_image`` and classified by a 2-class ResNet-34. A coloured box
    and label are drawn for each classified person. Frames that fail the
    camera check are written to the output unannotated so the output keeps
    the same number of frames (and therefore duration) as the input.

    Args:
        video_path: Path of the input video.
        output_video_path: Path of the annotated ``mp4v`` video to write.
        classifier_weights_path: ``state_dict`` checkpoint for the 2-class
            ResNet-34 classifier.
        detector_weights: Weights passed to ``YOLO(...)``. Class id 0 is
            treated as "person" (as in the COCO-trained ``yolov8n.pt``).

    Raises:
        IOError: If the input video cannot be opened or the output video
            cannot be created.
    """
    video_capture = cv2.VideoCapture(video_path)
    if not video_capture.isOpened():
        video_capture.release()
        raise IOError(f"Could not open input video: {video_path}")

    out = None
    try:
        # Keep the fractional frame rate (e.g. 29.97); truncating it to an
        # int makes the output play at a slightly different speed.
        frame_rate = video_capture.get(cv2.CAP_PROP_FPS)
        if not frame_rate > 0:
            # Some containers report 0/NaN; fall back to a common default
            # rather than handing an invalid rate to the writer.
            frame_rate = 30.0

        width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_video_path, fourcc, frame_rate, (width, height))
        if not out.isOpened():
            raise IOError(f"Could not open output video for writing: {output_video_path}")

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        yolov8_model = YOLO(detector_weights)
        resnet_model = _load_person_classifier(classifier_weights_path, device)

        transform = transforms.Compose([
            transforms.Resize((64, 64)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        while True:
            ret, frame = video_capture.read()
            if not ret:
                break

            if is_camera_angle_correct(frame):
                _annotate_people(frame, yolov8_model, resnet_model, transform, device)
            else:
                print("Warning: Camera is not positioned correctly at 90 degrees. Skipping frame.")

            # Always write the frame (annotated or not) to keep the timeline intact.
            out.write(frame)
    finally:
        # Release handles even if detection/classification raised midway.
        video_capture.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()

    print(f"Processing complete. Output saved as {output_video_path}")


def _load_person_classifier(weights_path, device):
    """Build a 2-class ResNet-34, load its checkpoint onto `device`, set eval mode."""
    # weights=None: the checkpoint below overwrites every parameter, so
    # downloading the ImageNet weights first would be wasted work.
    model = models.resnet34(weights=None)
    model.fc = nn.Linear(model.fc.in_features, 2)
    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
    model.load_state_dict(torch.load(weights_path, map_location=device))
    model.to(device)
    model.eval()
    return model


def _annotate_people(frame, detector, classifier, transform, device):
    """Classify each clear person crop and draw its labelled box on `frame` in place."""
    # Crop from an untouched copy so boxes/labels drawn for earlier
    # detections never leak into the pixels fed to the classifier.
    clean_frame = frame.copy()
    frame_height, frame_width = frame.shape[:2]

    # verbose=False silences the per-frame Ultralytics log lines.
    results = detector(frame, verbose=False)
    boxes = results[0].boxes

    for box, cls in zip(boxes.xyxy, boxes.cls):
        if int(cls) != 0:  # only class id 0 ("person") is classified
            continue

        x1, y1, x2, y2 = map(int, box.cpu().numpy())
        # Clamp to the frame: a negative index would silently wrap around.
        x1, y1 = max(x1, 0), max(y1, 0)
        x2, y2 = min(x2, frame_width), min(y2, frame_height)

        person_crop = clean_frame[y1:y2, x1:x2]
        if person_crop.shape[0] == 0 or person_crop.shape[1] == 0:
            continue
        if not is_clear_image(person_crop):
            continue

        # OpenCV frames are BGR; the torchvision pipeline expects an RGB PIL image.
        rgb_crop = cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB)
        batch = transform(Image.fromarray(rgb_crop)).unsqueeze(0).to(device)

        with torch.no_grad():
            output = classifier(batch)
            _, predicted = torch.max(output, 1)

        # Index 1 is the "Shoplifting" class; colours are BGR (red / green).
        person_label = "Shoplifting" if predicted.item() == 1 else "Normal"
        color = (0, 0, 255) if person_label == "Shoplifting" else (0, 255, 0)

        cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
        cv2.putText(frame, person_label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

# Run the pipeline on the Shoplifting-32 clip; the annotated video is
# written to the bounding-box folder on Drive.
process_video(
    video_path="/content/drive/MyDrive/FYP Dataset/Shoplifting/Shoplifting-32.mp4",
    output_video_path="/content/drive/MyDrive/bounding_box_folder/Shoplifting-32_output_video_dim6.mp4",
)



0: 384x640 4 persons, 1 truck, 127.9ms
Speed: 3.9ms preprocess, 127.9ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 1 truck, 121.1ms
Speed: 5.0ms preprocess, 121.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 1 truck, 123.7ms
Speed: 4.9ms preprocess, 123.7ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 140.3ms
Speed: 4.7ms preprocess, 140.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 125.3ms
Speed: 5.1ms preprocess, 125.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 121.1ms
Speed: 4.7ms preprocess, 121.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 127.8ms
Speed: 4.3ms preprocess, 127.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 121.4ms
Speed: 4.3ms preprocess, 121.4ms infere