In [19]:
# !pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to c:\users\night shift\appdata\local\temp\pip-req-build-y9sri56a
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting ftfy (from clip==1.0)
  Using cached ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting regex (from clip==1.0)
  Downloading regex-2024.11.6-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting torch (from clip==1.0)
  Downloading torch-2.6.0-cp313-cp313-win_amd64.whl.metadata (28 kB)
Collecting torchvision (from clip==1.0)
  Downloading torchvision-0.21.0-cp313-cp313-win_amd64.whl.metadata (6.3 kB)
Collecting filelock (from torch->clip==1.0)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting networkx (from torch->clip==1.0)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Co

  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git 'C:\Users\Night Shift\AppData\Local\Temp\pip-req-build-y9sri56a'


In [167]:
import cv2
import numpy as np
from PIL import Image
import torch
import clip
import os
from tqdm import tqdm

In [333]:
def setup_clip_model(labels):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model, preprocess = clip.load("ViT-B/32", device=device)
    text_inputs = torch.cat([clip.tokenize(f"a photo of a {label}") for label in labels]).to(device)
    return model, preprocess, text_inputs, device

In [61]:
def extract_obj_from_mask(img_path, mask_path, output_size=(244,244)):
    image = cv2.imread(img_path)
    mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

    if image is None or mask is None:
        raise ValueError(f"Mask: {mask_path} or image: {img_path} not found")
    
    _, thresh = cv2.threshold(mask, 1, 255, cv2.THRESH_BINARY)

    contour, _ = cv2.findContours(thresh, mode = cv2.RETR_EXTERNAL, method = cv2.CHAIN_APPROX_SIMPLE)

    if not contour:
        raise ValueError(f"Not Contour was found")

    x,y,w,h = cv2.boundingRect(np.concatenate(contour))

    cropped = image[y:y+h, x:x+w]
    cropped_resized = cv2.resize(cropped, output_size)

    cropped_pil = Image.fromarray(cv2.cvtColor(cropped_resized, cv2.COLOR_BGR2RGB))

    return cropped_pil

In [57]:
def classify_with_clip(pil_img, labels = labels, text_inputs = text_inputs):
    img_input = preprocess(pil_img).unsqueeze(0).to(device)

    with torch.no_grad():
        img_features = model.encode_image(img_input)
        text_features = model.encode_text(text_inputs)

        img_features /= img_features.norm(dim=1, keepdim=True)
        text_features /= text_features.norm(dim=1, keepdim=True)

        similarity = (100.0 * img_features @ text_features.T).softmax(dim=-1)
        top_prob, top_label_idx = similarity[0].max(0)
    
    predicted_label = labels[top_label_idx]
    confidence = top_prob.item()

    return predicted_label, confidence

In [339]:
def extract_center_crop(frame, size_ratio=0.5):
    h, w = frame.shape[:2]
    ch, cw = int(h * size_ratio), int(w * size_ratio)
    if ch == 0 or cw == 0:
        raise ValueError("Frame too small for cropping")
    x1, y1 = (w - cw) // 2, (h - ch) // 2
    return frame[y1:y1 + ch, x1:x1 + cw]

In [335]:
def run_batch_detection(image_root, mask_root):
    all_detections = []

    for class_folder in os.listdir(image_root):
        image_folder = os.path.join(image_root, class_folder)
        mask_folder = os.path.join(mask_root, class_folder)

        if not os.path.isdir(image_folder):
            continue

        print(f"🔍 Scanning folder: {class_folder}")
        for img_file in tqdm(os.listdir(image_folder)):
            if not img_file.lower().endswith((".jpg", ".png", ".jpeg")):
                continue

            image_path = os.path.join(image_folder, img_file)
            mask_path = os.path.join(mask_folder, os.path.splitext(img_file)[0] + "_mask.png")

            frame = cv2.imread(image_path)
            if frame is None:
                print(f"⚠️ Skipping {img_file} — image could not be loaded.")
                continue
            cropped = extract_center_crop(frame)

            try:
                cropped = extract_obj_from_mask(image_path, mask_path)
                label, prob = classify_with_clip(cropped)

                print(f"📣 ALERT: {label.upper()} detected with {prob:.1%} confidence — {img_file}")
                all_detections.append((img_file, label, prob))

            except Exception as e:
                print(f"❌ Error processing {img_file}: {e}")

    return all_detections


In [311]:
def run_video_detection(video_source=0):
    cap = cv2.VideoCapture(video_source)
    print("🎥 Starting video stream...")

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Extract object (center crop)
        crop = extract_center_crop(frame)
        crop_pil = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))

        # Classify
        label, prob = classify_with_clip(crop_pil)
        # Display alert on frame
        # if prob > .79:
        alert_text = f"{label.upper()} ({prob*100:.1f}%)"
        cv2.putText(frame, alert_text, (30, 40), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 2)

        cv2.imshow("Live Detection", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            print("👋 Exiting...")
            break

    cap.release()
    cv2.destroyAllWindows()

In [353]:
labels = ["human", "butterfly", "cat", "dog", "horse", "elephant", "squirrel"]
model, preprocess, text_inputs, device = setup_clip_model(labels)

In [371]:
img_path = "animals10/raw-img/gatto/1001.jpeg"
mask_path = "animals10/renamed_masks/dog/dog_0091_mask.png"

cropped_object = extract_obj_from_mask(img_path, mask_path)
cropped_object.show()

In [373]:
# Classify with CLIP
label, prob = classify_with_clip(cropped_object)
print(f"🔔 Detected: {label} (confidence: {prob:.2%})")

🔔 Detected: human (confidence: 33.30%)


In [313]:
run_video_detection('Untitled design.mp4')

🎥 Starting video stream...
👋 Exiting...


In [341]:
# For animals
img_path = "Test_img/"
mask_path = "test_mask/"

animal_detections = run_batch_detection(img_path, mask_path)

# For humans (change path accordingly if needed)

# img_path = "segmentation-full-body-mads-dataset/segmentation_full_body_mads_dataset_1192_img/segmentation_full_body_mads_dataset_1192_img/images"
# mask_path = "segmentation-full-body-mads-dataset/segmentation_full_body_mads_dataset_1192_img/segmentation_full_body_mads_dataset_1192_img/masks"

# human_detections = run_batch_detection(img_path, mask_path)

🔍 Scanning folder: all


  9%|███████▋                                                                           | 7/75 [00:00<00:01, 66.85it/s]

❌ Error processing butterfly_0091.jpg: Mask: test_mask/all\butterfly_0091_mask.png or image: Test_img/all\butterfly_0091.jpg not found
❌ Error processing butterfly_0092.jpg: Mask: test_mask/all\butterfly_0092_mask.png or image: Test_img/all\butterfly_0092.jpg not found
❌ Error processing butterfly_0093.jpg: Mask: test_mask/all\butterfly_0093_mask.png or image: Test_img/all\butterfly_0093.jpg not found
❌ Error processing butterfly_0094.jpg: Mask: test_mask/all\butterfly_0094_mask.png or image: Test_img/all\butterfly_0094.jpg not found
❌ Error processing butterfly_0095.jpg: Mask: test_mask/all\butterfly_0095_mask.png or image: Test_img/all\butterfly_0095.jpg not found
❌ Error processing butterfly_0096.jpg: Mask: test_mask/all\butterfly_0096_mask.png or image: Test_img/all\butterfly_0096.jpg not found
❌ Error processing butterfly_0097.jpg: Mask: test_mask/all\butterfly_0097_mask.png or image: Test_img/all\butterfly_0097.jpg not found
❌ Error processing butterfly_0098.jpg: Mask: test_mask/

 21%|█████████████████▍                                                                | 16/75 [00:00<00:02, 22.90it/s]

📣 ALERT: DOG detected with 97.9% confidence — dog_4853.jpeg
📣 ALERT: DOG detected with 98.5% confidence — dog_4854.jpeg
📣 ALERT: DOG detected with 83.8% confidence — dog_4855.jpeg
📣 ALERT: DOG detected with 98.3% confidence — dog_4856.jpeg


 27%|█████████████████████▊                                                            | 20/75 [00:02<00:08,  6.40it/s]

📣 ALERT: DOG detected with 97.9% confidence — dog_4857.jpeg
📣 ALERT: DOG detected with 98.9% confidence — dog_4858.jpeg
📣 ALERT: DOG detected with 97.2% confidence — dog_4859.jpeg


 31%|█████████████████████████▏                                                        | 23/75 [00:03<00:11,  4.53it/s]

📣 ALERT: DOG detected with 99.1% confidence — dog_4860.jpeg
📣 ALERT: DOG detected with 96.2% confidence — dog_4861.jpeg


 33%|███████████████████████████▎                                                      | 25/75 [00:04<00:12,  3.88it/s]

📣 ALERT: HUMAN detected with 20.1% confidence — dog_4862.jpeg


 35%|████████████████████████████▍                                                     | 26/75 [00:04<00:13,  3.63it/s]

📣 ALERT: DOG detected with 98.6% confidence — dog_4863.jpeg


 36%|█████████████████████████████▌                                                    | 27/75 [00:05<00:09,  4.99it/s]

📣 ALERT: ELEPHANT detected with 100.0% confidence — elephant_0002.jpg





KeyboardInterrupt: 