## Setup 
This notebook was originally run on Google Colab.

In [None]:
from google.colab import drive
drive.mount("/content/drive")

# Direct to your assignment folder.
%cd /content/drive/MyDrive/ML_P2_Illustrations

%load_ext autoreload
%autoreload 1

# Install
!pip install torch torchvision torchaudio ultralytics supervision
!!pip install transformers accelerate sentencepiece

Mounted at /content/drive
/content/drive/MyDrive/ML_P2_Illustrations
Collecting ultralytics
  Downloading ultralytics-8.3.48-py3-none-any.whl.metadata (35 kB)
Collecting supervision
  Downloading supervision-0.25.0-py3-none-any.whl.metadata (14 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.13-py3-none-any.whl.metadata (9.4 kB)
Downloading ultralytics-8.3.48-py3-none-any.whl (898 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m898.8/898.8 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading supervision-0.25.0-py3-none-any.whl (181 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.5/181.5 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.13-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, supervision, ultralytics
Successfully installed supervision-0.25.0 ultralytics-8.3.48 ultralytics-thop-2.0.13




## Entity Detection & Image Captioning
We applied YOLOv10 for entity detection and BLIP for image captioning to the whole dataset. For entity detection, the confidence threshold is again set to 5% so that the detections can be flexibly filtered later.

In [7]:
import os
import json
import torch
from ultralytics import YOLO
import supervision as sv
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

# Settings
# Use the first CUDA device when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
else:
    DEVICE = torch.device('cpu')
# Minimum confidence passed to YOLO; deliberately low so detections can be filtered later.
CONFIDENCE_THRESHOLD = 0.05

# Load YOLO model
yolo_model = YOLO("yolov10x.pt")

# Load BLIP model
# The processor and the captioning model are loaded from the same checkpoint name.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
blip_model = blip_model.to(DEVICE)

# Define category mappings
# Class labels listed in class-id order: the position of a name in this tuple
# is the integer id it is looked up by.
# NOTE(review): this looks like the 80-class COCO label list; confirm it
# matches the label set of the loaded YOLO weights.
_CATEGORY_NAMES = (
    # 0-9
    'person', 'bicycle', 'car', 'motorcycle', 'airplane',
    'bus', 'train', 'truck', 'boat', 'traffic light',
    # 10-19
    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
    'cat', 'dog', 'horse', 'sheep', 'cow',
    # 20-29
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
    'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    # 30-39
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle',
    # 40-49
    'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange',
    # 50-59
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
    'cake', 'chair', 'couch', 'potted plant', 'bed',
    # 60-69
    'dining table', 'toilet', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
    # 70-79
    'toaster', 'sink', 'refrigerator', 'book', 'clock',
    'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
)

# Integer class id -> label.
category_dict = dict(enumerate(_CATEGORY_NAMES))

# Labels from category_dict that count as animals. 'bird' (class id 14) is
# included so that a page showing only birds is classified as "Animal" rather
# than "Neither".
animal_classes = {
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe',
}

# Classification logic
def classify_detections(detected_classes):
    """Summarise which kinds of living entities were detected.

    Args:
        detected_classes: Collection of class-label strings (for example the
            labels of every detection on one page).

    Returns:
        "Both" if 'person' and at least one label from ``animal_classes`` are
        present, "Human" if only 'person' is, "Animal" if only an animal label
        is, and "Neither" otherwise (including for empty input).
    """
    has_human = 'person' in detected_classes
    # True as soon as any detected label is one of the animal labels.
    has_animal = not animal_classes.isdisjoint(detected_classes)

    if has_human and has_animal:
        return "Both"
    if has_human:
        return "Human"
    if has_animal:
        return "Animal"
    return "Neither"

# Process images with YOLO
def process_with_yolo(image_path):
    """Run the YOLO detector on one image and summarise what was found.

    Args:
        image_path: Path of the image handed to ``yolo_model``.

    Returns:
        A ``(classification, detected_entities)`` tuple. ``classification`` is
        the value of ``classify_detections`` for the detected labels.
        ``detected_entities`` is a list with one dict per detection, holding
        the keys "class", "confidence" and "bounding_box".
    """
    # Only the first result is used (a single image is passed in).
    prediction = yolo_model(image_path, conf=CONFIDENCE_THRESHOLD, verbose=False)[0]
    detections = sv.Detections.from_ultralytics(prediction)

    # Values are cast to plain Python floats so the entries are JSON serialisable.
    detected_entities = [
        {
            "class": category_dict[cls_id],
            "confidence": float(score),
            "bounding_box": [float(value) for value in box],  # taken from detections.xyxy
        }
        for cls_id, score, box in zip(
            detections.class_id, detections.confidence, detections.xyxy
        )
    ]

    labels = [entity["class"] for entity in detected_entities]
    return classify_detections(labels), detected_entities

# Process images with BLIP for caption generation
def process_with_blip(image_path):
    """Generate a caption for one image with the BLIP model.

    Args:
        image_path: Path of the image to caption.

    Returns:
        The decoded caption string, or the literal string "Error" if anything
        in loading, preprocessing or generation raises. The failure is printed
        rather than re-raised, so one bad page does not abort a whole run.
    """
    try:
        # Load and preprocess the image. The context manager closes the file
        # handle even when decoding fails; convert() returns a separate copy
        # that stays usable after the file is closed.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
        inputs = blip_processor(images=image, return_tensors="pt").to(DEVICE)
        # Cap the caption length at 50 newly generated tokens.
        outputs = blip_model.generate(**inputs, max_new_tokens=50)
        return blip_processor.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # Deliberate best-effort: report the problem and return a sentinel caption.
        print(f"Error processing image {image_path}: {e}")
        return "Error"

# Process the whole dataset and save as JSON
def process_dataset(base_folder):
    """Detect entities and caption every PNG page of every book folder.

    Each sub-directory of ``base_folder`` is treated as one book. For every
    file in it whose name ends with ".png", the YOLO classification, the
    detected entities and the BLIP caption are collected. The results are
    written to ``detections.json`` inside that sub-directory as
    ``{"book_name": ..., "pages": [{"page", "classification",
    "detected_entities", "caption"}, ...]}``. An existing ``detections.json``
    is overwritten.

    Args:
        base_folder: Directory that contains one sub-directory per book.
            Entries that are not directories are skipped.

    Returns:
        None. Results are written to disk and a progress line is printed per
        folder.
    """
    # os.listdir returns entries in arbitrary order; sorting (lexicographically)
    # makes the processing order and the page order in the JSON reproducible.
    for folder_name in sorted(os.listdir(base_folder)):
        folder_path = os.path.join(base_folder, folder_name)
        if not os.path.isdir(folder_path):
            continue

        # Initialize results for the folder
        folder_results = {
            "book_name": folder_name,
            "pages": []
        }

        for file_name in sorted(os.listdir(folder_path)):
            if not file_name.endswith(".png"):
                continue
            image_path = os.path.join(folder_path, file_name)

            # YOLO detections
            classification, detected_entities = process_with_yolo(image_path)

            # BLIP caption
            caption = process_with_blip(image_path)

            # Append detection result
            folder_results["pages"].append({
                "page": file_name,
                "classification": classification,
                "detected_entities": detected_entities,
                "caption": caption
            })

        # Save results as JSON
        output_file_path = os.path.join(folder_path, "detections.json")
        with open(output_file_path, 'w') as json_file:
            json.dump(folder_results, json_file, indent=4)

        print(f"Processed folder: {folder_name}, results saved to {output_file_path}")

# Define base folder for dataset
# Relative path: it resolves against the working directory chosen with %cd in
# the Setup cell. Each sub-folder of it is processed as one book.
base_folder = "preprocessed_all"

# Run dataset processing
# Writes one detections.json into every book sub-folder and prints a line per folder.
process_dataset(base_folder)


Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Processed folder: lifedeathofjenny00newyiala, results saved to preprocessed_all/lifedeathofjenny00newyiala/detections.json
Processed folder: lifedeathofjenny00yorkiala, results saved to preprocessed_all/lifedeathofjenny00yorkiala/detections.json
Processed folder: lifeingreenwood00lans, results saved to preprocessed_all/lifeingreenwood00lans/detections.json
Processed folder: lifeofgeorgewash00ceciiala, results saved to preprocessed_all/lifeofgeorgewash00ceciiala/detections.json
Processed folder: lifeofjacksprath00londiala, results saved to preprocessed_all/lifeofjacksprath00londiala/detections.json
Processed folder: lifeofjacksprath00manciala, results saved to preprocessed_all/lifeofjacksprath00manciala/detections.json
Processed folder: lifeofjacksprath00yorkiala, results saved to preprocessed_all/lifeofjacksprath00yorkiala/detections.json
Processed folder: lifeofjosephson00macgiala, results saved to preprocessed_all/lifeofjosephson00macgiala/detections.json
Processed folder: lifetravel