# 1) Setup

In [1]:
!pip install -U ultralytics pycocotools > /dev/null

# 2) Download and unzip BBBC041

In [2]:
!wget -O /content/malaria.zip "https://data.broadinstitute.org/bbbc/BBBC041/malaria.zip"
!unzip -o /content/malaria.zip -d /content

--2026-01-04 04:38:49--  https://data.broadinstitute.org/bbbc/BBBC041/malaria.zip
Resolving data.broadinstitute.org (data.broadinstitute.org)... 69.173.68.137
Connecting to data.broadinstitute.org (data.broadinstitute.org)|69.173.68.137|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2259224287 (2.1G) [application/zip]
Saving to: ‘/content/malaria.zip’


2026-01-04 04:39:55 (33.0 MB/s) - ‘/content/malaria.zip’ saved [2259224287/2259224287]

Archive:  /content/malaria.zip
   creating: /content/malaria/
   creating: /content/malaria/images/
  inflating: /content/malaria/images/002f20ad-2ace-499c-9335-c9080bc3e6b5.png  
   creating: /content/__MACOSX/
   creating: /content/__MACOSX/malaria/
   creating: /content/__MACOSX/malaria/images/
  inflating: /content/__MACOSX/malaria/images/._002f20ad-2ace-499c-9335-c9080bc3e6b5.png  
  inflating: /content/malaria/images/003a89b0-a095-417a-8dd6-f408339bbc68.png  
  inflating: /content/__MACOSX/malaria/images/._003a89b0-a0

# 3) Inspect extracted structure (should have /content/malaria/{images,training.json,test.json,...})

In [3]:
!ls -R /content/malaria | head

/content/malaria:
images
test.json
training.json

/content/malaria/images:
002f20ad-2ace-499c-9335-c9080bc3e6b5.png
003a89b0-a095-417a-8dd6-f408339bbc68.png
003d1cdc-0aec-430b-90e0-f4d82135b1ac.png
005e60b6-77b8-458c-b57c-bfe0c7e7df78.png


# 4) Convert BBBC041 annotations to YOLO txt

In [4]:
import json, os

BASE_PATH = "/content/malaria"
IMG_DIR = os.path.join(BASE_PATH, "images")
LABEL_DIR = os.path.join(BASE_PATH, "labels")
os.makedirs(LABEL_DIR, exist_ok=True)

CLASS_MAP = {
    "red blood cell": 0,
    "ring": 1,
    "trophozoite": 2,
    "schizont": 3,
    "gametocyte": 4,
    "leukocyte": 5,      # include leukocyte
    # 'difficult' will be skipped
}

with open(os.path.join(BASE_PATH, "training.json"), "r") as f:
    data = json.load(f)

for item in data:
    img_info = item["image"]
    pathname = img_info["pathname"]          # e.g., "/images/xxx.png"
    img_name = os.path.basename(pathname)    # "xxx.png"
    img_path = os.path.join(IMG_DIR, img_name)
    if not os.path.exists(img_path):
        continue

    H = img_info["shape"]["r"]
    W = img_info["shape"]["c"]

    label_file = os.path.join(LABEL_DIR, img_name.replace(".png", ".txt"))
    with open(label_file, "w") as lf:
        for obj in item["objects"]:
            category = obj["category"]
            if category not in CLASS_MAP:
                continue  # skip unknown/difficult
            cls = CLASS_MAP[category]

            x1 = obj["bounding_box"]["minimum"]["c"]
            y1 = obj["bounding_box"]["minimum"]["r"]
            x2 = obj["bounding_box"]["maximum"]["c"]
            y2 = obj["bounding_box"]["maximum"]["r"]

            xc = ((x1 + x2) / 2) / W
            yc = ((y1 + y2) / 2) / H
            bw = (x2 - x1) / W
            bh = (y2 - y1) / H

            lf.write(f"{cls} {xc} {yc} {bw} {bh}\n")


# 5) Train/val split and copy into YOLO folder structure

In [5]:
from sklearn.model_selection import train_test_split
import shutil

SRC_IMG = IMG_DIR
SRC_LBL = LABEL_DIR
DST_BASE = "/content/malaria_yolo"

imgs = [f for f in os.listdir(SRC_IMG) if f.endswith(".png")]
train, val = train_test_split(imgs, test_size=0.2, random_state=42, shuffle=True)

def copy_split(img_list, split):
    img_dst = f"{DST_BASE}/images/{split}"
    lbl_dst = f"{DST_BASE}/labels/{split}"
    os.makedirs(img_dst, exist_ok=True)
    os.makedirs(lbl_dst, exist_ok=True)
    for img in img_list:
        shutil.copy(os.path.join(SRC_IMG, img), os.path.join(img_dst, img))
        shutil.copy(os.path.join(SRC_LBL, img.replace(".png", ".txt")), os.path.join(lbl_dst, img.replace(".png", ".txt")))

copy_split(train, "train")
copy_split(val, "val")

# 6) Create YOLO dataset YAML

In [6]:
%%writefile /content/malaria_yolo/malaria.yaml
path: /content/malaria_yolo
train: images/train
val: images/val

names:
  0: red_blood_cell
  1: ring
  2: trophozoite
  3: schizont
  4: gametocyte
  5: leukocyte


Writing /content/malaria_yolo/malaria.yaml


# 7) Quick sanity check

In [7]:
!head -n 5 /content/malaria_yolo/malaria.yaml
!ls /content/malaria_yolo/images/train | head
!ls /content/malaria_yolo/labels/train | head


path: /content/malaria_yolo
train: images/train
val: images/val

names:
002f20ad-2ace-499c-9335-c9080bc3e6b5.png
005e60b6-77b8-458c-b57c-bfe0c7e7df78.png
00a02700-2ea2-4590-9e15-ffc9160fd3de.png
00d04a90-80e5-4bce-9511-1b64eabb7a47.png
0154dd8e-72f8-4d78-a4dd-93e139577bd1.png
01b38a56-13eb-447b-89ee-fc21cb40dcc1.png
01b99da8-b66b-464b-a7ee-1cadab8db18b.png
027ab174-e47a-44f8-b85b-372449eeb7d8.png
02ea7aa8-c142-4a3d-939f-e77689741341.png
02fb5e19-6f94-4787-bb78-e550a29a3cbd.png
002f20ad-2ace-499c-9335-c9080bc3e6b5.txt
005e60b6-77b8-458c-b57c-bfe0c7e7df78.txt
00a02700-2ea2-4590-9e15-ffc9160fd3de.txt
00d04a90-80e5-4bce-9511-1b64eabb7a47.txt
0154dd8e-72f8-4d78-a4dd-93e139577bd1.txt
01b38a56-13eb-447b-89ee-fc21cb40dcc1.txt
01b99da8-b66b-464b-a7ee-1cadab8db18b.txt
027ab174-e47a-44f8-b85b-372449eeb7d8.txt
02ea7aa8-c142-4a3d-939f-e77689741341.txt
02fb5e19-6f94-4787-bb78-e550a29a3cbd.txt


# 8) Train YOLOv8m (200 epochs as requested)

In [8]:
from ultralytics import YOLO

model = YOLO("yolov8m.pt")
model.train(
    data="/content/malaria_yolo/malaria.yaml",
    epochs=150,
    imgsz=640,
    batch=16,
    optimizer="AdamW",
    lr0=1e-3,
    cos_lr=True,
    patience=30,
    device=0,
    workers=2,
    name="train_bbbc041_v8m",
    project="/content/runs/detect",
)


Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8m.pt to 'yolov8m.pt': 100% ━━━━━━━━━━━━ 49.7MB 294.6MB/s 0.2s
Ultralytics 8.3.247 🚀 Python-3.12.12 torch-2.9.0+cu126 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=True, cutmix=0.0, data=/content/malaria_yolo/malaria.yaml, degrees=0.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=150, erasing=0.4, exist_ok=Fa

ultralytics.utils.metrics.DetMetrics object with attributes:

ap_class_index: array([0, 1, 2, 3, 4, 5])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x7ed2350e6930>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)']
curves_results: [[array([          0,    0.001001,    0.002002,    0.003003,    0.004004,    0.005005,    0.006006,    0.007007,    0.008008,    0.009009,     0.01001,    0.011011,    0.012012,    0.013013,    0.014014,    0.015015,    0.016016,    0.017017,    0.018018,    0.019019,     0.02002,    0.021021,    0.022022,    0.023023,
          0.024024,    0.025025,    0.026026,    0.027027,    0.028028,    0.029029,     0.03003,    0.031031,    0.032032,    0.033033,    0.034034,    0.035035,    0.036036,    0.037037,    0.038038,    0.039039,     0.04004,    0.041041,    0.042042,    0.043043,    0.044044,    0.045045,    0.046046,    0.047047,
     

# 9) Validate best checkpoint

In [9]:
model = YOLO("/content/runs/detect/train_bbbc041_v8m/weights/best.pt")
model.val(data="/content/malaria_yolo/malaria.yaml")

Ultralytics 8.3.247 🚀 Python-3.12.12 torch-2.9.0+cu126 CUDA:0 (Tesla T4, 15095MiB)
Model summary (fused): 92 layers, 25,843,234 parameters, 0 gradients, 78.7 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 3496.8±598.1 MB/s, size: 1896.6 KB)
[K[34m[1mval: [0mScanning /content/malaria_yolo/labels/val.cache... 242 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 242/242 451.7Kit/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 16/16 1.7s/it 27.0s
                   all        242      16179      0.828       0.74      0.824      0.699
        red_blood_cell        242      15676      0.984      0.983      0.994      0.858
                  ring         42         65       0.87      0.692      0.784      0.701
           trophozoite        124        352      0.803      0.869      0.885      0.716
              schizont         33         36      0.561      0.583      0.674      0.559
        

ultralytics.utils.metrics.DetMetrics object with attributes:

ap_class_index: array([0, 1, 2, 3, 4, 5])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x7ed1c3b12ba0>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)']
curves_results: [[array([          0,    0.001001,    0.002002,    0.003003,    0.004004,    0.005005,    0.006006,    0.007007,    0.008008,    0.009009,     0.01001,    0.011011,    0.012012,    0.013013,    0.014014,    0.015015,    0.016016,    0.017017,    0.018018,    0.019019,     0.02002,    0.021021,    0.022022,    0.023023,
          0.024024,    0.025025,    0.026026,    0.027027,    0.028028,    0.029029,     0.03003,    0.031031,    0.032032,    0.033033,    0.034034,    0.035035,    0.036036,    0.037037,    0.038038,    0.039039,     0.04004,    0.041041,    0.042042,    0.043043,    0.044044,    0.045045,    0.046046,    0.047047,
     

# 10) (Optional) Run inference on val set and count detections

In [10]:
results = model("/content/malaria_yolo/images/val/*.png", conf=0.25)
counts = {}
for r in results:
    for c in r.boxes.cls:
        c = int(c)
        counts[c] = counts.get(c, 0) + 1
print("Detection counts per class id:", counts)


image 1/242 /content/malaria_yolo/images/val/003a89b0-a095-417a-8dd6-f408339bbc68.png: 480x640 87 red_blood_cells, 2 schizonts, 50.7ms
image 2/242 /content/malaria_yolo/images/val/003d1cdc-0aec-430b-90e0-f4d82135b1ac.png: 480x640 29 red_blood_cells, 27.6ms
image 3/242 /content/malaria_yolo/images/val/00c8364b-8c85-4502-bcfe-64736fe76815.png: 480x640 67 red_blood_cells, 2 rings, 27.6ms
image 4/242 /content/malaria_yolo/images/val/036404d1-b9fc-497b-bbe1-f4d9c5ac3ce0.png: 480x640 78 red_blood_cells, 4 rings, 27.6ms
image 5/242 /content/malaria_yolo/images/val/036e008f-07a1-4e92-899d-1822dc390ccb.png: 480x640 55 red_blood_cells, 2 trophozoites, 27.7ms
image 6/242 /content/malaria_yolo/images/val/03cbfcf2-8da6-462c-b30d-16d63357371f.png: 480x640 74 red_blood_cells, 4 trophozoites, 27.6ms
image 7/242 /content/malaria_yolo/images/val/04018590-4e66-4cb9-ad2d-83570f1b29ae.png: 480x640 57 red_blood_cells, 1 schizont, 27.6ms
image 8/242 /content/malaria_yolo/images/val/04de3a31-fc09-40be-9c2c-9