In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Print all data paths in nuscenes
import os

for root, dirs, files in os.walk('/content/drive/MyDrive/SOTA AI RL CV Agents DL ML App Website Cutting Edge Tech Humanitarian E2E Projects/Autonomous Driving SOTA Vision + Language Encoder -> World Model E2E with Demo/nuscenes'):
    for file in files:
        print(f"File Path: {file}")

File Path: gitattributes (2)
File Path: README (1).md
File Path: drivelm_carla_keyframes (1).txt
File Path: v1_1_val_nus_q_only (1).json
File Path: drivelm_carla_vqas (1).zip
File Path: v1_1_train_nus (1).json
File Path: drivelm_nus_imgs_val (1).zip
File Path: drivelm_nus_imgs_train (2).zip


In [3]:
!pip install ultralytics
!pip install timm

Collecting ultralytics
  Downloading ultralytics-8.3.214-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.17-py3-none-any.whl.metadata (14 kB)
Downloading ultralytics-8.3.214-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.17-py3-none-any.whl (28 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.214 ultralytics-thop-2.0.17


In [None]:
import torch
import numpy as np
from pathlib import Path
from tqdm import tqdm
from PIL import Image
import zipfile
from ultralytics import YOLO
from torchvision import models, transforms

# ============================================
# PATHS
# ============================================
TRAIN_ZIP = "/content/drive/MyDrive/SOTA AI RL CV Agents DL ML App Website Cutting Edge Tech Humanitarian E2E Projects/Autonomous Driving SOTA Vision + Language Encoder -> World Model E2E with Demo/nuscenes/drivelm_nus_imgs_train (2).zip"
OUTPUT_DIR = "/content/drive/MyDrive/extracted_features"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_SAMPLES = None

# ============================================
# 1. Load MobileNetV3 (FAST embeddings)
# ============================================
print("Loading MobileNetV3...")
mobilenet = models.mobilenet_v3_large(pretrained=True)
mobilenet.classifier = torch.nn.Identity()  # Remove classification head
mobilenet = mobilenet.to(DEVICE).eval()

preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

print("✅ MobileNetV3 loaded (1280-dim, ~5ms)")

# ============================================
# 2. Load YOLOv11-seg
# ============================================
print("Loading YOLOv11-seg...")
yolo = YOLO('yolo11n-seg.pt')
print("✅ YOLO loaded")

# ============================================
# 3. Extract images
# ============================================
print("Extracting images...")
extracted = "./temp_images"
Path(extracted).mkdir(exist_ok=True)

with zipfile.ZipFile(TRAIN_ZIP, 'r') as z:
    z.extractall(extracted)

images = list(Path(extracted).rglob("*.jpg")) + list(Path(extracted).rglob("*.png"))
if NUM_SAMPLES:
    images = images[:NUM_SAMPLES]

print(f"Found {len(images)} images")

# ============================================
# 4. Extract features (FAST)
# ============================================
print("Extracting features...")

mobilenet_feats = []
yolo_feats = []

for img_path in tqdm(images):
    img = Image.open(img_path).convert('RGB')

    # MobileNet embedding
    img_tensor = preprocess(img).unsqueeze(0).to(DEVICE)
    with torch.no_grad():
        feat = mobilenet(img_tensor).cpu().numpy()[0]  # (1280,)
    mobilenet_feats.append(feat)

    # YOLO detection + segmentation
    results = yolo(img_path, verbose=False)
    boxes = results[0].boxes.data.cpu().numpy() if results[0].boxes else np.array([])
    masks = results[0].masks.data.cpu().numpy() if results[0].masks else np.array([])

    yolo_feat = np.zeros(256)
    for box in boxes:
        cls = int(box[5])
        if cls < 80:
            yolo_feat[cls] += 1

    if len(masks) > 0:
        yolo_feat[100] = len(masks)
        yolo_feat[101] = masks.mean()

    if len(boxes) > 0:
        yolo_feat[150] = len(boxes)
        yolo_feat[151] = boxes[:, 4].mean()

    yolo_feats.append(yolo_feat)

mobilenet_feats = np.stack(mobilenet_feats)
yolo_feats = np.stack(yolo_feats)

# ============================================
# 5. Save
# ============================================
print("Saving...")
Path(OUTPUT_DIR).mkdir(exist_ok=True, parents=True)

np.save(f"{OUTPUT_DIR}/mobilenet_features.npy", mobilenet_feats)
np.save(f"{OUTPUT_DIR}/yolo_features.npy", yolo_feats)

print(f"✅ DONE. {len(images)} samples")
print(f"   MobileNet: {mobilenet_feats.shape}")
print(f"   YOLO: {yolo_feats.shape}")



Loading MobileNetV3...
Downloading: "https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v3_large-8738ca79.pth


100%|██████████| 21.1M/21.1M [00:00<00:00, 226MB/s]


✅ MobileNetV3 loaded (1280-dim, ~5ms)
Loading YOLOv11-seg...
[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n-seg.pt to 'yolo11n-seg.pt': 100% ━━━━━━━━━━━━ 5.9MB 94.7MB/s 0.1s
✅ YOLO loaded
Extracting images...
Found 24432 images
Extracting features...


 13%|█▎        | 3145/24432 [02:34<17:37, 20.14it/s]