In [2]:
!pip install -U transformers

Collecting transformers
  Downloading transformers-4.54.0-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.1-py3-none-any.whl.metadata (14 kB)
Downloading transformers-4.54.0-py3-none-any.whl (11.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m122.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.34.1-py3-none-any.whl (558 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.8/558.8 kB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface-hub, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.33.4
    Uninstalling huggingface-hub-0.33.4:
      Successfully uninstalled huggingface-hub-0.33.4
  Attempting uninstall: transfor

## Local Inference on GPU
Model page: https://huggingface.co/theaiinstitute/theia-small-patch16-224-cdiv

⚠️ If the generated code snippets do not work, please open an issue on the [model repo](https://huggingface.co/theaiinstitute/theia-small-patch16-224-cdiv) and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

In [15]:
# Install these if you haven’t already:
# !pip install transformers torch torchvision pillow tqdm

import os
from PIL import Image
from tqdm import tqdm
import numpy as np
import torch
from torchvision import transforms
from transformers import AutoModel

# --- 1) CONFIGURATION ---
model_id = "theaiinstitute/theia-small-patch16-224-cdiv"
image_folder = "/content/data"            # 👈 your images here
output_folder = "/content/theia_features" # 👈 where to save .npy files
image_extensions = ('.png', '.jpg', '.jpeg')
os.makedirs(output_folder, exist_ok=True)

# --- 2) TRANSFORMS: resize to 224x224, then convert to a float tensor in [0,1] ---
# NOTE(review): the checkpoint loads its own image processor when it is
# instantiated. Confirm whether its forward pass expects [0,1] floats (as
# produced here) or raw uint8 [0,255] pixels; if it rescales internally, the
# ToTensor scaling below would be applied twice.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# --- 3) LOAD THEIA MODEL ---
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# --- 4) PROCESS EACH IMAGE ---
# Filter before handing the list to tqdm so the progress bar counts images
# only, not every entry in the folder.
image_names = [
    name for name in sorted(os.listdir(image_folder))
    if name.lower().endswith(image_extensions)
]

for fname in tqdm(image_names, desc="Extracting Theia features"):
    # load + preprocess; the context manager closes the file handle right away
    # (convert() returns an independent in-memory copy).
    with Image.open(os.path.join(image_folder, fname)) as raw_img:
        img = raw_img.convert("RGB")
    inp = transform(img).unsqueeze(0)  # shape (1,3,224,224)

    # forward
    with torch.no_grad():
        out = model(x=inp)

    # pick the very first tensor returned (dict→first value, tuple→[0], tensor itself)
    if isinstance(out, dict):
        tokens = next(iter(out.values()))
    elif isinstance(out, (tuple, list)):
        tokens = out[0]
    elif isinstance(out, torch.Tensor):
        tokens = out
    else:
        raise TypeError(f"Unexpected output type: {type(out)}")

    # to numpy, strip ONLY the batch dim: a bare squeeze() would also collapse
    # any other size-1 axis and silently change the saved array's rank.
    arr = tokens.squeeze(0).cpu().numpy()      # e.g. (197,384)
    cls = arr[0] if arr.ndim == 2 else None    # first token

    # save: <name>_all.npy holds every token, <name>_cls.npy just the first one
    base = os.path.splitext(fname)[0]
    np.save(os.path.join(output_folder, f"{base}_all.npy"), arr)
    if cls is not None:
        np.save(os.path.join(output_folder, f"{base}_cls.npy"), cls)

print(f"\n✅ Done! Features saved to: {output_folder}")


Some weights of ViTModel were not initialized from the model checkpoint at facebook/deit-small-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Extracting Theia features: 100%|██████████| 50/50 [00:13<00:00,  3.60it/s]


✅ Done! Features saved to: /content/theia_features





In [17]:
!pip install stable_baselines3

Collecting stable_baselines3
  Downloading stable_baselines3-2.7.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (

In [2]:
!pip install gymnasium-robotics mujoco glfw pyopengl


Collecting gymnasium-robotics
  Downloading gymnasium_robotics-1.4.1-py3-none-any.whl.metadata (8.9 kB)
Collecting mujoco
  Downloading mujoco-3.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting glfw
  Downloading glfw-2.9.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Collecting PettingZoo>=1.23.0 (from gymnasium-robotics)
  Downloading pettingzoo-1.25.0-py3-none-any.whl.metadata (8.9 kB)
Downloading gymnasium_robotics-1.4.1-py3-none-any.whl (26.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.2/26.2 MB[0m [31m86.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading mujoco-3.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB

In [13]:

!apt-get update -qq && apt-get install -y -qq xvfb ffmpeg libosmesa6-dev libgl1-mesa-glx
!pip install stable-baselines3 gymnasium gymnasium-robotics mujoco glfw pyopengl torch torchvision transformers tqdm pyvirtualdisplay

import os
# 2) Headless setup for MuJoCo rendering
os.environ['MUJOCO_GL'] = 'osmesa'
from pyvirtualdisplay import Display
Display(visible=0, size=(1400, 900))

import gymnasium as gym
import gymnasium_robotics   # registers Fetch envs
import torch
from torchvision import transforms
from transformers import AutoModel
from stable_baselines3 import PPO
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage, VecNormalize
from stable_baselines3.common.monitor import Monitor
import numpy as np

# --- 3) Wrapper that makes obs = raw RGB frames ---
class RenderEnv(gym.Wrapper):
    """Wrapper whose observations are the wrapped env's rendered frames.

    The wrapped environment's own observations are discarded; every ``reset``
    and ``step`` returns ``env.render()`` instead. The observation space is a
    uint8 ``Box`` in [0, 255] with the shape of one rendered frame.

    Raises:
        ValueError: if ``env.render()`` returns ``None`` at construction time
            (e.g. the env was not created with an array-producing render mode).
    """

    def __init__(self, env):
        super().__init__(env)
        # Initialize so render() is legal, then probe one frame for its shape.
        self.env.reset()
        frame = self.env.render()
        if frame is None:
            raise ValueError(
                "env.render() returned None; create the environment with "
                "render_mode='rgb_array' so frames can be used as observations."
            )
        self.observation_space = gym.spaces.Box(0, 255, shape=frame.shape, dtype=np.uint8)
        self.action_space = env.action_space

    def reset(self, **kwargs):
        """Reset the wrapped env and return ``(rendered_frame, info)``."""
        # Keep the wrapped env's info dict instead of replacing it with {}.
        _, info = self.env.reset(**kwargs)
        return self.env.render(), info

    def step(self, action):
        """Step the wrapped env, swapping its observation for a rendered frame."""
        _, reward, terminated, truncated, info = self.env.step(action)
        return self.env.render(), reward, terminated, truncated, info

# --- 4) Theia feature extractor returning CLS token embeddings ---
class TheiaExtractor(BaseFeaturesExtractor):
    """SB3 features extractor that embeds image observations with a frozen Theia model.

    Args:
        obs_space: observation space handed over by the policy.
        model_id: Hugging Face id of the Theia checkpoint to load.

    ``features_dim`` is fixed at 384. This is assumed to match the token width
    of the default "small" checkpoint; confirm before using another model_id.

    NOTE(review): the backbone is fed float images in [0, 1] resized to
    224x224, as the original code intended. Confirm against the checkpoint's
    remote code whether it expects this range or raw uint8 [0, 255] pixels.
    """

    def __init__(self, obs_space, model_id="theaiinstitute/theia-small-patch16-224-cdiv"):
        super().__init__(obs_space, features_dim=384)
        self.theia = AutoModel.from_pretrained(model_id, trust_remote_code=True).eval()
        # forward() never builds a graph through the backbone, so mark its
        # weights as frozen rather than leaving them flagged as trainable.
        self.theia.requires_grad_(False)
        # Resize only: ToTensor accepts PIL images / ndarrays and raises
        # TypeError on torch.Tensor input, which is what the policy passes in.
        self.transform = transforms.Compose([
            transforms.Resize((224, 224), antialias=True),
        ])

    def train(self, mode: bool = True):
        """Switch train/eval mode while keeping the frozen backbone in eval mode."""
        super().train(mode)
        self.theia.eval()
        return self

    def forward(self, obs: torch.Tensor) -> torch.Tensor:
        """Return the first output token per image, shape [B, features_dim].

        ``obs`` is a 4-D image batch, either channel-first [B, C, H, W] or
        channel-last [B, H, W, C], valued in [0, 255] or already in [0, 1].
        """
        imgs = obs.to(torch.float32)
        # Accept either layout: permute only when the channel axis is last.
        if imgs.shape[1] not in (1, 3) and imgs.shape[-1] in (1, 3):
            imgs = imgs.permute(0, 3, 1, 2)        # → [B, C, H, W]
        # The policy may already have scaled uint8 images to [0, 1]; rescale
        # only when the values are still on the 0–255 scale.
        if imgs.numel() > 0 and imgs.max() > 1.0:
            imgs = imgs / 255.0
        batch = self.transform(imgs)               # → [B, C, 224, 224]
        with torch.no_grad():
            out = self.theia(x=batch)
        # extract first returned tensor (token embeddings)
        if isinstance(out, dict):
            tokens = next(iter(out.values()))
        elif isinstance(out, (tuple, list)):
            tokens = out[0]
        else:
            tokens = out
        # first token of every sequence → [B, 384]
        return tokens[:, 0, :]

# --- 5) Environment factory using FetchReach-v4 with RenderEnv ---
def make_env():
    """Build one monitored FetchReach-v4 env whose observations are rendered frames."""
    return Monitor(RenderEnv(gym.make("FetchReach-v4", render_mode="rgb_array")))

# --- 6) Vectorize & normalize ---
venv = DummyVecEnv([make_env])
venv = VecTransposeImage(venv)                   # HWC → CHW
# Normalize rewards only. Observation normalization would standardize the raw
# pixels with running mean/std statistics, so the pretrained encoder would no
# longer receive image-valued input.
venv = VecNormalize(venv, norm_obs=False, norm_reward=True)

# --- 7) PPO + Theia setup with CnnPolicy ---
policy_kwargs = dict(
    features_extractor_class=TheiaExtractor,
    features_extractor_kwargs=dict(model_id="theaiinstitute/theia-small-patch16-224-cdiv")
)
model = PPO(
    "CnnPolicy",
    venv,
    policy_kwargs=policy_kwargs,
    verbose=1,
    tensorboard_log="./ppo_theia_tensorboard/"
)

# --- 8) Train ---
model.learn(total_timesteps=200_000)

# --- 9) Save & quick eval ---
model.save("ppo_fetchreach_theia")

# Freeze the normalization statistics so the evaluation rollout does not keep
# updating them, and stop normalizing rewards during evaluation.
venv.training = False
venv.norm_reward = False

# SB3 VecEnv API (unlike a plain Gymnasium env): reset() returns only the
# observations, and step() returns the 4-tuple (obs, rewards, dones, infos).
# Finished sub-environments are reset automatically inside step(), so no
# manual reset is needed in the loop.
obs = venv.reset()
for _ in range(200):
    action, _ = model.predict(obs, deterministic=True)
    obs, _, _, _ = venv.step(action)

print("✅ Done! Model saved as ppo_fetchreach_theia.zip")


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


FatalError: gladLoadGL error

In [18]:
# 1) Zip the features folder
!zip -r theia_features.zip /content/theia_features

# 2) Download the zip to your local machine
from google.colab import files
files.download('theia_features.zip')


  adding: content/theia_features/ (stored 0%)
  adding: content/theia_features/ep004_step009_cls.npy (deflated 7%)
  adding: content/theia_features/ep001_step000_cls.npy (deflated 7%)
  adding: content/theia_features/ep000_step007_all.npy (deflated 7%)
  adding: content/theia_features/ep000_step003_cls.npy (deflated 7%)
  adding: content/theia_features/ep003_step000_cls.npy (deflated 7%)
  adding: content/theia_features/ep002_step004_all.npy (deflated 7%)
  adding: content/theia_features/ep004_step002_cls.npy (deflated 7%)
  adding: content/theia_features/ep000_step009_all.npy (deflated 7%)
  adding: content/theia_features/ep000_step001_all.npy (deflated 7%)
  adding: content/theia_features/ep002_step003_cls.npy (deflated 7%)
  adding: content/theia_features/ep004_step009_all.npy (deflated 7%)
  adding: content/theia_features/ep002_step004_cls.npy (deflated 7%)
  adding: content/theia_features/ep001_step006_all.npy (deflated 7%)
  adding: content/theia_features/ep000_step002_all.npy (d

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>