# Repeat of the Colab notebook in Jupyter

In [25]:
# Sanity check that the `state` CLI installed by the Dockerfile is callable.
# NOTE(review): the Dockerfile installs it with `uv tool install`, which creates an
# isolated environment that may not persist properly in the Docker container —
# a possible cause of the missing wandb config files. Confirm.
! state --help

usage: state [-h] {emb,tx} ...

positional arguments:
  {emb,tx}

options:
  -h, --help  show this help message and exit


# Clone the repository

In [2]:
# Clone the repository and make the checkout the working directory.
# NOTE(review): run this cell once per fresh kernel — re-running it from inside
# `state/` would clone a nested copy and `%cd` into that instead.
! git clone https://github.com/ArcInstitute/state.git
%cd state

# Colab-specific config for pytorch lightning
# (presumably forces matplotlib's non-interactive 'Agg' backend — verify it is
# still needed outside Colab).
import os
os.environ['MPLBACKEND'] = 'Agg'

Cloning into 'state'...
remote: Enumerating objects: 6345, done.
remote: Counting objects: 100% (505/505), done.
remote: Compressing objects: 100% (209/209), done.
remote: Total 6345 (delta 367), reused 345 (delta 274), pack-reused 5840 (from 2)
Receiving objects: 100% (6345/6345), 122.69 MiB | 17.25 MiB/s, done.
Resolving deltas: 100% (3866/3866), done.
/workspace/state


# Download the Replogle-Nadig training dataset.

In [3]:
%pip install requests tqdm

import requests
from tqdm.auto import tqdm  # picks the best bar for the environment

url = "https://storage.googleapis.com/vcc_data_prod/datasets/state/competition_support_set.zip"
output_path = "competition_support_set.zip"

# Stream the download so progress can be tracked without loading the whole
# archive into memory. The `with` block releases the connection even if the
# transfer fails part-way; the timeout keeps a stalled server from hanging the cell.
with requests.get(url, stream=True, timeout=60) as response:
    # Fail loudly on an HTTP error instead of saving the error page as the .zip.
    response.raise_for_status()
    # A missing content-length header falls back to 0 (bar without a known total).
    total = int(response.headers.get("content-length", 0))

    with open(output_path, "wb") as f, tqdm(
        total=total, unit='B', unit_scale=True, desc="Downloading"
    ) as bar:
        for chunk in response.iter_content(chunk_size=8192):
            if not chunk:
                # Skip empty chunks rather than stopping: breaking here would
                # silently truncate the archive.
                continue
            f.write(chunk)
            bar.update(len(chunk))

Note: you may need to restart the kernel to use updated packages.


Downloading:   0%|          | 0.00/6.69G [00:00<?, ?B/s]

In [4]:
from zipfile import ZipFile
from tqdm.auto import tqdm
import os

# Unpack the downloaded archive into its own folder, one member at a time,
# so the progress bar advances per extracted file.
out_dir = "competition_support_set"
os.makedirs(out_dir, exist_ok=True)

with ZipFile(output_path, 'r') as archive:
    entries = archive.infolist()
    for entry in tqdm(entries, desc="Unzipping", unit="file"):
        archive.extract(entry, out_dir)

Unzipping:   0%|          | 0/10 [00:00<?, ?file/s]

# Set Weights and Biases Entity for tracking

In [5]:
# @title Set Weights and Biases Entity for tracking
entity = "arcinstitute" # @param {"type":"string","placeholder":"arcinstitute"}
! sed -i 's|entity: your_entity_name|entity: ${entity}|g' src/state/configs/wandb/default.yaml

# Training

In [23]:
# State TX Training Command
# This sets up training for State across datasets, using ESM2 featurizations
# of genes as the perturbation embeddings. Note that we are now generalizing
# across both contexts and perturbations (not just contexts).
# Every argument below is a Hydra override (`key=value`) passed to `state tx train`.
! state tx train \
  data.kwargs.toml_config_path="competition_support_set/starter.toml" \
  data.kwargs.num_workers=4 \
  data.kwargs.batch_col="batch_var" \
  data.kwargs.pert_col="target_gene" \
  data.kwargs.cell_type_key="cell_type" \
  data.kwargs.control_pert="non-targeting" \
  data.kwargs.perturbation_features_file="competition_support_set/ESM2_pert_features.pt" \
  training.max_steps=400 \
  training.ckpt_every_n_steps=200 \
  model=state_sm \
  wandb.tags="[test]" \
  output_dir="competition" \
  name="first_run"

Traceback (most recent call last):
  File "/root/.local/bin/state", line 10, in <module>
    sys.exit(main())
             ^^^^^^
  File "/root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/state/__main__.py", line 61, in main
    cfg = load_hydra_config("tx", args.hydra_overrides)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/state/__main__.py", line 40, in load_hydra_config
    cfg = compose(config_name="config", overrides=overrides)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/hydra/compose.py", line 38, in compose
    cfg = gh.hydra.compose_config(
          ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/hydra/_internal/hydra.py", line 594, in compose_config
    cfg = self.config_loader.load_configuration(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^

In [9]:
# install wandb
! pip install wandb
import wandb

Collecting wandb
  Downloading wandb-0.21.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (10 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.44-py3-none-any.whl.metadata (13 kB)
Collecting protobuf!=4.21.0,!=5.28.0,<7,>=3.19.0 (from wandb)
  Downloading protobuf-6.31.1-cp39-abi3-manylinux2014_aarch64.whl.metadata (593 bytes)
Collecting pydantic<3 (from wandb)
  Downloading pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting sentry-sdk>=2.0.0 (from wandb)
  Downloading sentry_sdk-2.32.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting annotated-types>=0.6.0 (from pydantic<3->wandb)
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.33.2 (from pydantic<3->wandb)
  Downloading pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (6.8 kB)
Collecting typing-inspection>=0.4.0 (from pydantic<3->wandb)
  Downloading typing_inspection-0.4.1-py3-no

In [13]:
# Print the usage of `state tx train`; it accepts Hydra overrides as positional arguments.
# NOTE(review): `--cfg job` looks like a Hydra application flag; the recorded output
# shows only the argparse help, so it appears to have no effect here — confirm.
! state tx train --cfg job --help

usage: state tx train [-h] [hydra_overrides ...]

positional arguments:
  hydra_overrides  Hydra configuration overrides (e.g., data.batch_size=32)

options:
  -h, --help       show this help message and exit


In [21]:
# to make sure the state configue files wandb exist, reinstall the sate
# solve error hydra.errors.MissingConfigException: In 'config': Could not find 'wandb/default'
! uv run state --help --link-mode=copy

usage: state [-h] {emb,tx} ...

positional arguments:
  {emb,tx}

options:
  -h, --help  show this help message and exit


In [24]:
! ls file:///root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/state/configs

ls: cannot access 'file:///root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/state/configs': No such file or directory


# Run inference on the competition validation perturbations.

In [20]:
# View the available checkpoints.
# This directory is populated as training runs; it does not exist until
# `state tx train` has started writing to output_dir="competition", name="first_run".

! ls competition/first_run/checkpoints/

ls: cannot access 'competition/first_run/checkpoints/': No such file or directory


In [19]:
# Run inference on the validation template with the trained model.
# Requires a completed training run: the CLI looks for `config.yaml` inside
# --model_dir and raises FileNotFoundError when it is missing.
! state tx infer \
  --output "competition/prediction.h5ad" \
  --model_dir "competition/first_run" \
  --checkpoint "competition/first_run/checkpoints/final.ckpt" \
  --adata "competition_support_set/competition_val_template.h5ad" \
  --pert_col "target_gene"

Traceback (most recent call last):
  File "/root/.local/bin/state", line 10, in <module>
    sys.exit(main())
             ^^^^^^
  File "/root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/state/__main__.py", line 68, in main
    run_tx_infer(args)
  File "/root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/state/_cli/_tx/_infer.py", line 58, in run_tx_infer
    cfg = load_config(config_path)
          ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/state/_cli/_tx/_infer.py", line 51, in load_config
    raise FileNotFoundError(f"Could not find config file: {cfg_path}")
FileNotFoundError: Could not find config file: competition/first_run/config.yaml


# Run Cell-Eval on the resulting anndata and submit your entry to the leaderboard.

In [None]:
# Install zstd, needed by the cell-eval prep step.
# NOTE(review): the tracebacks in this notebook show paths under /root, so the
# container appears to run as root — `sudo` may be unnecessary or not installed; confirm.
! sudo apt install -y zstd

In [None]:
! tool run --from git+https://github.com/ArcInstitute/cell-eval@main cell-eval prep -i competition/prediction.h5ad -g competition_support_set/gene_names.csv