# Repeat from colab jupyter

In [1]:
# The Dockerfile already installs the `state` CLI.
# It is installed with `uv tool install`, which creates an isolated environment
# that may not persist properly in the Docker container; this is the suspected
# cause of the missing wandb config files.
! state --help

usage: state [-h] {emb,tx} ...

positional arguments:
  {emb,tx}

options:
  -h, --help  show this help message and exit


# Clone the repository

In [2]:
# Clone the repository and make the checkout the working directory
! git clone https://github.com/ArcInstitute/state.git
%cd state

# Colab-specific config for pytorch lightning:
# select matplotlib's Agg backend through the environment
import os
os.environ['MPLBACKEND'] = 'Agg'

Cloning into 'state'...
remote: Enumerating objects: 6345, done.[K
remote: Counting objects: 100% (505/505), done.[K
remote: Compressing objects: 100% (209/209), done.[K90/209)[K
remote: Total 6345 (delta 367), reused 345 (delta 274), pack-reused 5840 (from 2)[K
Receiving objects: 100% (6345/6345), 122.69 MiB | 17.25 MiB/s, done.
Resolving deltas: 100% (3866/3866), done.
/workspace/state


# Download the Replogle-Nadig training dataset.

In [54]:
! mkdir training_dataset/

In [55]:
%pip install requests tqdm

import requests
from tqdm.auto import tqdm  # picks the best bar for the environment

url = "https://storage.googleapis.com/vcc_data_prod/datasets/state/competition_support_set.zip"
output_path = "/workspace/training_dataset/competition_support_set.zip"

# stream the download so we can track progress
response = requests.get(url, stream=True)
total = int(response.headers.get("content-length", 0))

with open(output_path, "wb") as f, tqdm(
    total=total, unit='B', unit_scale=True, desc="Downloading"
) as bar:
    for chunk in response.iter_content(chunk_size=8192):
        if not chunk:
            break
        f.write(chunk)
        bar.update(len(chunk))

[0mNote: you may need to restart the kernel to use updated packages.


Downloading:   0%|          | 0.00/6.69G [00:00<?, ?B/s]

In [56]:
import os
from zipfile import ZipFile

from tqdm.auto import tqdm

# Extraction target; created up front so extract() has a place to write.
out_dir = "/workspace/training_dataset/competition_support_set"
os.makedirs(out_dir, exist_ok=True)

# Extract the archive one member at a time so tqdm can report per-file progress.
# `output_path` is the zip location defined by the download cell.
with ZipFile(output_path, 'r') as archive:
    entries = archive.infolist()
    for entry in tqdm(entries, desc="Unzipping", unit="file"):
        archive.extract(entry, out_dir)

Unzipping:   0%|          | 0/10 [00:00<?, ?file/s]

# Set Weights and Biases Entity for tracking

In [58]:
# @title Set Weights and Biases Entity for tracking
entity = "arcinstitute" # @param {"type":"string","placeholder":"arcinstitute"}
! sed -i 's|entity: your_entity_name|entity: ${entity}|g' state/src/state/configs/wandb/default.yaml

# Training

### State TX Training Command
# This sets up training for State across datasets, using ESM2 featurizations
# of genes as the perturbation embeddings. Note that we are now generalizing
# across both contexts and perturbations (not just contexts).
# Launch training with key=value overrides: data files come from the unzipped
# support set, the run is capped at 400 steps with a checkpoint every 200 steps,
# and output_dir/name select where the run is written
# (presumably /workspace/colab_competition/first_run - TODO confirm).
! state tx train \
  data.kwargs.toml_config_path="/workspace/training_dataset/competition_support_set/starter.toml" \
  data.kwargs.num_workers=4 \
  data.kwargs.batch_col="batch_var" \
  data.kwargs.pert_col="target_gene" \
  data.kwargs.cell_type_key="cell_type" \
  data.kwargs.control_pert="non-targeting" \
  data.kwargs.perturbation_features_file="/workspace/training_dataset/competition_support_set/ESM2_pert_features.pt" \
  training.max_steps=400 \
  training.ckpt_every_n_steps=200 \
  model=state_sm \
  wandb.tags="[first_run]" \
  output_dir="/workspace/colab_competition/" \
  name="first_run" data.kwargs.embed_key=null 

In [93]:
# Run training through subprocess so the interactive wandb prompt can be
# answered by sending "3" on stdin.
import subprocess
import os

# Set environment variables
env = os.environ.copy()
env['MPLBACKEND'] = 'Agg'
# Prepend the checkout's sources. Only join with ':' when PYTHONPATH already
# has a value, so no empty path entry is left behind by a trailing ':'.
existing_pythonpath = env.get('PYTHONPATH', '')
env['PYTHONPATH'] = 'state/src:' + existing_pythonpath if existing_pythonpath else 'state/src'

# Run with input; stderr is merged into stdout so the log stays in order.
process = subprocess.Popen([
    'state', 'tx', 'train',
    'data.kwargs.toml_config_path=/workspace/training_dataset/competition_support_set/starter.toml',
    'data.kwargs.num_workers=4',
    'data.kwargs.batch_col=batch_var',
    'data.kwargs.pert_col=target_gene',
    'data.kwargs.cell_type_key=cell_type',
    'data.kwargs.control_pert=non-targeting',
    'data.kwargs.perturbation_features_file=/workspace/training_dataset/competition_support_set/ESM2_pert_features.pt',
    'data.kwargs.embed_key=null',
    'training.max_steps=400',
    'training.ckpt_every_n_steps=200',
    'model=state_sm',
    'wandb.tags=[first_run]',
    'output_dir=/workspace/colab_competition/',
    'name=first_run'
], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
   text=True, env=env)

# Send "3" as input
output, _ = process.communicate(input="3\n")
print(output)

# Surface failures explicitly: without this a crashed run looks like a
# successful cell because only the captured log is printed.
if process.returncode != 0:
    print(f"state tx train exited with code {process.returncode}")

Seed set to 42
/workspace/training_dataset/competition_support_set/{competition_train,k562_gwps,rpe1,jurkat,k562,hepg2}.h5
Dataset path does not exist: /workspace/training_dataset/competition_support_set/{competition_train,k562_gwps,rpe1,jurkat,k562,hepg2}.h5

Processing replogle_h1:   0%|          | 0/6 [00:00<?, ?it/s]
                                                             
Processed competition_train: 221273 train, 0 val, 0 test

Processing replogle_h1:   0%|          | 0/6 [00:00<?, ?it/s]
Processing replogle_h1:  17%|█▋        | 1/6 [00:00<00:00,  5.01it/s]No cell barcode information found in /workspace/training_dataset/competition_support_set/k562_gwps.h5. Generating generic barcodes.

                                                                     
Processed k562_gwps: 111605 train, 0 val, 0 test

Processing replogle_h1:  17%|█▋        | 1/6 [00:00<00:00,  5.01it/s]No cell barcode information found in /workspace/training_dataset/competition_support_set/rpe1.h5. Genera

In [84]:
import h5py

# Check the structure of one of your H5 files
with h5py.File('/workspace/training_dataset/competition_support_set/competition_train.h5', 'r') as f:
    print(type(f))
    print("Keys in root:", list(f.keys()))
    if 'obsm' in f:
        print("Keys in obsm:", list(f['obsm'].keys()))
    else:
        print("No 'obsm' group found")

    # Inspect X independently of obsm: previously X was only reported when
    # 'obsm' existed, and calling .keys() failed if X was not a group.
    if 'X' in f:
        x_node = f['X']
        if hasattr(x_node, 'keys'):
            print("Keys in X:", list(x_node.keys()))
        else:
            print("X is not a group:", x_node)
    else:
        print("No 'X' entry found")

<class 'h5py._hl.files.File'>
Keys in root: ['X', 'layers', 'obs', 'obsm', 'obsp', 'uns', 'var', 'varm', 'varp']
Keys in obsm: []
Keys in X: ['data', 'indices', 'indptr']


In [90]:
# Explore the input data structure: print the instance attributes of the 'X' node
with h5py.File('/workspace/training_dataset/competition_support_set/competition_train.h5', 'r') as h5_file:
    X_data = h5_file['X']
    print(vars(X_data))

{'_id': <h5py.h5g.GroupID object at 0xffff82139710>}


### 1. Solve the missing wandb config error

In [47]:
# Config directory that Hydra uses in the installed package; it does not
# contain a `wandb` folder
! ls /root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/state/configs

__init__.py  data	   model		training
config.yaml  default.yaml  state-defaults.yaml


In [39]:
# Show the wandb default config from the cloned repository
! cat  state/src/state/configs/wandb/default.yaml

# Generic wandb configuration
# Users should customize these values for their own use
entity: $arcinstitute
project: state
local_wandb_dir: ./wandb_logs
tags: [] 


In [49]:
! mkdir /root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/state/configs/wandb/

In [50]:
# wandb default solved: copy the repository's wandb default config into the
# installed package's configs/wandb/ folder
! cp state/src/state/configs/wandb/default.yaml /root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/state/configs/wandb/

In [35]:
# List Hydra's _internal package inside the arc-state tool environment
! ls /root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/hydra/_internal/

__init__.py	       config_search_path_impl.py  hydra.py
__pycache__	       core_plugins		   instantiate
callbacks.py	       defaults_list.py		   sources_registry.py
config_repository.py   grammar


### 2. Solve the model-not-found error

In [60]:
# List the model configs shipped with the installed package
# (the recorded listing has no state_sm.yaml)
! ls /root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/state/configs/model/

cellcontextmean.yaml	      replogle_llama_9657984.yaml
celltypemean.yaml	      replogle_llama_speculative.yaml
cpa.yaml		      scgpt-chemical.yaml
decoder_only.yaml	      scgpt-genetic.yaml
embedsum.yaml		      scvi.yaml
globalsimplesum.yaml	      simplesum.yaml
old_neuralot.yaml	      tahoe_best.yaml
pertsets.yaml		      tahoe_decoder_test.yaml
replogle_best.yaml	      tahoe_llama_156142032.yaml
replogle_gpt_11538696.yaml    tahoe_llama_195177264.yaml
replogle_gpt_12872384.yaml    tahoe_llama_212693232.yaml
replogle_gpt_15522108.yaml    tahoe_llama_289720032.yaml
replogle_gpt_28942368.yaml    tahoe_llama_30335984.yaml
replogle_gpt_31043724.yaml    tahoe_llama_31911552.yaml
replogle_gpt_5769512.yaml     tahoe_llama_46577712.yaml
replogle_llama_11645640.yaml  tahoe_llama_58562784.yaml
replogle_llama_21712320.yaml  tahoe_llama_60671280.yaml
replogle_llama_23290296.yaml  tahoe_llama_62089464.yaml
replogle_llama_4424880.yaml   tahoe_llama_93133848.yaml
replogle_llama_8849104.yaml


In [61]:
# Copy the state_sm model config from the cloned repository into the installed package
! cp state/src/state/configs/model/state_sm.yaml /root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/state/configs/model/

### 3. Solve the matplotlib backend issue. 
The problem is that Jupyter Lab sets the matplotlib backend to 'module://matplotlib_inline.backend_inline', but when you're running the command in a Docker container without a display, this backend isn't valid.

In [63]:
import os

# Select matplotlib's Agg backend through the environment
os.environ.update({'MPLBACKEND': 'Agg'})

### 4. Training dataset path correction

 Dataset path does not exist: /content/state/competition_support_set/{competition_train,k562_gwps,rpe1,jurkat,k562,hepg2}.h5
 cat training_dataset/competition_support_set/starter.toml 
 Dataset paths - maps dataset names to their directories

replogle_h1 = "/content/state/competition_support_set/{competition_train,k562_gwps,rpe1,jurkat,k562,hepg2}.h5"

### 5. Training tries to access an obsm key (likely an embedding key) that doesn't exist in the H5 files

In [71]:
# Confirm that obsm holds no keys
import h5py

# Open one of the H5 files read-only and report its top-level layout
with h5py.File('/workspace/training_dataset/competition_support_set/competition_train.h5', 'r') as h5_file:
    root_keys = list(h5_file.keys())
    print("Keys in root:", root_keys)
    if 'obsm' not in h5_file:
        print("No 'obsm' group found")
    else:
        obsm_keys = list(h5_file['obsm'].keys())
        print("Keys in obsm:", obsm_keys)

Keys in root: ['X', 'layers', 'obs', 'obsm', 'obsp', 'uns', 'var', 'varm', 'varp']
Keys in obsm: []


In [21]:
# to make sure the state configue files wandb exist, reinstall the sate
# solve error hydra.errors.MissingConfigException: In 'config': Could not find 'wandb/default'
! uv run state --help --link-mode=copy

usage: state [-h] {emb,tx} ...

positional arguments:
  {emb,tx}

options:
  -h, --help  show this help message and exit


In [28]:
# Confirm the state entry point exists at /root/.local/bin/state
! ls /root/.local/bin/state

/root/.local/bin/state


In [37]:
# Show the current working directory (relative paths in other cells depend on it)
!  pwd

/workspace


# Run inference on the competition validation perturbations.

In [97]:
# View the available checkpoints.
# This directory is populated as training runs; it does not exist until then.

! ls /workspace/colab_competition/first_run/checkpoints/

ls: cannot access '/workspace/colab_competition/first_run/checkpoints/': No such file or directory


In [19]:
! state tx infer \
  --output "competition/prediction.h5ad" \
  --model_dir "competition/first_run" \
  --checkpoint "competition/first_run/checkpoints/final.ckpt" \
  --adata "competition_support_set/competition_val_template.h5ad" \
  --pert_col "target_gene"

Traceback (most recent call last):
  File "/root/.local/bin/state", line 10, in <module>
    sys.exit(main())
             ^^^^^^
  File "/root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/state/__main__.py", line 68, in main
    run_tx_infer(args)
  File "/root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/state/_cli/_tx/_infer.py", line 58, in run_tx_infer
    cfg = load_config(config_path)
          ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.local/share/uv/tools/arc-state/lib/python3.11/site-packages/state/_cli/_tx/_infer.py", line 51, in load_config
    raise FileNotFoundError(f"Could not find config file: {cfg_path}")
FileNotFoundError: Could not find config file: competition/first_run/config.yaml


# Run Cell-Eval on the resulting anndata and submit your entry to the leaderboard.

In [None]:
# Install zstd, which is needed for the cell-eval prep step
! sudo apt install -y zstd

In [None]:
! tool run --from git+https://github.com/ArcInstitute/cell-eval@main cell-eval prep -i competition/prediction.h5ad -g competition_support_set/gene_names.csv

# Run Inference