In [1]:
import pandas as pd, numpy as np
from pathlib import Path
import zarr

In [1]:
import pandas as pd

# Build a class-capped training manifest: each label keeps at most
# 3x the median class size, so over-represented classes are downsampled
# while classes at or below the cap are kept whole.
df = pd.read_csv("/hpc/group/jilab/rz179/cellpt/combined/16s/3xZoomed/combined_meta_uniform3x16s_selected_train.csv")
label_col = "cell_type"  # whatever your label column is
counts = df[label_col].value_counts()
cap = int(3 * counts.median())  # try 3, then 2 if collapse remains

# Sample each class explicitly instead of going through groupby(...).apply(...):
# newer pandas versions no longer pass the grouping column to the applied
# function, which would silently drop `label_col` from the balanced table.
# Iterating the groupby always yields full sub-frames, label column included.
df_bal = pd.concat(
    [
        group.sample(n=min(len(group), cap), random_state=42)  # fixed seed -> reproducible subset
        for _, group in df.groupby(label_col)
    ]
)
df_bal.to_csv("/hpc/group/jilab/rz179/cellpt/combined/16s/3xZoomed/train_capped.csv", index=False)


In [None]:
# 1) Turn an id of the form 'aa...-N' into the integer pair (prefix_int, suffix_int)
# Letters a..p stand for the hex digits 0..f, in that order.
_hex_from = {letter: format(value, "x") for value, letter in enumerate("abcdefghijklmnop")}
def decode_cellid(cellid: str):
    """Decode a cell id string into ``(prefix_int, suffix_int)``.

    The part before the dash is read as a base-16 number written with the
    letters a..p in place of 0..f; the part after the dash is parsed as a
    decimal integer.

    Raises:
        ValueError: if the id does not split into exactly two parts on "-",
            if the prefix contains a character outside a..p, or if either
            part cannot be parsed as an integer.
    """
    letters, number = cellid.split("-")
    # sanity: Xenium prefixes must be a..p only
    for ch in letters:
        if ch not in _hex_from:
            raise ValueError(f"Bad prefix letters in {cellid}")
    hex_digits = [_hex_from[ch] for ch in letters]
    prefix_int = int("".join(hex_digits), 16)
    suffix_int = int(number)
    return prefix_int, suffix_int

# 2) Load CSV of labels (your file has columns: cellid, celltype)
csv = "/hpc/group/jilab/hz/xenium/celltype/Xenium_human_Lung_Cancer_FFPE.csv"
df = pd.read_csv(csv)
# Normalise the column names to the ones the rest of this cell uses.
df = df.rename(columns={"cellid":"cell_id", "celltype":"cell_type"})
# Decode every id once into plain tuples and build both integer columns in a
# single step. This avoids constructing a pd.Series per row (a very slow
# apply path) and still yields two well-formed columns when the table is empty.
decoded = [decode_cellid(str(s)) for s in df["cell_id"]]
df[["prefix_int","suffix_int"]] = pd.DataFrame(
    decoded, columns=["prefix_int","suffix_int"], index=df.index
)

# 3) Open Zarr and index its /cell_id table -> row index
z = zarr.open("/path/to/your/cells.zarr.zip", mode="r")  # adjust path
cid = np.asarray(z["cell_id"][:], dtype=np.uint32)       # expected shape (N,2) -- confirm against your store
# map (prefix_int, suffix_int) -> row idx
# .tolist() converts the array to native Python ints once, so the tuple keys
# compare equal to the ints decoded from the CSV.
key2idx = {(p, s): i for i, (p, s) in enumerate(cid.tolist())}
# Pair the two columns directly instead of a row-wise apply with r[0]/r[1]:
# integer keys on a string-labelled Series only worked through a deprecated
# positional fallback. Ids missing from the Zarr table map to None (-> NaN).
df["zarr_idx"] = pd.Series(
    [key2idx.get((int(p), int(s))) for p, s in zip(df["prefix_int"], df["suffix_int"])],
    index=df.index,
)

# 4) Report coverage: how many label rows found a matching Zarr row
unmapped = df["zarr_idx"].isna()
n_total   = len(df)
n_mapped  = (~unmapped).sum()
print(f"Mapped {n_mapped}/{n_total} rows to Zarr indices")
if n_mapped < n_total:
    # Show up to ten unmatched ids with their labels to help debug the mismatch.
    missing_examples = df.loc[unmapped, ["cell_id","cell_type"]].head(10)
    print("Examples not found:", missing_examples.to_dict(orient="records"))