## HEST-1k ##
*Data Download*

In [1]:
import os

from huggingface_hub import login
import pandas as pd

# Read the Hugging Face access token from the HF_TOKEN environment variable instead of
# hardcoding it in the notebook. When the variable is unset, None is passed and login()
# falls back to its own token prompt.
login(token=os.environ.get("HF_TOKEN"))

# HEST metadata table (one row per sample), read straight from the dataset repository.
meta_df = pd.read_csv("hf://datasets/MahmoodLab/hest/HEST_v1_1_0.csv")

DOWNLOAD_ALL = False        # True: download everything matching the patterns; False: only the folders listed below
FOLDERS = ['metadata', 'st', 'patches'] # folders fetched when DOWNLOAD_ALL is False (currently metadata/, st/ and patches/)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import datasets
import os
import zipfile
from huggingface_hub import snapshot_download
from tqdm import tqdm

def download_hest(patterns, local_dir, download_all, sample_ids=None, folders=None):
    """Download files from the ``MahmoodLab/hest`` dataset repo, then unzip any CellViT archives.

    Args:
        patterns: Glob patterns handed to ``snapshot_download`` as ``allow_patterns``.
            Only used when ``download_all`` is true.
        local_dir: Directory the snapshot is written to.
        download_all: If true, download everything matching ``patterns``. If false,
            download only ``<folder>/<sample id>[._]*`` for every folder/sample pair.
        sample_ids: Sample IDs for the folder-restricted download. When omitted, the
            notebook-level ``ids_to_query`` is used (the original behaviour), so that
            variable must already exist at call time.
        folders: Repo folders for the folder-restricted download. When omitted, the
            notebook-level ``FOLDERS`` is used.
    """
    repo_id = 'MahmoodLab/hest'

    if not download_all:
        # Fall back to the notebook globals only when the caller did not pass explicit values.
        if sample_ids is None:
            sample_ids = ids_to_query
        if folders is None:
            folders = FOLDERS
        allow_patterns = [
            f"{folder}/{fid}[._]*"
            for fid in sample_ids
            for folder in folders
        ]
    else:
        allow_patterns = patterns

    snapshot_download(repo_id=repo_id, allow_patterns=allow_patterns, repo_type="dataset", local_dir=local_dir)

    # Extract every zip archive found in <local_dir>/cellvit_seg next to the archive itself.
    seg_dir = os.path.join(local_dir, 'cellvit_seg')
    if os.path.exists(seg_dir):
        print('Unzipping cell vit segmentation...')
        for filename in tqdm([s for s in os.listdir(seg_dir) if s.endswith('.zip')]):
            path_zip = os.path.join(seg_dir, filename)
            with zipfile.ZipFile(path_zip, 'r') as zip_ref:
                zip_ref.extractall(seg_dir)

In [3]:
local_dir = '../../hest_data'

# Restrict the HEST metadata table to bowel samples.
is_bowel = meta_df['organ'] == 'Bowel'
meta_bw = meta_df[is_bowel]

# One glob pattern per sample ID; these are only used when DOWNLOAD_ALL is True.
ids_to_query = meta_bw['id'].values
list_patterns = [f"*{sample_id}[_.]**" for sample_id in ids_to_query]

download_hest(list_patterns, local_dir, DOWNLOAD_ALL)

Fetching 282 files: 100%|██████████| 282/282 [1:50:15<00:00, 23.46s/it]   


*Pre-processing*

In [6]:
import scanpy as sc
import json

import os

# Create the output folder up front so the write at the end of each iteration
# does not depend on the directory already existing.
os.makedirs(f"{local_dir}/processed", exist_ok=True)

# The loop variable is `sample_id` rather than `id`, so the builtin id() is not
# shadowed for the rest of the notebook session.
for sample_id in tqdm(ids_to_query):
    # AnnData
    adata = sc.read_h5ad(f"{local_dir}/st/{sample_id}.h5ad")
    adata.layers["raw"] = adata.X.copy()  # keep an untouched copy of the counts in layers["raw"]

    adata.var_names_make_unique()
    if adata.raw is not None:
        # NOTE(review): confirm that the object returned by `adata.raw` exposes
        # var_names_make_unique(); this branch is only reached when .raw is set.
        adata.raw.var_names_make_unique()

    # Per-sample metadata JSON
    with open(f"{local_dir}/metadata/{sample_id}.json") as f:
        meta = json.load(f)

    # Copy selected metadata fields into adata.obs (same value for every spot of the sample)
    organ = meta.get('organ')

    # Encode disease state: 1 for 'Tumor'/'Cancer', 0 for 'Healthy', "" for anything else.
    disease_state = meta.get('disease_state')
    if disease_state in ['Tumor', 'Cancer']:
        disease_state = 1
    elif disease_state == 'Healthy':
        disease_state = 0
    else:
        disease_state = ""

    oncotree_code = meta.get('oncotree_code')
    species = meta.get('species')

    adata.obs['sample_id'] = sample_id
    adata.obs['organ'] = organ
    adata.obs['disease_state'] = disease_state
    adata.obs['oncotree_code'] = oncotree_code
    adata.obs['species'] = species

    # Drop spots and genes whose total count is zero
    sc.pp.filter_cells(adata, min_counts=1)
    sc.pp.filter_genes(adata, min_counts=1)

    # Normalization
    sc.pp.normalize_total(adata, inplace=True)  # Normalizing to median total counts
    sc.pp.log1p(adata)  # Logarithmize the data

    # Keep only the 2000 highly variable genes
    sc.pp.highly_variable_genes(adata, n_top_genes=2000)
    adata = adata[:, adata.var['highly_variable']].copy()

    adata.write_h5ad(f"{local_dir}/processed/{sample_id}.h5ad")

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
100%|██████████| 94/94 [01:07<00:00,  1.39it/s]


## STimage-1K4M ##
*Data Download*

In [4]:
from huggingface_hub import hf_hub_download

# Fetch only the metadata CSV from the STimage-1K4M dataset repository.
stimage_meta_request = dict(
    repo_id="jiawennnn/STimage-1K4M",            # dataset repository ID
    filename="meta/meta_all_gene02122025.csv",  # path of the file inside the repository
    repo_type="dataset",
    local_dir="../stimage_data",                # local destination directory
    local_dir_use_symlinks=False,
)
local_path = hf_hub_download(**stimage_meta_request)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 4165149b-a5fc-4244-8854-1d2fd67eaa78)')' thrown while requesting GET https://huggingface.co/api/resolve-cache/datasets/jiawennnn/STimage-1K4M/1c2a89874eed9bb70606373d2b7551c9583a0131/meta%2Fmeta_all_gene02122025.csv
Retrying in 1s [Retry 1/5].


../stimage_data/meta/meta_all_gene02122025.csv


In [5]:
import pandas as pd

# Load the STimage-1K4M metadata table and display it for inspection.
meta_csv = "../stimage_data/meta/meta_all_gene02122025.csv"
meta_raw = pd.read_csv(meta_csv)
meta_raw

Unnamed: 0,slide,species,tissue,pmid,title,abstract,keywords,involve_cancer,tech,spot_num,gene_num
0,GSE144239_GSM4284316,human,skin,3257997438037084,Title 1: Multimodal Analysis of Composition an...,Abstract 1: To define the cellular composition...,Keywords 1: CRISPR screen; MIBI; intra-tumoral...,True,ST,666,17138
1,GSE144239_GSM4284317,human,skin,3257997438037084,Title 1: Multimodal Analysis of Composition an...,Abstract 1: To define the cellular composition...,Keywords 1: CRISPR screen; MIBI; intra-tumoral...,True,ST,646,17344
2,GSE144239_GSM4284318,human,skin,3257997438037084,Title 1: Multimodal Analysis of Composition an...,Abstract 1: To define the cellular composition...,Keywords 1: CRISPR screen; MIBI; intra-tumoral...,True,ST,638,17883
3,GSE144239_GSM4284319,human,skin,3257997438037084,Title 1: Multimodal Analysis of Composition an...,Abstract 1: To define the cellular composition...,Keywords 1: CRISPR screen; MIBI; intra-tumoral...,True,ST,590,16959
4,GSE144239_GSM4284320,human,skin,3257997438037084,Title 1: Multimodal Analysis of Composition an...,Abstract 1: To define the cellular composition...,Keywords 1: CRISPR screen; MIBI; intra-tumoral...,True,ST,521,17689
...,...,...,...,...,...,...,...,...,...,...,...
1144,Mouse_OlfactoryBulb_10X_03242022_Visium,mouse,olfactory bulb,,,,,False,Visium,1185,32285
1145,Human_Colon_10X_03252024_VisiumHD,human,colon,,,,,False,VisiumHD,545913,18085
1146,Human_Lung_10X_03292024_VisiumHD,human,lung,,,,,False,VisiumHD,605471,18085
1147,Mouse_Brain_10X_03292024_VIsiumHD,mouse,brain,,,,,False,VisiumHD,393543,19059


In [6]:
# Keep human slides only, restricted to the columns used by the later cells.
meta_columns = ["slide", "tissue", "pmid", "involve_cancer", "tech"]
meta = meta_raw.loc[meta_raw["species"] == "human", meta_columns]
meta

Unnamed: 0,slide,tissue,pmid,involve_cancer,tech
0,GSE144239_GSM4284316,skin,3257997438037084,True,ST
1,GSE144239_GSM4284317,skin,3257997438037084,True,ST
2,GSE144239_GSM4284318,skin,3257997438037084,True,ST
3,GSE144239_GSM4284319,skin,3257997438037084,True,ST
4,GSE144239_GSM4284320,skin,3257997438037084,True,ST
...,...,...,...,...,...
1114,Human_Prostate_Erickson_08102022_Visium_Patien...,prostate,35948708,True,Visium
1115,Human_Prostate_Erickson_08102022_Visium_Patien...,prostate,35948708,True,Visium
1116,Human_Prostate_Erickson_08102022_Visium_Patien...,prostate,35948708,True,Visium
1145,Human_Colon_10X_03252024_VisiumHD,colon,,False,VisiumHD


In [37]:
# Slide IDs of every kidney sample in the human metadata table.
slides = meta.loc[meta["tissue"] == "kidney", "slide"].tolist()

In [None]:
# Drop the last slide from the list.
# NOTE(review): this cell is not idempotent. Every re-run removes one more slide, so
# re-run the cell that builds `slides` before running this one again.
slides=slides[:-1]

In [15]:
from huggingface_hub import snapshot_download
from tqdm import tqdm

# Download, one slide at a time, the three files the conversion step needs.
for slide in tqdm(slides):
    # Technology name of this slide; it is the top-level folder in the repository.
    tech = meta.loc[meta["slide"] == slide, "tech"].iloc[0]

    wanted_files = [
        f"{tech}/coord/{slide}_coord.csv",     # spot positions within the slide
        f"{tech}/gene_exp/{slide}_count.csv",  # raw per-spot gene expression counts
        f"{tech}/image/{slide}.png",           # H&E image of the slide
    ]

    snapshot_download(
        repo_id="jiawennnn/STimage-1K4M",
        repo_type="dataset",
        local_dir="../../stimage_data",  # local destination directory
        local_dir_use_symlinks=False,
        allow_patterns=wanted_files,
    )


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.
Fetching 3 files: 100%|██████████| 3/3 [00:13<00:00,  4.48s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:10<00:00,  3.53s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:08<00:00,  2.81s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:09<00:00,  3.22s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:08<00:00,  2.72s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:07<00:00,  2.63s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:09<00:00,  3.20s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:07<00:00,  2.37s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:07<00:00,  2.35s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:09<00:00,  3.05s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:08<00:00,  2.69s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:08<00:00,  2.74s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:05<00:00,  1.88s/it]
Fetching 3 files: 100%|██████████| 3/3 [

*Converting STimage-1K4M to the HEST-1k format*

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
from pathlib import Path
from PIL import Image

local_dir = "../../stimage_data"


def _normalize_spot_index(index):
    """Rewrite spot labels ending in ``_<row>x<col>`` as zero-padded ``RRRxCCC`` barcodes.

    Labels that do not end in an ``<int>x<int>`` field are returned unchanged.
    """
    normalized = []
    for label in index:
        parts = str(label).rsplit('_', 1)
        if len(parts) == 2 and 'x' in parts[1]:
            try:
                row, col = parts[1].split('x')
                normalized.append(f"{int(row):03d}x{int(col):03d}")
                continue
            except ValueError:
                # Suffix contains an 'x' but is not two integers: keep the label as it is.
                pass
        normalized.append(label)
    return normalized


def _find_slide_file(root, tech, relative):
    """Locate a per-slide file under ``root``.

    The download cell stores files under ``<root>/<tech>/<relative>``; the flat
    ``<root>/<relative>`` layout is accepted as a fallback when only it exists.
    """
    nested = Path(root) / tech / relative
    flat = Path(root) / relative
    if nested.exists() or not flat.exists():
        return nested
    return flat


# Create the output folder once, before the first write.
(Path(local_dir) / "st").mkdir(parents=True, exist_ok=True)

for slide in tqdm(slides):

    # Technology name of the slide = top-level folder used by the download cell.
    tech_rows = meta.loc[meta["slide"] == slide, "tech"]
    tech = str(tech_rows.iloc[0]) if len(tech_rows) else ""

    coord_path = _find_slide_file(local_dir, tech, f"coord/{slide}_coord.csv")
    count_path = _find_slide_file(local_dir, tech, f"gene_exp/{slide}_count.csv")
    img_path = _find_slide_file(local_dir, tech, f"image/{slide}.png")

    # 1. Coordinate file
    coord = pd.read_csv(coord_path, index_col=0)
    coord = coord.rename(columns={'yaxis': 'Y', 'xaxis': 'X'})  # column names changed to X/Y for the HEST-style layout
    spot_diameter = coord.iloc[0]["r"] * 2  # the coord file stores a radius in column "r"

    # Rewrite the index into barcode form
    coord.index = _normalize_spot_index(coord.index)

    # 2. Count file, with the same index normalization
    count = pd.read_csv(count_path, index_col=0)
    count.index = _normalize_spot_index(count.index)

    # 3. Keep only spots present in both tables
    common_spots = count.index.intersection(coord.index)

    count = count.loc[common_spots]
    spot_xy = coord.loc[common_spots, ['X', 'Y']].values

    # 4. Build the AnnData object
    adata = sc.AnnData(count)
    adata.obsm['spatial'] = spot_xy

    # Per-spot table for obs: X is stored as the pixel column, Y as the pixel row
    spatial = pd.DataFrame(
        adata.obsm['spatial'],
        index=pd.Index(adata.obs.index, name='spot'),
        columns=['pxl_col_in_fullres', 'pxl_row_in_fullres']
    )

    # Array row/col parsed back from the "RRRxCCC" barcode; 0/0 when the label has another form
    array_rows = []
    array_cols = []
    for idx in spatial.index:
        try:
            row, col = str(idx).split('x')
            array_rows.append(int(row))
            array_cols.append(int(col))
        except ValueError:
            array_rows.append(0)
            array_cols.append(0)

    spatial['array_row'] = array_rows
    spatial['array_col'] = array_cols

    # Add to obs
    adata.obs = adata.obs.join(spatial)
    adata.obs['in_tissue'] = True

    # 6. Image: downscale each side by a factor of 10 (at least 1 pixel)
    with Image.open(img_path) as img:  # context manager closes the file handle
        img_down = img.resize((max(1, img.width//10), max(1, img.height//10)))
    img_array = np.array(img_down)

    # Add to uns; the spot diameter derived from the radius column is kept here
    # so it is not computed and then discarded.
    adata.uns['spatial'] = {
        'ST': {
           'images': {
                'downscaled_fullres': {'imgdata': img_array}
           },
           'scalefactors': {
                'spot_diameter_fullres': float(spot_diameter)
           }
        }
    }

    # 7. Save as h5ad
    adata.write_h5ad(f"{local_dir}/st/{slide}.h5ad")
    


*Pre-processing*

In [44]:
import os

# Create the output folder up front so the write at the end of each iteration
# does not depend on the directory already existing.
os.makedirs(f"{local_dir}/processed", exist_ok=True)

for slide in tqdm(slides):
    # AnnData
    adata = sc.read_h5ad(f"{local_dir}/st/{slide}.h5ad")
    adata.layers["raw"] = adata.X.copy()  # keep an untouched copy of the counts in layers["raw"]

    adata.var_names_make_unique()
    if adata.raw is not None:
        # NOTE(review): confirm that the object returned by `adata.raw` exposes
        # var_names_make_unique(); this branch is only reached when .raw is set.
        adata.raw.var_names_make_unique()

    # Metadata row of this slide -> values added to adata.obs
    slide_meta = meta[meta["slide"] == slide].iloc[0]
    organ = slide_meta["tissue"]
    species = "human"  # `meta` was filtered to human slides only

    # Encode the involve_cancer flag: 1 for True, 0 for False, "" when missing.
    # The value is read from this slide's row; `meta` is a DataFrame without a
    # 'disease_state' column, so meta.get('disease_state') would always be None.
    involve_cancer = slide_meta["involve_cancer"]
    if pd.isna(involve_cancer):
        disease_state = ""
    elif bool(involve_cancer):
        disease_state = 1
    else:
        disease_state = 0

    adata.obs['sample_id'] = slide
    adata.obs['organ'] = organ
    adata.obs['disease_state'] = disease_state
    adata.obs['species'] = species

    # Drop spots and genes whose total count is zero
    sc.pp.filter_cells(adata, min_counts=1)
    sc.pp.filter_genes(adata, min_counts=1)

    # Normalization
    sc.pp.normalize_total(adata, inplace=True)  # Normalizing to median total counts
    sc.pp.log1p(adata)  # Logarithmize the data

    # Keep only the 2000 highly variable genes
    sc.pp.highly_variable_genes(adata, n_top_genes=2000)
    adata = adata[:, adata.var['highly_variable']].copy()

    adata.write_h5ad(f"{local_dir}/processed/{slide}.h5ad")

100%|██████████| 55/55 [02:52<00:00,  3.14s/it]
