In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
from scipy.sparse import csr_matrix

from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
fpath = "/scratch/indikar_root/indikar1/cstansbu/hematokytos/merged_anndata/merged_adata.h5ad"

adata = sc.read_h5ad(fpath)
adata

AnnData object with n_obs × n_vars = 171498 × 18867
    obs: 'n_genes', 'dataset', 'n_genes_by_counts', 'total_counts'
    var: 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'gene_id', 'token_id', 'gene_biotype', 'Chromosome', 'Start', 'End'
    layers: 'counts'

# merge the cell_type annotations

In [3]:
fpath = "/scratch/indikar_root/indikar1/cstansbu/hematokytos/annotation/cell_types.csv"
df = pd.read_csv(fpath)
print(f"{df.shape=}")
df['cell_id'] = df['obs_index'] + "_" + df["dataset"]
df = df.set_index('cell_id')
df.head()

df.shape=(174440, 4)


Unnamed: 0_level_0,obs_index,cell_type,dataset,standard_cell_type
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PreBNK_AGTTGAAC-TTGCATAT_1_pellin,PreBNK_AGTTGAAC-TTGCATAT_1,PreBNK,pellin,PreBNK
PreBNK_AATCCGGC-TGAAATGA_1_pellin,PreBNK_AATCCGGC-TGAAATGA_1,PreBNK,pellin,PreBNK
PreBNK_CAAACATT-TCTGTGGT_1_pellin,PreBNK_CAAACATT-TCTGTGGT_1,PreBNK,pellin,PreBNK
PreBNK_AATCGAAG-AGTGAAAG_1_pellin,PreBNK_AATCGAAG-AGTGAAAG_1,PreBNK,pellin,PreBNK
PreBNK_CGTGTACA-TTCCAGAC_1_pellin,PreBNK_CGTGTACA-TTCCAGAC_1,PreBNK,pellin,PreBNK


In [4]:
adata.obs = pd.merge(
    adata.obs,
    df,
    how='left',
    left_index=True,
    right_index=True,
)
adata.obs.head()

Unnamed: 0,n_genes,dataset_x,n_genes_by_counts,total_counts,obs_index,cell_type,dataset_y,standard_cell_type
PreBNK_AGTTGAAC-TTGCATAT_1_pellin,2637,pellin,2633,13711.738192,PreBNK_AGTTGAAC-TTGCATAT_1,PreBNK,pellin,PreBNK
PreBNK_AATCCGGC-TGAAATGA_1_pellin,1144,pellin,1142,7140.276133,PreBNK_AATCCGGC-TGAAATGA_1,PreBNK,pellin,PreBNK
PreBNK_CAAACATT-TCTGTGGT_1_pellin,877,pellin,873,5812.4095,PreBNK_CAAACATT-TCTGTGGT_1,PreBNK,pellin,PreBNK
PreBNK_CGTGTACA-TTCCAGAC_1_pellin,2233,pellin,2232,11925.223895,PreBNK_CGTGTACA-TTCCAGAC_1,PreBNK,pellin,PreBNK
PreBNK_CATGACGA-CTTACGGG_1_pellin,951,pellin,948,6039.884097,PreBNK_CATGACGA-CTTACGGG_1,PreBNK,pellin,PreBNK


In [5]:
def tokenize(adata, chunk_size=1000, max_tokens=2048, pad_token=0):
    """
    Tokenizes anndata objects for GeneFormer processing.

    Args:
        adata: Anndata object containing gene expression data.
        chunk_size: Number of cells to process at once (for memory efficiency).
        max_tokens: Maximum number of tokens to include per cell.

    Assumptions:
        1. adata.X values are normalized appropriately.
        2. Genes are already subset to the GeneFormer corpus with token IDs in adata.var.
        3. all needed annotations are merged into adata.obs 

    Returns:
        DataFrame with 'cell_id', 'input_ids', 'length', and 'total_length' columns.
    """

    token_ids = adata.var['token_id'].values
    result = []

    total_chunks = adata.shape[0] // chunk_size + 1  # Calculate total chunks for progress

    for i, start in enumerate(range(0, adata.shape[0], chunk_size)):
        end = start + chunk_size
        adata_chunk = adata[start:end, :]

        X = adata_chunk.to_df()
        X.columns = token_ids
    
        for cell_id, row in X.iterrows():
            ranks = row[row > 0].rank(method='first').astype(int).sort_values()
            input_ids = ranks.head(max_tokens).index.to_list()

            # Pad with pad_token if necessary
            padding_length = max_tokens - len(input_ids)
            if padding_length > 0:
                input_ids += [pad_token] * padding_length
            
            # get obs metadata
            new_row = adata_chunk.obs.loc[cell_id, ].to_dict()
            
            new_row['cell_id'] = cell_id
            new_row['input_ids'] = input_ids
            new_row['length'] = len(input_ids)
            new_row['total_length'] = len(ranks)
            result.append(new_row)
            
        # Print progress update
        if (i + 1) % 10 == 0 or i + 1 == total_chunks:  # Print every 10 chunks or on the last chunk
            print(f"Processed {min(end, adata.shape[0])} out of {adata.shape[0]} cells ({(i + 1) / total_chunks:.1%} complete)")
            
    result = pd.DataFrame(result)
    return result
    
result = tokenize(adata)
result.head()

Processed 10000 out of 171498 cells (5.8% complete)
Processed 20000 out of 171498 cells (11.6% complete)
Processed 30000 out of 171498 cells (17.4% complete)
Processed 40000 out of 171498 cells (23.3% complete)
Processed 50000 out of 171498 cells (29.1% complete)
Processed 60000 out of 171498 cells (34.9% complete)
Processed 70000 out of 171498 cells (40.7% complete)
Processed 80000 out of 171498 cells (46.5% complete)
Processed 90000 out of 171498 cells (52.3% complete)
Processed 100000 out of 171498 cells (58.1% complete)
Processed 110000 out of 171498 cells (64.0% complete)
Processed 120000 out of 171498 cells (69.8% complete)
Processed 130000 out of 171498 cells (75.6% complete)
Processed 140000 out of 171498 cells (81.4% complete)
Processed 150000 out of 171498 cells (87.2% complete)
Processed 160000 out of 171498 cells (93.0% complete)
Processed 170000 out of 171498 cells (98.8% complete)
Processed 171498 out of 171498 cells (100.0% complete)


Unnamed: 0,n_genes,dataset_x,n_genes_by_counts,total_counts,obs_index,cell_type,dataset_y,standard_cell_type,cell_id,input_ids,length,total_length
0,2637,pellin,2633,13711.738192,PreBNK_AGTTGAAC-TTGCATAT_1,PreBNK,pellin,PreBNK,PreBNK_AGTTGAAC-TTGCATAT_1_pellin,"[2010, 1544, 3397, 3593, 4718, 4928, 11206, 84...",2048,2633
1,1144,pellin,1142,7140.276133,PreBNK_AATCCGGC-TGAAATGA_1,PreBNK,pellin,PreBNK,PreBNK_AATCCGGC-TGAAATGA_1_pellin,"[1544, 17810, 99, 7633, 1714, 7556, 4354, 4917...",2048,1142
2,877,pellin,873,5812.4095,PreBNK_CAAACATT-TCTGTGGT_1,PreBNK,pellin,PreBNK,PreBNK_CAAACATT-TCTGTGGT_1_pellin,"[3593, 4354, 6615, 1725, 13420, 2438, 1985, 33...",2048,873
3,2233,pellin,2232,11925.223895,PreBNK_CGTGTACA-TTCCAGAC_1,PreBNK,pellin,PreBNK,PreBNK_CGTGTACA-TTCCAGAC_1_pellin,"[3397, 4316, 10371, 6485, 4318, 7947, 11992, 1...",2048,2232
4,951,pellin,948,6039.884097,PreBNK_CATGACGA-CTTACGGG_1,PreBNK,pellin,PreBNK,PreBNK_CATGACGA-CTTACGGG_1_pellin,"[2010, 500, 5036, 10317, 1415, 1178, 6484, 755...",2048,948


In [6]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
A = pd.DataFrame([10, 5, 4, 3, 2, 1, 0, 0, 0, 0])

ranks = A.rank(ascending=False, method='max')
ranks


In [None]:
ranks = pdf.rank(axis=1, ascending=False, method='first').astype(int)
ranks = ranks - 1
ranks.head()

In [None]:
token_ids = adata.var['token_id'].values

In [None]:
input_list = ranks.apply(lambda x: list(token_ids[x])[:2048], axis=1)
input_list.head()

In [None]:
break

In [None]:
np.max(ranks.head(1).values)

In [None]:
break

In [None]:
ranks = df.rank(axis=1, ascending=False)
ranks.head()