# JPIC Data Engineering

In this file Joshua changes a file made by cooper to standardize the time point and replicate naming from files made by cooper

In [1]:
import pandas as pd
import scanpy as sc
import numpy as np
import anndata as an
import re

In [2]:
coopers_data_path = "/nfs/turbo/umms-indikar/shared/projects/geneformer/data/rajapakse_lab_data.h5ad"
ad = an.read(coopers_data_path)



In [4]:
def extract_timepoint_replicate_2015(data_id):
    match = re.match(r"S(\d+)([ab])", data_id)
    if match:
        time = int(match.group(1))
        replicate = 1 if match.group(2) == 'a' else 2
        return time, replicate
    else:
        return None, None

def extract_timepoint_replicate_2018(data_id):
    match = re.match(r"(\d+)_T(\d+)R(\d+)", data_id)
    if match:
        return int(match.group(2)), int(match.group(3))
    else:
        return None, None


In [5]:
adDs5 = ad[ad.obs['dataset'] == 'chen_2015']
adDs8 = ad[ad.obs['dataset'] == 'liu_2018']

In [6]:
timepoint_replicate = adDs8.obs.index.to_series().apply(extract_timepoint_replicate_2018)
timepoint_replicate_df = timepoint_replicate.apply(pd.Series)
timepoint_replicate_df.columns = ['order', 'replicate']

# Add the new columns to the AnnData object
adDs8.obs = adDs8.obs.join(timepoint_replicate_df)
adDs8

AnnData object with n_obs × n_vars = 48 × 19393
    obs: 'dataset', 'sample_id', 'timepoint', 'hour', 'n_counts', 'control', 'order', 'replicate'
    var: 'gene_id', 'token_id', 'Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'gene_version', 'gene_source', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_source', 'transcript_biotype', 'tag', 'ccds_id', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version', 'transcript_support_level', 'ensembl_id'

In [7]:
timepoint_replicate = adDs5.obs.index.to_series().apply(extract_timepoint_replicate_2015)
timepoint_replicate_df = timepoint_replicate.apply(pd.Series)
timepoint_replicate_df.columns = ['order', 'replicate']

# Add the new columns to the AnnData object
adDs5.obs = adDs5.obs.join(timepoint_replicate_df)
adDs5

AnnData object with n_obs × n_vars = 18 × 19393
    obs: 'dataset', 'sample_id', 'timepoint', 'hour', 'n_counts', 'control', 'order', 'replicate'
    var: 'gene_id', 'token_id', 'Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'gene_version', 'gene_source', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_source', 'transcript_biotype', 'tag', 'ccds_id', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version', 'transcript_support_level', 'ensembl_id'

In [8]:
adDs5.obs

Unnamed: 0_level_0,dataset,sample_id,timepoint,hour,n_counts,control,order,replicate
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
S1a,chen_2015,S1a,0.0,0.0,7901832,True,1,1
S1b,chen_2015,S1b,0.0,0.0,8113329,True,1,2
S2a,chen_2015,S2a,0.0,0.0,9831046,False,2,1
S2b,chen_2015,S2b,0.0,0.0,10123271,False,2,2
S3a,chen_2015,S3a,1.0,8.0,10490839,False,3,1
S3b,chen_2015,S3b,1.0,8.0,10713844,False,3,2
S4a,chen_2015,S4a,2.0,16.0,9183324,False,4,1
S4b,chen_2015,S4b,2.0,16.0,9401913,False,4,2
S5a,chen_2015,S5a,3.0,24.0,9655719,False,5,1
S5b,chen_2015,S5b,3.0,24.0,9863515,False,5,2


In [9]:
adDs8.obs

Unnamed: 0_level_0,dataset,sample_id,timepoint,hour,n_counts,control,order,replicate
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
63246_T0R1,liu_2018,63246,1.0,-48.0,11940999,True,0,1
63252_T1R1,liu_2018,63252,2.0,0.0,18063509,False,1,1
63249_T2R1,liu_2018,63249,3.0,8.0,11031474,False,2,1
63261_T3R1,liu_2018,63261,1.0,16.0,16761043,False,3,1
63258_T4R1,liu_2018,63258,2.0,24.0,8244802,False,4,1
63255_T5R1,liu_2018,63255,3.0,32.0,10615057,False,5,1
63270_T6R1,liu_2018,63270,1.0,40.0,16486670,False,6,1
63267_T7R1,liu_2018,63267,2.0,48.0,10127547,False,7,1
63264_T8R1,liu_2018,63264,3.0,56.0,11231585,False,8,1
63279_T9R1,liu_2018,63279,1.0,64.0,10781978,False,9,1


In [10]:
adDs_combined = adDs5.concatenate(adDs8, join='outer', index_unique=None)

  adDs_combined = adDs5.concatenate(adDs8, join='outer', index_unique=None)


In [12]:
adDs_combined.obs

Unnamed: 0_level_0,dataset,sample_id,timepoint,hour,n_counts,control,order,replicate,batch
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
S1a,chen_2015,S1a,0.0,0.0,7901832,True,1,1,0
S1b,chen_2015,S1b,0.0,0.0,8113329,True,1,2,0
S2a,chen_2015,S2a,0.0,0.0,9831046,False,2,1,0
S2b,chen_2015,S2b,0.0,0.0,10123271,False,2,2,0
S3a,chen_2015,S3a,1.0,8.0,10490839,False,3,1,0
...,...,...,...,...,...,...,...,...,...
63275_T11R3,liu_2018,63275,3.0,80.0,13515971,False,11,3,1
63290_T12R3,liu_2018,63290,1.0,88.0,9522866,False,12,3,1
63287_T13R3,liu_2018,63287,2.0,96.0,12370157,False,13,3,1
63284_T14R3,liu_2018,63284,3.0,104.0,10970735,False,14,3,1


In [14]:
adDs_combined

AnnData object with n_obs × n_vars = 66 × 19393
    obs: 'dataset', 'sample_id', 'timepoint', 'hour', 'n_counts', 'control', 'order', 'replicate', 'batch'
    var: 'gene_id', 'token_id', 'Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'gene_version', 'gene_source', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_source', 'transcript_biotype', 'tag', 'ccds_id', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version', 'transcript_support_level', 'ensembl_id'

In [13]:
# check that the size of the remerged ann data object is compatible with the size of Cooper's
ad

AnnData object with n_obs × n_vars = 66 × 19393
    obs: 'dataset', 'sample_id', 'timepoint', 'hour', 'n_counts', 'control'
    var: 'gene_id', 'token_id', 'Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'gene_version', 'gene_source', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_source', 'transcript_biotype', 'tag', 'ccds_id', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version', 'transcript_support_level', 'ensembl_id'

In [15]:
out_path = "/nfs/turbo/umms-indikar/shared/projects/geneformer/data/rajapakse_lab_data_jpic.h5ad"
adDs_combined.write(out_path)

# Tokenize

In [1]:
import pandas as pd
import anndata as ad
import numpy as np
import h5py
import os
import pickle
import scipy.sparse as sp
from geneformer import TranscriptomeTokenizer

In [2]:
input_path = "/nfs/turbo/umms-indikar/shared/projects/geneformer/data/rajapakse_lab_data_jpic.h5ad"

In [3]:
input_dir = os.path.dirname(input_path)
output_path = "/scratch/indikar_root/indikar1/cstansbu/geneformer/"
prefix = "test"

def get_attributes(h5ad_path):
    """
    Extracts attribute names from the `.obs` field of an h5ad AnnData file,
    returning them as a dictionary with keys and values being the attribute names.

    Args:
        h5ad_path (str): The path to the h5ad file.

    Returns:
        dict: A dictionary where keys and values are the unique attribute names 
              found in the `.obs` field of the h5ad file.
    """
    with h5py.File(h5ad_path, mode="r") as store:
        attribute_names = list(store["obs"].keys())

    attribute_name_dict = {name: name for name in attribute_names}  # Create dictionary
    return attribute_name_dict
    

custom_attr_name_dict = get_attributes(input_path)
custom_attr_name_dict

{'batch': 'batch',
 'control': 'control',
 'data_id': 'data_id',
 'dataset': 'dataset',
 'hour': 'hour',
 'n_counts': 'n_counts',
 'order': 'order',
 'replicate': 'replicate',
 'sample_id': 'sample_id',
 'timepoint': 'timepoint'}

In [4]:
def load_gene_median_dict(gene_median_file):
    """
    Loads a gene median dictionary from a pickle file.

    Args:
        gene_median_file (str): Path to the pickle file containing the gene median dictionary.

    Returns:
        dict: A dictionary mapping gene IDs to their median expression values.
    """

    with open(gene_median_file, "rb") as f:
        gene_median_dict = pickle.load(f)

    return gene_median_dict


def load_gene_tokenization(token_dictionary_file):
    """
    Loads gene tokenization data from a pickle file.

    Args:
        token_dictionary_file (str): Path to the pickle file containing the gene-token dictionary.

    Returns:
        dict: Gene-token dictionary (Ensembl ID: token).
        list: List of all gene keys (Ensembl IDs).
        dict: Dictionary mapping gene keys to True (used for selecting genes later).
    """

    with open(token_dictionary_file, "rb") as f:
        gene_token_dict = pickle.load(f)

    gene_keys = list(gene_token_dict.keys())

    # Optimization: Pre-allocate the list for slight performance improvement
    genelist_dict = dict.fromkeys(gene_keys, True)

    return gene_token_dict, gene_keys, genelist_dict


def rank_genes(gene_vector, gene_tokens):
    """Ranks genes based on expression values in descending order.

    Args:
        gene_vector (numpy.ndarray): Array of gene expression values.
        gene_tokens (numpy.ndarray): Array of corresponding gene tokens.

    Returns:
        numpy.ndarray: Array of gene tokens sorted by descending expression value.
    """
    return gene_tokens[np.argsort(-gene_vector)]


def normalize_counts(adata_chunk,  counts_column='n_counts', target_sum=10000):
    """Normalizes gene expression counts within a chunk of AnnData.

    Args:
        adata_chunk (AnnData): A chunk of the AnnData object containing gene expression data.
        counts_column (str): Name of the column in `adata_chunk.obs` containing the total counts per cell.
        target_sum (float): The desired total count per cell after normalization.
        norm_factor_vector (numpy.ndarray): An array of normalization factors for each gene.

    Returns:
        scipy.sparse.csr_matrix: A sparse matrix containing the normalized gene expression counts.

    This function performs the following steps:
        1. Extracts the total counts per cell from the specified column (`counts_column`).
        2. Normalizes the gene expression matrix (`adata_chunk.X`) by dividing by the total counts 
           and multiplying by the `target_sum`.
        3. Further adjusts the normalized values by dividing by the gene-specific normalization 
           factors (`norm_factor_vector`).
        4. Returns the normalized expression matrix as a sparse CSR matrix for efficient storage 
           and computation.
    """
    
    n_counts = adata_chunk.obs[counts_column].values[:, None]  # Cell counts as column vector
    X_norm = adata_chunk.X / n_counts * target_sum / norm_factor_vector
    return sp.csr_matrix(X_norm)  # Efficient sparse representation


def tokenize_anndata(adata, genelist_dict, gene_median_dict, 
                     chunk_size=100000, target_sum=10000):
    """
    Tokenizes and ranks genes within an AnnData object, optimizing for memory efficiency.

    This function processes gene expression data in chunks, applies normalization, and ranks genes
    for each cell based on their expression levels. The resulting tokenized and ranked gene
    representations, along with cell metadata, are returned.

    Args:
        adata (AnnData): The AnnData object containing gene expression data.
        genelist_dict (dict): Dictionary mapping gene IDs to boolean values indicating relevance.
        gene_median_dict (dict): Dictionary mapping gene IDs to their median expression values.
        chunk_size (int, optional): Number of cells to process in each chunk (default: 1000).
        target_sum (int, optional): Target sum for count normalization (default: 10000).

    Returns:
        tuple: 
            - list: List of tokenized and ranked gene lists for each cell.
            - dict: Dictionary containing cell metadata (keys are metadata column names).
    """
    # Filter relevant miRNAs
    coding_miRNA_mask = np.array([genelist_dict.get(i, False) for i in adata.var['ensembl_id']])
    coding_miRNA_loc = np.where(coding_miRNA_mask)[0]

    # Extract miRNA information
    coding_miRNA_ids = adata.var['ensembl_id'][coding_miRNA_loc]
    norm_factor_vector = np.array([gene_median_dict[i] for i in coding_miRNA_ids])
    coding_miRNA_tokens = np.array([gene_token_dict[i] for i in coding_miRNA_ids])

    tokenized_cells = []
    file_cell_metadata = {k: [] for k in adata.obs.columns}  # Initialize metadata dict

    # Process in chunks for memory efficiency
    for chunk_start in range(0, adata.shape[0], chunk_size):
        chunk_end = chunk_start + chunk_size
        adata_chunk = adata[chunk_start:chunk_end, coding_miRNA_loc]
        
        # Normalize counts (could be replaced with the untested function above)
        n_counts = adata_chunk.obs['n_counts'].values[:, None]
        X_norm = adata_chunk.X / n_counts * target_sum / norm_factor_vector
        X_norm = sp.csr_matrix(X_norm)  

        # Tokenize and rank genes for each cell in chunk
        for i in range(X_norm.shape[0]):
            ranks = rank_genes(X_norm[i].data, coding_miRNA_tokens[X_norm[i].indices])
            ranks = list(ranks[~np.isnan(ranks)].astype(int))

            tokenized_cells.append(ranks)

        # Update metadata
        for k in adata.obs.columns:
            file_cell_metadata[k].extend(adata_chunk.obs[k].tolist())

    return tokenized_cells, file_cell_metadata


In [5]:
DEFAULT_TOKEN_PATH = "/nfs/turbo/umms-indikar/shared/projects/geneformer/token_dictionary.pkl"
DEFAULT_MEDIAN_PATH = "/nfs/turbo/umms-indikar/shared/projects/geneformer/geneformer/gene_median_dictionary.pkl"

gene_token_dict, gene_keys, genelist_dict = load_gene_tokenization(DEFAULT_TOKEN_PATH)
gene_median_dict = load_gene_median_dict(DEFAULT_MEDIAN_PATH)

In [6]:
print(input_path)
adata = ad.read(input_path, backed="r")
adata

/nfs/turbo/umms-indikar/shared/projects/geneformer/data/rajapakse_lab_data_jpic.h5ad




AnnData object with n_obs × n_vars = 66 × 19393 backed at '/nfs/turbo/umms-indikar/shared/projects/geneformer/data/rajapakse_lab_data_jpic.h5ad'
    obs: 'dataset', 'sample_id', 'timepoint', 'hour', 'n_counts', 'control', 'order', 'replicate', 'batch'
    var: 'gene_id', 'token_id', 'Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'gene_version', 'gene_source', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_source', 'transcript_biotype', 'tag', 'ccds_id', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version', 'transcript_support_level', 'ensembl_id'

In [7]:
tokenized_cells, cell_metadata = tokenize_anndata(adata, 
                                                  genelist_dict, 
                                                  gene_median_dict)

In [13]:
np.array(tokenized_cells).shape

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (66,) + inhomogeneous part.

In [12]:
pd.DataFrame(cell_metadata)

Unnamed: 0,dataset,sample_id,timepoint,hour,n_counts,control,order,replicate,batch
0,chen_2015,S1a,0.0,0.0,7901832,True,1,1,0
1,chen_2015,S1b,0.0,0.0,8113329,True,1,2,0
2,chen_2015,S2a,0.0,0.0,9831046,False,2,1,0
3,chen_2015,S2b,0.0,0.0,10123271,False,2,2,0
4,chen_2015,S3a,1.0,8.0,10490839,False,3,1,0
...,...,...,...,...,...,...,...,...,...
61,liu_2018,63275,3.0,80.0,13515971,False,11,3,1
62,liu_2018,63290,1.0,88.0,9522866,False,12,3,1
63,liu_2018,63287,2.0,96.0,12370157,False,13,3,1
64,liu_2018,63284,3.0,104.0,10970735,False,14,3,1
