In [1]:
import pandas as pd
import pickle

In [2]:
token_path = "/nfs/turbo/umms-indikar/shared/projects/geneformer/token_dictionary.pkl"
median_path = "/nfs/turbo/umms-indikar/shared/projects/geneformer/geneformer/gene_median_dictionary.pkl"
id_path = "/nfs/turbo/umms-indikar/shared/projects/geneformer/geneformer/gene_name_id_dict.pkl"


def make_gene_map(token_path, median_path, id_path):
    """
    Combines gene dictionaries based on their keys.

    Args:
        token_path (str): Path to the token dictionary.
        median_path (str): Path to the median dictionary.
        id_path (str): Path to the gene name-to-ID dictionary.

    Returns:
        dict: A combined dictionary where keys are gene IDs and values are tuples containing:
            - The tokenized representation of the gene.
            - The median embedding of the gene.
            - The gene name.
    """

    with open(token_path, "rb") as f:
        token_dict = pickle.load(f)

    with open(median_path, "rb") as f:
        median_dict = pickle.load(f)

    with open(id_path, "rb") as f:
        id_dict = pickle.load(f)
        id_dict = {value: key for key, value in id_dict.items()}
        
    df = pd.DataFrame.from_dict(token_dict, orient='index')
    df = df.reset_index()
    df.columns = ['gene_id', 'token_id']
    df['gene_name'] = df['gene_id'].map(id_dict)
    df['nonzero_median'] = df['gene_id'].map(median_dict)
    
    return df

df = make_gene_map(token_path, median_path, id_path)
print(f"{df.shape=}")
df.head()

df.shape=(25426, 4)


Unnamed: 0,gene_id,token_id,gene_name,nonzero_median
0,<pad>,0,,
1,<mask>,1,,
2,ENSG00000000003,2,TSPAN6,2.001186
3,ENSG00000000005,3,TNMD,3.228213
4,ENSG00000000419,4,DPM1,2.218874


In [3]:
# load GTF information 
gtf_path = "/scratch/indikar_root/indikar1/cstansbu/HSC/references/geneTable.csv"
gtf = pd.read_csv(gtf_path)
gtf = gtf[gtf['Feature'] == 'gene']
gtf = gtf[gtf['gene_name'].notna()]

columns = [ 
    'gene_id', 
    'gene_version',
    # 'gene_name', 
    'gene_biotype',
    'Chromosome', 
    'Start', 
    'End', 
]

gtf = gtf[columns].drop_duplicates()
print(f"{gtf.shape=}")
gtf.head()

  gtf = pd.read_csv(gtf_path)


gtf.shape=(41407, 6)


Unnamed: 0,gene_id,gene_version,gene_biotype,Chromosome,Start,End
0,ENSG00000160072,20,protein_coding,1,1471764,1497848
111,ENSG00000225972,1,unprocessed_pseudogene,1,629061,629433
114,ENSG00000198744,5,unprocessed_pseudogene,1,634375,634922
117,ENSG00000279928,2,unprocessed_pseudogene,1,182695,184174
129,ENSG00000142611,17,protein_coding,1,3069167,3438621


In [4]:
# load scenic
filepath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/scenic_resources/500bp_up_100bp_down_B.csv"
sdf = pd.read_csv(filepath)
sdf = sdf.rename(columns={"Unnamed: 0": "gene_name"})
sdf.head()

scenic_transcription_factors = sdf.columns.to_list()
print(len(scenic_transcription_factors))
scenic_transcription_factors[:10]

1606


['gene_name',
 'ABL1',
 'ACAA1',
 'ADNP',
 'ADNP2',
 'AEBP2',
 'AFF4',
 'AHCTF1',
 'AHDC1',
 'AHR']

In [8]:
# merge GTF information
df = pd.merge(df, 
              gtf, 
              how='left',
              left_on='gene_id',
              right_on='gene_id',
              )

df['scenic_tf'] = df['gene_name'].isin(scenic_transcription_factors)

# outpath = "/nfs/turbo/umms-indikar/shared/projects/geneformer/token_mappings.csv"
outpath = "../../data/token_mappings.csv"
df.to_csv(outpath, index=False)
df.head()

Unnamed: 0,gene_id,token_id,gene_name,nonzero_median,gene_version_x,gene_biotype_x,Chromosome_x,Start_x,End_x,scenic_tf,gene_version_y,gene_biotype_y,Chromosome_y,Start_y,End_y,gene_version,gene_biotype,Chromosome,Start,End
0,<pad>,0,,,,,,,,False,,,,,,,,,,
1,<mask>,1,,,,,,,,False,,,,,,,,,,
2,ENSG00000000003,2,TSPAN6,2.001186,15.0,protein_coding,X,100627107.0,100639991.0,False,15.0,protein_coding,X,100627107.0,100639991.0,15.0,protein_coding,X,100627107.0,100639991.0
3,ENSG00000000005,3,TNMD,3.228213,6.0,protein_coding,X,100584935.0,100599885.0,False,6.0,protein_coding,X,100584935.0,100599885.0,6.0,protein_coding,X,100584935.0,100599885.0
4,ENSG00000000419,4,DPM1,2.218874,14.0,protein_coding,20,50934866.0,50959140.0,False,14.0,protein_coding,20,50934866.0,50959140.0,14.0,protein_coding,20,50934866.0,50959140.0


In [6]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
columns = [ 
    'gene_id', 
    'gene_version',
    'gene_name', 
    'gene_biotype',
    'Chromosome', 
    'Start', 
    'End', 
]

In [None]:
gtf.columns