In [1]:
import llm2geneset
import json
import time
import openai
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# Asynchronous client: passed to the awaited llm2geneset calls in later cells.
aclient = openai.AsyncClient()
# Synchronous client: passed to llm2geneset.get_embeddings in the similarity step.
client = openai.OpenAI()

# Generate pathways

In [2]:
# The two conditions being contrasted; reused for the PubMed queries and the
# output file names in later cells.
cond1 = "Brain"
cond2 = "Liver"

# Candidate pathways for this pair of conditions.
# NOTE(review): n=50 presumably requests 50 pathways per condition — confirm
# against llm2geneset.get_pathways.
pathways = await llm2geneset.get_pathways(aclient, cond1, cond2, n=50)

100%|█████████████████████████████████████████████| 1/1 [00:10<00:00, 10.64s/it]
100%|█████████████████████████████████████████████| 1/1 [00:09<00:00,  9.46s/it]


In [3]:
# Number of candidate pathways returned by get_pathways.
len(pathways)

101

# Search PubMed to find supporting evidence for the pathways

In [4]:
# One PubMed query per pathway and condition, of the form "<pathway> in <condition>".
queries1 = [p+f" in {cond1}" for p in pathways]
queries2 = [p+f" in {cond2}" for p in pathways]  

In [5]:
# Run every query against the "pubmed" database; the result is indexed per query.
# NOTE(review): the trailing 1 is presumably the maximum number of hits kept
# per query — confirm against llm2geneset.esearch_async.
pmids1 = await llm2geneset.esearch_async(queries1, "pubmed", 1)
pmids2 = await llm2geneset.esearch_async(queries2, "pubmed", 1)

100%|█████████████████████████████████████████| 101/101 [00:50<00:00,  2.00it/s]
100%|█████████████████████████████████████████| 101/101 [00:50<00:00,  2.00it/s]


In [6]:
# Keep the first hit of each cond1 query; NaN marks queries with no hit so
# that they can be removed with dropna() later.
pmids_cleaned1 = [
    hits[0] if len(hits) > 0 else float('nan')
    for hits in pmids1
]

In [7]:
# Keep the first hit of each cond2 query; NaN marks queries with no hit so
# that they can be removed with dropna() later.
pmids_cleaned2 = [
    hits[0] if len(hits) > 0 else float('nan')
    for hits in pmids2
]

In [8]:
# One row per pathway with the first PubMed hit (or NaN) for each condition.
pathway_df = pd.DataFrame({'pathway':pathways, 'pmids_cond1':pmids_cleaned1,'pmids_cond2':pmids_cleaned2})
pathway_df.head()

Unnamed: 0,pathway,pmids_cond1,pmids_cond2
0,Alcohol metabolism,34702580,32029510
1,Alpha Synuclein Pathway,32355963,37487948
2,Amino acid catabolism,28261376,35568239
3,Amygdala Development,32204831,35654975
4,Amyloid Precursor Protein Metabolism,21456963,37402372


In [9]:
# Row count before dropping pathways that lack a PubMed hit.
len(pathway_df)

101

# Drop pathways without a supporting PubMed article

In [10]:
# dropna() removes a row if ANY column is NaN, so a pathway is kept only when
# both conditions returned a PubMed hit. The index is renumbered from 0.
pathway_df = pathway_df.dropna().reset_index(drop=True)

In [11]:
# Row count after dropping pathways that lack a PubMed hit.
len(pathway_df)

101

# Filter pathways that are too similar

In [12]:
# Pairwise cosine similarity between the pathway embeddings.
# NOTE(review): this embeds the full `pathways` list, not pathway_df.pathway,
# so any pathway removed by dropna() would still take part in the similarity
# filter — confirm this is intended.
emb_lst = llm2geneset.get_embeddings(client, pathways)
similarity_matrix = cosine_similarity(emb_lst)
# 97th percentile over every entry of the matrix (both triangles and the
# i == j diagonal are included), displayed for inspection.
np.percentile(similarity_matrix,97)

np.float64(0.5565708744653948)

In [13]:
def filter_similar_items(text_list, similarity_matrix, threshold=0):
    """Drop near-duplicate items, preferring the longer text of each similar pair.

    Every pair ``(i, j)`` with ``i < j`` and
    ``similarity_matrix[i][j] >= threshold`` loses its shorter member; on a
    length tie the later item (``j``) is dropped. Pairs are examined
    regardless of whether one member was already dropped, so an item that has
    been eliminated can still eliminate others.

    Args:
        text_list: indexable sequence of strings.
        similarity_matrix: pairwise similarities, indexed as ``[i][j]`` in the
            same order as ``text_list``.
        threshold: similarity at or above which two items count as similar.

    Returns:
        List of the surviving items, in their original order.
    """
    n_items = len(text_list)
    dropped = set()

    for first in range(n_items):
        for second in range(first + 1, n_items):
            if similarity_matrix[first][second] >= threshold:
                first_wins = len(text_list[first]) >= len(text_list[second])
                dropped.add(second if first_wins else first)

    return [text_list[idx] for idx in range(n_items) if idx not in dropped]


In [14]:
# Filter items
# Cut-off: 97th percentile of all entries of similarity_matrix (the same
# expression as displayed in the previous cell).
threshold = np.percentile(similarity_matrix,97)
# Applied to `pathways`, the list the similarity matrix was built from.
distinct_items = filter_similar_items(pathways, similarity_matrix, threshold)

print("Distinct items:", distinct_items)

Distinct items: [np.str_('Alpha Synuclein Pathway'), np.str_('Amino acid catabolism'), np.str_('Amyloid Precursor Protein Metabolism'), np.str_('Axon Regeneration'), np.str_('Behavioral Response to Stimulus'), np.str_('Bipolar Cell Differentiation'), np.str_('Blood-Brain Barrier Integrity'), np.str_('Calcium Signaling in Neurons'), np.str_('Carbohydrate metabolism'), np.str_('Circadian Rhythm'), np.str_('Coagulation cascade'), np.str_('Cognitive Processes'), np.str_('Complement activation'), np.str_('Copper homeostasis'), np.str_('Endobiotic metabolism'), np.str_('Endocytosis in Neuronal Cells'), np.str_('Fatty acid oxidation'), np.str_('GABAergic Synapse'), np.str_('Glycogen synthesis'), np.str_('Glycosylation'), np.str_('Hippocampus Development'), np.str_('Hormone biosynthetic process'), np.str_('Insulin receptor signaling pathway'), np.str_('Memory and Learning'), np.str_('Metabolism of xenobiotics'), np.str_('Microglial Cell Activation'), np.str_('Mitochondrial Function in Neurons'

In [15]:
# Number of pathways that survived the similarity filter.
len(distinct_items)

52

In [16]:
# Keep only the rows whose pathway survived the similarity filter and
# renumber the index from 0.
pathway_df = pathway_df[pathway_df.pathway.isin(distinct_items)].reset_index(drop=True)

# Generate genes for the remaining pathways

In [17]:
# Model name passed as `model=` to the gene-generation calls in later cells.
model = "gpt-3.5-turbo-0125"

## Option 1: use PubMed abstracts as context

In [18]:
# Fetch the PubMed record for each remaining pathway's cond1 PMID; every PMID
# is wrapped in its own single-element list.
abstracts = await llm2geneset.efetch_pubmed_async([[i] for i in pathway_df.pmids_cond1.values])

100%|███████████████████████████████████████████| 52/52 [00:25<00:00,  2.02it/s]


In [19]:
context = []
for x in abstracts:
    if len(x) > 0:
        context.append(x[0]['abstract'])
    else:
        context.append("")


pathway_df["llm_single_relevant_abstract"] = await llm2geneset.get_genes_context(aclient,
                                                                              context,
                                                                              pathway_df["pathway"].values, 
                                                                              model = model)

100%|███████████████████████████████████████████| 52/52 [00:09<00:00,  5.44it/s]


In [20]:
# Pull the 'parsed_genes' entry out of each cond1 response into its own column.
parsed_genes = [
    response['parsed_genes']
    for response in pathway_df['llm_single_relevant_abstract'].values
]
pathway_df['abstract_genes_cond1'] = parsed_genes

In [21]:
# Preview the frame with the cond1 abstract-derived genes added.
pathway_df.head()

Unnamed: 0,pathway,pmids_cond1,pmids_cond2,llm_single_relevant_abstract,abstract_genes_cond1
0,Alpha Synuclein Pathway,32355963,37487948,"{'parsed_genes': ['SNCA', 'PELI1', 'LAMP2', 'C...","[SNCA, PELI1, LAMP2, CD11b]"
1,Amino acid catabolism,28261376,35568239,"{'parsed_genes': ['ACAT1', 'ACO1', 'ALDH2', 'A...","[ACAT1, ACO1, ALDH2, ALDH9A1, ASAH1, ASL, ASS1..."
2,Amyloid Precursor Protein Metabolism,21456963,37402372,"{'parsed_genes': ['APP', 'PSEN1', 'PSEN2', 'AD...","[APP, PSEN1, PSEN2, ADAM10, BACE1, BACE2]"
3,Axon Regeneration,19822144,30509565,"{'parsed_genes': ['MAG', 'GD1a', 'GT1b'], 'rea...","[MAG, GD1a, GT1b]"
4,Behavioral Response to Stimulus,17956738,21868631,"{'parsed_genes': ['Drd2', 'Th', 'Fos', 'Bdnf',...","[Drd2, Th, Fos, Bdnf, Creb1, Camk2a]"


In [22]:
# Fetch the PubMed record for each remaining pathway's cond2 PMID; every PMID
# is wrapped in its own single-element list. This overwrites the cond1
# `abstracts` from the earlier cell.
abstracts = await llm2geneset.efetch_pubmed_async([[i] for i in pathway_df.pmids_cond2.values])

100%|███████████████████████████████████████████| 52/52 [00:25<00:00,  2.02it/s]


In [23]:
context = []
for x in abstracts:
    if len(x) > 0:
        context.append(x[0]['abstract'])
    else:
        context.append("")


pathway_df["llm_single_relevant_abstract2"] = await llm2geneset.get_genes_context(aclient,
                                                                              context,
                                                                              pathway_df["pathway"].values, 
                                                                              model = model)

100%|███████████████████████████████████████████| 52/52 [00:09<00:00,  5.65it/s]


In [24]:
# Pull the 'parsed_genes' entry out of each cond2 response into its own column.
parsed_genes = [
    response['parsed_genes']
    for response in pathway_df['llm_single_relevant_abstract2'].values
]
pathway_df['abstract_genes_cond2'] = parsed_genes

In [25]:
# Preview the frame with the cond2 abstract-derived genes added.
pathway_df.head()

Unnamed: 0,pathway,pmids_cond1,pmids_cond2,llm_single_relevant_abstract,abstract_genes_cond1,llm_single_relevant_abstract2,abstract_genes_cond2
0,Alpha Synuclein Pathway,32355963,37487948,"{'parsed_genes': ['SNCA', 'PELI1', 'LAMP2', 'C...","[SNCA, PELI1, LAMP2, CD11b]","{'parsed_genes': ['SNCA', 'GRM5', 'SNCG'], 're...","[SNCA, GRM5, SNCG]"
1,Amino acid catabolism,28261376,35568239,"{'parsed_genes': ['ACAT1', 'ACO1', 'ALDH2', 'A...","[ACAT1, ACO1, ALDH2, ALDH9A1, ASAH1, ASL, ASS1...","{'parsed_genes': ['BCAT1', 'BCAT2', 'GOT1', 'G...","[BCAT1, BCAT2, GOT1, GOT2, GLUD1, GPT, GLS, AS..."
2,Amyloid Precursor Protein Metabolism,21456963,37402372,"{'parsed_genes': ['APP', 'PSEN1', 'PSEN2', 'AD...","[APP, PSEN1, PSEN2, ADAM10, BACE1, BACE2]","{'parsed_genes': ['APP', 'BACE1', 'PSEN1', 'PS...","[APP, BACE1, PSEN1, PSEN2]"
3,Axon Regeneration,19822144,30509565,"{'parsed_genes': ['MAG', 'GD1a', 'GT1b'], 'rea...","[MAG, GD1a, GT1b]","{'parsed_genes': ['LKB1', 'AMPK', 'NUAK1', 'ER...","[LKB1, AMPK, NUAK1, ERK]"
4,Behavioral Response to Stimulus,17956738,21868631,"{'parsed_genes': ['Drd2', 'Th', 'Fos', 'Bdnf',...","[Drd2, Th, Fos, Bdnf, Creb1, Camk2a]","{'parsed_genes': ['TNF', 'IL1B', 'IL6', 'CRH',...","[TNF, IL1B, IL6, CRH, HTR2A, HTR2C]"


In [26]:
# Per pathway, merge the cond1 and cond2 gene lists into one de-duplicated
# list (element order follows set iteration order).
abstract_genes = [
    list(set(genes_cond1) | set(genes_cond2))
    for genes_cond1, genes_cond2 in zip(
        pathway_df['abstract_genes_cond1'],
        pathway_df['abstract_genes_cond2'],
    )
]

In [29]:
# Store the merged cond1/cond2 gene lists as a column.
pathway_df["abstract_genes"] = abstract_genes

## Option 2: high-confidence genes

In [30]:
# Generate genes from the pathway name alone (no abstract context).
# The next cell reads 'parsed_genes' and 'conf' from each response.
# NOTE(review): prompt_type='conf' presumably asks for a confidence label per
# gene — confirm against llm2geneset.get_genes.
pathway_df["llm_conf"] = await llm2geneset.get_genes(aclient,
                                                     pathway_df["pathway"].values,
                                                     model=model, 
                                                     prompt_type='conf')

100%|███████████████████████████████████████████| 52/52 [00:08<00:00,  6.16it/s]


In [31]:
# For every pathway keep only the genes whose confidence label is exactly
# 'high'; pathways without any such gene get an empty list.
# Pairing genes with labels through zip keeps the gene symbols as returned
# (no conversion to numpy strings) and cannot index past the end of
# 'parsed_genes' when the two lists differ in length.
genes = []
for response in pathway_df.llm_conf.values:
    genes.append([
        gene
        for gene, conf in zip(response['parsed_genes'], response['conf'])
        if conf == 'high'
    ])


In [32]:
# Store the high-confidence gene lists as a column.
pathway_df['conf_genes']=genes

In [33]:
# Preview the frame with both gene-set variants (abstract-based and high-confidence).
pathway_df.head()

Unnamed: 0,pathway,pmids_cond1,pmids_cond2,llm_single_relevant_abstract,abstract_genes_cond1,llm_single_relevant_abstract2,abstract_genes_cond2,abstract_genes,llm_conf,conf_genes
0,Alpha Synuclein Pathway,32355963,37487948,"{'parsed_genes': ['SNCA', 'PELI1', 'LAMP2', 'C...","[SNCA, PELI1, LAMP2, CD11b]","{'parsed_genes': ['SNCA', 'GRM5', 'SNCG'], 're...","[SNCA, GRM5, SNCG]","[SNCA, LAMP2, PELI1, GRM5, SNCG, CD11b]","{'parsed_genes': ['SNCA', 'PARK2', 'PINK1', 'D...",[SNCA]
1,Amino acid catabolism,28261376,35568239,"{'parsed_genes': ['ACAT1', 'ACO1', 'ALDH2', 'A...","[ACAT1, ACO1, ALDH2, ALDH9A1, ASAH1, ASL, ASS1...","{'parsed_genes': ['BCAT1', 'BCAT2', 'GOT1', 'G...","[BCAT1, BCAT2, GOT1, GOT2, GLUD1, GPT, GLS, AS...","[BCAT1, GOT1, ALDH2, ASS1, MAOA, GLUD2, MAOB, ...","{'parsed_genes': ['BCAT1', 'BCAT2', 'GOT1', 'G...","[BCAT1, BCAT2]"
2,Amyloid Precursor Protein Metabolism,21456963,37402372,"{'parsed_genes': ['APP', 'PSEN1', 'PSEN2', 'AD...","[APP, PSEN1, PSEN2, ADAM10, BACE1, BACE2]","{'parsed_genes': ['APP', 'BACE1', 'PSEN1', 'PS...","[APP, BACE1, PSEN1, PSEN2]","[BACE1, PSEN2, BACE2, PSEN1, ADAM10, APP]","{'parsed_genes': ['APP', 'PSEN1', 'PSEN2', 'BA...",[APP]
3,Axon Regeneration,19822144,30509565,"{'parsed_genes': ['MAG', 'GD1a', 'GT1b'], 'rea...","[MAG, GD1a, GT1b]","{'parsed_genes': ['LKB1', 'AMPK', 'NUAK1', 'ER...","[LKB1, AMPK, NUAK1, ERK]","[AMPK, NUAK1, ERK, GD1a, MAG, GT1b, LKB1]","{'parsed_genes': ['PTEN', 'SOCS3', 'GSK3B', 'P...",[PTEN]
4,Behavioral Response to Stimulus,17956738,21868631,"{'parsed_genes': ['Drd2', 'Th', 'Fos', 'Bdnf',...","[Drd2, Th, Fos, Bdnf, Creb1, Camk2a]","{'parsed_genes': ['TNF', 'IL1B', 'IL6', 'CRH',...","[TNF, IL1B, IL6, CRH, HTR2A, HTR2C]","[CRH, Camk2a, Drd2, Creb1, IL6, Bdnf, Fos, IL1...","{'parsed_genes': ['DRD2', 'COMT', 'BDNF', 'KCN...",[]


# Save to GMT

In [34]:
def save_to_gmt(pathways,genes,gmt_file):
    """Write gene sets to ``gmt_file`` in GMT format, one gene set per line.

    Each line has the form ``name<TAB><TAB>gene1<TAB>gene2...``; the
    description field (second column) is left empty. Genes of a set are
    de-duplicated and sorted with ``np.unique``. Gene sets without genes are
    skipped.

    Args:
        pathways: iterable of gene-set names.
        genes: iterable of gene collections, paired positionally with
            ``pathways`` (``zip`` stops at the shorter of the two).
        gmt_file: output path. An existing file is overwritten; missing
            parent directories are created.
    """
    import os  # local import so the function does not depend on another cell

    # Opening the file fails if its directory does not exist yet.
    parent_dir = os.path.dirname(gmt_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(gmt_file, 'w') as gf:
        # A separate loop name keeps the `genes` parameter intact.
        for gene_set_name, gene_list in zip(pathways, genes):
            if len(gene_list) == 0:
                continue
            unique_genes = np.unique(gene_list)
            # GMT format: gene_set_name\tdescription\tgene1\tgene2\tgene3...
            fields = "\t".join(str(gene) for gene in unique_genes)
            gf.write(f"{gene_set_name}\t\t{fields}\n")

In [35]:
# Export the abstract-based gene sets, named after the two conditions.
gmt_file = f"libs_human/gmt_tailored/{cond1}_{cond2}_abstract.txt"
save_to_gmt(
    pathway_df["pathway"].values,
    pathway_df["abstract_genes"].values,
    gmt_file,
)

In [36]:
# Export the high-confidence gene sets, named after the two conditions.
gmt_file = f"libs_human/gmt_tailored/{cond1}_{cond2}_highConf.txt"
save_to_gmt(
    pathway_df["pathway"].values,
    pathway_df["conf_genes"].values,
    gmt_file,
)