In [20]:
import llm2geneset
import json
import time
import openai
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
aclient = openai.AsyncClient()
client = openai.OpenAI()

# Generate pathways

In [3]:
cond1 = "Brain"
cond2 = "Liver"

pathways = await llm2geneset.get_pathways(aclient, cond1, cond2, n=50)

100%|█████████████████████████████████████████████| 1/1 [00:08<00:00,  8.33s/it]
100%|█████████████████████████████████████████████| 1/1 [00:06<00:00,  6.44s/it]


In [4]:
len(pathways)

100

# Search PubMed to find supporting evidence for the pathways

In [5]:
queries = [p+f" in {cond1}" for p in pathways]  

In [6]:
pmids = await llm2geneset.esearch_async(queries, "pubmed", 1)

100%|█████████████████████████████████████████| 100/100 [00:49<00:00,  2.01it/s]


In [13]:
pmids_cleaned = []
for i in pmids:
    if len(i)>0:
        pmids_cleaned.append(i[0])
    else:
        pmids_cleaned.append(float('nan'))

In [14]:
pathway_df = pd.DataFrame({'pathway':pathways, 'pmids':pmids_cleaned})
pathway_df.head()

Unnamed: 0,pathway,pmids
0,Acetaldehyde metabolism,17590995
1,Acyl-CoA metabolic process,30903449
2,Adenosine Receptor Signaling Pathway,27225921
3,Aldosterone synthesis and secretion,19261742
4,Amino acid catabolism,28261376


# Drop pathways if can't find supporting PubMed article

In [17]:
pathway_df = pathway_df.dropna().reset_index(drop=True)

In [18]:
len(pathway_df)

99

# Filter pathways that are too similar

In [21]:
emb_lst = llm2geneset.get_embeddings(client, pathways)
similarity_matrix = cosine_similarity(emb_lst)
np.percentile(similarity_matrix,97)

np.float64(0.5611436503102355)

In [22]:
def filter_similar_items(text_list, similarity_matrix, threshold=0):
    # filter items with similar meanings
    # if 2 items are similar, keep the item with longer length
    keep_indices = set(range(len(text_list)))  # Initialize with all indices

    for i in range(len(text_list)):
        for j in range(i + 1, len(text_list)):
            if similarity_matrix[i][j] >= threshold:
                if len(text_list[i]) >= len(text_list[j]):
                    if j in keep_indices:
                        keep_indices.remove(j)
                else:
                    if i in keep_indices:
                        keep_indices.remove(i)
    
    return [text_list[i] for i in sorted(keep_indices)]


In [23]:
# Filter items
threshold = np.percentile(similarity_matrix,97)
distinct_items = filter_similar_items(pathways, similarity_matrix, threshold)

print("Distinct items:", distinct_items)

Distinct items: [np.str_('Acetaldehyde metabolism'), np.str_('Acyl-CoA metabolic process'), np.str_('Adenosine Receptor Signaling Pathway'), np.str_('Aldosterone synthesis and secretion'), np.str_('Blood-brain Barrier Maintenance'), np.str_('Calcium Signaling in Neurons'), np.str_('Central Nervous System Development'), np.str_('Circadian Entrainment'), np.str_('Detoxification of drugs and alcohol'), np.str_('Fructose and mannose metabolism'), np.str_('Glucuronidation'), np.str_('Glutathione metabolism'), np.str_('Glycogen synthesis'), np.str_('Glyoxylate and dicarboxylate metabolism'), np.str_('Insulin-like growth factor signaling'), np.str_('Iron homeostasis'), np.str_('Kidney-bean lectin metabolic process'), np.str_('Linoleic acid metabolism'), np.str_('Lipid droplet organization'), np.str_('Lipoprotein particle clearance'), np.str_('Long-term Potentiation'), np.str_('Maintaining Neuronal Synaptic Structure'), np.str_('Memory Formation'), np.str_('Metabotropic Glutamate Receptor Grou

In [24]:
len(distinct_items)

48

In [25]:
pathway_df = pathway_df[pathway_df.pathway.isin(distinct_items)].reset_index(drop=True)

# Generate genes for the remaining pathways

In [28]:
model = "gpt-3.5-turbo-0125"

## option1: using abstract to give context

In [27]:
abstracts = await llm2geneset.efetch_pubmed_async([[i] for i in pathway_df.pmids.values])

100%|███████████████████████████████████████████| 47/47 [00:23<00:00,  2.00it/s]


In [29]:
context = []
for x in abstracts:
    if len(x) > 0:
        context.append(x[0]['abstract'])
    else:
        context.append("")


pathway_df["llm_single_relevant_abstract"] = await llm2geneset.get_genes_context(aclient,
                                                                              context,
                                                                              pathway_df["pathway"].values, 
                                                                              model = model)

100%|███████████████████████████████████████████| 47/47 [00:04<00:00,  9.77it/s]


In [31]:
parsed_genes = [i['parsed_genes'] for i in pathway_df.llm_single_relevant_abstract.values]
pathway_df['abstract_genes']=parsed_genes

# option2: high confidence genes

In [30]:
pathway_df["llm_conf"] = await llm2geneset.get_genes(aclient,
                                                     pathway_df["pathway"].values,
                                                     model=model, 
                                                     prompt_type='conf')

100%|███████████████████████████████████████████| 47/47 [00:06<00:00,  7.43it/s]


In [49]:
genes = []
for i in pathway_df.llm_conf.values:
    high_conf_idx = np.where(np.array(i['conf']) == 'high')[0]
    if len(high_conf_idx)>0:
        high_conf_genes = np.array(i['parsed_genes'])[high_conf_idx]  
        genes.append(list(high_conf_genes))
    else:
        genes.append([])


In [51]:
pathway_df['conf_genes']=genes

In [53]:
pathway_df.head()

Unnamed: 0,pathway,pmids,llm_single_relevant_abstract,llm_conf,abstract_genes,conf_genes
0,Acetaldehyde metabolism,17590995,"{'parsed_genes': ['ALDH2', 'ADH1B', 'ADH1C', '...","{'parsed_genes': ['ALDH2', 'ADH1B', 'ADH1C', '...","[ALDH2, ADH1B, ADH1C, ADH4, ADH5, ADH7, CYP2E1]","[ALDH2, ADH1B, ADH1C]"
1,Acyl-CoA metabolic process,30903449,"{'parsed_genes': ['ACAA1', 'ACAA2', 'ACACB', '...","{'parsed_genes': ['ACAA1', 'ACAA2', 'ACADL', '...","[ACAA1, ACAA2, ACACB, ACADM, ACADS, ACADVL, AC...","[ACAA1, ACAA2, ACADL, ACADM, ACADS, ACADVL, AC..."
2,Adenosine Receptor Signaling Pathway,27225921,"{'parsed_genes': ['ADORA1', 'ADORA2A', 'ADORA2...","{'parsed_genes': ['ADORA1', 'ADORA2A', 'ADORA2...","[ADORA1, ADORA2A, ADORA2B, ADORA3, ADRA1A, ADR...","[ADORA1, ADORA2A, ADORA2B, ADORA3]"
3,Aldosterone synthesis and secretion,19261742,"{'parsed_genes': ['CYP11B1', 'CYP11B2', 'NR3C2...","{'parsed_genes': ['CYP11A1', 'CYP11B1', 'CYP11...","[CYP11B1, CYP11B2, NR3C2, HSD11B2]","[CYP11A1, CYP11B1, CYP11B2]"
4,Blood-brain Barrier Maintenance,24309662,"{'parsed_genes': ['ABCB1', 'ABCC1', 'ABCG2', '...","{'parsed_genes': ['CLDN5', 'OCLN', 'TJP1', 'JA...","[ABCB1, ABCC1, ABCG2, CLDN5, TJP1, SDC1, CDH5,...","[CLDN5, OCLN, TJP1]"


# Save to GMT

In [59]:
def save_to_gmt(pathways,genes,gmt_file):
    with open(gmt_file, 'w') as gf:
        for gene_set_name, genes in zip(pathways,genes) :
            # GMT format: gene_set_name\tdescription\tgene1\tgene2\tgene3...
            if len(genes)>0:
                genes = np.unique(genes)
                line = f"{gene_set_name}\t" + ''.join(f"\t{gene}" for gene in genes)
                gf.write(line + "\n")    

In [60]:
gmt_file = "libs_human/gmt_tailored/" + cond1+"_"+cond2+"_abstract.txt"
save_to_gmt(pathway_df.pathway.values,pathway_df.abstract_genes.values,gmt_file)

In [61]:
gmt_file = "libs_human/gmt_tailored/" + cond1+"_"+cond2+"_highConf.txt"
save_to_gmt(pathway_df.pathway.values,pathway_df.conf_genes.values,gmt_file)