In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import time
import pandas as pd
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.stats import hypergeom
from asynciolimiter import StrictLimiter
import asyncio
import tqdm.asyncio
# Asynchronous and synchronous OpenAI clients. In this notebook the async
# client is passed to the gs_proposal calls and the sync client to
# llm2geneset.get_embeddings.
aclient = openai.AsyncClient()
client = openai.Client()

In [2]:
# ROUGE scorer (ROUGE-1, ROUGE-2, ROUGE-L) with stemming enabled; used to
# compare each proposed gene-set name against the reference description.
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL'], use_stemmer=True)

In [3]:
# Evaluation grid: every (model, library) pair is loaded from
# "libs_human/<model>/<lib_name>.json" by the evaluation cells.
models = ["gpt-3.5-turbo-0125"]
# Only KEGG is active; the commented-out entries below are the other
# libraries that can be re-enabled by moving them inside the list.
lib_names = ["KEGG_2021_Human"]#,
             #"Reactome_2022", 
             #"WikiPathway_2023_Human"]

In [4]:
ouput = []
for model in models:    
    for lib_name in lib_names:
        print(lib_name)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        test_sets = gen_res["curated_genesets"]
        test_descr = gen_res["descr_cleaned"]
        gt_embs = llm2geneset.get_embeddings(client, test_descr)

        # use llm2geneset to generate geneset name
        res = await llm2geneset.gs_proposal(aclient, test_sets, model=model)
        most_overrepresented = [min(gene_set, key=lambda x: x[1]) for gene_set in res]
        names = [i[0] for i in most_overrepresented]
        name_embs = llm2geneset.get_embeddings(client, names)
        
        for i, ref in enumerate(test_descr):
            gt_emb = [gt_embs[i]]
            name = names[i]
            name_emb = [name_embs[i]]
            scores = scorer.score(ref, name)
            rouge1= scores['rouge1'].recall
            rouge2= scores['rouge2'].recall
            rougeL= scores['rougeL'].fmeasure      
            csim = cosine_similarity(gt_emb,name_emb).squeeze()
                
            x={"model":model,
               "library":lib_name,
               "gt_name":ref,
               "llm2geneset_name":name,
               "llm2geneset_ROUGE1":rouge1,
               "llm2geneset_ROUGE2":rouge2,   
               "llm2geneset_ROUGEL":rougeL,                  
               "llm2geneset_csim":csim}    
            ouput.append(x)

KEGG_2021_Human


  0%|                                                                                                                                        | 0/320 [00:00<?, ?it/s]

retrying
string indices must be integers, not 'str'
```json
{
    "type": "array",
    "items": [
        {
            "p": "Base excision repair"
        },
        {
            "p": "DNA repair"
        },
        {
            "p": "DNA replication"
        },
        {
            "p": "Single-strand break repair"
        },
        {
            "p": "Mismatch repair"
        }
    ]
}
```


  2%|██▍                                                                                                                             | 6/320 [00:05<03:19,  1.57it/s]

retrying
string indices must be integers, not 'str'
```json
{
    "type": "array",
    "items": [
        {
            "p": "MAPK signaling pathway"
        },
        {
            "p": "Wnt signaling pathway"
        },
        {
            "p": "Neurotrophin signaling pathway"
        },
        {
            "p": "Cell cycle"
        },
        {
            "p": "Apoptosis"
        }
    ]
}
```


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [00:50<00:00,  6.37it/s]


In [5]:
# Collect the per-gene-set result rows into a DataFrame and display it.
df = pd.DataFrame(ouput)
df

Unnamed: 0,model,library,gt_name,llm2geneset_name,llm2geneset_ROUGE1,llm2geneset_ROUGE2,llm2geneset_ROUGEL,llm2geneset_csim
0,gpt-3.5-turbo-0125,KEGG_2021_Human,ABC transporters,ABC transporters pathway,1.000000,1.000000,0.800000,0.8754979396376352
1,gpt-3.5-turbo-0125,KEGG_2021_Human,AGE-RAGE signaling pathway in diabetic complic...,PI3K-Akt signaling pathway,0.285714,0.166667,0.363636,0.48119822049801747
2,gpt-3.5-turbo-0125,KEGG_2021_Human,AMPK signaling pathway,mTOR signaling pathway,0.666667,0.500000,0.666667,0.6976126688801991
3,gpt-3.5-turbo-0125,KEGG_2021_Human,Acute myeloid leukemia,Cell proliferation,0.000000,0.000000,0.000000,0.2779928359354127
4,gpt-3.5-turbo-0125,KEGG_2021_Human,Adherens junction,Adherens junction,1.000000,1.000000,1.000000,0.9999987834883033
...,...,...,...,...,...,...,...,...
315,gpt-3.5-turbo-0125,KEGG_2021_Human,cAMP signaling pathway,cAMP signaling pathway,1.000000,1.000000,1.000000,0.9999990831431064
316,gpt-3.5-turbo-0125,KEGG_2021_Human,cGMP-PKG signaling pathway,Regulation of calcium ion transport,0.000000,0.000000,0.000000,0.33729741116308454
317,gpt-3.5-turbo-0125,KEGG_2021_Human,mRNA surveillance pathway,mRNA cleavage,0.333333,0.000000,0.400000,0.6231189870078935
318,gpt-3.5-turbo-0125,KEGG_2021_Human,mTOR signaling pathway,Wnt signaling pathway,0.666667,0.500000,0.666667,0.5734167476016507


# Jacqueline's version of gs_proposal

In [136]:
async def gs_proposal(aclient, protein_lists, model="gpt-4o", n_retry=1):
    """Propose pathway/process names for each gene list and score their overlap.

    For every gene list, pathways are requested through
    ``llm2geneset.get_pathways_from_genes`` and only those labelled with
    ``'high'`` confidence are kept (the request is repeated up to three more
    times while none is found). Genes for each kept name are then generated
    with ``llm2geneset.get_genes`` and a hypergeometric p-value is computed
    for the overlap between the generated genes and the input genes.

    Args:
        aclient: Async client forwarded to the llm2geneset calls.
        protein_lists: Iterable of gene lists, one per query.
        model: Model name forwarded to the llm2geneset calls.
        n_retry: Currently unused; the pathway request always passes
            ``n_retry=3``, exactly as the original implementation did.

    Returns:
        A list with one entry per gene list. Each entry is a list of
        ``(name, p_value)`` tuples and is empty when no high-confidence name
        was obtained after all regeneration attempts.
    """
    # Background population size for the hypergeometric test (value taken
    # from the original code; presumably the number of human genes
    # considered -- TODO confirm).
    n_background = 19846
    max_regenerations = 3

    # One limiter shared by all tasks. The original created a new limiter
    # inside every task, so concurrent tasks never throttled each other.
    # Rate is per second (presumably 95% of a 10,000 requests/minute quota).
    rate_limiter = StrictLimiter(0.95 * 10000.0 / 60.0)

    async def high_confidence_names(genes):
        """Request pathways for ``genes`` and keep those labelled 'high'."""
        await rate_limiter.wait()
        bio_process = await llm2geneset.get_pathways_from_genes(
            aclient, genes, model, n_retry=3, use_sysmsg=False
        )
        first = bio_process[0]
        # Plain filtering avoids NumPy boolean masking on empty or ragged
        # lists and yields ordinary ``str`` names.
        return [
            name
            for name, conf in zip(first["parsed_pathways"], first["conf"])
            if conf == "high"
        ]

    async def gse(genes):
        # 1-2. Propose pathways/processes and keep the high-confidence ones,
        # regenerating while none is found.
        high_conf_names = await high_confidence_names(genes)
        attempts = 0
        while not high_conf_names and attempts < max_regenerations:
            high_conf_names = await high_confidence_names(genes)
            attempts += 1

        if not high_conf_names:
            # Nothing to score; let the caller decide how to report it.
            return []

        # 3. Generate gene sets for the retained names.
        proposed = await llm2geneset.get_genes(
            aclient, high_conf_names, model=model, use_tqdm=False
        )

        # 4. Compute hypergeometric p-vals.
        query_genes = set(genes)
        scored = []
        for name, entry in zip(high_conf_names, proposed):
            llm_genes = entry["parsed_genes"]
            overlap = query_genes.intersection(llm_genes)
            # scipy's hypergeom.sf(k, M, n, N): M is the TOTAL population
            # size, n the number of "success" items and N the number of
            # draws. The original passed ``n_background - len(genes)`` as M,
            # which is the R ``phyper`` convention, not SciPy's.
            p_val = hypergeom.sf(
                len(overlap) - 1, n_background, len(genes), len(llm_genes)
            )
            scored.append((name, p_val))
        return scored

    res = await tqdm.asyncio.tqdm.gather(*(gse(p) for p in protein_lists))
    return res


In [137]:
ouput = []
for model in models:    
    for lib_name in lib_names:
        print(lib_name)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        test_sets = gen_res["curated_genesets"][0:10]
        test_descr = gen_res["descr_cleaned"][0:10]
        gt_embs = llm2geneset.get_embeddings(client, test_descr)

        # use llm2geneset to generate geneset name
        res = await gs_proposal(aclient, test_sets, model=model)
        most_overrepresented = [min(gene_set, key=lambda x: x[1]) for gene_set in res]
        names = [i[0] for i in most_overrepresented]
        name_embs = llm2geneset.get_embeddings(client, names)
        
        for i, ref in enumerate(test_descr):
            try:
                gt_emb = [gt_embs[i]]
                name = names[i]
                name_emb = [name_embs[i]]
                scores = scorer.score(ref, name)
                rouge1= scores['rouge1'].recall
                rouge2= scores['rouge2'].recall
                rougeL= scores['rougeL'].fmeasure      
                csim = cosine_similarity(gt_emb,name_emb).squeeze()
                    
                x={"model":model,
                   "library":lib_name,
                   "gt_name":ref,
                   "llm2geneset_name":name,
                   "llm2geneset_ROUGE1":rouge1,
                   "llm2geneset_ROUGE2":rouge2,   
                   "llm2geneset_ROUGEL":rougeL,                  
                   "llm2geneset_csim":csim}    
            except:
                # occasionally gsai will fail to output a gene set name
                print("error! Ref name: ", ref, " llm2geneset name: ",name)
                x={"model":model,
                   "library":lib_name,
                   "gt_name":ref,
                   "llm2geneset_name":name,
                   "llm2geneset_ROUGE1":None,
                   "llm2geneset_ROUGE2":None,   
                   "llm2geneset_ROUGEL":None,                  
                   "llm2geneset_csim":None}    
            ouput.append(x)

KEGG_2021_Human


  0%|                                                      | 0/10 [00:00<?, ?it/s]
  0%|                                                      | 0/67 [00:00<?, ?it/s][A

  0%|                                                      | 0/69 [00:00<?, ?it/s][A[A


  0%|                                                     | 0/100 [00:00<?, ?it/s][A[A[A



  0%|                                                      | 0/37 [00:00<?, ?it/s][A[A[A[A




  0%|                                                      | 0/45 [00:00<?, ?it/s][A[A[A[A[A





  0%|                                                     | 0/150 [00:00<?, ?it/s][A[A[A[A[A[A






  0%|                                                      | 0/71 [00:00<?, ?it/s][A[A[A[A[A[A[A







  0%|                                                     | 0/186 [00:00<?, ?it/s][A[A[A[A[A[A[A[A








  0%|                                                     | 0/120 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A

Reactome_2022


  0%|                                                      | 0/10 [00:00<?, ?it/s]
  0%|                                                      | 0/18 [00:00<?, ?it/s][A

  0%|                                                     | 0/131 [00:00<?, ?it/s][A[A


  0%|                                                      | 0/26 [00:00<?, ?it/s][A[A[A



  0%|                                                      | 0/14 [00:00<?, ?it/s][A[A[A[A




  0%|                                                       | 0/8 [00:00<?, ?it/s][A[A[A[A[A





  0%|                                                      | 0/25 [00:00<?, ?it/s][A[A[A[A[A[A






  0%|                                                     | 0/102 [00:00<?, ?it/s][A[A[A[A[A[A[A







  0%|                                                      | 0/10 [00:00<?, ?it/s][A[A[A[A[A[A[A[A








  0%|                                                      | 0/77 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A

WikiPathway_2023_Human


  0%|                                                      | 0/10 [00:00<?, ?it/s]
  0%|                                                      | 0/12 [00:00<?, ?it/s][A

  0%|                                                      | 0/91 [00:00<?, ?it/s][A[A


  0%|                                                       | 0/8 [00:00<?, ?it/s][A[A[A



  0%|                                                      | 0/50 [00:00<?, ?it/s][A[A[A[A




  0%|                                                      | 0/30 [00:00<?, ?it/s][A[A[A[A[A





  0%|                                                      | 0/19 [00:00<?, ?it/s][A[A[A[A[A[A






  0%|                                                      | 0/16 [00:00<?, ?it/s][A[A[A[A[A[A[A







  0%|                                                     | 0/105 [00:00<?, ?it/s][A[A[A[A[A[A[A[A








  0%|                                                       | 0/7 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A

KEGG_2021_Human


  0%|                                                      | 0/10 [00:00<?, ?it/s]
  0%|                                                      | 0/67 [00:00<?, ?it/s][A

  0%|                                                      | 0/69 [00:00<?, ?it/s][A[A


  0%|                                                     | 0/100 [00:00<?, ?it/s][A[A[A



  0%|                                                      | 0/37 [00:00<?, ?it/s][A[A[A[A




  0%|                                                      | 0/45 [00:00<?, ?it/s][A[A[A[A[A





  0%|                                                     | 0/150 [00:00<?, ?it/s][A[A[A[A[A[A






  0%|                                                      | 0/71 [00:00<?, ?it/s][A[A[A[A[A[A[A







  0%|                                                     | 0/186 [00:00<?, ?it/s][A[A[A[A[A[A[A[A








  0%|                                                     | 0/120 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A

Reactome_2022


  0%|                                                      | 0/10 [00:00<?, ?it/s]
  0%|                                                      | 0/77 [00:00<?, ?it/s][A

  0%|                                                     | 0/131 [00:00<?, ?it/s][A[A


  0%|                                                      | 0/18 [00:00<?, ?it/s][A[A[A



  0%|                                                      | 0/14 [00:00<?, ?it/s][A[A[A[A




  0%|                                                       | 0/8 [00:00<?, ?it/s][A[A[A[A[A





  0%|                                                      | 0/25 [00:00<?, ?it/s][A[A[A[A[A[A






  0%|                                                     | 0/102 [00:00<?, ?it/s][A[A[A[A[A[A[A







  0%|                                                      | 0/10 [00:00<?, ?it/s][A[A[A[A[A[A[A[A








  0%|                                                      | 0/26 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A

WikiPathway_2023_Human


  0%|                                                      | 0/10 [00:00<?, ?it/s]
  0%|                                                      | 0/50 [00:00<?, ?it/s][A

  0%|                                                      | 0/30 [00:00<?, ?it/s][A[A


  0%|                                                      | 0/37 [00:00<?, ?it/s][A[A[A



  0%|                                                       | 0/7 [00:00<?, ?it/s][A[A[A[A




  0%|                                                      | 0/91 [00:00<?, ?it/s][A[A[A[A[A





  0%|                                                      | 0/19 [00:00<?, ?it/s][A[A[A[A[A[A






  0%|                                                     | 0/105 [00:00<?, ?it/s][A[A[A[A[A[A[A







  0%|                                                      | 0/16 [00:00<?, ?it/s][A[A[A[A[A[A[A[A








  0%|                                                       | 0/8 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A

In [138]:
# Display the result rows collected by the preceding evaluation cell.
pd.DataFrame(ouput)

Unnamed: 0,model,library,gt_name,llm2geneset_name,llm2geneset_ROUGE1,llm2geneset_ROUGE2,llm2geneset_ROUGEL,llm2geneset_csim
0,gpt-3.5-turbo-0125,KEGG_2021_Human,ABC transporters,lipid metabolism,0.0,0.0,0.0,0.2794889447873599
1,gpt-3.5-turbo-0125,KEGG_2021_Human,AGE-RAGE signaling pathway in diabetic complic...,TGF-beta signaling pathway,0.285714,0.166667,0.363636,0.4807390365102233
2,gpt-3.5-turbo-0125,KEGG_2021_Human,AMPK signaling pathway,Intracellular trafficking,0.0,0.0,0.0,0.3055825784767985
3,gpt-3.5-turbo-0125,KEGG_2021_Human,Acute myeloid leukemia,Cell proliferation,0.0,0.0,0.0,0.2781471356160234
4,gpt-3.5-turbo-0125,KEGG_2021_Human,Adherens junction,Protein tyrosine phosphatase activity,0.0,0.0,0.0,0.2207520667294265
5,gpt-3.5-turbo-0125,KEGG_2021_Human,Adipocytokine signaling pathway,Lipid biosynthetic process,0.0,0.0,0.0,0.3825258805866526
6,gpt-3.5-turbo-0125,KEGG_2021_Human,Adrenergic signaling in cardiomyocytes,Muscle contraction,0.0,0.0,0.0,0.2710085551984218
7,gpt-3.5-turbo-0125,KEGG_2021_Human,African trypanosomiasis,Inflammatory response,0.0,0.0,0.0,0.2139488872690333
8,gpt-3.5-turbo-0125,KEGG_2021_Human,"Alanine, aspartate and glutamate metabolism",lipid metabolism,0.2,0.0,0.285714,0.4034571251649207
9,gpt-3.5-turbo-0125,KEGG_2021_Human,Alcoholism,Chromatin remodeling,0.0,0.0,0.0,0.172209083289116
