In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import time
import pandas as pd
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
# Two OpenAI handles are kept side by side: the synchronous one is passed to
# llm2geneset.get_embeddings, the asynchronous one to the awaited llm2geneset.gsai call.
client = openai.Client()
aclient = openai.AsyncClient()

In [2]:
# Single shared ROUGE scorer (ROUGE-1, ROUGE-2 and ROUGE-L) with stemming turned on,
# reused by every comparison below.
scorer = rouge_scorer.RougeScorer(
    rouge_types=["rouge1", "rouge2", "rougeL"],
    use_stemmer=True,
)


In [3]:
# Sanity check on one hand-written pair: `ref` plays the role of the reference
# (target) name and `name` is the candidate scored against it.
name = 'Glutathione metabolism and antioxidant defense'
ref = 'Glutathione Metabolism'
scorer.score(target=ref, prediction=name)

{'rouge1': Score(precision=0.4, recall=1.0, fmeasure=0.5714285714285715),
 'rouge2': Score(precision=0.25, recall=1.0, fmeasure=0.4),
 'rougeL': Score(precision=0.4, recall=1.0, fmeasure=0.5714285714285715)}

In [5]:
models = ["gpt-3.5-turbo-0125","gpt-4o-2024-05-13"]
lib_names = ["KEGG_2021_Human",
             "Reactome_2022", 
             "WikiPathway_2023_Human"]

ouput = []
for model in models:    
    for lib_name in lib_names:
        print(lib_name)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
        
        test_sets = gen_res["curated_genesets"]
        test_descr = gen_res["descr_cleaned"]
        gt_embs = llm2geneset.get_embeddings(client, test_descr)

        # use GSAI to generate geneset name
        gsai_res = await llm2geneset.gsai(aclient, test_sets, model=model, n_retry=1)
        gsai_names = [i['name'] for i in gsai_res]
        gsai_name_embs = llm2geneset.get_embeddings(client, gsai_names)
    

        for i, ref in enumerate(test_descr):
            try:
                gt_emb = [gt_embs[i]]
                gsai_name = gsai_names[i]
                gsai_name_emb = [gsai_name_embs[i]]
                scores = scorer.score(ref, gsai_name)
                gsai_rouge1= scores['rouge1'].recall
                gsai_rouge2= scores['rouge2'].recall
                gsai_rougeL= scores['rougeL'].fmeasure      
                gsai_csim = cosine_similarity(gt_emb,gsai_name_emb).squeeze()
                    
                x={"model":model,
                   "library":lib_name,
                   "gt_name":ref,
                   "gsai_name":gsai_name,
                   "gsai_ROUGE1":gsai_rouge1,
                   "gsai_ROUGE2":gsai_rouge2,   
                   "gsai_ROUGEL":gsai_rougeL,                  
                   "gsai_csim":gsai_csim}    
            except:
                # occasionally gsai will fail to output a gene set name
                print("error! Ref name: ", ref, " gsai name: ",gsai_name)
                x={"model":model,
                   "library":lib_name,
                   "gt_name":ref,
                   "gsai_name":gsai_name,
                   "gsai_ROUGE1":None,
                   "gsai_ROUGE2":None,   
                   "gsai_ROUGEL":None,                  
                   "gsai_csim":None}    
        
            ouput.append(x)

KEGG_2021_Human


100%|███████████████████████████████████████████| 320/320 [00:08<00:00, 36.30it/s]


Reactome_2022


100%|█████████████████████████████████████████| 1818/1818 [00:21<00:00, 83.16it/s]


WikiPathway_2023_Human


100%|███████████████████████████████████████████| 801/801 [00:14<00:00, 56.97it/s]


KEGG_2021_Human


100%|███████████████████████████████████████████| 320/320 [00:24<00:00, 12.94it/s]


Reactome_2022


100%|█████████████████████████████████████████| 1818/1818 [00:50<00:00, 36.29it/s]


error! Ref name:  Nucleotide Catabolism  gsai name:  None
WikiPathway_2023_Human


100%|███████████████████████████████████████████| 801/801 [00:34<00:00, 23.41it/s]


In [6]:
# Gather the per-gene-set records into a single table and write it out as a
# tab-separated file, leaving the row index out of the export.
df = pd.DataFrame(data=ouput)
df.to_csv(
    path_or_buf="gsai_outputs.tsv",
    index=False,
    sep="\t",
)

In [7]:
# Preview the first five rows of the results table.
df.head(n=5)

Unnamed: 0,model,library,gt_name,gsai_name,gsai_ROUGE1,gsai_ROUGE2,gsai_ROUGEL,gsai_csim
0,gpt-3.5-turbo-0125,KEGG_2021_Human,ABC transporters,ABC Transporter-Mediated Cellular Efflux and D...,1.0,1.0,0.4,0.6801285554874819
1,gpt-3.5-turbo-0125,KEGG_2021_Human,AGE-RAGE signaling pathway in diabetic complic...,Cellular stress response and inflammatory sign...,0.142857,0.0,0.153846,0.4395502833424629
2,gpt-3.5-turbo-0125,KEGG_2021_Human,AMPK signaling pathway,Glucose metabolism and insulin signaling,0.333333,0.0,0.25,0.5212444397848753
3,gpt-3.5-turbo-0125,KEGG_2021_Human,Acute myeloid leukemia,Cell Signaling and Transcriptional Regulation ...,0.0,0.0,0.0,0.2596153226759489
4,gpt-3.5-turbo-0125,KEGG_2021_Human,Adherens junction,Regulation of Cell Adhesion and Signaling Path...,0.0,0.0,0.0,0.4732685736420919
