In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import time
import pandas as pd
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
# OpenAI clients shared by the cells below:
#   client  -- synchronous client, passed to llm2geneset.get_embeddings
#   aclient -- asynchronous client, passed to the awaited llm2geneset.gsai call
client = openai.Client()
aclient = openai.AsyncClient()

In [2]:
# Shared ROUGE scorer: ROUGE-1, ROUGE-2 and ROUGE-L, with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    rouge_types=['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)


In [3]:
# Sanity check of the scorer on one hand-picked pair:
# `ref` is the reference (target) text, `name` is the candidate (prediction).
ref, name = 'Glutathione Metabolism', 'Glutathione metabolism and antioxidant defense'
scorer.score(target=ref, prediction=name)

{'rouge1': Score(precision=0.4, recall=1.0, fmeasure=0.5714285714285715),
 'rouge2': Score(precision=0.25, recall=1.0, fmeasure=0.4),
 'rougeL': Score(precision=0.4, recall=1.0, fmeasure=0.5714285714285715)}

In [11]:
models = ["gpt-3.5-turbo-0125"]
lib_names = ["WikiPathway_2023_Human",
           "Reactome_2022", 
          "WikiPathway_2023_Human"]


ouput = []
for model in models:    
    for lib_name in lib_names:
        print(lib_name)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
        
        test_sets = gen_res["curated_genesets"]
        test_descr = gen_res["descr_cleaned"]
        gt_embs = llm2geneset.get_embeddings(client, test_descr)

        # use GSAI to generate geneset name
        gsai_res = await llm2geneset.gsai(aclient, test_sets, model="gpt-4o", n_retry=1)
        gsai_names = [i['name'] for i in gsai_res]
        gsai_name_embs = llm2geneset.get_embeddings(client, gsai_names)
    

        for i, ref in enumerate(test_descr):
            try:
                gt_emb = [gt_embs[i]]
                gsai_name = gsai_names[i]
                gsai_name_emb = [gsai_name_embs[i]]
                scores = scorer.score(ref, gsai_name)
                gsai_rouge1= scores['rouge1'].recall
                gsai_rouge2= scores['rouge2'].recall
                gsai_rougeL= scores['rougeL'].fmeasure      
                gsai_csim = cosine_similarity(gt_emb,gsai_name_emb).squeeze()
                
                x={"library":lib_name,
                   "gt_name":ref,
                   "gsai_name":gsai_name,
                   "gsai_ROUGE1":gsai_rouge1,
                   "gsai_ROUGE2":gsai_rouge2,   
                   "gsai_ROUGEL":gsai_rougeL,                  
                   "gsai_csim":gsai_csim}    
    
                ouput.append(x)
            except:
                print("failed: ", ref)
                print("gsai_name: ", gsai_name)
                x={"library":lib_name,
                   "gt_name":ref,
                   "gsai_name":None,
                   "gsai_ROUGE1":None,
                   "gsai_ROUGE2":None,   
                   "gsai_ROUGEL":None,                  
                   "gsai_csim":None}    
    
                ouput.append(x)
        


WikiPathway_2023_Human


100%|███████████████████████████████████████████| 801/801 [00:38<00:00, 20.86it/s]


failed:  ERK Pathway In Huntington 39 S Disease
gsai_name:  None
failed:  Embryonic Stem Cell Pluripotency Pathways
gsai_name:  None
failed:  NRP1 Triggered Signaling Pathways In Pancreatic Cancer
gsai_name:  None
Reactome_2022


100%|█████████████████████████████████████████| 1818/1818 [00:52<00:00, 34.90it/s]


failed:  Aberrant Regulation Of Mitotic G1/S Transition In Cancer Due To RB1 Defects
gsai_name:  None
failed:  Downregulation Of SMAD2/3:SMAD4 Transcriptional Activity
gsai_name:  None
failed:  Metabolism Of Amino Acids And Derivatives
gsai_name:  None
WikiPathway_2023_Human


100%|███████████████████████████████████████████| 801/801 [00:28<00:00, 28.20it/s]


failed:  Glycosphingolipid Metabolism
gsai_name:  None


In [12]:
# Collect the per-gene-set records into a table and save it as a
# tab-separated file without the index column.
df = pd.DataFrame(data=ouput)
df.to_csv(
    path_or_buf="gsai_outputs.tsv",
    sep="\t",
    index=False,
)

In [13]:
# Preview the first five result rows.
df.head(n=5)

Unnamed: 0,library,gt_name,gsai_name,gsai_ROUGE1,gsai_ROUGE2,gsai_ROUGEL,gsai_csim
0,WikiPathway_2023_Human,Glutathione Metabolism,Glutathione metabolism and antioxidant defense,1.0,1.0,0.571429,0.8613841252902132
1,WikiPathway_2023_Human,Alanine And Aspartate Metabolism,Amino Acid and Neurotransmitter Metabolism,0.5,0.0,0.444444,0.5751021054296209
2,WikiPathway_2023_Human,Translation Factors,Protein synthesis regulation and initiation,0.0,0.0,0.0,0.4349091501098792
3,WikiPathway_2023_Human,Electron Transport Chain OXPHOS System In Mito...,Mitochondrial Electron Transport Chain and Oxi...,0.428571,0.333333,0.428571,0.7197051650328561
4,WikiPathway_2023_Human,GPCRs Other,G-protein coupled receptor (GPCR) signaling an...,0.5,0.0,0.181818,0.6483530566924421
