In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import time
import pandas as pd
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.stats import hypergeom
from asynciolimiter import StrictLimiter
import asyncio
import tqdm.asyncio
# Synchronous client: passed to llm2geneset.get_embeddings in the evaluation cell.
client = openai.Client()
# Asynchronous client: passed to the awaited llm2geneset.gs_proposal call.
aclient = openai.AsyncClient()

In [2]:
# ROUGE variants computed for every (reference name, generated name) pair;
# stemming is enabled so inflected forms of the same word can match.
rouge_variants = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)

In [3]:
# Chat models to evaluate; each name is also the sub-directory of
# "libs_human/" that the evaluation cell reads its JSON files from.
models = [
    "gpt-3.5-turbo-0125",
    "gpt-4o-2024-05-13",
]

# Gene-set libraries to evaluate; each name is the stem of a
# "<lib_name>.json" file inside the per-model directory.
lib_names = [
    "KEGG_2021_Human",
    "Reactome_2022",
    "WikiPathway_2023_Human",
]

In [4]:
output = []
for model in models:   
    print(model)
    for lib_name in lib_names:
        print(lib_name)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        test_sets = gen_res["curated_genesets"]
        test_descr = gen_res["descr_cleaned"]
        gt_embs = llm2geneset.get_embeddings(client, test_descr)

        # use llm2geneset to generate geneset name
        res = await llm2geneset.gs_proposal(aclient, test_sets, model=model)
        most_overrepresented = [min(gene_set, key=lambda x: x[1]) for gene_set in res]
        names = [i[0] for i in most_overrepresented]
        name_embs = llm2geneset.get_embeddings(client, names)
        
        for i, ref in enumerate(test_descr):
            gt_emb = [gt_embs[i]]
            name = names[i]
            name_emb = [name_embs[i]]
            scores = scorer.score(ref, name)
            rouge1= scores['rouge1'].recall
            rouge2= scores['rouge2'].recall
            rougeL= scores['rougeL'].fmeasure      
            csim = cosine_similarity(gt_emb,name_emb).squeeze()
                
            x={"model":model,
               "library":lib_name,
               "gt_name":ref,
               "llm2geneset_name":name,
               "llm2geneset_ROUGE1":rouge1,
               "llm2geneset_ROUGE2":rouge2,   
               "llm2geneset_ROUGEL":rougeL,                  
               "llm2geneset_csim":csim}    
            output.append(x)

gpt-3.5-turbo-0125
KEGG_2021_Human


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [00:48<00:00,  6.57it/s]


Reactome_2022


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 1813/1818 [01:16<00:06,  1.27s/it]

retrying
List about 5 biological pathways, biological processes, or cellular components that contain the following genes """SLC12A3,SLC12A4,SLC12A5,SLC12A6,SLC12A1,SLC12A2,CTNS,SLC9A3,SLC5A8,SLC9A4,SLC9A5,SLC9A7,SLC9A8,SLC9A9,CALM1,AHCYL2,SLC1A1,SLC1A2,SLC1A3,SLC1A4,SLC1A5,SLC1A6,SLC1A7,SLC5A5,SLC9A1,SLC9A2,SLC26A11,SLC12A7,SLC15A1,SLC34A3,SLC15A2,SLC38A2,SLC15A3,SLC38A1,SLC6A19,SLC6A18,SLC34A2,SLC34A1,SLC6A15,SLC6A14,SLC6A12,SLC16A10,SLC6A6,SLC26A1,SLC26A3,SLC26A2,SLC6A20,SLC15A4,SLC38A4,SLC38A3,SLC38A5,SLC4A10,SLC7A5,SLC7A6,SLC7A7,SLC7A8,SLC7A9,SLC26A9,SLC26A4,SLC26A7,SLC26A6,SLC3A1,SLC3A2,SLC7A1,SLC7A11,SLC7A10,SLC7A2,SLC7A3,SLC36A2,SLC36A1,SLC36A4,SLC17A1,SLC32A1,SLC8B1,SLC4A8,SLC4A9,SLC25A18,SLC25A10,SLC24A3,SLC24A5,SLC24A4,SLC20A2,SLC24A1,SLC43A1,SLC43A2,SLC5A12,SRI,SLC4A1,SLC20A1,SLC4A2,SLC4A3,SLC4A4,SLC4A5,SLC8A1,SLC8A2,SLC4A7,SLC8A3,SLC25A29,SLC17A6,SLC17A7,SLC17A8,SLC25A22,SLC25A26,SLC17A5""" with high confidence. Be as specific as possible. Use the following JSON schema:
```

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [01:36<00:00, 18.92it/s]


WikiPathway_2023_Human


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:53<00:00, 15.07it/s]


gpt-4o-2024-05-13
KEGG_2021_Human


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [01:08<00:00,  4.68it/s]


Reactome_2022


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [02:29<00:00, 12.19it/s]


WikiPathway_2023_Human


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [01:17<00:00, 10.37it/s]


In [5]:
# Gather the per-gene-set score records into one table and save it as a
# tab-separated file without the index column.
df = pd.DataFrame(data=output)
df.to_csv(path_or_buf="llm2geneset_outputs.tsv", index=False, sep="\t")
# Bare last expression so the notebook renders the table.
df

Unnamed: 0,model,library,gt_name,llm2geneset_name,llm2geneset_ROUGE1,llm2geneset_ROUGE2,llm2geneset_ROUGEL,llm2geneset_csim
0,gpt-3.5-turbo-0125,KEGG_2021_Human,ABC transporters,ATP binding cassette (ABC) transporter complex,1.000000,1.000000,0.500000,0.6809030625974123
1,gpt-3.5-turbo-0125,KEGG_2021_Human,AGE-RAGE signaling pathway in diabetic complic...,Inflammatory response,0.000000,0.000000,0.000000,0.25340540819010726
2,gpt-3.5-turbo-0125,KEGG_2021_Human,AMPK signaling pathway,mTOR signaling pathway,0.666667,0.500000,0.666667,0.6977507297222042
3,gpt-3.5-turbo-0125,KEGG_2021_Human,Acute myeloid leukemia,PI3K-Akt signaling pathway,0.000000,0.000000,0.000000,0.2536233394545665
4,gpt-3.5-turbo-0125,KEGG_2021_Human,Adherens junction,Cell-cell adherens junction,1.000000,1.000000,0.666667,0.8581968492841091
...,...,...,...,...,...,...,...,...
5873,gpt-4o-2024-05-13,WikiPathway_2023_Human,Serotonin Receptor 4 6 7 And NR3C Signaling,MAPK signaling pathway,0.125000,0.000000,0.181818,0.39546705427249884
5874,gpt-4o-2024-05-13,WikiPathway_2023_Human,Toll Like Receptor Signaling Pathway,Toll-like receptor signaling pathway,1.000000,1.000000,1.000000,0.9135843672254682
5875,gpt-4o-2024-05-13,WikiPathway_2023_Human,TCA Cycle Aka Krebs Or Citric Acid Cycle,Tricarboxylic Acid Cycle (TCA Cycle),0.500000,0.285714,0.307692,0.80311084790496
5876,gpt-4o-2024-05-13,WikiPathway_2023_Human,Nucleotide GPCRs,Adenosine Receptor Signaling,0.000000,0.000000,0.000000,0.45561326162562077
