In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import time
import pandas as pd
import numpy as np
from rouge_score import rouge_scorer

# Async client: passed to the awaited llm2geneset.gsai / llm2geneset.gs_proposal calls below.
# NOTE(review): no credentials are passed here; presumably the openai library picks them up
# from the environment (e.g. OPENAI_API_KEY) — confirm before running on a fresh machine.
aclient = openai.AsyncClient()
# Sync client: passed to llm2geneset.get_embeddings below.
client = openai.Client()

In [2]:
# Shared ROUGE scorer for the evaluation loop: computes the rouge1, rouge2 and rougeL
# variants with stemming enabled. Only the `.recall` field of each score is used later.
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)

In [7]:
models = ["gpt-3.5-turbo-0125","gpt-4o-2024-05-13"]
lib_names = ["KEGG_2021_Human",
             "Reactome_2022", 
             "WikiPathway_2023_Human"]
models = ["gpt-3.5-turbo-0125"]
lib_names = ["WikiPathway_2023_Human"]

output = []
for model in models:
    print(model)
    for lib_name in lib_names:
        print(lib_name)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
        
        test_sets = gen_res["curated_genesets"]
        test_descr = gen_res["descr"]
        test_descr_cleaned = gen_res["descr_cleaned"]
        
        gt_embs = llm2geneset.get_embeddings(client, test_descr_cleaned)

        for method in ["llm2geneset", "GSAI"]:
            if method == "GSAI":
                # use GSAI to generate geneset name
                gsai_res = await llm2geneset.gsai(aclient, test_sets, model=model, n_retry=3)
                names = [i['name'] for i in gsai_res]
                in_toks = [i["in_toks"] for i in gsai_res]
                out_toks = [i["out_toks"] for i in gsai_res]
            elif method == "llm2geneset":
                llm2geneset_res = await llm2geneset.gs_proposal(aclient, test_sets, model=model)
                def res2name(res):
                    df = res["ora_results"]
                    min_p_val_row = df.loc[df['p_val'].idxmin()]
                    return min_p_val_row['bio_process']
                
                names = list(map(res2name, llm2geneset_res))
                in_toks = [i["tot_in_toks"] for i in llm2geneset_res]
                out_toks = [i["tot_out_toks"] for i in llm2geneset_res]
                #most_overrepresented = [min(gene_set, key=lambda x: x[1]) for gene_set in res]
                #names = [i[0] for i in most_overrepresented]
    
            name_embs = llm2geneset.get_embeddings(client, names)
            
            for i, ref in enumerate(test_descr_cleaned):
                scores = scorer.score(ref, names[i])
                rouge1 = scores['rouge1'].recall
                rouge2 = scores['rouge2'].recall
                rougeL = scores['rougeL'].recall      
                csim = np.dot(gt_embs[i], name_embs[i])
                    
                x={"model":model,
                   "library":lib_name,
                   "gt_name":test_descr[i],
                   "gt_name_clean":ref,
                   "name":names[i],
                   "ROUGE1":rouge1,
                   "ROUGE2":rouge2,   
                   "ROUGEL":rougeL,                  
                   "csim":csim,
                   "method": method,
                   "in_toks":in_toks[i],
                   "out_toks": out_toks[i]                   
                  }    
            
                output.append(x)

gpt-3.5-turbo-0125
WikiPathway_2023_Human


  0%|                                                                                                                              | 0/801 [00:00<?, ?it/s]

retrying
List about 5 biological pathways, biological processes, or cellular components that contain the following genes """ABCC4,NCOA2,NCOA3,ABCC2,ABCC3,HSP90AA1,NCOA1,UGT1A1,CYP4F12,CYP2C9,CYP2A6,DNAJC7,PSMC5,GSTA2,ABCB1,SRC,NR1I2,CYP3A4,CYP3A5,CYP2C19,CYP3A7,FOXO1,SLCO1B1,SULT2A1,RXRA,SRPX2,CYP2B6,NRIP1,UGT1A6,UGT1A4,PPARGC1A,UGT1A9""" with high confidence. Be as specific as possible. Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "p": {
                "type": "string",
            },
        },
        "required": ["p"]
    }
}
```
Example output will look like the following:
```json
[{"p":"bp_or_pathway1"},
 {"p":"bp_or_pathway2"},
 {"p":"bp_or_pathway3"},
 {"p":"bp_or_pathway4"},
```
The element `p` designates a pathway, biological process or cellular component. Place the output in a JSON code block. Do not add any comments in the JSON code block.

'p'
```json
[{"bp_or_pathway1":"Drug met

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [05:59<00:00,  2.23it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:39<00:00, 20.32it/s]


In [8]:
# Turn the list of per-gene-set record dicts accumulated in `output` into one table
# (one row per record, one column per dict key).
df = pd.DataFrame(output)
# Persist the table as tab-separated text, omitting the DataFrame index column.
df.to_csv("gsai_vs_llm2geneset_outputs.tsv", sep="\t", index=False)

In [9]:
# Estimated API spend in USD for every run recorded in `df`.
# Rates are USD per 1M tokens as (input, output). 0.5 / 1.5 are the figures this cell
# originally hard-coded for all rows; presumably the gpt-3.5-turbo-0125 list price —
# TODO confirm against current pricing, and add an entry for each additional model.
USD_PER_MTOK = {"gpt-3.5-turbo-0125": (0.5, 1.5)}

unpriced = sorted(set(df["model"]) - set(USD_PER_MTOK))
if unpriced:
    # Fail loudly instead of silently pricing other models at the rate above.
    raise KeyError(f"no token price configured for model(s): {unpriced}")

total_usd = 0.0
for model_name, runs in df.groupby("model"):
    usd_in, usd_out = USD_PER_MTOK[model_name]
    total_usd += runs["out_toks"].sum() / 1e6 * usd_out + runs["in_toks"].sum() / 1e6 * usd_in
total_usd

3.7615735000000003