In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import time
import pandas as pd
import numpy as np
from rouge_score import rouge_scorer
from statsmodels.stats.multitest import multipletests

aclient = openai.AsyncClient()
client = openai.Client()

In [2]:
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)

In [11]:
models = ["gpt-3.5-turbo-0125","gpt-4o-2024-05-13"]
lib_names = ["KEGG_2021_Human",
             "Reactome_2022", 
             "WikiPathway_2023_Human"]
models = ["gpt-4o-2024-05-13"]
lib_names = ["KEGG_2021_Human"]

output = []
for model in models:
    print(model)
    for lib_name in lib_names:
        print(lib_name)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
        
        test_sets = gen_res["curated_genesets"]
        test_descr = gen_res["descr"]
        test_descr_cleaned = gen_res["descr_cleaned"]
        
        gt_embs = llm2geneset.get_embeddings(client, test_descr_cleaned)

        for method in ["llm2geneset", "GSAI"]:
            if method == "GSAI":
                # use GSAI to generate geneset name
                gsai_res = await llm2geneset.gsai(aclient, test_sets, model=model, n_retry=3)
                names = [i['name'] for i in gsai_res]
                in_toks = [i["in_toks"] for i in gsai_res]
                out_toks = [i["out_toks"] for i in gsai_res]
            elif method == "llm2geneset":
                llm2geneset_res = await llm2geneset.gs_proposal(aclient, test_sets, model=model)
                #def res2name(res):
                #    df = res["ora_results"]
                #    min_p_val_row = df.loc[df['p_val'].idxmin()]
                #    return min_p_val_row['bio_process']
                def res2name(res):
                    df = res["ora_results"]
                    _, pvals_corrected, _, _ = multipletests(df["p_val"], method='bonferroni')
                    df["p_adj"] = pvals_corrected
                    df = df[df["p_adj"] < 0.01]
                    comb_name = ", ".join(df["bio_process"].to_list())
                    if comb_name == "":
                        comb_name = "None found."
                    return comb_name                
                names = list(map(res2name, llm2geneset_res))
                in_toks = [i["tot_in_toks"] for i in llm2geneset_res]
                out_toks = [i["tot_out_toks"] for i in llm2geneset_res]
                #most_overrepresented = [min(gene_set, key=lambda x: x[1]) for gene_set in res]
                #names = [i[0] for i in most_overrepresented]
    
            name_embs = llm2geneset.get_embeddings(client, names)
            
            for i, ref in enumerate(test_descr_cleaned):
                scores = scorer.score(ref, names[i])
                rouge1 = scores['rouge1'].recall
                rouge2 = scores['rouge2'].recall
                rougeL = scores['rougeL'].recall      
                csim = np.dot(gt_embs[i], name_embs[i])
                    
                x={"model":model,
                   "library":lib_name,
                   "gt_name":test_descr[i],
                   "gt_name_clean":ref,
                   "name":names[i],
                   "ROUGE1":rouge1,
                   "ROUGE2":rouge2,   
                   "ROUGEL":rougeL,                  
                   "csim":csim,
                   "method": method,
                   "in_toks":in_toks[i],
                   "out_toks": out_toks[i]                   
                  }    
            
                output.append(x)

gpt-4o-2024-05-13
KEGG_2021_Human


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [00:36<00:00,  8.77it/s]
 43%|█████████████████████████████████████████████████▋                                                                  | 137/320 [00:07<00:02, 73.65it/s]

retrying
conf is none
Write a critical analysis of the biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data.  After completing your analysis, propose a brief and
detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence score to the process name you selected.  This score should
follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence, while 1.00 reflects
the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities within
the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that
participate in the identified biological process. For instance, if you select "Ribosome biogenesis" as the process name but
only a few genes in the system contribute

 48%|███████████████████████████████████████████████████████▍                                                            | 153/320 [00:08<00:01, 95.57it/s]

retrying
conf is none
Write a critical analysis of the biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data.  After completing your analysis, propose a brief and
detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence score to the process name you selected.  This score should
follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence, while 1.00 reflects
the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities within
the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that
participate in the identified biological process. For instance, if you select "Ribosome biogenesis" as the process name but
only a few genes in the system contribute

 66%|████████████████████████████████████████████████████████████████████████████▍                                       | 211/320 [00:09<00:01, 63.22it/s]

retrying
conf is none
Write a critical analysis of the biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data.  After completing your analysis, propose a brief and
detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence score to the process name you selected.  This score should
follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence, while 1.00 reflects
the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities within
the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that
participate in the identified biological process. For instance, if you select "Ribosome biogenesis" as the process name but
only a few genes in the system contribute

 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 294/320 [00:11<00:01, 20.40it/s]

retrying
conf is none
Write a critical analysis of the biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data.  After completing your analysis, propose a brief and
detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence score to the process name you selected.  This score should
follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence, while 1.00 reflects
the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities within
the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that
participate in the identified biological process. For instance, if you select "Ribosome biogenesis" as the process name but
only a few genes in the system contribute

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [08:06<00:00,  1.52s/it]


In [12]:
df = pd.DataFrame(output)
df.to_csv("gsai_vs_llm2geneset_outputs.tsv", sep="\t", index=False)

In [13]:
df["out_toks"].sum()/1e6 * 5 + df["in_toks"].sum()/1e6 * 15

18.057205