In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import pandas as pd
import random
from itertools import combinations
from rouge_score import rouge_scorer
import numpy as np
from statsmodels.stats.multitest import multipletests

# Two OpenAI handles are kept side by side: the blocking client is passed to the
# embedding calls, the async client to the awaited benchmark calls further down.
client = openai.Client()
aclient = openai.AsyncClient()

In [2]:
# Shared ROUGE scorer: unigram and bigram overlap, with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    rouge_types=['rouge1', 'rouge2'],
    use_stemmer=True,
)

In [3]:
def sample_pairs(integer_list, num_pairs):
    """Randomly pick distinct unordered pairs of items from ``integer_list``.

    All 2-combinations are enumerated first (in ``itertools.combinations``
    order) and then sampled without replacement using the global ``random``
    state, so seeding ``random`` beforehand makes the result reproducible.

    Args:
        integer_list: iterable of items to pair up (e.g. a ``range`` of indices).
        num_pairs: maximum number of pairs to return.

    Returns:
        A list of 2-tuples. Its length is ``num_pairs``, or the total number
        of possible pairs when fewer than ``num_pairs`` exist.
    """
    candidates = list(combinations(integer_list, 2))
    n_draw = min(num_pairs, len(candidates))
    return random.sample(candidates, n_draw)

In [4]:
# Per-library benchmark tables comparing GSAI and llm2geneset naming results.
file_names = [
    "gsai_vs_llm2geneset_outputs_KEGG_2021_Human.tsv",
    "gsai_vs_llm2geneset_outputs_Reactome_2022.tsv",
    "gsai_vs_llm2geneset_outputs_WikiPathway_2023_Human.tsv",
]

# Read each tab-separated table and stack them under a fresh 0..n-1 index.
dfs = [pd.read_csv("outputs/" + tsv_name, sep='\t') for tsv_name in file_names]
combined = pd.concat(dfs, ignore_index=True)

# One view per naming method; rows keep their position in ``combined``.
gsai_out = combined.loc[combined["method"].eq("GSAI")]
llm2geneset_out = combined.loc[combined["method"].eq("llm2geneset")]

In [5]:
lib_names = ["KEGG_2021_Human", 
             "Reactome_2022", 
             "WikiPathway_2023_Human"]
models = ["gpt-4o-mini-2024-07-18", "gpt-3.5-turbo-0125", "gpt-4o-2024-08-06"]
random.seed(30)
output = []
for model in models:
    print(model)
    for lib_name in lib_names:
        print(lib_name)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        # Limit to gene sets easily identified alone.
        gsai_cur = gsai_out[(gsai_out["model"] == model) & (gsai_out["library"] == lib_name)]
        gsai_cur = gsai_cur[gsai_cur["csim"] >= 0.7]
        llm2geneset_cur = llm2geneset_out[(llm2geneset_out["model"] == model) & (llm2geneset_out["library"] == lib_name)]
        llm2geneset_cur = llm2geneset_cur[llm2geneset_cur["csim"] >= 0.7]
        llm2geneset_gt = set(llm2geneset_cur["gt_name"].to_list())
        gsai_gt = set(gsai_cur["gt_name"].to_list())
        selected = gsai_gt.intersection(llm2geneset_gt)

        # Extract selected gene sets.
        sel_idx = [i for i, descr in enumerate(gen_res["descr"]) if descr in selected]
        descr_sel = [ gen_res["descr"][i] for i in sel_idx ]
        descr_cleaned_sel = [ gen_res["descr_cleaned"][i] for i in sel_idx ]
        curated_sel = [ gen_res["curated_genesets"][i] for i in sel_idx ]

        # Construct pairs.
        pairs = sample_pairs(range(len(descr_sel)), 50)
        test_descr = []
        test_descr_cleaned = []
        test_genes = []
        for p in pairs:
            merged_genes = curated_sel[p[0]] + curated_sel[p[1]]
            test_genes.append(merged_genes)
            random.shuffle(merged_genes)
            test_descr.append(descr_sel[p[0]] + ", " + descr_sel[p[1]])
            test_descr_cleaned.append(descr_cleaned_sel[p[0]] + ", " + descr_cleaned_sel[p[1]])

        # Embed ground truth descriptions.
        gt_emb = llm2geneset.get_embeddings(client, test_descr_cleaned)
        
        # use GSAI to generate geneset name and embed
        gsai_res = await llm2geneset.gsai_bench(aclient, test_genes, model=model, n_retry=3)
        gsai_name = [i['name'] for i in gsai_res]
        gsai_name_emb = llm2geneset.get_embeddings(client, gsai_name)

        # Evaluate GSAI results.
        for idx in range(len(test_descr)):
            scores = scorer.score(test_descr_cleaned[idx], gsai_name[idx])
            gsai_rouge1= scores['rouge1'].recall
            gsai_rouge2= scores['rouge2'].recall
            gsai_csim = np.dot(gt_emb[idx],gsai_name_emb[idx])
        
            x={"model":model,
               "library":lib_name,
               "gt_name":test_descr[idx],
               "gt_name_clean":test_descr_cleaned[idx],
               "name":gsai_name[idx],
               "ROUGE1":gsai_rouge1,
               "ROUGE2":gsai_rouge2,   
               "csim":gsai_csim,
               "method":"GSAI"}    
            
            output.append(x)

        # use LLM2geneset to generate geneset name and embed
        llm2geneset_res = await llm2geneset.gs_proposal_bench(aclient, test_genes, model=model, n_retry=3)
        llm2geneset_name = []
        def res2name(res):
            df = res["ora_results"]
            _, pvals_corrected, _, _ = multipletests(df["p_val"], method='bonferroni')
            df["p_adj"] = pvals_corrected
            df = df[df["p_adj"] < 0.01]
            comb_name = ", ".join(df["set_descr"].to_list())
            if comb_name == "":
                comb_name = "None found."
            return comb_name
            
        llm2geneset_name = list(map(res2name, llm2geneset_res))
        llm2geneset_name_emb = llm2geneset.get_embeddings(client, llm2geneset_name)
        
        for idx in range(len(test_descr)):
            # evaluate llm2geneset proposed gene set name
            scores = scorer.score(test_descr_cleaned[idx], llm2geneset_name[idx])
            llm2geneset_rouge1= scores['rouge1'].recall
            llm2geneset_rouge2= scores['rouge2'].recall
            llm2geneset_csim = np.dot(gt_emb[idx],llm2geneset_name_emb[idx])

            x={"model":model,
               "library":lib_name,
               "gt_name":test_descr[idx],
               "gt_name_clean":test_descr_cleaned[idx],
               "name":llm2geneset_name[idx],
               "ROUGE1":llm2geneset_rouge1,
               "ROUGE2":llm2geneset_rouge2,   
               "csim":llm2geneset_csim,
               "method":"llm2geneset"}    
        
            output.append(x)

gpt-4o-mini-2024-07-18
KEGG_2021_Human


 72%|██████████████████████████████████████████████████████████████████████████████████████████▋                                   | 36/50 [00:08<00:01,  8.16it/s]

retrying
conf is none
Write a critical analysis of the biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data.  After completing your analysis, propose a brief and
detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence score to the process name you selected.  This score should
follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence, while 1.00 reflects
the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities within
the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that
participate in the identified biological process. For instance, if you select "Ribosome biogenesis" as the process name but
only a few genes in the system contribute

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:18<00:00,  2.76it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:18<00:00,  2.70it/s]


Reactome_2022


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:10<00:00,  4.59it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:28<00:00,  1.77it/s]


WikiPathway_2023_Human


 34%|██████████████████████████████████████████▊                                                                                   | 17/50 [00:06<00:04,  8.01it/s]

retrying
conf is none
Write a critical analysis of the biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data.  After completing your analysis, propose a brief and
detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence score to the process name you selected.  This score should
follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence, while 1.00 reflects
the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities within
the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that
participate in the identified biological process. For instance, if you select "Ribosome biogenesis" as the process name but
only a few genes in the system contribute

 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 46/50 [00:10<00:00,  8.04it/s]

retrying
conf is none
Write a critical analysis of the biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data.  After completing your analysis, propose a brief and
detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence score to the process name you selected.  This score should
follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence, while 1.00 reflects
the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities within
the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that
participate in the identified biological process. For instance, if you select "Ribosome biogenesis" as the process name but
only a few genes in the system contribute

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:14<00:00,  3.35it/s]
  0%|                                                                                                                                       | 0/50 [00:00<?, ?it/s]

retrying
List 5 biological pathways, biological processes, or cellular components that contain the following genes """RAD9A,GABBR2,CCNB3,RB1,BRCA1,RRM2B,ABAT,ALDH9A1,CDK6,CDK1,GABRB2,CCNB2,RPA2,RAD51,RAD52,GADD45G,CDK4,E2F1,CASP9,GABRD,CDC25A,GAD1,CHEK1,ATR,GABBR1,CCNE1,CCND2,GABRG3,CHEK2,AP2B1,TLK2,GABRG2,CDKN1A,MRE11,H2AX,AP2S1,BID,GABRA2,BBC3,CDK2,GABRA6,TNFRSF10B,FANCD2,AP2A1,CDC25C,RAD50,GABRA4,PRKDC,CDKN1B,CDK5,HUS1B,GADD45B,GABRP,FAS,MDM2,CASP8,SMC1A,SFN,SESN1,GABRE,ATM,AP2A2,GADD45A,GABRA5,RFC1,DDB2,MYC,PML,CYCS,PIDD1,TP53,AKT1,CREB1,GABRQ,CASP3,SLC6A1,GABRA3,GABRG1,AP2M1,GABRB3,RAD17,GAD2,APAF1,TP53AIP1,ABL1,CCND1,TLK1,ATRIP,SLC6A11,RAD1,CCND3,BAX,CCNB1,CCNE2,GPHN,NBN,GABRA1,GABRB1,SLC32A1,PMAIP1""" with high confidence. Be as specific as possible. List non-overlapping pathways, processes, or components. Do not include the gene names in the outputs. Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
  

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:17<00:00,  2.79it/s]


gpt-3.5-turbo-0125
KEGG_2021_Human


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:15<00:00,  3.31it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:16<00:00,  3.00it/s]


Reactome_2022


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:08<00:00,  5.94it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:58<00:00,  1.16s/it]


WikiPathway_2023_Human


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:08<00:00,  6.07it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:24<00:00,  2.07it/s]


gpt-4o-2024-08-06
KEGG_2021_Human


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:19<00:00,  2.53it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:20<00:00,  2.45it/s]


Reactome_2022


 46%|█████████████████████████████████████████████████████████▉                                                                    | 23/50 [00:11<00:03,  7.45it/s]

retrying
conf is none
Write a critical analysis of the biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data.  After completing your analysis, propose a brief and
detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence score to the process name you selected.  This score should
follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence, while 1.00 reflects
the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities within
the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that
participate in the identified biological process. For instance, if you select "Ribosome biogenesis" as the process name but
only a few genes in the system contribute

 56%|██████████████████████████████████████████████████████████████████████▌                                                       | 28/50 [00:11<00:01, 11.70it/s]

retrying
conf is none
Write a critical analysis of the biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data.  After completing your analysis, propose a brief and
detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence score to the process name you selected.  This score should
follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence, while 1.00 reflects
the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities within
the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that
participate in the identified biological process. For instance, if you select "Ribosome biogenesis" as the process name but
only a few genes in the system contribute

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:20<00:00,  2.40it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:24<00:00,  2.07it/s]


WikiPathway_2023_Human


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:28<00:00,  1.77it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:16<00:00,  3.11it/s]


In [6]:
# Turn the accumulated per-pair score records into a single results table.
df = pd.DataFrame(data=output)

In [7]:
# Preview the first five result rows.
df.head(n=5)

Unnamed: 0,model,library,gt_name,gt_name_clean,name,ROUGE1,ROUGE2,csim,method
0,gpt-4o-mini-2024-07-18,KEGG_2021_Human,"Glycolysis / Gluconeogenesis, Wnt signaling pa...","Glycolysis / Gluconeogenesis, Wnt signaling pa...",Wnt signaling pathway regulation and development,0.6,0.5,0.559984,GSAI
1,gpt-4o-mini-2024-07-18,KEGG_2021_Human,"Amino sugar and nucleotide sugar metabolism, U...","Amino sugar and nucleotide sugar metabolism, U...",Ubiquitin-mediated proteolysis and protein mod...,0.444444,0.25,0.624214,GSAI
2,gpt-4o-mini-2024-07-18,KEGG_2021_Human,"Fatty acid elongation, Pentose phosphate pathway","Fatty acid elongation, Pentose phosphate pathway",Lipid and carbohydrate metabolism integration,0.0,0.0,0.47627,GSAI
3,gpt-4o-mini-2024-07-18,KEGG_2021_Human,"Glycerophospholipid metabolism, Phosphatidylin...","Glycerophospholipid metabolism, Phosphatidylin...",Phosphoinositide metabolism and signaling,0.4,0.0,0.790195,GSAI
4,gpt-4o-mini-2024-07-18,KEGG_2021_Human,"B cell receptor signaling pathway, Oxidative p...","B cell receptor signaling pathway, Oxidative p...",Mitochondrial Bioenergetics and Immune Signaling,0.142857,0.0,0.56413,GSAI


In [9]:
# Persist the results as a tab-separated file, without the row index.
df.to_csv(
    path_or_buf='outputs/mix_genesets_outputs.tsv',
    sep="\t",
    index=False,
)