In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import pandas as pd
import random
from itertools import combinations
from rouge_score import rouge_scorer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from statsmodels.stats.multitest import multipletests

# Two OpenAI handles, both constructed without explicit arguments:
# a blocking one and an async one (the latter is the one passed to the awaited calls).
client = openai.Client()
aclient = openai.AsyncClient()

In [2]:
# Shared ROUGE scorer: unigram (rouge1) and bigram (rouge2) overlap, with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    rouge_types=['rouge1', 'rouge2'],
    use_stemmer=True,
)

In [3]:
def sample_pairs(integer_list, num_pairs):
    """Randomly pick unordered pairs of items from ``integer_list``.

    Every 2-element combination of ``integer_list`` (in input order) is
    enumerated, then ``num_pairs`` of them are drawn without replacement
    using the global ``random`` state. When fewer combinations exist than
    requested, all of them are returned (in random order).

    Returns a list of 2-tuples.
    """
    candidates = list(combinations(integer_list, 2))
    # Never ask random.sample for more items than the population holds.
    draw_count = num_pairs if num_pairs < len(candidates) else len(candidates)
    return random.sample(candidates, draw_count)

In [4]:
# Load the tab-separated results table and split its rows by the "method" column.
combined = pd.read_csv("gsai_vs_llm2geneset_outputs.tsv", sep="\t")
gsai_out = combined.loc[combined["method"].eq("GSAI")]
llm2geneset_out = combined.loc[combined["method"].eq("llm2geneset")]

In [5]:
models = ["gpt-3.5-turbo-0125","gpt-4o-2024-05-13"]
lib_names = ["KEGG_2021_Human",
             "Reactome_2022", 
             "WikiPathway_2023_Human"]
#models = ["gpt-3.5-turbo-0125"]
#lib_names = ["WikiPathway_2023_Human"]
random.seed(30)
output = []
for model in models:    
    for lib_name in lib_names:
        print(lib_name)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        # Limit to gene sets easily identified alone.
        gsai_cur = gsai_out[(gsai_out["model"] == model) & (gsai_out["library"] == lib_name)]
        gsai_cur = gsai_cur[gsai_cur["csim"] >= 0.75]
        llm2geneset_cur = llm2geneset_out[(llm2geneset_out["model"] == model) & (llm2geneset_out["library"] == lib_name)]
        llm2geneset_cur = llm2geneset_cur[llm2geneset_cur["csim"] >= 0.75]
        llm2geneset_gt = set(llm2geneset_cur["gt_name"].to_list())
        gsai_gt = set(gsai_cur["gt_name"].to_list())
        selected = gsai_gt.intersection(llm2geneset_gt)

        # Extract selected gene sets.
        sel_idx = [i for i, descr in enumerate(gen_res["descr"]) if descr in selected]
        descr_sel = [ gen_res["descr"][i] for i in sel_idx ]
        descr_cleaned_sel = [ gen_res["descr_cleaned"][i] for i in sel_idx ]
        curated_sel = [ gen_res["curated_genesets"][i] for i in sel_idx ]

        # Construct pairs.
        pairs = sample_pairs(range(len(descr_sel)), 50)
        test_descr = []
        test_descr_cleaned = []
        test_genes = []
        for p in pairs:
            merged_genes = curated_sel[p[0]] + curated_sel[p[1]]
            test_genes.append(merged_genes)
            random.shuffle(merged_genes)
            test_descr.append(descr_sel[p[0]] + ", " + descr_sel[p[1]])
            test_descr_cleaned.append(descr_cleaned_sel[p[0]] + ", " + descr_cleaned_sel[p[1]])

        # Embed ground truth descriptions.
        gt_emb = llm2geneset.get_embeddings(client, test_descr_cleaned)
        
        # use GSAI to generate geneset name and embed
        gsai_res = await llm2geneset.gsai(aclient, test_genes, model=model, n_retry=3)
        gsai_name = [i['name'] for i in gsai_res]
        gsai_name_emb = llm2geneset.get_embeddings(client, gsai_name)

        # Evaluate GSAI results.
        for idx in range(len(test_descr)):
            scores = scorer.score(test_descr_cleaned[idx], gsai_name[idx])
            gsai_rouge1= scores['rouge1'].recall
            gsai_rouge2= scores['rouge2'].recall
            gsai_csim = np.dot(gt_emb[idx],gsai_name_emb[idx])
        
            x={"model":model,
               "library":lib_name,
               "gt_name":test_descr[idx],
               "gt_name_clean":test_descr_cleaned[idx],
               "name":gsai_name[idx],
               "ROUGE1":gsai_rouge1,
               "ROUGE2":gsai_rouge2,   
               "csim":gsai_csim,
               "method":"GSAI"}    
            
            output.append(x)

        # use LLM2geneset to generate geneset name and embed
        llm2geneset_res = await llm2geneset.gs_proposal(aclient, test_genes, model=model, n_retry=3)
        llm2geneset_name = []
        def res2name(res):
            df = res["ora_results"]
            _, pvals_corrected, _, _ = multipletests(df["p_val"], method='bonferroni')
            df["p_adj"] = pvals_corrected
            df = df[df["p_adj"] < 0.01]
            comb_name = ", ".join(df["bio_process"].to_list())
            if comb_name == "":
                comb_name = "None found."
            return comb_name
            
        llm2geneset_name = list(map(res2name, llm2geneset_res))
        llm2geneset_name_emb = llm2geneset.get_embeddings(client, llm2geneset_name)
        
        for idx in range(len(test_descr)):
            # evaluate llm2geneset proposed gene set name
            scores = scorer.score(test_descr_cleaned[idx], llm2geneset_name[idx])
            llm2geneset_rouge1= scores['rouge1'].recall
            llm2geneset_rouge2= scores['rouge2'].recall
            llm2geneset_csim = np.dot(gt_emb[idx],llm2geneset_name_emb[idx])

            x={"model":model,
               "library":lib_name,
               "gt_name":test_descr[idx],
               "gt_name_clean":test_descr_cleaned[idx],
               "name":llm2geneset_name[idx],
               "ROUGE1":llm2geneset_rouge1,
               "ROUGE2":llm2geneset_rouge2,   
               "csim":llm2geneset_csim,
               "method":"LLM2geneset"}    
        
            output.append(x)

KEGG_2021_Human


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.24it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:56<00:00,  1.14s/it]


Reactome_2022


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:04<00:00, 10.13it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:49<00:00,  1.01it/s]


WikiPathway_2023_Human


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:07<00:00,  6.75it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:52<00:00,  1.06s/it]


KEGG_2021_Human


 34%|███████████████████████████████████████████▌                                                                                    | 17/50 [00:09<00:09,  3.62it/s]

retrying
conf is none
Write a critical analysis of the biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data.  After completing your analysis, propose a brief and
detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence score to the process name you selected.  This score should
follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence, while 1.00 reflects
the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities within
the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that
participate in the identified biological process. For instance, if you select "Ribosome biogenesis" as the process name but
only a few genes in the system contribute

 54%|█████████████████████████████████████████████████████████████████████                                                           | 27/50 [00:10<00:05,  4.25it/s]

retrying
conf is none
Write a critical analysis of the biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data.  After completing your analysis, propose a brief and
detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence score to the process name you selected.  This score should
follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence, while 1.00 reflects
the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities within
the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that
participate in the identified biological process. For instance, if you select "Ribosome biogenesis" as the process name but
only a few genes in the system contribute

 68%|███████████████████████████████████████████████████████████████████████████████████████                                         | 34/50 [00:11<00:02,  7.38it/s]

retrying
name is none
Write a critical analysis of the biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data.  After completing your analysis, propose a brief and
detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence score to the process name you selected.  This score should
follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence, while 1.00 reflects
the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities within
the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that
participate in the identified biological process. For instance, if you select "Ribosome biogenesis" as the process name but
only a few genes in the system contribute

 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 48/50 [00:18<00:01,  1.91it/s]

retrying
conf is none
Write a critical analysis of the biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data.  After completing your analysis, propose a brief and
detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence score to the process name you selected.  This score should
follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence, while 1.00 reflects
the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities within
the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that
participate in the identified biological process. For instance, if you select "Ribosome biogenesis" as the process name but
only a few genes in the system contribute

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:24<00:00,  2.01it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:32<00:00,  1.55it/s]


Reactome_2022


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:12<00:00,  3.87it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:26<00:00,  1.85it/s]


WikiPathway_2023_Human


 62%|███████████████████████████████████████████████████████████████████████████████▎                                                | 31/50 [00:08<00:01, 13.48it/s]

retrying
conf is none
Write a critical analysis of the biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data.  After completing your analysis, propose a brief and
detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence score to the process name you selected.  This score should
follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence, while 1.00 reflects
the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities within
the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that
participate in the identified biological process. For instance, if you select "Ribosome biogenesis" as the process name but
only a few genes in the system contribute

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:18<00:00,  2.71it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:27<00:00,  1.81it/s]


In [6]:
# Collect the accumulated per-test-case records into a single table.
df = pd.DataFrame(data=output)

In [7]:
# Persist the results as a tab-separated file, without the row index.
df.to_csv(
    path_or_buf='mix_genesets_outputs.tsv',
    sep="\t",
    index=False,
)