In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import pandas as pd
import random
from itertools import combinations
from rouge_score import rouge_scorer
import numpy as np
from statsmodels.stats.multitest import multipletests

# Two OpenAI clients are kept side by side: the async one is awaited for the
# batched GSAI naming calls, the sync one is passed to the embedding helper.
# Neither is given explicit credentials here.
aclient = openai.AsyncClient()
client = openai.Client()

In [2]:
# Shared ROUGE scorer: unigram (rouge1) and bigram (rouge2) overlap, with
# stemming enabled, used to compare generated names against ground truth.
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2'], use_stemmer=True)

In [3]:
def sample_pairs(integer_list, num_pairs):
    """Draw distinct unordered pairs from ``integer_list`` at random.

    Every 2-element combination of ``integer_list`` is enumerated (in the
    order produced by ``itertools.combinations``) and a sample without
    replacement is taken using the module-level ``random`` generator, so the
    result is governed by whatever ``random.seed`` was set by the caller.

    Args:
        integer_list: Iterable of items to pair up (indices in this notebook).
        num_pairs: Maximum number of pairs wanted.

    Returns:
        A list of 2-tuples. Its length is ``num_pairs``, or the total number
        of possible pairs when fewer than ``num_pairs`` exist (so an input
        with fewer than two items yields an empty list).
    """
    all_pairs = list(combinations(integer_list, 2))
    # Never ask for more pairs than exist; random.sample would raise.
    n_draw = num_pairs if num_pairs < len(all_pairs) else len(all_pairs)
    return random.sample(all_pairs, n_draw)

In [4]:
# Tab-separated benchmark result tables, one per gene set library.
file_names = [
    "gsai_vs_llm2geneset_outputs_KEGG_2021_Human.tsv",
    "gsai_vs_llm2geneset_outputs_Reactome_2022.tsv",
    "gsai_vs_llm2geneset_outputs_WikiPathway_2023_Human.tsv",
]

# Read each table from outputs/ and stack them into one frame with a fresh index.
dfs = [pd.read_csv(f"outputs/{name}", sep="\t") for name in file_names]
combined = pd.concat(dfs, ignore_index=True)

# Split the stacked rows by the value of their "method" column.
is_gsai = combined["method"] == "GSAI"
is_llm2geneset = combined["method"] == "llm2geneset"
gsai_out = combined[is_gsai]
llm2geneset_out = combined[is_llm2geneset]

In [5]:
lib_names = ["KEGG_2021_Human", 
             "Reactome_2022", 
             "WikiPathway_2023_Human"]
models = ["gpt-4o-mini-2024-07-18", "gpt-3.5-turbo-0125", "gpt-4o-2024-08-06"]
random.seed(30)
output = []
for model in models:
    print(model)
    for lib_name in lib_names:
        print(lib_name)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        # Limit to gene sets easily identified alone.
        gsai_cur = gsai_out[(gsai_out["model"] == model) & (gsai_out["library"] == lib_name)]
        gsai_cur = gsai_cur[gsai_cur["csim"] >= 0.7]
        llm2geneset_cur = llm2geneset_out[(llm2geneset_out["model"] == model) & (llm2geneset_out["library"] == lib_name)]
        llm2geneset_cur = llm2geneset_cur[llm2geneset_cur["csim"] >= 0.7]
        llm2geneset_gt = set(llm2geneset_cur["gt_name"].to_list())
        gsai_gt = set(gsai_cur["gt_name"].to_list())
        selected = gsai_gt.intersection(llm2geneset_gt)

        # Extract selected gene sets.
        sel_idx = [i for i, descr in enumerate(gen_res["descr"]) if descr in selected]
        descr_sel = [ gen_res["descr"][i] for i in sel_idx ]
        descr_cleaned_sel = [ gen_res["descr_cleaned"][i] for i in sel_idx ]
        curated_sel = [ gen_res["curated_genesets"][i] for i in sel_idx ]

        # Construct pairs.
        pairs = sample_pairs(range(len(descr_sel)), 50)
        test_descr = []
        test_descr_cleaned = []
        test_genes = []
        for p in pairs:
            merged_genes = curated_sel[p[0]] + curated_sel[p[1]]
            test_genes.append(merged_genes)
            random.shuffle(merged_genes)
            test_descr.append(descr_sel[p[0]] + ", " + descr_sel[p[1]])
            test_descr_cleaned.append(descr_cleaned_sel[p[0]] + ", " + descr_cleaned_sel[p[1]])

        # Embed ground truth descriptions.
        gt_emb = llm2geneset.get_embeddings(client, test_descr_cleaned)
        
        # use GSAI to generate geneset name and embed
        gsai_res = await llm2geneset.gsai_bench(aclient, test_genes, model=model, n_retry=3, prompt_file="gsai_prompt2.txt")
        gsai_name = [i['name'] for i in gsai_res]
        gsai_name_emb = llm2geneset.get_embeddings(client, gsai_name)

        # Evaluate GSAI results.
        for idx in range(len(test_descr)):
            scores = scorer.score(test_descr_cleaned[idx], gsai_name[idx])
            gsai_rouge1= scores['rouge1'].recall
            gsai_rouge2= scores['rouge2'].recall
            gsai_csim = np.dot(gt_emb[idx],gsai_name_emb[idx])
        
            x={"model":model,
               "library":lib_name,
               "gt_name":test_descr[idx],
               "gt_name_clean":test_descr_cleaned[idx],
               "name":gsai_name[idx],
               "ROUGE1":gsai_rouge1,
               "ROUGE2":gsai_rouge2,   
               "csim":gsai_csim,
               "method":"GSAI_2hint"}    
            
            output.append(x)

 

gpt-4o-mini-2024-07-18
KEGG_2021_Human


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:12<00:00,  3.91it/s]


Reactome_2022


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:13<00:00,  3.63it/s]


WikiPathway_2023_Human


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:11<00:00,  4.33it/s]


gpt-3.5-turbo-0125
KEGG_2021_Human


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:07<00:00,  6.55it/s]


Reactome_2022


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.63it/s]


WikiPathway_2023_Human


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  7.58it/s]


gpt-4o-2024-08-06
KEGG_2021_Human


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:09<00:00,  5.27it/s]


Reactome_2022


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:10<00:00,  4.72it/s]


WikiPathway_2023_Human


 30%|█████████████████████████████████████████                                                                                                | 15/50 [00:06<00:05,  6.78it/s]

retrying
conf is none
Write a critical analysis of the biological processes performed by this system of interacting proteins.
There are 2 distinct biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data.  After completing your analysis, propose a brief and
detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence score to the process name you selected.  This score should
follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence, while 1.00 reflects
the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities within
the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that
participate in the identified biological process. For instance, if you se

 54%|█████████████████████████████████████████████████████████████████████████▉                                                               | 27/50 [00:07<00:01, 14.76it/s]

retrying
conf is none
Write a critical analysis of the biological processes performed by this system of interacting proteins.
There are 2 distinct biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data.  After completing your analysis, propose a brief and
detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence score to the process name you selected.  This score should
follow the name in parentheses and range from 0.00 to 1.00. A score of 0.00 indicates the lowest confidence, while 1.00 reflects
the highest confidence. This score helps gauge how accurately the chosen name represents the functions and activities within
the system of interacting proteins. When determining your score, consider the proportion of genes in the protein system that
participate in the identified biological process. For instance, if you se

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:14<00:00,  3.39it/s]


In [6]:
# Build the results table in one step from the list of per-pair record dicts.
df = pd.DataFrame(output)

In [7]:
# Preview the first rows of the results table.
df.head()

Unnamed: 0,model,library,gt_name,gt_name_clean,name,ROUGE1,ROUGE2,csim,method
0,gpt-4o-mini-2024-07-18,KEGG_2021_Human,"Glycolysis / Gluconeogenesis, Wnt signaling pa...","Glycolysis / Gluconeogenesis, Wnt signaling pa...",Wnt signaling pathway modulation and cellular ...,0.6,0.5,0.577511,GSAI
1,gpt-4o-mini-2024-07-18,KEGG_2021_Human,"Amino sugar and nucleotide sugar metabolism, U...","Amino sugar and nucleotide sugar metabolism, U...",Ubiquitin-Proteasome System Regulation and Cel...,0.222222,0.0,0.480704,GSAI
2,gpt-4o-mini-2024-07-18,KEGG_2021_Human,"Fatty acid elongation, Pentose phosphate pathway","Fatty acid elongation, Pentose phosphate pathway",Fatty Acid Metabolism and Lipid Biosynthesis,0.333333,0.2,0.644241,GSAI
3,gpt-4o-mini-2024-07-18,KEGG_2021_Human,"Glycerophospholipid metabolism, Phosphatidylin...","Glycerophospholipid metabolism, Phosphatidylin...",Phosphoinositide signaling and lipid metabolism,0.4,0.0,0.758728,GSAI
4,gpt-4o-mini-2024-07-18,KEGG_2021_Human,"B cell receptor signaling pathway, Oxidative p...","B cell receptor signaling pathway, Oxidative p...",Mitochondrial Bioenergetics and Cellular Signa...,0.142857,0.0,0.526477,GSAI


In [8]:
# Persist the results as a tab-separated file; the row index is not written.
df.to_csv('outputs/prompt2_mix_genesets_output.tsv', sep="\t", index=False)