In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import pandas as pd
import numpy as np
from rouge_score import rouge_scorer
from statsmodels.stats.multitest import multipletests

# OpenAI clients built with the library defaults (no arguments are passed).
# `aclient` is the asynchronous client handed to the awaited llm2geneset
# benchmark calls; `client` is the synchronous one handed to
# llm2geneset.get_embeddings.
aclient = openai.AsyncClient()
client = openai.Client()

In [2]:
# Shared ROUGE scorer configured for the rouge1 and rouge2 metrics with
# stemming enabled; the benchmark cells read the `.recall` field of each score.
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2'], use_stemmer=True)

In [3]:
def flatten_list_of_lists(lst):
    """Concatenate the items of every sublist of ``lst`` into one flat list.

    Args:
        lst: iterable of iterables.

    Returns:
        A new list holding all inner items, in their original order.
    """
    flat = []
    for sublist in lst:
        flat.extend(sublist)
    return flat

def unflatten_list_of_lists(flat_list, original_structure):
    """Regroup ``flat_list`` into chunks matching ``original_structure``.

    Inverse of flatten_list_of_lists: consecutive slices of ``flat_list`` are
    taken, one per sublist of ``original_structure``, each as long as that
    sublist.

    Args:
        flat_list: sliceable sequence of items to regroup.
        original_structure: list of sublists whose lengths define the chunks.

    Returns:
        A list with one entry per sublist of ``original_structure``. An empty
        sublist always maps to a fresh ``[]``; non-empty ones map to the
        corresponding slice of ``flat_list``.
    """
    nested = []
    start = 0
    for template in original_structure:
        stop = start + len(template)
        if stop > start:
            nested.append(flat_list[start:stop])
        else:
            # Zero-length chunk: emit a plain empty list rather than a slice.
            nested.append([])
        start = stop
    return nested

In [4]:
import re

def clean_elements(array):
    """Strip database identifiers from a sequence of gene set descriptions.

    Every ``(GO:<digits>)``, ``R-HSA-<digits>`` and ``WP<digits>`` occurrence
    is removed from each string, together with the whitespace directly around
    it.

    Args:
        array: iterable of description strings.

    Returns:
        A new list of the cleaned strings, in the same order.
    """
    id_pattern = re.compile(r'\s*\(GO:\d+\)\s*|\s*R-HSA-\d+\s*|\s*WP\d+\s*')
    return [id_pattern.sub('', element) for element in array]

# Load the GO Biological Process 2023 gene set library from its GMT file.
gobp = llm2geneset.read_gmt("libs_human/gmt/GO_Biological_Process_2023.txt")
# Replace the descriptions with versions stripped of the "(GO:xxx)"-style
# identifiers (see clean_elements).
gobp["descr"] = clean_elements(gobp["descr"])

# Report how many gene set descriptions the library contains.
print(len(gobp["descr"]))
# Commented-out example: fetch genes for a free-text description and run a
# single ORA against the library loaded above.
#genes = [await llm2geneset.get_genes(aclient, "Antigen presentation")]
#df = llm2geneset.simple_ora(genes[0]['parsed_genes'], gobp["descr"], gobp["genes"])
#df

5407


In [5]:
# ORA baseline: for every curated gene set, run over-representation analysis
# against the GO Biological Process library (`gobp`) and score the names of
# the significant GO sets against the curated description with ROUGE recall
# and embedding similarity. One TSV is written per library.
lib_names = ["KEGG_2021_Human",
             "Reactome_2022",
             "WikiPathway_2023_Human"]

# The curated gene sets are read from this model's generation results file.
source_model = "gpt-4o-mini-2024-07-18"
# gs_ora_bench is given neither a client nor a model, so every ORA row is
# repeated once per label below (presumably so the ORA rows line up with the
# per-model rows of the LLM-based methods downstream).
model_labels = ["gpt-4o-mini-2024-07-18", "gpt-3.5-turbo-0125", "gpt-4o-2024-08-06"]
method = "ORA"


def res2name(res):
    """Return the descriptions of the significant gene sets of one ORA result.

    Args:
        res: one benchmark result; its "ora_results" entry is a DataFrame
            with "p_val" and "set_descr" columns.

    Returns:
        The "set_descr" values whose Bonferroni-adjusted p-value is below
        0.01, or ["None found."] when no set passes (or the table is empty).
        The input DataFrame is left unmodified.
    """
    ora = res["ora_results"]
    # multipletests divides by the number of tests, so an empty table has to
    # be handled before calling it.
    if ora.empty:
        return ["None found."]
    _, p_adj, _, _ = multipletests(ora["p_val"], method='bonferroni')
    # p_adj is returned in the original row order, so it can be used directly
    # as a positional boolean mask.
    hits = ora.loc[p_adj < 0.01, "set_descr"].to_list()
    return hits if hits else ["None found."]


# Create the output directory up front so a missing folder fails fast instead
# of after several minutes of computation.
out_dir = Path("outputs")
out_dir.mkdir(parents=True, exist_ok=True)

for lib_name in lib_names:
    print(lib_name)
    output = []
    with open("libs_human/" + source_model + "/" + lib_name + ".json") as f:
        gen_res = json.load(f)

    # Get embeddings for ground truth.
    test_sets = gen_res["curated_genesets"]
    test_descr = gen_res["descr"]
    test_descr_cleaned = gen_res["descr_cleaned"]
    gt_embs = llm2geneset.get_embeddings(client, test_descr_cleaned)

    llm2geneset_res = llm2geneset.gs_ora_bench(test_sets, gobp)
    names = [res2name(res) for res in llm2geneset_res]
    in_toks = [res["tot_in_toks"] for res in llm2geneset_res]
    out_toks = [res["tot_out_toks"] for res in llm2geneset_res]

    # Since there can be multiple names per gene set, embed all of them in
    # one call and then regroup the embeddings per gene set.
    flat_names = flatten_list_of_lists(names)
    flat_name_embs = llm2geneset.get_embeddings(client, flat_names)
    name_emb = unflatten_list_of_lists(flat_name_embs, names)

    # Get best score for significant gene sets and report.
    for i, ref in enumerate(test_descr_cleaned):
        scores = [scorer.score(ref, n) for n in names[i]]
        rouge1 = [s['rouge1'].recall for s in scores]
        rouge2 = [s['rouge2'].recall for s in scores]

        # Dot product used as cosine similarity; assumes get_embeddings
        # returns unit-length vectors -- TODO confirm.
        csim = [np.dot(gt_embs[i], emb) for emb in name_emb[i]]

        row = {"library": lib_name,
               "gt_name": test_descr[i],
               "gt_name_clean": ref,
               "name": ", ".join(names[i]),
               "ROUGE1": max(rouge1),
               "ROUGE2": max(rouge2),
               "csim": max(csim),
               "method": method,
               "in_toks": in_toks[i],
               "out_toks": out_toks[i]}
        for model_label in model_labels:
            # "model" goes first so the column order of the TSV is unchanged.
            output.append({"model": model_label, **row})

    # generate an output per library
    df = pd.DataFrame(output)
    outfile = out_dir / ("ora_outputs_" + lib_name + ".tsv")
    df.to_csv(outfile, sep="\t", index=False)
      
        

KEGG_2021_Human


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [01:44<00:00,  3.08it/s]


Reactome_2022


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [07:41<00:00,  3.94it/s]


WikiPathway_2023_Human


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [03:41<00:00,  3.61it/s]


In [None]:

lib_names = ["KEGG_2021_Human", 
             "Reactome_2022", 
             "WikiPathway_2023_Human"]
models = ["gpt-4o-mini-2024-07-18", "gpt-3.5-turbo-0125", "gpt-4o-2024-08-06"]

for lib_name in lib_names:
    print(lib_name)
    output = []
    for model in models:
        print(model)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
        
        test_sets = gen_res["curated_genesets"]
        test_descr = gen_res["descr"]
        test_descr_cleaned = gen_res["descr_cleaned"]
        gt_embs = llm2geneset.get_embeddings(client, test_descr_cleaned)

        for method in ["llm2geneset", "GSAI"]:
            if method == "GSAI":
                # use GSAI to generate geneset name
                gsai_res = await llm2geneset.gsai_bench(aclient, test_sets, model=model, n_retry=5)
                # use lists of size 1 for names.
                names = [[i['name']]for i in gsai_res]
                in_toks = [i["in_toks"] for i in gsai_res]
                out_toks = [i["out_toks"] for i in gsai_res]
            elif method == "llm2geneset":
                llm2geneset_res = await llm2geneset.gs_proposal_bench(aclient, test_sets, model=model)
                # Extract multiple names meeting significance.
                def res2name(res):
                    df = res["ora_results"]
                    print(df)
                    _, pvals_corrected, _, _ = multipletests(df["p_val"], method='bonferroni')
                    df["p_adj"] = pvals_corrected
                    df = df[df["p_adj"] < 0.01]
                    if df.shape[0] == 0:
                        return ["None found."]
                    else: 
                        return df["set_descr"].to_list()            
                names = list(map(res2name, llm2geneset_res))
                in_toks = [i["tot_in_toks"] for i in llm2geneset_res]
                out_toks = [i["tot_out_toks"] for i in llm2geneset_res]
            
                
            # Since there can be multiple names per gene set
            # compute cosine similarity for each one. 
            flat_names = flatten_list_of_lists(names)
            flat_name_embs = llm2geneset.get_embeddings(client, flat_names)
            name_emb = unflatten_list_of_lists(flat_name_embs,names)

            # Get best score for significant gene sets and report. 
            for i, ref in enumerate(test_descr_cleaned):
                scores = [scorer.score(ref, n) for n in names[i]] 
                rouge1 = [s['rouge1'].recall for s in scores]
                rouge2 = [s['rouge2'].recall for s in scores]

                csim = [np.dot(gt_embs[i], emb) for emb in name_emb[i]]
                    
                x={"model":model,
                   "library":lib_name,
                   "gt_name":test_descr[i],
                   "gt_name_clean":ref,
                   "name":", ".join(names[i]),
                   "ROUGE1":max(rouge1),
                   "ROUGE2":max(rouge2),  
                   "csim":max(csim),
                   "method": method,
                   "in_toks":in_toks[i],
                   "out_toks": out_toks[i]                   
                  }
                output.append(x)
    # generate an output per library            
    df = pd.DataFrame(output) 
    outfile = "outputs/gsai_vs_llm2geneset_outputs_" + lib_name + ".tsv"
    df.to_csv(outfile, sep="\t", index=False)
      
        

In [None]:

lib_names = ["KEGG_2021_Human", 
             "Reactome_2022", 
             "WikiPathway_2023_Human"]
models = ["gpt-4o-mini-2024-07-18", "gpt-3.5-turbo-0125", "gpt-4o-2024-08-06"]

for lib_name in lib_names:
    print(lib_name)
    output = []
    for model in models:
        print(model)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
        
        test_sets = gen_res["curated_genesets"]
        test_descr = gen_res["descr"]
        test_descr_cleaned = gen_res["descr_cleaned"]
        gt_embs = llm2geneset.get_embeddings(client, test_descr_cleaned)

        #for method in ["llm2geneset", "GSAI"]:
        for method in ["ORA"]:
            if method == "GSAI":
                # use GSAI to generate geneset name
                gsai_res = await llm2geneset.gsai_bench(aclient, test_sets, model=model, n_retry=5)
                # use lists of size 1 for names.
                names = [[i['name']]for i in gsai_res]
                in_toks = [i["in_toks"] for i in gsai_res]
                out_toks = [i["out_toks"] for i in gsai_res]
            elif method == "llm2geneset" or method == "ORA":
                if method == "llm2geneset":
                    llm2geneset_res = await llm2geneset.gs_proposal_bench(aclient, test_sets, model=model)
                elif method == "ORA":
                    print(method)
                    print(len(test_sets))
                    llm2geneset_res = llm2geneset.gs_ora_bench(test_sets, gobp)
                    print("done")
                else:
                    llm2geneset_res = None
                # Extract multiple names meeting significance.
                def res2name(res):
                    df = res["ora_results"]
                    print(df)
                    _, pvals_corrected, _, _ = multipletests(df["p_val"], method='bonferroni')
                    df["p_adj"] = pvals_corrected
                    df = df[df["p_adj"] < 0.01]
                    if df.shape[0] == 0:
                        return ["None found."]
                    else: 
                        return df["set_descr"].to_list()            
                names = list(map(res2name, llm2geneset_res))
                in_toks = [i["tot_in_toks"] for i in llm2geneset_res]
                out_toks = [i["tot_out_toks"] for i in llm2geneset_res]
            
                
            # Since there can be multiple names per gene set
            # compute cosine similarity for each one. 
            flat_names = flatten_list_of_lists(names)
            flat_name_embs = llm2geneset.get_embeddings(client, flat_names)
            name_emb = unflatten_list_of_lists(flat_name_embs,names)

            # Get best score for significant gene sets and report. 
            for i, ref in enumerate(test_descr_cleaned):
                scores = [scorer.score(ref, n) for n in names[i]] 
                rouge1 = [s['rouge1'].recall for s in scores]
                rouge2 = [s['rouge2'].recall for s in scores]

                csim = [np.dot(gt_embs[i], emb) for emb in name_emb[i]]
                    
                x={"model":model,
                   "library":lib_name,
                   "gt_name":test_descr[i],
                   "gt_name_clean":ref,
                   "name":", ".join(names[i]),
                   "ROUGE1":max(rouge1),
                   "ROUGE2":max(rouge2),  
                   "csim":max(csim),
                   "method": method,
                   "in_toks":in_toks[i],
                   "out_toks": out_toks[i]                   
                  }
                output.append(x)
    # generate an output per library            
    df = pd.DataFrame(output) 
    outfile = "outputs/gsai_vs_llm2geneset_outputs_" + lib_name + ".tsv"
    df.to_csv(outfile, sep="\t", index=False)
      
        

KEGG_2021_Human
gpt-4o-mini-2024-07-18
ORA
320


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [01:45<00:00,  3.03it/s]


done
                        set_descr         oddr  generatio   bgratio  \
1510              Lipid Transport   132.272553   0.377778  0.005442   
2592  Organic Substance Transport    65.289070   0.377778  0.010128   
5400         Xenobiotic Transport   358.849600   0.177778  0.001008   
1495        Leukotriene Transport  6516.949367   0.133333  0.000302   
782            Cholesterol Efflux   233.595041   0.155556  0.001159   

      richFactor  foldEnrich         p_val         p_adj  \
1510    0.157407   69.420165  8.383300e-28  4.532851e-24   
2592    0.084577   37.300387  5.372345e-23  1.452414e-19   
5400    0.400000  176.408889  4.466040e-17  8.049293e-14   
1495    1.000000  441.022222  9.605437e-17  1.298415e-13   
782     0.304348  134.224155  4.506372e-14  4.873191e-11   

                                           intersection  \
1510  ABCC2,ABCA3,ABCA6,ABCA5,ABCA13,ABCA9,ABCA8,ABC...   
2592  ABCA3,ABCA6,ABCA5,ABCA9,ABCA13,ABCA8,ABCA12,AB...   
5400   ABCC2,ABCA8,ABCC3,ABCC5

 25%|██████████████████████████████████                                                                                                      | 80/320 [00:26<01:20,  2.99it/s]

 28%|█████████████████████████████████████▍                                                                                                  | 88/320 [00:28<01:10,  3.31it/s]