Notebook to evaluate overlap between LLM generated gene sets and curated gene sets.

Outputs:
outputs/gen_time.tsv (time required to generate gene sets)
outputs/tok_use.tsv (table with tokens used for model/prompt combinations)
outputs/genes_overlap.tsv (overrepresentation results for LLM generated gene sets)

In [1]:
import pandas as pd
import json
from scipy.stats import hypergeom
from statsmodels.stats.multitest import multipletests
import llm2geneset

In [2]:
# Curated pathway libraries; each one is read from libs_human/<model>/<lib_name>.json.
lib_names = ["KEGG_2021_Human",
             "Reactome_2022",
             "WikiPathway_2023_Human"]
# Models whose stored generations are evaluated.
models = ["gpt-4o-mini-2024-07-18", "gpt-3.5-turbo-0125", "gpt-4o-2024-08-06"]
# Keys (inside each library JSON) of the generated gene-set variants to score.
gene_sets = ["llm_genes_norole", "llm_genes_role", "llm_genes_reason",
             "llm_genes_conf_high", "llm_ensembled"]

In [3]:
# Create confidence-filtered gene sets (high, medium, low, and high+medium)
# and write them back into each library's JSON file.

# Result key -> confidence labels handed to sel_conf, listed in call order.
conf_selections = {
    "llm_genes_conf_high": ["high"],
    "llm_genes_conf_medium": ["medium"],
    "llm_genes_conf_low": ["low"],
    "llm_genes_conf_high_medium": ["high", "medium"],
}
# Order in which the results are inserted into the JSON document.
conf_store_order = [
    "llm_genes_conf_high",
    "llm_genes_conf_high_medium",
    "llm_genes_conf_medium",
    "llm_genes_conf_low",
]

for model in models:
    for lib_name in lib_names:
        lib_path = "libs_human/" + model + "/" + lib_name + ".json"
        with open(lib_path) as f:
            gen_res = json.load(f)

        selected = {
            conf_key: llm2geneset.sel_conf(
                gen_res["descr"], gen_res["llm_genes_conf"], conf_labels
            )
            for conf_key, conf_labels in conf_selections.items()
        }
        for conf_key in conf_store_order:
            gen_res[conf_key] = selected[conf_key]

        # The library file is overwritten in place with the added keys.
        with open(lib_path, 'w') as json_file:
            json.dump(gen_res, json_file, indent=4)

In [4]:
# Use ensemble generations to define gene sets: the no-role generation is
# combined with the stored llm_ensemble_0..3 generations.
for model in models:
    for lib_name in lib_names:
        lib_path = "libs_human/" + model + "/" + lib_name + ".json"
        with open(lib_path) as f:
            gen_res = json.load(f)

        # Collect the separate generations, no-role run first.
        generations = [gen_res["llm_genes_norole"]]
        generations.extend(gen_res["llm_ensemble_" + str(i)] for i in range(4))

        # Build the ensembled set and store it next to the individual runs.
        gen_res["llm_ensembled"] = llm2geneset.ensemble_genes(
            gen_res["descr_cleaned"], generations, 5
        )

        # The library file is overwritten in place with the added key.
        with open(lib_path, 'w') as json_file:
            json.dump(gen_res, json_file, indent=4)

In [5]:
# Output table with generation times.
# One row per (library, model) pair; the timing values are copied unchanged
# from the stored generation results.
gen_time_fields = ["gen_time_role", "gen_time_norole",
                   "gen_time_reasoning", "gen_time_conf"]

gen_time_table = []
for lib_name in lib_names:
    for model in models:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
        row = {"lib_name": lib_name, "model": model}
        row.update({field: gen_res[field] for field in gen_time_fields})
        gen_time_table.append(row)

gen_time_df = pd.DataFrame(gen_time_table)
# to_csv documents `index` as a boolean; False omits the row index column.
gen_time_df.to_csv("outputs/gen_time.tsv", sep="\t", index=False)
gen_time_df

Unnamed: 0,lib_name,model,gen_time_role,gen_time_norole,gen_time_reasoning,gen_time_conf
0,KEGG_2021_Human,gpt-4o-mini-2024-07-18,26.515748,15.174009,34.608303,24.790152
1,KEGG_2021_Human,gpt-3.5-turbo-0125,40.71158,21.634682,16.923785,14.943095
2,KEGG_2021_Human,gpt-4o-2024-08-06,32.68191,30.819701,39.722813,34.77851
3,Reactome_2022,gpt-4o-mini-2024-07-18,411.468713,50.428429,51.160516,801.40462
4,Reactome_2022,gpt-3.5-turbo-0125,35.365794,34.001247,43.410815,31.869861
5,Reactome_2022,gpt-4o-2024-08-06,93.410272,115.903737,52.931276,409.171145
6,WikiPathway_2023_Human,gpt-4o-mini-2024-07-18,401.165772,31.707753,58.294887,401.682324
7,WikiPathway_2023_Human,gpt-3.5-turbo-0125,98.8785,28.196767,19.184499,31.967324
8,WikiPathway_2023_Human,gpt-4o-2024-08-06,44.804884,33.037866,407.042043,41.880466


In [6]:
# Load the first column of the headerless, tab-separated symbol file and keep
# the values as a set.
hgnc_table = pd.read_csv("outputs/hgnc_symbols.txt", sep="\t", header=None)
hgnc_symbols = set(hgnc_table[0].tolist())

In [7]:
# Show the gene-set keys that the overlap evaluation iterates over.
gene_sets

['llm_genes_norole',
 'llm_genes_role',
 'llm_genes_reason',
 'llm_genes_conf_high',
 'llm_ensembled']

In [8]:
# Size of the background gene universe, shared by the hypergeometric test and bgratio.
N_BACKGROUND_GENES = 19846

# Column order of the per-description overlap table.
OVERLAP_COLUMNS = [
    'database', 'model', 'gene_set', 'descr', 'descr_cleaned',
    'ncurated', 'nllm', 'ninter', 'generatio', 'bgratio',
    'richFactor', 'foldEnrich', 'non_hgnc', 'ntries', 'ndup', 'p_val',
]


def _overlap_stats(curated_genes, parsed_llm_genes, known_symbols, n_background):
    """Score one LLM-generated gene list against its curated gene set.

    Both inputs are de-duplicated first, so the hypergeometric test,
    ``ncurated`` and the recall denominator all use the same curated-set size.

    Parameters
    ----------
    curated_genes : iterable of str
        Genes of the curated set.
    parsed_llm_genes : list of str
        Genes parsed from the LLM output; may contain duplicates.
    known_symbols : set of str
        Reference symbols; unique LLM genes outside this set are counted in
        ``non_hgnc``.
    n_background : int
        Population size for the hypergeometric test and ``bgratio``.

    Returns
    -------
    dict
        ``ncurated``, ``nllm``, ``ninter`` (set sizes), ``generatio`` (recall;
        None when the curated set is empty), ``bgratio``, ``richFactor``
        (precision) and ``foldEnrich`` (both None when no LLM genes were
        produced), ``non_hgnc``, ``ndup`` (duplicates dropped from the LLM
        list) and ``p_val``.
    """
    curated = set(curated_genes)
    llm = set(parsed_llm_genes)  # make sure unique genes are selected
    n_curated = len(curated)
    n_llm = len(llm)
    n_inter = len(curated & llm)

    # sf(k - 1) gives P(X >= k): the chance of at least n_inter shared genes.
    p_val = hypergeom.sf(n_inter - 1, n_background, n_curated, n_llm)

    # generatio == recall; undefined for an empty curated set.
    generatio = n_inter / n_curated if n_curated else None
    bgratio = n_llm / n_background

    richFactor = None
    foldEnrich = None
    if n_llm > 0:
        # richFactor == precision
        richFactor = n_inter / n_llm
        if generatio is not None:
            foldEnrich = generatio / bgratio

    return {
        'ncurated': n_curated,
        'nllm': n_llm,
        'ninter': n_inter,
        'generatio': generatio,
        'bgratio': bgratio,
        'richFactor': richFactor,
        'foldEnrich': foldEnrich,
        'non_hgnc': len(llm - known_symbols),
        'ndup': len(parsed_llm_genes) - n_llm,
        'p_val': p_val,
    }


database_res = []
tok_use = []

for lib_name in lib_names:
    print(lib_name)
    for model in models:
        print(model)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        for gset in gene_sets:
            print(gset)
            overlap_rows = []
            in_toks = 0
            out_toks = 0
            for idx in range(len(gen_res["descr"])):
                generation = gen_res[gset][idx]
                in_toks += generation['in_toks']
                out_toks += generation['out_toks']

                row = {
                    'database': lib_name,
                    'model': model,
                    'gene_set': gset,
                    'descr': gen_res["descr"][idx],
                    'descr_cleaned': gen_res["descr_cleaned"][idx],
                    'ntries': generation['ntries'],
                }
                row.update(_overlap_stats(gen_res["curated_genesets"][idx],
                                          generation['parsed_genes'],
                                          hgnc_symbols,
                                          N_BACKGROUND_GENES))
                overlap_rows.append(row)

            # Token totals for this library/model/gene-set combination.
            tok_use.append({"lib_name": lib_name, "model": model,
                            "gene_set": gset,
                            "in_toks": in_toks, "out_toks": out_toks})

            overlap_df = pd.DataFrame(overlap_rows, columns=OVERLAP_COLUMNS)

            # Bonferroni correction is applied within each
            # library/model/gene-set table, not across all of them.
            _, pvals_corrected, _, _ = multipletests(overlap_df['p_val'], method='bonferroni')
            overlap_df['p_val_adj'] = pvals_corrected

            # Fraction of descriptions with adjusted p-value below 0.01.
            n_significant = overlap_df[overlap_df['p_val_adj'] < 0.01].shape[0]
            print(n_significant / overlap_df.shape[0])
            database_res.append(overlap_df)
        print("")

KEGG_2021_Human
gpt-4o-mini-2024-07-18
llm_genes_norole
0.8625
llm_genes_role
0.840625
llm_genes_reason
0.803125
llm_genes_conf_high
0.709375
llm_ensembled
0.653125

gpt-3.5-turbo-0125
llm_genes_norole
0.89375
llm_genes_role
0.890625
llm_genes_reason
0.74375
llm_genes_conf_high
0.6
llm_ensembled
0.59375

gpt-4o-2024-08-06
llm_genes_norole
0.91875
llm_genes_role
0.94375
llm_genes_reason
0.903125
llm_genes_conf_high
0.815625
llm_ensembled
0.825

Reactome_2022
gpt-4o-mini-2024-07-18
llm_genes_norole
0.6600660066006601
llm_genes_role
0.6716171617161716
llm_genes_reason
0.5654565456545655
llm_genes_conf_high
0.5099009900990099
llm_ensembled
0.4680968096809681

gpt-3.5-turbo-0125
llm_genes_norole
0.6556655665566556
llm_genes_role
0.6606160616061606
llm_genes_reason
0.5121012101210121
llm_genes_conf_high
0.38943894389438943
llm_ensembled
0.38173817381738173

gpt-4o-2024-08-06
llm_genes_norole
0.7915291529152916
llm_genes_role
0.8074807480748075
llm_genes_reason
0.7475247524752475
llm_genes_co

In [9]:
# Save token usage per library/model/gene-set combination.
df_tok = pd.DataFrame(tok_use)
# to_csv documents `index` as a boolean; False omits the row index column.
df_tok.to_csv("outputs/tok_use.tsv", sep="\t", index=False)

# Cost estimate for the gpt-4o-2024-08-06 runs from per-million-token rates.
# NOTE(review): rates are presumably USD list prices - confirm against current pricing.
GPT4O_IN_RATE_PER_M = 2.5
GPT4O_OUT_RATE_PER_M = 10
gpt4o_toks = df_tok[df_tok["model"] == "gpt-4o-2024-08-06"]
gpt4o_toks["in_toks"].sum()/1e6*GPT4O_IN_RATE_PER_M + gpt4o_toks["out_toks"].sum()/1e6*GPT4O_OUT_RATE_PER_M

82.40393499999999

In [10]:
# Stack the per-library/model/gene-set result frames into one table;
# ignore_index=True renumbers the rows from 0.
df = pd.concat(database_res, ignore_index=True)

In [11]:
# Display the combined overlap table.
df

Unnamed: 0,database,model,gene_set,descr,descr_cleaned,ncurated,nllm,ninter,generatio,bgratio,richFactor,foldEnrich,non_hgnc,ntries,ndup,p_val,p_val_adj
0,KEGG_2021_Human,gpt-4o-mini-2024-07-18,llm_genes_norole,ABC transporters,ABC transporters,45,34,28,0.622222,0.001713,0.823529,363.194771,4,1,0,2.121814e-73,6.789806e-71
1,KEGG_2021_Human,gpt-4o-mini-2024-07-18,llm_genes_norole,AGE-RAGE signaling pathway in diabetic complic...,AGE-RAGE signaling pathway in diabetic complic...,100,17,9,0.090000,0.000857,0.529412,105.067059,1,1,0,3.405400e-17,1.089728e-14
2,KEGG_2021_Human,gpt-4o-mini-2024-07-18,llm_genes_norole,AMPK signaling pathway,AMPK signaling pathway,120,25,18,0.150000,0.001260,0.720000,119.076000,2,1,0,1.427988e-35,4.569563e-33
3,KEGG_2021_Human,gpt-4o-mini-2024-07-18,llm_genes_norole,Acute myeloid leukemia,Acute myeloid leukemia,67,20,7,0.104478,0.001008,0.350000,103.673134,1,1,0,2.710042e-13,8.672134e-11
4,KEGG_2021_Human,gpt-4o-mini-2024-07-18,llm_genes_norole,Adherens junction,Adherens junction,71,21,6,0.084507,0.001058,0.285714,79.863179,2,1,0,8.786538e-11,2.811692e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44080,WikiPathway_2023_Human,gpt-4o-2024-08-06,llm_ensembled,Serotonin Receptor 4 6 7 And NR3C Signaling WP734,Serotonin Receptor 4 6 7 And NR3C Signaling,19,5,5,0.263158,0.000252,1.000000,1044.526316,0,5,0,4.534613e-16,3.632225e-13
44081,WikiPathway_2023_Human,gpt-4o-2024-08-06,llm_ensembled,Toll Like Receptor Signaling Pathway WP75,Toll Like Receptor Signaling Pathway,102,15,14,0.137255,0.000756,0.933333,181.597386,0,5,0,5.290307e-32,4.237536e-29
44082,WikiPathway_2023_Human,gpt-4o-2024-08-06,llm_ensembled,TCA Cycle Aka Krebs Or Citric Acid Cycle WP78,TCA Cycle Aka Krebs Or Citric Acid Cycle,18,17,15,0.833333,0.000857,0.882353,972.843137,0,5,0,4.998146e-48,4.003515e-45
44083,WikiPathway_2023_Human,gpt-4o-2024-08-06,llm_ensembled,Nucleotide GPCRs WP80,Nucleotide GPCRs,11,4,4,0.363636,0.000202,1.000000,1804.181818,0,5,0,5.106985e-14,4.090695e-11


In [12]:
# Write the combined overlap table as TSV; index=False (the documented boolean
# form) omits the row index column.
df.to_csv("outputs/genes_overlap.tsv", sep="\t", index=False)