In [43]:
import pandas as pd
import json
from scipy.stats import hypergeom
from statsmodels.stats.multitest import multipletests
import llm2geneset

In [44]:
lib_names = ["KEGG_2021_Human", 
             "Reactome_2022", 
             "WikiPathway_2023_Human", 
             "GO_Biological_Process_2023",
             "GO_Molecular_Function_2023",
             "GO_Cellular_Component_2023"]
models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]
lib_names = ["KEGG_2021_Human", 
             "Reactome_2022", 
             "WikiPathway_2023_Human"]
#lib_names = ["WikiPathway_2023_Human"]
#gene_sets = ["llm_genes_role", "llm_genes_norole", "llm_genes_reason",
#            "llm_genes_conf_high", "llm_genes_conf_high_medium", "llm_ensembled"]

gene_sets = ["llm_genes_role", "llm_genes_reason",
             "llm_genes_conf_high",  "llm_ensembled", "llm_genes_rag"]

In [45]:
# Create gene sets with only high or high/medium confidence.
for model in models:
    for lib_name in lib_names:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        llm_genes_conf_high = llm2geneset.sel_conf(gen_res["descr"], gen_res["llm_genes_conf"], ["high"])
        llm_genes_conf_high_medium = llm2geneset.sel_conf(gen_res["descr"], gen_res["llm_genes_conf"], ["high", "medium"])

        gen_res["llm_genes_conf_high"] = llm_genes_conf_high
        gen_res["llm_genes_conf_high_medium"] = llm_genes_conf_high_medium
        with open('libs_human/' + model + '/' + lib_name + '.json', 'w') as json_file:
            json.dump(gen_res, json_file, indent=4) 

In [46]:
# Use ensemble generations to define gene sets.
for model in models:
    for lib_name in lib_names:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        # Append separate generations together.
        llm_genes = [gen_res["llm_genes_norole"]]
        for i in range(4):
            llm_genes.append(gen_res["llm_ensemble_" + str(i)])
            
        # Generate ensembled set. 
        llm_ensembled = llm2geneset.ensemble_genes(gen_res["descr_cleaned"], llm_genes, 5)
    
        # Add ensembled results.
        gen_res["llm_ensembled"] = llm_ensembled
        
        with open('libs_human/' + model + '/' + lib_name + '.json', 'w') as json_file:
            json.dump(gen_res, json_file, indent=4)

In [47]:
# Output table with generation times.
gen_time_table = []
for lib_name in lib_names:    
    for model in models:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
        x = {"lib_name": lib_name,
             "model": model,
             "gen_time_role": gen_res["gen_time_role"],
             "gen_time_norole": gen_res["gen_time_norole"],
             "gen_time_reasoning": gen_res["gen_time_reasoning"],
             "gen_time_conf": gen_res["gen_time_conf"],
             "gen_time_rag": gen_res["gen_time_rag"]
        }
        gen_time_table.append(x)
df = pd.DataFrame(gen_time_table)
df.to_csv("gen_time.tsv",sep="\t",index=None)
df

Unnamed: 0,lib_name,model,gen_time_role,gen_time_norole,gen_time_reasoning,gen_time_conf,gen_time_rag
0,KEGG_2021_Human,gpt-3.5-turbo-0125,97.471539,59.641582,12.884575,15.758627,338.997886
1,KEGG_2021_Human,gpt-4o-2024-05-13,25.138377,24.146841,22.30365,20.539535,349.175431
2,Reactome_2022,gpt-3.5-turbo-0125,65.464516,40.419058,25.274014,55.920584,1844.864143
3,Reactome_2022,gpt-4o-2024-05-13,35.385588,94.229863,43.002771,38.653894,1881.73728
4,WikiPathway_2023_Human,gpt-3.5-turbo-0125,25.406484,20.999619,13.916806,22.751334,826.655255
5,WikiPathway_2023_Human,gpt-4o-2024-05-13,51.99826,24.904363,24.617129,30.099921,835.889114


In [48]:
hgcn_symbols = pd.read_csv("hgnc_symbols.txt", sep="\t", header=None)[0].tolist()
hgcn_symbols = set(hgcn_symbols)

In [49]:
gene_sets

['llm_genes_role',
 'llm_genes_reason',
 'llm_genes_conf_high',
 'llm_ensembled',
 'llm_genes_rag']

In [50]:
database_res = []
tok_use = []

for lib_name in lib_names:
    print(lib_name)
    for model in models:
        print(model)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
            
        for gset in gene_sets:
            print(gset)
            iou_output = []
            in_toks = 0
            out_toks = 0
            for idx in range(len(gen_res["descr"])):
                curated_genes = gen_res["curated_genesets"][idx]
                parsed_llm_genes = gen_res[gset][idx]['parsed_genes']
                ntries = gen_res[gset][idx]['ntries']
                
                in_toks += gen_res[gset][idx]['in_toks']
                out_toks += gen_res[gset][idx]['out_toks']
    
                non_hgcn = len(set(parsed_llm_genes) - hgcn_symbols)
    
                llm_genes = list(set(parsed_llm_genes)) # make sure unique genes are selected
    
            
                intersection = set(llm_genes).intersection(set(curated_genes))
                p_val = hypergeom.sf(len(intersection)-1,
                                     19846-len(curated_genes), 
                                     len(curated_genes), 
                                     len(llm_genes))
                generatio = None
                if len(llm_genes) > 0:
                    generatio = float(len(intersection))/len(llm_genes)

                x = {
                    'database': lib_name,
                    'model' : model,
                    'gene_set' : gset, 
                    'descr': gen_res["descr"][idx],
                    'descr_cleaned': gen_res["descr_cleaned"][idx],
                    'ncurated': len(curated_genes),
                    'nllm': len(llm_genes),
                    'ninter': len(intersection),
                    'generatio': generatio,
                    'bgratio': float(len(curated_genes))/19846,
                    'non_hgcn': non_hgcn,
                    'ntries' : ntries,
                    'ndup': len(parsed_llm_genes) - len(llm_genes),
                    'p_val': p_val
                }
                iou_output.append(x)

            toks_cur = {"lib_name": lib_name, "model": model,
                        "gene_set": gset,
                        "in_toks": in_toks, "out_toks" : out_toks}
            tok_use.append(toks_cur)
            df = pd.DataFrame(iou_output)
           
            _, pvals_corrected, _, _ = multipletests(df['p_val'], method='bonferroni')
            df['p_val_adj'] = pvals_corrected
            #& (df["generatio"] >= 0.5
            print(df[(df['p_val_adj'] < 0.01) ].shape[0] / df.shape[0])
            database_res.append(df)    
        print("")

KEGG_2021_Human
gpt-3.5-turbo-0125
llm_genes_role
0.896875
llm_genes_reason
0.765625
llm_genes_conf_high
0.63125
llm_ensembled
0.575
llm_genes_rag
0.759375

gpt-4o-2024-05-13
llm_genes_role
0.946875
llm_genes_reason
0.90625
llm_genes_conf_high
0.865625
llm_ensembled
0.809375
llm_genes_rag
0.909375

Reactome_2022
gpt-3.5-turbo-0125
llm_genes_role
0.6705170517051705
llm_genes_reason
0.5143014301430143
llm_genes_conf_high
0.3734873487348735
llm_ensembled
0.39933993399339934
llm_genes_rag
0.4636963696369637

gpt-4o-2024-05-13
llm_genes_role
0.8146314631463146
llm_genes_reason
0.7645764576457645
llm_genes_conf_high
0.7244224422442245
llm_ensembled
0.6721672167216721
llm_genes_rag
0.77007700770077

WikiPathway_2023_Human
gpt-3.5-turbo-0125
llm_genes_role
0.7253433208489388
llm_genes_reason
0.6254681647940075
llm_genes_conf_high
0.42446941323345816
llm_ensembled
0.4968789013732834
llm_genes_rag
0.5717852684144819

gpt-4o-2024-05-13
llm_genes_role
0.8614232209737828
llm_genes_reason
0.81897627

In [52]:
df_tok = pd.DataFrame(tok_use)
df_tok.to_csv("tok_use.tsv", sep="\t", index=None)
df_tok

Unnamed: 0,lib_name,model,gene_set,in_toks,out_toks
0,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_role,51489,90284
1,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_reason,65400,67948
2,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_conf_high,70520,54151
3,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_ensembled,240600,371720
4,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_rag,130563,50647
5,KEGG_2021_Human,gpt-4o-2024-05-13,llm_genes_role,51772,117996
6,KEGG_2021_Human,gpt-4o-2024-05-13,llm_genes_reason,65532,187685
7,KEGG_2021_Human,gpt-4o-2024-05-13,llm_genes_conf_high,70652,152304
8,KEGG_2021_Human,gpt-4o-2024-05-13,llm_ensembled,243009,534011
9,KEGG_2021_Human,gpt-4o-2024-05-13,llm_genes_rag,129636,107369


In [53]:
df_tok_gpt4 = df_tok[df_tok["model"] == "gpt-4o-2024-05-13"]
df_tok_gpt4["in_toks"].sum() / 1e6 * 5 + df_tok_gpt4["in_toks"].sum() / 1e6 * 15

np.float64(108.02796000000001)

In [54]:
df_tok_gpt3 = df_tok[df_tok["model"] == "gpt-3.5-turbo-0125"]
df_tok_gpt3["in_toks"].sum() / 1e6 * 0.5 + df_tok_gpt3["in_toks"].sum() / 1e6 * 1.5

np.float64(10.763234)

In [55]:
df = pd.concat(database_res, ignore_index=True)

In [56]:
df

Unnamed: 0,database,model,gene_set,descr,descr_cleaned,ncurated,nllm,ninter,generatio,bgratio,non_hgcn,ntries,ndup,p_val,p_val_adj
0,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_role,ABC transporters,ABC transporters,45,13,7,0.538462,0.002267,6,1,0,3.258955e-16,1.042866e-13
1,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_role,AGE-RAGE signaling pathway in diabetic complic...,AGE-RAGE signaling pathway in diabetic complic...,100,69,22,0.318841,0.005039,14,1,0,1.297110e-34,4.150751e-32
2,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_role,AMPK signaling pathway,AMPK signaling pathway,120,39,13,0.333333,0.006047,1,1,0,5.695660e-20,1.822611e-17
3,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_role,Acute myeloid leukemia,Acute myeloid leukemia,67,20,4,0.200000,0.003376,3,1,0,5.592375e-07,1.789560e-04
4,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_role,Adherens junction,Adherens junction,71,20,8,0.400000,0.003578,2,1,0,2.236612e-15,7.157158e-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29385,WikiPathway_2023_Human,gpt-4o-2024-05-13,llm_genes_rag,Serotonin Receptor 4 6 7 And NR3C Signaling WP734,Serotonin Receptor 4 6 7 And NR3C Signaling,19,22,6,0.272727,0.000957,0,1,0,2.379616e-14,1.906073e-11
29386,WikiPathway_2023_Human,gpt-4o-2024-05-13,llm_genes_rag,Toll Like Receptor Signaling Pathway WP75,Toll Like Receptor Signaling Pathway,102,26,20,0.769231,0.005140,0,1,0,5.664995e-42,4.537661e-39
29387,WikiPathway_2023_Human,gpt-4o-2024-05-13,llm_genes_rag,TCA Cycle Aka Krebs Or Citric Acid Cycle WP78,TCA Cycle Aka Krebs Or Citric Acid Cycle,18,27,18,0.666667,0.000907,0,1,0,1.347571e-55,1.079404e-52
29388,WikiPathway_2023_Human,gpt-4o-2024-05-13,llm_genes_rag,Nucleotide GPCRs WP80,Nucleotide GPCRs,11,45,0,0.000000,0.000554,0,1,1,1.000000e+00,1.000000e+00


In [57]:
df.to_csv("genes_overlap.tsv",sep="\t",index=None)