In [1]:
import pandas as pd
import json
from scipy.stats import hypergeom
from statsmodels.stats.multitest import multipletests
import llm2geneset

In [2]:
lib_names = ["KEGG_2021_Human", 
             "Reactome_2022", 
             "WikiPathway_2023_Human", 
             "GO_Biological_Process_2023",
             "GO_Molecular_Function_2023",
             "GO_Cellular_Component_2023"]
models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]
lib_names = ["KEGG_2021_Human", 
             "Reactome_2022", 
             "WikiPathway_2023_Human"]
gene_sets = ["llm_genes_role", "llm_genes_norole", "llm_genes_reason", 
             "llm_genes_conf_high", "llm_ensembled"]

In [3]:
# Create gene sets with only high or high/medium confidence.
for model in models:
    for lib_name in lib_names:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        llm_genes_conf_high = llm2geneset.sel_conf(gen_res["descr"], gen_res["llm_genes_conf"], ["high"])
        llm_genes_conf_high_medium = llm2geneset.sel_conf(gen_res["descr"], gen_res["llm_genes_conf"], ["high", "medium"])

        gen_res["llm_genes_conf_high"] = llm_genes_conf_high
        gen_res["llm_genes_conf_high_medium"] = llm_genes_conf_high_medium
        with open('libs_human/' + model + '/' + lib_name + '.json', 'w') as json_file:
            json.dump(gen_res, json_file, indent=4) 

In [12]:
# Use ensemble generations to define gene sets.
for model in models:
    for lib_name in lib_names:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        # Append separate generations together.
        llm_genes = []
        for i in range(4):
            llm_genes.append(gen_res["llm_ensemble_" + str(i)])
            
        # Generate ensembled set. 
        llm_ensembled = llm2geneset.ensemble_genes(gen_res["descr_cleaned"], llm_genes, 2)
    
        # Add ensembled results.
        gen_res["llm_ensembled"] = llm_ensembled
        
        with open('libs_human/' + model + '/' + lib_name + '.json', 'w') as json_file:
            json.dump(gen_res, json_file, indent=4)

In [13]:
# Output table with generation times.
gen_time_table = []
for lib_name in lib_names:    
    for model in models:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
        x = {"lib_name": lib_name,
             "model": model,
             "gen_time_role": gen_res["gen_time_role"],
             "gen_time_reasoning": gen_res["gen_time_reasoning"],
             "gen_time_conf": gen_res["gen_time_conf"],
             "gen_time_norole": gen_res["gen_time_norole"]
        }
        gen_time_table.append(x)
df = pd.DataFrame(gen_time_table)
df.to_csv("gen_time.tsv",sep="\t",index=None)
df

Unnamed: 0,lib_name,model,gen_time_role,gen_time_reasoning,gen_time_conf,gen_time_norole
0,KEGG_2021_Human,gpt-3.5-turbo-0125,97.471539,12.884575,15.758627,59.641582
1,KEGG_2021_Human,gpt-4o-2024-05-13,25.138377,22.30365,20.539535,24.146841
2,Reactome_2022,gpt-3.5-turbo-0125,65.464516,25.274014,55.920584,40.419058
3,Reactome_2022,gpt-4o-2024-05-13,35.385588,43.002771,38.653894,94.229863
4,WikiPathway_2023_Human,gpt-3.5-turbo-0125,25.406484,13.916806,22.751334,20.999619
5,WikiPathway_2023_Human,gpt-4o-2024-05-13,51.99826,24.617129,30.099921,24.904363


In [14]:
hgcn_symbols = pd.read_csv("hgnc_symbols.txt", sep="\t", header=None)[0].tolist()
hgcn_symbols = set(hgcn_symbols)

In [15]:
database_res = []
tok_use = []

for lib_name in lib_names:
    print(lib_name)
    for model in models:
        print(model)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
            
        for gset in gene_sets:
            print(gset)
            iou_output = []
            in_toks = 0
            out_toks = 0
            for idx in range(len(gen_res["descr"])):
                curated_genes = gen_res["curated_genesets"][idx]
                parsed_llm_genes = gen_res[gset][idx]['parsed_genes']
                
                in_toks += gen_res[gset][idx]['in_toks']
                out_toks += gen_res[gset][idx]['out_toks']
    
                non_hgcn = len(set(parsed_llm_genes) - hgcn_symbols)
    
                llm_genes = list(set(parsed_llm_genes)) # make sure unique genes are selected
    
                
                intersection = set(llm_genes).intersection(set(curated_genes))
                p_val = hypergeom.sf(len(intersection)-1,
                                     19846-len(curated_genes), 
                                     len(curated_genes), 
                                     len(llm_genes))
                generatio = None
                if len(llm_genes) > 0:
                    generatio = float(len(intersection))/len(llm_genes)

                x = {
                    'database': lib_name,
                    'model' : model,
                    'gene_set' : gset, 
                    'descr': gen_res["descr_cleaned"][idx],
                    'ncurated': len(curated_genes),
                    'nllm': len(llm_genes),
                    'ninter': len(intersection),
                    'generatio': generatio,
                    'bgratio': float(len(curated_genes))/19846,
                    'non_hgcn': non_hgcn,
                    'ndup': len(parsed_llm_genes) - len(llm_genes),
                    'p_val': p_val
                }
                iou_output.append(x)

            toks_cur = {"lib_name": lib_name, "model": model,
                        "gene_set": gset,
                        "in_toks": in_toks, "out_toks" : out_toks}
            tok_use.append(toks_cur)
            df = pd.DataFrame(iou_output)

            # no genes, remove for ensembling
            #df = df[df["nllm"] != 0]
            
            _, pvals_corrected, _, _ = multipletests(df['p_val'], method='bonferroni')
            df['p_val_adj'] = pvals_corrected
            print(df[(df['p_val_adj'] < 0.01) & (df["generatio"] >= 0.5)].shape[0] / df.shape[0])
            database_res.append(df)    
        print("")

KEGG_2021_Human
gpt-3.5-turbo-0125
llm_genes_role
0.540625
llm_genes_norole
0.5625
llm_genes_reason
0.734375
llm_genes_conf_high
0.61875
llm_ensembled
0.7125

gpt-4o-2024-05-13
llm_genes_role
0.55625
llm_genes_norole
0.578125
llm_genes_reason
0.684375
llm_genes_conf_high
0.81875
llm_ensembled
0.678125

Reactome_2022
gpt-3.5-turbo-0125
llm_genes_role
0.3355335533553355
llm_genes_norole
0.3767876787678768
llm_genes_reason
0.4977997799779978
llm_genes_conf_high
0.356985698569857
llm_ensembled
0.4735973597359736

gpt-4o-2024-05-13
llm_genes_role
0.290979097909791
llm_genes_norole
0.3162816281628163
llm_genes_reason
0.4328932893289329
llm_genes_conf_high
0.5748074807480749
llm_ensembled
0.39603960396039606

WikiPathway_2023_Human
gpt-3.5-turbo-0125
llm_genes_role
0.3333333333333333
llm_genes_norole
0.36079900124843944
llm_genes_reason
0.5955056179775281
llm_genes_conf_high
0.4094881398252185
llm_ensembled
0.4893882646691635

gpt-4o-2024-05-13
llm_genes_role
0.2833957553058677
llm_genes_noro

In [16]:
df_tok = pd.DataFrame(tok_use)
df_tok.to_csv("tok_use.tsv", sep="\t", index=None)
df_tok

Unnamed: 0,lib_name,model,gene_set,in_toks,out_toks
0,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_role,51489,90284
1,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_norole,48120,74499
2,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_reason,65400,67948
3,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_conf_high,70520,54151
4,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_ensembled,192480,297221
5,KEGG_2021_Human,gpt-4o-2024-05-13,llm_genes_role,51772,117996
6,KEGG_2021_Human,gpt-4o-2024-05-13,llm_genes_norole,48572,107538
7,KEGG_2021_Human,gpt-4o-2024-05-13,llm_genes_reason,65532,187685
8,KEGG_2021_Human,gpt-4o-2024-05-13,llm_genes_conf_high,70652,152304
9,KEGG_2021_Human,gpt-4o-2024-05-13,llm_ensembled,194437,426473


In [17]:
df_tok_gpt4 = df_tok[df_tok["model"] == "gpt-4o-2024-05-13"]
df_tok_gpt4["in_toks"].sum() / 1e6 * 5 + df_tok_gpt4["in_toks"].sum() / 1e6 * 15

82.51794000000001

In [18]:
df_tok_gpt3 = df_tok[df_tok["model"] == "gpt-3.5-turbo-0125"]
df_tok_gpt3["in_toks"].sum() / 1e6 * 0.5 + df_tok_gpt3["in_toks"].sum() / 1e6 * 1.5

8.198712

In [30]:
df = pd.concat(database_res, ignore_index=True)

In [27]:
df.sort_values("p_val")

Unnamed: 0,database,model,gene_set,descr,ncurated,nllm,ninter,generatio,bgratio,non_hgcn,ndup,p_val,p_val_adj
492,WikiPathway_2023_Human,gpt-4o-2024-05-13,llm_ensembled,Cytoplasmic Ribosomal Proteins,88,68,68,1.000000,0.004434,0,0,6.634147e-177,5.313952e-174
3,WikiPathway_2023_Human,gpt-4o-2024-05-13,llm_ensembled,Electron Transport Chain OXPHOS System In Mito...,105,70,67,0.957143,0.005291,0,0,2.042915e-160,1.636375e-157
43,WikiPathway_2023_Human,gpt-4o-2024-05-13,llm_ensembled,Proteasome Degradation,62,41,39,0.951220,0.003124,0,0,2.872470e-102,2.300849e-99
774,WikiPathway_2023_Human,gpt-4o-2024-05-13,llm_ensembled,Oxidative Phosphorylation,60,64,43,0.671875,0.003023,0,0,1.785630e-101,1.430290e-98
384,WikiPathway_2023_Human,gpt-4o-2024-05-13,llm_ensembled,Mitochondrial Complex I Assembly Model OXPHOS ...,56,41,35,0.853659,0.002822,0,0,2.698709e-89,2.161666e-86
...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,WikiPathway_2023_Human,gpt-4o-2024-05-13,llm_ensembled,Pancreatic Cancer Subtypes,48,9,0,0.000000,0.002419,0,0,1.000000e+00,1.000000e+00
83,WikiPathway_2023_Human,gpt-4o-2024-05-13,llm_ensembled,Glial Cell Differentiation,7,2,0,0.000000,0.000353,0,0,1.000000e+00,1.000000e+00
98,WikiPathway_2023_Human,gpt-4o-2024-05-13,llm_ensembled,Gastric Cancer Network 1,27,6,0,0.000000,0.001360,0,0,1.000000e+00,1.000000e+00
672,WikiPathway_2023_Human,gpt-4o-2024-05-13,llm_ensembled,2Q13 Copy Number Variation Syndrome,60,0,0,,0.003023,0,0,1.000000e+00,1.000000e+00


In [28]:
df.to_csv("genes_overlap.tsv",sep="\t",index=None)