In [1]:
import pandas as pd
import json
from scipy.stats import hypergeom
from statsmodels.stats.multitest import multipletests
import llm2geneset

In [2]:
lib_names = ["KEGG_2021_Human", 
             "Reactome_2022", 
             "WikiPathway_2023_Human", 
             "GO_Biological_Process_2023",
             "GO_Molecular_Function_2023",
             "GO_Cellular_Component_2023"]
models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]
lib_names = ["KEGG_2021_Human", 
             "Reactome_2022", 
             "WikiPathway_2023_Human"]
gene_sets = ["llm_genes_role", "llm_genes_norole", "llm_genes_reason", 
             "llm_genes_conf_high", "llm_genes_conf_high_medium", "llm_ensembled"]

In [3]:
# Create gene sets with only high or high/medium confidence.
for model in models:
    for lib_name in lib_names:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        llm_genes_conf_high = llm2geneset.sel_conf(gen_res["descr"], gen_res["llm_genes_conf"], ["high"])
        llm_genes_conf_high_medium = llm2geneset.sel_conf(gen_res["descr"], gen_res["llm_genes_conf"], ["high", "medium"])

        gen_res["llm_genes_conf_high"] = llm_genes_conf_high
        gen_res["llm_genes_conf_high_medium"] = llm_genes_conf_high_medium
        with open('libs_human/' + model + '/' + lib_name + '.json', 'w') as json_file:
            json.dump(gen_res, json_file, indent=4) 

In [4]:
# Use ensemble generations to define gene sets.
for model in models:
    for lib_name in lib_names:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        # Append separate generations together.
        llm_genes = [gen_res["llm_genes_norole"]]
        for i in range(4):
            llm_genes.append(gen_res["llm_ensemble_" + str(i)])
            
        # Generate ensembled set. 
        llm_ensembled = llm2geneset.ensemble_genes(gen_res["descr_cleaned"], llm_genes, 5)
    
        # Add ensembled results.
        gen_res["llm_ensembled"] = llm_ensembled
        
        with open('libs_human/' + model + '/' + lib_name + '.json', 'w') as json_file:
            json.dump(gen_res, json_file, indent=4)

In [5]:
# Output table with generation times.
gen_time_table = []
for lib_name in lib_names:    
    for model in models:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
        x = {"lib_name": lib_name,
             "model": model,
             "gen_time_role": gen_res["gen_time_role"],
             "gen_time_norole": gen_res["gen_time_norole"],
             "gen_time_reasoning": gen_res["gen_time_reasoning"],
             "gen_time_conf": gen_res["gen_time_conf"]
        }
        gen_time_table.append(x)
df = pd.DataFrame(gen_time_table)
df.to_csv("gen_time.tsv",sep="\t",index=None)
df

Unnamed: 0,lib_name,model,gen_time_role,gen_time_norole,gen_time_reasoning,gen_time_conf
0,KEGG_2021_Human,gpt-3.5-turbo-0125,97.471539,59.641582,12.884575,15.758627
1,KEGG_2021_Human,gpt-4o-2024-05-13,25.138377,24.146841,22.30365,20.539535
2,Reactome_2022,gpt-3.5-turbo-0125,65.464516,40.419058,25.274014,55.920584
3,Reactome_2022,gpt-4o-2024-05-13,35.385588,94.229863,43.002771,38.653894
4,WikiPathway_2023_Human,gpt-3.5-turbo-0125,25.406484,20.999619,13.916806,22.751334
5,WikiPathway_2023_Human,gpt-4o-2024-05-13,51.99826,24.904363,24.617129,30.099921


In [6]:
hgcn_symbols = pd.read_csv("hgnc_symbols.txt", sep="\t", header=None)[0].tolist()
hgcn_symbols = set(hgcn_symbols)

In [7]:
database_res = []
tok_use = []

for lib_name in lib_names:
    print(lib_name)
    for model in models:
        print(model)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
            
        for gset in gene_sets:
            print(gset)
            iou_output = []
            in_toks = 0
            out_toks = 0
            for idx in range(len(gen_res["descr"])):
                curated_genes = gen_res["curated_genesets"][idx]
                parsed_llm_genes = gen_res[gset][idx]['parsed_genes']
                ntries = gen_res[gset][idx]['ntries']
                
                in_toks += gen_res[gset][idx]['in_toks']
                out_toks += gen_res[gset][idx]['out_toks']
    
                non_hgcn = len(set(parsed_llm_genes) - hgcn_symbols)
    
                llm_genes = list(set(parsed_llm_genes)) # make sure unique genes are selected
    
                
                intersection = set(llm_genes).intersection(set(curated_genes))
                p_val = hypergeom.sf(len(intersection)-1,
                                     19846-len(curated_genes), 
                                     len(curated_genes), 
                                     len(llm_genes))
                generatio = None
                if len(llm_genes) > 0:
                    generatio = float(len(intersection))/len(llm_genes)

                x = {
                    'database': lib_name,
                    'model' : model,
                    'gene_set' : gset, 
                    'descr': gen_res["descr"][idx],
                    'descr_cleaned': gen_res["descr_cleaned"][idx],
                    'ncurated': len(curated_genes),
                    'nllm': len(llm_genes),
                    'ninter': len(intersection),
                    'generatio': generatio,
                    'bgratio': float(len(curated_genes))/19846,
                    'non_hgcn': non_hgcn,
                    'ntries' : ntries,
                    'ndup': len(parsed_llm_genes) - len(llm_genes),
                    'p_val': p_val
                }
                if gen_res["descr_cleaned"][idx] == "Maturation Of Nucleoprotein":
                    print(gset)
                    print(x)
                iou_output.append(x)

            toks_cur = {"lib_name": lib_name, "model": model,
                        "gene_set": gset,
                        "in_toks": in_toks, "out_toks" : out_toks}
            tok_use.append(toks_cur)
            df = pd.DataFrame(iou_output)
           
            _, pvals_corrected, _, _ = multipletests(df['p_val'], method='bonferroni')
            df['p_val_adj'] = pvals_corrected
            #& (df["generatio"] >= 0.5
            #print(df[(df['p_val_adj'] < 0.01) ].shape[0] / df.shape[0])
            database_res.append(df)    
        print("")

KEGG_2021_Human
gpt-3.5-turbo-0125
llm_genes_role
llm_genes_norole
llm_genes_reason
llm_genes_conf_high
llm_genes_conf_high_medium
llm_ensembled

gpt-4o-2024-05-13
llm_genes_role
llm_genes_norole
llm_genes_reason
llm_genes_conf_high
llm_genes_conf_high_medium
llm_ensembled

Reactome_2022
gpt-3.5-turbo-0125
llm_genes_role
llm_genes_role
{'database': 'Reactome_2022', 'model': 'gpt-3.5-turbo-0125', 'gene_set': 'llm_genes_role', 'descr': 'Maturation Of Nucleoprotein R-HSA-9683610', 'descr_cleaned': 'Maturation Of Nucleoprotein', 'ncurated': 12, 'nllm': 4, 'ninter': 0, 'generatio': 0.0, 'bgratio': 0.0006046558500453492, 'non_hgcn': 0, 'ntries': 1, 'ndup': 0, 'p_val': 1.0}
llm_genes_norole
llm_genes_norole
{'database': 'Reactome_2022', 'model': 'gpt-3.5-turbo-0125', 'gene_set': 'llm_genes_norole', 'descr': 'Maturation Of Nucleoprotein R-HSA-9683610', 'descr_cleaned': 'Maturation Of Nucleoprotein', 'ncurated': 12, 'nllm': 10, 'ninter': 0, 'generatio': 0.0, 'bgratio': 0.0006046558500453492, 'n

In [8]:
df_tok = pd.DataFrame(tok_use)
df_tok.to_csv("tok_use.tsv", sep="\t", index=None)
df_tok

Unnamed: 0,lib_name,model,gene_set,in_toks,out_toks
0,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_role,51489,90284
1,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_norole,48120,74499
2,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_reason,65400,67948
3,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_conf_high,70520,54151
4,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_conf_high_medium,70520,54151
5,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_ensembled,240600,371720
6,KEGG_2021_Human,gpt-4o-2024-05-13,llm_genes_role,51772,117996
7,KEGG_2021_Human,gpt-4o-2024-05-13,llm_genes_norole,48572,107538
8,KEGG_2021_Human,gpt-4o-2024-05-13,llm_genes_reason,65532,187685
9,KEGG_2021_Human,gpt-4o-2024-05-13,llm_genes_conf_high,70652,152304


In [9]:
df_tok_gpt4 = df_tok[df_tok["model"] == "gpt-4o-2024-05-13"]
df_tok_gpt4["in_toks"].sum() / 1e6 * 5 + df_tok_gpt4["in_toks"].sum() / 1e6 * 15

105.26097999999999

In [10]:
df_tok_gpt3 = df_tok[df_tok["model"] == "gpt-3.5-turbo-0125"]
df_tok_gpt3["in_toks"].sum() / 1e6 * 0.5 + df_tok_gpt3["in_toks"].sum() / 1e6 * 1.5

10.462696

In [11]:
df = pd.concat(database_res, ignore_index=True)

In [14]:
df[(df["descr_cleaned"] == "Maturation Of Nucleoprotein") & (df["model"] == "gpt-4o-2024-05-13")]

Unnamed: 0,database,model,gene_set,descr,descr_cleaned,ncurated,nllm,ninter,generatio,bgratio,non_hgcn,ntries,ndup,p_val,p_val_adj
15603,Reactome_2022,gpt-4o-2024-05-13,llm_genes_role,Maturation Of Nucleoprotein R-HSA-9683610,Maturation Of Nucleoprotein,12,0,0,,0.000605,0,1,0,1.0,1.0
17421,Reactome_2022,gpt-4o-2024-05-13,llm_genes_norole,Maturation Of Nucleoprotein R-HSA-9683610,Maturation Of Nucleoprotein,12,10,0,0.0,0.000605,0,1,0,1.0,1.0
19239,Reactome_2022,gpt-4o-2024-05-13,llm_genes_reason,Maturation Of Nucleoprotein R-HSA-9683610,Maturation Of Nucleoprotein,12,10,0,0.0,0.000605,0,1,0,1.0,1.0
21057,Reactome_2022,gpt-4o-2024-05-13,llm_genes_conf_high,Maturation Of Nucleoprotein R-HSA-9683610,Maturation Of Nucleoprotein,12,1,0,0.0,0.000605,0,1,0,1.0,1.0
22875,Reactome_2022,gpt-4o-2024-05-13,llm_genes_conf_high_medium,Maturation Of Nucleoprotein R-HSA-9683610,Maturation Of Nucleoprotein,12,4,0,0.0,0.000605,0,1,0,1.0,1.0
24693,Reactome_2022,gpt-4o-2024-05-13,llm_ensembled,Maturation Of Nucleoprotein R-HSA-9683610,Maturation Of Nucleoprotein,12,0,0,,0.000605,0,5,0,1.0,1.0


In [15]:
df.to_csv("genes_overlap.tsv",sep="\t",index=None)

In [16]:
df.sort_values("p_val").tail(7000)

Unnamed: 0,database,model,gene_set,descr,descr_cleaned,ncurated,nllm,ninter,generatio,bgratio,non_hgcn,ntries,ndup,p_val,p_val_adj
7705,Reactome_2022,gpt-3.5-turbo-0125,llm_genes_reason,Caspase-mediated Cleavage Of Cytoskeletal Prot...,Caspase-mediated Cleavage Of Cytoskeletal Prot...,12,3,1,0.333333,0.000605,0,1,0,0.001814,1.0
13882,Reactome_2022,gpt-3.5-turbo-0125,llm_ensembled,NOTCH2 Intracellular Domain Regulates Transcri...,NOTCH2 Intracellular Domain Regulates Transcri...,12,3,1,0.333333,0.000605,0,5,0,0.001814,1.0
27311,WikiPathway_2023_Human,gpt-3.5-turbo-0125,llm_genes_reason,SRF And miRs In Smooth Muscle Differentiation ...,SRF And miRs In Smooth Muscle Differentiation ...,12,3,1,0.333333,0.000605,2,1,0,0.001814,1.0
9956,Reactome_2022,gpt-3.5-turbo-0125,llm_genes_conf_high,HDR Thru MMEJ (alt-NHEJ) R-HSA-5685939,HDR Thru MMEJ (alt-NHEJ),12,3,1,0.333333,0.000605,0,1,0,0.001814,1.0
10119,Reactome_2022,gpt-3.5-turbo-0125,llm_genes_conf_high,Lysine Catabolism R-HSA-71064,Lysine Catabolism,12,3,1,0.333333,0.000605,0,1,0,0.001814,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8128,Reactome_2022,gpt-3.5-turbo-0125,llm_genes_reason,HCMV Early Events R-HSA-9609690,HCMV Early Events,96,4,0,0.000000,0.004837,4,1,0,1.000000,1.0
13004,Reactome_2022,gpt-3.5-turbo-0125,llm_ensembled,Activation Of Kainate Receptors Upon Glutamate...,Activation Of Kainate Receptors Upon Glutamate...,29,0,0,,0.001461,0,5,0,1.000000,1.0
25954,WikiPathway_2023_Human,gpt-3.5-turbo-0125,llm_genes_role,mRNA Protein And Metabolite Inducation Pathway...,mRNA Protein And Metabolite Inducation Pathway...,7,10,0,0.000000,0.000353,0,1,0,1.000000,1.0
11317,Reactome_2022,gpt-3.5-turbo-0125,llm_genes_conf_high_medium,CREB3 Factors Activate Genes R-HSA-8874211,CREB3 Factors Activate Genes,8,3,0,0.000000,0.000403,0,1,0,1.000000,1.0
