In [1]:
import pandas as pd
import json
from scipy.stats import hypergeom
from statsmodels.stats.multitest import multipletests
import llm2geneset

In [2]:
# Models whose generated gene sets are evaluated.
models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]

# Pathway libraries evaluated by this notebook.
# NOTE: an earlier six-entry assignment that also listed
# "GO_Biological_Process_2023", "GO_Molecular_Function_2023" and
# "GO_Cellular_Component_2023" was immediately overwritten by this list, so
# those libraries were never analysed. Append them here to include them
# (presumably needs matching JSON files under libs_human/<model>/ - TODO confirm).
lib_names = ["KEGG_2021_Human",
             "Reactome_2022",
             "WikiPathway_2023_Human"]

# Keys of the per-library result JSON that hold the gene-set variants to score.
gene_sets = ["llm_genes_role", "llm_genes_norole", "llm_genes_reason", "llm_ensembled"]

In [3]:
# Gather the generation times stored in every library/model result file,
# write them to a tab-separated table, and display the table.
gen_time_table = []
for lib_name in lib_names:
    for model in models:
        result_path = f"libs_human/{model}/{lib_name}.json"
        with open(result_path) as handle:
            gen_res = json.load(handle)
        # One row per (library, model) pair.
        record = {
            "lib_name": lib_name,
            "model": model,
            "gen_time_role": gen_res["gen_time_role"],
            "gen_time_reason": gen_res["gen_time_reason"],
        }
        gen_time_table.append(record)

df = pd.DataFrame(gen_time_table)
df.to_csv("gen_time.tsv", sep="\t", index=None)
df

Unnamed: 0,lib_name,model,gen_time_role,gen_time_reason
0,KEGG_2021_Human,gpt-3.5-turbo-0125,51.04008,95.413025
1,KEGG_2021_Human,gpt-4o-2024-05-13,29.746293,26.377889
2,Reactome_2022,gpt-3.5-turbo-0125,104.245385,19.204092
3,Reactome_2022,gpt-4o-2024-05-13,46.977597,39.722649
4,WikiPathway_2023_Human,gpt-3.5-turbo-0125,50.940228,28.816699
5,WikiPathway_2023_Human,gpt-4o-2024-05-13,22.779287,126.244361


In [4]:
# Read the first column of the headerless, tab-separated symbol file and keep
# its values as a set.
hgcn_symbols = set(
    pd.read_csv("hgnc_symbols.txt", sep="\t", header=None)[0].tolist()
)

In [25]:
# Compare every LLM-generated gene set with its curated counterpart and
# accumulate token usage per (library, model, gene-set variant).

# Size of the background gene population: the denominator of 'bgratio' and the
# total population of the hypergeometric test below.
N_BACKGROUND_GENES = 19846

database_res = []  # one overlap DataFrame per (library, model, gene-set variant)
tok_use = []       # one token-usage record per (library, model, gene-set variant)

for lib_name in lib_names:
    for model in models:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        for gset in gene_sets:
            iou_output = []
            in_toks = 0
            out_toks = 0
            for idx in range(len(gen_res["descr"])):
                curated_genes = gen_res["curated_genesets"][idx]
                llm_entry = gen_res[gset][idx]
                parsed_llm_genes = llm_entry['parsed_genes']

                in_toks += llm_entry['in_toks']
                out_toks += llm_entry['out_toks']

                # Deduplicate so a repeated symbol is only counted once.
                llm_genes = set(parsed_llm_genes)
                non_hgcn = len(llm_genes - hgcn_symbols)
                intersection = llm_genes.intersection(curated_genes)

                # P(overlap >= observed). scipy's hypergeom is parameterised as
                # (k, M, n, N): M is the TOTAL population size, n the number of
                # "success" items in it, N the number of draws. M must be the
                # whole background, not background minus the curated set
                # (that would be R's phyper convention).
                p_val = hypergeom.sf(len(intersection) - 1,
                                     N_BACKGROUND_GENES,
                                     len(curated_genes),
                                     len(llm_genes))

                # Fraction of generated genes found in the curated set;
                # undefined (None) when nothing was generated.
                generatio = None
                if len(llm_genes) > 0:
                    generatio = float(len(intersection)) / len(llm_genes)

                iou_output.append({
                    'database': lib_name,
                    'model': model,
                    'gene_set': gset,
                    'descr': gen_res["descr_cleaned"][idx],
                    'ncurated': len(curated_genes),
                    'nllm': len(llm_genes),
                    'ninter': len(intersection),
                    'generatio': generatio,
                    'bgratio': float(len(curated_genes)) / N_BACKGROUND_GENES,
                    'non_hgcn': non_hgcn,
                    'ndup': len(parsed_llm_genes) - len(llm_genes),
                    'p_val': p_val
                })

            tok_use.append({"lib_name": lib_name, "model": model,
                            "gene_set": gset,
                            "in_toks": in_toks, "out_toks": out_toks})

            # Rows with nllm == 0 are intentionally kept (no filtering here).
            overlap_df = pd.DataFrame(iou_output)

            # Bonferroni correction is applied within each
            # (library, model, gene-set variant) group.
            _, pvals_corrected, _, _ = multipletests(overlap_df['p_val'], method='bonferroni')
            overlap_df['p_val_adj'] = pvals_corrected
            database_res.append(overlap_df)
        

In [26]:
# Tabulate the accumulated token-usage records, save them as a tab-separated
# file, and display the table.
df_tok = pd.DataFrame(data=tok_use)
df_tok.to_csv(path_or_buf="tok_use.tsv", sep="\t", index=None)
df_tok

Unnamed: 0,lib_name,model,gene_set,in_toks,out_toks
0,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_role,51479,90383
1,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_norole,48120,69324
2,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_genes_reason,68600,76872
3,KEGG_2021_Human,gpt-3.5-turbo-0125,llm_ensembled,205437,341786
4,KEGG_2021_Human,gpt-4o-2024-05-13,llm_genes_role,51772,116272
5,KEGG_2021_Human,gpt-4o-2024-05-13,llm_genes_norole,48572,111985
6,KEGG_2021_Human,gpt-4o-2024-05-13,llm_genes_reason,68732,193730
7,KEGG_2021_Human,gpt-4o-2024-05-13,llm_ensembled,207088,462246
8,Reactome_2022,gpt-3.5-turbo-0125,llm_genes_role,306454,323942
9,Reactome_2022,gpt-3.5-turbo-0125,llm_genes_norole,288248,274713


In [19]:
# Stack all frames collected in database_res into a single frame with a
# fresh consecutive index.
df = pd.concat(objs=database_res, ignore_index=True)

In [20]:
# Persist the combined overlap table as tab-separated text.
df.to_csv(path_or_buf="genes_overlap.tsv", sep="\t", index=None)