In [6]:
import pandas as pd
import json
from scipy.stats import hypergeom
from statsmodels.stats.multitest import multipletests
import llm2geneset

In [7]:
lib_names = ["KEGG_2021_Human", 
             "Reactome_2022", 
             "WikiPathway_2023_Human", 
             "GO_Biological_Process_2023",
             "GO_Molecular_Function_2023",
             "GO_Cellular_Component_2023"]
models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]

In [8]:
# Output table with generation times.
gen_time_table = []
for lib_name in lib_names:    
    for model in models:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
        gen_time_table.append({"lib_name": lib_name, 
         "model": model,
         "gen_time": gen_res["gen_time"]
        })
pd.DataFrame(gen_time_table).to_csv("gen_time.tsv",sep="\t",index=None)

In [9]:
hgcn_symbols = pd.read_csv("hgnc_symbols.txt", sep="\t", header=None)[0].tolist()
hgcn_symbols = set(hgcn_symbols)

In [35]:
# Calculate Jaccard coefficient (intersection over union)
def calculate_iou(setA, setB):
    # Calculate the intersection of two sets
    intersection = setA.intersection(setB)
    # Calculate the union of two sets
    union = setA.union(setB)
    # Calculate the Intersection over Union
    iou = len(intersection) / len(union)
    return iou

database_res = []
tok_use = []

for lib_name in lib_names:    
    print(lib_name)
    for model in models:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
            
        
        iou_output = []
        in_toks = 0
        out_toks = 0
        for idx in range(len(gen_res["descr"])):
            curated_genes = gen_res["curated_genesets"][idx]
            parsed_llm_genes = gen_res["llm_genesets"][idx]['parsed_genes']
            
            in_toks += gen_res["llm_genes_norole"][idx]['in_toks']
            out_toks += gen_res["llm_genes_norole"][idx]['out_toks']

            non_hgcn = len(set(parsed_llm_genes) - hgcn_symbols)

            llm_genes = list(set(parsed_llm_genes)) # make sure unique genes are selected
            #llm_genes = list(set(parsed_llm_genes).intersection(hgcn_symbols)) # make sure unique genes are selected

            
            intersection = set(llm_genes).intersection(set(curated_genes))
            p_val = hypergeom.sf(len(intersection)-1,
                                 19846-len(curated_genes), 
                                 len(curated_genes), 
                                 len(llm_genes))
            generatio = None
            if len(llm_genes) > 0:
                generatio = float(len(intersection))/len(llm_genes)
            
            x = {
                'database': lib_name,
                'model' : model,
                'descr': gen_res["descr_cleaned"][idx],
                'ncurated': len(curated_genes),
                'nllm': len(llm_genes),
                'ninter': len(intersection),
                'generatio': generatio,
                'bgratio': float(len(curated_genes))/19846,
                'non_hgcn': non_hgcn,
                'ndup': len(parsed_llm_genes) - len(llm_genes),
                'p_val': p_val
            }
            iou_output.append(x)

        tok_use.append({"lib_name": lib_name, "model": model, "in_toks": in_toks, "out_toks" : out_toks})
        df = pd.DataFrame(iou_output)
        # Adjust p-values using the Benjamini-Hochberg (FDR) method
        _, pvals_corrected, _, _ = multipletests(df['p_val'], method='bonferroni')
        df['p_val_adj'] = pvals_corrected
        print(df[df['p_val_adj'] < 0.01].shape[0] / df.shape[0])
        database_res.append(df)    
        

KEGG_2021_Human
0.8875
0.95
Reactome_2022
0.6705170517051705
0.8113311331133113
WikiPathway_2023_Human
0.7228464419475655
0.8426966292134831
GO_Biological_Process_2023
0.287220270020344
0.40909931570186797
GO_Molecular_Function_2023
0.5370531822144725
0.6617262423714037
GO_Cellular_Component_2023
0.5864978902953587
0.6877637130801688


In [51]:
df_tok = pd.DataFrame(tok_use)
df.to_csv("tok_use.tsv", sep="\t", index=None)


cost_gpt35 = df_tok[df_tok["model"] == "gpt-3.5-turbo-0125"].in_toks.sum() / 1e6 * 0.5 + df_tok[df_tok["model"] == "gpt-3.5-turbo-0125"].out_toks.sum() / 1e6 * 1.5
cost_gpt4o = df_tok[df_tok["model"] == "gpt-4o-2024-05-13"].in_toks.sum() / 1e6 * 5 + df_tok[df_tok["model"] == "gpt-4o-2024-05-13"].out_toks.sum() / 1e6 * 15
print(cost_gpt35)
print(cost_gpt4o)

3.276396
51.766705


In [39]:
df = pd.concat(database_res, ignore_index=True)
df[df["p_val"] < 1e-15].sort_values("p_val")

TypeError: DataFrame.sort_values() got an unexpected keyword argument 'index'

In [29]:
df.to_csv("genes_overlap.tsv",sep="\t",index=None)