In [2]:
import pandas as pd
import json
from scipy.stats import hypergeom
from statsmodels.stats.multitest import multipletests
import llm2geneset

In [1]:
lib_names = ["KEGG_2021_Human", 
             "Reactome_2022", 
             "WikiPathway_2023_Human", 
             "GO_Biological_Process_2023",
             "GO_Molecular_Function_2023",
             "GO_Cellular_Component_2023"]
models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]

In [3]:
# Output table with generation times.
gen_time_table = []
for lib_name in lib_names:    
    for model in models:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
        gen_time_table.append({"lib_name": lib_name, 
         "model": model,
         "gen_time": gen_res["gen_time"]
        })
pd.DataFrame(gen_time_table).to_csv("gen_time.tsv",sep="\t",index=None)

In [4]:
hgcn_symbols = pd.read_csv("hgnc_symbols.txt", sep="\t", header=None)[0].tolist()
hgcn_symbols = set(hgcn_symbols)

In [24]:
# Calculate Jaccard coefficient (intersection over union)
def calculate_iou(setA, setB):
    # Calculate the intersection of two sets
    intersection = setA.intersection(setB)
    # Calculate the union of two sets
    union = setA.union(setB)
    # Calculate the Intersection over Union
    iou = len(intersection) / len(union)
    return iou

database_res = []


for lib_name in lib_names:    
    print(lib_name)
    for model in models:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
            
        
        iou_output = []
        for idx in range(len(gen_res["descr"])):
            curated_genes = gen_res["curated_genesets"][idx]
            parsed_llm_genes = gen_res["llm_genesets"][idx]['parsed_genes']

            non_hgcn = len(set(parsed_llm_genes) - hgcn_symbols)

            llm_genes = list(set(parsed_llm_genes)) # make sure unique genes are selected
            #llm_genes = list(set(parsed_llm_genes).intersection(hgcn_symbols)) # make sure unique genes are selected

            
            intersection = set(llm_genes).intersection(set(curated_genes))
            p_val = hypergeom.sf(len(intersection)-1,
                                 19846-len(curated_genes), 
                                 len(curated_genes), 
                                 len(llm_genes))
            generatio = None
            if len(llm_genes) > 0:
                generatio = float(len(intersection))/len(llm_genes)
            
            x = {
                'database': lib_name,
                'model' : model,
                'descr': gen_res["descr_cleaned"][idx],
                'ncurated': len(curated_genes),
                'nllm': len(llm_genes),
                'ninter': len(intersection),
                'generatio': generatio,
                'bgratio': float(len(curated_genes))/19846,
                'non_hgcn': non_hgcn,
                'ndup': len(parsed_llm_genes) - len(llm_genes),
                'p_val': p_val
            }
            iou_output.append(x)
    
        df = pd.DataFrame(iou_output)
        # Adjust p-values using the Benjamini-Hochberg (FDR) method
        _, pvals_corrected, _, _ = multipletests(df['p_val'], method='bonferroni')
        df['p_val_adj'] = pvals_corrected
        print(df[df['p_val_adj'] < 0.01].shape[0] / df.shape[0])
        database_res.append(df)    
        

KEGG_2021_Human
0.90625
0.94375
Reactome_2022
0.6556655665566556
0.8030803080308031
WikiPathway_2023_Human
0.7303370786516854
0.8451935081148564
GO_Biological_Process_2023
0.29498797854632886
0.41057887923062697
GO_Molecular_Function_2023
0.5265911072362686
0.6721883173496077
GO_Cellular_Component_2023
0.609704641350211
0.7172995780590717


In [27]:
df = pd.concat(database_res, ignore_index=True)
df[df["p_val"] < 1e-15].sort_values("p_val")

Unnamed: 0,database,model,descr,ncurated,nllm,ninter,generatio,bgratio,non_hgcn,ndup,p_val,p_val_adj
3863,Reactome_2022,gpt-4o-2024-05-13,SLC-mediated Transmembrane Transport,247,268,149,0.555970,0.012446,34,0,2.655249e-230,4.827243e-227
5080,WikiPathway_2023_Human,gpt-4o-2024-05-13,Electron Transport Chain OXPHOS System In Mito...,105,106,87,0.820755,0.005291,14,0,1.802382e-201,1.443708e-198
5569,WikiPathway_2023_Human,gpt-4o-2024-05-13,Cytoplasmic Ribosomal Proteins,88,77,74,0.961039,0.004434,0,0,2.320906e-190,1.859046e-187
573,KEGG_2021_Human,gpt-4o-2024-05-13,Ribosome,158,85,84,0.988235,0.007961,0,0,1.097125e-186,3.510800e-184
14848,GO_Biological_Process_2023,gpt-4o-2024-05-13,Potassium Ion Transport,122,109,83,0.761468,0.006147,2,2,1.625604e-175,8.789640e-172
...,...,...,...,...,...,...,...,...,...,...,...,...
5245,WikiPathway_2023_Human,gpt-4o-2024-05-13,Farnesoid X Receptor Pathway,19,14,6,0.428571,0.000957,3,0,9.620602e-16,7.706102e-13
17977,GO_Molecular_Function_2023,gpt-4o-2024-05-13,MAP Kinase Activity,11,25,6,0.240000,0.000554,0,0,9.641486e-16,1.105878e-12
3975,Reactome_2022,gpt-4o-2024-05-13,Signaling By PDGFRA Extracellular Domain Mutants,11,25,6,0.240000,0.000554,3,0,9.641486e-16,1.752822e-12
3768,Reactome_2022,gpt-4o-2024-05-13,Regulation Of RAS By GAPs,65,11,7,0.636364,0.003275,0,0,9.681258e-16,1.760053e-12


In [29]:
df.to_csv("genes_overlap.tsv",sep="\t",index=None)