In [10]:
import pandas as pd
import json
from scipy.stats import hypergeom
from statsmodels.stats.multitest import multipletests
import llm2geneset

In [12]:
lib_names = ["KEGG_2021_Human", 
             "Reactome_2022", 
             "WikiPathway_2023_Human", 
             "GO_Biological_Process_2023",
             "GO_Molecular_Function_2023",
             "GO_Cellular_Component_2023"]

In [22]:
# Calculate Jaccard coefficient (intersection over union)
def calculate_iou(setA, setB):
    # Calculate the intersection of two sets
    intersection = setA.intersection(setB)
    # Calculate the union of two sets
    union = setA.union(setB)
    # Calculate the Intersection over Union
    iou = len(intersection) / len(union)
    return iou

database_res = []

models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]

for lib_name in lib_names:    
    print(lib_name)
    for model in models:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
            
        
        iou_output = []
        for idx in range(len(gen_res["descr"])):
            curated_genes = gen_res["curated_genesets"][idx]
            llm_genes = gen_res["llm_genesets"][idx]['genes']
            intersection = set(llm_genes).intersection(set(curated_genes))
            p_val = hypergeom.sf(len(intersection)-1,
                                 19846-len(curated_genes), 
                                 len(curated_genes), 
                                 len(llm_genes))
            x = {
                'database': lib_name,
                'model' : model,
                'descr': gen_res["descr_cleaned"][idx],
                'ncurated': len(curated_genes),
                'nllm': len(llm_genes),
                'ninter': len(intersection),
                'iou': calculate_iou(set(curated_genes), set(llm_genes)),
                'p_val': p_val
            }
            iou_output.append(x)
    
        df = pd.DataFrame(iou_output)
        # Adjust p-values using the Benjamini-Hochberg (FDR) method
        _, pvals_corrected, _, _ = multipletests(df['p_val'], method='bonferroni')
        df['p_val_adj'] = pvals_corrected
        print(df[df['p_val_adj'] < 0.01].shape[0] / df.shape[0])
        database_res.append(df)    
        

KEGG_2021_Human
0.871875
0.95
Reactome_2022
0.6683168316831684
0.8113311331133113
WikiPathway_2023_Human
0.7378277153558053
0.8526841448189763
GO_Biological_Process_2023
0.2946180876641391
0.42592935084150174
GO_Molecular_Function_2023
0.5135135135135135
0.6748038360941587
GO_Cellular_Component_2023
0.5949367088607594
0.7109704641350211


In [23]:
df = pd.concat(database_res, ignore_index=True)

In [24]:
df[(df["p_val_adj"] < 0.01)].sort_values("p_val")

Unnamed: 0,database,model,descr,ncurated,nllm,ninter,iou,p_val,p_val_adj
5569,WikiPathway_2023_Human,gpt-4o-2024-05-13,Cytoplasmic Ribosomal Proteins,88,74,73,0.820225,3.085370e-190,2.471381e-187
388,KEGG_2021_Human,gpt-4o-2024-05-13,Cytokine-cytokine receptor interaction,295,118,107,0.349673,3.890752e-190,1.245041e-187
177,KEGG_2021_Human,gpt-3.5-turbo-0125,Neuroactive ligand-receptor interaction,341,170,123,0.317010,1.508152e-185,4.826088e-183
3863,Reactome_2022,gpt-4o-2024-05-13,SLC-mediated Transmembrane Transport,247,165,112,0.373333,8.873583e-183,1.613217e-179
515,KEGG_2021_Human,gpt-4o-2024-05-13,Oxidative phosphorylation,133,90,81,0.570423,1.999707e-178,6.399062e-176
...,...,...,...,...,...,...,...,...,...
19832,GO_Cellular_Component_2023,gpt-4o-2024-05-13,Platelet Dense Tubular Network Membrane,8,17,2,0.086957,1.929475e-05,9.145711e-03
443,KEGG_2021_Human,gpt-4o-2024-05-13,Hepatitis B,162,20,4,0.022472,1.932565e-05,6.184207e-03
197,KEGG_2021_Human,gpt-3.5-turbo-0125,PD-L1 expression and PD-1 checkpoint pathway i...,89,2,2,0.022472,2.006562e-05,6.421000e-03
19030,GO_Cellular_Component_2023,gpt-3.5-turbo-0125,H4/H2A Histone Acetyltransferase Complex,29,5,2,0.062500,2.062140e-05,9.774543e-03


In [37]:
df.to_csv("genes_overlap.tsv",sep="\t",index=None)