In [5]:
import pandas as pd
import json
from scipy.stats import hypergeom
from statsmodels.stats.multitest import multipletests

In [6]:
lib_names = ["KEGG_2021_Human", 
             "Reactome_2022", "WikiPathway_2023_Human"]
#lib_names = ["Reactome_2022"]
#lib_names = ["KEGG_2019"]
#lib_names = ["GO_Biological_Process_2018"]

In [9]:
# Calculate Jaccard coefficient (intersection over union)
def calculate_iou(setA, setB):
    # Calculate the intersection of two sets
    intersection = setA.intersection(setB)
    # Calculate the union of two sets
    union = setA.union(setB)
    # Calculate the Intersection over Union
    iou = len(intersection) / len(union)
    return iou

database_res = []

models = ["gpt-3.5-turbo-0125"]
models = ["gpt-4o-2024-05-13"]

for lib_name in lib_names:
    with open("libs_human/human/" + lib_name + ".json" ) as f:
        lib_human = json.load(f)
        
    for model in models:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            lib_ai = json.load(f)

        iou_output = []
        for (descr, geneset) in lib_human.items():
            intersection = set(geneset).intersection(set(lib_ai[descr]))
            p_val = hypergeom.sf(len(intersection)-1,
                                 19411-len(geneset), 
                                 len(geneset), 
                                 len(lib_ai))
            x = {
                'database': lib_name,
                'descr': descr,
                'ngene': len(geneset),
                'nai': len(lib_ai[descr]),
                'ninter': len(intersection),
                'iou': calculate_iou(set(geneset), set(lib_ai[descr])),
                'p_val': p_val
            }
            iou_output.append(x)
    
        df = pd.DataFrame(iou_output)
        # Adjust p-values using the Benjamini-Hochberg (FDR) method
        _, pvals_corrected, _, _ = multipletests(df['p_val'], alpha=0.05, method='fdr_bh')
        df['p_val_adj'] = pvals_corrected
        print(df[df['p_val_adj'] < 0.05].shape[0] / df.shape[0])
        database_res.append(df)    
        

0.928125
0.5027502750275028
0.8080301129234629


In [29]:
df = pd.concat(database_res, ignore_index=True)

In [13]:
df[(df['p_val_adj'] > 0.05) & (df["p_val"] < 0.05)]

Unnamed: 0,database,descr,ngene,nai,ninter,iou,p_val,p_val_adj
235,WikiPathway_2023_Human,Hematopoietic Stem Cell Gene Regulation By GAB...,20,14,3,0.096774,0.046838,0.057255
275,WikiPathway_2023_Human,Mitochondrial Gene Expression,19,59,3,0.04,0.041024,0.050534
342,WikiPathway_2023_Human,Hereditary Leiomyomatosis And Renal Cell Carci...,20,15,3,0.09375,0.046838,0.057255
375,WikiPathway_2023_Human,Extracellular Vesicles In The Crosstalk Of Car...,19,31,3,0.06383,0.041024,0.050534
434,WikiPathway_2023_Human,FBXL10 Enhancement Of MAP ERK Signaling In Dif...,33,14,4,0.093023,0.045167,0.055381
511,WikiPathway_2023_Human,Regucalcin In Proximal Tubule Epithelial Kidne...,32,15,4,0.093023,0.04097,0.050534
525,WikiPathway_2023_Human,CAMKK2 Pathway,33,11,4,0.1,0.045167,0.055381
731,WikiPathway_2023_Human,Calcium Regulation In Cardiac Cells,151,23,11,0.067485,0.049263,0.060127
782,WikiPathway_2023_Human,Metapathway Biotransformation Phase I And II,184,16,13,0.069519,0.043096,0.053006


In [37]:
df.to_csv("genes_overlap.tsv",sep="\t",index=None)