In [1]:
import pandas as pd
import json
from scipy.stats import hypergeom
from statsmodels.stats.multitest import multipletests
import llm2geneset

In [2]:
lib_names = ["KEGG_2021_Human", 
             "Reactome_2022", 
             "WikiPathway_2023_Human", 
             "GO_Biological_Process_2023",
             "GO_Molecular_Function_2023",
             "GO_Cellular_Component_2023"]

In [10]:
hgcn_symbols = pd.read_csv("hgnc_symbols.txt", sep="\t", header=None)[0].tolist()
hgcn_symbols = set(hgcn_symbols)

In [11]:
# Calculate Jaccard coefficient (intersection over union)
def calculate_iou(setA, setB):
    # Calculate the intersection of two sets
    intersection = setA.intersection(setB)
    # Calculate the union of two sets
    union = setA.union(setB)
    # Calculate the Intersection over Union
    iou = len(intersection) / len(union)
    return iou

database_res = []

models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]

for lib_name in lib_names:    
    print(lib_name)
    for model in models:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
            
        
        iou_output = []
        for idx in range(len(gen_res["descr"])):
            curated_genes = gen_res["curated_genesets"][idx]
            parsed_llm_genes = gen_res["llm_genesets"][idx]['parsed_genes']
            llm_genes = list(set(parsed_llm_genes)) # make sure unique genes are selected
            intersection = set(llm_genes).intersection(set(curated_genes))
            p_val = hypergeom.sf(len(intersection)-1,
                                 19846-len(curated_genes), 
                                 len(curated_genes), 
                                 len(llm_genes))
            generatio = None
            if len(llm_genes) > 0:
                generatio = float(len(intersection))/len(llm_genes)
            
            x = {
                'database': lib_name,
                'model' : model,
                'descr': gen_res["descr_cleaned"][idx],
                'ncurated': len(curated_genes),
                'nllm': len(llm_genes),
                'ninter': len(intersection),
                'ndup': len(parsed_llm_genes) - len(llm_genes),
                'generatio': generatio,
                'bgratio': float(len(curated_genes))/19846,
                'p_val': p_val
            }
            iou_output.append(x)
    
        df = pd.DataFrame(iou_output)
        # Adjust p-values using the Benjamini-Hochberg (FDR) method
        _, pvals_corrected, _, _ = multipletests(df['p_val'], method='bonferroni')
        df['p_val_adj'] = pvals_corrected
        print(df[df['p_val_adj'] < 0.01].shape[0] / df.shape[0])
        database_res.append(df)    
        

KEGG_2021_Human
0.89375
0.93125
Reactome_2022
0.6523652365236524
0.8124312431243125
WikiPathway_2023_Human
0.714107365792759
0.8426966292134831
GO_Biological_Process_2023
0.2942481967819493
0.41538746069909377
GO_Molecular_Function_2023
0.5135135135135135
0.6782911944202267
GO_Cellular_Component_2023
0.5759493670886076
0.7362869198312236


In [12]:
df = pd.concat(database_res, ignore_index=True)

In [13]:
df.to_csv("genes_overlap.tsv",sep="\t",index=None)