In [3]:
import llm2geneset
import openai
import pandas as pd
from scipy.stats import hypergeom
from statsmodels.stats.multitest import multipletests
import re
import numpy as np
# Shared OpenAI client; load() hands it to llm2geneset.get_embeddings.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment — confirm before running on a fresh machine.
client = openai.Client()

In [4]:
def load(gmt_file):
    """Read a GMT gene-set library and embed its identifier-free descriptions.

    Args:
        gmt_file: Path passed straight to ``llm2geneset.read_gmt``.

    Returns:
        A ``(gmt, gmtemb)`` tuple: the parsed library with an added
        ``"descr_cleaned"`` entry, and whatever
        ``llm2geneset.get_embeddings`` returns for those cleaned descriptions
        (requested through the module-level ``client``).
    """
    # Matches "(GO:<digits>)", "R-HSA-<digits>" and "WP<digits>" tokens together
    # with any whitespace on either side of them.
    id_pattern = re.compile(r'\s*\(GO:\d+\)\s*|\s*R-HSA-\d+\s*|\s*WP\d+\s*')

    library = llm2geneset.read_gmt(gmt_file)
    # Strip the database identifiers so only the descriptive text is embedded.
    library["descr_cleaned"] = [id_pattern.sub('', label) for label in library["descr"]]
    embeddings = llm2geneset.get_embeddings(client, library["descr_cleaned"])
    return library, embeddings

In [57]:
# Load each gene-set library together with embeddings of its cleaned
# descriptions (every load() call issues its own get_embeddings request).
# NOTE(review): `wiki` is read from the Reactome file — the same path as `react`
# below — so every downstream "wiki" value is computed on Reactome gene sets
# (the printed pairs carry R-HSA ids). Confirm whether a WikiPathways GMT was
# intended here; if Reactome is intentional, the second load is redundant.
wiki, wikiemb = load("libs_human/gmt/Reactome_2022.txt")
# `kegg` and `react` are not referenced by the cells visible in this notebook.
kegg, keggemb = load("libs_human/gmt/KEGG_2021_Human.txt")
react, reactemb = load("libs_human/gmt/Reactome_2022.txt")
go, goemb = load("libs_human/gmt/GO_Biological_Process_2023.txt")

In [63]:

# Stack the per-description embeddings into 2-D arrays, one row per gene set
# (assumes every embedding is a 1-D vector of the same length — TODO confirm).
matrix1 = np.vstack(wikiemb)
matrix2 = np.vstack(goemb)

# A[i, j] is the dot product between row i of matrix1 and row j of matrix2.
# NOTE(review): this equals cosine similarity only if the embeddings are
# unit-normalised — confirm for the embedding model in use.
A = matrix1 @ matrix2.T

# For every row, find its best-scoring column and the score itself.
max_cols = A.argmax(axis=1)
max_values = A.max(axis=1)

# Keep only rows whose best score exceeds 0.7.
mask = max_values > 0.7
f_rows = np.flatnonzero(mask).tolist()  # retained row indices
f_cols = max_cols[mask].tolist()        # best-matching column for each retained row
f_vals = max_values[mask]               # the corresponding scores



In [64]:
# Print every retained pairing as "<wiki description>\t<GO description>",
# followed by a blank line, then the total number of pairs.
# This cell is display-only: it deliberately does not touch `iou_output`, so
# re-running it cannot wipe results collected by the overlap-statistics cell.
for row, col in zip(f_rows, f_cols):
    print(wiki["descr"][row] + "\t" + go["descr"][col])
    print()
print(len(f_cols))

ATF6 (ATF6-alpha) Activates Chaperone Genes R-HSA-381183	ATF6-mediated Unfolded Protein Response (GO:0036500)

ATF6 (ATF6-alpha) Activates Chaperones R-HSA-381033	ATF6-mediated Unfolded Protein Response (GO:0036500)

AUF1 (hnRNP D0) Binds And Destabilizes mRNA R-HSA-450408	mRNA Destabilization (GO:0061157)

Acetylcholine Regulates Insulin Secretion R-HSA-399997	Regulation Of Insulin Secretion (GO:0050796)

Activation Of NF-kappaB In B Cells R-HSA-1169091	Activation Of NF-kappaB-inducing Kinase Activity (GO:0007250)

Activation Of RAC1 R-HSA-428540	Regulation Of Rac Protein Signal Transduction (GO:0035020)

Activation Of TRKA Receptors R-HSA-187015	Neurotrophin TRK Receptor Signaling Pathway (GO:0048011)

Acyl Chain Remodeling Of CL R-HSA-1482798	Cardiolipin Acyl-Chain Remodeling (GO:0035965)

Acyl Chain Remodeling Of DAG And TAG R-HSA-1482883	Phosphatidylinositol Acyl-Chain Remodeling (GO:0036149)

Acyl Chain Remodelling Of PC R-HSA-1482788	Phosphatidylcholine Acyl-Chain Remodeling (GO

In [65]:

# Size of the gene universe: the hypergeometric population and the denominator
# of the background ratio.
# NOTE(review): 19846 is presumably a protein-coding gene count — confirm source.
n_background = 19846

# One record per matched (wiki row, GO column) pair.
iou_output = []
for row, col in zip(f_rows, f_cols):
    # De-duplicate both gene lists up front so the test, the ratios and the
    # reported counts all refer to the same unique-gene set sizes.
    curated_genes = set(go["genes"][col])
    query_genes = set(wiki["genes"][row])
    intersection = query_genes & curated_genes

    # One-sided over-representation test, P(X >= |intersection|): sf(k - 1)
    # with population n_background, len(curated_genes) "success" genes and
    # len(query_genes) draws.
    p_val = hypergeom.sf(len(intersection) - 1,
                         n_background,
                         len(curated_genes),
                         len(query_genes))

    # generatio == recall: share of the curated set recovered by the query set.
    # Left as None when the curated set is empty instead of dividing by zero.
    generatio = len(intersection) / len(curated_genes) if curated_genes else None
    bgratio = len(query_genes) / n_background

    rich_factor = None
    fold_enrich = None
    if query_genes:
        # richFactor == precision: share of the query set found in the curated set.
        rich_factor = len(intersection) / len(query_genes)
        if generatio is not None:
            fold_enrich = generatio / bgratio

    # Key names (including 'nllm' for the query-set size) are kept unchanged
    # because later cells read these columns by name.
    iou_output.append({
        "wiki": wiki["descr_cleaned"][row],
        "go": go["descr_cleaned"][col],
        'ncurated': len(curated_genes),
        'nllm': len(query_genes),
        'ninter': len(intersection),
        'generatio': generatio,
        'bgratio': bgratio,
        'richFactor': rich_factor,
        'foldEnrich': fold_enrich,
        'p_val': p_val
    })

In [66]:
# Collect the per-pair records into a table and attach Bonferroni-adjusted
# p-values (the second element of the multipletests result).
df = pd.DataFrame(iou_output)
pvals_corrected = multipletests(df['p_val'], method='bonferroni')[1]
df['p_val_adj'] = pvals_corrected
print(df.shape)

# Fraction of pairs whose adjusted p-value is below 0.01.
significant = df['p_val_adj'] < 0.01
print(len(df[significant]) / len(df))

(635, 11)
0.7133858267716535


In [77]:

# Restrict to pairs with adjusted p-value below 0.01, then show the mean
# richFactor (intersection size divided by the size of the `wiki` gene set).
filtered_df = df.loc[df['p_val_adj'] < 0.01]
filtered_df['richFactor'].mean()

0.31028956179170936