In [62]:
import openai
from pathlib import Path
import json
import llm2geneset
import time
import pandas as pd
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from statsmodels.stats.multitest import multipletests
# Synchronous client: passed to llm2geneset.get_embeddings below.
client = openai.Client()
# Asynchronous client: passed to the awaited llm2geneset.gsai / gs_proposal calls.
aclient = openai.AsyncClient()

In [63]:
# Shared ROUGE scorer (unigram, bigram and longest-common-subsequence
# variants) with stemming enabled; reused for every name comparison.
scorer = rouge_scorer.RougeScorer(
    rouge_types=['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

In [64]:
# Per-gene-set naming results from earlier runs, one tab-separated table per
# method. The main loop uses them to pick gene sets with ROUGE-1 > 0.
# NOTE(review): these are absolute, machine-specific paths; consider a
# configurable data directory so the notebook runs elsewhere.
gsai_outputs = pd.read_csv(
    filepath_or_buffer="/home/jzhu/llm2geneset/notebooks/gsai_outputs.tsv",
    sep="\t",
)
llm2geneset_outputs = pd.read_csv(
    filepath_or_buffer="/home/jzhu/llm2geneset/notebooks/llm2geneset_outputs.tsv",
    sep="\t",
)

In [65]:
# Preview the first two rows to check the column layout of the GSAI table.
gsai_outputs.head(n=2)

Unnamed: 0,model,library,gt_name,gsai_name,gsai_ROUGE1,gsai_ROUGE2,gsai_ROUGEL,gsai_csim
0,gpt-3.5-turbo-0125,KEGG_2021_Human,ABC transporters,Transmembrane transport and cellular detoxific...,0.5,0.0,0.285714,0.537227
1,gpt-3.5-turbo-0125,KEGG_2021_Human,AGE-RAGE signaling pathway in diabetic complic...,Inflammatory signaling and immune response reg...,0.142857,0.0,0.153846,0.370827


In [204]:
models = ["gpt-3.5-turbo-0125","gpt-4o-2024-05-13"]
lib_names = ["KEGG_2021_Human",
             "Reactome_2022", 
             "WikiPathway_2023_Human"]
models = ["gpt-3.5-turbo-0125"]
lib_names = ["KEGG_2021_Human"]

ouput = []
for model in models:    
    for lib_name in lib_names:
        print(lib_name)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        test_sets = gen_res["curated_genesets"]
        test_descr = gen_res["descr_cleaned"]
        # choose the gene sets that GSAI ROUGE1>0
        gsai_sets = gsai_outputs[(gsai_outputs.gsai_ROUGE1>0) & (gsai_outputs.library==lib_name)].gt_name.values

        # choose the gene sets that LLM2geneset ROUGE1>0
        llm2geneset_sets = llm2geneset_outputs[(llm2geneset_outputs.llm2geneset_ROUGE1>0) & (llm2geneset_outputs.library==lib_name)].gt_name.values

        # pick the gene sets that both GSAI and LLM2geneset has ROUGE1>1
        intersect_set = set(llm2geneset_sets).intersection(set(gsai_sets))

        # select the intersect test sets 
        set_indices = np.where(np.isin(test_descr, list(intersect_set)))[0]

        # choose 2 random gene sets for 50 times
        for i in range(50):
            # draw 2 random gene sets
            random_indices = np.random.randint(0, len(set_indices), size=2)
            selected_test_descr=[test_descr[set_indices[i]] for i in random_indices]
            selected_test_sets=[test_sets[set_indices[i]] for i in random_indices]
            # combine the genes in the gene set
            selected_test_sets = list(set(selected_test_sets[0]).union(set(selected_test_sets[1])))
            # shuffle the genese within the gene set
            np.random.shuffle(selected_test_sets)
            # get the embedding of the combined gene set name
            gt_name = [', '.join(selected_test_descr)]
            gt_emb = llm2geneset.get_embeddings(client, gt_name)

            # use GSAI to generate geneset name
            gsai_res = await llm2geneset.gsai(aclient, [selected_test_sets], model=model, n_retry=3)
            gsai_name = [i['name'] for i in gsai_res]
            gsai_name_emb = llm2geneset.get_embeddings(client, gsai_name)
            # evaluate gsai proposed gene set name
            scores = scorer.score(gt_name[0], gsai_name[0])
            gsai_rouge1= scores['rouge1'].recall
            gsai_rouge2= scores['rouge2'].recall
            gsai_rougeL= scores['rougeL'].fmeasure      
            gsai_csim = cosine_similarity(gt_emb,gsai_name_emb).squeeze()
            
            x={"model":model,
               "library":lib_name,
               "gt_name":gt_name[0],
               "name":gsai_name[0],
               "ROUGE1":gsai_rouge1,
               "ROUGE2":gsai_rouge2,   
               "ROUGEL":gsai_rougeL,                  
               "csim":gsai_csim,
               "method":"GSAI"}    
        
            ouput.append(x)

            # use LLM2geneset to generate geneset name
            llm2geneset_res = await llm2geneset.gs_proposal(aclient, [selected_test_sets], model=model, n_retry=3)
            # llm2geneset proposed gene set names
            names = [gene_set[0] for gene_set in llm2geneset_res[0]]
            # hypergeometric p-vals for proposed gene sets 
            pvals=[gene_set[1] for gene_set in llm2geneset_res[0]]
            # correct the pvals
            _, pvals_corrected, _, _ = multipletests(pvals, method='bonferroni')
            # select gene sets with adj pvals<0.01
            indices = np.where(pvals_corrected<0.01)
            llm2geneset_name = [', '.join(np.array(names)[indices])]
            llm2geneset_name_emb = llm2geneset.get_embeddings(client, llm2geneset_name)
            # evaluate llm2geneset proposed gene set name
            scores = scorer.score(gt_name[0], llm2geneset_name[0])
            llm2geneset_rouge1= scores['rouge1'].recall
            llm2geneset_rouge2= scores['rouge2'].recall
            llm2geneset_rougeL= scores['rougeL'].fmeasure      
            llm2geneset_csim = cosine_similarity(gt_emb,llm2geneset_name_emb).squeeze()

            x={"model":model,
               "library":lib_name,
               "gt_name":gt_name[0],
               "name":llm2geneset_name[0],
               "ROUGE1":llm2geneset_rouge1,
               "ROUGE2":llm2geneset_rouge2,   
               "ROUGEL":llm2geneset_rougeL,                  
               "csim":llm2geneset_csim,
               "method":"LLM2geneset"}    
        
            ouput.append(x)



KEGG_2021_Human


100%|██████████| 1/1 [00:01<00:00,  1.46s/it]
100%|██████████| 1/1 [00:05<00:00,  5.84s/it]
100%|██████████| 1/1 [00:01<00:00,  1.57s/it]
100%|██████████| 1/1 [00:07<00:00,  7.44s/it]
100%|██████████| 1/1 [00:02<00:00,  2.56s/it]
100%|██████████| 1/1 [00:04<00:00,  4.90s/it]
100%|██████████| 1/1 [00:02<00:00,  2.87s/it]
100%|██████████| 1/1 [00:03<00:00,  3.80s/it]
100%|██████████| 1/1 [00:01<00:00,  1.40s/it]
100%|██████████| 1/1 [00:14<00:00, 14.98s/it]
100%|██████████| 1/1 [00:02<00:00,  2.32s/it]
100%|██████████| 1/1 [00:04<00:00,  4.21s/it]
100%|██████████| 1/1 [00:02<00:00,  2.31s/it]
100%|██████████| 1/1 [00:03<00:00,  3.13s/it]
100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
100%|██████████| 1/1 [00:04<00:00,  4.08s/it]
100%|██████████| 1/1 [00:02<00:00,  2.93s/it]
100%|██████████| 1/1 [00:03<00:00,  3.70s/it]
100%|██████████| 1/1 [00:01<00:00,  1.99s/it]
100%|██████████| 1/1 [00:05<00:00,  5.57s/it]
100%|██████████| 1/1 [00:02<00:00,  2.01s/it]
100%|██████████| 1/1 [00:05<00:00,

In [206]:
# Collect the per-mixture result records (a list of dicts, one per mixture and
# method) into a single table.
df = pd.DataFrame(data=ouput)

In [208]:
# Save the results as a tab-separated file in the working directory; the row
# index is not written.
df.to_csv(
    path_or_buf='mix_genesets_outputs.tsv',
    sep="\t",
    index=False,
)