In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import pandas as pd
import random
from itertools import combinations
from rouge_score import rouge_scorer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from statsmodels.stats.multitest import multipletests

# Synchronous client: handed to llm2geneset.get_embeddings in the evaluation cell.
client = openai.Client()
# Asynchronous client: handed to the awaited llm2geneset.gsai / gs_proposal calls.
aclient = openai.AsyncClient()

In [2]:
# Shared ROUGE scorer (ROUGE-1, ROUGE-2 and ROUGE-L, stemming enabled) used to
# compare generated gene set names against the ground-truth descriptions.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

In [3]:
def sample_pairs(integer_list, num_pairs):
    """Randomly pick distinct unordered pairs of items from ``integer_list``.

    Every 2-element combination of ``integer_list`` is enumerated first, and
    then at most ``num_pairs`` of them are drawn without replacement using the
    module-level ``random`` generator (so results depend on ``random.seed``).

    Args:
        integer_list: Iterable of items to pair up (e.g. a ``range`` of indices).
        num_pairs: Upper bound on the number of pairs to return.

    Returns:
        A list of 2-tuples. It holds fewer than ``num_pairs`` entries when
        fewer combinations exist.
    """
    every_pair = list(combinations(integer_list, 2))
    # Never ask random.sample for more items than are available.
    n_draw = min(num_pairs, len(every_pair))
    return random.sample(every_pair, n_draw)

In [5]:
# Load the previously saved per-gene-set results of both methods
# (tab-separated files in the working directory).
gsai_out, llm2geneset_out = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in ("gsai_outputs.tsv", "llm2geneset_outputs.tsv")
)

In [6]:
models = ["gpt-3.5-turbo-0125","gpt-4o-2024-05-13"]
lib_names = ["KEGG_2021_Human",
             "Reactome_2022", 
             "WikiPathway_2023_Human"]
models = ["gpt-3.5-turbo-0125"]
lib_names = ["WikiPathway_2023_Human"]
random.seed(30)
output = []
for model in models:    
    for lib_name in lib_names:
        print(lib_name)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        # Limit to gene sets easily identified alone.
        gsai_cur = gsai_out[(gsai_out["model"] == model) & (gsai_out["library"] == lib_name)]
        gsai_cur = gsai_cur[gsai_cur["gsai_ROUGE1"] >= 1]
        llm2geneset_cur = llm2geneset_out[(llm2geneset_out["model"] == model) & (llm2geneset_out["library"] == lib_name)]
        llm2geneset_cur = llm2geneset_cur[llm2geneset_cur["llm2geneset_ROUGE1"] >= 1]
        llm2geneset_gt = set(llm2geneset_cur["gt_name"].to_list())
        gsai_gt = set(gsai_cur["gt_name"].to_list())
        selected = gsai_gt.intersection(llm2geneset_gt)

        # Extract selected gene sets.
        sel_idx = [i for i, descr in enumerate(gen_res["descr_cleaned"]) if descr in selected]
        descr_sel = [ gen_res["descr_cleaned"][i] for i in sel_idx ]
        curated_sel = [ gen_res["curated_genesets"][i] for i in sel_idx ]

        # Construct pairs.
        pairs = sample_pairs(range(len(descr_sel)), 25)
        test_descr = []
        test_genes = []
        for p in pairs:
            merged_sel = descr_sel[p[0]] + ", " + descr_sel[p[1]]
            merged_genes = curated_sel[p[0]] + curated_sel[p[1]]
            random.shuffle(merged_genes)
            test_descr.append(merged_sel)
            test_genes.append(merged_genes)

        # Embed ground truth descriptions.
        gt_emb = llm2geneset.get_embeddings(client, test_descr)
        
        # use GSAI to generate geneset name and embed
        gsai_res = await llm2geneset.gsai(aclient, test_genes, model=model, n_retry=3)
        gsai_name = [i['name'] for i in gsai_res]
        gsai_name_emb = llm2geneset.get_embeddings(client, gsai_name)

        # Evaluate GSAI results.
        for idx in range(len(test_descr)):
            scores = scorer.score(test_descr[idx], gsai_name[idx])
            gsai_rouge1= scores['rouge1'].recall
            gsai_rouge2= scores['rouge2'].recall
            gsai_rougeL= scores['rougeL'].recall      
            gsai_csim = np.dot(gt_emb[idx],gsai_name_emb[idx])
        
            x={"model":model,
               "library":lib_name,
               "gt_name":test_descr[idx],
               "name":gsai_name[idx],
               "ROUGE1":gsai_rouge1,
               "ROUGE2":gsai_rouge2,   
               "ROUGEL":gsai_rougeL,                  
               "csim":gsai_csim,
               "method":"GSAI"}    
            
            output.append(x)

        # use LLM2geneset to generate geneset name and embed
        llm2geneset_res = await llm2geneset.gs_proposal(aclient, test_genes, model=model, n_retry=3)
        llm2geneset_name = []
        for idx in range(len(test_descr)):
            names = [gene_set[0] for gene_set in llm2geneset_res[idx]]
            pvals=[gene_set[1] for gene_set in llm2geneset_res[idx]]
            _, pvals_corrected, _, _ = multipletests(pvals, method='bonferroni')
            cur_name = ", ".join([n for n, p in zip(names, list(pvals_corrected)) if p < 0.01])
            llm2geneset_name.append(cur_name)

        llm2geneset_name_emb = llm2geneset.get_embeddings(client, llm2geneset_name)
        
        for idx in range(len(test_descr)):
            # evaluate llm2geneset proposed gene set name
            scores = scorer.score(test_descr[idx], llm2geneset_name[idx])
            llm2geneset_rouge1= scores['rouge1'].recall
            llm2geneset_rouge2= scores['rouge2'].recall
            llm2geneset_rougeL= scores['rougeL'].recall      
            llm2geneset_csim = np.dot(gt_emb[idx],llm2geneset_name_emb[idx])

            x={"model":model,
               "library":lib_name,
               "gt_name":test_descr[idx],
               "name":llm2geneset_name[idx],
               "ROUGE1":llm2geneset_rouge1,
               "ROUGE2":llm2geneset_rouge2,   
               "ROUGEL":llm2geneset_rougeL,                  
               "csim":llm2geneset_csim,
               "method":"LLM2geneset"}    
        
            output.append(x)

WikiPathway_2023_Human


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:04<00:00,  5.89it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:08<00:00,  3.01it/s]


In [9]:
# Gather the per-pair result dicts accumulated in `output` into a single table.
df = pd.DataFrame(data=output)

In [10]:
# Persist the evaluation table as a tab-separated file, without the index column.
df.to_csv(
    path_or_buf='mix_genesets_outputs.tsv',
    sep="\t",
    index=False,
)