In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import time
import pandas as pd
import numpy as np
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
# Asynchronous OpenAI client; passed to the awaited llm2geneset calls below.
aclient = openai.AsyncClient()
# Synchronous OpenAI client; passed to llm2geneset.get_embeddings below.
client = openai.Client()

In [2]:
# Shared ROUGE scorer (ROUGE-1, ROUGE-2, ROUGE-L) with stemming enabled;
# reused for every reference/candidate name comparison.
scorer = rouge_scorer.RougeScorer(
    rouge_types=["rouge1", "rouge2", "rougeL"],
    use_stemmer=True,
)

In [3]:
models = ["gpt-3.5-turbo-0125","gpt-4o-2024-05-13"]
lib_names = ["KEGG_2021_Human",
             "Reactome_2022", 
             "WikiPathway_2023_Human"]
lib_names = ["KEGG_2021_Human"]
models = ["gpt-3.5-turbo-0125"]

output = []
for model in models:    
    for lib_name in lib_names:
        print(lib_name)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
        
        test_sets = gen_res["curated_genesets"]
        test_descr = gen_res["descr"]
        test_descr_cleaned = gen_res["descr_cleaned"]
        
        gt_embs = llm2geneset.get_embeddings(client, test_descr_cleaned)

        for method in ["GSAI", "llm2geneset"]:
            if method == "GSAI":
                # use GSAI to generate geneset name
                gsai_res = await llm2geneset.gsai(aclient, test_sets, model=model, n_retry=3)
                names = [i['name'] for i in gsai_res]
            elif method == "llm2geneset":
                res = await llm2geneset.gs_proposal(aclient, test_sets, model=model)
                most_overrepresented = [min(gene_set, key=lambda x: x[1]) for gene_set in res]
                names = [i[0] for i in most_overrepresented]
    
            name_embs = llm2geneset.get_embeddings(client, names)
            
            for i, ref in enumerate(test_descr_cleaned):
                scores = scorer.score(ref, names[i])
                rouge1 = scores['rouge1'].recall
                rouge2 = scores['rouge2'].recall
                rougeL = scores['rougeL'].recall      
                csim = np.dot(gt_embs[i], name_embs[i])
                    
                x={"model":model,
                   "library":lib_name,
                   "gt_name":test_descr[i],
                   "gt_name_clean":ref,
                   "name":names[i],
                   "ROUGE1":rouge1,
                   "ROUGE2":rouge2,   
                   "ROUGEL":rougeL,                  
                   "csim":csim,
                   "method": method}    
            
                output.append(x)

KEGG_2021_Human


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [00:10<00:00, 31.59it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [00:31<00:00, 10.03it/s]


In [6]:
# Collect the per-gene-set evaluation records into a table and write it out as
# a tab-separated file without the index column.
# Fix: the original referenced the misspelled name `ouput`, which raises
# NameError; the records are accumulated in `output`.
df = pd.DataFrame(output)
df.to_csv("gsai_vs_llm2geneset_outputs.tsv", sep="\t", index=False)