In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import time
import pandas as pd
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.stats import hypergeom
from asynciolimiter import StrictLimiter
import asyncio
import tqdm.asyncio
# Asynchronous OpenAI client: passed to llm2geneset.gs_proposal, which is awaited.
aclient = openai.AsyncClient()
# Synchronous OpenAI client: passed to llm2geneset.get_embeddings.
client = openai.Client()

In [2]:
# ROUGE scorer (ROUGE-1, ROUGE-2, ROUGE-L) with stemming enabled; used below to
# compare each proposed gene-set name against its reference description.
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL'], use_stemmer=True)

In [3]:
# Model names to evaluate; each one selects a subdirectory of libs_human/.
# Currently disabled alternative: "gpt-4o-2024-05-13".
models = ["gpt-3.5-turbo-0125"]
# Gene-set library names to evaluate; each one selects <lib_name>.json inside
# the model's subdirectory.
# Currently disabled alternatives: "Reactome_2022", "WikiPathway_2023_Human".
lib_names = ["KEGG_2021_Human"]

In [4]:
output = []
for model in models:    
    for lib_name in lib_names:
        print(lib_name)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        test_sets = gen_res["curated_genesets"]
        test_descr = gen_res["descr_cleaned"]
        gt_embs = llm2geneset.get_embeddings(client, test_descr)

        # use llm2geneset to generate geneset name
        res = await llm2geneset.gs_proposal(aclient, test_sets, model=model)
        most_overrepresented = [min(gene_set, key=lambda x: x[1]) for gene_set in res]
        names = [i[0] for i in most_overrepresented]
        name_embs = llm2geneset.get_embeddings(client, names)
        
        for i, ref in enumerate(test_descr):
            gt_emb = [gt_embs[i]]
            name = names[i]
            name_emb = [name_embs[i]]
            scores = scorer.score(ref, name)
            rouge1= scores['rouge1'].recall
            rouge2= scores['rouge2'].recall
            rougeL= scores['rougeL'].fmeasure      
            csim = cosine_similarity(gt_emb,name_emb).squeeze()
                
            x={"model":model,
               "library":lib_name,
               "gt_name":ref,
               "llm2geneset_name":name,
               "llm2geneset_ROUGE1":rouge1,
               "llm2geneset_ROUGE2":rouge2,   
               "llm2geneset_ROUGEL":rougeL,                  
               "llm2geneset_csim":csim}    
            output.append(x)

KEGG_2021_Human


  0%|                                                                                                                                    | 0/320 [00:00<?, ?it/s]

retrying
string indices must be integers, not 'str'
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "p": {
                "type": "string"
            }
        },
        "required": ["p"]
    },
    "contents": [
        {"p": "Tryptophan metabolism"},
        {"p": "Serotonin biosynthesis"},
        {"p": "Kynurenine pathway"},
        {"p": "Histamine metabolism"},
        {"p": "Fatty acid beta-oxidation"}
    ]
}
```


  1%|█▏                                                                                                                          | 3/320 [00:05<06:16,  1.19s/it]

retrying
string indices must be integers, not 'str'

```json
{
    "type": "array",
    "items": [
        {"p": "Tryptophan metabolism"},
        {"p": "Dopamine biosynthesis"},
        {"p": "Fatty acid beta-oxidation"},
        {"p": "GABAergic synapse"},
        {"p": "Gamma-aminobutyric acid catabolic process"}
    ]
}
```


  1%|█▌                                                                                                                          | 4/320 [00:05<05:07,  1.03it/s]

retrying
string indices must be integers, not 'str'
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "p": {
                "type": "string",
            }
        },
        "required": ["p"]
    },
    "pathways": [
        {"p": "PRKCG-IL10 signaling pathway"},
        {"p": "Hemoglobin complex formation"},
        {"p": "B-cell receptor signaling pathway"},
        {"p": "Inflammatory response"},
        {"p": "Ras signaling pathway"}
    ]
}
```


 35%|███████████████████████████████████████████                                                                               | 113/320 [00:08<00:05, 36.39it/s]

retrying
string indices must be integers, not 'str'
```json
{
    "type": "array",
    "items": [
        {"p": "Inflammatory Response Pathway"},
        {"p": "Hemoglobin Binding"},
        {"p": "Immune Response"},
        {"p": "Apoptosis Signaling Pathway"},
        {"p": "Interleukin-6 Signaling"}
    ]
}
```


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [00:57<00:00,  5.52it/s]


In [8]:
# Collect the per-gene-set score records into one table and save it as a
# tab-separated file without the row index.
df = pd.DataFrame(output)
df.to_csv("llm2geneset_outputs.tsv", sep="\t", index=False)
# Bare last expression so the notebook renders the table.
df

Unnamed: 0,model,library,gt_name,llm2geneset_name,llm2geneset_ROUGE1,llm2geneset_ROUGE2,llm2geneset_ROUGEL,llm2geneset_csim
0,gpt-3.5-turbo-0125,KEGG_2021_Human,ABC transporters,Multidrug resistance,0.000000,0.000000,0.000000,0.46985455259618913
1,gpt-3.5-turbo-0125,KEGG_2021_Human,AGE-RAGE signaling pathway in diabetic complic...,TGF-beta signaling pathway,0.285714,0.166667,0.363636,0.48049300956694874
2,gpt-3.5-turbo-0125,KEGG_2021_Human,AMPK signaling pathway,AMPK signaling pathway,1.000000,1.000000,1.000000,0.9999988923513651
3,gpt-3.5-turbo-0125,KEGG_2021_Human,Acute myeloid leukemia,Response to interleukin-3,0.000000,0.000000,0.000000,0.39871473936724944
4,gpt-3.5-turbo-0125,KEGG_2021_Human,Adherens junction,Transforming growth factor beta receptor signa...,0.000000,0.000000,0.000000,0.2554367645226975
...,...,...,...,...,...,...,...,...
315,gpt-3.5-turbo-0125,KEGG_2021_Human,cAMP signaling pathway,Calcium signaling pathway,0.666667,0.500000,0.666667,0.5584681398983501
316,gpt-3.5-turbo-0125,KEGG_2021_Human,cGMP-PKG signaling pathway,"ATPase activity, coupled to transmembrane move...",0.000000,0.000000,0.000000,0.3340094289994413
317,gpt-3.5-turbo-0125,KEGG_2021_Human,mRNA surveillance pathway,mRNA export from nucleus,0.333333,0.000000,0.285714,0.5652350086655873
318,gpt-3.5-turbo-0125,KEGG_2021_Human,mTOR signaling pathway,mTOR signaling pathway,1.000000,1.000000,1.000000,0.9999989873215895
