In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import time
import pandas as pd

In [2]:
import re

def clean_elements(array):
    """Strip database identifiers from gene set descriptions.

    Removes every ``(GO:<digits>)``, ``R-HSA-<digits>`` and ``WP<digits>``
    substring, together with the whitespace directly around it, from each
    string in ``array``.

    Args:
        array: Iterable of description strings.

    Returns:
        A new list holding one cleaned string per input element, in the
        same order as the input.
    """
    identifier_pattern = re.compile(r'\s*\(GO:\d+\)\s*|\s*R-HSA-\d+\s*|\s*WP\d+\s*')
    return [identifier_pattern.sub('', text) for text in array]

In [3]:
# Names of the gene set libraries to process. Each name is used to build the
# path of a GMT file: "libs_human/gmt/<name>.txt".
# For a shorter run, restrict the list to the three pathway libraries
# (KEGG_2021_Human, Reactome_2022, WikiPathway_2023_Human).
lib_names = [
    "KEGG_2021_Human",
    "Reactome_2022",
    "WikiPathway_2023_Human",
    "GO_Biological_Process_2023",
    "GO_Molecular_Function_2023",
    "GO_Cellular_Component_2023",
]

Export Enrichr libraries as JSON.

In [10]:
# Tabulate the number of genes in every gene set of every library and save
# the table as a tab-separated file.
geneset_sz = []
for lib_name in lib_names:
    gmt_path = "libs_human/gmt/" + lib_name + ".txt"
    descr, genes = llm2geneset.read_gmt(gmt_path)
    # One record per (description, gene list) pair read from this library.
    geneset_sz.extend(
        {"lib_name": lib_name, "descr": d, "num_genes": len(g)}
        for d, g in zip(descr, genes)
    )
df = pd.DataFrame(geneset_sz)
df.to_csv("geneset_sz.tsv", sep="\t", index=None)

In [14]:
# Read hgnc_symbols.txt as a headerless table (the first line is data, not
# column names). Presumably one HGNC gene symbol per line -- TODO confirm.
hgnc_symbols = pd.read_csv(
    "hgnc_symbols.txt",
    header=None,
)

In [11]:
aclient = openai.AsyncClient()
models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]
for model in models:
    for lib_name in lib_names:
        (descr, genes) = llm2geneset.read_gmt("libs_human/gmt/" + lib_name + ".txt")
        # Generate cleaned version of gene set description w/o identifiers.
        descr_cleaned = clean_elements(descr)
        
        # Generate genes, and keep track of time it takes (seconds).
        start_time = time.time()
        llm_genes = await llm2geneset.get_genes(aclient, descr_cleaned, model=model)
        end_time = time.time()
        gen_time = end_time - start_time
        gen_res = {}

        # Assemble and save generation results.
        gen_res["lib_name"] = lib_name
        gen_res["model"] = model
        gen_res["gen_time"] = gen_time
        gen_res["descr"] = descr
        gen_res["descr_cleaned"] = descr_cleaned
        gen_res["curated_genesets"] = genes
        gen_res["llm_genesets"] = llm_genes

        with open('libs_human/' + model + '/' + lib_name + '.json', 'w') as json_file:
            json.dump(gen_res, json_file, indent=4)
        


 15%|█████████████████▊                                                                                                      | 802/5407 [00:07<00:30, 151.21it/s]

retrying
List all the known genes directly and indirectly involved in the following biological process or pathway: Positive Regulation Of NIK/NF-kappaB Signaling. Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            },
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the biological process or pathway: Positive Regulation Of NIK/NF-kappaB Signaling. Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.



 39%|██████████████████████████████████████████████▌                                                                        | 2117/5407 [00:16<00:19, 168.17it/s]

retrying
List all the known genes directly and indirectly involved in the following biological process or pathway: Contact Inhibition. Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            },
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the biological process or pathway: Contact Inhibition. Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 5406/5407 [01:00<00:02,  2.98s/it]

retrying
List all the known genes directly and indirectly involved in the following biological process or pathway: Positive Regulation Of Neurotransmitter Secretion. Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            },
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the biological process or pathway: Positive Regulation Of Neurotransmitter Secretion. Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5407/5407 [01:36<00:00, 56.07it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 474/474 [00:15<00:00, 31.04it/s]
 12%|██████████████▊                                                                                                         | 668/5407 [00:10<00:42, 111.81it/s]

retrying
List all the known genes directly and indirectly involved in the following biological process or pathway: Peroxisomal Membrane Transport. Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            },
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the biological process or pathway: Peroxisomal Membrane Transport. Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.



 46%|███████████████████████████████████████████████████████▏                                                               | 2510/5407 [00:23<00:20, 142.01it/s]

retrying
List all the known genes directly and indirectly involved in the following biological process or pathway: Positive Regulation Of Cysteine-Type Endopeptidase Activity Involved In Apoptotic Signaling Pathway. Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            },
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the biological process or pathway: Positive Regulation Of Cysteine-Type Endopeptidase Activity Involved In Apoptotic Signaling Pathway. Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.



 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 4708/5407 [00:38<00:04, 142.06it/s]

retrying
List all the known genes directly and indirectly involved in the following biological process or pathway: Membrane Fusion. Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            },
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the biological process or pathway: Membrane Fusion. Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5407/5407 [01:14<00:00, 72.90it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 474/474 [00:30<00:00, 15.55it/s]
