In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import time
import pandas as pd

In [2]:
import re

def clean_elements(array):
    """Strip database identifiers from gene set descriptions.

    Every ``(GO:<digits>)``, ``R-HSA-<digits>`` and ``WP<digits>`` token is
    deleted together with the whitespace directly surrounding it. Because the
    adjacent whitespace is consumed as well, an identifier sitting between
    two words leaves those words joined.

    Args:
        array: Iterable of description strings.

    Returns:
        A new list holding one cleaned string per input element, in the
        original order.
    """
    identifier_pattern = r'\s*\(GO:\d+\)\s*|\s*R-HSA-\d+\s*|\s*WP\d+\s*'
    return [re.sub(identifier_pattern, '', text) for text in array]

In [3]:
# Gene set libraries to process. Each name is resolved to a GMT file at
# "libs_human/gmt/<name>.txt" by the cells that read the libraries.
# For a quicker trial run, assign a subset instead, e.g. only
# "KEGG_2021_Human", or just the first three entries of this list.
lib_names = [
    "KEGG_2021_Human",
    "Reactome_2022",
    "WikiPathway_2023_Human",
    "GO_Biological_Process_2023",
    "GO_Molecular_Function_2023",
    "GO_Cellular_Component_2023",
]

In [4]:
# Generate table for some basic stats on gene set libraries used:
# one row per gene set with its library name, description and gene count,
# written to "geneset_sz.tsv" as tab-separated text.
geneset_sz = []
for lib_name in lib_names:
    (descr, genes) = llm2geneset.read_gmt("libs_human/gmt/" + lib_name + ".txt")
    # descr and genes are paired positionally (assumes read_gmt returns
    # parallel sequences -- TODO confirm against llm2geneset).
    geneset_sz.extend(
        {"lib_name": lib_name, "descr": d, "num_genes": len(g)}
        for (d, g) in zip(descr, genes)
    )
    # Report how many gene sets this library contains.
    print(len(genes))
df = pd.DataFrame(geneset_sz)
# `index` is documented as a bool; pass False explicitly instead of relying
# on None being falsy. The row index is still omitted from the file.
df.to_csv("geneset_sz.tsv", sep="\t", index=False)

320
1818
801
5407
1147
474


In [5]:
aclient = openai.AsyncClient()
models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]
for model in models:
    for lib_name in lib_names:
        (descr, genes) = llm2geneset.read_gmt("libs_human/gmt/" + lib_name + ".txt")
        # Generate cleaned version of gene set description w/o identifiers.
        descr_cleaned = clean_elements(descr)
        
        # Generate genes, and keep track of time it takes (seconds).
        start_time = time.time()
        llm_genes = await llm2geneset.get_genes(aclient, descr_cleaned, model=model)
        end_time = time.time()
        gen_time = end_time - start_time
        gen_res = {}

        # Assemble and save generation results.
        gen_res["lib_name"] = lib_name
        gen_res["model"] = model
        gen_res["gen_time"] = gen_time
        gen_res["descr"] = descr
        gen_res["descr_cleaned"] = descr_cleaned
        gen_res["curated_genesets"] = genes
        gen_res["llm_genesets"] = llm_genes
    
        with open('libs_human/' + model + '/' + lib_name + '.json', 'w') as json_file:
            json.dump(gen_res, json_file, indent=4)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [00:35<00:00,  8.94it/s]
 17%|████████████████████▌                                                                                                     | 306/1818 [00:04<00:11, 134.55it/s]

retrying
unhashable type: 'list'
List all the known genes directly and indirectly involved in the following biological process or cellular component """Synthesis Of Epoxy (EET) And Dihydroxyeicosatrienoic Acids (DHET)""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Synthesis Of Epoxy (EET) And Dihydroxyeicosatrienoic Acids (DHET)""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "CYP2C8"},
    {"gene": "CYP2C9"},
    {"gene": "CYP2C19"},
    {"gene": "CYP2J2"},
    {"gene": "CYP4A11"},
    {"gene": "CYP4F2"},
    {"gene": ["EPHX2", "EPHX1"]},
    {"gene": "P

 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 1526/1818 [00:12<00:01, 172.57it/s]

retrying
string indices must be integers, not 'str'
List all the known genes directly and indirectly involved in the following biological process or cellular component """HDMs Demethylate Histones""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """HDMs Demethylate Histones""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
{
    "genes": [
        {
            "gene": "KDM1A"
        },
        {
            "gene": "KDM2A"
        },
        {
            "gene": "KDM2B"
        },
        {
            "gene": "KDM3A"
        },
        {
            "gene": "KDM3B"
       

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [01:38<00:00, 18.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:38<00:00, 20.75it/s]
 24%|█████████████████████████████▋                                                                                           | 1324/5407 [00:11<00:22, 181.96it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Microtubule Anchoring""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Microtubule Anchoring""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "CAMS


 29%|███████████████████████████████████▎                                                                                     | 1578/5407 [00:13<00:25, 150.43it/s]

retrying
'gene'
List all the known genes directly and indirectly involved in the following biological process or cellular component """Regulation Of Homotypic Cell-Cell Adhesion""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Regulation Of Homotypic Cell-Cell Adhesion""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "CDH1"},
    {"gene": "CDH2"},
    {"gene": "CTNND1"},
    {"gene": "JUP"},
    {"gene": "PVRL1"},
    {"gene": "CTNNB1"},
    {"gene": "CTNNA1"},
    {"gene": "CTNNA2"},
    {"gene": "CTNNA3"},
    {"gene": "CTNNA4"},
    {"gene": "CDH11"},
   

 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 5310/5407 [00:39<00:01, 68.08it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Regulation Of Blood Circulation""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Regulation Of Blood Circulation""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "AGT"},
    {"gene": "ANG"},
    {"gene": "ATP12A"},
    {"gene": "AVP"},
    {"gene": "CACNA1C"},
    {"gene": "CALCA"},
    {"gene": "CCL2"},
    {"gene": "CCL5"},
    {"gene": "CCR2"},
    {"gene": "CCR5"},
    {"gene": "CXCL10"},
    {"gene": "CXCL1

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 5404/5407 [01:00<00:06,  2.08s/it]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Phosphate-Containing Compound Metabolic Process""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Phosphate-Containing Compound Metabolic Process""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "ACDC"},
    {"gene": "ACO2"},
    {"gene": "ACP5"},
    {"gene": "ADK"},
    {"gene": "ADK2"},
    {"gene": "AGPS"},
    {"gene": "AK1"},
    {"gene": "AK2"},
    {"gene": "AK3"},
    {"gene": "ALPL"},
    {"gene": "AMDH

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5407/5407 [01:13<00:00, 73.65it/s]
 51%|██████████████████████████████████████████████████████████████▎                                                           | 586/1147 [00:06<00:03, 150.28it/s]

retrying
'gene'
List all the known genes directly and indirectly involved in the following biological process or cellular component """Cytokine Activity""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Cytokine Activity""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"genes": "IL1A"},
    {"genes": "IL1B"},
    {"genes": "IL2"},
    {"genes": "IL3"},
    {"genes": "IL4"},
    {"genes": "IL5"},
    {"genes": "IL6"},
    {"genes": "IL7"},
    {"genes": "IL8"},
    {"genes": "IL9"},
    {"genes": "IL10"},
    {"genes": "IL11"},
    {"genes": "IL12"},
    {"genes": "IL

 58%|██████████████████████████████████████████████████████████████████████▍                                                   | 662/1147 [00:06<00:03, 159.68it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Single-Stranded DNA Exodeoxyribonuclease Activity""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Single-Stranded DNA Exodeoxyribonuclease Activity""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "EXO1"},
    {"gene": "FEN1"},
    {"gene": "MRE11"},
    {"gene": "RAD2"},
    {"gene": "BLM"},
    {"gene": "DNA2"},
    {"gene": "WRN"},
    {"gene": "RECQ1"},
    {"gene": "RECQ4"},
    {"gene": "RB1"},
    {"gene

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1147/1147 [00:38<00:00, 29.69it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 474/474 [00:24<00:00, 19.74it/s]
 59%|████████████████████████████████████████████████████████████████████████▊                                                   | 188/320 [00:06<00:02, 55.59it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """VEGF signaling pathway""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """VEGF signaling pathway""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {
        "gene": "VEGFA"
    },
    {
        "gene": "VEGFB"
    },
    {
        "gene": "VEGFC"
    },
    {
        "gene": "VEGFD"
    },
    {
        "gene": "VEGFE"
    },
    {
        "gene": "NRP1"
    },
    {
        "gene": "NRP2"
    },
    {
        "gene": "KDR"

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [00:20<00:00, 15.32it/s]
 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 1678/1818 [00:19<00:02, 50.71it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Metabolism""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Metabolism""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {
        "gene": "G6PC"
    },
    {
        "gene": "HK1"
    },
    {
        "gene": "HK2"
    },
    {
        "gene": "HK3"
    },
    {
        "gene": "HKDC1"
    },
    {
        "gene": "PDHA1"
    },
    {
        "gene": "PDHB"
    },
    {
        "gene": "DLAT"
    },
    {
        "gene":

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [01:13<00:00, 24.86it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:30<00:00, 25.89it/s]
 27%|████████████████████████████████▏                                                                                        | 1436/5407 [00:15<00:34, 114.41it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Mitochondrial Respiratory Chain Complex I Assembly""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Mitochondrial Respiratory Chain Complex I Assembly""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "NDUFA1"},
    {"gene": "NDUFA2"},
    {"gene": "NDUFA3"},
    {"gene": "NDUFA4"},
    {"gene": "NDUFA5"},
    {"gene": "NDUFA6"},
    {"gene": "NDUFA7"},
    {"gene": "NDUFA8"},
    {"gene": "NDUFA9"},
    {"gene":

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5407/5407 [01:18<00:00, 69.20it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1147/1147 [00:53<00:00, 21.55it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 474/474 [00:43<00:00, 10.89it/s]
