In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import time
import pandas as pd

In [2]:
import re

def clean_elements(array):
    """Return the strings of ``array`` with database identifiers stripped.

    Every ``(GO:<digits>)``, ``R-HSA-<digits>`` and ``WP<digits>`` substring is
    removed from each element, together with any whitespace directly before
    and after it.

    Args:
        array: Iterable of strings to clean.

    Returns:
        A new list with one cleaned string per input element, in input order.
    """
    # One alternation covers the three identifier shapes; the surrounding \s*
    # swallows the whitespace that separated the identifier from the text.
    identifier = re.compile(r'\s*\(GO:\d+\)\s*|\s*R-HSA-\d+\s*|\s*WP\d+\s*')
    return [identifier.sub('', text) for text in array]

In [3]:
# Gene set libraries to process. Each name is used to locate the file
# "libs_human/gmt/<name>.txt". For a quicker run, trim this list to a subset
# (e.g. only "KEGG_2021_Human", or the first three pathway libraries).
lib_names = [
    "KEGG_2021_Human",
    "Reactome_2022",
    "WikiPathway_2023_Human",
    "GO_Biological_Process_2023",
    "GO_Molecular_Function_2023",
    "GO_Cellular_Component_2023",
]

In [4]:
# Build a table of basic statistics for the gene set libraries in use:
# one row per gene set holding its library, its description and its size.
geneset_sz = []
for lib_name in lib_names:
    descr, genes = llm2geneset.read_gmt(f"libs_human/gmt/{lib_name}.txt")
    # Report how many entries were read for this library.
    print(len(genes))
    geneset_sz += [
        {"lib_name": lib_name, "descr": d, "num_genes": len(g)}
        for d, g in zip(descr, genes)
    ]

# Persist the statistics as a tab-separated file without the row index.
df = pd.DataFrame(geneset_sz)
df.to_csv("geneset_sz.tsv", sep="\t", index=None)

320
1818
801
5407
1147
474


In [None]:
aclient = openai.AsyncClient()
models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]
# "gpt-4-turbo-2024-04-09" has garbage random text, doesn't generat correctly using
# the async API
# models = ["gpt-4-turbo-2024-04-09"]
for model in models:
    for lib_name in lib_names:
        (descr, genes) = llm2geneset.read_gmt("libs_human/gmt/" + lib_name + ".txt")
        # Generate cleaned version of gene set description w/o identifiers.
        descr_cleaned = clean_elements(descr)
        
        # Generate genes, and keep track of time it takes (seconds).
        start_time = time.time()
        llm_genes = await llm2geneset.get_genes(aclient, descr_cleaned, model=model)
        end_time = time.time()
        gen_time = end_time - start_time
        gen_res = {}

        # Assemble and save generation results.
        gen_res["lib_name"] = lib_name
        gen_res["model"] = model
        gen_res["gen_time"] = gen_time
        gen_res["descr"] = descr
        gen_res["descr_cleaned"] = descr_cleaned
        gen_res["curated_genesets"] = genes
        gen_res["llm_genesets"] = llm_genes
    
        with open('libs_human/' + model + '/' + lib_name + '.json', 'w') as json_file:
            json.dump(gen_res, json_file, indent=4)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [00:16<00:00, 19.77it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 1817/1818 [00:36<00:02,  2.49s/it]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Platelet Activation, Signaling And Aggregation""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Platelet Activation, Signaling And Aggregation""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "ADRA2A"},
    {"gene": "ADRA2B"},
    {"gene": "ADRB1"},
    {"gene": "ADRB2"},
    {"gene": "ADRB3"},
    {"gene": "AKT1"},
    {"gene": "AKT2"},
    {"gene": "AKT3"},
    {"gene": "ARHGEF1"},
    {"gene": "CALM1"},
    {

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [00:43<00:00, 41.67it/s]
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 742/801 [00:08<00:01, 46.89it/s]

retrying
'gene'
List all the known genes directly and indirectly involved in the following biological process or cellular component """Glycosphingolipid Metabolism""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Glycosphingolipid Metabolism""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {
        "gene": "ABCA1"
    },
    {
        "gene": "ABCB11"
    },
    {
        "gene": "ABCB4"
    },
    {
        "gene": "ABCG5"
    },
    {
        "gene": "ABCG8"
    },
    {
        "gene": "ASAH1"
    },
    {
        "gene": "ASAH2"
    },
    {
        "gene": "B4G

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 800/801 [00:22<00:01,  1.60s/it]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """GPCRs Class A Rhodopsin Like""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """GPCRs Class A Rhodopsin Like""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "ADRA1A"},
    {"gene": "ADRA1B"},
    {"gene": "ADRA1D"},
    {"gene": "ADRA2A"},
    {"gene": "ADRA2B"},
    {"gene": "ADRA2C"},
    {"gene": "ADRB1"},
    {"gene": "ADRB2"},
    {"gene": "ADRB3"},
    {"gene": "BDKRB1"},
    {"gene": "BDKRB2"},
    {"gene"

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:48<00:00, 16.47it/s]
 60%|████████████████████████████████████████████████████████████████████████▋                                                | 3248/5407 [00:24<00:13, 158.16it/s]

retrying
'gene'
List all the known genes directly and indirectly involved in the following biological process or cellular component """Regulation Of Cell Cycle""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Regulation Of Cell Cycle""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "CDK1"},
    {"gene": "CCND1"},
    {"gene": "CCNE1"},
    {"gene": "CDK2"},
    {"gene": "CDKN1A"},
    {"gene": "CDKN1B"},
    {"gene": "CDKN2A"},
    {"gene": "CDKN2B"},
    {"gene": "RB1"},
    {"gene": "E2F1"},
    {"gene": "TP53"},
    {"gene": "CDK4"},
    {"gne": "CCNA1"},

 77%|████████████████████████████████████████████████████████████████████████████████████████████▊                            | 4148/5407 [00:30<00:07, 175.89it/s]

retrying
'gene'
List all the known genes directly and indirectly involved in the following biological process or cellular component """Regulation Of Epidermal Cell Differentiation""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Regulation Of Epidermal Cell Differentiation""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "TP63"},
    {"gene": "TP73"},
    {"gene": "NOTCH1"},
    {"gene": "NOTCH2"},
    {"gene": "NOTCH3"},
    {"gene": "NOTCH4"},
    {"gene": "JAG1"},
    {"gene": "JAG2"},
    {"gEne": "DLL1"},
    {"gene": "DLL3"},
    {"gene": "DLL4"},
    

 84%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 4547/5407 [00:33<00:05, 149.43it/s]

retrying
string indices must be integers, not 'str'
List all the known genes directly and indirectly involved in the following biological process or cellular component """Negative Regulation Of Monocyte Chemotaxis""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Negative Regulation Of Monocyte Chemotaxis""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
{
    "genes": [
        {"gene": "CCL2"},
        {"gene": "CCL3"},
        {"gene": "CCL4"},
        {"gene": "CCR2"},
        {"gene": "CX3CL1"},
        {"gene": "CXCL10"},
        {"gene": "CXCL12"},
        {"gene": "P

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 5406/5407 [01:13<00:05,  5.64s/it]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Regulation Of Protein Polymerization""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Regulation Of Protein Polymerization""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "ACTG1"},
    {"gene": "ACTN1"},
    {"gene": "ACTR10"},
    {"gene": "ADF"},
    {"gene": "AGFG1"},
    {"gene": "AIMP1"},
    {"gene": "AIMP2"},
    {"gene": "AIMP2-DX7"},
    {"gene": "AIMP3"},
    {"gene": "AIP"},
    {"gene": "ALBU"},
   

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5407/5407 [01:25<00:00, 62.97it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1147/1147 [00:21<00:00, 52.31it/s]
 49%|█████████████████████████████████████████████████████████████▏                                                              | 234/474 [00:04<00:02, 80.77it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Clathrin-Coated Endocytic Vesicle""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Clathrin-Coated Endocytic Vesicle""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "AP2M1"},
    {"gene": "AP2A1"},
    {"gene": "AP2S1"},
    {"gene": "AP2B1"},
    {"gene": "AP2A2"},
    {"gene": "AP2B2"},
    {"gene": "AP2A1-AS1"},
    {"gene": "CLTC"},
    {"gene": "CLTA"},
    {"gene": "CLTB"},
    {"gene": "CLTCL1"},
    {"g

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 474/474 [00:25<00:00, 18.68it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [01:10<00:00,  4.56it/s]
  7%|████████▋                                                                                                                  | 129/1818 [00:04<00:21, 77.30it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Cytochrome C-Mediated Apoptotic Response""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Cytochrome C-Mediated Apoptotic Response""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {
        "gene": "CYCS"
    },
    {
        "gene": "APA


 12%|██████████████▌                                                                                                           | 217/1818 [00:05<00:14, 110.95it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Apoptosis""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Apoptosis""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "TP53"},
    {"gene": "BAX"},
    {"gene": "BCL2"},
    {"gene": "CASP3"},
    {"gene": "CASP8"},
    {"gene": "CASP9"},
    {"gene": "FAS"},
    {"gene": "FASLG"},
    {"gene": "BID"},
    {"gene": "BCL2L1"},
    {"gene": "BIK"},
    {"gene": "BAD"},
    {"gene": "BIRC5"},
    {"gene": "DIABLO"}

 74%|█████████████████████████████████████████████████████████████████████████████████████████▉                               | 1351/1818 [00:13<00:03, 143.49it/s]

retrying
'gene'
List all the known genes directly and indirectly involved in the following biological process or cellular component """SUMO Is Proteolytically Processed""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """SUMO Is Proteolytically Processed""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "SENP1"},
    {"gene": "SENP2"},
    {"gene": "SENP3"},
    {"gene: "SENP5"},
    {"gene": "SENP6"},
    {"gene": "SENP7"},
    {"gene": "UBA2"},
    {"gene": "SAE1"},
    {"gene": "UBC9"},
    {"gene": "SUMO1"},
    {"gene": "SUMO2"},
    {"gene": "SUMO3"},
    

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [00:45<00:00, 39.87it/s]
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 783/801 [00:13<00:01, 14.16it/s]