# Caution: This notebook uses the OpenAI API. Token costs can add up quickly.

In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import time
import pandas as pd

In [2]:
import re

def clean_elements(array):
    """Strip database identifiers from gene set descriptions.

    Removes ``(GO:xxx)``, ``R-HSA-xxx`` and ``WPxxx`` substrings (``xxx`` being
    digits) together with the whitespace around them.

    The ``R-HSA-`` and ``WP`` patterns are anchored on word boundaries so that
    tokens merely containing them (e.g. ``SWP70``) are left intact. An
    identifier in the middle of a description is replaced by a single space so
    the neighbouring words are not fused; leading/trailing whitespace of each
    result is stripped.

    Args:
        array: Iterable of description strings.

    Returns:
        A new list with one cleaned string per input element, in input order.
    """
    # One or more consecutive identifiers (with surrounding whitespace) are
    # collapsed into a single separator.
    identifier_run = re.compile(
        r'\s*(?:(?:\(GO:\d+\)|\bR-HSA-\d+\b|\bWP\d+\b)\s*)+'
    )
    return [identifier_run.sub(' ', element).strip() for element in array]

In [53]:
# Gene set libraries available under libs_human/gmt/, grouped so the selection
# below is explicit rather than relying on one assignment overwriting another.
PATHWAY_LIB_NAMES = ["KEGG_2021_Human",
                     "Reactome_2022",
                     "WikiPathway_2023_Human"]
GO_LIB_NAMES = ["GO_Biological_Process_2023",
                "GO_Molecular_Function_2023",
                "GO_Cellular_Component_2023"]

# Libraries processed by the cells below. Only the pathway libraries are
# selected; use PATHWAY_LIB_NAMES + GO_LIB_NAMES to include the GO libraries.
lib_names = list(PATHWAY_LIB_NAMES)

In [None]:
aclient = openai.AsyncClient()
models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]
for model in models:
    for lib_name in lib_names:
        (descr, genes) = llm2geneset.read_gmt("libs_human/gmt/" + lib_name + ".txt")
        # Generate cleaned version of gene set description w/o identifiers.
        descr_cleaned = clean_elements(descr)
        
        # Generate genes sets with a system message with role prompt.
        start_time1 = time.time()
        llm_genes_role = await llm2geneset.get_genes(aclient,
                                                     descr_cleaned,
                                                     model=model,
                                                     prompt_type='basic',
                                                     use_sysmsg=True)
        end_time1 = time.time()
        gen_time_role = end_time1 - start_time1

        # Generate gene sets without role prompt.
        start_time2 = time.time()
        llm_genes_norole = await llm2geneset.get_genes(aclient,
                                                       descr_cleaned,
                                                       model=model)
        end_time2 = time.time()
        gen_time_norole = end_time2 - start_time2

        # Generate gene sets with reasoning.
        start_time3 = time.time()
        llm_genes_reason = await llm2geneset.get_genes(aclient,
                                                       descr_cleaned,
                                                       model=model, 
                                                       prompt_type='reason')
        end_time3 = time.time()
        gen_time_reasoning = end_time3 - start_time3

        # Generate gene sets with confidence.
        start_time4 = time.time()
        llm_genes_conf = await llm2geneset.get_genes(aclient,
                                                     descr_cleaned,
                                                     model=model, 
                                                     prompt_type='conf')
        end_time4 = time.time()
        gen_time_conf = end_time4 - start_time4


        # Assemble and save generation results.
        gen_res = {}
        gen_res["lib_name"] = lib_name
        gen_res["model"] = model
        gen_res["descr"] = descr
        gen_res["descr_cleaned"] = descr_cleaned 
        gen_res["gen_time_role"] = gen_time_role
        gen_res["gen_time_norole"] = gen_time_norole
        gen_res["gen_time_reasoning"] = gen_time_reasoning
        gen_res["gen_time_conf"] = gen_time_conf
        gen_res["curated_genesets"] = genes
        gen_res["llm_genes_role"] = llm_genes_role
        gen_res["llm_genes_norole"] = llm_genes_norole
        gen_res["llm_genes_reason"] = llm_genes_reason
        gen_res["llm_genes_conf"] = llm_genes_conf
    
        with open('libs_human/' + model + '/' + lib_name + '.json', 'w') as json_file:
            json.dump(gen_res, json_file, indent=4)

In [12]:
# Generate gene sets for ensembling.
aclient = openai.AsyncClient()
models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]
for model in models:
    for lib_name in lib_names:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        for i in range(4):
            start_time = time.time()
            gen_res["llm_ensemble_" + str(i)] = await llm2geneset.get_genes(aclient,
                                                                            gen_res["descr_cleaned"],
                                                                            model=model)
            end_time = time.time()
            gen_res["gen_time_ensemble_" + str(i)] = end_time - start_time

        with open('libs_human/' + model + '/' + lib_name + '.json', 'w') as json_file:
            json.dump(gen_res, json_file, indent=4)

 15%|██████████████████▎                                                                                                        | 271/1818 [00:05<00:16, 94.38it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Signaling By ERBB2 TMD/JMD Mutants""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Signaling By ERBB2 TMD/JMD Mutants""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {
        "gene": "ERBB2"
    },
    {
        "gene": "EGFR"
    },
    {
        "gene": "GRB2"
    },
    {
        "gene": "SHC1"
    },
    {
        "gene": "PIK3CA"
    },
    {
        "gene": "PTK2"
    },
    {
        "gene": "SRC"
    },
    {


 26%|███████████████████████████████▎                                                                                          | 467/1818 [00:07<00:10, 128.47it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Signaling By PDGFR In Disease""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Signaling By PDGFR In Disease""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "PDGFRA"},
    {"gene": "PDGFRB"},
    {"gene": "PIK3CA"},
    {"gene": "PIK3R1"},
    {"gene": "PTEN"},
    {"gene": "AKT1"},
    {"gene": "AKT2"},
    {"gene": "AKT3"},
    {"gene": "MTOR"},
    {"gene": "MAPK1"},
    {"gene": "MAPK3"},
    {"gene": "RAF1

 51%|█████████████████████████████████████████████████████████████▋                                                            | 919/1818 [00:10<00:05, 162.20it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Potential Therapeutics For SARS""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Potential Therapeutics For SARS""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {
        "gene": "ACE2"
    },
    {
        "gene": "TMPRSS2"
    },
    {
        "gene": "DPP4"
    },
    {
        "gene": "AGT"
    },
    {
        "gene": "IFITM3"
    },
    {
        "gene": "ADAM17"
    },
    {
        "gene": "TMEM16F"
    },
    {

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [00:55<00:00, 32.48it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [00:37<00:00, 49.06it/s]
 64%|████████████████████████████████████████████████████████████████████████████▊                                            | 1155/1818 [00:11<00:05, 121.49it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """SARS-CoV Infections""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """SARS-CoV Infections""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {
        "gene": "ACE2"
    },
    {
        "gene": "TMPRSS2"
    },
    {
        "gene": "DPP4"
    },
    {
        "gene": "CTS


 71%|██████████████████████████████████████████████████████████████████████████████████████▍                                  | 1298/1818 [00:12<00:03, 170.90it/s]

retrying
string indices must be integers, not 'str'
List all the known genes directly and indirectly involved in the following biological process or cellular component """HIV Life Cycle""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """HIV Life Cycle""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    { "gene": "CD4" },
    { "gene": "CCR5" },
    { "gene": "CXCR4" },
    { "gene": "FURIN" },
    { "gene": "APOBEC3G" },
    { "gene": "APOBEC3H" },
    { "gene": "EIF2AK2" },
    { "gene": "TRIM5" },
    { "gene": "SAMHD1" },
    { "gene": "MOV10" },
    { "gene": "SERINC3

 76%|████████████████████████████████████████████████████████████████████████████████████████████▏                            | 1385/1818 [00:13<00:03, 142.95it/s]

retrying
list indices must be integers or slices, not str
List all the known genes directly and indirectly involved in the following biological process or cellular component """FGFR3b Ligand Binding And Activation""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """FGFR3b Ligand Binding And Activation""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "FGFR3"},
    {"gene": "FGF1"},
    {"gene": "FGF2"},
    {"gene": "FGF3"},
    {"gene": "FGF4"},
    {"gene": "FGF5"},
    {"gene": "FGF6"},
    {"gene": "FGF7"},  // Also known as KGFR
    {"gene": "FGF8"},
    {"

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [00:59<00:00, 30.61it/s]
 73%|███████████████████████████████████████████████████████████████████████████████████████▊                                 | 1320/1818 [00:12<00:03, 156.61it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Activation Of TRKA Receptors""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Activation Of TRKA Receptors""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {
        "gene": "NTRK1"
    },
    {
        "gene": "NGF"
    },
    {
        "gene": "SHC1"
    },
    {
        "gene": "GRB2"
    },
    {
        "gene": "SOS1"
    },
    {
        "gene": "PIK3R1"
    },
    {
        "gene": "PIK3CA"
    },
    {
        "g

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [00:51<00:00, 35.05it/s]


In [57]:
# Generate gene sets using RAG.
aclient = openai.AsyncClient()
#models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]
models = ["gpt-4o-2024-05-13"]

for model in models:    
    for lib_name in lib_names:
        print(model)
        print(lib_name)
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
        descr_cleaned = gen_res["descr_cleaned"]


        start_time = time.time()
        pmids = await llm2geneset.esearch_async(descr_cleaned, "pubmed", 1)
        pmids_cleaned = []
        for i in pmids:
            if len(i)>0:
                pmids_cleaned.append(i[0])
            else:
                pmids_cleaned.append("")


        abstracts = await llm2geneset.efetch_pubmed_async([[i] for i in pmids_cleaned])
        context = []
        for x in abstracts:
            if len(x) > 0:
                context.append(x[0]['abstract'])
            else:
                context.append("")
        
        
        llm_genes_rag = await llm2geneset.get_genes_context(aclient,
                                                                          context,
                                                                          descr_cleaned, 
                                                                          model = model)
        end_time = time.time()
        gen_time_rag = end_time - start_time

        gen_res["gen_time_rag"] = gen_time_rag
        gen_res["llm_genes_rag"] = llm_genes_rag
        
        with open('libs_human/' + model + '/' + lib_name + '.json', 'w') as json_file:
            json.dump(gen_res, json_file, indent=4)

gpt-4o-2024-05-13
Reactome_2022


100%|███████████████████████████████████████| 1818/1818 [15:10<00:00,  2.00it/s]
 80%|███████████████████████████████        | 1450/1818 [12:06<03:07,  1.97it/s]

Attempt 0 failed with error: Server disconnected


100%|███████████████████████████████████████| 1818/1818 [15:11<00:00,  2.00it/s]
 50%|███████████████████▎                   | 903/1818 [00:12<00:06, 131.39it/s]

retrying
string indices must be integers, not 'str'
BACKGROUND Neuromyelitis optica is an inflammatory demyelinating disease with generally poor prognosis that selectively targets optic nerves and spinal cord. It is commonly misdiagnosed as multiple sclerosis. Neither disease has a distinguishing biomarker, but optimum treatments differ. The relation of neuromyelitis optica to optic-spinal multiple sclerosis in Asia is uncertain. We assessed the capacity of a putative marker for neuromyelitis optica (NMO-IgG) to distinguish neuromyelitis optica and related disorders from multiple sclerosis. METHODS Indirect immunofluorescence with a composite substrate of mouse tissues identified a distinctive NMO-IgG staining pattern, which we characterised further by dual immunostaining. We tested masked serum samples from 102 North American patients with neuromyelitis optica or with syndromes that suggest high risk of the disorder, and 12 Japanese patients with optic-spinal multiple sclerosis. Contr

 79%|██████████████████████████████▏       | 1445/1818 [00:16<00:03, 104.11it/s]

retrying
No code blocks found
 FGFR signaling is deregulated in many human cancers, and FGFR is considered a valid target in FGFR-deregulated tumors. Here, we examine the preclinical profile of futibatinib (TAS-120; 1-[(3S)-[4-amino-3-[(3,5-dimethoxyphenyl)ethynyl]-1H-pyrazolo[3, 4-d] pyrimidin-1-yl]-1-pyrrolidinyl]-2-propen-1-one), a structurally novel, irreversible FGFR1-4 inhibitor. Among a panel of 296 human kinases, futibatinib selectively inhibited FGFR1-4 with IC50 values of 1.4 to 3.7 nmol/L. Futibatinib covalently bound the FGFR kinase domain, inhibiting FGFR phosphorylation and, in turn, downstream signaling in FGFR-deregulated tumor cell lines. Futibatinib exhibited potent, selective growth inhibition of several tumor cell lines (gastric, lung, multiple myeloma, bladder, endometrial, and breast) harboring various FGFR genomic aberrations. Oral administration of futibatinib led to significant dose-dependent tumor reduction in various FGFR-driven human tumor xenograft models, 

 83%|████████████████████████████████▎      | 1508/1818 [00:17<00:03, 90.99it/s]

retrying
No code blocks found
 Circadian rhythms are biological systems that synchronize cellular circadian oscillators with the organism's daily feeding-fasting or rest-activity cycles in mammals. Circadian rhythms regulate nutrient absorption and utilization at the cellular level and are closely related to obesity and metabolic disorders. Bile acids are important modulators that facilitate nutrient absorption and regulate energy metabolism. Here, we provide an overview of the current connections and future perspectives between the circadian clock and bile acid metabolism as well as related metabolic diseases. Feeding and fasting cycles influence bile acid pool size and composition, and bile acid signaling can respond to acute lipid and glucose utilization and mediate energy balance. Disruption of circadian rhythms such as shift work, irregular diet, and gene mutations can contribute to altered bile acid metabolism and heighten obesity risk. High-fat diets, alcohol, and gene mutations

100%|███████████████████████████████████████| 1818/1818 [00:59<00:00, 30.43it/s]


gpt-4o-2024-05-13
WikiPathway_2023_Human


100%|█████████████████████████████████████████| 801/801 [06:41<00:00,  2.00it/s]
100%|█████████████████████████████████████████| 801/801 [06:41<00:00,  2.00it/s]
 21%|████████▌                                | 168/801 [00:06<00:09, 67.53it/s]

retrying
string indices must be integers, not 'str'
 Photodynamic therapy (PDT) is a non-to-minimally invasive treatment modality that utilizes photoactivatable drugs called photosensitizers to disrupt tumors with locally photoproduced reactive oxygen species (ROS). Photosensitizer activation by light results in hyperoxidative stress and subsequent tumor cell death, vascular shutdown and hypoxia, and an antitumor immune response. However, sublethally afflicted tumor cells initiate several survival mechanisms that account for decreased PDT efficacy. The hypoxia inducible factor 1 (HIF-1) pathway is one of the most effective cell survival pathways that contributes to cell recovery from PDT-induced damage. Several hundred target genes of the HIF-1 heterodimeric complex collectively mediate processes that are involved in tumor cell survival directly and indirectly (e.g., vascularization, glucose metabolism, proliferation, and metastasis). The broad spectrum of biological ramifications culm

 25%|██████████▏                              | 199/801 [00:06<00:07, 84.36it/s]

retrying
string indices must be integers, not 'str'
 Zinc oxide nanoparticles (ZnONPs) hold great promise for biomedical applications. Previous studies have revealed that ZnONPs exposure can induce toxicity in endothelial cells, but the underlying mechanisms have not been fully elucidated. In this study, we report that ZnONPs can induce ferroptosis of both HUVECs and EA.hy926 cells, as evidenced by the elevation of intracellular iron levels, lipid peroxidation and cell death in a dose- and time-dependent manner. In addition, both the lipid reactive oxygen species (ROS) scavenger ferrostatin-1 and the iron chelator deferiprone attenuated ZnONPs-induced cell death. Intriguingly, we found that ZnONPs-induced ferroptosis is macroautophagy/autophagy-dependent, because the inhibition of autophagy with a pharmacological inhibitor or by ATG5 gene knockout profoundly mitigated ZnONPs-induced ferroptosis. We further demonstrated that NCOA4 (nuclear receptor coactivator 4)-mediated ferritinophagy

100%|█████████████████████████████████████████| 801/801 [00:33<00:00, 23.75it/s]


In [8]:
# Generate gene sets using RAG.
aclient = openai.AsyncClient()
set0 = [['CETP', 'TRNP1', 'P4HB', 'APOF', 'LPA' , 'APOB']]
set0 = [['ADIPOQ'
        'CPT1A', 'LEP', 'LEPR', 'ADIPOR1', 'PRKAG1',  'ADIPOR2', 'PRKAB1',  'PRKAA1',  'ACACA']]
set0 = [['SRR'   ,  'PSAT1'  , 'SHMT2' ,  'SHMT1' ,  'PHGDH' ,  'TYMS' ,   'DHFR',   ' PSPH']]
await llm2geneset.gsai(aclient,set0)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:27<00:00, 27.12s/it]

Name: One-carbon metabolism and serine biosynthesis
LLM self-assessed confidence: 0.97

1. SRR, PSAT1, and PSPH are involved in the serine biosynthesis pathway. SRR (serine racemase) catalyzes the isomerization of L-serine to D-serine, which acts as a coagonist with glutamate on NMDA receptors. PSAT1 (phosphoserine aminotransferase 1) and PSPH (phosphoserine phosphatase) are enzymes that convert 3-phosphohydroxypyruvate to L-serine through an intermediate, 3-phosphoserine. This pathway links glycolysis to the synthesis of this non-essential amino acid, impacting cell proliferation and growth.

2. PHGDH (phosphoglycerate dehydrogenase) is another pivotal enzyme in the serine biosynthesis pathway, converting 3-phosphoglycerate into 3-phosphohydroxypyruvate. This reaction is the first committed step in the synthesis of serine from glycolytic intermediates and is crucial for the provision of serine, glycine, and one-carbon units essential for various biosynthetic processes.

3. SHMT1 and S




[{'name': 'One-carbon metabolism and serine biosynthesis',
  'conf': 0.97,
  'annot': ['1. SRR, PSAT1, and PSPH are involved in the serine biosynthesis pathway. SRR (serine racemase) catalyzes the isomerization of L-serine to D-serine, which acts as a coagonist with glutamate on NMDA receptors. PSAT1 (phosphoserine aminotransferase 1) and PSPH (phosphoserine phosphatase) are enzymes that convert 3-phosphohydroxypyruvate to L-serine through an intermediate, 3-phosphoserine. This pathway links glycolysis to the synthesis of this non-essential amino acid, impacting cell proliferation and growth.',
   '2. PHGDH (phosphoglycerate dehydrogenase) is another pivotal enzyme in the serine biosynthesis pathway, converting 3-phosphoglycerate into 3-phosphohydroxypyruvate. This reaction is the first committed step in the synthesis of serine from glycolytic intermediates and is crucial for the provision of serine, glycine, and one-carbon units essential for various biosynthetic processes.',
   '3. S