# Caution: This notebook uses the OpenAI API. Token costs can add up quickly.

In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import time
import pandas as pd

In [2]:
import re

def clean_elements(array):
    """Strip database identifiers from a sequence of gene set descriptions.

    Every ``(GO:<digits>)``, ``R-HSA-<digits>`` and ``WP<digits>`` substring is
    removed from each element, together with any whitespace directly before
    or after it. The cleaned strings are returned as a new list, in the same
    order as the input; the input itself is not modified.
    """
    identifier_pattern = re.compile(r'\s*\(GO:\d+\)\s*|\s*R-HSA-\d+\s*|\s*WP\d+\s*')
    return [identifier_pattern.sub('', item) for item in array]

In [3]:
# Gene set libraries processed by this notebook. Each name is used to build
# the input path "libs_human/gmt/<name>.txt" and the per-model output path
# "libs_human/<model>/<name>.json".
#
# A previous definition in this cell also listed GO_Biological_Process_2023,
# GO_Molecular_Function_2023 and GO_Cellular_Component_2023, but it was
# overwritten immediately and never used; append those names here to include
# the GO libraries in a run.
lib_names = ["KEGG_2021_Human",
             "Reactome_2022",
             "WikiPathway_2023_Human"]

In [5]:
aclient = openai.AsyncClient()
models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]
for model in models:
    for lib_name in lib_names:
        (descr, genes) = llm2geneset.read_gmt("libs_human/gmt/" + lib_name + ".txt")
        # Generate cleaned version of gene set description w/o identifiers.
        descr_cleaned = clean_elements(descr)
        
        # Generate genes sets with a system message with role prompt.
        start_time1 = time.time()
        llm_genes_role = await llm2geneset.get_genes(aclient,
                                                     descr_cleaned,
                                                     model=model,
                                                     prompt_type='basic',
                                                     use_sysmsg=True)
        end_time1 = time.time()
        gen_time_role = end_time1 - start_time1

        # Generate gene sets without role prompt.
        start_time2 = time.time()
        llm_genes_norole = await llm2geneset.get_genes(aclient,
                                                       descr_cleaned,
                                                       model=model)
        end_time2 = time.time()
        gen_time_norole = end_time2 - start_time2

        # Generate gene sets with reasoning.
        start_time3 = time.time()
        llm_genes_reason = await llm2geneset.get_genes(aclient,
                                                       descr_cleaned,
                                                       model=model, 
                                                       prompt_type='reason')
        end_time3 = time.time()
        gen_time_reasoning = end_time3 - start_time3

        # Generate gene sets with confidence.
        start_time4 = time.time()
        llm_genes_conf = await llm2geneset.get_genes(aclient,
                                                     descr_cleaned,
                                                     model=model, 
                                                     prompt_type='conf')
        end_time4 = time.time()
        gen_time_conf = end_time4 - start_time4
        
        # Assemble and save generation results.
        gen_res = {}
        gen_res["lib_name"] = lib_name
        gen_res["model"] = model
        gen_res["descr"] = descr
        gen_res["descr_cleaned"] = descr_cleaned        
        gen_res["gen_time_role"] = gen_time_role
        gen_res["gen_time_norole"] = gen_time_norole
        gen_res["gen_time_reasoning"] = gen_time_reasoning
        gen_res["gen_time_conf"] = gen_time_conf
        gen_res["curated_genesets"] = genes
        gen_res["llm_genes_role"] = llm_genes_role
        gen_res["llm_genes_norole"] = llm_genes_norole
        gen_res["llm_genes_reason"] = llm_genes_reason
        gen_res["llm_genes_conf"] = llm_genes_conf
    
        with open('libs_human/' + model + '/' + lib_name + '.json', 'w') as json_file:
            json.dump(gen_res, json_file, indent=4)

 53%|████████████████████████████████████████████████████████████████▋                                                         | 964/1818 [00:10<00:05, 157.64it/s]

retrying
string indices must be integers, not 'str'
List all the known genes directly and indirectly involved in the following biological process or cellular component """rRNA Processing In Nucleus And Cytosol""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """rRNA Processing In Nucleus And Cytosol""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
{
    "type": "array",
    "items": [
        {
            "gene": "RPS6KC1"
        },
        {
            "gene": "NOP56"
        },
        {
            "gene": "FBL"
        },
        {
            "gene": "DDX21"
        }

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [00:35<00:00, 51.77it/s]
  1%|█▎                                                                                                                          | 20/1818 [00:02<01:45, 17.04it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """CD28 Co-Stimulation""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """CD28 Co-Stimulation""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "CD28"},
    {"gene": "CD80"},
    {"gene": "CD86"},
    {"gene": "PIK3R1"},
    {"gene": "PIK3R2"},
    {"gene": "PIK3CA"},
    {"gene": "PIK3CB"},
    {"gene": "GRB2"},
    {"gene": "GAB2"},
    {"gene": "SHC1"},
    {"gene": "LCK"},
    {"gene": "FYN"},
    {"gene": "ITK"},

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 1817/1818 [00:41<00:03,  3.25s/it]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """SLC-mediated Transmembrane Transport""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """SLC-mediated Transmembrane Transport""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "SLC1A1"},
    {"gene": "SLC1A2"},
    {"gene": "SLC1A3"},
    {"gene": "SLC1A4"},
    {"gene": "SLC1A5"},
    {"gene": "SLC1A6"},
    {"gene": "SLC1A7"},
    {"gene": "SLC2A1"},
    {"gene": "SLC2A2"},
    {"gene": "SLC2A3"},
    {"gene": "SL

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [01:34<00:00, 19.31it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [00:42<00:00, 42.33it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [00:38<00:00, 47.16it/s]


In [12]:
# Generate gene sets for ensembling.
aclient = openai.AsyncClient()
models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]
for model in models:
    for lib_name in lib_names:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)

        for i in range(4):
            start_time = time.time()
            gen_res["llm_ensemble_" + str(i)] = await llm2geneset.get_genes(aclient,
                                                                            gen_res["descr_cleaned"],
                                                                            model=model)
            end_time = time.time()
            gen_res["gen_time_ensemble_" + str(i)] = end_time - start_time

        with open('libs_human/' + model + '/' + lib_name + '.json', 'w') as json_file:
            json.dump(gen_res, json_file, indent=4)

 15%|██████████████████▎                                                                                                        | 271/1818 [00:05<00:16, 94.38it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Signaling By ERBB2 TMD/JMD Mutants""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Signaling By ERBB2 TMD/JMD Mutants""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {
        "gene": "ERBB2"
    },
    {
        "gene": "EGFR"
    },
    {
        "gene": "GRB2"
    },
    {
        "gene": "SHC1"
    },
    {
        "gene": "PIK3CA"
    },
    {
        "gene": "PTK2"
    },
    {
        "gene": "SRC"
    },
    {


 26%|███████████████████████████████▎                                                                                          | 467/1818 [00:07<00:10, 128.47it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Signaling By PDGFR In Disease""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Signaling By PDGFR In Disease""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "PDGFRA"},
    {"gene": "PDGFRB"},
    {"gene": "PIK3CA"},
    {"gene": "PIK3R1"},
    {"gene": "PTEN"},
    {"gene": "AKT1"},
    {"gene": "AKT2"},
    {"gene": "AKT3"},
    {"gene": "MTOR"},
    {"gene": "MAPK1"},
    {"gene": "MAPK3"},
    {"gene": "RAF1

 51%|█████████████████████████████████████████████████████████████▋                                                            | 919/1818 [00:10<00:05, 162.20it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Potential Therapeutics For SARS""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Potential Therapeutics For SARS""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {
        "gene": "ACE2"
    },
    {
        "gene": "TMPRSS2"
    },
    {
        "gene": "DPP4"
    },
    {
        "gene": "AGT"
    },
    {
        "gene": "IFITM3"
    },
    {
        "gene": "ADAM17"
    },
    {
        "gene": "TMEM16F"
    },
    {

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [00:55<00:00, 32.48it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [00:37<00:00, 49.06it/s]
 64%|████████████████████████████████████████████████████████████████████████████▊                                            | 1155/1818 [00:11<00:05, 121.49it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """SARS-CoV Infections""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """SARS-CoV Infections""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {
        "gene": "ACE2"
    },
    {
        "gene": "TMPRSS2"
    },
    {
        "gene": "DPP4"
    },
    {
        "gene": "CTS


 71%|██████████████████████████████████████████████████████████████████████████████████████▍                                  | 1298/1818 [00:12<00:03, 170.90it/s]

retrying
string indices must be integers, not 'str'
List all the known genes directly and indirectly involved in the following biological process or cellular component """HIV Life Cycle""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """HIV Life Cycle""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    { "gene": "CD4" },
    { "gene": "CCR5" },
    { "gene": "CXCR4" },
    { "gene": "FURIN" },
    { "gene": "APOBEC3G" },
    { "gene": "APOBEC3H" },
    { "gene": "EIF2AK2" },
    { "gene": "TRIM5" },
    { "gene": "SAMHD1" },
    { "gene": "MOV10" },
    { "gene": "SERINC3

 76%|████████████████████████████████████████████████████████████████████████████████████████████▏                            | 1385/1818 [00:13<00:03, 142.95it/s]

retrying
list indices must be integers or slices, not str
List all the known genes directly and indirectly involved in the following biological process or cellular component """FGFR3b Ligand Binding And Activation""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """FGFR3b Ligand Binding And Activation""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "FGFR3"},
    {"gene": "FGF1"},
    {"gene": "FGF2"},
    {"gene": "FGF3"},
    {"gene": "FGF4"},
    {"gene": "FGF5"},
    {"gene": "FGF6"},
    {"gene": "FGF7"},  // Also known as KGFR
    {"gene": "FGF8"},
    {"

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [00:59<00:00, 30.61it/s]
 73%|███████████████████████████████████████████████████████████████████████████████████████▊                                 | 1320/1818 [00:12<00:03, 156.61it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Activation Of TRKA Receptors""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Activation Of TRKA Receptors""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {
        "gene": "NTRK1"
    },
    {
        "gene": "NGF"
    },
    {
        "gene": "SHC1"
    },
    {
        "gene": "GRB2"
    },
    {
        "gene": "SOS1"
    },
    {
        "gene": "PIK3R1"
    },
    {
        "gene": "PIK3CA"
    },
    {
        "g

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [00:51<00:00, 35.05it/s]
