In [1]:
import openai
from pathlib import Path
import json
import llm2geneset
import time
import pandas as pd

In [2]:
import re

def clean_elements(array):
    """Strip database identifiers from gene set descriptions.

    Every ``(GO:<digits>)``, ``R-HSA-<digits>`` and ``WP<digits>`` substring is
    removed from each element, together with any whitespace directly before or
    after it.

    Args:
        array: iterable of description strings.

    Returns:
        A new list with the cleaned strings, in the same order as the input.
    """
    identifier_pattern = r'\s*\(GO:\d+\)\s*|\s*R-HSA-\d+\s*|\s*WP\d+\s*'
    return [re.sub(identifier_pattern, '', text) for text in array]

In [3]:
# Candidate gene set libraries. Each name is later turned into the path
# "libs_human/gmt/<name>.txt" by the cells that read the GMT files.
ALL_LIB_NAMES = ["KEGG_2021_Human",
                 "Reactome_2022",
                 "WikiPathway_2023_Human",
                 "GO_Biological_Process_2023",
                 "GO_Molecular_Function_2023",
                 "GO_Cellular_Component_2023"]

# The three pathway libraries (everything except the GO_* libraries).
PATHWAY_LIB_NAMES = ALL_LIB_NAMES[:3]

# Libraries actually processed by the cells below. Assign ALL_LIB_NAMES or
# PATHWAY_LIB_NAMES here for a larger run instead of stacking reassignments.
lib_names = ["KEGG_2021_Human"]

In [4]:
# Generate table for some basic stats on gene set libraries used:
# one row per gene set with its library, description and number of genes.
geneset_sz = []
for lib_name in lib_names:
    (descr, genes) = llm2geneset.read_gmt("libs_human/gmt/" + lib_name + ".txt")
    # Number of gene sets read from this library.
    print(len(genes))
    geneset_sz.extend(
        {"lib_name": lib_name, "descr": d, "num_genes": len(g)}
        for (d, g) in zip(descr, genes)
    )

# Explicit columns keep the TSV header stable even if no library was read.
df = pd.DataFrame(geneset_sz, columns=["lib_name", "descr", "num_genes"])
# index=False is the documented way to omit the row index.
df.to_csv("geneset_sz.tsv", sep="\t", index=False)

320


In [5]:
aclient = openai.AsyncClient()
models = ["gpt-3.5-turbo-0125"]#, "gpt-4o-2024-05-13"]
for model in models:
    for lib_name in lib_names:
        (descr, genes) = llm2geneset.read_gmt("libs_human/gmt/" + lib_name + ".txt")
        # Generate cleaned version of gene set description w/o identifiers.
        descr_cleaned = clean_elements(descr)
        
        # Generate genes sets with a system message with role prompt.
        start_time1 = time.time()
        llm_genes_role = await llm2geneset.get_genes(aclient,
                                                     descr_cleaned,
                                                     model=model,
                                                     prompt_type='basic',
                                                     use_sysmsg=True)
        end_time1 = time.time()
        gen_time_role = end_time1 - start_time1

        # Generate gene sets without role prompt.
        start_time2 = time.time()
        llm_genes_norole = await llm2geneset.get_genes(aclient,
                                                       descr_cleaned,
                                                       model=model)
        end_time2 = time.time()
        gen_time_norole = end_time2 - start_time2

        # Generate gene sets with reasoning.
        start_time3 = time.time()
        llm_genes_reason = await llm2geneset.get_genes(aclient,
                                                       descr_cleaned,
                                                       model=model, 
                                                       prompt_type='reason')
        end_time3 = time.time()
        gen_time_reasoning = end_time3 - start_time3

        start_time4 = time.time()
        llm_genes_conf = await llm2geneset.get_genes(aclient,
                                                     descr_cleaned,
                                                     model=model, 
                                                     prompt_type='conf')
        end_time4 = time.time()
        gen_time_conf = end_time4 - start_time4
        
        # Assemble and save generation results.
        gen_res = {}
        gen_res["lib_name"] = lib_name
        gen_res["model"] = model
        gen_res["descr"] = descr
        gen_res["descr_cleaned"] = descr_cleaned        
        gen_res["gen_time_role"] = gen_time_role
        gen_res["gen_time_norole"] = gen_time_norole
        gen_res["gen_time_reasoning"] = gen_time_reasoning
        gen_res["gen_time_conf"] = gen_time_conf
        gen_res["curated_genesets"] = genes
        gen_res["llm_genes_role"] = llm_genes_role
        gen_res["llm_genes_norole"] = llm_genes_norole
        gen_res["llm_genes_reason"] = llm_genes_reason
        gen_res["llm_genes_conf"] = llm_genes_conf
    
        with open('libs_human/' + model + '/' + lib_name + '.json', 'w') as json_file:
            json.dump(gen_res, json_file, indent=4)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [00:18<00:00, 17.11it/s]
  6%|██████▉                                                                                                              | 19/320 [00:01<00:14, 21.04it/s]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Rap1 signaling pathway""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Rap1 signaling pathway""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "RAP1A"},
    {"gene": "RAP1B"},
    {"gene": "


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [00:23<00:00, 13.69it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [00:14<00:00, 22.81it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [00:10<00:00, 30.03it/s]


In [7]:
# Show only the first few results instead of dumping the whole list.
llm_genes_conf[:3]

[{'genes': ['ABCA1', 'ABCB1', 'ABCD1', 'ABCC1', 'ABCG2'],
  'parsed_genes': ['ABCA1', 'ABCB1', 'ABCC1', 'ABCG2', 'ABCD1'],
  'reason': ['', '', '', '', ''],
  'conf': ['high', 'high', 'high', 'medium', 'medium'],
  'in_toks': 215,
  'out_toks': 106,
  'ntries': 1},
 {'genes': ['S100B', 'MAPK8', 'JAK2', 'AGER', 'NFKB1', 'AKT1'],
  'parsed_genes': ['AGER', 'S100B', 'MAPK8', 'NFKB1', 'AKT1', 'JAK2'],
  'reason': ['', '', '', '', '', ''],
  'conf': ['high', 'medium', 'low', 'medium', 'medium', 'low'],
  'in_toks': 225,
  'out_toks': 95,
  'ntries': 1},
 {'genes': ['PRKAA1',
   'PRKAA2',
   'PPARA',
   'PRKAG3',
   'PRKAB1',
   'PRKAG1',
   'PRKAG2',
   'PRKAB2',
   'PPARGC1A'],
  'parsed_genes': ['PRKAA1',
   'PRKAA2',
   'PRKAB1',
   'PRKAB2',
   'PRKAG1',
   'PRKAG2',
   'PRKAG3',
   'PPARA',
   'PPARGC1A'],
  'reason': ['', '', '', '', '', '', '', '', ''],
  'conf': ['high',
   'high',
   'high',
   'high',
   'high',
   'high',
   'high',
   'medium',
   'medium'],
  'in_toks': 217,
  

In [5]:
# Ensembl approach, use re-occuring genes to define gene set.
aclient = openai.AsyncClient()
models = ["gpt-3.5-turbo-0125"]
#models = ["gpt-4o-2024-05-13"]
for model in models:
    for lib_name in lib_names:
        with open("libs_human/" + model + "/" + lib_name + ".json") as f:
            gen_res = json.load(f)
                
        llm_genes = []
        for i in range(4):
            llm_genes_cur = await llm2geneset.get_genes(aclient,
                                                        gen_res["descr_cleaned"],
                                                        model=model, use_sysmsg=True)
            llm_genes.append(llm_genes_cur)

        llm_ensembled = llm2geneset.ensemble_genes(gen_res["descr_cleaned"], llm_genes, 4)
    
        # Add ensembled results.
        gen_res["llm_ensembled"] = llm_ensembled
        
        with open('libs_human/' + model + '/' + lib_name + '.json', 'w') as json_file:
            json.dump(gen_res, json_file, indent=4)

 72%|██████████████████████████████████████████████████████████████████████████████████████▎                                 | 230/320 [00:06<00:02, 41.01it/s]

retrying
list indices must be integers or slices, not str
List all the known genes directly and indirectly involved in the following biological process or cellular component """Sulfur metabolism""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Sulfur metabolism""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "CBS"},
    {"gene": "CDO1"},
    {"gene": "CDO2"},
    {"gene": "ETHE1"},
    {"gene": "GCLC"},
    {"gene": "GLRX"},
    {"gene": "GLRX3"},
    {"gene": "GLRX5"},
    {"gene": "GSR"},
    {"gene": "GSS"},
    {"gene": "MARS"},
    {"gene": "ME1"},
   

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [00:33<00:00,  9.49it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [00:45<00:00,  7.06it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [00:25<00:00, 12.65it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 320/320 [01:29<00:00,  3.58it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [00:55<00:00, 32.95it/s]
 65%|███████████████████████████████████████████████████████████████████████████▋                                         | 1177/1818 [00:10<00:04, 160.19it/s]

retrying
'gene'
List all the known genes directly and indirectly involved in the following biological process or cellular component """Telomere C-strand (Lagging Strand) Synthesis""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Telomere C-strand (Lagging Strand) Synthesis""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "WRN"},
    {"gene": "RECQL4"},
    {"gene": "TEN1"},
    {"gene": "CTC1"},
    {"gene": "STN1"},
    {"gene": "POLA1"},
    {"gene": "POLA2"},
    {"gene": "PRIM2"},
    {"gene": "FEN1"},
    {"gene": "RPA1"},
    {"gene": "RPA2"},
    {"ge

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [00:54<00:00, 33.49it/s]
 82%|███████████████████████████████████████████████████████████████████████████████████████████████▊                     | 1489/1818 [00:12<00:02, 142.72it/s]

retrying
'gene'
List all the known genes directly and indirectly involved in the following biological process or cellular component """NR1H2 And NR1H3 Regulate Gene Expression To Limit Cholesterol Uptake""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """NR1H2 And NR1H3 Regulate Gene Expression To Limit Cholesterol Uptake""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "NR1H2"},
    {"gene": "NR1H3"},
    {"gene": "ABCA1"},
    {"gene": "ABCG1"},
    {"gene\": "SREBF2"},
    {"gene": "LDLR"},
    {"gene": "HMGCR"},
    {"gene": "HMGCS1"}
]
```


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 1817/1818 [00:38<00:02,  2.75s/it]

retrying
No code blocks found
List all the known genes directly and indirectly involved in the following biological process or cellular component """Platelet Activation, Signaling And Aggregation""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Platelet Activation, Signaling And Aggregation""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "GP6"},
    {"gene": "ITGA2B"},
    {"gene": "ITGB3"},
    {"gene": "ITGAV"},
    {"gene": "GP1BA"},
    {"gene": "GP1BB"},
    {"gene": "GP1BC"},
    {"gene": "F2R"},
    {"gene": "F2R"},
    {"gene": "GP9"},
    {"gene": 

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [01:48<00:00, 16.78it/s]
 18%|█████████████████████▊                                                                                                | 336/1818 [00:04<00:11, 128.87it/s]

retrying
string indices must be integers, not 'str'
List all the known genes directly and indirectly involved in the following biological process or cellular component """Disease""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Disease""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string"
            }
        },
        "required": [
            "gene"
        ]
    },
    "genes": [
        {"gene": "BRCA1"},
        {"gene": "BRCA2"},

 25%|█████████████████████████████                                                                                         | 447/1818 [00:05<00:11, 115.56it/s]

retrying
string indices must be integers, not 'str'
List all the known genes directly and indirectly involved in the following biological process or cellular component """Long-term Potentiation""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Long-term Potentiation""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {"gene": "CREB1"},
    {"gene": "CAMK2A"},
    {"gene": "CAMK2B"},
    {"gene": "CALM1"},
    {"gene": "GRIN1"},
    {"gene": "GRIN2A"},
    {"gene": "GRIN2B"},
    {"gene": "PRKCG"},
    {"gene": "PRKG1"},
    {"gene": "MAPK1"},
    {"gene": "MAPK3"},
    {

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818/1818 [00:42<00:00, 42.83it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [01:49<00:00,  7.33it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:23<00:00, 34.03it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:47<00:00, 16.88it/s]
  3%|███▋                                                                                                                     | 24/801 [00:02<00:37, 20.93it/s]

retrying
unhashable type: 'list'
List all the known genes directly and indirectly involved in the following biological process or cellular component """Codeine And Morphine Metabolism""". Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "gene": {
                "type": "string",
            }
        },
        "required": ["gene"]
    }
}
```
The field `gene` is a gene involved in the following biological process or cellular component: """Codeine And Morphine Metabolism""". Use the HUGO Gene Nomenclature Committee (HGNC) gene abbreviations. Place the output in a JSON code block. Do not add any comments in the JSON code block.

```json
[
    {
        "gene": "CYP2D6"
    },
    {
        "gene": "CYP3A4"
    },
    {
        "gene": "UGT2B7"
    },
    {
        "gene": ["CYP2C9", "CYP2C19", "CYP2B6", "CYP3A5"]
    },
    {
        "gene": ["COMT", "OPRM1"]
    }
]
```  


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 801/801 [00:23<00:00, 34.30it/s]
