In [8]:
import blitzgsea as blitz
import openai
from pathlib import Path
import json
from gseaai import GSEAAI
# Synchronous OpenAI client. No API key is passed explicitly, so credentials
# must come from the client's defaults (presumably the OPENAI_API_KEY
# environment variable -- confirm against the openai SDK docs).
client = openai.OpenAI()
# GSEAAI helper built around the client; later cells call gsea_ai.get_genes(...).
gsea_ai = GSEAAI(client)

In [9]:
import re

def clean_elements(array):
    """Return a new list with database identifiers stripped from each string.

    Every ``(GO:<digits>)``, ``R-HSA-<digits>`` and ``WP<digits>`` occurrence
    is removed, together with any whitespace directly adjacent to it. Strings
    without such an identifier are returned unchanged. The input is not
    modified.
    """
    identifier_pattern = re.compile(r'\s*\(GO:\d+\)\s*|\s*R-HSA-\d+\s*|\s*WP\d+\s*')
    return [identifier_pattern.sub('', term) for term in array]

In [10]:
# Gene-set libraries to process. NOTE: the first assignment is immediately
# overwritten by the second, so only "GO_Biological_Process_2018" is used by
# the cells below; "KEGG_2019" is currently ignored.
lib_names = ["KEGG_2019", "GO_Biological_Process_2018"]
lib_names = ["GO_Biological_Process_2018"]

In [11]:
# Sanity check: load each selected GMT file and print how many entries it has.
for lib_name in lib_names:
    gmt_path = "libs_yeast/gmt/" + lib_name + ".txt"
    lib = blitz.enrichr.read_gmt(gmt_path)
    print(len(lib))

1649


In [12]:
# Build the cleaned reference libraries: for every selected library, read the
# GMT file, strip database identifiers from the term names, save the result as
# JSON, and keep it in memory under its library name.
yeast_libs = {}
# Make sure the output directory exists so the write below cannot fail on a
# fresh checkout.
Path("libs_yeast/yeast").mkdir(parents=True, exist_ok=True)
for lib_name in lib_names:
    lib = blitz.enrichr.read_gmt("libs_yeast/gmt/" + lib_name + ".txt")
    # Re-key the library by cleaned term name. If two original keys clean to
    # the same string, the later entry overwrites the earlier one.
    lib = dict(zip(clean_elements(list(lib.keys())), lib.values()))
    # Use a context manager so the file is flushed and closed deterministically
    # (the previous bare open() left the handle to the garbage collector).
    with open("libs_yeast/yeast/" + lib_name + '.json', 'w') as json_file:
        json.dump(lib, json_file, indent=4)
    yeast_libs[lib_name] = lib

In [13]:
aclient = openai.AsyncClient()
models = ["gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"]
for model in models:
    for lib_name in lib_names:
        lib = yeast_libs[lib_name]
        ai_lib_json = Path("libs_yeast/" + model + "/" + lib_name + '.json')
        if not ai_lib_json.is_file():
            ai_lib = await gsea_ai.get_genes(aclient, list(lib.keys()), modelg=model, species="yeast")
            json.dump(ai_lib, open(str(ai_lib_json), 'w'), indent=4)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1649/1649 [01:14<00:00, 22.00it/s]


'gene'
RNA polymerase II transcriptional preinitiation complex assembly
```json
[
    {"gene": "TFIIA"},
    {"gene": "TFIIB"},
    {"gene": "TFIID"},
    {"genes": [
        {"gene": "TAF1"},
        {"gene": "TAF2"},
        {"gene": "TAF3"},
        {"gene": "TAF4"},
        {"gene": "TAF5"},
        {"gene": "TAF6"},
        {"gene": "TAF7"},
        {"gene": "TAF9"},
        {"gene": "TAF10"},
        {"gene": "TAF11"},
        {"gene": "TAF12"},
        {"gene": "TAF13"}
    ]},
    {"gene": "TFIIE"},
    {"gene": "TFIIF"},
    {"gene": "TFIIH"}
]
```  
'gene'
cellular protein modification process
```json
[
    {
        "gene": "UBC9"
    },
    {
        "gene": "SMT3"
    },
    {
        "gene": "E1"
    },
    {
        "gene": "UBA1"
    },
    {
        "gene": "E2"
    },
    {
        "gene": "UFD2"
    },
    {
        "gene": "UFD4"
    },
    {
        "gene": "UBC4"
    },
    {
        "gene": "UBC5"
    },
    {
        "gene": "UBI4"
    },
    {
        "gene": "

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.00it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1649/1649 [01:16<00:00, 21.64it/s]


the JSON object must be str, bytes or bytearray, not NoneType
translational elongation
```json
[
    {"gene": "RPL1A"},
    {"gene": "RPL1B"},
    {"gene": "RPL2A"},
    {"gene": "RPL2B"},
    {"gene": "RPL3"},
    {"gene": "RPL4A"},
    {"gene": "RPL4B"},
    {"gene": "RPL5"},
    {"gene": "RPL6A"},
    {"gene": "RPL6B"},
    {"gene": "RPL7A"},
    {"gene": "RPL7B"},
    {"gene": "RPL8A"},
    {"gene": "RPL8B"},
    {"gene": "RPL9A"},
    {"gene": "RPL9B"},
    {"gene": "RPL10"},
    {"gene": "RPL11A"},
    {"gene": "RPL11B"},
    {"gene": "RPL12A"},
    {"gene": "RPL12B"},
    {"gene": "RPL13A"},
    {"gene": "RPL13B"},
    {"gene": "RPL14A"},
    {"gene": "RPL14B"},
    {"gene": "RPL15A"},
    {"gene": "RPL15B"},
    {"gene": "RPL16A"},
    {"gene": "RPL16B"},
    {"gene": "RPL17A"},
    {"gene": "RPL17B"},
    {"gene": "RPL18A"},
    {"gene": "RPL18B"},
    {"gene": "RPL19A"},
    {"gene": "RPL19B"},
    {"gene": "RPL20A"},
    {"gene": "RPL20B"},
    {"gene": "RPL21A"},
    {"gene

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.68s/it]
