In [1]:
import openai
import llm2geneset
import pandas as pd
import random

# Asynchronous OpenAI client that is handed to the llm2geneset.gsai and
# llm2geneset.gs_proposal calls in this notebook.
# NOTE(review): no credentials are passed explicitly, so the client presumably
# picks them up from the environment (e.g. OPENAI_API_KEY) — confirm before running.
aclient = openai.AsyncClient()

In [2]:
# Read the tab-separated AL002-vs-isotype stats table and, in the same
# expression, keep only the rows whose PercentDetectedSamples is above 0.9.
al002 = (
    pd.read_csv("data/Stats.TECAN.AL002_vs_isotype.txt", sep="\t")
    .loc[lambda stats: stats["PercentDetectedSamples"] > 0.9]
)

In [3]:
# Keep the rows with padj below 0.1; the values of their `gene` column form
# the gene list used by every enrichment call further down.
al002_sig = al002.loc[al002["padj"] < 0.1]
genes = al002_sig["gene"].tolist()

In [4]:
# Put the selected genes into a fixed pseudo-random order.
# `genes` is rebuilt from `al002_sig` before shuffling so that this cell is
# idempotent: shuffling the already-shuffled list in place would generally
# give a different permutation on every re-execution, even though the seed is
# reset each time. On a first run the result is identical to shuffling the
# list produced by the previous cell.
random.seed(138337)
genes = al002_sig["gene"].to_list()
random.shuffle(genes)

In [5]:
# Show the shuffled gene symbols, one per line.
print(*genes, sep="\n")

LILRB2
MX1
GSN
MFGE8
CCL3
GPD1
TRPV2
DHCR24
TNFAIP3
MATK
CXCL8
CHIT1
TPRKB
RASSF4
SMC6
FOS
RHOBTB2
RNF19B
FBP1
SLC11A1
DCSTAMP
HSD3B7
OAS2
TGM2
MCEMP1
FAM20C
ACP5
CD9
AP5B1


In [6]:
gsai_res = await llm2geneset.gsai(aclient, [genes], model="gpt-4o", n_retry=3)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.09s/it]


In [7]:
# Display the gsai result as the cell output: a list with one dict per input
# gene list, carrying the keys 'name', 'conf' and 'annot'.
gsai_res

[{'name': 'Immune Response and Inflammatory Regulation',
  'conf': 0.85,
  'annot': ['1. **LILRB2 (Leukocyte immunoglobulin-like receptor subfamily B member 2)** and **SLC11A1 (Solute Carrier Family 11 Member 1)** are involved in the regulation of immune cell activation and phagosome-lysosome fusion, respectively. They play roles in modulating immune responses, particularly in macrophages and other antigen-presenting cells.',
   '2. **CCL3 (Chemokine (C-C motif) ligand 3)**, **CXCL8 (C-X-C motif chemokine ligand 8)**, and **TNFAIP3 (Tumor necrosis factor alpha-induced protein 3)** are critical chemokines and regulatory proteins involved in inflammatory responses. They recruit and activate various leukocytes to infection sites and modulate inflammation through cytokine signaling.',
   "3. **MX1 (Myxovirus resistance protein 1)** and **OAS2 (2'-5'-Oligoadenylate synthase 2)** are interferon-induced proteins that play essential roles in antiviral defense. They contribute to the host's imm

In [8]:
context_str = ""
llm2geneset_no_context_res = await llm2geneset.gs_proposal(aclient, [genes], model="gpt-4o", context=context_str,
                                                           n_pathways=100, n_retry=3)

  0%|                                                                                                                                      | 0/1 [00:00<?, ?it/s]

List 100 biological pathways, biological processes, or cellular components that contain the following genes """LILRB2,MX1,GSN,MFGE8,CCL3,GPD1,TRPV2,DHCR24,TNFAIP3,MATK,CXCL8,CHIT1,TPRKB,RASSF4,SMC6,FOS,RHOBTB2,RNF19B,FBP1,SLC11A1,DCSTAMP,HSD3B7,OAS2,TGM2,MCEMP1,FAM20C,ACP5,CD9,AP5B1""" with high confidence. Be as specific as possible. List non-overlapping pathways, processes, or components. Do not include the gene names in the outputs. Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "p": {
                "type": "string",
            },
        },
        "required": ["p"]
    }
}
```
Example output will look like the following:
```json
[{"p":"BP or Pathway 1"},
 {"p":"BP or Pathway 2"},
 {"p":"BP or Pathway 3"},
 {"p":"BP or Pathway 4"}
```
The element `p` designates a pathway, biological process or cellular component. Place the output in a JSON code block. Do not add any comments in the JSON c

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:34<00:00, 34.64s/it]


In [23]:
# Take the "ora_results" table for the single gene list of the no-context run,
# write it to a tab-separated file, and display the rows with p_adj < 0.05.
df = llm2geneset_no_context_res[0]["ora_results"]
# `index` is a boolean option of DataFrame.to_csv; pass False explicitly
# rather than relying on None being treated as falsy. The row index is not
# written either way.
df.to_csv("no_context_AL002.tsv", sep="\t", index=False)
df[df["p_adj"] < 0.05]

Unnamed: 0,set_descr,generatio,bgratio,richFactor,foldEnrich,p_val,p_adj,intersection,set_genes,ngenes,nset,ninter,in_toks,out_toks
83,Regulation of bone resorption,0.137931,0.002016,0.1,68.434483,3.258229e-07,3e-05,"ACP5,CCL3,DCSTAMP,CXCL8","S1PR2,CALCR,ESR2,CSF1,OPG,DCSTAMP,PTGS2,CSF1R,...",29,40,4,160,361
70,Cellular response to interleukin-1,0.103448,0.00131,0.115385,78.962865,7.161642e-06,0.000333,"FOS,TNFAIP3,CXCL8","STAT3,PTGS2,MAPK3,ICAM1,TNF,CXCL10,RELA,IL1RL1...",29,26,3,166,343
18,Gluconeogenesis,0.068966,0.000806,0.125,85.543103,0.0002449968,0.00645,"FBP1,GPD1","PFKL,FBP1,GYS1,GYS2,PCK1,PFKP,GCK,G6PC,FBP2,PD...",29,16,2,156,208
67,Osteoclast fusion,0.068966,0.000857,0.117647,80.511156,0.0002774111,0.00645,"FOS,DCSTAMP","NFATC1,CTSK,RANKL,CSF1,TNFRSF11A,CD47,OCSTAMP,...",29,17,2,158,231
68,Positive regulation of cytokine production,0.068966,0.001159,0.086957,59.508246,0.0005132635,0.008736,"TNFAIP3,CXCL8","TLR4,STAT3,SOCS3,TNF,BTK,IL12B,MAPK8,RELA,JAK1...",29,23,2,160,208
81,Regulation of inflammatory response,0.068966,0.00126,0.08,54.747586,0.0006075094,0.008736,"FOS,CXCL8","TLR4,NFKBIA,CXCR2,STAT3,PTGS2,CYLD,TNF,PPARG,R...",29,25,2,158,325
42,Cytokine production,0.068966,0.00131,0.076923,52.64191,0.0006575384,0.008736,"CCL3,CXCL8","IL18,IL4,TNF,CXCL10,IL8,IL12B,LTA,CSF2,CCL5,IL...",29,26,2,158,332
36,Type I interferon-mediated signaling pathway,0.068966,0.001764,0.057143,39.105419,0.001194016,0.012526,"MX1,OAS2","IFNA10,IFNAR1,IFNA17,IRF1,IFNA14,JAK1,OAS3,IFN...",29,35,2,162,457
20,Osteoclast differentiation,0.068966,0.001864,0.054054,36.991612,0.001334074,0.012526,"FOS,DCSTAMP","RAC1,SRC,PLEKHM1,CALCR,CSF1,MTOR,OPG,DCSTAMP,C...",29,37,2,158,489
45,Defense response to virus,0.068966,0.001915,0.052632,36.018149,0.001406914,0.012526,"MX1,OAS2","RNASEL,TLR9,STAT3,BST2,TRIM25,IRF1,EIF2AK2,OAS...",29,38,2,156,492


In [16]:
context_str = "microglia treated with a TREM2 agonist antibody"
llm2geneset_context_res = await llm2geneset.gs_proposal(aclient, [genes], model="gpt-4o", context=context_str,
                                                        n_pathways=100, n_retry=3)

  0%|                                                                                                                                      | 0/1 [00:00<?, ?it/s]

List 100 biological pathways, biological processes, or cellular components that contain the following genes """LILRB2,MX1,GSN,MFGE8,CCL3,GPD1,TRPV2,DHCR24,TNFAIP3,MATK,CXCL8,CHIT1,TPRKB,RASSF4,SMC6,FOS,RHOBTB2,RNF19B,FBP1,SLC11A1,DCSTAMP,HSD3B7,OAS2,TGM2,MCEMP1,FAM20C,ACP5,CD9,AP5B1""" with high confidence. Also consider the following context as related to the genes: """microglia treated with a TREM2 agonist antibody""" when selecting pathways, processes, and components. List non-overlapping pathways, processes, or components. Do not include the gene names in the outputs. Use the following JSON schema:
```json
{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "p": {
                "type": "string",
            },
        },
        "required": ["p"]
    }
}
```
Example output will look like the following:
```json
[{"p":"BP or Pathway 1"},
 {"p":"BP or Pathway 2"},
 {"p":"BP or Pathway 3"},
 {"p":"BP or Pathway 4"}
```
The element `p` d

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:23<00:00, 23.61s/it]


In [24]:
# Take the "ora_results" table for the single gene list of the with-context
# run, write it to a tab-separated file, and display the rows with p_adj < 0.05.
df = llm2geneset_context_res[0]["ora_results"]
# `index` is a boolean option of DataFrame.to_csv; pass False explicitly
# rather than relying on None being treated as falsy. The row index is not
# written either way.
df.to_csv("with_context_AL002.tsv", sep="\t", index=False)
df[df["p_adj"] < 0.05]

Unnamed: 0,set_descr,generatio,bgratio,richFactor,foldEnrich,p_val,p_adj,intersection,set_genes,ngenes,nset,ninter,in_toks,out_toks
22,Regulation of inflammatory response,0.103448,0.002116,0.071429,48.881773,3.1e-05,0.002988,"FOS,TNFAIP3,CXCL8","TLR4,NFKBIA,TAB1,CEBPB,CXCR2,STAT3,PTGS2,CSF3,...",29,42,3,158,551
63,Positive regulation of cytokine production,0.068966,0.000756,0.133333,91.245977,0.000215,0.010299,"TNFAIP3,CXCL8","TLR4,MAPK8,NOD2,IL6,IFNG,TNFAIP3,NFkB1,IL18,ST...",29,15,2,160,136
11,Chemotaxis,0.068966,0.001209,0.083333,57.028736,0.000559,0.01403,"CCL3,CXCL8","RAC1,PTGER2,PLA2G4A,CXCR1,CCR5,CXCR2,IL8,CCR7,...",29,24,2,154,319
24,Interleukin-1-mediated signaling pathway,0.068966,0.001411,0.071429,48.881773,0.000763,0.01403,"TNFAIP3,CXCL8","IRAK4,NFKBIA,IRAK2,TNFAIP2,TNF,RELA,IL1RL2,IL1...",29,28,2,164,381
19,Inflammatory response to antigenic stimulus,0.068966,0.001461,0.068966,47.196195,0.000819,0.01403,"FOS,CXCL8","TLR4,STAT6,CEBPB,STAT3,IL4,PTGS2,MAPK3,TNF,REL...",29,29,2,164,255
21,Neuroinflammation,0.068966,0.001512,0.066667,45.622989,0.000877,0.01403,"TNFAIP3,CXCL8","TLR4,NFKBIA,STAT3,CCR5,PTGS2,CSF1R,TNF,CXCL10,...",29,30,2,156,394
8,Antiviral defense,0.068966,0.001915,0.052632,36.018149,0.001407,0.019295,"MX1,OAS2","TRIM5,BST2,EIF2AK2,RELA,OAS3,MX1,IRF3,OASL,IFI...",29,38,2,156,492
29,Viral defense mechanism,0.068966,0.002066,0.04878,33.382675,0.001637,0.019639,"MX1,OAS2","TRIM5,BST2,AIM2,RELA,ZAP,OAS3,MX1,CASP4,IRF3,C...",29,41,2,156,529
20,Antiviral innate immunity,0.068966,0.002318,0.043478,29.754123,0.002056,0.021935,"MX1,OAS2","RNASEL,MAVS,NFKBIA,TLR9,BST2,TRIM25,NFAT5,TBK1...",29,46,2,158,417


In [27]:
# Reference comparison: load the KEGG 2021 Human gene sets from the GMT file
# and run llm2geneset.simple_ora on the same gene list against them.
set_descr, gene_sets = llm2geneset.read_gmt("libs_human/gmt/KEGG_2021_Human.txt")
df = llm2geneset.simple_ora(genes, set_descr, gene_sets)
# `index` is a boolean option of DataFrame.to_csv; pass False explicitly
# rather than relying on None being treated as falsy.
df.to_csv("KEGG_AL002.tsv", sep="\t", index=False)
# Display the gene sets with p_adj < 0.05 (same cutoff as the LLM-based runs).
df[df["p_adj"] < 0.05]

Unnamed: 0,set_descr,generatio,bgratio,richFactor,foldEnrich,p_val,p_adj,intersection,set_genes,ngenes,nset,ninter
251,Rheumatoid arthritis,0.137931,0.004686,0.043011,29.434186,1e-05,0.003157,"ACP5,FOS,CCL3,CXCL8","ATP6V1C1,CD86,CSF1,ATP6V1H,HLA-DPA1,CTLA4,LTB,...",29,93,4
161,Measles,0.137931,0.007004,0.028777,19.693376,4.8e-05,0.007686,"FOS,TNFAIP3,MX1,OAS2","IL2RA,TP73,IFNA10,BCL2L1,EIF2AK2,AKT1,OAS3,IRF...",29,139,4
173,NOD-like receptor signaling pathway,0.137931,0.00912,0.022099,15.123643,0.000134,0.014265,"TRPV2,OAS2,TNFAIP3,CXCL8","RNASEL,CASR,PLCB4,DEFA4,RIPK3,TRPM7,VDAC2,IFNA...",29,181,4
136,IL-17 signaling pathway,0.103448,0.004736,0.031915,21.840792,0.000345,0.021238,"FOS,TNFAIP3,CXCL8","IL5,IL17RA,MMP3,CHUK,CXCL5,MMP13,JUN,MMP1,TRAF...",29,94,3
64,Coronavirus disease,0.137931,0.01169,0.017241,11.799049,0.000345,0.021238,"FOS,OAS2,MX1,CXCL8","RPL37A,CGAS,IFNA10,RPS13,RPL22,RPL19,RPL8,RPS9...",29,232,4
50,Chagas disease,0.103448,0.00514,0.029412,20.127789,0.000439,0.021238,"CCL3,FOS,CXCL8","PLCB4,GNA11,AKT1,AKT3,CHUK,PIK3CA,PPP2R1B,JUN,...",29,102,3
290,Toll-like receptor signaling pathway,0.103448,0.00524,0.028846,19.740716,0.000465,0.021238,"CCL3,FOS,CXCL8","CD86,IFNA10,LY96,AKT1,TOLLIP,IRF3,AKT3,CHUK,TL...",29,104,3
191,Osteoclast differentiation,0.103448,0.006399,0.023622,16.165626,0.000832,0.03326,"ACP5,FOS,LILRB2","SQSTM1,CSF1,SIRPB1,PPP3CC,AKT1,AKT3,CHUK,PPP3R...",29,127,3
