In this notebook, we use the OpenAI GPT API to choose relevant Enformer tracks for phenotypes.
The SAD track IDs and descriptions are taken from `target_dnase_ataq_tracks.csv`.

In [13]:
from openai import OpenAI
from enum import Enum
from pydantic import BaseModel
import os

By using an .env file, we can store the API key outside of version control.

To run the file, create the .env file and add:

```OPENAI_API_KEY="My API Key"```

Also, be sure to add your .env file to .gitignore if this is not already the case.

In [10]:
from dotenv import load_dotenv

# Read the key/value pairs from the local .env file (e.g. OPENAI_API_KEY) so
# the API key never has to appear in the notebook itself. The call is the last
# expression of the cell, so its boolean result is shown as the cell output.
load_dotenv(dotenv_path=".env")

True

Because of the way the GPT API handles structured output, we first create an output class using enumerated string literals.

(There might be a more succinct way to create the Enum; here I just used copy and paste.)

In [3]:
class tissues1(str, Enum):
    # Track vocabulary, part 1 (the list continues in `tissues2`).
    #
    # Member name  = track id (ENCFF... accession).
    # Member value = the track's free-text description, copied from
    #                target_dnase_ataq_tracks.csv.
    #
    # The `str` mixin makes every member a real string equal to its
    # description, which is what an enumerated string literal needs.
    #
    # NOTE: several track ids share the same description (e.g. "GM12878",
    # "K562", "cardiac mesoderm"). Enum treats a repeated value as an *alias*
    # of the first member that used it, so e.g. `tissues1.ENCFF915DFR` is the
    # very same object as `tissues1.ENCFF093VXI`. Iterating the class, looking
    # a member up by value, and reading `.name` therefore only ever expose the
    # FIRST id declared for a description. Use `track_ids` to recover all of
    # them.
    # NOTE(review): `tissues2` appears to have the same aliasing and would
    # benefit from the same helper.
    #
    # Documented with comments rather than a class docstring on purpose, so the
    # class `__doc__` (which schema generators may surface) stays unchanged.
    ENCFF833POA = "cerebellum male adult (27 years) and male adult (35 years)"
    ENCFF110QGM = "frontal cortex male adult (27 years) and male adult (35 years)"
    ENCFF880MKD = "chorion"
    ENCFF463ZLQ = "Ishikawa treated with 0.02% dimethyl sulfoxide for 1 hour"
    ENCFF890OGQ = "GM03348"
    ENCFF996AEF = "GM03348 genetically modified using transduction treated with 3 ug/mL doxycycline for 10 days"
    ENCFF660YSU = "AG08395"
    ENCFF787MSC = "AG08396"
    ENCFF568LMQ = "AG20443"
    ENCFF685MZL = "frontal cortex female adult (67 years) and female adult (80 years)"
    ENCFF452DLO = "H54"
    ENCFF149TAY = "GM10248"
    ENCFF093VXI = "GM12878"
    ENCFF713YTW = "GM12891"
    ENCFF536IEC = "GM12892"
    ENCFF714ZSE = "GM18507"
    ENCFF385ITM = "GM19238"
    ENCFF919WIT = "GM19239"
    ENCFF814MPK = "GM19240"
    ENCFF153XQL = "H1-hESC"
    ENCFF753SCX = "H7-hESC"
    ENCFF663TLI = "H9"
    ENCFF082SFS = "heart male adult (27 years) and male adult (35 years)"
    ENCFF148BGE = "HEK293T"
    ENCFF422TAV = "HeLa-S3 treated with interferon alpha for 4 hours"
    ENCFF050BDU = "HeLa-S3"
    ENCFF818FXA = "hepatocyte"
    ENCFF136DBS = "HepG2"
    ENCFF284JHO = "HTR-8/SVneo"
    ENCFF305BCB = "endothelial cell of umbilical vein newborn"
    ENCFF015IAT = "CWRU1 male"
    ENCFF565AEK = "iPS-NIHi11 male adult (71 year) originated from AG20443"
    ENCFF637TYS = "iPS-NIHi7 female adult (85 years) originated from AG08395"
    ENCFF899YDP = "K562 treated with 1 uM vorinostat for 72 hours"
    ENCFF515UNC = "K562 G2 phase"
    ENCFF708UIS = "K562 G1 phase"
    ENCFF809IIW = "LNCaP clone FGC"
    ENCFF009NEA = "LNCaP clone FGC treated with 1 nM 17B-hydroxy-17-methylestra-4,9,11-trien-3-one for 12 hours"
    ENCFF382YNA = "MCF-7 originated from MCF-7"
    ENCFF585BKY = "medulloblastoma"
    ENCFF863UGY = "epidermal melanocyte"
    ENCFF678LXL = "CD14-positive monocyte female"
    ENCFF690MIA = "keratinocyte female"
    ENCFF300IKA = "osteoblast"
    ENCFF927VUO = "psoas muscle male adult (27 years) and male adult (35 years)"
    ENCFF613XPC = "T47D treated with 10 nM 17B-estradiol for 30 minutes"
    ENCFF300QLZ = "urothelium cell line"
    ENCFF688CJL = "A549"
    ENCFF802ZBQ = "AG04449"
    ENCFF882YMD = "AG04450"
    ENCFF160AIM = "AG09309"
    ENCFF294KLB = "AG09319"
    ENCFF211SDO = "AG10803"
    ENCFF869ECL = "fibroblast of the aortic adventitia female"
    ENCFF695WLT = "BE2C"
    ENCFF107IJY = "BJ"
    ENCFF132EIR = "HS-27A"
    ENCFF964WLS = "HS-5"
    ENCFF747REW = "stromal cell of bone marrow male"
    ENCFF085OHV = "Caco-2"
    ENCFF801IWM = "B cell female adult (43 years)"
    ENCFF274MGW = "hematopoietic multipotent progenitor cell"
    ENCFF753ETN = "naive thymus-derived CD4-positive, alpha-beta T cell male adult (26 years)"
    ENCFF391HDR = "CMK"
    ENCFF171BUA = "GM04503"
    ENCFF602PIH = "GM04504"
    ENCFF617HEH = "GM06990"
    ENCFF709TDR = "GM12864"
    ENCFF427FEF = "GM12865"
    ENCFF915DFR = "GM12878"
    ENCFF131HMO = "H1-hESC"
    ENCFF554DWH = "cardiac mesoderm"
    ENCFF274NDO = "cardiac mesoderm"
    ENCFF501KNP = "cardiac mesoderm"
    ENCFF213FJN = "cardiac mesoderm"
    ENCFF696IEW = "H7-hESC"
    ENCFF499UDS = "astrocyte of the hippocampus"
    ENCFF901UBX = "astrocyte of the spinal cord"
    ENCFF382FZE = "astrocyte of the cerebellum"
    ENCFF102UQK = "amniotic epithelial cell"
    ENCFF025HHG = "brain microvascular endothelial cell"
    ENCFF353QSZ = "brain pericyte"
    ENCFF443IYY = "smooth muscle cell of the brain vasculature female"
    ENCFF049FGZ = "cardiac fibroblast"
    ENCFF472TFO = "cardiac fibroblast female"
    ENCFF484UXW = "cardiac muscle cell"
    ENCFF923EUB = "fibroblast of the conjunctiva"
    ENCFF021NBR = "choroid plexus epithelial cell"
    ENCFF169PCK = "HCT116"
    ENCFF655LKW = "epithelial cell of esophagus"
    ENCFF830FDL = "HeLa-S3 G1b phase"
    ENCFF205TKQ = "HepG2"
    ENCFF111BCZ = "foreskin fibroblast male newborn"
    ENCFF103JVK = "HFF-Myc originated from foreskin fibroblast"
    ENCFF117RGP = "fibroblast of gingiva"
    ENCFF390MSL = "iris pigment epithelial cell"
    ENCFF634ZUJ = "HL-60"
    ENCFF263BZU = "mammary epithelial cell female"
    ENCFF724CMH = "fibroblast of mammary gland female"
    ENCFF969NNQ = "dermis blood vessel endothelial cell female adult"
    ENCFF250ZZO = "dermis blood vessel endothelial cell female adult"
    ENCFF189TQN = "dermis blood vessel endothelial cell male newborn"
    ENCFF130RBG = "dermis microvascular lymphatic vessel endothelial cell female"
    ENCFF431JNM = "dermis microvascular lymphatic vessel endothelial cell male"
    ENCFF468FOP = "dermis blood vessel endothelial cell male newborn"
    ENCFF010NPI = "lung microvascular endothelial cell female"
    ENCFF999UJH = "lung microvascular endothelial cell female"
    ENCFF714QQU = "non-pigmented ciliary epithelial cell"
    ENCFF719ZEX = "pulmonary artery endothelial cell female"
    ENCFF184UHW = "fibroblast of pulmonary artery"
    ENCFF519CAJ = "fibroblast of peridontal ligament male"
    ENCFF655AEV = "fibroblast of lung"
    ENCFF457RRO = "renal cortical epithelial cell"
    ENCFF221IPJ = "kidney epithelial cell"
    ENCFF656ZYW = "glomerular endothelial cell"
    ENCFF634YGE = "retinal pigment epithelial cell"
    ENCFF549TYD = "skeletal muscle myoblast"
    ENCFF684FJC = "myotube originated from skeletal muscle myoblast"
    ENCFF752DYE = "endothelial cell of umbilical vein newborn"
    ENCFF652HJH = "fibroblast of villous mesenchyme"
    ENCFF916JRN = "Jurkat clone E61"
    ENCFF413AHU = "K562"
    ENCFF868NHV = "K562"
    ENCFF565YDB = "K562"
    ENCFF748YJM = "myocyte originated from LHCN-M2"
    ENCFF639MPM = "LHCN-M2"
    ENCFF571HTM = "LNCaP clone FGC"
    ENCFF250TLK = "M059J"
    ENCFF924FJR = "MCF-7"
    ENCFF606KFN = "MCF-7 treated with 100 nM estradiol for 1 hour"
    ENCFF411ZPI = "MCF-7"
    ENCFF659BVQ = "CD14-positive monocyte female"
    ENCFF897EZG = "NB4"
    ENCFF320CHE = "astrocyte"
    ENCFF940PZT = "bronchial epithelial cell female treated with retinoic acid"
    ENCFF302JEV = "fibroblast of dermis female adult"
    ENCFF861ULS = "foreskin fibroblast male newborn"
    ENCFF767YBP = "keratinocyte female"
    ENCFF330KHN = "fibroblast of lung male adult (45 years)"
    ENCFF644NWU = "NT2/D1"
    ENCFF698HGI = "Panc1"
    ENCFF739CBQ = "epithelial cell of prostate"
    ENCFF991YWM = "RPMI-7951"
    ENCFF499KPZ = "epithelial cell of proximal tubule"
    ENCFF827VFY = "bronchial epithelial cell"
    ENCFF440XZH = "SK-N-MC"
    ENCFF851VBR = "SK-N-SH treated with 6 uM all-trans-retinoic acid for 48 hours"
    ENCFF627SBE = "skeletal muscle cell"
    ENCFF367ZVE = "T47D"
    ENCFF479VHW = "T-helper 1 cell"
    ENCFF204BQM = "T-helper 1 cell female adult (26 years)"
    ENCFF795TGF = "T-helper 1 cell male adult (33 years)"
    ENCFF566QDG = "T-helper 17 cell"
    ENCFF335KDD = "T-helper 2 cell"
    ENCFF768PJS = "T-helper 2 cell female adult (26 years)"
    ENCFF438GIW = "T-helper 2 cell male adult (33 years)"
    ENCFF313YWE = "regulatory T cell female adult (35 years)"
    ENCFF622JVD = "regulatory T cell male adult (28 years)"
    ENCFF369NZI = "WERI-Rb-1"
    ENCFF189ULK = "WI38 genetically modified using stable transfection originated from WI38 treated with 20 nM afimoxifene for 72 hours"
    ENCFF040DSN = "WI38 genetically modified using stable transfection originated from WI38"
    ENCFF953AFG = "HT1080"
    ENCFF810XTS = "SK-MEL-5"
    ENCFF741DDM = "SJCRH30"
    ENCFF143RMC = "NCI-H460"
    ENCFF989ALW = "SK-N-DZ treated with dimethyl sulfoxide for 72 hours"
    ENCFF938XUM = "GM23338 male adult (53 years) originated from GM23248"
    ENCFF362UJZ = "lung embryo (112 days)"
    ENCFF820SGC = "SK-MEL-5"
    ENCFF739XXP = "muscle of back female embryo (113 days)"
    ENCFF694RNX = "iPS DF 4.7 male newborn"
    ENCFF461CDQ = "right renal cortex interstitium male embryo (105 days)"
    ENCFF498QFP = "caudate nucleus male adult (78 years)"
    ENCFF245IYM = "left renal pelvis male embryo (105 days)"
    ENCFF855CJL = "H9 S1 phase genetically modified using stable transfection"
    ENCFF960UKQ = "right kidney female embryo (87 days)"
    ENCFF220RFB = "Karpas-422"
    ENCFF439EFF = "CD8-positive, alpha-beta T cell male adult (21 year)"
    ENCFF825MFH = "renal cortex interstitium female embryo (96 days)"
    ENCFF735MKP = "brain female embryo (85 days)"
    ENCFF920AZN = "muscle of leg male embryo (97 days)"
    ENCFF577QJI = "muscle of back female embryo (105 days)"
    ENCFF681MHT = "placenta female embryo (113 days)"
    ENCFF971NFK = "foreskin keratinocyte male newborn"
    ENCFF842TPR = "right kidney female embryo (107 days)"
    ENCFF076JMD = "RCC 7860"
    ENCFF188JVI = "colon epithelial cell line"
    ENCFF269PEU = "left renal cortex interstitium male embryo (120 days)"
    ENCFF400NBT = "T-cell"
    ENCFF304TGP = "large intestine male embryo (113 days)"
    ENCFF999OEJ = "PC-3"
    ENCFF813VDR = "muscle of leg male embryo (96 days)"
    ENCFF613SVD = "adrenal gland embryo (96 days)"
    ENCFF519PNO = "coronary artery female adult (53 years)"
    ENCFF022OWT = "stomach female embryo (105 days)"
    ENCFF529MMP = "muscle of arm female embryo (115 days)"
    ENCFF854MAD = "heart female embryo (91 day)"
    ENCFF734DOF = "kidney embryo (80 days)"
    ENCFF678EIG = "heart left ventricle female adult (53 years)"
    ENCFF581JDF = "muscle of back female embryo (98 days)"
    ENCFF475YJR = "lung male embryo (108 days)"
    ENCFF146VLD = "tibial artery female adult (53 years)"
    ENCFF580LWU = "adrenal gland male embryo (101 day)"
    ENCFF192KLJ = "kidney female embryo (121 day)"
    ENCFF534VWZ = "stomach female embryo (107 days)"
    ENCFF343DFM = "small intestine female embryo (110 days)"
    ENCFF723FDT = "MG63"
    ENCFF792NVL = "skin of body female embryo (82 days)"
    ENCFF617ONE = "left renal cortex interstitium male embryo (105 days)"
    ENCFF873DWP = "stomach male embryo (58 days) and male embryo (76 days)"
    ENCFF862JVF = "left kidney female embryo (59 days) and male embryo (91 day)"
    ENCFF996GHG = "hematopoietic multipotent progenitor cell"
    ENCFF969UJW = "kidney female embryo (120 days)"
    ENCFF413KYW = "muscle of back female embryo (115 days)"
    ENCFF771PWA = "hematopoietic multipotent progenitor cell male adult (25 years) treated with erythropoietin for 20 days, hydrocortisone succinate for 20 days, kit ligand for 20 days, interleukin-3 for 20 days"
    ENCFF671BMS = "small intestine female embryo (120 days)"
    ENCFF657KLL = "brain embryo (112 days)"
    ENCFF424IXN = "stomach female embryo (105 days)"
    ENCFF415LNY = "left kidney female embryo (98 days)"
    ENCFF248OBP = "trophoblast cell embryo (21 week)"
    ENCFF328DCV = "common myeloid progenitor, CD34-positive female adult (27 years)"
    ENCFF681KGX = "eye female embryo (76 days)"
    ENCFF332EJM = "renal cortex interstitium male embryo (108 days)"
    ENCFF987HVM = "placenta female embryo (108 days)"
    ENCFF922TBR = "stomach female adult (53 years)"
    ENCFF209YFM = "small intestine female embryo (107 days)"
    ENCFF811EUQ = "large intestine female embryo (120 days)"
    ENCFF798YGS = "kidney glomerular epithelial cell male adult (43 years) and male adult (62 years)"
    ENCFF319GAX = "renal pelvis female embryo (96 days)"
    ENCFF083VCD = "lung female embryo (76 days)"
    ENCFF044CTX = "muscle of back male embryo (104 days)"
    ENCFF899HCF = "lung female embryo (85 days)"
    ENCFF085EFX = "spinal cord male embryo (105 days)"
    ENCFF833WVM = "hematopoietic multipotent progenitor cell treated with interleukin-3 for 4 days, kit ligand for 4 days, hydrocortisone succinate for 4 days, erythropoietin for 4 days"
    ENCFF577SOF = "HepG2"
    ENCFF366GCX = "foreskin fibroblast male newborn"
    ENCFF575XEK = "omental fat pad female adult (53 years)"
    ENCFF207LXN = "large intestine female embryo (110 days)"
    ENCFF604EJP = "heart female embryo (105 days)"
    ENCFF365HVU = "renal cortex interstitium male embryo (113 days)"
    ENCFF064KBT = "brain female embryo (96 days)"
    ENCFF677VKI = "thyroid gland male adult (37 years)"
    ENCFF068ZBX = "liver embryo (59 days) and embryo (80 days)"
    ENCFF288HWP = "stomach male adult (54 years)"
    ENCFF921GFL = "thymus male embryo (127 days)"
    ENCFF414NAA = "upper lobe of left lung male adult (37 years)"
    ENCFF131GBV = "endodermal cell"
    ENCFF016KEV = "forelimb muscle female embryo (108 days)"
    ENCFF522NPH = "CD4-positive, alpha-beta T cell male adult (37 years)"
    ENCFF858BSX = "muscle of leg male embryo (101 day)"
    ENCFF161KLK = "gastrocnemius medialis male adult (37 years)"
    ENCFF073UDC = "gastrocnemius medialis male adult (54 years)"
    ENCFF362OUW = "stomach male embryo (91 day)"
    ENCFF499LIJ = "muscle of back male embryo (101 day)"
    ENCFF008DNO = "heart female embryo (117 days)"
    ENCFF294LJN = "kidney tubule cell female adult (80 years) and male adult (62 years)"
    ENCFF433QRU = "fibroblast of skin of abdomen male embryo (97 days)"
    ENCFF230OUF = "pancreas male adult (34 years)"
    ENCFF532VGP = "trophoblast cell originated from H1-hESC"
    ENCFF401OYD = "left kidney male embryo (96 days)"
    ENCFF338NJM = "skin fibroblast male embryo (97 days)"
    ENCFF378OWK = "brain female embryo (142 days)"
    ENCFF687KFN = "small intestine male embryo (105 days)"
    ENCFF509TTX = "heart female embryo (110 days)"
    ENCFF103KGP = "common myeloid progenitor, CD34-positive female"
    ENCFF452CPH = "adrenal gland male adult (37 years)"
    ENCFF748UWO = "muscle of back female embryo (85 days)"
    ENCFF875TFI = "muscle of leg female embryo (85 days)"
    ENCFF340RUK = "Peyer's patch male adult (54 years)"
    ENCFF410CYG = "amniotic stem cell"
    ENCFF978RWQ = "stomach female embryo (98 days)"
    ENCFF584GYN = "large intestine female embryo (91 day)"
    ENCFF029QDT = "lung female embryo (120 days)"
    ENCFF285PVN = "skin fibroblast male embryo (97 days)"
    ENCFF726FED = "SW480"
    ENCFF711EJL = "GM23248"
    ENCFF484VHZ = "muscle of trunk female embryo (120 days)"
    ENCFF639JCH = "islet precursor cell"
    ENCFF151SBP = "medulla oblongata male adult (78 years) and male adult (84 years)"
    ENCFF640UAZ = "right lung male embryo (115 days)"
    ENCFF102SIK = "body of pancreas male adult (54 years)"
    ENCFF227VUS = "mammary epithelial cell female adult (18 years)"
    ENCFF082TEM = "gastrocnemius medialis female adult (51 year)"
    ENCFF037DUA = "uterus female adult (53 years)"
    ENCFF818DSK = "ELR"
    ENCFF037ZSW = "omental fat pad female adult (51 year)"
    ENCFF280CMC = "stomach male child (3 years)"
    ENCFF570XSN = "B cell male adult (37 years)"
    ENCFF113YFF = "foreskin fibroblast male newborn"
    ENCFF981AOU = "renal cortex interstitium male embryo (91 day)"
    ENCFF828TLX = "ACHN"
    ENCFF054DND = "kidney tubule cell female adult (80 years) treated with 5 uM cisplatin"
    ENCFF468WHL = "left renal pelvis male embryo (105 days)"
    ENCFF669AED = "iPS DF 6.9 male newborn"
    ENCFF369CRG = "kidney male embryo (87 days)"
    ENCFF784RSW = "right lung female embryo (91 day)"
    ENCFF564MGY = "NCI-H226"
    ENCFF973VII = "right renal cortex interstitium male embryo (105 days)"
    ENCFF006KSE = "G401"
    ENCFF789PQN = "muscle of arm male embryo (113 days)"
    ENCFF992LZL = "common myeloid progenitor, CD34-positive male adult"
    ENCFF716VIS = "muscle of leg female embryo (113 days)"
    ENCFF430CNF = "arm bone male embryo (81 day)"
    ENCFF334GMP = "H9"
    ENCFF639ZOF = "testis male embryo"
    ENCFF372SLF = "neural stem progenitor cell originated from H1-hESC"
    ENCFF312ZWZ = "right atrium auricular region female adult (51 year)"
    ENCFF819IOM = "common myeloid progenitor, CD34-positive male adult (49 years)"
    ENCFF688EZK = "stomach female embryo (121 day)"
    ENCFF502QQC = "muscle of leg female embryo (105 days)"
    ENCFF674WBH = "muscle of leg male embryo (97 days)"
    ENCFF541UPO = "left kidney female embryo (87 days)"
    ENCFF034GYI = "muscle of back male embryo (97 days)"
    ENCFF160KJF = "muscle of leg male embryo (96 days)"
    ENCFF071LJS = "right lung female embryo (105 days)"
    ENCFF193BCL = "NAMALWA treated with Sendai virus for 2 hours"
    ENCFF964WUW = "renal pelvis male embryo (127 days)"
    ENCFF736ZAN = "muscle of arm male embryo (96 days)"
    ENCFF064YHT = "heart male child (3 years)"
    ENCFF536FNG = "brain female embryo (117 days)"
    ENCFF428DTK = "muscle of arm female embryo (85 days)"
    ENCFF782WKY = "heart male embryo (110 days)"
    ENCFF464NHA = "L1-S8"
    ENCFF468PTJ = "CD8-positive, alpha-beta T cell male adult (37 years)"
    ENCFF847HLP = "muscle of back male embryo (91 day)"
    ENCFF637LBX = "left renal pelvis male embryo (105 days)"
    ENCFF751LHL = "middle frontal gyrus male adult (78 years)"
    ENCFF267BLE = "lung male embryo (103 days)"
    ENCFF915OOI = "heart male embryo (105 days)"
    ENCFF688UBR = "left renal pelvis male embryo (120 days)"
    ENCFF570PTC = "common myeloid progenitor, CD34-positive female adult (50 years)"
    ENCFF137XIB = "Peyer's patch female adult (53 years)"
    ENCFF123ENC = "skin fibroblast male embryo (97 days)"
    ENCFF158ALT = "hematopoietic multipotent progenitor cell"
    ENCFF386DMJ = "right kidney male embryo (87 days)"
    ENCFF367MHB = "tibial artery male adult (37 years)"
    ENCFF620RYJ = "renal cell carcinoma"
    ENCFF936MHE = "thymus female embryo (113 days)"
    ENCFF003FBE = "brain female embryo (109 days)"
    ENCFF399TJD = "Daoy"
    ENCFF305GOM = "A673"

    @property
    def track_ids(self):
        """Return every track id declared with this member's description.

        Enum folds repeated values into aliases, so ``.name`` reports only the
        first id declared for a description. ``__members__`` still lists the
        aliases, so it is scanned for every name bound to this member.

        Returns:
            tuple[str, ...]: the matching member names in declaration order,
            with the canonical name (``self.name``) first.
        """
        return tuple(
            name
            for name, member in type(self).__members__.items()
            if member is self
        )

class tissues2(str, Enum):
    ENCFF147PGS = "muscle of arm male embryo (101 day)"
    ENCFF941ILF = "muscle of arm female embryo (120 days)"
    ENCFF779BIA = "muscle of back male embryo (105 days)"
    ENCFF647YEC = "skin fibroblast male embryo (97 days)"
    ENCFF095HIQ = "ecto neural progenitor cell originated from H9"
    ENCFF708ZHA = "hematopoietic multipotent progenitor cell treated with interleukin-3 for 17 days, kit ligand for 17 days, hydrocortisone succinate for 17 days, erythropoietin for 17 days"
    ENCFF136YOJ = "hepatocyte originated from H9"
    ENCFF078RPO = "renal pelvis male embryo (97 days)"
    ENCFF802UAF = "heart embryo (101 day)"
    ENCFF761ZJC = "large intestine male embryo (115 days)"
    ENCFF010ZNC = "T-cell male adult (37 years)"
    ENCFF053JYY = "skin fibroblast male embryo (97 days)"
    ENCFF259NLK = "CD1c-positive myeloid dendritic cell"
    ENCFF220IWU = "B cell male adult (21 year)"
    ENCFF029ZOX = "superior temporal gyrus male adult (84 years)"
    ENCFF801JJS = "skin fibroblast male embryo (97 days)"
    ENCFF294LWY = "iPS DF 19.11 male newborn"
    ENCFF973KXU = "hindlimb muscle male embryo (120 days)"
    ENCFF955DHP = "MCF 10A treated with 1 uM tamoxifen for 6 hours"
    ENCFF017NBX = "sigmoid colon male adult (54 years)"
    ENCFF851MCI = "heart male embryo (120 days)"
    ENCFF083ELT = "tibial nerve female adult (51 year)"
    ENCFF531MYW = "adipocyte"
    ENCFF562EUF = "muscle of trunk female embryo (121 day)"
    ENCFF920CTO = "left cardiac atrium female embryo (101 day)"
    ENCFF213QWE = "T-cell male adult (21 year)"
    ENCFF645BQM = "right kidney male embryo (108 days)"
    ENCFF166VWE = "adrenal gland female embryo (108 days)"
    ENCFF121CVF = "hematopoietic multipotent progenitor cell treated with interleukin-3 for 11 day, kit ligand for 11 day, hydrocortisone succinate for 11 day, erythropoietin for 11 day"
    ENCFF947HEA = "brain male embryo (105 days)"
    ENCFF116YEY = "body of pancreas female adult (53 years)"
    ENCFF963ION = "left kidney female embryo (147 days)"
    ENCFF133LMW = "CD4-positive, alpha-beta T cell male adult (37 years)"
    ENCFF953EXS = "KBM-7"
    ENCFF241XMS = "psoas muscle male child (3 years)"
    ENCFF190AUQ = "ovary female embryo"
    ENCFF175IFH = "RKO"
    ENCFF869XYK = "leg bone male embryo (81 day)"
    ENCFF479JXO = "glomerular visceral epithelial cell child (3 years)"
    ENCFF520XMG = "foreskin melanocyte male newborn"
    ENCFF249WKV = "embryonic facial prominence embryo (53 days) and embryo (58 days)"
    ENCFF154NSS = "vagina female adult (53 years)"
    ENCFF890HSZ = "kidney female embryo (85 days)"
    ENCFF990QEC = "muscle of back male embryo (96 days)"
    ENCFF736QYT = "muscle of trunk female embryo (113 days)"
    ENCFF732WSQ = "iPS DF 19.7 male newborn"
    ENCFF631ZXV = "right lung female embryo (110 days)"
    ENCFF216TLU = "left lung male embryo (87 days)"
    ENCFF471TWL = "large intestine female embryo (105 days)"
    ENCFF338GCP = "right lung female embryo (98 days)"
    ENCFF573KUK = "leg bone male embryo (81 day)"
    ENCFF724HAH = "CD14-positive monocyte male adult (21 year)"
    ENCFF897JFY = "muscle of arm male embryo (120 days)"
    ENCFF799HNC = "Peyer's patch male adult (37 years)"
    ENCFF139ZYV = "common myeloid progenitor, CD34-positive male adult (36 years)"
    ENCFF549BSX = "left lung female embryo (108 days)"
    ENCFF359LKO = "small intestine female embryo (91 day)"
    ENCFF896FUI = "MM.1S"
    ENCFF410TTD = "sigmoid colon female adult (51 year)"
    ENCFF852RDH = "left lung female embryo (105 days)"
    ENCFF523VFS = "common myeloid progenitor, CD34-positive female adult (33 years)"
    ENCFF773NZF = "retina embryo (125 days) and male embryo (103 days)"
    ENCFF468SMI = "brain male embryo (72 days) and male embryo (76 days)"
    ENCFF302BVR = "muscle of back male embryo (108 days)"
    ENCFF706WOH = "IMR-90"
    ENCFF036RHD = "lung female embryo (108 days)"
    ENCFF877ULZ = "left lung male embryo (113 days)"
    ENCFF506FHA = "OCI-LY7"
    ENCFF300WUQ = "upper lobe of left lung female adult (51 year)"
    ENCFF117DAB = "hematopoietic multipotent progenitor cell treated with interleukin-3 for 18 days, kit ligand for 18 days, hydrocortisone succinate for 18 days, erythropoietin for 18 days"
    ENCFF681JFJ = "putamen male adult (78 years)"
    ENCFF893ONO = "common myeloid progenitor, CD34-positive male adult (42 years)"
    ENCFF555ENQ = "placenta embryo (53 days)"
    ENCFF758ZBZ = "kidney male embryo (105 days)"
    ENCFF185JEB = "heart left ventricle female embryo (136 days)"
    ENCFF966IYT = "small intestine female embryo (108 days)"
    ENCFF692FQU = "LoVo"
    ENCFF564DYF = "cerebellar cortex male adult (78 years) and male adult (84 years)"
    ENCFF710LQM = "lung embryo (67 days)"
    ENCFF413EKZ = "transverse colon female adult (53 years)"
    ENCFF482UKP = "brain embryo (80 days)"
    ENCFF922XNR = "stomach female embryo (108 days)"
    ENCFF622CII = "umbilical cord embryo (59 days) and male embryo (76 days)"
    ENCFF683HAF = "muscle of leg male embryo (105 days)"
    ENCFF109WQI = "Caki2"
    ENCFF644WCO = "muscle of arm female embryo (105 days)"
    ENCFF910FZX = "foreskin melanocyte male newborn"
    ENCFF450EHI = "stomach male embryo (108 days)"
    ENCFF381POQ = "body of pancreas male adult (37 years)"
    ENCFF631XBY = "renal cortex interstitium female embryo (120 days)"
    ENCFF095KFF = "body of pancreas female adult (51 year)"
    ENCFF824OUT = "large intestine male embryo (105 days)"
    ENCFF850IPE = "limb embryo (58 days) and embryo (59 days)"
    ENCFF461COG = "heart female embryo (147 days)"
    ENCFF135BIN = "heart male embryo (96 days)"
    ENCFF785TYR = "right renal pelvis male embryo (120 days)"
    ENCFF149IUC = "left lung female embryo (117 days)"
    ENCFF858RAV = "thymus female embryo (98 days)"
    ENCFF802LRO = "ovary female adult (53 years)"
    ENCFF925SLC = "left kidney male embryo (115 days)"
    ENCFF527VFR = "thymus male embryo (113 days)"
    ENCFF176WVA = "placenta female embryo (105 days)"
    ENCFF746DZE = "EH"
    ENCFF354FAY = "renal cortex interstitium male embryo (97 days)"
    ENCFF933ZHA = "placenta embryo (102 days)"
    ENCFF708XRO = "placenta embryo (56 days) and embryo (59 days)"
    ENCFF407LCA = "renal pelvis female embryo (103 days)"
    ENCFF462ZLK = "right lobe of liver female adult (53 years)"
    ENCFF660PED = "skin fibroblast male embryo (97 days)"
    ENCFF616AAF = "tongue male embryo (72 days)"
    ENCFF723EYP = "skin fibroblast male embryo (97 days)"
    ENCFF775FBE = "liver female embryo (101 day) and female embryo (113 days)"
    ENCFF658BNW = "large intestine female embryo (107 days)"
    ENCFF438MEL = "hematopoietic multipotent progenitor cell treated with interleukin-3 for 6 days, kit ligand for 6 days, hydrocortisone succinate for 6 days, erythropoietin for 6 days"
    ENCFF819VID = "common myeloid progenitor, CD34-positive male"
    ENCFF389EEG = "small intestine female embryo (98 days)"
    ENCFF658FXQ = "CD4-positive, alpha-beta T cell male adult (21 year)"
    ENCFF503ZTL = "brain male embryo (101 day)"
    ENCFF124SMB = "placenta female embryo (101 day) and male embryo (105 days)"
    ENCFF069WLL = "renal pelvis male embryo (91 day)"
    ENCFF830TGU = "lung embryo (80 days) and male embryo (76 days)"
    ENCFF460VUW = "Ammon's horn male adult (84 years)"
    ENCFF596JMX = "muscle of back female embryo (105 days)"
    ENCFF606ESZ = "lung male embryo (82 days)"
    ENCFF946AQO = "left renal cortex interstitium male embryo (105 days)"
    ENCFF925UKW = "trophoblast cell embryo (17 weeks) and embryo (18 weeks)"
    ENCFF548BBX = "RPMI8226"
    ENCFF243OFZ = "small intestine male embryo (91 day)"
    ENCFF641HVS = "brain embryo (56 days) and male embryo (58 days)"
    ENCFF158QOY = "pancreas female adult (30 years)"
    ENCFF080JZX = "adrenal gland female adult (53 years)"
    ENCFF322ALX = "breast epithelium female adult (51 year)"
    ENCFF028YSP = "left kidney male embryo (87 days)"
    ENCFF010FIA = "renal pelvis female embryo (96 days)"
    ENCFF581JYZ = "stomach male embryo (127 days)"
    ENCFF726GTU = "CD8-positive, alpha-beta T cell female adult (34 years)"
    ENCFF599PVD = "spinal cord female embryo (89 days)"
    ENCFF174TXJ = "muscle of arm male embryo (97 days)"
    ENCFF161NZJ = "renal pelvis female embryo (105 days)"
    ENCFF442SRX = "HAP-1"
    ENCFF909HEZ = "SJSA1"
    ENCFF113RRS = "retina embryo (74 days) and embryo (85 days)"
    ENCFF906KHN = "muscle of leg male embryo (127 days)"
    ENCFF900LOG = "fibroblast of skin of abdomen male embryo (97 days)"
    ENCFF399ISP = "bipolar neuron originated from GM23338 treated with 0.5 ug/mL doxycycline hyclate for 4 days"
    ENCFF138ZHG = "stomach female embryo (147 days)"
    ENCFF969AND = "lung male embryo (54 days) and male embryo (58 days)"
    ENCFF508ADO = "T-cell male adult (36 years)"
    ENCFF801DBL = "T-cell male adult (21 year)"
    ENCFF401DZU = "HK-2"
    ENCFF673ICX = "NCI-H460"
    ENCFF655RAK = "left lung male embryo (96 days)"
    ENCFF005DPL = "common myeloid progenitor, CD34-positive male adult (23 years)"
    ENCFF766YIL = "stomach female adult (51 year)"
    ENCFF502EPK = "CD4-positive, alpha-beta T cell female adult (33 years)"
    ENCFF740DGO = "kidney female embryo (113 days)"
    ENCFF600WZF = "skin fibroblast male embryo (97 days)"
    ENCFF180IFD = "placenta female embryo (85 days)"
    ENCFF361UTR = "brain male embryo (122 days)"
    ENCFF903LEE = "muscle of arm male embryo (97 days)"
    ENCFF161UUJ = "renal cortex interstitium female embryo (103 days)"
    ENCFF126DFR = "thyroid gland female adult (51 year)"
    ENCFF889CMZ = "NAMALWA"
    ENCFF351UQV = "heart embryo (59 days) and female embryo (76 days)"
    ENCFF365LXQ = "large intestine male embryo (105 days)"
    ENCFF260OFK = "skin fibroblast male embryo (97 days)"
    ENCFF307BRD = "right kidney male embryo (96 days)"
    ENCFF722MXM = "CD14-positive monocyte male adult (37 years)"
    ENCFF422YYH = "ELF-1"
    ENCFF024WMP = "spinal cord female embryo (113 days)"
    ENCFF090EZE = "spleen embryo (112 days)"
    ENCFF010TXA = "left lung female embryo (107 days)"
    ENCFF538SYZ = "thymus female embryo"
    ENCFF131RGB = "CD4-positive, alpha-beta T cell male adult (21 year)"
    ENCFF827CLR = "stomach female embryo (96 days)"
    ENCFF923JHL = "EL"
    ENCFF759CKK = "CD14-positive monocyte female adult (34 years)"
    ENCFF290ZJP = "thyroid gland female adult (53 years)"
    ENCFF815MQN = "adrenal gland female embryo (113 days)"
    ENCFF545YJA = "muscle of back male embryo (96 days)"
    ENCFF332XCB = "SJCRH30"
    ENCFF985WGD = "natural killer cell male adult (21 year)"
    ENCFF702IWR = "heart male embryo (72 days) and male embryo (76 days)"
    ENCFF225GCE = "midbrain male adult (78 years) and male adult (84 years)"
    ENCFF338CIA = "ovary female adult (30 years)"
    ENCFF918FRW = "mesendoderm originated from H1-hESC"
    ENCFF104JJN = "natural killer cell male adult (37 years)"
    ENCFF125SYT = "natural killer cell female adult (34 years)"
    ENCFF380PKB = "foreskin keratinocyte male newborn"
    ENCFF488GEQ = "stomach female embryo"
    ENCFF285UPE = "adrenal gland male embryo (108 days)"
    ENCFF037YBR = "testis male embryo"
    ENCFF594UFP = "mesenchymal stem cell originated from H1-hESC"
    ENCFF264JCE = "vagina female adult (51 year)"
    ENCFF158JPG = "common myeloid progenitor, CD34-positive male adult (43 years)"
    ENCFF062BSK = "heart right ventricle female embryo (101 day) and female embryo (103 days)"
    ENCFF572BDN = "L1-S8R"
    ENCFF162WFN = "heart female embryo (116 days) and female embryo (98 days)"
    ENCFF747RHY = "MCF 10A treated with 1 uM tamoxifen for 24 hours"
    ENCFF923TRG = "right renal pelvis male embryo (105 days)"
    ENCFF851IQY = "muscle of arm female embryo (98 days)"
    ENCFF685WFL = "H9 G1 phase genetically modified using stable transfection"
    ENCFF094FZF = "tibial nerve male adult (37 years)"
    ENCFF098CIC = "kidney male embryo (85 days)"
    ENCFF645CDA = "tongue female embryo (59 days) and female embryo (76 days)"
    ENCFF950KRY = "transverse colon female adult (51 year)"
    ENCFF881JSN = "right renal pelvis male embryo (105 days)"
    ENCFF484MWV = "large intestine male embryo (91 day)"
    ENCFF712OII = "right kidney female embryo (117 days)"
    ENCFF268XHZ = "right kidney male embryo (91 day)"
    ENCFF375JUQ = "skin fibroblast male embryo (97 days)"
    ENCFF328BAN = "globus pallidus male adult (78 years) and male adult (84 years)"
    ENCFF518IQW = "kidney female embryo (105 days)"
    ENCFF412KRU = "right atrium auricular region female adult (53 years)"
    ENCFF721IYP = "thymus female embryo (147 days)"
    ENCFF017FGP = "stomach male adult (34 years)"
    ENCFF540MDC = "eye embryo (56 days) and male embryo (76 days)"
    ENCFF376RIO = "esophagus squamous epithelium male adult (37 years)"
    ENCFF888PNL = "glomerular visceral epithelial cell child (3 years)"
    ENCFF167DZY = "muscle of back male embryo (127 days)"
    ENCFF072UOA = "spinal cord male embryo (96 days)"
    ENCFF222GDV = "MCF 10A"
    ENCFF098CAJ = "transverse colon male adult (54 years)"
    ENCFF419BLQ = "kidney embryo (59 days) and female embryo (59 days)"
    ENCFF117WXM = "H1-hESC"
    ENCFF112GQO = "large intestine female embryo (98 days)"
    ENCFF691QJR = "large intestine female embryo (103 days)"
    ENCFF804AOO = "placenta male embryo (85 days)"
    ENCFF772ECZ = "left lung female embryo (91 day)"
    ENCFF751LRX = "heart embryo (80 days)"
    ENCFF130BQK = "femur female embryo (98 days)"
    ENCFF345ALB = "H4"
    ENCFF497YLT = "trophoblast cell embryo (23 weeks)"
    ENCFF258PIV = "urinary bladder male embryo (76 days)"
    ENCFF528SGI = "HT-29"
    ENCFF540ITV = "limb embryo (53 days) and embryo (56 days)"
    ENCFF489DYU = "retina female embryo (89 days)"
    ENCFF867YFT = "dedifferentiated amniotic fluid mesenchymal stem cell"
    ENCFF427FTE = "upper lobe of left lung female adult (53 years)"
    ENCFF643ARA = "brain female embryo (105 days)"
    ENCFF924PVB = "muscle of arm embryo (101 day)"
    ENCFF331URX = "renal cortex interstitium male embryo (108 days)"
    ENCFF210SQU = "pons male adult (78 years)"
    ENCFF353EKI = "thymus female embryo (105 days)"
    ENCFF398WIV = "CD8-positive, alpha-beta T cell female adult (33 years)"
    ENCFF446JBL = "cardiac muscle cell"
    ENCFF825SRM = "CD8-positive, alpha-beta T cell male adult (21 year)"
    ENCFF800PMS = "hematopoietic multipotent progenitor cell treated with interleukin-3 for 8 days, kit ligand for 8 days, hydrocortisone succinate for 8 days, erythropoietin for 8 days"
    ENCFF520PUY = "right lung female embryo (117 days)"
    ENCFF942LPA = "right lung male embryo (105 days)"
    ENCFF609EGN = "lung female embryo (96 days)"
    ENCFF415UET = "adrenal gland male adult (54 years)"
    ENCFF493QSG = "renal cortex interstitium male embryo (127 days)"
    ENCFF047HXB = "spleen male adult (54 years)"
    ENCFF590MET = "renal cortex interstitium female embryo (89 days)"
    ENCFF233ORX = "ascending aorta female adult (51 year)"
    ENCFF141YBL = "ovary female adult (51 year)"
    ENCFF667COT = "hematopoietic multipotent progenitor cell treated with interleukin-3 for 13 days, kit ligand for 13 days, hydrocortisone succinate for 13 days, erythropoietin for 13 days"
    ENCFF626LYV = "cardiac fibroblast female embryo (94 days) and female embryo (98 days)"
    ENCFF378ZLF = "adrenal gland male embryo (85 days)"
    ENCFF443KCU = "large intestine male embryo (108 days)"
    ENCFF568EAH = "occipital lobe male adult (84 years)"
    ENCFF586GLK = "right lung female embryo (107 days)"
    ENCFF374WLB = "small intestine female embryo (105 days)"
    ENCFF779SYM = "muscle of leg male embryo (104 days)"
    ENCFF561DJM = "lower leg skin female adult (53 years)"
    ENCFF193WRL = "adrenal gland female adult (51 year)"
    ENCFF530PCA = "adrenal gland female embryo (85 days)"
    ENCFF170QVL = "muscle of arm male embryo (115 days)"
    ENCFF200VXY = "right kidney female embryo (147 days)"
    ENCFF248ZTQ = "kidney female embryo (76 days) and male embryo (76 days)"
    ENCFF367ZCY = "large intestine female embryo (108 days)"
    ENCFF759AQQ = "right kidney female embryo (98 days)"
    ENCFF596RFA = "right renal cortex interstitium male embryo (120 days)"
    ENCFF259GAI = "muscle of arm male embryo (96 days)"
    ENCFF350RPE = "thymus male embryo (108 days)"
    ENCFF104EQW = "B cell female adult (34 years)"
    ENCFF762FMJ = "thyroid gland male adult (54 years)"
    ENCFF370CZR = "spinal cord female embryo (59 days) and male embryo (72 days)"
    ENCFF326FWY = "heart embryo (96 days)"
    ENCFF734APY = "renal pelvis female embryo (89 days)"
    ENCFF643GKF = "renal pelvis male embryo (108 days)"
    ENCFF281GOQ = "heart female embryo (103 days)"
    ENCFF745RLH = "H9"
    ENCFF971AHO = "K562"
    ENCFF168UHZ = "left renal cortex interstitium male embryo (105 days)"
    ENCFF499TWX = "right lung female embryo (108 days)"
    ENCFF218SJO = "stomach female embryo (96 days)"
    ENCFF619LIB = "small intestine male adult (34 years)"
    ENCFF701FFX = "renal cortex interstitium female embryo (96 days)"
    ENCFF385DEA = "A172"
    ENCFF106TCU = "left lung male embryo (115 days)"
    ENCFF198FAC = "muscle of leg male embryo (115 days)"
    ENCFF907UIY = "hematopoietic multipotent progenitor cell treated with interleukin-3 for 15 days, kit ligand for 15 days, hydrocortisone succinate for 15 days, erythropoietin for 15 days"
    ENCFF131DEL = "muscle of leg female embryo (115 days)"
    ENCFF363RKM = "PC-9"
    ENCFF547ICI = "kidney female embryo (108 days)"
    ENCFF047HGQ = "stomach embryo (101 day)"
    ENCFF692AJS = "H9 G2 phase genetically modified using stable transfection"
    ENCFF951HWM = "esophagus muscularis mucosa male adult (37 years)"
    ENCFF053FMQ = "small intestine male embryo (108 days)"
    ENCFF791FRX = "heart left ventricle female embryo (101 day) and female embryo (103 days)"
    ENCFF461SNA = "SK-N-DZ"
    ENCFF622PZI = "brain male embryo (104 days)"
    ENCFF030GLD = "right lung male embryo (87 days)"
    ENCFF293NBG = "muscle of arm male embryo (105 days)"
    ENCFF778GKQ = "muscle of arm male embryo (104 days)"
    ENCFF207ECZ = "kidney female embryo (105 days)"
    ENCFF459LFA = "left lung male embryo (105 days)"
    ENCFF532VZC = "left kidney female embryo (107 days)"
    ENCFF220BQY = "Peyer's patch male adult (37 years)"
    ENCFF884QVN = "renal pelvis male embryo (113 days)"
    ENCFF069GRI = "prostate gland male adult (37 years)"
    ENCFF977IGB = "HeLa-S3"
    ENCFF609SYX = "left lung female embryo (110 days)"
    ENCFF689ZKP = "thymus male embryo (104 days)"
    ENCFF417MBH = "neural progenitor cell originated from H9"
    ENCFF833HDX = "trophoblast cell embryo (39 weeks) and embryo (40 weeks)"
    ENCFF491HNC = "right lung male embryo (96 days)"
    ENCFF091ZXC = "H9 G1 phase genetically modified using stable transfection"
    ENCFF746JHF = "small intestine male embryo (87 days)"
    ENCFF217JVN = "spinal cord female embryo (87 days)"
    ENCFF623DPF = "left kidney female embryo (110 days)"
    ENCFF658GXI = "arm bone male embryo (81 day)"
    ENCFF068YUS = "testis male adult (54 years)"
    ENCFF624FJD = "transverse colon male adult (37 years)"
    ENCFF630KCX = "right renal pelvis male embryo (105 days)"
    ENCFF283WCF = "lung embryo (101 day)"
    ENCFF848BGL = "inferior parietal cortex male adult (84 years)"
    ENCFF403BFX = "kidney capillary endothelial cell female embryo (113 days)"
    ENCFF383SIU = "placenta male embryo (91 day)"
    ENCFF253IUB = "right kidney male embryo (115 days)"
    ENCFF871GME = "coronary artery female adult (51 year)"
    GSM2543089 = "BM0106-Day0-MCP-A / Bone Marrow CD34+ / pDC"
    GSM2543090 = "BM0106-UNK-ATAC-2 / Bone Marrow CD34+ / UNK"
    GSM2543091 = "BM0828-MEGA1-A-151109 / Bone Marrow CD34+ / Mega"
    GSM2543093 = "BM1077-MCP / Bone Marrow CD34+ / pDC"
    GSM2543094 = "BM1077-UNK / Bone Marrow CD34+ / UNK"
    GSM2543095 = "BM1137-GMP1-low-ATAC-2 / Bone Marrow CD34+ / GMP-A"
    GSM2543096 = "BM1137-GMP2-mid-ATAC-1 / Bone Marrow CD34+ / GMP-B"
    GSM2543098 = "BM1137-GMP3-high-ATAC-2 / Bone Marrow CD34+ / GMP-C"
    GSM2543099 = "BM1214-Day0-MCP / Bone Marrow CD34+ / pDC"
    GSM2543100 = "BM1214-Day0-UNK-A / Bone Marrow CD34+ / UNK"

In [4]:
# Structured-output schema: this model is handed to the chat-completion
# `parse` calls below as `response_format`, so every reply is returned as a
# `Tracks` instance.
# (Documented with `#` comments on purpose: a class docstring would be
# included as the `description` of the JSON schema generated from the model.)
class Tracks(BaseModel):
    # Members of the `tissues1` enum that the model selected for the phenotype.
    # NOTE(review): Enum members that share the same description string are
    # aliases in Python, so only the first id defined for a description can
    # ever be returned - confirm this is acceptable for duplicated descriptions.
    category1: list[tissues1]
    # Members of the `tissues2` enum that the model selected for the phenotype.
    category2: list[tissues2]
    # Free-text "closer categorization" of the phenotype, as requested in the
    # system prompt.
    subcategory: str

In [17]:
# API client shared by all requests in this notebook; the key is looked up in
# the process environment under OPENAI_API_KEY (None if the variable is unset).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

An example of a prompt for the gpt-4o-mini model.

```"role":"system"``` lets the programmer give the model high-level instructions on how to deal with the user input (given in ```"role":"user"```).

In [18]:
# Baseline experiment: a one-sentence system instruction, the phenotype keyword
# as the user message, and the reply constrained to the Tracks schema.
completion = client.beta.chat.completions.parse(
    response_format=Tracks,
    messages=[
        dict(
            role="system",
            content="For the given phenotype, choose all matching tissues and give a closer categorization in the field subcategory.",
        ),
        dict(role="user", content="Albumin"),
    ],
    model="gpt-4o-mini",
)

# Keep the parsed object attached to the first returned choice.
event = completion.choices[0].message.parsed

Read out the output:

In [19]:
event

Tracks(category1=[<tissues1.ENCFF818FXA: 'hepatocyte'>, <tissues1.ENCFF136DBS: 'HepG2'>], category2=[<tissues2.ENCFF520XMG: 'foreskin melanocyte male newborn'>, <tissues2.ENCFF497YLT: 'trophoblast cell embryo (23 weeks)'>], subcategory='Protein')

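Seems like a mixed selection: albumin circulates in blood plasma but is synthesized in the liver, so the liver-related picks in category1 (hepatocyte, HepG2) are sensible, whereas the category2 picks (foreskin melanocyte, trophoblast cell) look unrelated.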

Let's try a stronger model: (seems like o1 and o3-mini are not available...)

In [22]:
# Same baseline prompt as before, this time sent to the larger gpt-4o model.
completion = client.beta.chat.completions.parse(
    response_format=Tracks,
    messages=[
        dict(
            role="system",
            content="For the given phenotype, choose all matching tissues and give a closer categorization in the field subcategory.",
        ),
        dict(role="user", content="Albumin"),
    ],
    model="gpt-4o",
)

# Keep the parsed object attached to the first returned choice.
event = completion.choices[0].message.parsed

In [23]:
event

Tracks(category1=[<tissues1.ENCFF660YSU: 'AG08395'>, <tissues1.ENCFF787MSC: 'AG08396'>, <tissues1.ENCFF148BGE: 'HEK293T'>, <tissues1.ENCFF136DBS: 'HepG2'>], category2=[], subcategory='Liver and related systems')

Does not seem much better. Let's fiddle with the prompt:

In [24]:
# Prompt experiment: give the model an expert persona and project context, and
# stress that only relevant tracks should be returned.
completion = client.beta.chat.completions.parse(
    response_format=Tracks,
    messages=[
        dict(role="system", content="""You are an expert in human biology and you are working in a project exploring the human genome. There are two sets of tissue samples that were sequenced,
         their descriptions are names of cell lines or descriptions of the sample. These descriptions are enumerated in the two classes "tissues1" and "tissues2". The team plans to study several phenotypes
         and therefore has to choose the most relevant sequencing tracks. It is important to choose only relevant tracks, since adding irrelevant data will worsen the performance of the method.
         For any provided phenotype, return a list of relevant tissues. Phenotypes might be shortened to keywords."""),
        dict(role="user", content="Albumin"),
    ],
    model="gpt-4o-mini",
)

# Keep the parsed object attached to the first returned choice.
event = completion.choices[0].message.parsed

In [25]:
event

Tracks(category1=[<tissues1.ENCFF818FXA: 'hepatocyte'>, <tissues1.ENCFF136DBS: 'HepG2'>, <tissues1.ENCFF818FXA: 'hepatocyte'>, <tissues1.ENCFF457RRO: 'renal cortical epithelial cell'>, <tissues1.ENCFF798YGS: 'kidney glomerular epithelial cell male adult (43 years) and male adult (62 years)'>], category2=[<tissues2.ENCFF010FIA: 'renal pelvis female embryo (96 days)'>, <tissues2.ENCFF631XBY: 'renal cortex interstitium female embryo (120 days)'>, <tissues2.ENCFF354FAY: 'renal cortex interstitium male embryo (97 days)'>, <tissues2.ENCFF701FFX: 'renal cortex interstitium female embryo (96 days)'>, <tissues2.ENCFF890HSZ: 'kidney female embryo (85 days)'>], subcategory='Serum protein')

In [27]:
event

Tracks(category1=[<tissues1.ENCFF818FXA: 'hepatocyte'>, <tissues1.ENCFF136DBS: 'HepG2'>], category2=[<tissues2.ENCFF136YOJ: 'hepatocyte originated from H9'>, <tissues2.ENCFF775FBE: 'liver female embryo (101 day) and female embryo (113 days)'>], subcategory='Liver function and synthesis of serum proteins, specifically albumin production.')

In [28]:
# Prompt experiment: same expert context, but the model is now asked to reason
# in steps (phenotype -> influencing body functions -> matching tissues).
completion = client.beta.chat.completions.parse(
    response_format=Tracks,
    messages=[
        dict(role="system", content="""You are an expert in human biology and you are working in a project exploring the human genome. There are two sets of tissue samples that were sequenced,
         their descriptions are names of cell lines or descriptions of the sample. These descriptions are enumerated in the two classes "tissues1" and "tissues2". The team plans to study several phenotypes
         and therefore has to choose the most relevant sequencing tracks. It is important to choose only relevant tracks, since adding irrelevant data will worsen the performance of the method.
         Phenotypes might be shortened to keywords.
         For any provided input, first decide what phenotype is investigated, then assemble a list of functions of the human body that will influence the expression of this phenotype most, then
         return the sequenced tissues that contain the clearest information on those functions."""),
        dict(role="user", content="Albumin"),
    ],
    model="gpt-4o",
)

# Keep the parsed object attached to the first returned choice.
event = completion.choices[0].message.parsed

In [None]:
event

Tracks(category1=[<tissues1.ENCFF818FXA: 'hepatocyte'>, <tissues1.ENCFF136DBS: 'HepG2'>], category2=[<tissues2.ENCFF136YOJ: 'hepatocyte originated from H9'>, <tissues2.ENCFF775FBE: 'liver female embryo (101 day) and female embryo (113 days)'>], subcategory='Liver function and synthesis of serum proteins, specifically albumin production.')

In [None]:
# Stepwise expert prompt sent to gpt-4o for the "Albumin" phenotype.
# NOTE(review): this request is identical (model and prompt) to the one in the
# preceding request cell, while the markdown that follows compares gpt-4o-mini
# with gpt-4o - presumably one of the two runs was meant to use gpt-4o-mini;
# confirm before relying on that comparison.
completion = client.beta.chat.completions.parse(
    response_format=Tracks,
    messages=[
        dict(role="system", content="""You are an expert in human biology and you are working in a project exploring the human genome. There are two sets of tissue samples that were sequenced,
         their descriptions are names of cell lines or descriptions of the sample. These descriptions are enumerated in the two classes "tissues1" and "tissues2". The team plans to study several phenotypes
         and therefore has to choose the most relevant sequencing tracks. It is important to choose only relevant tracks, since adding irrelevant data will worsen the performance of the method.
         Phenotypes might be shortened to keywords.
         For any provided input, first decide what phenotype is investigated, then assemble a list of functions of the human body that will influence the expression of this phenotype most, then
         return the sequenced tissues that contain the clearest information on those functions."""),
        dict(role="user", content="Albumin"),
    ],
    model="gpt-4o",
)

# Keep the parsed object attached to the first returned choice.
event = completion.choices[0].message.parsed

Notably in the last two experiments, the responses of ``gpt-4o-mini`` and ``gpt-4o`` are the same

Now let's get the list of relevant tissues:

In [52]:
# What a selected Enum member exposes (example values taken from the recorded run).
# Only the final expression is rendered as the cell output.
type(event.category1[0])             # Enum class          -> <enum 'tissues1'>
event.category1[0].value             # sample description  -> 'liver embryo (59 days) and embryo (80 days)'
event.category1[0].name              # track id            -> 'ENCFF068ZBX'
type(event.category1[0].name)        # the id is a plain   -> str
f"b'{event.category1[0].name}'"      # id wrapped as b'..' -> "b'ENCFF068ZBX'"

"b'ENCFF068ZBX'"

In [53]:
# Gather the ids of every selected track (category1 first, then category2),
# each rendered as the text of a bytes literal: b'<id>'.
selected_tracks = [
    f"b'{tissue.name}'"
    for tissue in (*event.category1, *event.category2)
]

In [54]:
selected_tracks

["b'ENCFF068ZBX'",
 "b'ENCFF818FXA'",
 "b'ENCFF136DBS'",
 "b'ENCFF136YOJ'",
 "b'ENCFF462ZLK'"]

Finally, let's try and see if the selected tissues for Major Depressive Disorder match the ones we selected:

In [55]:
# Final check: run the stepwise expert prompt for Major Depressive Disorder so
# the model's picks can be compared with the manually selected brain tracks.
completion = client.beta.chat.completions.parse(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": """You are an expert in human biology and you are working in a project exploring the human genome. There are two sets of tissue samples that were sequenced,
         their descriptions are names of cell lines or descriptions of the sample. These descriptions are enumerated in the two classes "tissues1" and "tissues2". The team plans to study several phenotypes
         and therefore has to choose the most relevant sequencing tracks. It is important to choose only relevant tracks, since adding irrelevant data will worsen the performance of the method.
         Phenotypes might be shortened to keywords.
         For any provided input, first decide what phenotype is investigated, then assemble a list of functions of the human body that will influence the expression of this phenotype most, then
         return the sequenced tissues that contain the clearest information on those functions."""},
        # Phenotype name spelled out in full (the previous text was missing its
        # leading "M", so the model received a misspelled phenotype).
        {"role": "user", "content": "Major Depressive Disorder"},
    ],
    # Constrain the reply to the Tracks schema.
    response_format=Tracks,
)

# Keep the parsed object attached to the first returned choice.
event = completion.choices[0].message.parsed

In [56]:
event

Tracks(category1=[<tissues1.ENCFF110QGM: 'frontal cortex male adult (27 years) and male adult (35 years)'>, <tissues1.ENCFF685MZL: 'frontal cortex female adult (67 years) and female adult (80 years)'>, <tissues1.ENCFF833POA: 'cerebellum male adult (27 years) and male adult (35 years)'>, <tissues1.ENCFF153XQL: 'H1-hESC'>, <tissues1.ENCFF025HHG: 'brain microvascular endothelial cell'>, <tissues1.ENCFF499UDS: 'astrocyte of the hippocampus'>, <tissues1.ENCFF901UBX: 'astrocyte of the spinal cord'>], category2=[<tissues2.ENCFF029ZOX: 'superior temporal gyrus male adult (84 years)'>, <tissues2.ENCFF947HEA: 'brain male embryo (105 days)'>, <tissues2.ENCFF622PZI: 'brain male embryo (104 days)'>, <tissues2.ENCFF503ZTL: 'brain male embryo (101 day)'>, <tissues2.ENCFF468SMI: 'brain male embryo (72 days) and male embryo (76 days)'>, <tissues2.ENCFF353EKI: 'thymus female embryo (105 days)'>, <tissues2.ENCFF359LKO: 'small intestine female embryo (91 day)'>], subcategory='Neurological Function and Men

In [57]:
# Gather the ids of every selected track (category1 first, then category2),
# each rendered as the text of a bytes literal: b'<id>'.
selected_tracks = [
    f"b'{tissue.name}'"
    for tissue in (*event.category1, *event.category2)
]

In [62]:
# Display the ids in sorted order so they line up with the alphabetically
# ordered reference listing that follows. sorted() returns a new list, so the
# output is reproducible on a fresh kernel and re-running the cell never
# mutates `selected_tracks` (list.sort() sorts in place and returns None).
sorted(selected_tracks)

["b'ENCFF025HHG'",
 "b'ENCFF029ZOX'",
 "b'ENCFF110QGM'",
 "b'ENCFF153XQL'",
 "b'ENCFF353EKI'",
 "b'ENCFF359LKO'",
 "b'ENCFF468SMI'",
 "b'ENCFF499UDS'",
 "b'ENCFF503ZTL'",
 "b'ENCFF622PZI'",
 "b'ENCFF685MZL'",
 "b'ENCFF833POA'",
 "b'ENCFF901UBX'",
 "b'ENCFF947HEA'"]

```  b'ENCFF003FBE'	DNASE:brain female embryo (109 days)
= b'ENCFF025HHG'	DNASE:brain microvascular endothelial cell
= b'ENCFF029ZOX'	DNASE:superior temporal gyrus male adult (84 years)
  b'ENCFF064KBT'	DNASE:brain female embryo (96 days)
= b'ENCFF110QGM'	DNASE:frontal cortex male adult (27 years) and male adult (35 years)
  b'ENCFF151SBP'	DNASE:medulla oblongata male adult (78 years) and male adult (84 years)
- b'ENCFF153XQL'    DNASE:H1-hESC
  b'ENCFF210SQU'	DNASE:pons male adult (78 years)
  b'ENCFF225GCE'	DNASE:midbrain male adult (78 years) and male adult (84 years)
  b'ENCFF328BAN'	DNASE:globus pallidus male adult (78 years) and male adult (84 years)
- b'ENCFF353EKI'    DNASE:thymus female embryo (105 days)
  b'ENCFF353QSZ'	DNASE:brain pericyte
- b'ENCFF359LKO'    DNASE:small intestine female embryo (91 day)
  b'ENCFF361UTR'	DNASE:brain male embryo (122 days)
  b'ENCFF378OWK'	DNASE:brain female embryo (142 days)
  b'ENCFF382FZE'	DNASE:astrocyte of the cerebellum
  b'ENCFF460VUW'	DNASE:Ammon's horn male adult (84 years)
= b'ENCFF468SMI'	DNASE:brain male embryo (72 days) and male embryo (76 days)
  b'ENCFF482UKP'	DNASE:brain embryo (80 days)
  b'ENCFF498QFP'	DNASE:caudate nucleus male adult (78 years)
= b'ENCFF499UDS'	DNASE:astrocyte of the hippocampus
= b'ENCFF503ZTL'	DNASE:brain male embryo (101 day)
  b'ENCFF536FNG'	DNASE:brain female embryo (117 days)
  b'ENCFF564DYF'	DNASE:cerebellar cortex male adult (78 years) and male adult (84 years)
  b'ENCFF568EAH'	DNASE:occipital lobe male adult (84 years)
= b'ENCFF622PZI'	DNASE:brain male embryo (104 days)
  b'ENCFF641HVS'	DNASE:brain embryo (56 days) and male embryo (58 days)
  b'ENCFF643ARA'	DNASE:brain female embryo (105 days)
  b'ENCFF657KLL'	DNASE:brain embryo (112 days)
  b'ENCFF681JFJ'	DNASE:putamen male adult (78 years)
= b'ENCFF685MZL'	DNASE:frontal cortex female adult (67 years) and female adult (80 years)
  b'ENCFF735MKP'	DNASE:brain female embryo (85 days)
  b'ENCFF751LHL'	DNASE:middle frontal gyrus male adult (78 years)
= b'ENCFF833POA'	DNASE:cerebellum male adult (27 years) and male adult (35 years)
  b'ENCFF848BGL'	DNASE:inferior parietal cortex male adult (84 years)
- b'ENCFF901UBX'    DNASE:astrocyte of the spinal cord
= b'ENCFF947HEA'	DNASE:brain male embryo (105 days)```