# Goal

* Expand tissue annotations beyond CellxGene's tissue categories
  * e.g., include plant tissues
    * for plants, using categories from the [scPlantDB](https://biobigdata.nju.edu.cn/scplantdb/dataset)

In [1]:
from dotenv import load_dotenv
load_dotenv(override=True)

True

In [94]:
import os
import pandas as pd
from pypika import Query, Table, Field, Column, Criterion

In [None]:
from SRAgent.db.connect import db_connect
from SRAgent.db.utils import db_list_tables, db_glimpse_tables, db_get_table, execute_query
from SRAgent.db.get import db_find_srx

In [4]:
# set to prod database
os.environ['DYNACONF'] = 'prod'

In [None]:
# list database tables
with db_connect() as conn:
    print("\n".join(db_list_tables(conn)))

screcounter_star_results
eval
scbasecamp_metadata
screcounter_trace
srx_srr
srx_metadata
screcounter_log
scbasecamp_metadata_tmp
screcounter_star_params


# Current tissue categories

In [6]:
tissue_cat_file = "data/2025-02-20_tissue_categories.csv"
tissue_cat = pd.read_csv(tissue_cat_file)
print(tissue_cat.shape)
tissue_cat.head()

(3747, 2)


Unnamed: 0,tissue,category
0,2 layer spheroid,other
1,"3 layer spheroid channel 1,epithelial",other
2,3 layer spheroid channel 2,other
3,3D healthy skin model,skin of body
4,A549 cells,lung


# Organisms

In [7]:
tbl = Table("srx_metadata")
stmt = Query \
    .from_(tbl) \
    .select(tbl.organism) \
    .distinct()

with db_connect() as conn:
    orgs = [str(x) for x in pd.read_sql(str(stmt), conn)["organism"].tolist()]
print("\n".join(sorted(orgs)))

Anopheles gambiae
Arabidopsis thaliana
Bos taurus
Caenorhabditis elegans
Callithrix jacchus
Canis lupus
Danio rerio
Drosophila melanogaster
Equus caballus
Gallus gallus
Gorilla gorilla
Heterocephalus glaber
Homo sapiens
Macaca mulatta
Mus musculus
None
Oryctolagus cuniculus
Oryza sativa
Ovis aries
Pan troglodytes
Rattus norvegicus
Schistosoma mansoni
Solanum lycopersicum
Sus scrofa
Xenopus tropicalis
Zea mays
metagenome
other


# Plant tissue annotations

In [8]:
plants = ["Arabidopsis thaliana", "Oryza sativa", "Solanum lycopersicum", "Zea mays"]

## SRX metadata

In [9]:
# list all records for a feature in scbasecamp_metadata
tbl = Table("srx_metadata")
stmt = Query \
    .from_(tbl) \
    .select(tbl.organism, tbl.tissue) \
    .distinct() \
    .where(tbl.organism.isin(plants))

with db_connect() as conn:
    df = pd.read_sql(str(stmt), conn)
print(df.shape)
df.head()

(98, 2)


Unnamed: 0,organism,tissue
0,Arabidopsis thaliana,12-day-old seedlings
1,Arabidopsis thaliana,Arabidopsis root cells
2,Arabidopsis thaliana,Arabidopsis Root Protoplasts
3,Arabidopsis thaliana,callus
4,Arabidopsis thaliana,Callus


In [10]:
# merge df with tissue_cat on tissue
df_j = df.merge(tissue_cat, on="tissue", how="left")
df_j

Unnamed: 0,organism,tissue,category
0,Arabidopsis thaliana,12-day-old seedlings,other
1,Arabidopsis thaliana,Arabidopsis root cells,other
2,Arabidopsis thaliana,Arabidopsis Root Protoplasts,other
3,Arabidopsis thaliana,callus,
4,Arabidopsis thaliana,Callus,
...,...,...,...
93,Zea mays,shoot apex (SAM+P6),other
94,Zea mays,Shoot apex (SAM+P6),other
95,Zea mays,Tassel FM-S2,
96,Zea mays,Tassel FM-S3,


In [11]:
# summarize the categories
df_j["category"].value_counts()

category
other                 67
respiratory system     1
embryo                 1
Name: count, dtype: int64

In [12]:
# pull out non-other
df_j[(df_j["category"] != "other") & (df_j["category"].notnull())]

Unnamed: 0,organism,tissue,category
12,Arabidopsis thaliana,HAE+ FACS,respiratory system
75,Zea mays,ear primordia,embryo


In [13]:
# pull out non-other
tbl = Table("srx_metadata")
stmt = Query \
    .from_(tbl) \
    .select(tbl.organism, tbl.tissue) \
    .distinct() \
    .where(tbl.tissue.isin(["HAE+ FACS", "ear primordia"]))
    

with db_connect() as conn:
    df_check = pd.read_sql(str(stmt), conn)
df_check

Unnamed: 0,organism,tissue
0,Arabidopsis thaliana,HAE+ FACS
1,Zea mays,ear primordia


In [14]:
# list all tissues
print("\n".join([f"'{x}'" for x in sorted(df["tissue"].unique().tolist())]))

'12 day seedling'
'12-day-old seedlings'
'Adventitious root'
'Arabidopsis Root Protoplasts'
'Arabidopsis root cells'
'Callus'
'Cotyledon'
'Ear FM-S2'
'Ear FM-S3'
'Ear primordia'
'Fruit'
'Germinating seeds'
'HAE+ FACS'
'Hypocotyl'
'Imbibed seeds'
'Kernel'
'Kernel, filling stage (18 days after pollination)'
'Leaf'
'Leaf base'
'Leaf primodium'
'Leaf primordia'
'Leaves'
'Mixed-stage male gametophytes'
'Pistil'
'Pollen'
'Rice root cells'
'Root'
'Root meristem'
'Root tip'
'Root tips'
'Roots'
'Rosette'
'Rosette (21-day-old)'
'Rosette (30 days old)'
'Rosette, 21 days old'
'Rosette, Age: 30 days old'
'SAM (Shoot Apical Meristem), plastochrons 1-6'
'SAM, plastochrons 1 - 6'
'Seedling'
'Seedling (12 days old)'
'Seedling, 7 days old'
'Shoot apex (SAM+P6)'
'Silique'
'Stem'
'Tassel FM-S2'
'Tassel FM-S3'
'Whole root'
'Whole silique'
'callus'
'cotyledon'
'ear primordia'
'endosperm'
'floral receptacle'
'flower abscission zone'
'fruit'
'germinating seeds'
'germinating seeds (1.25 days old)'
'hypocotyl'


In [None]:
# tissue categories applied to plant tissues via o1 model
plant_tissue_cat_file = "data/2025-03-10_plant_tissue_cat.csv"
plant_tissue_cat = pd.read_csv(plant_tissue_cat_file)
plant_tissue_cat

Unnamed: 0,tissue,category
0,12 day seedling,Seedling
1,12-day-old seedlings,Seedling
2,Adventitious root,Root
3,Arabidopsis Root Protoplasts,Protoplast
4,Arabidopsis root cells,Root
...,...,...
81,unsure,other
82,whole flowers,Flower
83,whole root,Root
84,whole root tip,Root


In [16]:
# merge 
tissue_cat_j = tissue_cat.merge(plant_tissue_cat, on="tissue", how="left")

# if `category_y` is not null, use it, otherwise use `category_x`
tissue_cat_j["category"] = tissue_cat_j["category_y"].combine_first(tissue_cat_j["category_x"]).str.replace('"', "").str.strip().str.lower().str.split(";").str[0]

pd.set_option('display.max_rows', 10)
tissue_cat_j

Unnamed: 0,tissue,category_x,category_y,category
0,2 layer spheroid,other,,other
1,"3 layer spheroid channel 1,epithelial",other,,other
2,3 layer spheroid channel 2,other,,other
3,3D healthy skin model,skin of body,,skin of body
4,A549 cells,lung,,lung
...,...,...,...,...
3742,zebrafish embryo,embryo,,embryo
3743,zebrafish larva,other,,other
3744,zebrafish trunks at yolk extension,embryo,,embryo
3745,ileal,small intestine,,small intestine


In [19]:
# show all rows
pd.set_option('display.max_rows', None)
tissue_cat_j["category"].value_counts().to_frame().reset_index()

Unnamed: 0,category,count
0,brain,628
1,other,391
2,embryo,173
3,blood,159
4,immune system,157
5,bone marrow,143
6,skin of body,142
7,heart,125
8,skeletal system,123
9,vasculature,95


In [20]:
# reset display
pd.set_option('display.max_rows', 10)

In [92]:
# write out tissue_cat_j to file
tissue_cat_j.drop(columns=["category_x", "category_y"]).to_csv("data/2025-03-10_tissue_categories.csv", index=False)

### Tissue summary

In [22]:
tbl = Table("srx_metadata")
stmt = Query \
    .from_(tbl) \
    .select(tbl.organism, tbl.tissue) \
    .distinct()

with db_connect() as conn:
    df_srx_meta = pd.read_sql(str(stmt), conn)
df_srx_meta

Unnamed: 0,organism,tissue
0,,
1,Mus musculus,caudal tissues from E8.5 embryos
2,Danio rerio,hepatized intestine
3,Homo sapiens,iPSC-derived neural cultures
4,Bos taurus,macrophages
...,...,...
6386,Homo sapiens,PBMC Healthy control (T50)
6387,Homo sapiens,parahippocampus
6388,Mus musculus,urethra
6389,Gallus gallus,Basilar papillae (auditory epithelium)


In [25]:
df_srx_meta = df_srx_meta.merge(
    tissue_cat_j.drop(columns=["category_x", "category_y"]),
    on="tissue", how="left"
)
df_srx_meta

Unnamed: 0,organism,tissue,category
0,,,other
1,Mus musculus,caudal tissues from E8.5 embryos,
2,Danio rerio,hepatized intestine,intestine
3,Homo sapiens,iPSC-derived neural cultures,
4,Bos taurus,macrophages,immune system
...,...,...,...
6386,Homo sapiens,PBMC Healthy control (T50),blood
6387,Homo sapiens,parahippocampus,brain
6388,Mus musculus,urethra,
6389,Gallus gallus,Basilar papillae (auditory epithelium),


In [31]:
# filter to NaN
pd.set_option('display.max_rows', None)
df_srx_meta[df_srx_meta["category"].isnull()]["organism"].value_counts()

organism
Homo sapiens               826
Mus musculus               717
Rattus norvegicus          106
other                       85
Danio rerio                 32
Drosophila melanogaster     32
metagenome                  31
Gallus gallus               28
Callithrix jacchus          27
Macaca mulatta              25
Sus scrofa                  18
Pan troglodytes             17
Canis lupus                 16
Zea mays                    12
Arabidopsis thaliana        10
Ovis aries                   9
Bos taurus                   7
Solanum lycopersicum         5
Equus caballus               5
Schistosoma mansoni          4
Oryza sativa                 2
Caenorhabditis elegans       2
Oryctolagus cuniculus        2
Heterocephalus glaber        1
Name: count, dtype: int64

In [38]:
# just mammals
pd.set_option('display.max_rows', 10)
mammals = [
    "Homo sapiens",
    "Mus musculus",
    "Rattus norvegicus",
    "Callithrix jacchus",
    "Macaca mulatta",
    "Sus scrofa",
    "Pan troglodytes",
    "Canis lupus",
    "Ovis aries",
    "Bos taurus",
    "Equus caballus",
    "Oryctolagus cuniculus",
    "Heterocephalus glaber"
]

df_srx_meta_mam = df_srx_meta[(df_srx_meta["category"].isnull()) & (df_srx_meta["organism"].isin(mammals))]
df_srx_meta_mam

Unnamed: 0,organism,tissue,category
1,Mus musculus,caudal tissues from E8.5 embryos,
3,Homo sapiens,iPSC-derived neural cultures,
7,Homo sapiens,"bronchus, lung",
14,Homo sapiens,ovarian tissue,
15,Homo sapiens,cardiomyocytes,
...,...,...,...
6374,Mus musculus,ventricular cardiac tissue,
6383,Mus musculus,coronal suture,
6384,Mus musculus,lung resident immune cells,
6388,Mus musculus,urethra,


In [43]:
print("\n".join(sorted(df_srx_meta_mam["tissue"].unique().tolist())))

1st molar tooth
2-cell embryo
2nd molar tooth
3D culture
3D culture in RGF BME
8990_TBR1_S9_L003
8th section of the small intestine
ALI culture of tracheal aspirate derived airway basal stem cells
Abdominal adhesion tissue
Acute slice culture of glioma resection
Adipose Tissue
Adrenal-Gland
Adult Ovary Tissue
Adult Ovary Tissue, Follicle 2-5mm
Adult Ovary Tissue, Stroma
Adult human heart
Adult midbrain
Adult mouse ILC progenitors from femur
Adult mouse ILC progenitors harvested from femur
Amnion
Amygdala
Aorta
Aortic root
Apex of the heart
Apical region of left ventricle
Arcuate-Median Eminence
Area postrema and nucleus tractus solitarius
Ascitic fluid
Atrioventricular node
Atrioventricular node, left cardiac atrium
Auditory (AUD)
Auditory Cortex
B-cell
B-cells
B-cells (CVID naive B-cell)
B-lymphocyte
B-lymphocyte, Peripheral Blood
B16-OVA tumor
BALF cells (Bronchoalveolar Lavage Fluid)
BALF cells (Bronchoalveolar lavage fluid)
BALF cells (bronchoalveolar lavage fluid cells)
BLA (Basol

### Updated categories

In [64]:
pd.set_option('display.max_rows', 10)
df_updated_cats = pd.read_csv("./data/2025-03-11_tissue_categories.csv")
df_updated_cats

Unnamed: 0,tissue,category
0,2 layer spheroid,other
1,"3 layer spheroid channel 1,epithelial",other
2,3 layer spheroid channel 2,other
3,3D healthy skin model,skin of body
4,A549 cells,lung
...,...,...
5613,"whole animal, developmental stages: 26, 28, 30...",other
5614,whole skin tissue,skin of body
5615,wing,other
5616,zebrafish embryos,embryo


In [65]:
# reload metadata
tbl = Table("srx_metadata")
stmt = Query \
    .from_(tbl) \
    .select(tbl.organism, tbl.tissue) \
    .distinct()

with db_connect() as conn:
    df_srx_meta = pd.read_sql(str(stmt), conn)
df_srx_meta

Unnamed: 0,organism,tissue
0,,
1,Mus musculus,caudal tissues from E8.5 embryos
2,Danio rerio,hepatized intestine
3,Homo sapiens,iPSC-derived neural cultures
4,Bos taurus,macrophages
...,...,...
6386,Homo sapiens,PBMC Healthy control (T50)
6387,Homo sapiens,parahippocampus
6388,Mus musculus,urethra
6389,Gallus gallus,Basilar papillae (auditory epithelium)


In [66]:
# merge on tissue
df_srx_meta = df_srx_meta.merge(df_updated_cats, on="tissue", how="left")
df_srx_meta

Unnamed: 0,organism,tissue,category
0,,,other
1,,,other
2,,,other
3,Mus musculus,caudal tissues from E8.5 embryos,embryo
4,Danio rerio,hepatized intestine,intestine
...,...,...,...
6388,Homo sapiens,PBMC Healthy control (T50),blood
6389,Homo sapiens,parahippocampus,brain
6390,Mus musculus,urethra,other
6391,Gallus gallus,Basilar papillae (auditory epithelium),sensory system


In [67]:
# filter to NaN
pd.set_option('display.max_rows', None)
df_srx_meta[df_srx_meta["category"].isnull()]["organism"].value_counts()

organism
other           78
metagenome      28
Homo sapiens     1
Name: count, dtype: int64

In [None]:
# tissues to categorize
x = sorted(df_srx_meta[(df_srx_meta["category"].isnull()) & (~df_srx_meta["organism"].isin(["other", "metagenome"]))]["tissue"].unique())
print('\n'.join(x))

N/A


# sessionInfo

In [96]:
!mamba list

# packages in environment at /home/nickyoungblut/miniforge3/envs/SRAgent:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
aiohappyeyeballs          2.4.3                    pypi_0    pypi
aiohttp                   3.10.10                  pypi_0    pypi
aiosignal                 1.3.1                    pypi_0    pypi
annotated-types           0.7.0                    pypi_0    pypi
anyio                     4.6.2.post1              pypi_0    pypi
asttokens                 2.4.1              pyhd8ed1ab_0    conda-forge
attrs                     24.2.0                   pypi_0    pypi
beautifulsoup4            4.12.3                   pypi_0    pypi
biopython                 1.84                     pypi_0    pypi
build                     1.2.2.post1              pypi_0    pypi
bzip2                     1.0.8             