# Goal

* Expand tissue annotations beyond CellxGene's tissue categories
  * e.g., include plant tissues
    * for plants, using categories from the [scPlantDB](https://biobigdata.nju.edu.cn/scplantdb/dataset)

In [1]:
from dotenv import load_dotenv
load_dotenv(override=True)

True

In [2]:
import os
from pathlib import Path
import pandas as pd
from pypika import Query, Table, Field, Column, Criterion, functions as fn

In [3]:
from SRAgent.db.connect import db_connect
from SRAgent.db.utils import db_list_tables, db_glimpse_tables, db_get_table, execute_query
from SRAgent.db.get import db_find_srx

In [4]:
# set to prod database
os.environ['DYNACONF'] = 'prod'

In [5]:
# get base of github repo
base_dir = !git rev-parse --show-toplevel
base_dir = Path(base_dir[0])

In [6]:
# list database tables
with db_connect() as conn:
    print("\n".join(db_list_tables(conn)))

screcounter_star_results
eval
scbasecamp_metadata
screcounter_trace
srx_srr
srx_metadata
screcounter_log
scbasecamp_metadata_tmp
screcounter_star_params


# Current tissue categories

In [7]:
#tissue_cat_file = "data/2025-02-20_tissue_categories.csv"
tissue_cat_file = base_dir / 'data' / 'tissues' / '2025-03-11_tissue_categories.csv.gz'
tissue_cat = pd.read_csv(tissue_cat_file)
print(tissue_cat.shape)
tissue_cat.head()

(5618, 2)


Unnamed: 0,tissue,category
0,2 layer spheroid,other
1,"3 layer spheroid channel 1,epithelial",other
2,3 layer spheroid channel 2,other
3,3D healthy skin model,skin of body
4,A549 cells,lung


# Organisms

In [8]:
tbl = Table("srx_metadata")
stmt = Query \
    .from_(tbl) \
    .select(tbl.organism) \
    .distinct()

with db_connect() as conn:
    orgs = [str(x) for x in pd.read_sql(str(stmt), conn)["organism"].tolist()]
print("\n".join(sorted(orgs)))

Ambystoma mexicanum
Anas platyrhynchos
Anopheles gambiae
Anser cygnoides
Arabidopsis thaliana
Bos taurus
Caenorhabditis elegans
Callithrix jacchus
Canis lupus
Capra hircus
Cavia porcellus
Chlorocebus aethiops
Danio rerio
Drosophila melanogaster
Equus caballus
Erinaceus europaeus
Felis catus
Gallus gallus
Gasterosteus aculeatus
Gorilla gorilla
Heterocephalus glaber
Homo sapiens
Macaca mulatta
Mesocricetus auratus
Monodelphis domestica
Mus musculus
Mustela putorius
None
Ornithorhynchus anatinus
Oryctolagus cuniculus
Oryza sativa
Ovis aries
Pan paniscus
Pan troglodytes
Rattus norvegicus
Salmo salar
Schistosoma mansoni
Solanum lycopersicum
Sus scrofa
Taeniopygia guttata
Trachemys scripta
Tupaia belangeri
Vicugna pacos
Xenopus tropicalis
Zea mays
metagenome
other


# Plant tissue annotations

In [9]:
plants = ["Arabidopsis thaliana", "Oryza sativa", "Solanum lycopersicum", "Zea mays"]

## SRX metadata

In [10]:
# list all records for a feature in scbasecamp_metadata
tbl = Table("srx_metadata")
stmt = Query \
    .from_(tbl) \
    .select(tbl.organism, tbl.tissue) \
    .distinct() \
    .where(tbl.organism.isin(plants))

with db_connect() as conn:
    df = pd.read_sql(str(stmt), conn)
print(df.shape)
df.head()

(682, 2)


Unnamed: 0,organism,tissue
0,Arabidopsis thaliana,10-day-old seedling
1,Arabidopsis thaliana,12-day-old seedlings
2,Arabidopsis thaliana,"12-day-old seedlings (callus), callus cells"
3,Arabidopsis thaliana,12-day-old seedlings (whole plant)
4,Arabidopsis thaliana,14-day-old seedlings


In [11]:
# merge df with tissue_cat on tissue
df_j = df.merge(tissue_cat, on="tissue", how="left")
df_j

Unnamed: 0,organism,tissue,category
0,Arabidopsis thaliana,10-day-old seedling,
1,Arabidopsis thaliana,12-day-old seedlings,seedling
2,Arabidopsis thaliana,"12-day-old seedlings (callus), callus cells",
3,Arabidopsis thaliana,12-day-old seedlings (whole plant),
4,Arabidopsis thaliana,14-day-old seedlings,
...,...,...,...
677,Zea mays,Tassel FM-S3,flower
678,Zea mays,unsure,other
679,Zea mays,"V2 seedling leaf 2, distal 6cm",
680,Zea mays,whole seedlings,


In [12]:
# summarize the categories
df_j["category"].value_counts()

category
root              28
leaf              26
other             19
flower            16
seedling          12
seed              10
shoot apex         4
callus             4
sensory system     4
placenta           2
protoplast         1
ovary              1
Name: count, dtype: int64

In [13]:
# pull out non-other plant tissues
df_j[(df_j["category"] != "other") & (df_j["category"].notnull())]

Unnamed: 0,organism,tissue,category
1,Arabidopsis thaliana,12-day-old seedlings,seedling
16,Arabidopsis thaliana,Arabidopsis root cells,root
17,Arabidopsis thaliana,Arabidopsis Root Protoplasts,protoplast
18,Arabidopsis thaliana,callus,callus
19,Arabidopsis thaliana,Callus,callus
...,...,...,...
663,Zea mays,Seedling,seedling
672,Zea mays,shoot apex (SAM+P6),shoot apex
673,Zea mays,Shoot apex (SAM+P6),shoot apex
676,Zea mays,Tassel FM-S2,flower


In [14]:
# pull out non-other
tbl = Table("srx_metadata")
stmt = Query \
    .from_(tbl) \
    .select(tbl.organism, tbl.tissue) \
    .distinct() \
    .where(tbl.tissue.isin(["HAE+ FACS", "ear primordia"]))
    

with db_connect() as conn:
    df_check = pd.read_sql(str(stmt), conn)
df_check

Unnamed: 0,organism,tissue
0,Arabidopsis thaliana,HAE+ FACS
1,Zea mays,ear primordia


In [15]:
# list all tissues
print("\n".join([f" - '{x}'" for x in sorted(df["tissue"].unique().tolist())]))

 - '10-day-old seedling'
 - '10DAP endosperm'
 - '12 day seedling'
 - '12-day-old seedlings'
 - '12-day-old seedlings (callus), callus cells'
 - '12-day-old seedlings (whole plant)'
 - '14-day-old seedlings'
 - '2-week-old seedling'
 - '2-week-old seedlings'
 - '2cm root tips'
 - '4-day-old seedlings'
 - '4-day-old whole seedlings'
 - '5-day seedling shoot'
 - '5mm root tip'
 - '7-d-old continuous light grown seedlings'
 - '7-day-old continuous light grown seedlings'
 - '7-day-old rice seedling'
 - '7-day-old rice seedlings'
 - '7-day-old rice seedlings undergoing de-etiolation'
 - '7-day-old rice seedlings undergoing deetiolation'
 - '7-day-old seedlings'
 - 'Adventitious root'
 - 'Aerial tissue (protoplasts extracted from the whole aerial tissue)'
 - 'Aerial tissue of 5-day-old Arabidopsis seedlings'
 - 'Arabidopsis Root Protoplasts'
 - 'Arabidopsis root cells'
 - 'Axillary bud'
 - 'B73, seedling, fresh'
 - 'Callus'
 - 'Cotyledon'
 - 'Detached Arabidopsis leaves'
 - 'Ear'
 - 'Ear FM-

In [9]:
# tissue categories added to existing tissue categories
infile = base_dir / "data" / "tissues" / "2025-06-30_tissue_categories.csv"
pd.read_csv(infile).groupby("category").count().sort_values("tissue", ascending=False)

Unnamed: 0_level_0,tissue
category,Unnamed: 1_level_1
brain,2210
other,1550
blood,1279
lung,832
embryo,668
...,...
axilla,4
paracolic gutter,3
ureter,3
neck,2


# Animal tissue categories 

In [27]:
#tissue_cat_file = "data/2025-02-20_tissue_categories.csv"
tissue_cat_file = base_dir / 'data' / 'tissues' / '2025-06-30_tissue_categories.csv'
tissue_cat = pd.read_csv(tissue_cat_file)
print(tissue_cat.shape)
tissue_cat.head()

(6250, 2)


Unnamed: 0,tissue,category
0,2 layer spheroid,other
1,"3 layer spheroid channel 1,epithelial",other
2,3 layer spheroid channel 2,other
3,3D healthy skin model,skin of body
4,A549 cells,lung


In [22]:
plants = ["Arabidopsis thaliana", "Oryza sativa", "Solanum lycopersicum", "Zea mays"]

In [30]:
# list all records for a feature in scbasecamp_metadata, in which STAR results are not available; excluding plants
meta = Table("srx_metadata")
star = Table("screcounter_star_results")

star_query = (
    Query.from_(star)
    .select('*')
)

stmt = Query \
    .from_(star) \
    .join(meta) \
    .on(star.sample == meta.srx_accession) \
    .select(meta.tissue) \
    .distinct() \
    .where(~meta.organism.isin(plants))

with db_connect() as conn:
    df = pd.read_sql(str(stmt), conn)
print(df.shape)
df.head()

(13113, 1)


Unnamed: 0,tissue
0,bone marrow (hematopoietic progenitors: Lin- c...
1,lamina propria CD4+ T cells
2,ventricular myocardium (postnatal day 4 mouse ...
3,"placenta (deep layer, maternal-fetal interface)"
4,dentate gyrus and subventricular zone of the b...


In [48]:
# join on tissue categories
# merge df with tissue_cat on tissue
df_j = df.merge(tissue_cat, on="tissue", how="left")
df_j

Unnamed: 0,tissue,category
0,bone marrow (hematopoietic progenitors: Lin- c...,
1,lamina propria CD4+ T cells,
2,ventricular myocardium (postnatal day 4 mouse ...,
3,"placenta (deep layer, maternal-fetal interface)",
4,dentate gyrus and subventricular zone of the b...,brain
...,...,...
13112,FNA,other
13113,"skin epithelium (epidermis, P2 back skin)",
13114,transplanted human intestinal organoids (tHIO),intestine
13115,Esophageal adenocarcinoma tumor microenvironment,


In [49]:
# filter to nan category values
df_j = df_j[df_j["category"].isna()].sort_values("tissue")
df_j

Unnamed: 0,tissue,category
5653,17.1 Wk human fetal kidney,
3784,21-day human developmental cell aggregates (HD...,
9070,24-hour post fertilization zebrafish embryos,
363,26-somite stage ocular area (lens placode region),
10674,2D Placental pericyte,
...,...,...
1821,zebrafish explants,
526,"zebrafish heads, cranial neural crest-derived ...",
3722,zebrafish larval intestine,
5490,zebrafish larval intestines,


In [58]:
9404 / 200

47.02

In [103]:
# list all tissues in batches
def list_tissue_batch(df, batch_num=0, batch_size=200):
    batch = df.iloc[batch_size * batch_num:batch_size * (batch_num + 1)]
    print("\n".join([f" - '{x}'" for x in sorted(batch["tissue"].unique().tolist())]))

list_tissue_batch(df_j, 47)

 - 'zebrafish heads, cranial neural crest-derived cells'
 - 'zebrafish larval intestine'
 - 'zebrafish larval intestines'
 - 'zebrafish marrow'


# Checking overall tissue categories

In [12]:
# tissue categories added to existing tissue categories
infile = base_dir / "data" / "tissues" / "2025-06-30_tissue_categories.csv"
df_tissue_cats = pd.read_csv(infile)
df_tissue_cats

Unnamed: 0,tissue,category
0,2 layer spheroid,other
1,"3 layer spheroid channel 1,epithelial",other
2,3 layer spheroid channel 2,other
3,3D healthy skin model,skin of body
4,A549 cells,lung
...,...,...
15640,zebrafish explants,other
15641,"zebrafish heads, cranial neural crest-derived ...",head
15642,zebrafish larval intestine,intestine
15643,zebrafish larval intestines,intestine


In [13]:
# group by category and count
df_tissue_cats.groupby("category").count().sort_values("tissue", ascending=False)

Unnamed: 0_level_0,tissue
category,Unnamed: 1_level_1
brain,2210
other,1550
blood,1279
lung,832
embryo,668
...,...
axilla,4
paracolic gutter,3
ureter,3
neck,2


In [14]:
# remove duplicate tissues  
df_tissue_cats.drop_duplicates(subset="tissue", inplace=True)
df_tissue_cats

Unnamed: 0,tissue,category
0,2 layer spheroid,other
1,"3 layer spheroid channel 1,epithelial",other
2,3 layer spheroid channel 2,other
3,3D healthy skin model,skin of body
4,A549 cells,lung
...,...,...
15640,zebrafish explants,other
15641,"zebrafish heads, cranial neural crest-derived ...",head
15642,zebrafish larval intestine,intestine
15643,zebrafish larval intestines,intestine


In [15]:
# write the results
outfile = base_dir / "data" / "tissues" / "2025-06-30_tissue_categories.csv.gz"
df_tissue_cats.to_csv(outfile, index=False, compression="gzip")

# OLD

### Tissue summary

In [21]:
tbl = Table("srx_metadata")
stmt = Query \
    .from_(tbl) \
    .select(tbl.organism, tbl.tissue) \
    .distinct()

with db_connect() as conn:
    df_srx_meta = pd.read_sql(str(stmt), conn)
df_srx_meta

Unnamed: 0,organism,tissue
0,Homo sapiens,Fecal matter
1,,
2,Mus musculus,cortex (postnatal day 5)
3,Homo sapiens,iPSC-derived pancreatic duct-like organoids (P...
4,Mus musculus,forebrain microglia
...,...,...
45675,Canis lupus,bone marrow (CD34+ enriched)
45676,Mus musculus,"murine tracheal epithelial cells (mTECs), air–..."
45677,Homo sapiens,acute myeloid leukemia patient sample
45678,Homo sapiens,CD8+ T lymphocytes (from tumor co-culture)


In [25]:
df_srx_meta = df_srx_meta.merge(
    tissue_cat_j.drop(columns=["category_x", "category_y"]),
    on="tissue", how="left"
)
df_srx_meta

Unnamed: 0,organism,tissue,category
0,,,other
1,Mus musculus,caudal tissues from E8.5 embryos,
2,Danio rerio,hepatized intestine,intestine
3,Homo sapiens,iPSC-derived neural cultures,
4,Bos taurus,macrophages,immune system
...,...,...,...
6386,Homo sapiens,PBMC Healthy control (T50),blood
6387,Homo sapiens,parahippocampus,brain
6388,Mus musculus,urethra,
6389,Gallus gallus,Basilar papillae (auditory epithelium),


In [31]:
# filter to NaN
pd.set_option('display.max_rows', None)
df_srx_meta[df_srx_meta["category"].isnull()]["organism"].value_counts()

organism
Homo sapiens               826
Mus musculus               717
Rattus norvegicus          106
other                       85
Danio rerio                 32
Drosophila melanogaster     32
metagenome                  31
Gallus gallus               28
Callithrix jacchus          27
Macaca mulatta              25
Sus scrofa                  18
Pan troglodytes             17
Canis lupus                 16
Zea mays                    12
Arabidopsis thaliana        10
Ovis aries                   9
Bos taurus                   7
Solanum lycopersicum         5
Equus caballus               5
Schistosoma mansoni          4
Oryza sativa                 2
Caenorhabditis elegans       2
Oryctolagus cuniculus        2
Heterocephalus glaber        1
Name: count, dtype: int64

In [38]:
# just mammals
pd.set_option('display.max_rows', 10)
mammals = [
    "Homo sapiens",
    "Mus musculus",
    "Rattus norvegicus",
    "Callithrix jacchus",
    "Macaca mulatta",
    "Sus scrofa",
    "Pan troglodytes",
    "Canis lupus",
    "Ovis aries",
    "Bos taurus",
    "Equus caballus",
    "Oryctolagus cuniculus",
    "Heterocephalus glaber"
]

df_srx_meta_mam = df_srx_meta[(df_srx_meta["category"].isnull()) & (df_srx_meta["organism"].isin(mammals))]
df_srx_meta_mam

Unnamed: 0,organism,tissue,category
1,Mus musculus,caudal tissues from E8.5 embryos,
3,Homo sapiens,iPSC-derived neural cultures,
7,Homo sapiens,"bronchus, lung",
14,Homo sapiens,ovarian tissue,
15,Homo sapiens,cardiomyocytes,
...,...,...,...
6374,Mus musculus,ventricular cardiac tissue,
6383,Mus musculus,coronal suture,
6384,Mus musculus,lung resident immune cells,
6388,Mus musculus,urethra,


In [43]:
print("\n".join(sorted(df_srx_meta_mam["tissue"].unique().tolist())))

1st molar tooth
2-cell embryo
2nd molar tooth
3D culture
3D culture in RGF BME
8990_TBR1_S9_L003
8th section of the small intestine
ALI culture of tracheal aspirate derived airway basal stem cells
Abdominal adhesion tissue
Acute slice culture of glioma resection
Adipose Tissue
Adrenal-Gland
Adult Ovary Tissue
Adult Ovary Tissue, Follicle 2-5mm
Adult Ovary Tissue, Stroma
Adult human heart
Adult midbrain
Adult mouse ILC progenitors from femur
Adult mouse ILC progenitors harvested from femur
Amnion
Amygdala
Aorta
Aortic root
Apex of the heart
Apical region of left ventricle
Arcuate-Median Eminence
Area postrema and nucleus tractus solitarius
Ascitic fluid
Atrioventricular node
Atrioventricular node, left cardiac atrium
Auditory (AUD)
Auditory Cortex
B-cell
B-cells
B-cells (CVID naive B-cell)
B-lymphocyte
B-lymphocyte, Peripheral Blood
B16-OVA tumor
BALF cells (Bronchoalveolar Lavage Fluid)
BALF cells (Bronchoalveolar lavage fluid)
BALF cells (bronchoalveolar lavage fluid cells)
BLA (Basol

### Updated categories

In [64]:
pd.set_option('display.max_rows', 10)
df_updated_cats = pd.read_csv("./data/2025-03-11_tissue_categories.csv")
df_updated_cats

Unnamed: 0,tissue,category
0,2 layer spheroid,other
1,"3 layer spheroid channel 1,epithelial",other
2,3 layer spheroid channel 2,other
3,3D healthy skin model,skin of body
4,A549 cells,lung
...,...,...
5613,"whole animal, developmental stages: 26, 28, 30...",other
5614,whole skin tissue,skin of body
5615,wing,other
5616,zebrafish embryos,embryo


In [65]:
# reload metadata
tbl = Table("srx_metadata")
stmt = Query \
    .from_(tbl) \
    .select(tbl.organism, tbl.tissue) \
    .distinct()

with db_connect() as conn:
    df_srx_meta = pd.read_sql(str(stmt), conn)
df_srx_meta

Unnamed: 0,organism,tissue
0,,
1,Mus musculus,caudal tissues from E8.5 embryos
2,Danio rerio,hepatized intestine
3,Homo sapiens,iPSC-derived neural cultures
4,Bos taurus,macrophages
...,...,...
6386,Homo sapiens,PBMC Healthy control (T50)
6387,Homo sapiens,parahippocampus
6388,Mus musculus,urethra
6389,Gallus gallus,Basilar papillae (auditory epithelium)


In [66]:
# merge on tissue
df_srx_meta = df_srx_meta.merge(df_updated_cats, on="tissue", how="left")
df_srx_meta

Unnamed: 0,organism,tissue,category
0,,,other
1,,,other
2,,,other
3,Mus musculus,caudal tissues from E8.5 embryos,embryo
4,Danio rerio,hepatized intestine,intestine
...,...,...,...
6388,Homo sapiens,PBMC Healthy control (T50),blood
6389,Homo sapiens,parahippocampus,brain
6390,Mus musculus,urethra,other
6391,Gallus gallus,Basilar papillae (auditory epithelium),sensory system


In [67]:
# filter to NaN
pd.set_option('display.max_rows', None)
df_srx_meta[df_srx_meta["category"].isnull()]["organism"].value_counts()

organism
other           78
metagenome      28
Homo sapiens     1
Name: count, dtype: int64

In [None]:
# tissues to categorize
x = sorted(df_srx_meta[(df_srx_meta["category"].isnull()) & (~df_srx_meta["organism"].isin(["other", "metagenome"]))]["tissue"].unique())
print('\n'.join(x))

N/A


# sessionInfo

In [96]:
!mamba list

# packages in environment at /home/nickyoungblut/miniforge3/envs/SRAgent:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
aiohappyeyeballs          2.4.3                    pypi_0    pypi
aiohttp                   3.10.10                  pypi_0    pypi
aiosignal                 1.3.1                    pypi_0    pypi
annotated-types           0.7.0                    pypi_0    pypi
anyio                     4.6.2.post1              pypi_0    pypi
asttokens                 2.4.1              pyhd8ed1ab_0    conda-forge
attrs                     24.2.0                   pypi_0    pypi
beautifulsoup4            4.12.3                   pypi_0    pypi
biopython                 1.84                     pypi_0    pypi
build                     1.2.2.post1              pypi_0    pypi
bzip2                     1.0.8             