# Script to curate basis matrix for deconvolution

**Author:** Mercedes Dalman

**Date:** December 2025

In [4]:
import pandas as pd
import numpy as np
import re

pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)


In [None]:
basis = pd.read_csv("tsp_v1_basisMatrix.txt", sep="\t", index_col=0)

basis.shape, basis.head()


((17199, 62),
           adventitial cell  cardiac muscle cell  cell of skeletal muscle  \
 NAME                                                                       
 A1BG              1.000000             1.000000                 1.000000   
 A1BG.AS1          1.000000             1.000000                 1.000000   
 A1CF              1.000000             1.000000                 1.000000   
 A2M             380.760964            80.495118               435.426217   
 A2M.AS1           1.000000            13.444111                 1.000000   
 
           fibroblast/mesenchymal stem cell  melanocyte  mesothelial cell  \
 NAME                                                                       
 A1BG                              1.000000         1.0          1.000000   
 A1BG.AS1                          1.000000         1.0          1.000000   
 A1CF                              1.000000         1.0          1.000000   
 A2M                              49.747706         1.0     

In [None]:
# vascular / stromal-ish
vascular_stromal_cols = [
    "adventitial cell",
    "fibroblast/mesenchymal stem cell",
    "pericyte cell",
    "smooth muscle cell",
    "stromal cell",
    "endothelial cell",
]

#  core blood / immune for plasma cfRNA 
blood_core_cols = [
    "platelet",
    "erythrocyte/erythroid progenitor",
    "neutrophil",
    "monocyte",
    "macrophage",
    "myeloid progenitor",
    "mature conventional dendritic cell",
    "basophil",
    "mast cell",
    "b cell",
    "plasma cell",
    "plasmablast",
    "t cell",
    "nk cell",
    "innate lymphoid cell",
    "hematopoietic stem cell",
    "thymocyte",       
]

# Clearly non-blood parenchymal/epithelial/organ-specific
parenchymal_cols = [
    "cardiac muscle cell",
    "cell of skeletal muscle",
    "melanocyte",
    "mesothelial cell",
    "pancreatic stellate cell",
    "schwann cell",
    "tendon cell",
    "acinar cell of salivary gland/myoepithelial cell",
    "basal cell",
    "basal cell of prostate epithelium",
    "bladder urothelial cell",
    "ciliated cell/lung ciliated cell",
    "ciliated epithelial cell/epithelial cell of uterus",
    "club cell/type i pneumocyte",
    "club cell of prostate epithelium/hillock cell of prostate epithelium/hillock-club cell of prostate epithelium",
    "duct epithelial cell/serous cell of epithelium of bronchus",
    "duodenum glandular cell",
    "enterocyte of epithelium of large intestine/enterocyte of epithelium of small intestine/intestinal crypt stem cell of large intestine/large intestine goblet cell/mature enterocyte/paneth cell of epithelium of large intestine/small intestine goblet cell",
    "goblet cell",
    "hepatocyte",
    "immature enterocyte/intestinal crypt stem cell/intestinal crypt stem cell of small intestine/transit amplifying cell of large intestine",
    "intestinal enteroendocrine cell/paneth cell of epithelium of small intestine/transit amplifying cell of small intestine",
    "intestinal tuft cell",
    "intrahepatic cholangiocyte",
    "ionocyte/luminal epithelial cell of mammary gland",
    "keratinocyte",
    "kidney epithelial cell",
    "luminal cell of prostate epithelium",
    "medullary thymic epithelial cell",
    "mucus secreting cell/secretory cell/tracheal goblet cell",
    "pancreatic acinar cell",
    "pancreatic alpha cell/pancreatic beta cell",
    "pancreatic delta cell",
    "pancreatic ductal cell",
    "pancreatic pp cell",
    "pulmonary ionocyte",
    "respiratory goblet cell/respiratory mucous cell/serous cell of epithelium of trachea",
    "salivary gland cell",
    "type ii pneumocyte",
]



In [5]:
all_defined = set(vascular_stromal_cols) | set(blood_core_cols) | set(parenchymal_cols)
missing = set(basis.columns) - all_defined
extra   = all_defined - set(basis.columns)
missing, extra


(set(), set())

In [None]:

cfRNA_cols = blood_core_cols + vascular_stromal_cols  # drop parenchymal

basis_cfRNA = basis[cfRNA_cols].copy()
basis_cfRNA.shape
basis_cfRNA.head()


Unnamed: 0_level_0,platelet,erythrocyte/erythroid progenitor,neutrophil,monocyte,macrophage,myeloid progenitor,mature conventional dendritic cell,basophil,mast cell,b cell,plasma cell,plasmablast,t cell,nk cell,innate lymphoid cell,hematopoietic stem cell,thymocyte,adventitial cell,fibroblast/mesenchymal stem cell,pericyte cell,smooth muscle cell,stromal cell,endothelial cell
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
A1BG,12.28066,1.0,1.0,1.0,23.870167,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,25.315367,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
A1BG.AS1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,42.991619,1.0,7.609385,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
A1CF,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
A2M,1.0,1.0,1.0,118.031353,210.855543,1.0,27.33242,1.0,1.0,1.0,1.0,1.0,52.971705,49.631269,14.011085,1.0,1.0,380.760964,49.747706,2760.864541,1457.122802,59.641445,2085.923384
A2M.AS1,1.0,1.0,1.0,1.0,1.0,13.428319,1.0,119.402852,25.715621,1.0,1.0,1.0,45.364558,1.0,1.0,10.069964,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
rename_map = {
    "erythrocyte/erythroid progenitor": "erythroid",
    "mature conventional dendritic cell": "dendritic cell",
    "hematopoietic stem cell": "HSPC",
    "myeloid progenitor": "myeloid prog",
    "innate lymphoid cell": "ILC",
}

basis_cfRNA_simplified = basis_cfRNA.rename(columns=rename_map)
basis_cfRNA_simplified.columns


Index(['platelet', 'erythroid', 'neutrophil', 'monocyte', 'macrophage',
       'myeloid prog', 'dendritic cell', 'basophil', 'mast cell', 'b cell',
       'plasma cell', 'plasmablast', 't cell', 'nk cell', 'ILC', 'HSPC',
       'thymocyte', 'adventitial cell', 'fibroblast/mesenchymal stem cell',
       'pericyte cell', 'smooth muscle cell', 'stromal cell',
       'endothelial cell'],
      dtype='object')

In [13]:
basis_cfRNA_simplified

Unnamed: 0_level_0,platelet,erythroid,neutrophil,monocyte,macrophage,myeloid prog,dendritic cell,basophil,mast cell,b cell,plasma cell,plasmablast,t cell,nk cell,ILC,HSPC,thymocyte,adventitial cell,fibroblast/mesenchymal stem cell,pericyte cell,smooth muscle cell,stromal cell,endothelial cell
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
A1BG,12.280660,1.000000,1.0,1.000000,23.870167,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,25.315367,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
A1BG.AS1,1.000000,1.000000,1.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,42.991619,1.000000,7.609385,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
A1CF,1.000000,1.000000,1.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
A2M,1.000000,1.000000,1.0,118.031353,210.855543,1.000000,27.332420,1.000000,1.000000,1.000000,1.000000,1.000000,52.971705,49.631269,14.011085,1.000000,1.000000,380.760964,49.747706,2760.864541,1457.122802,59.641445,2085.923384
A2M.AS1,1.000000,1.000000,1.0,1.000000,1.000000,13.428319,1.000000,119.402852,25.715621,1.000000,1.000000,1.000000,45.364558,1.000000,1.000000,10.069964,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZWINT,38.566165,44.758963,1.0,1.000000,1.000000,183.781923,1.000000,1.000000,1.000000,1.000000,77.029255,30.265571,1.000000,48.026958,1.000000,37.740244,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
ZXDA,1.000000,1.000000,1.0,1.000000,1.000000,1.000000,1.000000,1.000000,37.846418,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
ZXDB,1.000000,1.000000,1.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,61.805672,8.021457,1.000000,8.217161,22.620112,20.167588,17.358643,11.016476,13.819278
ZXDC,1.000000,21.716659,1.0,67.768594,19.700570,11.681043,31.090623,1.000000,1.000000,42.819518,10.331882,9.538726,25.626828,33.338987,1.000000,10.397762,1.000000,1.000000,20.814735,40.585413,12.650967,1.000000,1.000000


In [14]:
basis_cfRNA_simplified.to_csv("basis_cfRNA_blood_endothelial.txt", sep="\t", index = True)

