In [1]:
pip install scanpy

Note: you may need to restart the kernel to use updated packages.


In [22]:
import matplotlib.pyplot as plt
from matplotlib.colors import CenteredNorm
from scipy.stats import spearmanr
import numpy as np
import pandas as pd
import deconomix
import scanpy as sc
from pathlib import Path

In [23]:
data_path = Path('../Data/scatlas')

In [24]:
adata = sc.read_mtx(data_path / "E-MTAB-9543.aggregated_filtered_counts.mtx")
adata = adata.T

In [25]:
print(adata.shape)

(20484, 34369)


In [26]:
# Set cell and gene names
barcodes = pd.read_csv(data_path / "E-MTAB-9543.aggregated_filtered_counts.mtx_cols", header=None)[0].values
genes = pd.read_csv(data_path / "E-MTAB-9543.aggregated_filtered_counts.mtx_rows", header=None)[0].values

In [27]:
print(barcodes[:5])
print(genes[:5])

['ERR9924225-AACACGTCAGCCAGAA' 'ERR9924225-AACGTTGAGGACATTA'
 'ERR9924225-AAGGAGCTCGGCGCTA' 'ERR9924225-ACCGTAAGTCTCTCTG'
 'ERR9924225-ACGGAGATCAGCGACC']
['ENSG00000000003\tENSG00000000003' 'ENSG00000000005\tENSG00000000005'
 'ENSG00000000419\tENSG00000000419' 'ENSG00000000457\tENSG00000000457'
 'ENSG00000000460\tENSG00000000460']


In [28]:
adata.obs_names = barcodes
adata.var_names = genes

In [29]:
# Load clustering (cell type) labels
clusters = pd.read_csv(data_path / "E-MTAB-9543.clusters.tsv", sep="\t", header=None)
clusters = clusters.T # now rows = cells
print(clusters.shape)
print(clusters.columns)

print("Available clustering levels:", clusters.columns.tolist())

(20486, 10)
RangeIndex(start=0, stop=10, step=1)
Available clustering levels: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [30]:
clusters

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,sel.K,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE
1,K,7,12,17,24,32,45,58,67,76
2,ERR9924225-AACACGTCAGCCAGAA,1,1,1,1,1,1,1,15,11
3,ERR9924225-AACGTTGAGGACATTA,1,1,1,1,1,1,3,12,12
4,ERR9924225-AAGGAGCTCGGCGCTA,1,1,1,1,1,6,14,33,30
...,...,...,...,...,...,...,...,...,...,...
20481,ERR9924288-TTCTCAAGTAGTAGTA,3,2,3,2,2,5,8,1,1
20482,ERR9924288-TTCTCAATCTTCATGT,3,2,3,2,2,5,23,24,19
20483,ERR9924288-TTGACTTTCCCAAGTA,3,2,3,2,2,5,8,1,1
20484,ERR9924288-TTTGGTTAGGGAGTAA,3,2,3,2,2,5,8,1,1


In [31]:
marker_dir = Path("../Data/scatlas/marker_gene_files/")
df = pd.read_csv(marker_dir / "E-MTAB-9543.marker_genes_inferred_cell_type_-_ontology_labels.tsv", sep="\t")
print(df)

         cluster   ref  rank            genes     scores  logfoldchanges  \
0         B cell  rest     0  ENSG00000019582  39.896060        9.195016   
1         B cell  rest     1  ENSG00000245904  38.009483        6.031949   
2         B cell  rest     2  ENSG00000245970  35.502335        4.700161   
3         B cell  rest     3  ENSG00000152348  35.147835        4.410508   
4         B cell  rest     4  ENSG00000225339  33.374424        4.262527   
..           ...   ...   ...              ...        ...             ...   
794  plasma cell  rest    95  ENSG00000143384  10.775816        2.040168   
795  plasma cell  rest    96  ENSG00000270550  10.746228        3.853781   
796  plasma cell  rest    97  ENSG00000163902  10.731209        2.565032   
797  plasma cell  rest    98  ENSG00000211651  10.711214        3.221825   
798  plasma cell  rest    99  ENSG00000224373  10.613530        3.717747   

             pvals      pvals_adj  
0     0.000000e+00   0.000000e+00  
1     0.000000e

In [32]:
# Step 1: Get most common cell type names
cell_types = df["cluster"].value_counts().index.tolist()
print(cell_types)

['B cell', 'T cell', 'endothelial cell', 'mesenchymal cell', 'myeloid cell', 'neuron', 'plasma cell', 'epithelial cell']


In [33]:
cells = pd.read_csv(data_path / "E-MTAB-9543.cells.txt", sep="\t")
print(cells.columns)


Index(['Cell ID', 'Original barcode', 'Cell type_orig',
       'inferred cell type - authors labels',
       'inferred cell type - ontology labels'],
      dtype='object')


In [34]:
barcode_to_celltype = dict(zip(
    cells["Cell ID"],
    cells["inferred cell type - ontology labels"]
))

In [35]:
adata.obs["cell_type"] = adata.obs_names.map(barcode_to_celltype)

In [36]:
print(adata.obs["cell_type"].value_counts())

cell_type
epithelial cell     7224
T cell              1771
plasma cell         1382
B cell               769
mesenchymal cell     599
endothelial cell     372
myeloid cell         198
neuron                 2
Name: count, dtype: int64


In [37]:
# make a Boolean mask … True for all rows except the ones equal to "neuron"
mask = adata.obs["cell_type"] != "neuron"

# subset AnnData in-place (rows = spots)
adata = adata[mask].copy()

# check
print(adata.obs["cell_type"].value_counts())

cell_type
epithelial cell     7224
T cell              1771
plasma cell         1382
B cell               769
mesenchymal cell     599
endothelial cell     372
myeloid cell         198
Name: count, dtype: int64


In [38]:
# parameters
n_max = 10_000                        # maximum spots to average per cell-type
rng   = np.random.default_rng(0)      # reproducible sampling


mean_expr_dict = {}

# iterate over the *observed* categories only
for cell_type, n_spots in adata.obs["cell_type"].value_counts().items():

    # 1) select the spots of this cell-type
    subset = adata[adata.obs["cell_type"] == cell_type]

    # sanity-check: skip if somehow empty
    if subset.n_obs == 0:
        print(f"⚠️  no spots for '{cell_type}' – skipped")
        continue

    # 2) subsample if necessary
    if subset.n_obs > n_max:
        chosen = rng.choice(subset.obs_names, n_max, replace=False)
        subset = subset[chosen]

    # 3) average expression (works for dense *and* sparse)
    mean_vec = subset.X.mean(axis=0)
    mean_vec = mean_vec.A1 if hasattr(mean_vec, "A1") else np.asarray(mean_vec).ravel()

    mean_expr_dict[cell_type] = mean_vec

# 4) build reference matrix  (genes  ×  cell-types)
X_ref = pd.DataFrame(mean_expr_dict, index=adata.var_names)

print("X_ref shape:", X_ref.shape)   # genes × cell-types
display(X_ref.iloc[:5, :8])          # preview


X_ref shape: (34369, 7)


Unnamed: 0,epithelial cell,T cell,plasma cell,B cell,mesenchymal cell,endothelial cell,myeloid cell
ENSG00000000003\tENSG00000000003,0.025612,0.001129,0.007241,0.0013,0.015025,0.02957,0.0
ENSG00000000005\tENSG00000000005,0.000138,0.0,0.0,0.0,0.005008,0.0,0.0
ENSG00000000419\tENSG00000000419,0.039756,0.043196,0.026673,0.039034,0.045075,0.053763,0.035354
ENSG00000000457\tENSG00000000457,0.028085,0.028976,0.022048,0.020806,0.023342,0.013441,0.005257
ENSG00000000460\tENSG00000000460,0.048601,0.012987,0.011577,0.014304,0.019199,0.010753,0.015152


In [39]:
X_ref.to_csv("../Data/cell_type_reference_matrix.csv")

In [40]:
adata.write("../Data/scatlas/E-MTAB-9543_reference_data.h5ad")