## Integration of cancer data from Matusiak et al.

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
from muon import prot as pt

from matplotlib import colors
%matplotlib inline

import matplotlib.pyplot as plt

import muon as mu

## Load datasets

In [None]:
# CITE-seq MuData for all samples after filtering; only its "rna" modality is used below.
mdata_raw = mu.read("./citeseq_mdata_allsamples_filtered.h5mu")

In [None]:
# Load the fine-clustered CITE-seq MuData, whose obs holds the `celltype_hires` labels.
mdata = mu.read("./citeseq_mdata_allsamples_filtered_fine_clustering.h5mu")

In [None]:
# RNA modality of the first object (presumably still raw counts -- TODO confirm).
adata_raw = mdata_raw["rna"]

In [None]:
# Copy the high-resolution cell-type labels from the MuData-level obs onto the RNA modality.
mdata["rna"].obs['celltype_hires'] = mdata.obs['celltype_hires']

In [None]:
# RNA modality of the fine-clustered object; provides the cell set and the annotations.
adata = mdata["rna"]

In [None]:
# Keep only the cells of adata_raw that are also present in the fine-clustered object.
adata_raw_filtered = adata_raw[adata_raw.obs_names.isin(adata.obs_names)].copy()
adata_raw_filtered

In [None]:
# Take the per-cell annotations from the fine-clustered object, ordered like adata_raw_filtered.
adata_raw_filtered.obs = adata[adata_raw_filtered.obs_names].obs

In [None]:
# Store a snapshot of the current object in `.raw`.
adata_raw_filtered.raw = adata_raw_filtered

In [None]:
# Matusiak et al. cancer dataset: per-cell metadata plus a count table whose
# columns are the cells (rows are the measured features, presumably genes).
metadata = pd.read_csv("./matusiak_metadata.csv", index_col=0)
raw_counts = pd.read_csv("./matusiak_raw_counts.csv", index_col=0)

# Put the count columns in the same order as the metadata rows; a cell that is
# missing from the count table raises a KeyError here.
raw_counts = raw_counts.loc[:, metadata.index]

# AnnData stores cells as rows, so transpose before wrapping, then attach the
# metadata as the per-cell annotation table.
adata_cancer = sc.AnnData(X=raw_counts.T)
adata_cancer.obs = metadata

In [None]:
# Inspect which metadata columns the cancer dataset provides.
adata_cancer.obs.columns

In [None]:
# Full per-cell metadata table of the cancer dataset.
adata_cancer.obs

In [None]:
# Distinct values of `orig.ident`.
adata_cancer.obs['orig.ident'].astype('category').cat.categories

In [None]:
# Distinct patient identifiers (reused as `donor_id` further down).
adata_cancer.obs['PatientNumber'].astype('category').cat.categories

In [None]:
# Cell-type labels available in the decidua CITE-seq object.
adata_raw_filtered.obs['celltype_hires'].cat.categories

In [None]:
# Restrict the decidua object to the myeloid labels listed below.
# Take an explicit copy: new obs columns are written to this object in the
# following cells, and writing into an AnnData view forces an implicit copy
# with an ImplicitModificationWarning.
adata_raw_myelo = adata_raw_filtered[
    adata_raw_filtered.obs['celltype_hires'].isin([
        'actMAC', 'MAC_IL1B', 'MAC_SPP1', 'MAC_FOLR2',
        'monoMAC', 'MAC_prol',
        'Mono_CD16+',
    ])
].copy()

In [None]:
# Check the label categories of the myeloid subset.
adata_raw_myelo.obs['celltype_hires'].cat.categories

In [None]:
# `batch_scvi` is the batch key handed to scVI later: the source dataset for
# the cancer cells, a single "decidua" batch for the CITE-seq cells.
adata_cancer.obs["batch_scvi"] = adata_cancer.obs["dataset"]
adata_raw_myelo.obs["batch_scvi"] = "decidua"
# NOTE(review): donor_id is only assigned for the cancer cells here; the decidua
# object presumably already carries a donor_id column -- confirm.
adata_cancer.obs["donor_id"] = adata_cancer.obs["PatientNumber"]

In [None]:
# Harmonise the sampling-location annotation of both datasets under one column
# name (`MM_location` for the cancer data, `tissue` for the decidua data).
adata_cancer.obs["location_new"] = adata_cancer.obs["MM_location"]
adata_raw_myelo.obs["location_new"] = adata_raw_myelo.obs["tissue"]

In [None]:
# Common `celltype` column: the published annotation for the cancer cells and
# the high-resolution labels for the decidua cells.
adata_cancer.obs["celltype"] = adata_cancer.obs["annot_clean2"]
adata_raw_myelo.obs["celltype"] = adata_raw_myelo.obs["celltype_hires"]

In [None]:
# Stack decidua and cancer cells into one object; index_unique=None leaves the
# cell names as they are.
# NOTE(review): AnnData.concatenate is deprecated in recent anndata releases in
# favour of anndata.concat -- confirm against the installed version.
adata_both=adata_raw_myelo.concatenate(adata_cancer, index_unique=None)

In [None]:
# Summary of the combined object.
adata_both

In [None]:
# Look at the expression matrix before any normalisation is applied.
print(adata_both.X)

In [None]:
# Keep an untouched copy of X in the "counts" layer, then normalise and
# log-transform X in place (scanpy defaults).
adata_both.layers["counts"] = adata_both.X.copy()  # preserve counts
sc.pp.normalize_total(adata_both)
sc.pp.log1p(adata_both)
adata_both.raw = adata_both  # freeze the state in `.raw`

In [None]:
# The counts layer should still show the values from before normalisation.
print(adata_both.layers["counts"])

In [None]:
# X now holds the normalised, log1p-transformed values.
print(adata_both.X)

In [None]:
# X and the counts layer should have identical dimensions.
print(adata_both.shape) 
print(adata_both.layers['counts'].shape)

In [None]:
# Container type and dtype of the counts layer.
print(type(adata_both.layers['counts']))
print(adata_both.layers['counts'].dtype)


In [None]:
from scipy.sparse import issparse

counts = adata_both.layers['counts']

# Count NaNs in the counts layer. In a sparse matrix NaNs can only occur among
# the explicitly stored entries, so inspect those directly instead of
# materialising the full dense matrix.
if issparse(counts):
    nan_total = np.isnan(counts.tocsr().data).sum()
else:
    nan_total = np.isnan(counts).sum()
print("Total NaNs:", nan_total)

In [None]:
# Number of non-zero entries per row / column. `counts != 0` stays sparse for a
# sparse layer and is a plain boolean array for a dense one, so this works for
# both without densifying (the original called .toarray() twice and failed on
# a dense layer).
nonzero_entries = counts != 0
nonzero_per_row = np.asarray(nonzero_entries.sum(axis=1)).ravel()
nonzero_per_col = np.asarray(nonzero_entries.sum(axis=0)).ravel()
print("All-zero rows:", np.sum(nonzero_per_row == 0))
print("All-zero columns:", np.sum(nonzero_per_col == 0))


In [None]:
from scipy.sparse import csr_matrix

# Boolean mask over the columns (variables) of the counts layer: True where at
# least one row has a non-zero entry.
counts = adata_both.layers['counts']
counts_dense = counts.toarray()
nonzero_cols = np.asarray(np.count_nonzero(counts_dense, axis=0) > 0).flatten()


In [None]:
# Drop the columns that are zero in every cell. Take an explicit copy so the
# layer assignment below writes into a real object rather than into a view
# (which would trigger an implicit copy and an ImplicitModificationWarning).
adata_both = adata_both[:, nonzero_cols].copy()
# Same column subset applied to the counts matrix captured earlier; it is kept
# under its own name because the shape check in the next cell prints it.
counts_filtered = counts[:, nonzero_cols]
adata_both.layers['counts'] = counts_filtered


In [None]:
# Both shapes should agree after removing the all-zero columns.
print("adata_both shape:", adata_both.shape)  
print("counts_filtered shape:", counts_filtered.shape)  


In [None]:
# Number of cells per donor in the combined object.
print(adata_both.obs['donor_id'].value_counts())


In [None]:
# Select 5000 highly variable genes per batch and subset the object to them.
# The input is the raw "counts" layer, so use the "seurat_v3" flavor: scanpy's
# default flavor ("seurat") expects log-normalised data, whereas "seurat_v3"
# is the one defined for count data. It needs the scikit-misc package.
sc.pp.highly_variable_genes(
    adata_both,
    layer="counts",
    flavor="seurat_v3",
    n_top_genes=5000,
    batch_key="batch_scvi",
    subset=True,
)

In [None]:
# Per-gene table after the highly-variable-gene selection.
adata_both.var

In [None]:
# Embedding of the non-harmonized data: PCA on the log-normalised matrix,
# neighbour graph on 50 PCs, then UMAP -- no batch correction involved.
sc.tl.pca(adata_both)
sc.pp.neighbors(adata_both, n_pcs=50, n_neighbors=20) 
sc.tl.umap(adata_both, min_dist=0.3)


In [None]:
# Colour the uncorrected UMAP by donor and by batch to see the batch effect.
sc.pl.umap(adata_both, color=["donor_id","batch_scvi"], ncols=1,
    frameon=False)

In [None]:
# Per-gene table again for inspection.
adata_both.var

In [None]:
import scvi

In [None]:
# Check the location labels that are passed to scVI as a covariate below.
adata_both.obs['location_new']

In [None]:
# Work on a copy so the scVI registration does not touch adata_both.
adata_scvi= adata_both.copy() 

In [None]:
# The model is trained on this counts layer.
adata_scvi.layers['counts']

In [None]:
# Register the counts layer, the batch key and two additional categorical
# covariates (donor and location) with scVI.
scvi.model.SCVI.setup_anndata(adata_scvi, layer="counts", categorical_covariate_keys=["donor_id",
                                                                                     "location_new"],
                              batch_key = "batch_scvi"
                             )

In [None]:
# scVI model configured with n_layers=2, n_latent=30 and the "nb" gene likelihood.
model = scvi.model.SCVI(adata_scvi, n_layers=2, n_latent=30, gene_likelihood="nb") 

In [None]:
# Show the model summary.
model

In [None]:
# Train with the scvi-tools default training settings.
model.train()

In [None]:
# NOTE(review): scvi-tools' save() is expected to refuse an existing directory
# unless overwrite=True is passed -- confirm before re-running this cell.
model.save("myeloid_cell_matusiak_cancer_decidua/")

In [None]:
# Reload the saved model against the same AnnData, so the cells below can be
# run from the saved state.
model = scvi.model.SCVI.load("myeloid_cell_matusiak_cancer_decidua/", adata_scvi)

In [None]:
# Latent representation of every cell in adata_scvi.
latent = model.get_latent_representation()

In [None]:
adata_scvi.obsm["X_scVI"] = latent

In [None]:
# Labels that will seed scANVI; the next cell replaces missing values in this column.
adata_scvi.obs['celltype_hires']

In [None]:
# Cast the label column to plain objects (so a new string value can be
# inserted) and replace every missing label with the literal string 'NaN'.
adata_scvi.obs['celltype_hires'] = (
    adata_scvi.obs['celltype_hires']
    .astype(object)
    .fillna("NaN")
)

In [None]:
# Build a scANVI model on top of the trained scVI model. Cells whose
# `celltype_hires` value is the string "NaN" are treated as unlabeled.
scanvi_model = scvi.model.SCANVI.from_scvi_model(
    model,
    adata=adata_scvi,
    labels_key="celltype_hires",
    unlabeled_category="NaN",
)

In [None]:
# Short training run: at most 30 epochs, with n_samples_per_label=100.
scanvi_model.train(max_epochs=30, n_samples_per_label=100)

In [None]:
# Store the scANVI latent space next to the scVI one.
SCANVI_LATENT_KEY = "X_scANVI"
adata_scvi.obsm[SCANVI_LATENT_KEY] = scanvi_model.get_latent_representation(adata_scvi)

In [None]:

# Hard label per cell, plus the soft prediction (used below as a table with
# one probability column per label).
labels = scanvi_model.predict()           
probs  = scanvi_model.predict(soft=True)   

# Keep both on the AnnData object.
adata_scvi.obs["scanvi_predictions"] = labels            
adata_scvi.obsm["scanvi_prediction_probs"] = probs  

In [None]:
# Inspect the probability table.
probs

In [None]:
# Confidence of each prediction = probability that the model assigns to the
# label it predicted for that cell.
label_indices = probs.columns.get_indexer(labels)

# get_indexer marks labels it cannot find with -1, which would silently pick
# the last probability column; fail loudly instead.
if (label_indices < 0).any():
    raise ValueError("Some predicted labels have no matching column in `probs`.")

conf = probs.to_numpy()[np.arange(probs.shape[0]), label_indices]

adata_scvi.obs["scanvi_confidence"] = conf

print(f"Median posterior probability: {np.median(conf):.2f}")
print(f"Cells with P > 0.9: {(conf > 0.9).mean():.1%}")

In [None]:
# Predicted label per cell.
adata_scvi.obs['scanvi_predictions']

In [None]:
# The UMAP coordinates are still the ones computed on the non-harmonized PCA
# (carried over by adata_both.copy()); only the colouring uses scANVI output.
sc.pl.umap(adata_scvi, color=["location_new","batch_scvi","scanvi_predictions",'celltype'], ncols=1,
    frameon=False)

In [None]:
# Original annotation (rows) versus scANVI prediction (columns).
pd.crosstab(adata_scvi.obs['celltype'],adata_scvi.obs['scanvi_predictions'])

In [None]:
# Same table as in the previous cell.
pd.crosstab(adata_scvi.obs['celltype'],adata_scvi.obs['scanvi_predictions'])

In [None]:
# Predicted labels per location.
pd.crosstab(adata_scvi.obs['location_new'],adata_scvi.obs['scanvi_predictions'])

In [None]:
# Location crossed with itself: the diagonal is the number of cells per location.
pd.crosstab(adata_scvi.obs['location_new'],adata_scvi.obs['location_new'])

In [None]:
# Same table as two cells above.
pd.crosstab(adata_scvi.obs['location_new'],adata_scvi.obs['scanvi_predictions'])

In [None]:
# Start from the scANVI prediction for every cell; the following cell puts the
# curated labels back for the decidua cells.
adata_scvi.obs['scanvi_celltype'] = adata_scvi.obs['scanvi_predictions']

In [None]:
# Make sure `scanvi_celltype` exists as a plain object column so that
# arbitrary labels can be written into it.
if 'scanvi_celltype' in adata_scvi.obs.columns:
    adata_scvi.obs['scanvi_celltype'] = adata_scvi.obs['scanvi_celltype'].astype('object')
else:
    adata_scvi.obs['scanvi_celltype'] = pd.Series(dtype='object')

# Decidua cells get their `celltype_hires` label; all other cells keep
# whatever the column already contained.
decidua_mask = adata_scvi.obs['batch_scvi'] == 'decidua'
adata_scvi.obs.loc[decidua_mask, 'scanvi_celltype'] = adata_scvi.obs.loc[decidua_mask, 'celltype_hires']

In [None]:
# Stringify only the object-dtype columns before writing. Columns mixing
# strings and missing values are the usual obstacle when saving to .h5ad,
# whereas numeric columns such as `scanvi_confidence` must stay numeric so
# they can still be used for arithmetic once the file is read back.
object_cols = adata_scvi.obs.select_dtypes(include=["object"]).columns
adata_scvi.obs = adata_scvi.obs.astype({col: str for col in object_cols})
adata_scvi.write('matusiak_integr_20250407.h5ad')

In [None]:
# Continue from the saved integration result.
adata_scvi = sc.read('matusiak_integr_20250407.h5ad')

In [None]:
# Per-cell annotations as stored in the file.
adata_scvi.obs

In [None]:
# Available annotation columns.
adata_scvi.obs.columns

In [None]:
# Keep the three macrophage subtypes compared in the figures. Copy explicitly:
# new obs columns are added to this object below, and writing into an AnnData
# view triggers an implicit copy with an ImplicitModificationWarning.
adata_scvi_mac = adata_scvi[
    adata_scvi.obs['scanvi_celltype'].isin(['MAC_FOLR2', 'MAC_SPP1', 'MAC_IL1B'])
].copy()

In [None]:
# scANVI predictions of the selected macrophages.
adata_scvi_mac.obs["scanvi_predictions"]

In [None]:
# Matching prediction confidences.
adata_scvi_mac.obs["scanvi_confidence"]

In [None]:
# The obs table was converted to strings before it was written to disk, so the
# confidence column can come back as text/categorical; coerce it to float
# before computing statistics (a no-op when the column is already numeric).
mac_confidence = adata_scvi_mac.obs['scanvi_confidence'].astype(str).astype(float)
print(f"Median posterior probability: {np.median(mac_confidence):.2f}")
print(f"Cells with P > 0.9: {(mac_confidence > 0.9).mean():.1%}")

In [None]:
# Location crossed with itself: the diagonal is the number of macrophages per location.
pd.crosstab(adata_scvi_mac.obs['location_new'],adata_scvi_mac.obs['location_new'])

In [None]:
# NOTE(review): label_indices was computed for the full object before the
# save/reload, not for this macrophage subset.
label_indices

In [None]:
# Combined grouping labels, built by joining two columns without a separator:
# batch + location (e.g. 'LeeCRC' + 'Normal' -> 'LeeCRCNormal') ...
adata_scvi_mac.obs['batch_location'] = (
    adata_scvi_mac.obs['batch_scvi'].astype(str)
    .str.cat(adata_scvi_mac.obs['location_new'].astype(str))
)

# ... and cell type + batch.
adata_scvi_mac.obs['predictions_batch'] = (
    adata_scvi_mac.obs['scanvi_celltype'].astype(str)
    .str.cat(adata_scvi_mac.obs['batch_scvi'].astype(str))
)


## Figure 3I

In [None]:
def _plot_composition_bars(table, title, legend_title=None, save_path=None):
    """Draw `table` as a stacked bar chart (one bar per row) and show it.

    A legend is drawn outside the axes only when `legend_title` is given; the
    figure is written to `save_path` only when one is given.
    Returns the (figure, axes) pair.
    """
    fig, ax = plt.subplots(figsize=(4, 4))
    table.plot(kind='bar', stacked=True, width=1, edgecolor='black', ax=ax,
               legend=legend_title is not None)
    if legend_title is not None:
        ax.legend(loc='upper left', bbox_to_anchor=(1, 1), title=legend_title)

    plt.xticks(rotation=45, ha='right')
    plt.ylabel('Proportion')
    plt.title(title)
    plt.tight_layout()

    if save_path is not None:
        plt.savefig(save_path, bbox_inches='tight')
    plt.show()
    return fig, ax


# batch/location group and macrophage subtype of every selected cell
composition_data = adata_scvi_mac.obs[['batch_location', 'scanvi_celltype']]

# fraction of each macrophage subtype within every batch/location group
# (normalize='index' makes each row sum to 1)
crosstab = pd.crosstab(composition_data['batch_location'], composition_data['scanvi_celltype'], normalize='index')

# display order of the batch/location groups along the x axis
custom_order = ['LeeCRCNormal','blueCRCNormal','deciduaparietalis','LeeCRCTumor','blueCRCTumor',
                'blueBCTumor','pembroBCTumor','deciduabasalis']

# reorder the rows accordingly; a group that is absent from the data becomes an
# all-NaN row
sorted_crosstab = crosstab.reindex(custom_order)

# version without legend, saved to disk
fig, ax = _plot_composition_bars(
    sorted_crosstab,
    'Macrophage Subtypes across integrated datasets',
    save_path="./Fig2-bargraph-cancer-integration.pdf",
)

# same chart with the legend placed to the right of the axes (shown only)
fig, ax = _plot_composition_bars(
    sorted_crosstab,
    'Bar Chart of Tissues by Mac Subtypes',
    legend_title="Mac Subtypes",
)

## Figure S2G

In [None]:
# Macrophages from the six cancer batch/location groups (decidua groups are
# left out). Copy explicitly: `.obsm` and `.obs` of this object are written to
# in later cells, which on a view would trigger an implicit copy with an
# ImplicitModificationWarning.
adata_scvi_mac_cancer = adata_scvi_mac[
    adata_scvi_mac.obs['batch_location'].isin([
        'LeeCRCNormal', 'LeeCRCTumor',
        'blueCRCNormal', 'blueCRCTumor',
        'blueBCTumor', 'pembroBCTumor',
    ])
].copy()

# Marker genes shown in the matrix plot, in display order.
markers = ["C1QA","FOLR2","MS4A6A","MS4A4A","MRC1",'TREM2',"SPP1",'CTSD',"HMOX1",'CD28',"APOE",'ITGAX','CD44',
          'MARCO',"CXCL8","CXCL2",'CCL3','SEMA3C','AQP9','IL1B',"NLRP3"]

# Mean expression per macrophage subtype; standard_scale='var' rescales each
# gene to the 0-1 range across the groups.
sc.pl.matrixplot(adata_scvi_mac_cancer, markers, groupby='scanvi_celltype',
                 standard_scale='var',
                 categories_order=["MAC_FOLR2","MAC_SPP1","MAC_IL1B"
                                  ],
                 cmap='Blues'
                 ,save="Fig2-integration-matrixplot-macros-selectedRNA-cancer.pdf"
                 )

In [None]:
# Recompute the scVI latent space for the cancer macrophages; this needs the
# trained `model` from the cells above to still be in memory.
adata_scvi_mac_cancer.obsm['X_scVI'] = model.get_latent_representation(adata_scvi_mac_cancer)

In [None]:
# Neighbour graph on the scVI latent space instead of PCA.
sc.pp.neighbors(adata_scvi_mac_cancer, use_rep = 'X_scVI')

In [None]:
# PAGA graph between the macrophage subtypes.
sc.tl.paga(adata_scvi_mac_cancer, groups = 'scanvi_celltype')
sc.pl.paga(adata_scvi_mac_cancer)

In [None]:
# UMAP computed from the scVI-based neighbour graph.
sc.tl.umap(adata_scvi_mac_cancer)

## Figure 3H

In [None]:
# UMAP of the cancer macrophages coloured by the assigned subtype; saved by scanpy.
sc.pl.umap(adata_scvi_mac_cancer, color='scanvi_celltype', size=60, save='matusiak_cancer_scanvi.pdf')

In [None]:
# Same embedding coloured by sampling location.
sc.pl.umap(adata_scvi_mac_cancer, color='location_new', size=60, save='matusiak_cancer_tissue.pdf')

In [None]:
# Stringify only the object-dtype columns before writing, so that any numeric
# annotation keeps its numeric dtype in the saved file instead of being turned
# into text.
object_cols = adata_scvi_mac_cancer.obs.select_dtypes(include=["object"]).columns
adata_scvi_mac_cancer.obs = adata_scvi_mac_cancer.obs.astype({col: str for col in object_cols})
adata_scvi_mac_cancer.write('matusiak_integr_macs_20250407.h5ad')