### Notebook to subset monocytes in data set

#### Environment: Scanpy

- **Developed by:** Alexandra Cirnu
- **Modified by:** Alexandra Cirnu
- **Würzburg Institute for Systems Immunology & Julius-Maximilians-Universität Würzburg**
- **Date of creation:** 240507
- **Date of modification:** 240507

### Load in required modules

In [1]:
import numpy as np
import scanpy as sc
import pandas as pd
import muon as mu
from muon import atac as ac
from muon import prot as pt
from scipy.sparse import csr_matrix

In [2]:
# Configure scanpy's logging level and record the package versions used for this run.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print the versions of the loaded packages (captured in the cell output below).
sc.logging.print_versions()

-----
anndata     0.10.6
scanpy      1.9.8
-----
PIL                 10.2.0
appnope             0.1.4
asttokens           NA
colorama            0.4.6
comm                0.2.1
cycler              0.12.1
cython_runtime      NA
dateutil            2.8.2
debugpy             1.8.1
decorator           5.1.1
exceptiongroup      1.2.0
executing           2.0.1
h5py                3.10.0
igraph              0.11.4
ipykernel           6.29.2
ipywidgets          8.1.2
jedi                0.19.1
joblib              1.3.2
kiwisolver          1.4.5
leidenalg           0.10.2
llvmlite            0.42.0
matplotlib          3.8.3
mpl_toolkits        NA
mudata              0.2.3
muon                0.1.5
natsort             8.4.0
numba               0.59.0
numpy               1.26.4
packaging           23.2
pandas              2.2.0
parso               0.8.3
patsy               0.5.6
pickleshare         0.7.5
platformdirs        4.2.0
prompt_toolkit      3.0.42
psutil              5.9.8
pure_eval     

In [11]:
# Directory the clustered/annotated h5mu input file is read from.
# NOTE(review): absolute, user-specific path — adjust when running on another machine.
input_folder = '/Users/alex/data/ACM_cardiac_leuco/5_Leiden_clustering_and_annotation/'
# Directory the subsetted monocyte object is written to at the end of the notebook.
output_folder = '/Users/alex/data/ACM_cardiac_leuco/Cell2cell/n_latent150/'

In [12]:
# Full path of the clustered myeloid MuData file inside `input_folder`.
# Named `input_path` (not `input`) so the Python builtin `input()` is not shadowed
# for the rest of the kernel session.
input_path = input_folder + 'ACM_myeloids_clustered_muon_ac240502.raw.h5mu'
# Read the multimodal object; the bare `mdata` on the last line displays its summary.
mdata = mu.read_h5mu(input_path)
mdata



In [13]:
# Work on the RNA modality of the MuData object.
# Note: this is a reference to the modality stored in `mdata`, not a copy.
adata = mdata.mod["rna"]

In [14]:
# Inspect the RNA matrix as a labelled table: rows are cell barcodes (obs index),
# columns are gene names (var index).
X_data = adata.X.copy()

# Convert to CSR so pandas can wrap the matrix as a sparse DataFrame.
X_data_sparse = csr_matrix(X_data)
X_data_df = pd.DataFrame.sparse.from_spmatrix(
    X_data_sparse,
    index=adata.obs.index,
    columns=adata.var.index,
)

# Report the dimensions, then display the (truncated) table as the cell output.
print("Shape of counts DataFrame:", X_data_df.shape)
X_data_df

Shape of counts DataFrame: (34482, 29378)


Unnamed: 0,Xkr4,Gm1992,Gm19938,Gm37381,Rp1,Sox17,Gm37587,Gm37323,Mrpl15,Lypla1,...,Vmn2r122,Il11ra2-1,Ccl27a-1,Ccl21c-1,Ccl27a-2,Il11ra2-2,Ccl19-1,Ccl21a-1,Gm10931,Gm16367
AAACGCTGTTGTGTTG-1-A1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACGCTTCTCGCTCA-1-A1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAGGTACAGAACATA-1-A1,0,0,0,0,0,0,0,0,0,3.0,...,0,0,0,0,0,0,0,0,0,0
AAAGTCCAGGGACACT-1-A1,0,0,0,0,0,0,0,0,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
AAAGTCCCAGTAGGAC-1-A1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGAGGTTAGTA-1-B2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TTTGTTGCAAGCTCTA-1-B2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TTTGTTGGTACAGGTG-1-B2,0,0,0,0,0,0,0,0,0,1.0,...,0,0,0,0,0,0,0,0,0,0
TTTGTTGTCCCAGGAC-1-B2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Keep an independent copy of the full (unsubsetted) RNA object before `adata` is filtered below.
adata_raw = adata.copy()

In [16]:
# Display the per-cell metadata table of the RNA modality (all cell types, before subsetting).
adata.obs

Unnamed: 0,cell_source,donor,n_counts,sample,seed_labels,condition,genotype,infection,library,model,...,percent_chrY,XIST-counts,S_score,G2M_score,_scvi_batch,_scvi_labels,batch,C_scANVI,leiden,classification
AAACGCTGTTGTGTTG-1-A1,AG_Gerull,Pkp2_Ctr_noninf_1,478.0,Pkp2_Ctr_noninf_1,DOCK4+MØ,Pkp2_Ctr_noninf,Pkp2_Ctr,noninf,A1,Pkp2,...,0.000000,0.0,-0.019995,-0.083225,6,1,reference,DOCK4+MØ,3,DOCK4+MØ_3
AAACGCTTCTCGCTCA-1-A1,AG_Gerull,Pkp2_Ctr_noninf_1,3581.0,Pkp2_Ctr_noninf_1,Monocytes,Pkp2_Ctr_noninf,Pkp2_Ctr,noninf,A1,Pkp2,...,0.025940,0.0,-0.169221,-0.390143,6,4,reference,Monocytes,3,DOCK4+MØ_3
AAAGGTACAGAACATA-1-A1,AG_Gerull,Pkp2_Ctr_noninf_2,16539.0,Pkp2_Ctr_noninf_2,Monocytes,Pkp2_Ctr_noninf,Pkp2_Ctr,noninf,A1,Pkp2,...,0.051323,0.0,-0.317631,-0.578955,7,4,reference,Monocytes,6,Monocytes_6
AAAGTCCAGGGACACT-1-A1,AG_Gerull,Pkp2_Ctr_noninf_2,13389.0,Pkp2_Ctr_noninf_2,LYVE1+MØ,Pkp2_Ctr_noninf,Pkp2_Ctr,noninf,A1,Pkp2,...,0.014110,0.0,-0.370103,-0.370084,7,2,reference,LYVE1+MØ,2,LYVE1+MØ_2
AAAGTCCCAGTAGGAC-1-A1,AG_Gerull,Pkp2_Ctr_noninf_2,12092.0,Pkp2_Ctr_noninf_2,LYVE1+MØ,Pkp2_Ctr_noninf,Pkp2_Ctr,noninf,A1,Pkp2,...,0.040006,0.0,-0.142503,-0.357287,7,2,reference,LYVE1+MØ,2,LYVE1+MØ_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGAGGTTAGTA-1-B2,AG_Gerull,Ttn_HetKO_MCMV_1,4226.0,Ttn_HetKO_MCMV_1,Unknown,Ttn_HetKO_MCMV,Ttn_HetKO,MCMV,B2,Ttn,...,0.141978,0.0,-0.373712,-0.402608,25,7,query,Neutrophils,5,Neutrophils_5
TTTGTTGCAAGCTCTA-1-B2,AG_Gerull,Ttn_Ctr_noninf_1,3733.0,Ttn_Ctr_noninf_1,Unknown,Ttn_Ctr_noninf,Ttn_Ctr,noninf,B2,Ttn,...,0.000000,0.0,-0.179842,-0.495021,23,7,query,DOCK4+MØ,9,DOCK4+MØ_9
TTTGTTGGTACAGGTG-1-B2,AG_Gerull,Ttn_Ctr_MCMV_2,24293.0,Ttn_Ctr_MCMV_2,Unknown,Ttn_Ctr_MCMV,Ttn_Ctr,MCMV,B2,Ttn,...,0.000000,13.0,-0.435458,-0.744327,21,7,query,MØ_general,10,MØ_general_10
TTTGTTGTCCCAGGAC-1-B2,AG_Gerull,Ttn_Ctr_noninf_2,6388.0,Ttn_Ctr_noninf_2,Unknown,Ttn_Ctr_noninf,Ttn_Ctr,noninf,B2,Ttn,...,0.140889,0.0,0.080642,-0.877899,24,7,query,MØ_general,0,MØ_general_0


In [17]:
# List the available cluster labels in `classification`; the monocyte clusters
# used for subsetting in the next cell are picked from this list.
adata.obs['classification'].cat.categories

Index(['DC_12', 'DC_14', 'DC_16', 'DOCK4+MØ_3', 'DOCK4+MØ_9', 'LYVE1+MØ_1',
       'LYVE1+MØ_2', 'LYVE1+MØ_4', 'LYVE1+MØ_8', 'Mast_15', 'Monocytes_6',
       'Monocytes_11', 'Monocytes_13', 'Monocytes_17', 'MØ_general_0',
       'MØ_general_7', 'MØ_general_10', 'Neutrophils_5'],
      dtype='object')

In [18]:
# Cluster labels (from `classification`) that make up the monocyte compartment.
monocyte_clusters = ["Monocytes_6", 'Monocytes_11', 'Monocytes_13', 'Monocytes_17']

# Boolean mask over cells: True where the cell belongs to one of the monocyte clusters.
is_monocyte = adata.obs['classification'].isin(monocyte_clusters)

# Indexing an AnnData object returns a view of the parent object. `.copy()` turns the
# subset into an independent object, so that the object later stored in `mdata` and
# written to disk is not a view.
# NOTE(review): writing a view may be what triggers the pandas "copy of a slice"
# warning shown at save time — confirm after re-running.
adata = adata[is_monocyte, :].copy()
adata.obs

Unnamed: 0,cell_source,donor,n_counts,sample,seed_labels,condition,genotype,infection,library,model,...,percent_chrY,XIST-counts,S_score,G2M_score,_scvi_batch,_scvi_labels,batch,C_scANVI,leiden,classification
AAAGGTACAGAACATA-1-A1,AG_Gerull,Pkp2_Ctr_noninf_2,16539.0,Pkp2_Ctr_noninf_2,Monocytes,Pkp2_Ctr_noninf,Pkp2_Ctr,noninf,A1,Pkp2,...,0.051323,0.0,-0.317631,-0.578955,7,4,reference,Monocytes,6,Monocytes_6
AAGACAACAAGAAATC-1-A1,AG_Gerull,Pkp2_Ctr_noninf_2,1789.0,Pkp2_Ctr_noninf_2,Monocytes,Pkp2_Ctr_noninf,Pkp2_Ctr,noninf,A1,Pkp2,...,0.106781,0.0,-0.239630,-0.128468,7,4,reference,Monocytes,13,Monocytes_13
AAGACAATCTTTGCAT-1-A1,AG_Gerull,Pkp2_Ctr_noninf_2,19172.0,Pkp2_Ctr_noninf_2,Monocytes,Pkp2_Ctr_noninf,Pkp2_Ctr,noninf,A1,Pkp2,...,0.014481,0.0,-0.412424,-0.347555,7,4,reference,Monocytes,11,Monocytes_11
AAGGAATAGGACAGTC-1-A1,AG_Gerull,Pkp2_Ctr_noninf_2,16261.0,Pkp2_Ctr_noninf_2,Monocytes,Pkp2_Ctr_noninf,Pkp2_Ctr,noninf,A1,Pkp2,...,0.017912,0.0,-0.953079,-0.571431,7,4,reference,Monocytes,6,Monocytes_6
AAGTACCCAGAAATTG-1-A1,AG_Gerull,Pkp2_Ctr_noninf_1,11186.0,Pkp2_Ctr_noninf_1,Monocytes,Pkp2_Ctr_noninf,Pkp2_Ctr,noninf,A1,Pkp2,...,0.008538,0.0,-0.629725,-0.501903,6,4,reference,Monocytes,6,Monocytes_6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGAGTGAGCCGATAG-1-B2,AG_Gerull,Ttn_HetKO_MCMV_1,5116.0,Ttn_HetKO_MCMV_1,Unknown,Ttn_HetKO_MCMV,Ttn_HetKO,MCMV,B2,Ttn,...,0.058640,0.0,-0.391301,-0.470570,25,7,query,Monocytes,11,Monocytes_11
TTGATGGGTTCTCACC-1-B2,AG_Gerull,Ttn_HetKO_MCMV_2,1059.0,Ttn_HetKO_MCMV_2,Unknown,Ttn_HetKO_MCMV,Ttn_HetKO,MCMV,B2,Ttn,...,0.188857,0.0,-0.108308,-0.302285,26,7,query,Monocytes,13,Monocytes_13
TTGCCTGGTGTGAGCA-1-B2,AG_Gerull,Ttn_Ctr_noninf_2,9552.0,Ttn_Ctr_noninf_2,Unknown,Ttn_Ctr_noninf,Ttn_Ctr,noninf,B2,Ttn,...,0.000000,0.0,-0.410611,-0.599171,24,7,query,Monocytes,6,Monocytes_6
TTTAGTCTCTACAGGT-1-B2,AG_Gerull,Ttn_HetKO_noninf_2,8281.0,Ttn_HetKO_noninf_2,Unknown,Ttn_HetKO_noninf,Ttn_HetKO,noninf,B2,Ttn,...,0.024152,0.0,0.018767,-0.552671,29,7,query,Monocytes,6,Monocytes_6


In [19]:
def X_is_raw(adata):
    """Return True if every entry of ``adata.X`` is integer-valued (i.e. looks like raw counts).

    The check is element-wise rather than on per-gene column sums: fractional
    values can add up to an integer column sum, which would wrongly be reported
    as raw counts.

    Parameters
    ----------
    adata
        Object exposing the expression matrix as ``adata.X`` (dense array or
        scipy sparse matrix).

    Returns
    -------
    bool
        True when all values are integers (an empty matrix counts as raw),
        False otherwise (including when NaN values are present).
    """
    # Local import keeps the function self-contained; scipy.sparse is already used by this notebook.
    from scipy.sparse import issparse

    X = adata.X
    # For sparse input only the explicitly stored values need checking:
    # implicit zeros are integers by definition.
    values = X.tocsr().data if issparse(X) else np.asarray(X)
    # A value is an integer exactly when its remainder modulo 1 is zero.
    return bool(np.all(np.mod(values, 1) == 0))

# Sanity check on the subsetted RNA object: the matrix should still hold raw counts
# (the cell is expected to display True).
X_is_raw(adata)

True

#### Update the mdata object

In [20]:
# Replace the RNA modality of the MuData object with the monocyte-only subset.
mdata.mod['rna'] = adata
# Display the container's summary after swapping the modality.
mdata

In [21]:
# Barcodes of the cells kept in the (already subsetted) RNA modality.
monocyte_barcodes = mdata.mod['rna'].obs.index

# Restrict the protein modality to the same cells. Indexing returns an AnnData view,
# so `.copy()` is used to store an independent object in the container.
# NOTE(review): assumes every RNA barcode is also present in the prot modality — confirm.
mdata.mod['prot'] = mdata.mod['prot'][monocyte_barcodes].copy()

# Refresh the MuData-level annotations after the modalities were replaced.
mdata.update()
mdata

## Save subsetted object

In [22]:
# Write the monocyte-only MuData object to the output folder.
# `output_folder` is defined with a trailing '/', so strip it before adding the
# separator to avoid a double slash ('//') in the resulting path.
mdata.write(output_folder.rstrip("/") + "/Subsetted_monocytes_ac240507.raw.h5mu")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[key] = c
