In [None]:
import matplotlib.pyplot as plt
import muon as mu
import numpy as np
import pandas as pd
import scanpy as sc
import scvi
import seaborn as sns
from matplotlib import colors
from muon import prot as pt

%matplotlib inline

## load datasets

In [None]:
# Load the sample-filtered CITE-seq MuData object.
mdata_raw = mu.read("./citeseq_mdata_allsamples_filtered.h5mu")

In [None]:
# Load the CITE-seq MuData object with the fine clustering; its obs supplies
# the `celltype_hires` annotation used throughout this notebook.
mdata = mu.read("./citeseq_mdata_allsamples_filtered_fine_clustering.h5mu")

In [None]:
# RNA modality of the first object.
# NOTE(review): presumably X still holds raw counts here (it later becomes the
# "counts" layer) — confirm.
adata_raw = mdata_raw["rna"]

In [None]:
# Copy the MuData-level annotation onto the RNA modality's obs.
mdata["rna"].obs['celltype_hires'] = mdata.obs['celltype_hires']

In [None]:
# Keep only the cells of adata_raw that are also present in the annotated RNA
# modality, and materialise the subset as an independent object.
adata = mdata["rna"]
adata_raw_filtered = adata_raw[adata_raw.obs_names.isin(adata.obs_names)].copy()
adata_raw_filtered

In [None]:
# Replace obs with the annotated obs, selected by adata_raw_filtered's cell names.
adata_raw_filtered.obs = adata[adata_raw_filtered.obs_names].obs

In [None]:
# Store the current state of the object in `.raw`.
adata_raw_filtered.raw = adata_raw_filtered

In [None]:
# Load the reference atlas. Per the file name, raw counts are stored in `.raw`
# and log-normalised counts in X.
adata_roser = sc.read('adata_all_donors_all_cell_states_raw_counts_in_raw_normlog_counts_in_X_for_download_UPD_20230307.h5ad')

In [None]:
# Fine-grained reference labels.
adata_roser.obs['cell_type'].cat.categories

In [None]:
# Coarse reference labels.
adata_roser.obs['coarse_annot'].cat.categories

In [None]:
# Rebuild the object from `.raw` so that X holds what was stored there.
# NOTE(review): this cell overwrites its own input, so it is not idempotent —
# re-running it without reloading the file will not start from the same object.
adata_roser = adata_roser.raw.to_adata()

In [None]:
# Keep only the reference cells whose `dataset` is 'Vento_Nature'.
# The subset is copied explicitly: later cells add columns to
# `adata_nature.obs`, and writing to an AnnData view forces an implicit copy
# (with a warning) instead of a clean, independent object.
adata_nature = adata_roser[adata_roser.obs['dataset'].isin(['Vento_Nature'])].copy()

In [None]:
# Inspect the reference subset's cell metadata.
adata_nature.obs

In [None]:
# CITE-seq annotation labels.
adata_raw_filtered.obs['celltype_hires'].cat.categories

In [None]:
adata_raw_filtered.obs['celltype_hires']

In [None]:
# Give both objects the obs columns used after merging:
# a `dataset` tag on the CITE-seq cells and a `donor_id` column on the reference.
adata_raw_filtered = adata_raw_filtered.copy()
adata_raw_filtered.obs["dataset"] = "CITE-seq"
adata_nature.obs["donor_id"] = adata_nature.obs["donor"]

In [None]:
# Shared `celltype` column: reference `cell_type` labels and CITE-seq
# `celltype_hires` labels side by side in one obs field.
adata_nature.obs["celltype"] = adata_nature.obs["cell_type"]
adata_raw_filtered.obs["celltype"] = adata_raw_filtered.obs["celltype_hires"]

In [None]:
# Merge CITE-seq and reference cells into one object. `index_unique=None`
# leaves the obs names untouched.
# NOTE(review): assumes cell barcodes do not collide between the two datasets — verify.
# NOTE(review): `AnnData.concatenate` is deprecated in recent anndata releases
# in favour of `anndata.concat`; consider migrating.
adata_both=adata_raw_filtered.concatenate(adata_nature, index_unique=None)

In [None]:
adata_both

In [None]:
# Eyeball the matrix values (expected to be counts at this point).
print(adata_both.X)

In [None]:
adata_both.layers["counts"] = adata_both.X.copy()  # preserve counts
# Library-size normalisation (no explicit target_sum is passed) followed by log1p.
sc.pp.normalize_total(adata_both)
sc.pp.log1p(adata_both)
adata_both.raw = adata_both  # freeze the state in `.raw`

In [None]:
print(adata_both.shape)  # check the overall shape
print(adata_both.layers['counts'].shape)

In [None]:
# Container type and dtype of the preserved counts layer.
print(type(adata_both.layers['counts']))
print(adata_both.layers['counts'].dtype)

In [None]:
# Cells per donor across both datasets.
print(adata_both.obs['donor_id'].value_counts())


In [None]:
# Cells per tissue; used to choose the tissues kept in the next step.
print(adata_both.obs['tissue'].value_counts())


In [None]:
# Keep only cells from the listed decidual tissues. The result is copied so
# that the in-place steps that follow (HVG subsetting, PCA, neighbours, UMAP)
# operate on an independent object rather than on an AnnData view.
adata_both = adata_both[adata_both.obs['tissue'].isin(['basalis','parietalis','decidua_immune','decidua_non_immune'])].copy()

In [None]:
# Select the top 5000 highly variable genes per `dataset` batch and subset the
# object to them. The raw-count layer is passed, so the count-based
# `seurat_v3` flavor is required: the default `seurat` flavor expects
# log-transformed values and would misinterpret raw counts.
# (`seurat_v3` needs the `scikit-misc` package to be installed.)
sc.pp.highly_variable_genes(
    adata_both,
    layer="counts",
    flavor="seurat_v3",
    n_top_genes=5000,
    batch_key='dataset',
    subset=True,
)

In [None]:
# Non-harmonized data: PCA -> neighbour graph -> UMAP computed directly on the
# log-normalised expression, before any scVI/scANVI integration.
sc.tl.pca(adata_both)
sc.pp.neighbors(adata_both, n_pcs=50, n_neighbors=30)  # 50 PCs, 30 neighbours
sc.tl.umap(adata_both, min_dist=0.3)

In [None]:
# Unintegrated UMAP coloured by donor and by tissue.
sc.pl.umap(adata_both, color=["donor_id","tissue"], ncols=1,
    frameon=False)

In [None]:
adata_both.obs['tissue']

In [None]:
# Work on an independent copy for scVI/scANVI; adata_both itself is left as is.
adata_scvi= adata_both.copy() 

In [None]:
# Register the count layer and the categorical covariates with scVI.
scvi_covariates = ["donor_id", "dataset", "tissue"]
scvi.model.SCVI.setup_anndata(
    adata_scvi,
    layer="counts",
    categorical_covariate_keys=scvi_covariates,
)

In [None]:
# Fix the scvi-tools seed before the model is built so that weight
# initialisation and training are repeatable under Restart & Run All.
scvi.settings.seed = 0

# scVI model: one hidden layer, 10 latent dimensions, negative-binomial gene likelihood.
model = scvi.model.SCVI(adata_scvi, n_layers=1, n_latent=10, gene_likelihood="nb")

In [None]:
# Show the model summary.
model

In [None]:
# Train with the library's default settings.
model.train()

In [None]:
# Persist the trained scVI model.
# NOTE(review): no `overwrite=True` is passed, so a re-run presumably fails if
# this directory already exists — confirm and decide on the intended behaviour.
model.save("vento_citeseq_20250213/")

In [None]:
# Latent embedding of the cells the model was set up with.
latent = model.get_latent_representation()

In [None]:
adata_scvi.obsm["X_scVI"] = latent

In [None]:
# Model-based normalised expression scaled to a library size of 1e4; only a
# 5x5 corner is displayed.
# NOTE(review): `denoised` is not used again in the visible cells and may be
# large in memory — consider dropping this cell if it is not needed.
denoised = model.get_normalized_expression(adata_scvi, library_size=1e4)
denoised.iloc[:5, :5]

In [None]:
# Reference labels; presumably missing (NaN) for the CITE-seq cells, which the
# next cell turns into an explicit "NaN" category.
adata_scvi.obs['cell_type']

In [None]:
# Cast the label column to object dtype so that a new string value can be
# introduced, then substitute the literal string "NaN" for missing entries.
cell_type_labels = adata_scvi.obs['cell_type'].astype(object)
adata_scvi.obs['cell_type'] = cell_type_labels.fillna("NaN")


In [None]:
# Label distribution before filtering.
print(adata_scvi.obs['cell_type'].value_counts())


In [None]:
# Remove placental cells: HOFB, fF1, fF2, Endo_F.
# The filtered object is copied so that it is a standalone AnnData (not a view
# of the pre-filter object) before it is passed to scANVI and written to.
adata_scvi = adata_scvi[~adata_scvi.obs['cell_type'].isin(['HOFB','fF1','fF2','Endo_F'])].copy()

In [None]:
# Label distribution after filtering.
print(adata_scvi.obs['cell_type'].value_counts())


In [None]:
# Build a scANVI model from the trained scVI model, using the reference
# `cell_type` labels; cells carrying the string "NaN" are treated as unlabeled.
scanvi_model = scvi.model.SCANVI.from_scvi_model(
    model,
    adata=adata_scvi,
    labels_key="cell_type",
    unlabeled_category="NaN",
)

In [None]:
# Semi-supervised training: 30 epochs, 100 samples per label.
scanvi_model.train(max_epochs=30, n_samples_per_label=100)

In [None]:
# Persist the trained scANVI model.
# NOTE(review): no `overwrite=True` is passed, so a re-run presumably fails if
# this directory already exists — confirm.
scanvi_model.save("scanvi_vento_citeseq_20250213/")

In [None]:
# Store the scANVI latent embedding in obsm.
SCANVI_LATENT_KEY = "X_scANVI"
adata_scvi.obsm[SCANVI_LATENT_KEY] = scanvi_model.get_latent_representation(adata_scvi)

In [None]:
# Predicted reference label per cell (no adata is passed, so the model's own
# registered data is used).
adata_scvi.obs["scanvi_predictions"] = scanvi_model.predict()

In [None]:
adata_scvi.obs['scanvi_predictions']

In [None]:
# Non-harmonized data: no neighbours/UMAP are recomputed on the scVI/scANVI
# latent space in this notebook, so this plot reuses the PCA-based UMAP
# coordinates carried over from adata_both.
sc.pl.umap(adata_scvi, color=["tissue","dataset","scanvi_predictions",'celltype'], ncols=1,
    frameon=False)

In [None]:
# Show every column of the wide crosstabs below.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200) 

In [None]:
# CITE-seq annotation (rows) vs. scANVI prediction (columns), counted over
# adata_scvi. Shown once; an identical repeat of this cell was removed.
# NOTE(review): cells without a `celltype_hires` value (presumably the
# reference cells) do not contribute to this table.
pd.crosstab(adata_scvi.obs['celltype_hires'], adata_scvi.obs['scanvi_predictions'])

In [None]:
# Restrict to the CITE-seq cells (only read from below).
adata_scvi_citeseq = adata_scvi[adata_scvi.obs['dataset'].isin(['CITE-seq'])]

In [None]:
# Raw counts: CITE-seq annotation (rows) vs. scANVI prediction (columns).
pd.crosstab(adata_scvi_citeseq.obs['celltype_hires'], adata_scvi_citeseq.obs['scanvi_predictions'])

In [None]:
# Predicted labels present among the CITE-seq cells.
# NOTE(review): `.cat` requires a categorical column; this presumably relies on
# an earlier step (e.g. the scanpy plot) having converted the string
# predictions to a categorical — confirm it survives Restart & Run All.
adata_scvi_citeseq.obs['scanvi_predictions'].cat.categories

## Figure S1A

In [None]:
# Figure S1A: for every CITE-seq annotation (rows), the percentage of its
# cells assigned to each scANVI-predicted reference label (columns).
crosstab = pd.crosstab(
    adata_scvi_citeseq.obs['celltype_hires'],
    adata_scvi_citeseq.obs['scanvi_predictions']
)

# Row-normalise to percentages so that each annotation sums to 100.
crosstab_norm = crosstab.div(crosstab.sum(axis=1), axis=0) * 100

# Display order of the CITE-seq annotations (heatmap rows).
manual_row_order = [
    'BCell', 'PLASMA', 'ILC', 'CD39+ NK', 'CD39- NK', 'CD39-CD103+ NK', 'pNK', 'NK_CD16+',
    'Tcell_CD4+cyto', 'Tcell_CD4+exh', 'Tcell_CD4+prol', 'Tcell_CD4+tr',
    'Tcell_CD4+blood', 'Tcell_gd', 'Tcell_reg', 'MAITcell', 'Tcell_CD8+tr',
    'Tcell_CD8+eff', 'Tcell_CD8+prol', 'Tcell_CD8+blood', 'DC1', 'DC1_prol', 'DC2', 'DC2_prol',
    'decPAM2', 'decBAM2', 'decBAM1', 'decPAM1', 'pMac', 'Mono', 'CD16+ Mono',
    'MAST', 'hpFib', 'decFib', 'ENDO', 'Lymphatic', 'MURAL', 'vCTB', 'CCT', 'iEVT', 'STB',
    'EpiCell'
]

# Display order of the predicted reference labels (heatmap columns).
manual_col_order = [
    'B_cells', 'ILC3', 'dNK1', 'dNK2',
    'dNK3', 'dT_cells', 'dT_regs', 'T_cells', 'DC', 'dDC', 'dM1', 'dM2', 'M3', 'MO', 'Granulocytes',
    'dS1', 'dS2', 'dS3', 'Endo_M', 'Endo_L', 'PV AOC3', 'PV MMP11', 'PV STEAP4',
    'uSMC', 'EVT_1', 'EVT_2',
    'iEVT', 'SCT',
    'dEpi_secretory'
]

# Labels present in the data but not listed above are appended at the end so
# that nothing is silently dropped from the figure.
remaining_rows = [r for r in crosstab_norm.index if r not in manual_row_order]
final_row_order = manual_row_order + remaining_rows

remaining_cols = [c for c in crosstab_norm.columns if c not in manual_col_order]
final_col_order = manual_col_order + remaining_cols

# Re-index with the manual order. `reindex` is used instead of `.loc` so that
# a listed label that does not occur in the crosstab (e.g. a reference label
# that was never predicted) yields an all-zero row/column rather than a KeyError.
crosstab_ord = crosstab_norm.reindex(
    index=final_row_order,
    columns=final_col_order,
    fill_value=0.0,
)

fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(
    crosstab_ord,
    annot=True, fmt='.2f',
    cmap='Blues', cbar=True,
    vmin=0,    # fixed colour scale: 0-100 %
    vmax=100,
    xticklabels=crosstab_ord.columns,
    yticklabels=crosstab_ord.index,
    annot_kws={"fontsize": 8},
    ax=ax,
)

ax.set_xlabel('scANVI predictions (Vento-Tormo et al.)', fontsize=14)
ax.set_ylabel('CITE-seq annotation',           fontsize=14)
ax.set_title('Heatmap of scANVI predictions vs. CITE-seq annotations', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()

fig.savefig("./Supplement_vento_decidua_integration_matrix.pdf", bbox_inches='tight')

plt.show()
