## Load required packages

In [None]:
import os
import warnings
import scvi
import anndata
import scipy
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
%matplotlib inline

# Allow scanpy to use up to 32 parallel jobs.
sc.settings.n_jobs = 32
# Silence every Python warning for the rest of the session
# (NOTE: this also hides deprecation notices from the libraries used below).
warnings.filterwarnings("ignore")

# Working directory at start-up; not referenced again in the visible cells.
pwd = os.getcwd()

## Load the AnnData generated by the pipeline

In [None]:
# Read the combined AnnData; the path is relative to the current working directory.
ad = anndata.read_h5ad(filename="multi_Micro-PVM.h5ad")

In [None]:
# Split the combined object into three datasets by cell modality
# (RNA-only, paired multiome, ATAC-only). The two single-modality subsets also
# keep only the features of their own modality; the paired subset keeps all features.
ad_rna = ad[ad.obs.modality == "rna", ad.var.modality == "Gene Expression"].copy()
ad_paired = ad[ad.obs.modality == "paired"].copy()
ad_atac = ad[ad.obs.modality == "atac", ad.var.modality == "Peaks"].copy()

In [None]:
## Filter features here to reduce memory usage.
# Drop ATAC features (peaks) detected in fewer than 1% of the ATAC-only cells.
# min_cells is 1% of the cell count, truncated to an integer.
# The shape is printed before and after so the number of removed features is visible.
print(ad_atac.shape)
min_cells = int(ad_atac.shape[0] * 0.01)
sc.pp.filter_genes(ad_atac, min_cells=min_cells)
print(ad_atac.shape)


In [None]:
# Load the singleome RNA Micro-PVM AnnData.
# NOTE: hard-coded absolute path; adjust when running in another environment.
rna_seaad = anndata.read_h5ad(filename='/allen/programs/celltypes/workgroups/hct/SEA-AD/Integration/multivi_subclasses/singleomeRNA_MTG_Full/Micro-PVM.h5ad')

In [None]:
# Gene names present in the RNA-only subset of the merged object.
var_names = ad_rna.var.index.values
print(var_names)
## The merged multi_Micro-PVM.h5ad only contains part of the genes, so the
## singleome RNA data is restricted to those genes, in the same order.
ad_rna_new = rna_seaad[:, var_names].copy()
# Sanity check: number of positions where the gene names differ (0 means identical order).
print(sum(ad_rna_new.var.index.values != ad_rna.var.index.values))

In [None]:
# Drop genes detected in fewer than 1% of the RNA-only cells
# (threshold truncated to an integer); the shape is printed before and after.
print(ad_rna_new.shape)
min_cells = int(ad_rna_new.shape[0] * 0.01)
sc.pp.filter_genes(ad_rna_new, min_cells=min_cells)
print(ad_rna_new.shape)

In [None]:
# Rebuild each dataset as a fresh AnnData from X, obs and var only;
# no other slots of the source objects are passed to the constructor.
ad_p = anndata.AnnData(X=ad_paired.X, obs=ad_paired.obs, var=ad_paired.var)
ad_a = anndata.AnnData(X=ad_atac.X, obs=ad_atac.obs, var=ad_atac.var)
ad_r = anndata.AnnData(X=ad_rna_new.X, obs=ad_rna_new.obs, var=ad_rna_new.var)

In [None]:
# Feature names that survived filtering in each single-modality dataset,
# combined into one index of genes and peaks.
peaks = ad_a.var_names
genes = ad_r.var_names
features = genes.union(peaks)

In [None]:
# Restrict the paired data to the retained genes and peaks;
# the shape is printed before and after the subsetting.
print(ad_p.shape)
ad_p = ad_p[:, features].copy()
print(ad_p.shape)

In [None]:
# Concatenate the three datasets with scvi's organizing helper.
# Arguments are passed in the order: paired, RNA-only, ATAC-only.
adata_mvi = scvi.data.organize_multiome_anndatas(ad_p, ad_r, ad_a)

In [None]:
# Reorder the features by their `modality` annotation so that features of the
# same modality are contiguous.
# NOTE(review): presumably so genes precede peaks, as MultiVI expects — confirm.
adata_mvi = adata_mvi[:, adata_mvi.var["modality"].argsort()].copy()

In [None]:
# Drop the references to the intermediate objects so their memory can be reclaimed.
del ad, ad_rna, ad_paired, ad_atac, rna_seaad, ad_rna_new, ad_r, ad_p, ad_a

In [None]:
# Harmonize the sex annotation to single-letter codes ('Female' -> 'F', 'Male' -> 'M');
# values that are not keys of the mapping are left as they are.
replace_dict = dict(Female='F', Male='M')

sex_labels = adata_mvi.obs['sex']
adata_mvi.obs['sex'] = sex_labels.replace(replace_dict)

In [None]:
# Sanity checks before model setup: modalities present, number of cells with a
# missing donor or sex, and the distinct sex values.
print(adata_mvi.obs.modality.unique())
print(sum(adata_mvi.obs.donor_name.isnull()))
print(sum(adata_mvi.obs.sex.isnull()))
print(adata_mvi.obs.sex.unique())

In [None]:
# Register the data with MultiVI: `modality` is the batch key,
# donor and sex are categorical covariates.
scvi.model.MULTIVI.setup_anndata(adata_mvi, batch_key='modality', categorical_covariate_keys=["donor_name", "sex"])

In [None]:
# Build the MultiVI model. The number of genes and regions is counted from the
# `modality` annotation of the features.
mvi = scvi.model.MULTIVI(
    adata_mvi,
    n_genes=(adata_mvi.var['modality']=='Gene Expression').sum(),
    n_regions=(adata_mvi.var['modality']=='Peaks').sum(),
    n_layers_encoder=1, n_layers_decoder=1, n_hidden=256,
    n_latent=8, dropout_rate=0.1,
)

In [None]:
# Train with the library's default settings (no arguments are passed).
mvi.train()

In [None]:
# Store the MultiVI latent representation, build the neighbor graph on it
# and compute a UMAP embedding from that graph.
adata_mvi.obsm["X_MultiVI"] = mvi.get_latent_representation()
sc.pp.neighbors(adata_mvi, use_rep="X_MultiVI")
sc.tl.umap(adata_mvi, min_dist=0.2)

# Plot the embedding colored by modality, sex and donor to inspect mixing.
plt.rcParams["figure.figsize"] = (5, 5)
sc.pl.umap(adata_mvi, color='modality')

sc.pl.umap(adata_mvi, color='sex')

sc.pl.umap(adata_mvi, color='donor_name')


In [None]:
def compute_label_purity_slot(adata_mvi, cell_idx, slot="subclass_scANVI"):
    """Majority-vote the labels in ``slot`` among the graph neighbors of one cell.

    Parameters
    ----------
    adata_mvi
        Object exposing ``obs`` (a DataFrame containing column ``slot``),
        ``uns['neighbors']`` and a sparse neighbor connectivities matrix.
        The matrix is looked up in ``obsp`` under the key recorded in
        ``uns['neighbors']['connectivities_key']`` (default
        ``"connectivities"``); if it is not there, the legacy location
        ``uns['neighbors']['connectivities']`` is used.
    cell_idx
        Positional (0-based) row index of the cell in the connectivities matrix.
    slot
        Name of the ``obs`` column holding the labels to vote on.

    Returns
    -------
    tuple
        ``(ratio, label)``: the most frequent non-missing label among the
        neighbors and the fraction of labeled neighbors carrying it. Ties are
        resolved in favor of the label that sorts first. When no neighbor has
        a label, ``(0, np.nan)`` is returned.
    """
    neighbors_info = adata_mvi.uns['neighbors']
    pairwise = getattr(adata_mvi, 'obsp', None)
    key = neighbors_info.get('connectivities_key', 'connectivities')
    if pairwise is not None and key in pairwise:
        connectivities = pairwise[key]
    else:
        # Older layouts keep the graph directly inside uns['neighbors'].
        connectivities = neighbors_info['connectivities']

    # Indexing with a list keeps the result two-dimensional for both sparse
    # matrices and sparse arrays; only the stored entries are inspected, so the
    # row is never densified.
    row = connectivities[[cell_idx]].tocsr()
    row.sum_duplicates()
    idx = row.indices[row.data > 0]

    neighbor_labels = adata_mvi.obs[slot].iloc[idx]
    neighbor_labels = neighbor_labels[neighbor_labels.notnull()]
    if neighbor_labels.empty:
        return 0, np.nan

    u, c = np.unique(neighbor_labels.to_numpy(), return_counts=True)
    winner = np.argmax(c)
    ratio = c[winner] / c.sum()
    label = u[winner]

    return ratio, label

In [None]:
# Label transfer for "supertype_scANVI" over the neighbor graph.
# First pass: every cell receives the majority label of its labeled neighbors
# together with the purity ratio of that vote.
cells = adata_mvi.shape[0]
purity_ratios = np.zeros(cells)
# Explicit object copy: without it the array can alias the obs column, so the
# writes below would leak into the votes of cells processed later.
labels = adata_mvi.obs["supertype_scANVI"].to_numpy(dtype=object, copy=True)
for i in range(cells):
    ratio, label_ = compute_label_purity_slot(adata_mvi, i, slot="supertype_scANVI")
    purity_ratios[i] = ratio
    labels[i] = label_

adata_mvi.obs["supertype_scANVI_purity_ratio"] = purity_ratios
adata_mvi.obs["supertype_scANVI_label_transfer"] = labels

# Second pass: cells that are still unlabeled vote again, this time on the
# transferred labels of their neighbors. `i` is a row position, so the results
# are written back positionally with .iloc on the DataFrame itself. The previous
# chained `obs[col].loc[i] = ...` treated `i` as an index label and never
# updated the intended rows of obs.
ratio_pos = adata_mvi.obs.columns.get_loc("supertype_scANVI_purity_ratio")
transfer_pos = adata_mvi.obs.columns.get_loc("supertype_scANVI_label_transfer")
for i in np.where(adata_mvi.obs["supertype_scANVI_label_transfer"].isnull())[0]:
    ratio, label_ = compute_label_purity_slot(adata_mvi, i, slot="supertype_scANVI_label_transfer")
    adata_mvi.obs.iloc[i, ratio_pos] = ratio
    adata_mvi.obs.iloc[i, transfer_pos] = label_

# Number of cells that remain without a transferred label.
print(adata_mvi.obs["supertype_scANVI_label_transfer"].isnull().sum())

In [None]:
# Label transfer for "supertype_scANVI_leiden" over the neighbor graph.
# First pass: every cell receives the majority label of its labeled neighbors
# together with the purity ratio of that vote.
cells = adata_mvi.shape[0]
purity_ratios = np.zeros(cells)
# Explicit object copy: without it the array can alias the obs column, so the
# writes below would leak into the votes of cells processed later.
labels = adata_mvi.obs["supertype_scANVI_leiden"].to_numpy(dtype=object, copy=True)
for i in range(cells):
    ratio, label_ = compute_label_purity_slot(adata_mvi, i, slot="supertype_scANVI_leiden")
    purity_ratios[i] = ratio
    labels[i] = label_

adata_mvi.obs["supertype_scANVI_leiden_purity_ratio"] = purity_ratios
adata_mvi.obs["supertype_scANVI_leiden_label_transfer"] = labels

# Second pass: cells that are still unlabeled vote again, this time on the
# transferred labels of their neighbors. `i` is a row position, so the results
# are written back positionally with .iloc on the DataFrame itself. The previous
# chained `obs[col].loc[i] = ...` treated `i` as an index label and never
# updated the intended rows of obs.
ratio_pos = adata_mvi.obs.columns.get_loc("supertype_scANVI_leiden_purity_ratio")
transfer_pos = adata_mvi.obs.columns.get_loc("supertype_scANVI_leiden_label_transfer")
for i in np.where(adata_mvi.obs["supertype_scANVI_leiden_label_transfer"].isnull())[0]:
    ratio, label_ = compute_label_purity_slot(adata_mvi, i, slot="supertype_scANVI_leiden_label_transfer")
    adata_mvi.obs.iloc[i, ratio_pos] = ratio
    adata_mvi.obs.iloc[i, transfer_pos] = label_

# Number of cells that remain without a transferred label.
print(adata_mvi.obs["supertype_scANVI_leiden_label_transfer"].isnull().sum())

In [None]:
# Compare the transferred labels with the original annotation on the UMAP.
sc.pl.umap(adata_mvi, color='supertype_scANVI_leiden_label_transfer')
sc.pl.umap(adata_mvi, color="supertype_scANVI_leiden")

In [None]:
# Number of cells per transferred label (displayed as the cell output).
adata_mvi.obs['supertype_scANVI_leiden_label_transfer'].value_counts()

In [None]:
from joblib import parallel_backend
sc.settings.n_jobs=32
# Leiden clustering at resolution 2.0, run inside a joblib threading backend
# with 32 workers; the cluster labels are stored in obs["leiden_2.0"].
with parallel_backend('threading', n_jobs=32):
    sc.tl.leiden(adata_mvi, key_added = "leiden_2.0", resolution = 2.0) # scanpy's default resolution is 1.0
## Reference: Scanpy Clustering Leiden (https://nbisweden.github.io/workshop-scRNAseq/labs/compiled/scanpy/scanpy_04_clustering.html)

In [None]:
# UMAP colored by the resolution-2.0 clusters, with cluster names drawn on the plot.
sc.pl.umap(adata_mvi, color=['leiden_2.0'], legend_loc='on data')

In [None]:
total_peaks = np.asarray(np.sum(adata_mvi[:, adata_mvi.var.modality == 'Peaks'].X, axis=1)).reshape(-1)
adata_mvi.obs['total_peak_count'] = total_peaks
meta = adata_mvi.obs.copy()
df2 = meta.groupby(['leiden_1.0'])['total_peak_count'].mean()
df2

In [None]:
with parallel_backend('threading', n_jobs=32):
    sc.tl.leiden(adata_mvi, key_added = "leiden_1.0", resolution = 1.0) # default resolution in 1.0
## Scanpy Clustering Leiden (https://nbisweden.github.io/workshop-scRNAseq/labs/compiled/scanpy/scanpy_04_clustering.html)

In [None]:
# UMAP colored by the resolution-1.0 clusters, with cluster names drawn on the plot.
sc.pl.umap(adata_mvi, color=['leiden_1.0'], legend_loc='on data')

In [None]:
# Same plot with the default legend placement.
sc.pl.umap(adata_mvi, color=['leiden_1.0'])

In [None]:
# Cast every obs column stored with the generic "object" dtype to str.
# NOTE(review): presumably done so the table can be written to h5ad — confirm.
# NOTE: missing values in those columns become literal strings after the cast,
# so later .isnull() checks no longer detect them.
dtypes = adata_mvi.obs.dtypes
non_string_cols = [col for col, dtype in dtypes.items() if dtype == 'object']
adata_mvi.obs[non_string_cols] = adata_mvi.obs[non_string_cols].astype(str)

In [None]:
# Persist the annotated AnnData and the trained model; save_anndata=True stores
# the data alongside the model.
# NOTE(review): no overwrite flag is passed to save(); a re-run into an existing
# directory may fail — confirm against the scvi-tools version in use.
adata_mvi.write("multivi_AD_MicroPVM.h5ad")
mvi.save("trained_multivi_AD_MicroPVM", save_anndata=True)

## Replace old names with new names (the multiome data used the old names)

In [None]:
# Reload the AnnData written above and read the table of old and new names.
# NOTE: the CSV path is a hard-coded absolute path; the first CSV column is used as the index.
adata_mvi = anndata.read_h5ad(filename="multivi_AD_MicroPVM.h5ad")
new_names = pd.read_csv('/allen/programs/celltypes/workgroups/hct/SEA-AD/RNAseq/diff_prop/input/MTG/old_names_to_new_names.csv', index_col=0)

In [None]:
# Lookup table from old to new names, built from the two columns of the CSV.
name_dict = new_names.set_index('old_names')['new_names'].to_dict()

# Translate the transferred labels into a new column; labels that have no entry
# in the lookup table are kept unchanged.
old_labels = adata_mvi.obs['supertype_scANVI_leiden_label_transfer']
adata_mvi.obs['supertype_scANVI_leiden_label_transfer_new'] = old_labels.replace(name_dict)

In [None]:
# UMAP colored by the renamed transferred labels.
plt.rcParams["figure.figsize"] = (5, 5)
sc.pl.umap(adata_mvi, color='supertype_scANVI_leiden_label_transfer_new')

In [None]:
# UMAP colored by the original (untransferred) annotation, for comparison.
sc.pl.umap(adata_mvi, color='supertype_scANVI_leiden')

In [None]:
## Export the cell metadata (including the supertype annotation) as CSV.
# reset_index() returns a new table whose first column holds the former index;
# that column is renamed to 'index_name' before writing.
meta = adata_mvi.obs.reset_index()
index_col_name = meta.columns[0]
meta = meta.rename(columns={index_col_name: 'index_name'})
meta.to_csv("multivi_AD_MicroPVM.h5ad.obs.csv", index=False)


In [None]:
# Persist the AnnData including the renamed label column.
adata_mvi.write("multivi_AD_MicroPVM_NewNames.h5ad")

In [None]:
# Labels to exclude from the downstream plots.
bad_labels = ['cluster_doublet_score_flag', 'cluster_fraction_ribo_flag']
# Boolean mask that is True for cells whose renamed label is NOT one of the excluded labels.
cut_off = (~adata_mvi.obs['supertype_scANVI_leiden_label_transfer_new'].isin(bad_labels))
# Independent copy restricted to the retained cells.
ad = adata_mvi[cut_off].copy()

In [None]:
# UMAP of the retained cells, colored by the renamed transferred labels.
plt.rcParams["figure.figsize"] = (5, 5)
sc.pl.umap(ad, color='supertype_scANVI_leiden_label_transfer_new')

In [None]:
import seaborn as sns

# Count the cells for each (cluster, label) pair, reshape the counts into a
# cluster-by-label table and draw it as a heatmap with labels on the rows.
pair_sizes = ad.obs.groupby(["leiden_1.0", "supertype_scANVI_leiden_label_transfer_new"]).size()
df = pair_sizes.reset_index(name="Counts")
df_wide = df.pivot_table(
    index="leiden_1.0",
    columns="supertype_scANVI_leiden_label_transfer_new",
    values="Counts",
)
sns.heatmap(df_wide.T)

## reload anndata and plot QC

In [None]:
# Reload the AnnData with the renamed labels from the current working directory.
adata_mvi = anndata.read_h5ad(filename="multivi_AD_MicroPVM_NewNames.h5ad")

In [None]:
# The donor-level table is expected in the parent of the current working directory.
parent_folder = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
csv_file_path = os.path.join(parent_folder, 'donor_name_ch_cognitivestatus_binary.csv')
selected_data = pd.read_csv(csv_file_path)

# Map each cell's donor to that donor's binary cognitive status; donors that are
# missing from the table get a missing value.
selected_dict = dict(zip(selected_data['donor_name'], selected_data['ch_cognitivestatus_binary']))
adata_mvi.obs['ch_cognitivestatus_binary'] = adata_mvi.obs['donor_name'].map(selected_dict)

In [None]:
# QC overview on the UMAP: RNA counts, detected RNA features, cognitive status
# and total peak count per cell.
plt.rcParams["figure.figsize"] = (5, 5)
sc.pl.umap(adata_mvi, color='nCount_RNA')
sc.pl.umap(adata_mvi, color='nFeature_RNA')
sc.pl.umap(adata_mvi, color='ch_cognitivestatus_binary')
sc.pl.umap(adata_mvi, color='total_peak_count')