## scib-pipeline-R4.0 env: Sample Integration

In [None]:
import os
import pandas as pd
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
#import scib
import anndata

# pyscenic env:
#import loompy
#import colorcet as cc
import scvi

# Initialize random seeds.
# `random.seed` alone only seeds Python's stdlib RNG; numpy and scvi-tools
# keep their own generators, so seed those as well to make training,
# neighbours/UMAP and clustering repeatable across kernel restarts.
import random
SEED = 111
random.seed(SEED)
np.random.seed(SEED)
scvi.settings.seed = SEED

# set a working directory
# wdir = "/ceph/project/tendonhca/akurjan/analysis/"
wdir = "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/notebooks"
os.chdir( wdir )

# folder structures (relative to the working directory set above)
NORMALIZATION_FOLDERNAME = "foetal/results/Normalisation/"
RESULTS_FOLDERNAME = "foetal/results/scVI/"
FIGURES_FOLDERNAME = "foetal/figures/scVI/"

# Create the output folders if they are missing; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(RESULTS_FOLDERNAME, exist_ok=True)
os.makedirs(FIGURES_FOLDERNAME, exist_ok=True)

# Set folder for saving figures into
sc.settings.figdir = FIGURES_FOLDERNAME

# Print date and time:
import datetime
e = datetime.datetime.now()
print ("Current date and time = %s" % e)

# Set other settings
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.set_figure_params(dpi=150, fontsize=10, dpi_save=600)

In [None]:
def savesvg(fname: str, fig, folder: str=FIGURES_FOLDERNAME) -> None:
    """
    Write `fig` to `folder`/`fname` as an SVG (vector) image.

    The figure's tight layout is applied first; the file is then saved with
    ``format='svg'`` regardless of the extension carried by `fname`.
    """
    fig.tight_layout()
    target_path = os.path.join(folder, fname)
    fig.savefig(target_path, format='svg')

# FOETAL DATA

In [None]:
adata = sc.read_h5ad(os.path.join(NORMALIZATION_FOLDERNAME, 'dev_adata_normalized.h5ad'))
adata

In [None]:
def _match_uint_dtype(dtype, targets):
    """Return ``(source, target)`` dtype names if `dtype` equals a key of `targets`, else ``None``."""
    for source, target in targets.items():
        if dtype == source:
            return source, target
    return None


def convert_uint_to_int_single(adata):
    """
    Replace unsigned-integer dtypes in an AnnData-like object, in place.

    Prints one message per conversion and returns ``None``.

    Conversions applied:

    * ``adata.var`` / ``adata.obs`` columns:
      uint32 -> float32, uint64 -> float64, uint16 -> int32
    * ``adata.X`` and every entry of ``adata.layers``:
      uint32 -> int32, uint64 -> int64, uint16 -> int32

    uint16 is widened to int32 rather than int16: int16 tops out at 32767,
    so casting uint16 values in 32768..65535 to int16 would silently wrap
    them to negative numbers.

    Caveats that are left unchanged: float32 represents integers exactly
    only up to 2**24, and uint32 -> int32 / uint64 -> int64 still wrap for
    values at or above 2**31 / 2**63.

    Parameters
    ----------
    adata
        Object exposing ``var``, ``obs``, ``X`` and ``layers`` (a mapping of
        name -> matrix), e.g. an ``anndata.AnnData``.
    """
    annotation_targets = {'uint32': 'float32', 'uint64': 'float64', 'uint16': 'int32'}
    matrix_targets = {'uint32': 'int32', 'uint64': 'int64', 'uint16': 'int32'}

    # Check var and obs dtypes
    for attr in ['var', 'obs']:
        table = getattr(adata, attr)
        if isinstance(table, np.ndarray):
            # Structured array: visit each field. `dtype.names` is None for a
            # plain (non-structured) array, in which case there is nothing to do.
            # NOTE: assigning into a structured-array field casts back to the
            # field's declared dtype, so this branch cannot widen the field.
            for field in (table.dtype.names or ()):
                hit = _match_uint_dtype(table[field].dtype, annotation_targets)
                if hit is not None:
                    source, target = hit
                    table[field] = table[field].astype(target)
                    print(f"Converted {attr}.{field} from {source} to {target}.")
        elif isinstance(table, pd.DataFrame):
            # DataFrame: replace each matching column with its converted copy
            for col in table.columns:
                hit = _match_uint_dtype(table[col].dtype, annotation_targets)
                if hit is not None:
                    source, target = hit
                    table[col] = table[col].astype(target)
                    print(f"Converted {attr}.{col} from {source} to {target}.")

    # Update X dtype
    hit = _match_uint_dtype(adata.X.dtype, matrix_targets)
    if hit is not None:
        source, target = hit
        adata.X = adata.X.astype(target)
        print(f"Converted X from {source} to {target}.")

    # Update layers dtypes; snapshot the items so that reassigning entries
    # cannot interfere with the iteration.
    for layer_key, layer_val in list(adata.layers.items()):
        hit = _match_uint_dtype(layer_val.dtype, matrix_targets)
        if hit is not None:
            source, target = hit
            adata.layers[layer_key] = layer_val.astype(target)
            print(f"Converted layer {layer_key} from {source} to {target}.")

In [None]:
convert_uint_to_int_single(adata)

In [None]:
adata.X = adata.layers["counts"].copy()
print(adata.X[1:5,1:5])

In [None]:
sc.pp.filter_genes(adata, min_cells=20)

In [None]:
adata

In [None]:
del adata.raw

# scVI INTEGRATION
As a first step, we assume that the data is completely unlabelled and we wish to find common axes of variation between the two datasets. There are many methods available in scanpy for this purpose (BBKNN, Scanorama, etc.). In this notebook we present scVI. To run scVI, we simply need to:

1) Register the AnnData object with the correct key to identify the sample and the layer key with the count data.
2) Create an SCVI model object.

Decide which variable to treat as the batch on which the model will be conditioned (i.e. the variable whose effects are minimised in the low-dimensional latent space). Make sure you understand how scVI and scANVI work before choosing. **TODO:** benchmark the integration with scIB.

#### scVI (single-cell Variational Inference)
*scVI* is a hierarchical Bayesian model for single-cell RNA sequencing data whose conditional distributions are parametrized by neural networks. Working as a hybrid between a neural network and a Bayesian network, scVI performs data harmonization. It is built on a variational autoencoder (VAE) adapted to single-cell gene expression data, which gives the latent space a structure better suited to downstream analysis. While scANVI additionally lets users exploit cell-type labels in a semi-supervised fashion, scVI is easier to train, and cell-type labels are not required in the purely unsupervised case.

**Define the scVI model**

First, we define the model and its hyperparameters:

- `n_hidden`: number of units in each hidden layer (128 here)
- `n_latent`: number of dimensions in the shared latent space, i.e. the dimensionality of *z* (10 is the scVI default; the final model below uses 30)
- `n_layers`: number of hidden layers in the neural network
- `dispersion`: `'gene'` – each gene has its own dispersion parameter; `'gene-batch'` – each gene in each batch has its own dispersion parameter



In [None]:
adata.obs[['sample', 'age']].value_counts()

In [None]:
adata.obs.columns

In [None]:
sc.pl.umap(adata, color='sample')

## Hyperparameter Tuning

In [None]:
model_cls = scvi.model.SCVI
model_cls.setup_anndata(adata)

In [None]:
model_cls

In [None]:
import ray
import hyperopt
from ray import tune
from scvi import autotune

scvi_tuner = autotune.ModelTuner(model_cls)
scvi_tuner.info()

In [None]:
search_space = {
    "n_latent": tune.choice([10, 14, 6]),
    "n_hidden": tune.choice([256, 128]),
    "n_layers": tune.choice([1, 2, 3]),
    "lr": tune.loguniform(1e-4, 1e-2)
}

In [None]:
ray.init(log_to_driver=False)

In [None]:
results = scvi_tuner.fit(
    adata,
    metric="validation_loss",
    search_space=search_space,
    searcher='hyperopt',
    num_samples=20,
    max_epochs=150,
    resources={"cpu": 20, "gpu": 1},
)

In [None]:
print(results.model_kwargs)
print(results.train_kwargs)

In [None]:
results

In [None]:
ray.shutdown()

In [None]:
search_space = {
    "n_latent": tune.choice([14, 30]),
    "n_layers": tune.choice([1, 2]),
    "dropout_rate": tune.choice([0.1, 0.2]),
    "gene_likelihood": tune.choice(['zinb', 'nb'])
}

In [None]:
ray.init(log_to_driver=False)

In [None]:
results = scvi_tuner.fit(
    adata,
    metric="validation_loss",
    search_space=search_space,
    searcher='hyperopt',
    num_samples=30,
    max_epochs=150,
    resources={"cpu": 20, "gpu": 1},
)

In [None]:
print(results.model_kwargs)
print(results.train_kwargs)

In [None]:
results

# Final Model

In [None]:
# Register `adata` for scVI training: counts are read from the "counts" layer,
# no batch_key is set, and "type" and "libbatch" are passed as categorical covariates.
# NOTE(review): the cell that reloads the saved model registers
# ["sampletype", "libbatch"] instead, and the saved-model directory name says
# "SampleTypeLibbatch" - confirm which covariate set is intended.
scvi.model.SCVI.setup_anndata(adata, #adata_hvg
                              layer="counts", 
                              #batch_key="sampletype",
                              categorical_covariate_keys=["type","libbatch"], # nuisance effects the model conditions on (not of biological interest)
                              # "age", "phase" - deliberately not corrected for, to avoid removing variation of interest
                              # NOTE(review): an earlier version of this comment also listed "type" as not corrected, which contradicts the key list above
                              )

# Optional continuous covariate, currently not registered:
#continuous_covariate_keys=["percent_mito"]

In [None]:
# vae = scvi.model.SCVI(adata_hvg, n_hidden = 128, n_latent=30, n_layers=2, dropout_rate=0.1, dispersion="gene-batch", gene_likelihood='nb')
vae = scvi.model.SCVI(adata, n_hidden = 128, n_latent=30, n_layers=2, dropout_rate=0.1, dispersion="gene-batch", gene_likelihood='zinb')
vae

In [None]:
vae.view_anndata_setup(adata)

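Next, we train the model on the dataset. In a purely unsupervised setting one could set `train_size=1.0` and use all cells for training; here 10% of the cells are held out (`train_size=0.9`, `validation_size=0.1`) so that the validation ELBO can be monitored for early stopping.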

In [None]:
#max_epochs_scvi = np.min([round((20000 / adata.n_obs) * 400), 400])
#max_epochs_scvi

In [None]:
%%time
# Train for at most 400 epochs on a 90/10 train/validation split. The
# validation ELBO is evaluated every epoch and training stops early once it
# has not improved for 20 consecutive checks.
# The device is selected with `accelerator` only: `use_gpu` is the deprecated
# spelling of the same choice (removed in later scvi-tools releases), so
# passing both was redundant.
vae.train(max_epochs = 400, train_size = 0.9, validation_size = 0.1,
          accelerator='gpu',
          check_val_every_n_epoch=1,
          early_stopping=True,
          early_stopping_patience=20,
          early_stopping_monitor="elbo_validation",
         )

In [None]:
# train_elbo = vae.history["elbo_train"][1:]
# test_elbo = vae.history["elbo_validation"]
# ax = train_elbo.plot()
# test_elbo.plot(ax=ax)

# Ensure convergence: plot training and validation ELBO against epoch.
# Build a new frame rather than adding a column to vae.history["elbo_train"],
# which would modify the history table stored on the model in place.
train_test_results = pd.concat(
    [vae.history["elbo_train"], vae.history["elbo_validation"]],
    axis=1,
)
train_test_results.iloc[10:].plot(logy=True)  # exclude first 10 epochs
plt.show()

In [None]:
vae.get_elbo(adata)

In [None]:
# "Full" = all genes; model parameters set to zinb (gene_likelihood) and gene-batch (dispersion).
# NOTE(review): this comment previously said "batch-corrected for libbatch and sex",
# but the training setup cell registers "type" and "libbatch" as covariates and the
# directory name below says "SampleType" - confirm which covariates this model uses.
# overwrite=True replaces any model already saved in this directory.
vae.save(os.path.join(RESULTS_FOLDERNAME, "FoetalFull_SampleTypeLibbatch_ZinbGeneBatch/"), overwrite=True)

# Load saved model

In [None]:
# Re-register `adata` and rebuild an SCVI model with the same architecture
# arguments as the trained one, ahead of loading the saved weights.
# NOTE(review): this registers ["sampletype", "libbatch"], whereas the training
# setup cell registers ["type", "libbatch"] - confirm which covariate set the
# saved model expects before relying on this registration.
scvi.model.SCVI.setup_anndata(adata, #adata_hvg
                              layer="counts",
                              categorical_covariate_keys=["sampletype", "libbatch"], # nuisance effects the model conditions on (not of biological interest)
                              # "age", "type", "phase", "sample" - deliberately not corrected for, to avoid removing variation of interest
                              )

vae = scvi.model.SCVI(adata, n_hidden = 128, n_latent=30, n_layers=2, dropout_rate=0.1, dispersion="gene-batch", gene_likelihood='zinb')
vae

In [None]:
# `load` is a classmethod, so call it on the class: this cell then no longer
# needs a throw-away `vae` instance to exist before it can run.
# The device is left to scvi-tools' automatic selection; `use_gpu` is the
# deprecated spelling and is rejected by later releases.
vae = scvi.model.SCVI.load(os.path.join(RESULTS_FOLDERNAME, "FoetalFull_SampleTypeLibbatch_ZinbGeneBatch/"), adata=adata)
vae

Once the training is done, we can evaluate the latent representation of each cell in the dataset and add it to the AnnData object. Now, we use the scVI latent space to generate the same UMAP plots to see if scVI successfully accounts for batch effects in the data.

In [None]:
adata.obsm["X_scVI"] = vae.get_latent_representation()
# use scVI latent space for UMAP generation
sc.pp.neighbors(adata, use_rep="X_scVI", metric='correlation')
sc.tl.umap(adata)

In [None]:
def plot_umaps(anndata, parameters: list, filename: str):
    """
    Draw one UMAP panel per entry of `parameters`, stacked vertically, and save it as SVG.

    Parameters
    ----------
    anndata
        AnnData object passed to ``sc.pl.umap`` (NB: this parameter name
        shadows the imported ``anndata`` module inside the function).
    parameters
        Keys to colour the UMAP by, one panel each; must not be empty.
    filename
        File name handed to ``savesvg`` (written into its default folder).

    Raises
    ------
    ValueError
        If `parameters` is empty.
    """
    n_plots = len(parameters)
    if n_plots == 0:
        raise ValueError("plot_umaps needs at least one entry in `parameters`.")
    # squeeze=False keeps `axs` a 2-D array even for a single panel; with the
    # default, plt.subplots(1, 1) returns a bare Axes and indexing it fails.
    fig, axs = plt.subplots(n_plots, 1, figsize=(7, 4*n_plots), squeeze=False)
    for ax, param in zip(axs[:, 0], parameters):
        sc.pl.umap(anndata, color=param, ax=ax, show=False, frameon=False)
        ax.set_title(param)
    plt.tight_layout()
    savesvg(filename, fig)
    plt.show()

In [None]:
plot_umaps(adata, ['sampletype', 'sample', 'age', 'libbatch', 'type', 'phase', 'sex'], 
           filename = 'dev_UMAP_plots_scVI_withoutSampletypeCorrection.svg')

In [None]:
adata.write(os.path.join(RESULTS_FOLDERNAME, 'dev_libbatchsex_scVI.h5ad'))

# ANNOTATION

In [None]:
adata = sc.read_h5ad(os.path.join(RESULTS_FOLDERNAME, 'dev_libbatchsex_scVI.h5ad'))
adata

In [None]:
sc.pl.umap(adata, 
           color=['NOTCH3', 'MYH11', 'MYH9', 'DES', 'MCAM', 'RGS5', 'ACTA2', 
                  'CSPG4', 'PDGFRB', 'COL4A1', 'KCNJ8', 'MGP', 'MYL9', 
                  'ABCC9', 'cell_type'],
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_pericyte.png')

In [None]:
# Integer version of the 'age' labels: drop the literal character 'w' and cast
# to int (presumably labels look like '12w' - confirm against adata.obs['age']).
# regex=False makes the literal replacement explicit.
adata.obs['ageint'] = adata.obs['age'].str.replace('w', '', regex=False).astype(int)
# Only the last expression of a cell is displayed, so print the counts explicitly;
# as a bare mid-cell expression they were silently discarded.
print(adata.obs['ageint'].value_counts())
sc.pl.umap(adata, color='ageint', save='_dev_ageint.svg')

In [None]:
# Leiden clustering at three resolutions; each result is stored in adata.obs
# under its own key, then all three are shown side by side on the UMAP.
leiden_runs = {0.4: 'leiden_04scvi', 0.5: 'leiden_05scvi', 0.6: 'leiden_06scvi'}
for leiden_resolution, leiden_key in leiden_runs.items():
    sc.tl.leiden(adata, resolution=leiden_resolution, key_added=leiden_key)
sc.pl.umap(adata, color=list(leiden_runs.values()), legend_loc="on data", frameon=False)

In [None]:
sc.pl.umap(adata, color='leiden_06scvi', frameon=False, legend_loc='on data', save='_leiden06.svg')

In [None]:
sc.pl.umap(adata, color=["age", "sampletype"], frameon=False)

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden_06scvi', method='wilcoxon', key_added='wilcoxon_06scvi', 
                        use_raw=False, layer='log1p_norm')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key='wilcoxon_06scvi')

In [None]:
sc.tl.dendrogram(adata, 'leiden_06scvi')
sc.pl.dendrogram(adata, groupby='leiden_06scvi')

In [None]:
sc.tl.rank_genes_groups(adata, groupby='type', method='wilcoxon', key_added='wilcoxon_type_scvi', 
                        use_raw=False, layer='log1p_norm')
sc.pl.rank_genes_groups(adata, n_genes=30, sharey=False, key='wilcoxon_type_scvi')

In [None]:
sc.pl.umap(adata, 
           color=['SCX', 'MKX', 'TNMD', 'FMOD', 'THBS2', 'THBS4', 'EGR1', 'leiden_06scvi', 'age', 'type', 'sample', 'sampletype'], 
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_DEV_early_tendon_lineagegenes-scvi.svg'
          )

In [None]:
sc.pl.umap(adata, 
           color=['COL3A1', 'LUM', 'FBN1', 'COL6A6', 'leiden_06scvi', 'sampletype'], 
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_DEV_fib_type2-scvi.svg'
          )

In [None]:
sc.pl.umap(adata, 
           color=['SOX9', 'SOX5', 'SOX6', 'ACAN', 'COL9A1', 'COL2A1', 'COL11A2', 'COMP', 'DCX', 'CNMD', 'leiden_06scvi', 'sampletype'], 
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_DEV_chondrogenic_fibroblasts-scvi.svg')

In [None]:
sc.pl.umap(adata, 
           color=['NEB', 'TTN', 'DMD', 'NEXN', 'TRDN', 'MYO18B', 'leiden_06scvi', 'sampletype'], 
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_DEV_myocytes_fibroblasts-scvi.svg')

In [None]:
sc.pl.umap(adata, 
           color=['MYH8', 'MYH3', 'MYL1', 'TNNC2', 'ACTA1', 'DES', 'MYOG', 'leiden_06scvi', 'sampletype'], 
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_DEV_myocytes2_fibroblasts-scvi.svg')

In [None]:
sc.pl.umap(adata, 
           color=['PECAM1', 'CD34', 'MCAM', 'ENG', 'KDR', 'VWF', 'leiden_06scvi', 'sampletype'], 
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_DEV_VECs-scvi.svg')

In [None]:
sc.pl.umap(adata, 
           color=['PROX1', 'LYVE1', 'leiden_06scvi', 'sampletype'], 
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_DEV_LECs-scvi.svg')

In [None]:
# Convert the index to string
adata.var.index = adata.var.index.astype(str)
adata.var_names_make_unique()

In [None]:
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, groupby='leiden_06scvi',
                                use_raw=False, 
                                layer="log1p_norm", 
                                vmax=3, vmin=0, 
                                #cmap='RdBu_r',
                                key='wilcoxon_06scvi',
                                save='topDEGs06scvi_norm.svg')

In [None]:
result = adata.uns['wilcoxon_06scvi']
groups = result['names'].dtype.names
df = pd.DataFrame(
    {group + '_' + key: result[key][group]
    for group in groups 
    for key in ['names','scores','logfoldchanges', 'pvals', 'pvals_adj']})
df.to_csv(os.path.join(RESULTS_FOLDERNAME, 'wilcoxon_DGE_leiden06scvi.csv'))
df.head(5)

In [None]:
# Map each group in `groups` to a list of its first 10 entries in result['names']
top_names_dict = {
    group_name: list(result['names'][group_name][:10])
    for group_name in groups
}

In [None]:
adata.var

During development, fibroblasts change their mesenchymal phenotype to an epithelial phenotype, in a process defined as the mesenchymal–epithelial transition (MET), which is fundamental in global ontogenetic development [4,5]. MET involves a complex functional phenotypic change from a typical mesenchymal nonpolarized cell to a polarized ELC. In this process, fibroblasts lose their spindle-shaped morphology and migratory capacity. The expression of typical markers, such as vimentin, desmin, α-SMA, N-cadherin, collagen type I and III, and Thy-1 cell surface antigen (CD90), is lost as well [4,5,6]. These features are replaced by polarity and the expression of epithelial markers, such as E-cadherin, tight junction protein 1 (TJP1), also known as zonula occludens protein-1 (TJP1/ZO-1), cytokeratins, type IV and VII collagen, and laminin [7].

In [None]:
sc.pl.umap(adata, 
           color=['DES', 'VIM', 'S100A4', 'ACTA2', 'CDH2', 'THY1', 'COL1A1', 'COL3A1', 'MMP2', 'FN1', 'leiden_06scvi'],
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_mesenchymal_fibro_type.svg')

In [None]:
sc.pl.umap(adata, 
           color=['CDH1', 'TJP1', 'LAMA1', 'COL4A1', 'FAP', 'CTNNB1', 'MUC1', 'leiden_06scvi'],
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_epithelial_fibro_type.svg')

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6701373/

In [None]:
sc.pl.umap(adata, 
           color=['SNAI1', 'SNAI2', 'TWIST1', 'leiden_06scvi'],
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_endoTM_fibro_type.svg')

In [None]:
for ct in top_names_dict.keys():
    print(f"Cluster {ct.upper()}:")  # print cluster name
    sc.pl.umap(
        adata,
        layer='log1p_norm',
        color=top_names_dict[ct]+['leiden_06scvi'],
        vmin=0,
        vmax="p99",  # set vmax to the 99th percentile of the gene count instead of the maximum, to prevent outliers from making expression in other cells invisible. Note that this can cause problems for extremely lowly expressed genes.
        sort_order=False,  # do not plot highest expression on top, to not get a biased view of the mean expression among cells
        frameon=False,
        cmap="Reds",  # or choose another color map e.g. from here: https://matplotlib.org/stable/tutorials/colors/colormaps.html
        save=f'_dev_cluster{ct}_scvi_topDEGs_06scvi.svg'
    )
    print("\n\n\n")  # print white space for legibility

Data was manually annotated based on known tenocyte and other cell type markers (shown above) + (CellMESH querying with https://uncurl.cs.washington.edu/db_query + using g:GOSt functional profiling tool on 500+ differentially expressed genes https://biit.cs.ut.ee/gprofiler/gost). Clustering dendrograms were consulted.

In [None]:
sc.pl.umap(adata, 
           color=['PAX6', 'NES', 'PROM1', 'leiden_06scvi', 'sampletype'], 
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_neuroepithelial_markers-scvi.svg')

In [None]:
sc.pl.umap(adata, 
           color=['VIM', 'S100A4', 'PDGFRB', 'DDR2', 'PDPN', 'THY1', 'PDGFRA', 'CD44', 'leiden_06scvi', 'sampletype'], 
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_osteoblast_markers-scvi.svg')

In [None]:
gene_markers = ['SCX', 'MKX', 'TNMD', 'FMOD', 'THBS2', 'THBS4', 'EGR1', 
                'ABI3BP','COL1A1',"COL12A1",'COL3A1','COL6A6',"SPARC","POSTN","DCN","BGN",'KERA','LUM','FBN1']
sc.pl.matrixplot(adata, gene_markers, groupby='leiden_06scvi', cmap='viridis', 
                 save='_matrix_tenocytemarkers.svg',
                 dendrogram=True, layer='log1p_norm', vmin=0, vmax=3)

In [None]:
gene_markers = {'ECM genes':['ABI3BP','COL1A1',"COL12A1",'COL3A1','COL6A6',"SPARC","POSTN","DCN","BGN",'KERA','LUM','FBN1'],
                'Tenocyte genes': ['SCX', 'MKX', 'TNMD', 'FMOD', 'THBS2', 'THBS4', 'EGR1']
               }

sc.pl.dotplot(adata, gene_markers, groupby='leiden_06scvi', dendrogram=True, 
                 save='_tenocytemarkers.svg',
                 layer='log1p_norm', vmin=0, vmax=3)

In [None]:
sc.pl.umap(adata, color=["COL1A1","COL12A1","SPARC","POSTN","DCN","BGN", "leiden_06scvi", "sampletype"],
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_general_fibroblasts-scvi.svg'
          )

In [None]:
gene_markers = ['ABI3BP','COL1A1',"COL12A1",'COL3A1','COL6A6',"SPARC","POSTN","DCN","BGN",'KERA','LUM','FBN1',
                'SCX', 'MKX', 'TNMD', 'FMOD', 'THBS2', 'THBS4', 'EGR1']

In [None]:
sc.pl.umap(adata, color= gene_markers,
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_allfibroblasts-scvi.svg'
          )

In [None]:
sc.pl.umap(adata, color=["MYH11","ACTA2","NOTCH3","CALD1", "leiden_06scvi", "sampletype"],
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_smooth_myocytes-scvi.svg'
          )

In [None]:
sc.tl.rank_genes_groups(adata, groupby='age', method='wilcoxon', key_added='wilcoxon_age', 
                        use_raw=False, layer='log1p_norm')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key='wilcoxon_age')

In [None]:
# adata.var.index=adata.var['ensembl_gene_id']
# annot = sc.queries.biomart_annotations(
#     "hsapiens",
#     ["description", "ensembl_gene_id"],
# ).set_index("ensembl_gene_id")

# adata.var[annot.columns] = annot
# adata.var.index=adata.var['Gene']
# adata.var.drop(columns='Gene', inplace=True)

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden_06scvi', groups=['7'], reference='3',
                     method='wilcoxon', key_added='wilcoxon_7vs3', use_raw=False, layer='log1p_norm')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key='wilcoxon_7vs3')

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden_06scvi', groups=['2'], reference='7',
                     method='wilcoxon', key_added='wilcoxon_2vs7', use_raw=False, layer='log1p_norm')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key='wilcoxon_2vs7')

In [None]:
markers_dict = {
    'chondrocytes':[
        "COL2A1",
        "SOX9",
        "COL9A1",
        "ACAN",
        "COMP",
        "HAPLN1",
        "MATN1"
    ],
    'fibroblasts (general)': [
        "COL1A1",
        "COL12A1",
        "SPARC",
        "POSTN",
        "DCN",
        #"BGN"
    ],
    'fibroblasts (type 1)': [
        "SOX5",
        "COL11A1",
        "ABI3BP",
        "GAS2",
        #"COL24A1", # could regulate type I collagen fibrillogenesis, upregulated in human tendinopathy
        "FMOD",
        "TNMD",
        "MKX",
        "KERA",
        "SCX"
    ],
    'fibroblasts (type 2)': [
        "COL3A1", 
        "COL6A6",
        "DCLK1",
        "EBF1",
        "TSHZ2",
        "PLAGL1",
        "VCAN",
        "FBN1",
        "NOVA1",
        "NEGR1",
        "NAV3",
        "LUM",
        "SEMA5A",
        "TNXB",
        "PI16",
        "SCN7A",
        "CDH18",
    ],
    
    'fibroblasts (type 3)': [
        "FGF14",
        "FSTL5",
        "THBS4",
        "BMP5",
        "CDH12"
    ],
    'macrophages': [
        "MRC1",
        "F13A1",
        "CSF1R", 
        "CD163",   
        "CD68",    
        "CD36"
    ],
    'immune cells': [
        "CD4",     # Cluster of differentiation 4 (T helper cells)
        "FOXP3",   # Forkhead box protein P3 (Regulatory T cells)
        "SKAP1",
        "THEMIS",
        "PTPRC",
        "RIPOR2",
        "IKZF1"
    ],
    'vascular endothelial cells': [
        "CDH5",    # Cadherin 5
        "VWF",     # Von Willebrand factor
        "PECAM1",  # Platelet and endothelial cell adhesion molecule 1
        "CD34"
    ],
    'lymphatic endothelial cells': [
        "LYVE1",   # Lymphatic vessel endothelial hyaluronan receptor 1
        "PROX1",   # Prospero homeobox 1
        "FLT4",    # Fms-related tyrosine kinase 4 (VEGFR3)
    ],
    'nervous system cells': [
        "NRXN1",
        "XKR4",
        "SLC35F1",
        "NCAM2",
        "PTPRZ1",
        "FIGN",
        "IL1RAPL2",
        "CDH6",
        "GRID2",
        "SOX10"
    ],
    'smooth myocytes': [
        "ACTA2",   # Alpha-smooth muscle actin
        "MYH11",   # Smooth muscle myosin heavy chain
        "NOTCH3",
        #"DES",     # Desmin
        #"VIM",     # Vimentin
        #"SMTN",    # Smoothelin
        "CALD1",   # Caldesmon
        "PDGFRB",
        "COL4A1",
    ],
    'dividing cells': [
        "MKI67",
        "DIAPH3",
        "CENPK",
        "CENPP",
        "TOP2A"
    ],
    'satellite/myoblast cells': [
        "PAX7",    # Paired box 7
        "DES",      # Desmin
        "CDH15",   # Cadherin-15 (M-Cadherin)
        "NES",     # Nestin
        "DLK1",
        "NCAM1",    # Neural Cell Adhesion Molecule 1 (CD56)
        "MYO18B",
        "RYR3",
        "MYF5",    # Myogenic factor 5
        "MYOD1",   # Myogenic Differentiation 1
        "MYOG",     # Myogenin
    ],
    'skeletal myocytes': [
        "TNNT3",
        "TTN",
        "MYH3",
        "COL22A1",
        "TNNC1"
    ],
}


In [None]:
sc.pl.dotplot(adata, markers_dict, groupby='leiden_06scvi',
             use_raw=False, layer='log1p_norm', dendrogram=True,
             save='_annotation.svg')

In [None]:
# Subset the dictionary
filtered_dict = {key: value for key, value in markers_dict.items() if 'fibro' in key.lower() or 'dividing' in key.lower()}
# Print the filtered dictionary
print(filtered_dict)

In [None]:
sc.pl.dotplot(adata, filtered_dict, groupby='leiden_06scvi', 
              use_raw=False, layer='log1p_norm', dendrogram=True,
              cmap='Reds',
              save='_annotation_fibroblasts.svg')

In [None]:
sc.pl.umap(adata, color=['SOX5', 'COL11A1', 'ABI3BP', 'GAS2', 'FMOD', 'TNMD', 'MKX', 'KERA', 'SCX'],
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_annotation_type1fibros.svg'
          )

In [None]:
sc.pl.umap(adata, color=['COL3A1', 'COL6A6', 'DCLK1', 'EBF1', 'TSHZ2', 'PLAGL1', 'VCAN', 'FBN1', 
                         'NOVA1', 'NEGR1', 'NAV3', 'LUM', 'SEMA5A', 'TNXB', 'PI16', 'SCN7A', 'CDH18'],
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_annotation_type2fibros.svg'
          )

In [None]:
sc.pl.umap(adata, color=['FGF14', 'FSTL5', 'THBS4', 'BMP5', 'CDH12'],
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_annotation_type3fibros.svg'
          )

In [None]:
sc.pl.umap(adata, color=['MKI67', 'DIAPH3', 'CENPK', 'CENPP', 'TOP2A'],
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_annotation_dividing.svg'
          )

In [None]:
# Subsetting the dictionary to remove keys containing 'fibro' or 'dividing'
filtered_dict = {key: value for key, value in markers_dict.items() if not ('fibro' in key.lower() or 'dividing' in key.lower())}
# Printing the resulting subsetted dictionary
print(filtered_dict)

In [None]:
sc.pl.dotplot(adata, filtered_dict, groupby='leiden_06scvi', 
              use_raw=False, layer='log1p_norm', dendrogram=True,
              save='_annotation_rest.svg')

In [None]:
# Marker genes grouped by label; the keys are drawn as group labels by the
# dotplot that consumes this dictionary, so the spelling matters
# ("maxtrix" corrected to "matrix").
end_markers_dict = {
    'interfascicular matrix (general)': [
        'COL4A1', 
        'COL4A2', 
        'HSPG2', # perlecan
        'NID1',  #nidogen-1
        'LAMB1', 
        'LAMC1', 
        'ITGA6', 
        'ITGB1',
    ],
    'vascular endothelial cells': [
        "CDH5",    # Cadherin 5
        "VWF",     # Von Willebrand factor
        "PECAM1",  # Platelet and endothelial cell adhesion molecule 1
        "CD34",
        "ENG",
        "TEK"
    ],
    'lymphatic endothelial cells': [
        "LYVE1",   # Lymphatic vessel endothelial hyaluronan receptor 1
        "PROX1",   # Prospero homeobox 1
        "FLT4",    # Fms-related tyrosine kinase 4 (VEGFR3)
    ],
    'nervous system cells': [
        "NRXN1",
        "XKR4",
        "SLC35F1",
        "NCAM2",
        "PTPRZ1",
        "FIGN",
        "IL1RAPL2",
        "CDH6",
        "GRID2",
        "SOX10"
    ],
    'smooth myocytes': [
        "ACTA2",   # Alpha-smooth muscle actin
        "MYH11",   # Smooth muscle myosin heavy chain
        "NOTCH3",
        #"DES",     # Desmin
        #"VIM",     # Vimentin
        #"SMTN",    # Smoothelin
        "CALD1",   # Caldesmon
        "PDGFRB",
    ],
}

In [None]:
sc.pl.dotplot(adata, end_markers_dict, groupby='leiden_06scvi', 
              use_raw=False, layer='log1p_norm', dendrogram=True,
              save='_annotation_ifm_markers.svg',
              cmap='Greens'
             )

In [None]:
sc.pl.umap(adata, color=['COL4A1', 'COL4A2', 'HSPG2', 'NID1', 'LAMB1', 'LAMC1', 'ITGA6', 'ITGB1', 'CDH5', 
                         'VWF', 'PECAM1', 'CD34', 'ENG', 'TEK', 
                         'LYVE1', 'PROX1', 'FLT4', 'NRXN1', 'XKR4', 
                         'SLC35F1', 'NCAM2', 'PTPRZ1', 'FIGN', 'IL1RAPL2', 
                         'CDH6', 'GRID2', 'SOX10', 'ACTA2', 'MYH11', 'NOTCH3', 'CALD1', 'PDGFRB'],
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Greens",
           save = '_annotation_ifm.svg'
          )

In [None]:
import matplotlib
# Global matplotlib default: outline patch artists (e.g. bars) in black for all later figures.
matplotlib.rcParams['patch.edgecolor'] = 'black'

In [None]:
# Marker panels for the non-fibroblast structural lineages (cartilage and skeletal muscle).
structure_dict = {
    'chondrocytes': [
        'COL2A1', 'SOX9', 'COL9A1', 'ACAN', 'COMP', 'HAPLN1', 'MATN1',
    ],
    'satellite/myoblast cells': [
        'PAX7', 'DES', 'CDH15', 'NES', 'DLK1', 'NCAM1',
        'MYO18B', 'RYR3', 'MYF5', 'MYOD1', 'MYOG',
    ],
    'skeletal myocytes': [
        'TNNT3', 'TTN', 'MYH3', 'COL22A1', 'TNNC1',
    ],
}

# Dot plot of those panels per scVI Leiden cluster (log1p-normalised layer).
sc.pl.dotplot(
    adata,
    structure_dict,
    groupby='leiden_06scvi',
    use_raw=False,
    layer='log1p_norm',
    dendrogram=True,
    cmap='Oranges',
    save='_annotation_microanat_markers.svg',
)

In [None]:
# Per-gene UMAPs for every marker in structure_dict, in panel order
# (chondrocytes, then satellite/myoblast cells, then skeletal myocytes).
microanat_genes = [gene for panel in structure_dict.values() for gene in panel]

sc.pl.umap(
    adata,
    color=microanat_genes,
    layer='log1p_norm',
    vmin=0,
    vmax="p99",  # cap the colour scale at the 99th percentile of each gene
    sort_order=False,
    frameon=False,
    cmap="Oranges",
    save='_annotation_microanat.svg',
)

In [None]:
# Stacked bar chart: how many cells of each sample type fall into every scVI Leiden cluster.
cluster_column = 'leiden_06scvi'
sample_column = 'sampletype'

# Two-column frame holding only the grouping variables
data_to_plot = pd.DataFrame({'Cluster': adata.obs[cluster_column], 'SampleType': adata.obs[sample_column]})

# Count cells per (cluster, sample type), then spread sample types into columns
pivot_data = (
    data_to_plot
    .groupby(['Cluster', 'SampleType'])
    .size()
    .reset_index(name='Count')
)
pivot_data = (
    pivot_data
    .pivot(index='Cluster', columns='SampleType', values='Count')
    .fillna(0)
)

# Draw, label and save the figure
sns.set(style="whitegrid")
pivot_data.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title('Distribution of Tissue Type Donors in Clusters')
plt.xlabel('Clusters')
plt.ylabel('Number of Cells')
plt.legend(title='Sample Type')
plt.xticks(rotation=45)
barplot_path = os.path.join(FIGURES_FOLDERNAME, 'donordistInclusters.svg')
plt.savefig(barplot_path)
plt.show()

In [None]:
adata.obs.leiden_06scvi.value_counts()

In [None]:
# Filter the 'wilcoxon_06scvi' ranking using a 0.2 in-group minimum and a 0.2 out-group
# maximum expression fraction; the filtered ranking is stored under a new key so the
# original result is kept.
sc.tl.filter_rank_genes_groups(
    adata,
    min_in_group_fraction=0.2,
    max_out_group_fraction=0.2,
    key="wilcoxon_06scvi",
    key_added="wilcoxon_06scvi_filtered",
)

In [None]:
# Top 5 filtered DE genes per scVI Leiden cluster, coloured by log1p-normalised
# expression with the colour scale fixed to [0, 3].
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, groupby='leiden_06scvi',
                                use_raw=False, 
                                layer="log1p_norm", 
                                vmax=3, vmin=0, 
                                #cmap='RdBu_r',
                                key='wilcoxon_06scvi_filtered',
                                save='topDEGs06scvi_norm_filtered.svg')

In [None]:
# Marker panels used to annotate immune populations in the scVI clustering.
immune_dict = {
    'immune cells': [
        'PTPRC',  # 'FOXP3' was considered here but is disabled
        'CD44',
        'IKZF1', 
        'RUNX1', 
        'DOCK2', 
        'INPP5D',
    ],
    'macrophages': [
        'MRC1', 'F13A1', 
        'CSF1R', 'CD163', 
        'CD68', 'CD36',
        # disabled: 'LRMDA', 'ITPR2'
        'LGMN'
    ], 
    'lymphoid': [
        'SKAP1', 
        'THEMIS', 
        'CD247',
        'EPB41', 
        'PIP4K2A', 
        'PRKCB',
        'CD96',
        'CD38'
    ],
    'unknown progenitors': [
        'KIT',
        'TFRC',
        'IL18R1',
        'MITF',
        'BMP2K', 
        'GATA2',
    ]
                     
}
# Further candidate markers that are currently not plotted:
#   'FLT3', 'ILR7' (presumably a typo for IL7R, i.e. CD127 - confirm), 'PAX5', 'ID2', 'TCF3'

# Dot plot of the immune panels per scVI Leiden cluster (log1p-normalised layer).
sc.pl.dotplot(adata, immune_dict, groupby='leiden_06scvi', 
              use_raw=False, layer='log1p_norm', dendrogram=True,
              cmap='Purples',
              save='_annotation_immune_markers.svg'
             )

In [None]:
# Per-gene UMAPs for the immune marker panels.
# NOTE(review): compared with immune_dict['lymphoid'], THEMIS, CD96 and CD38 are not
# plotted here - confirm whether that omission is intentional.
sc.pl.umap(adata, color=['PTPRC','CD44','IKZF1','RUNX1','DOCK2','INPP5D','MRC1', 'F13A1', 
        'CSF1R', 'CD163', 
        'CD68', 'CD36',
        'LGMN',
        'SKAP1', 
        'CD247',
        'EPB41', 
        'PIP4K2A', 
        'PRKCB',
        'KIT',
        'TFRC',
        'IL18R1',
        'MITF',
        'BMP2K', 
        'GATA2'],
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Purples",
           save = '_annotation_immune.svg'
          )

In [None]:
# Manual annotation of the scVI Leiden clusters (cluster id -> cell-type label).
# Clusters labelled 'Unknown' are later passed to scANVI as the unlabeled category.
cell_type_names = {
    0: 'ABI3BP GAS2 Fibroblasts 2',
    1: 'ABI3BP GAS2 Fibroblasts 3',
    2: 'COL3A1 PI16 Fibroblasts',
    3: 'COL6A6 FNDC1 Fibroblasts',
    4: 'Unknown',  # dividing fibroblasts
    5: 'Macrophages',
    6: 'ABI3BP GAS2 Fibroblasts 1',
    7: 'NEGR1 SCN7A Fibroblasts',
    8: 'Satellite Cells',
    9: 'Chondrocytes',
    10: 'vasEndothelial Cells',
    11: 'Smooth Myocytes',
    12: 'FGF14 THBS4 Fibroblasts',
    13: 'Skeletal Myocytes',
    14: 'Nervous System Cells',
    15: 'Unknown',  # dividing cells
    16: 'Unknown',  # low immune cell nr
    17: 'Immune Cells',
    18: 'lymEndothelial Cells',
}

# Cluster ids are stored as strings/categories; cast to int so they match the dict keys.
cluster_ids = adata.obs['leiden_06scvi'].astype(int)
adata.obs['cell_type'] = cluster_ids
adata.obs['cell_type'] = adata.obs['cell_type'].replace(cell_type_names)

In [None]:
adata.obs['cell_type'].value_counts()

In [None]:
adata

In [None]:
# General fibroblast / ECM genes next to the cluster and sample-type labels on the scVI UMAP.
sc.pl.umap(adata, color=["COL1A1","COL12A1","SPARC","POSTN","DCN","BGN", "leiden_06scvi", "sampletype"],
           layer='log1p_norm',
           vmin=0,
           vmax="p99",
           sort_order=False,
           frameon=False,
           cmap="Reds",
           save = '_general_fibroblasts-scvi.svg'
          )

In [None]:
# Checkpoint: persist the annotated scVI object; the scANVI section reloads this file.
adata.write(os.path.join(RESULTS_FOLDERNAME, 'dev_libbatchsex_scVI.h5ad'))

# scANVI INTEGRATION

In [None]:
# Reload the annotated scVI checkpoint as the starting point for scANVI.
adata = sc.read_h5ad(os.path.join(RESULTS_FOLDERNAME, 'dev_libbatchsex_scVI.h5ad'))

In [None]:
# Restore the trained scVI model from disk and attach it to the reloaded `adata`.
# `load` is a classmethod: it rebuilds the model from the saved files, so no throw-away
# SCVI(...) instance is constructed first. The previous constructor call required a prior
# setup_anndata on this freshly read object, and the hyperparameters passed to it were
# discarded by `load` anyway.
vae = scvi.model.SCVI.load(
    os.path.join(RESULTS_FOLDERNAME, "FoetalFull_SampleTypeLibbatch_ZinbGeneBatch/"),
    use_gpu=True,
    adata=adata,
)
vae

In [None]:
# Replace the cell_type values directly in the original adata object
#adata.obs['cell_type'] = adata.obs['cell_type'].replace(['Dividing Cells', 'Dividing Cells 2'], 'Unknown')
#np.unique(adata.obs["cell_type"], return_counts=True)

In [None]:
# Initialise scANVI from the trained scVI model. Labels come from obs['cell_type'];
# cells labelled 'Unknown' are treated as unlabeled.
lvae = scvi.model.SCANVI.from_scvi_model(
    vae,
    adata=adata,
    labels_key="cell_type",
    unlabeled_category="Unknown",
)

In [None]:
# Train scANVI for at most 25 epochs on a 90/10 train/validation split, validating every
# epoch and stopping early after 10 epochs without improvement in "elbo_validation".
# NOTE(review): both `use_gpu` and `accelerator` are passed; newer scvi-tools releases
# may warn about or reject this combination - confirm against the installed version.
lvae.train(max_epochs=25, train_size = 0.9, validation_size = 0.1, 
          use_gpu=True, accelerator='gpu', 
          check_val_every_n_epoch=1,
          early_stopping=True,
          early_stopping_patience=10,
          early_stopping_monitor="elbo_validation",)

In [None]:
# Ensure convergence: plot train and validation ELBO on a log scale.
# The two history frames are joined into a new DataFrame; the previous version added the
# validation column to lvae.history["elbo_train"] in place, which altered the model's
# stored training history.
train_test_results = pd.concat(
    [lvae.history["elbo_train"], lvae.history["elbo_validation"]],
    axis=1,
)
train_test_results.iloc[10:].plot(logy=True)  # exclude first 10 epochs
plt.show()

In [None]:
# Persist the trained scANVI model (overwrites any previous save); the DGE section reloads it.
lvae.save(os.path.join(RESULTS_FOLDERNAME, "FoetalFull_SampleTypeLibbatch_ZinbGeneBatch_scANVI/"), overwrite=True)

In [None]:
# Store the scANVI label predictions and the scANVI latent representation on adata.
adata.obs["C_scANVI"] = lvae.predict(adata)
adata.obsm["X_scANVI"] = lvae.get_latent_representation(adata)

In [None]:
# Rebuild the neighbour graph on the scANVI latent space (correlation metric) and recompute
# the UMAP; this replaces the neighbour graph / UMAP previously stored on adata.
sc.pp.neighbors(adata, use_rep="X_scANVI", metric='correlation')
sc.tl.umap(adata)

In [None]:
def plot_umaps(anndata, parameters: list, filename: str):
    """
    Draw one UMAP per entry of `parameters`, stacked vertically, and save the figure as SVG.

    Parameters
    ----------
    anndata
        Object passed to `sc.pl.umap`.
    parameters
        Keys to colour by (one panel each); must contain at least one entry.
    filename
        File name handed to `savesvg` (written into its default figure folder).

    Raises
    ------
    ValueError
        If `parameters` is empty.
    """
    n_plots = len(parameters)
    if n_plots == 0:
        raise ValueError("parameters must contain at least one key to plot")
    # squeeze=False always returns a 2-D array of Axes, so a single parameter works too
    # (plt.subplots(1, 1) would otherwise return a bare Axes that cannot be indexed).
    fig, axs = plt.subplots(n_plots, 1, figsize=(8, 4*n_plots), squeeze=False)
    for ax, param in zip(axs[:, 0], parameters):
        sc.pl.umap(anndata, color=param, ax=ax, show=False, frameon=False, s=2)
        ax.set_title(param)
    fig.tight_layout()
    savesvg(filename, fig)
    plt.show()

In [None]:
# Covariates plus manual (cell_type) and predicted (C_scANVI) labels on the scANVI UMAP.
plot_umaps(adata, ['sampletype', 'sample', 'age', 'libbatch', 'type', 'phase', 'sex', 'cell_type', 'C_scANVI'], 
           filename = 'dev_UMAP_plots_scANVI.svg')

In [None]:
adata.obs['cell_type'].value_counts()

In [None]:
adata.obs['C_scANVI'].value_counts()

In [None]:
#adata = sc.read_h5ad(os.path.join(RESULTS_FOLDERNAME, 'dev_scANVI.h5ad'))

In [None]:
# Contingency table of manual label (rows) vs scANVI prediction (columns),
# then each row divided by its total so rows hold proportions.
df = adata.obs.groupby(["cell_type", "C_scANVI"]).size().unstack(fill_value=0)
row_totals = df.sum(axis=1)
conf_mat = df.div(row_totals, axis=0)

In [None]:
# Heatmap of the row-normalised label-transfer matrix (observed cell_type vs scANVI prediction).
# An explicit Figure/Axes pair is used so that `savesvg` receives a real Figure object
# rather than the pyplot module.
fig, ax = plt.subplots(figsize=(8, 8))
mesh = ax.pcolormesh(conf_mat, edgecolors='k', linewidths=0.5, cmap='viridis')
# Centre the tick labels on the cells (cell i spans [i, i+1])
ax.set_xticks(np.arange(0.5, len(conf_mat.columns), 1))
ax.set_xticklabels(conf_mat.columns, rotation=90)
ax.set_yticks(np.arange(0.5, len(conf_mat.index), 1))
ax.set_yticklabels(conf_mat.index)
ax.grid(False)
ax.set_xlabel("Predicted")
ax.set_ylabel("Observed")
# Add colorbar for better interpretation of the plot
fig.colorbar(mesh, ax=ax, label='Proportion')
savesvg('scANVI_prediction_matrix.svg', fig)
plt.show()

In [None]:
#adata.write(os.path.join(RESULTS_FOLDERNAME, 'dev_scANVI.h5ad'))

In [None]:
# scANVI UMAP coloured by the 'ageint' column with a continuous colour map.
sc.pl.umap(adata, color='ageint', frameon=False,
          save='_ageint.svg', cmap='viridis')

In [None]:
# The plot below and the refinement cell after it need obs['leiden_04scanvi'].
# Compute the clustering only when it is missing, so the notebook runs on a fresh kernel
# while an existing clustering (and its cluster numbering) is left untouched.
# NOTE(review): the refinement step hard-codes clusters '9' and '16'; a freshly computed
# clustering may number clusters differently - re-check those ids after recomputing.
if 'leiden_04scanvi' not in adata.obs:
    sc.tl.leiden(adata, resolution = 0.4, key_added = 'leiden_04scanvi')
sc.pl.umap(adata, color = ['C_scANVI', 'leiden_04scanvi', 'cell_type'], add_outline=True, legend_loc='on data',
           legend_fontsize=4, legend_fontoutline=1, frameon=False,
           save='_scANVI_before_refining.svg'
          )

In [None]:
# Override the scANVI prediction for two Leiden (res. 0.4) clusters:
# cluster 9 -> vascular endothelial, cluster 16 -> lymphatic endothelial.
cluster_9_cells = adata.obs.index[adata.obs['leiden_04scanvi'] == '9']
cluster_16_cells = adata.obs.index[adata.obs['leiden_04scanvi'] == '16']

in_cluster_9 = adata.obs.index.isin(cluster_9_cells)
in_cluster_16 = adata.obs.index.isin(cluster_16_cells)

# Work on an object-array copy of the current labels; cluster 16 is written last so it
# takes precedence, matching the order of the original nested selection.
refined_labels = adata.obs['C_scANVI'].to_numpy(dtype=object, copy=True)
refined_labels[in_cluster_9] = 'vasEndothelial Cells'
refined_labels[in_cluster_16] = 'lymEndothelial Cells'
adata.obs['C_scANVI'] = refined_labels
print(adata.obs['C_scANVI'].value_counts())

In [None]:
# Renumber the ABI3BP GAS2 fibroblast labels: '2' is merged into '1', then '3' becomes '2'.
# The order matters - the second rename must run after the first.
for old_label, new_label in (
    ('ABI3BP GAS2 Fibroblasts 2', 'ABI3BP GAS2 Fibroblasts 1'),
    ('ABI3BP GAS2 Fibroblasts 3', 'ABI3BP GAS2 Fibroblasts 2'),
):
    current_labels = adata.obs['C_scANVI']
    adata.obs['C_scANVI'] = np.where(current_labels == old_label, new_label, current_labels)
print(adata.obs['C_scANVI'].value_counts())

In [None]:
# Fold the 'Macrophages' label into 'Immune Cells'.
is_macrophage = adata.obs['C_scANVI'] == 'Macrophages'
adata.obs['C_scANVI'] = np.where(is_macrophage, 'Immune Cells', adata.obs['C_scANVI'])
print(adata.obs['C_scANVI'].value_counts())

In [None]:
# Reload the saved scANVI object from disk.
# NOTE(review): this replaces the in-memory `adata`, including the C_scANVI refinements
# made in the cells above; 'dev_scANVI.h5ad' is only written further down this notebook,
# so on a first top-to-bottom run this file must already exist - confirm the intended order.
adata = sc.read_h5ad(os.path.join(RESULTS_FOLDERNAME, 'dev_scANVI.h5ad'))
adata

In [None]:
# Quick look at four genes of interest on the UMAP (figure is not saved).
sc.pl.umap(adata, color=['NES', 'PDGFRA', 'TPPP3', 'AXIN2'], frameon=False,
          legend_fontoutline=True, legend_loc='on data', legend_fontsize=5,
          #save='dev_umap_C_scANVI_cell_type_nolegend.svg'
          )

In [None]:
# Manual vs scANVI labels side by side, with the legend suppressed (legend_loc='none').
sc.pl.umap(adata, color=['cell_type', 'C_scANVI'], frameon=False,
          legend_fontoutline=True, legend_loc='none', legend_fontsize=5,
          save='dev_umap_C_scANVI_cell_type_nolegend.svg'
          )

In [None]:
# Pairwise Wilcoxon test: 'ABI3BP GAS2 Fibroblasts 1' against 'ABI3BP GAS2 Fibroblasts 2'
# as the reference, on the log1p-normalised layer; results go to uns['wilcoxon_abi3'].
sc.tl.rank_genes_groups(adata, groupby='C_scANVI', groups=['ABI3BP GAS2 Fibroblasts 1'], 
                        reference='ABI3BP GAS2 Fibroblasts 2', method='wilcoxon', 
                        key_added='wilcoxon_abi3', use_raw=False, layer='log1p_norm')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key='wilcoxon_abi3')

In [None]:
# Heatmap of the top 5 genes per C_scANVI group from the 'wilcoxon_scanvi' ranking.
# NOTE(review): 'wilcoxon_scanvi' is computed in a later cell of this notebook, so this
# cell presumably relies on the key already being stored in the loaded .h5ad - confirm.
sc.pl.rank_genes_groups_heatmap(adata, n_genes=5, key="wilcoxon_scanvi", groupby="C_scANVI",
                                layer='log1p_norm', show_gene_labels=True, figsize=(15,25),
                                cmap='YlOrRd',save='heatmap.png'
                                )

In [None]:
# Violin plots of the top 15 genes from the pairwise ABI3BP fibroblast comparison.
sc.pl.rank_genes_groups_violin(adata, groups='ABI3BP GAS2 Fibroblasts 1', n_genes=15, key='wilcoxon_abi3')

In [None]:
# Top 20 genes of the pairwise ABI3BP comparison shown across all C_scANVI groups
# (colour scale fixed to [0, 3]; figure is not saved).
sc.pl.rank_genes_groups_dotplot(adata, n_genes=20, groupby='C_scANVI', 
                                use_raw=False, 
                                layer="log1p_norm", 
                                vmax=3, vmin=0, 
                                #cmap='RdBu_r',
                                key='wilcoxon_abi3',
                                #save='topDEGsCscANVI.svg'
                               )

In [None]:
# Compute and draw the hierarchical clustering of the C_scANVI groups.
sc.tl.dendrogram(adata, 'C_scANVI')
sc.pl.dendrogram(adata, groupby='C_scANVI')

In [None]:
# One-vs-rest Wilcoxon ranking for every C_scANVI group on the log1p-normalised layer;
# results are stored in uns['wilcoxon_scanvi'].
sc.tl.rank_genes_groups(adata, groupby='C_scANVI', method='wilcoxon', key_added='wilcoxon_scanvi', 
                        use_raw=False, layer='log1p_norm')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key='wilcoxon_scanvi')

In [None]:
# Flatten the per-group Wilcoxon results into one wide table
# (columns "<group>_<field>") and export it as CSV.
result = adata.uns['wilcoxon_scanvi']
groups = result['names'].dtype.names
de_fields = ['names', 'scores', 'logfoldchanges', 'pvals', 'pvals_adj']
de_columns = {
    f'{grp}_{field}': result[field][grp]
    for grp in groups
    for field in de_fields
}
df = pd.DataFrame(de_columns)
df.to_csv(os.path.join(RESULTS_FOLDERNAME, 'dev_wilcoxon_DGE_scanvi.csv'))
df.head(5)

In [None]:
# Cast the gene index to str and de-duplicate gene names before plotting by gene name.
adata.var.index = adata.var.index.astype(str)
adata.var_names_make_unique()

In [None]:
# Top 5 DE genes per C_scANVI group (unfiltered ranking), colour scale fixed to [0, 3].
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, groupby='C_scANVI',
                                use_raw=False, 
                                layer="log1p_norm", 
                                vmax=3, vmin=0, 
                                #cmap='RdBu_r',
                                key='wilcoxon_scanvi',
                                save='topDEGsCscANVI.svg')

In [None]:
# Filter the 'wilcoxon_scanvi' ranking with the same 0.2 in-group / 0.2 out-group
# fraction thresholds used for the scVI clusters; stored under a separate key.
sc.tl.filter_rank_genes_groups(
    adata,
    min_in_group_fraction=0.2,
    max_out_group_fraction=0.2,
    key="wilcoxon_scanvi",
    key_added="wilcoxon_scanvi_filtered",
)

In [None]:
# Top 5 filtered DE genes per C_scANVI group; standard_scale='var' rescales each gene
# to [0, 1] across groups instead of using a fixed vmin/vmax.
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, groupby='C_scANVI',
                                use_raw=False, 
                                layer="log1p_norm", 
                                standard_scale='var',
                                #cmap='RdBu_r',
                                key='wilcoxon_scanvi_filtered',
                                save='topDEGsCscANVI_filtered.svg')

In [None]:
# (Intentionally empty: this cell used to contain a stray expression `c`, which is never
# defined in this notebook and raised NameError on "Restart & Run All".)

In [None]:
# Map every C_scANVI group to its 10 highest-ranked gene names from the Wilcoxon result.
top_names_dict = {
    group: list(result['names'][group][:10])
    for group in groups
}

In [None]:
# One UMAP panel set per group: its top DE genes followed by the C_scANVI labels.
for ct, top_genes in top_names_dict.items():
    print(f"Cluster {ct.upper()}:")  # heading for this group's figure
    sc.pl.umap(
        adata,
        color=top_genes+['C_scANVI'],
        layer='log1p_norm',
        vmin=0,
        # Cap the colour scale at the 99th percentile so outliers do not wash out the
        # remaining cells; this can be misleading for very lowly expressed genes.
        vmax="p99",
        # Keep stored cell order rather than drawing the highest-expressing cells on top.
        sort_order=False,
        frameon=False,
        cmap="Reds",
        save=f'_dev_cluster{ct}_topDEGs_CscANVI.svg'
    )
    print("\n\n\n")  # vertical spacing between groups

In [None]:
def plot_umaps(anndata, parameters: list, filename: str):
    """
    Draw one UMAP per entry of `parameters`, stacked vertically, and save the figure as SVG.

    This redefinition replaces the earlier `plot_umaps` in this notebook; it uses a
    7-inch-wide figure and scanpy's default point size.

    Parameters
    ----------
    anndata
        Object passed to `sc.pl.umap`.
    parameters
        Keys to colour by (one panel each); must contain at least one entry.
    filename
        File name handed to `savesvg` (written into its default figure folder).

    Raises
    ------
    ValueError
        If `parameters` is empty.
    """
    n_plots = len(parameters)
    if n_plots == 0:
        raise ValueError("parameters must contain at least one key to plot")
    # squeeze=False always returns a 2-D array of Axes, so a single parameter works too
    # (plt.subplots(1, 1) would otherwise return a bare Axes that cannot be indexed).
    fig, axs = plt.subplots(n_plots, 1, figsize=(7, 4*n_plots), squeeze=False)
    for ax, param in zip(axs[:, 0], parameters):
        sc.pl.umap(anndata, color=param, ax=ax, show=False, frameon=False)
        ax.set_title(param)
    fig.tight_layout()
    savesvg(filename, fig)
    plt.show()

In [None]:
# Covariates plus predicted (C_scANVI) and manual (cell_type) labels on the final UMAP.
plot_umaps(adata, ['sampletype', 'sample', 'age', 'libbatch', 'type', 'phase', 'sex', 'C_scANVI', 'cell_type'], 
           filename = 'dev_UMAP_plots_scANVIfull.svg')

In [None]:
# Complete marker reference used for the final annotation dot plot.
# Keys are rendered as group labels on the figure, so the misspelt key
# 'interfascicular maxtrix (general)' is corrected to '... matrix ...' here.
markers_full_dict = {
    'chondrocytes': ['COL2A1', 'SOX9', 'COL9A1', 'ACAN', 'COMP', 'HAPLN1', 'MATN1'],
    'fibroblasts (general)': [
        "COL1A1",
        "COL12A1",
        "SPARC",
        "POSTN",
        "DCN",
        #"BGN"
    ],
    'fibroblasts (type 1)': [
        "SOX5",
        "COL11A1",
        "ABI3BP",
        "GAS2",
        #"COL24A1", # could regulate type I collagen fibrillogenesis, upregulated in human tendinopathy
        "FMOD",
        "TNMD",
        "MKX",
        "KERA",
        "SCX",
    ],
    'fibroblasts (type 2)': [
        "COL3A1",
        "COL6A6",
        "DCLK1",
        "EBF1",
        "TSHZ2",
        "PLAGL1",
        "VCAN",
        "FBN1",
        "NOVA1",
        "NEGR1",
        "NAV3",
        "LUM",
        "SEMA5A",
        "TNXB",
        "PI16",
        "SCN7A",
        "CDH18",
    ],
    'fibroblasts (type 3)': [
        "FGF14",
        "FSTL5",
        "THBS4",
        "BMP5",
        "CDH12",
    ],
    'dividing cells': [
        "MKI67",
        "DIAPH3",
        "CENPK",
        "CENPP",
        "TOP2A",
    ],
    'interfascicular matrix (general)': [
        'COL4A1',
        'COL4A2',
        'HSPG2',  # perlecan
        'NID1',   # nidogen-1
        'LAMB1',
        'LAMC1',
        'ITGA6',
        'ITGB1',
    ],
    'vascular endothelial cells': [
        "CDH5",    # Cadherin 5
        "VWF",     # Von Willebrand factor
        "PECAM1",  # Platelet and endothelial cell adhesion molecule 1
        "CD34",
        "ENG",
        "TEK",
    ],
    'lymphatic endothelial cells': [
        "LYVE1",   # Lymphatic vessel endothelial hyaluronan receptor 1
        "PROX1",   # Prospero homeobox 1
        "FLT4",    # Fms-related tyrosine kinase 4 (VEGFR3)
    ],
    'nervous system cells': [
        "NRXN1",
        "XKR4",
        "SLC35F1",
        "NCAM2",
        "PTPRZ1",
        "FIGN",
        "IL1RAPL2",
        "CDH6",
        "GRID2",
        "SOX10",
    ],
    'smooth myocytes': [
        "ACTA2",   # Alpha-smooth muscle actin
        "MYH11",   # Smooth muscle myosin heavy chain
        "NOTCH3",
        #"DES",     # Desmin
        #"VIM",     # Vimentin
        #"SMTN",    # Smoothelin
        "CALD1",   # Caldesmon
        "PDGFRB",
    ],
    'satellite/myoblast cells': ['PAX7', 'DES', 'CDH15', 'NES', 'DLK1', 'NCAM1', 'MYO18B', 'RYR3', 'MYF5', 'MYOD1', 'MYOG'],
    'skeletal myocytes': ['TNNT3', 'TTN', 'MYH3', 'COL22A1', 'TNNC1'],
    'immune cells': [
        'PTPRC',  # 'FOXP3' disabled
        'CD44',
        'IKZF1',
        'RUNX1',
        'DOCK2',
        'INPP5D',
    ],
    'macrophages': [
        'MRC1', 'F13A1',
        'CSF1R', 'CD163',
        'CD68', 'CD36',
        #'LRMDA', 'ITPR2',
        'LGMN',
    ],
    'lymphoid': [
        'SKAP1',
        'THEMIS',
        'CD247',
        'EPB41',
        'PIP4K2A',
        'PRKCB',
        'CD96',
        'CD38',
    ],
    'unknown progenitors': [
        'KIT',
        'TFRC',
        'IL18R1',
        'MITF',
        'BMP2K',
        'GATA2',
    ],
}

In [None]:
# Full marker reference across C_scANVI groups; swap_axes=True puts genes on the y-axis.
sc.pl.dotplot(adata, markers_full_dict, groupby='C_scANVI', swap_axes=True,
              use_raw=False, layer='log1p_norm', dendrogram=True, var_group_rotation=0,
              #vcenter=45,
              save='_full_annotation_scANVI.svg')

In [None]:
# Checkpoint: write the scANVI-annotated object to 'dev_scANVI.h5ad' (reloaded by later sections).
adata.write(os.path.join(RESULTS_FOLDERNAME, '{}.h5ad'.format('dev_scANVI')))

In [None]:
# Expression of SCX / TNMD / MKX grouped by the 'age' column (log1p-normalised layer).
sc.pl.violin(adata, ['SCX', 'TNMD', 'MKX'], 
             groupby='age', layer='log1p_norm',
             save='_earlytendonmarkers.svg'
             )

In [None]:
# Expression of FMOD / EGR1 / MKI67 grouped by the 'age' column (log1p-normalised layer).
sc.pl.violin(adata, ['FMOD', 'EGR1', 'MKI67'], 
             groupby='age', layer='log1p_norm',
             save='_earlytendonmarkers2.svg'
             )

In [None]:
# Expression of TOP2A / DIAPH3 / CENPF grouped by the 'age' column (log1p-normalised layer).
sc.pl.violin(adata, ['TOP2A', 'DIAPH3', 'CENPF'], 
             groupby='age', layer='log1p_norm',
             save='_earlytendonmarkers3.svg'
             )

# scANVI DGE

In [None]:
# Start the DGE section from the saved scANVI checkpoint.
adata = sc.read_h5ad(os.path.join(RESULTS_FOLDERNAME, 'dev_scANVI.h5ad'))
adata

In [None]:
# Cast gene names to str and de-duplicate them on the reloaded object.
adata.var_names = adata.var_names.astype(str)
adata.var_names_make_unique()
# adata.X = adata.layers['log1p_norm'].copy()

In [None]:
# Drop the 'scaled' layer, which is not used in the rest of this section.
# pop(..., None) keeps the cell re-runnable: a plain `del` raises KeyError when the layer
# has already been removed or was never stored in the loaded file.
adata.layers.pop('scaled', None)

In [None]:
# Register the reloaded AnnData with scVI: raw counts layer, with sample type and
# library batch as categorical covariates.
scvi.model.SCVI.setup_anndata(adata, #adata_hvg
                              layer="counts",
                              categorical_covariate_keys=["sampletype", "libbatch"], # other effects that are taken into account that we are not necessarily interested in
                              #"age", "type", "phase", "sample" - not correcting for to avoid removing variation of interest
                              )

# Restore the trained scVI model. `load` is a classmethod that rebuilds the model from
# the saved files, so constructing a throw-away SCVI(...) instance first is unnecessary
# (its hyperparameters were discarded by `load` anyway).
vae = scvi.model.SCVI.load(
    os.path.join(RESULTS_FOLDERNAME, "FoetalFull_SampleTypeLibbatch_ZinbGeneBatch/"),
    use_gpu=True,
    adata=adata,
)
vae

In [None]:
# Restore the trained scANVI model saved earlier in this notebook and attach it to adata.
lvae = scvi.model.SCANVI.load(os.path.join(RESULTS_FOLDERNAME, "FoetalFull_SampleTypeLibbatch_ZinbGeneBatch_scANVI/"), 
                              use_gpu=True, adata=adata)
lvae

In [None]:
scvi.settings.batch_size

In [None]:
# Model-based normalised expression from scANVI, scaled to a common library size.
# NOTE(review): 10e4 equals 100,000 (1e5), not 1e4 - confirm this is the intended scale.
denoised_norm = lvae.get_normalized_expression(library_size = 10e4)

In [None]:
denoised_norm

In [None]:
# Keep the model-normalised expression as its own layer for plotting.
adata.layers["scvi_normalized"] = denoised_norm

In [None]:
# Top 6 Wilcoxon genes per C_scANVI group, drawn from the scVI-normalised layer.
sc.pl.rank_genes_groups_heatmap(adata, n_genes=6, key="wilcoxon_scanvi", groupby="C_scANVI",
                                layer='scvi_normalized', show_gene_labels=True, figsize=(15,25),
                                cmap='YlOrRd',save='_scANVInorm.png'
                                )

In [None]:
# Model-based differential expression from scANVI, grouped by the C_scANVI labels.
change_per_cluster_de = lvae.differential_expression(groupby="C_scANVI")
change_per_cluster_de

In [None]:
# Table of cell-type sizes plus the name of the matching "<label> vs Rest" comparison.
# NOTE(review): labels are taken from obs["cell_type"], while the differential expression
# above was grouped by "C_scANVI" - confirm that the two label sets are meant to be mixed.
n_cells_per_type = adata.obs["cell_type"].value_counts()
# An optional size filter ((x >= 500) & (x.index != "nan")) is currently disabled;
# only a literal "nan" label is dropped.
n_cells_per_type = n_cells_per_type.loc[lambda counts: counts.index != "nan"]
cell_types = n_cells_per_type.to_frame("n_cells")
cell_types.loc[:, "associated_test"] = [f"{label} vs Rest" for label in cell_types.index.astype(str)]

In [None]:
change_per_cluster_de

In [None]:
# Collect the top 5 genes (by descending lfc_mean) of every comparison listed in
# cell_types.associated_test, and keep the unique gene names in order of appearance.
# NOTE(review): the gene column is read as "Gene" after reset_index(); this presumably
# relies on the DE table's index being named "Gene" - confirm.
marker_genes = (
    change_per_cluster_de.reset_index()
    .loc[lambda x: x.comparison.isin(cell_types.associated_test.values)]
    .groupby("comparison")
    .apply(
        lambda x: x.sort_values("lfc_mean", ascending=False).iloc[:5]
    )  # Select top 5 DE genes per comparison
    .reset_index(drop=True)["Gene"]
    .unique()
)

In [None]:
# Work on a copy restricted to the cell types in `cell_types`, normalise and log-transform
# its X, then show the scANVI marker genes per C_scANVI group.
# NOTE(review): normalize_total/log1p operate on adata_log.X; this assumes X holds raw
# counts - confirm, otherwise the data would be transformed twice.
adata_log = adata[adata.obs.cell_type.isin(cell_types.index.values)].copy()
sc.pp.normalize_total(adata_log)
sc.pp.log1p(adata_log)
sc.pl.dotplot(adata_log, marker_genes, groupby="C_scANVI")

# CellPhoneDB Matrix

In [None]:
#loomdata=sc.AnnData(adata.layers['counts'],obs=adata.obs,var=adata.var)
#loomdata.write_loom(os.path.join(RESULTS_FOLDERNAME, '{}.loom'.format('dev_scenicinput')))

In [None]:
# Free the optional loom export object if the (commented-out) export cell was run.
# `adata` is deliberately kept: the next cell builds the CellPhoneDB matrix from it, and
# `loomdata` does not exist unless the export cell above is re-enabled, so the former
# unconditional `del adata, loomdata` raised NameError.
if 'loomdata' in globals():
    del loomdata

In [None]:
# Build a slim AnnData whose X is the 'scran_normlogcounts' layer, reusing obs and var.
cellphonedata =sc.AnnData(adata.layers['scran_normlogcounts'],obs=adata.obs,var=adata.var)
print(cellphonedata.X.shape)
# Peek at a 9x9 corner of the matrix (rows/columns 1-9; index 0 is skipped)
print(cellphonedata.X[1:10,1:10])

In [None]:
# Index genes by their Ensembl gene id instead of the gene name.
# NOTE(review): assumes 'ensembl_gene_id' is unique and has no missing values - confirm.
cellphonedata.var.index = cellphonedata.var['ensembl_gene_id']
cellphonedata.var

In [None]:
# Transpose to a genes x cells table.
df = cellphonedata.to_df().T
df.head(5)

In [None]:
# Write the genes x cells matrix as a tab-separated text file.
df.to_csv(os.path.join(RESULTS_FOLDERNAME, '{}.txt'.format('dev_cellPhDBinput')), sep='\t')

# Benchmarking

In [None]:
import scib

In [None]:
# Benchmark the integrations with scib's fast metric set.
label_key = "C_scANVI"
batch_key = "sample"

# NOTE(review): the scVI object saved earlier in this notebook is
# 'dev_libbatchsex_scVI.h5ad'; confirm that 'dev_scVI.h5ad' is the intended file.
unintegrated = sc.read_h5ad(os.path.join(RESULTS_FOLDERNAME, 'unintegrated_adata.h5ad'))
adata_scanvi = sc.read_h5ad(os.path.join(RESULTS_FOLDERNAME, 'dev_scANVI.h5ad'))
adata_scvi = sc.read_h5ad(os.path.join(RESULTS_FOLDERNAME, 'dev_scVI.h5ad'))
# Unintegrated baseline restricted to the highly variable genes
adata_hvg = unintegrated[:, unintegrated.var["highly_variable"]].copy()

# metrics_fast(adata, adata_int, batch_key, label_key, **kwargs): `label_key` is a required
# positional argument, so it is passed to every call (it was missing for scVI and HVG).
# NOTE(review): this assumes obs[label_key] exists in `unintegrated`, `adata_scvi` and
# `adata_hvg` as well - confirm, or copy the labels over from `adata_scanvi` first.
metrics_scanvi = scib.metrics.metrics_fast(unintegrated,
                                         adata_scanvi, #scANVI-integrated object
                                         batch_key,
                                         label_key,
                                         embed="X_scANVI"
                                        )
metrics_scvi = scib.metrics.metrics_fast(
    unintegrated, adata_scvi, batch_key, label_key, embed="X_scVI"
)
metrics_hvg = scib.metrics.metrics_fast(unintegrated, adata_hvg, batch_key, label_key)

In [None]:
# Rows selected from the scib output (the fast metrics).
fast_metric_rows = [
    "ASW_label",
    "ASW_label/batch",
    "PCR_batch",
    "isolated_label_silhouette",
    "graph_conn",
    "hvg_overlap",
]

metrics = (
    # one column per method, in the order scVI / scANVI / unintegrated baseline
    pd.concat([metrics_scvi, metrics_scanvi, metrics_hvg], axis="columns")
    .set_axis(["scVI", "scANVI", "Unintegrated"], axis="columns")
    # keep only the fast metrics
    .loc[fast_metric_rows, :]
    # methods as rows, metrics as columns
    .T
    # HVG overlap is not relevant to embedding outputs
    .drop(columns=["hvg_overlap"])
)
metrics