In [None]:
# Import dependencies
%matplotlib inline
import os
import numpy as np
import scanpy as sc
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import anndata as ad

# The scVI cells further down call `scvi.*`. The scVI section is meant to run in the
# `scvi-env` environment while the Harmony section switches environment, so the
# import is optional: it is used when available and skipped otherwise.
try:
    import scvi
except ImportError:
    scvi = None
    print("scvi-tools is not installed in this environment; the scVI cells cannot be run here")

matplotlib.rcParams['font.family'] = 'sans-serif'

# Initialize random seeds (Python's and NumPy's global generators)
import random
random.seed(111)
np.random.seed(111)

# Print date and time:
import datetime
e = datetime.datetime.now()
print ("Current date and time = %s" % e)

# All folder names defined below are relative to this working directory.
# wdir = "/ceph/project/tendonhca/akurjan/analysis/"
wdir = "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/notebooks/"
os.chdir( wdir )

# Input and output locations (relative to the working directory)
EMB_FOLDERNAME = "embryonic ScAndSp/scVI/results/"
FTL_FOLDERNAME = "foetal/results/scVI/"
RESULTS_FOLDERNAME = "developmental/scVI/results/"
FIGURES_FOLDERNAME = "developmental/scVI/figures/"

# Create the two output folders when they are not there yet
for output_dir in (RESULTS_FOLDERNAME, FIGURES_FOLDERNAME):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

# Scanpy's `save=` option writes figures into this folder
sc.settings.figdir = FIGURES_FOLDERNAME

def savesvg(fname: str, fig, folder: str=FIGURES_FOLDERNAME) -> None:
    """
    Write `fig` to `folder`/`fname` in SVG (vector) format.

    `fname` is used as given, so it should already carry the `.svg` extension.
    """
    destination = os.path.join(folder, fname)
    fig.savefig(destination, format='svg')

def plot_umaps(anndata, parameters: list, filename: str, **umap_kwargs):
    """
    Draw one UMAP panel per entry of `parameters`, stacked vertically, save the
    figure as SVG via `savesvg` and show it.

    Parameters
    ----------
    anndata
        Object passed to `sc.pl.umap`.
    parameters
        Keys to colour by; one panel is drawn for each. Must not be empty.
    filename
        File name handed to `savesvg`.
    **umap_kwargs
        Extra keyword arguments forwarded to every `sc.pl.umap` call (for
        example `cmap`). They override the defaults `frameon=False` and `s=2`.

    Raises
    ------
    ValueError
        If `parameters` is empty.
    """
    n_plots = len(parameters)
    if n_plots == 0:
        raise ValueError("plot_umaps needs at least one parameter to colour by")

    # squeeze=False keeps `axs` two-dimensional even for a single panel;
    # without it plt.subplots(1, 1) returns a bare Axes that cannot be indexed.
    fig, axs = plt.subplots(n_plots, 1, figsize=(8, 4*n_plots), squeeze=False)

    # `ax` and `show` are managed here and must not be overridden by the caller.
    plot_options = {'frameon': False, 's': 2, **umap_kwargs}
    plot_options.pop('ax', None)
    plot_options['show'] = False

    for ax, param in zip(axs[:, 0], parameters):
        sc.pl.umap(anndata, color=param, ax=ax, **plot_options)
        ax.set_title(param)
    plt.tight_layout()
    savesvg(filename, fig)
    plt.show()
    
# Set other settings
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print the versions of the packages in use
sc.logging.print_versions()
# Figure defaults: dpi=150 for display, dpi_save=600 for saved files, font size 10
sc.set_figure_params(dpi=150, fontsize=10, dpi_save=600)

# Subsetting and scVI/scANVI integration
Use scvi-env conda environment

In [None]:
# Load the foetal object from FTL_FOLDERNAME, cast gene names to str and de-duplicate them
foetal = sc.read_h5ad(os.path.join(FTL_FOLDERNAME, 'dev_scANVI.h5ad'))
foetal.var_names = foetal.var_names.astype(str)
foetal.var_names_make_unique()

In [None]:
# Keep the untouched labels before merging some of them
foetal.obs['C_scANVI_orig'] = foetal.obs['C_scANVI']

# old label -> merged label; every label not listed here is kept as it is
merged_labels = {
    'ABI3BP GAS2 Fibroblasts 1': 'ABI3BP GAS2 Fibroblasts',
    'ABI3BP GAS2 Fibroblasts 2': 'ABI3BP GAS2 Fibroblasts',
    #'COL6A6 FNDC1 Fibroblasts': 'COL3A1 PI16 Fibroblasts',
    'Smooth Myocytes': 'Mural Cells',
}
foetal.obs['C_scANVI'] = np.array(
    [merged_labels.get(label, label) for label in foetal.obs['C_scANVI']],
    dtype=object,
)

# Cell counts per label after merging
foetal.obs['C_scANVI'].value_counts()

In [None]:
# Preview the first rows of the foetal cell metadata
foetal.obs.head(5)

In [None]:
# Preview the first rows of the foetal gene metadata
foetal.var.head(5)

In [None]:
# Load the embryonic object from EMB_FOLDERNAME, cast gene names to str and de-duplicate them
embryonic = sc.read_h5ad(os.path.join(EMB_FOLDERNAME, 'tendonsubset_scANVI.h5ad'))
embryonic.var_names = embryonic.var_names.astype('str')
embryonic.var_names_make_unique()
# Keep a copy of the labels under the same column name used for the foetal object
embryonic.obs['C_scANVI_orig'] = embryonic.obs['C_scANVI']
embryonic.obs.head(5)

In [None]:
# Preview the first rows of the embryonic gene metadata
embryonic.var.head(5)

In [None]:
# Order the cells by sample stage, using the existing category order of `norm_sample_stage`
stage_order = list(embryonic.obs['norm_sample_stage'].cat.categories)
embryonic.obs['age'] = pd.Categorical(embryonic.obs['norm_sample_stage'], categories=stage_order, ordered=True)
# Index with a plain array and copy: the following cells assign new obs columns,
# which should happen on a real object rather than on a view of the unsorted one.
embryonic = embryonic[embryonic.obs['age'].argsort().to_numpy()].copy()
embryonic.obs.head()

In [None]:
# Map every `norm_sample_stage` label to an age string such as '7.1w'.
# NOTE(review): several keys contain spaces ('pcw7.1 -1', 'pcw9.3-1 ') — presumably they
# mirror the raw labels exactly; any label missing from this dict maps to NaN. TODO confirm.
label_mapping = {
     'pcw7.1 -1': '7.1w',
     'pcw5.6': '5.6w',
     'pcw9.3-1 ': '9.3w',
     'pcw6.1': '6.1w',
     'pcw9.0-1 ': '9.0w',
     'pcw7.2-2 ': '7.2w',
     'pcw9.0-2 ': '9.0w',
     'pcw8.0': '8.0w',
     'pcw6.5': '6.5w',
     'pcw9.3-2': '9.3w',
     'pcw8.4-1 ': '8.4w',
     'pcw7.2-1 ': '7.2w',
     'pcw8.4-2': '8.4w'
}
# Replaces the 'age' column with the mapped strings, stored as a categorical
embryonic.obs['age'] = embryonic.obs['norm_sample_stage'].map(label_mapping)
embryonic.obs['age'] = embryonic.obs['age'].astype('category')
embryonic.obs['age'].dtype

In [None]:
# Same keys as the age-string mapping, but with numeric values.
# Note that the values are floats although the target column is called 'ageint'.
label_mapping = {
     'pcw7.1 -1': 7.1,
     'pcw5.6': 5.6,
     'pcw9.3-1 ': 9.3,
     'pcw6.1': 6.1,
     'pcw9.0-1 ': 9.0,
     'pcw7.2-2 ': 7.2,
     'pcw9.0-2 ': 9.0,
     'pcw8.0': 8.0,
     'pcw6.5': 6.5,
     'pcw9.3-2': 9.3,
     'pcw8.4-1 ': 8.4,
     'pcw7.2-1 ': 7.2,
     'pcw8.4-2': 8.4
}

# Labels missing from the dict map to NaN
embryonic.obs['ageint'] = embryonic.obs['norm_sample_stage'].map(label_mapping)
embryonic.obs['ageint'].value_counts()

In [None]:
# Copy 'samplename' into 'sampletype' ('sampletype' is used later as a covariate and plot key)
embryonic.obs['sampletype'] = embryonic.obs['samplename']

In [None]:
# Stack foetal and embryonic cells; join='outer' keeps the union of genes.
# index_unique=None leaves the cell names unchanged.
# NOTE(review): names shared by both datasets would therefore collide — confirm they are unique.
adata = ad.concat((foetal, embryonic), join='outer', index_unique=None)
adata

In [None]:
# Drop all embeddings, the three derived layers and .raw; they are recomputed below on the combined object.
# Raises KeyError if one of the layers is absent.
del adata.obsm, adata.layers['log1p_norm'], adata.layers['scaled'], adata.layers['normcounts'], adata.raw

In [None]:
# Put a copy of the 'counts' layer back into X
adata.X = adata.layers['counts'].copy()

In [None]:
# De-duplicate gene names on the combined object
adata.var_names_make_unique()

In [None]:
#keep = ['ABI3BP GAS2 Fibroblasts',
#        'COL3A1 PI16 Fibroblasts',
#        'FGF14 THBS4 Fibroblasts', 
#        #'NEGR1 SCN7A Fibroblasts'
#       ]
# Use the boolean mask to subset the AnnData object
#adata = adata[adata.obs['C_scANVI'].isin(keep)].copy()
#adata

In [None]:
# The two input objects are no longer referenced after the concatenation
del embryonic, foetal

# Data Preparation 

In [None]:
# Cast 'age' to categorical and list the categories present in the combined object
adata.obs['age'] = adata.obs['age'].astype("category")
list(adata.obs['age'].cat.categories)

In [None]:
# sort by categorical age
AGE_ORDER = ['6.1w', '6.5w', '7.2w', '8.4w', '9.0w', '9.3w', '12w', '17w', '20w']
adata.obs['age'] = pd.Categorical(adata.obs['age'], categories=AGE_ORDER, ordered=True)

# Ages that are not listed in AGE_ORDER become NaN in the categorical above.
n_unlisted = int(adata.obs['age'].isna().sum())
if n_unlisted:
    print(f"WARNING: {n_unlisted} cells have an age outside AGE_ORDER (now NaN); they are sorted last")

# Sort on the category codes instead of Series.argsort(): with NaN present,
# Series.argsort() returns -1 for those rows, which is not a valid ordering
# to index with. NaN rows (code -1) are moved to the end; the stable sort
# keeps the existing order within each age.
age_codes = adata.obs['age'].cat.codes.to_numpy()
sort_keys = np.where(age_codes < 0, len(AGE_ORDER), age_codes)
# .copy() gives a real object instead of a view before the in-place filtering below
adata = adata[np.argsort(sort_keys, kind='stable')].copy()
adata.obs.head()

In [None]:
# Cell counts per label in the combined object
adata.obs['C_scANVI'].value_counts()

In [None]:
# Filter genes first (min_counts=5), then cells (min_genes=200)
sc.pp.filter_genes(adata, min_counts=5, inplace=True)
sc.pp.filter_cells(adata, min_genes=200)

In [None]:
# Normalise without touching X (inplace=False returns the result as a dict),
# then store the log1p of the normalised matrix as its own layer
scaled_counts = sc.pp.normalize_total(adata, target_sum=None, inplace=False)
adata.layers["log1p_norm"] = sc.pp.log1p(scaled_counts["X"], copy=True)
# Quick look at the top-left corner of the new layer
print(adata.layers["log1p_norm"][0:5, 0:5])

In [None]:
# From here on X holds the log-normalised values (raw counts stay in the 'counts' layer)
adata.X = adata.layers['log1p_norm'].copy()

In [None]:
# Flag the 3000 most variable genes (computed on the log-normalised X) and plot the selection
sc.pp.highly_variable_genes(adata, n_top_genes=3000, flavor="cell_ranger")
sc.pl.highly_variable_genes(adata)

In [None]:
# Scale X in place (no max_value clipping is requested) and look at the top-left corner
sc.pp.scale(adata)
print(adata.X[0:5,0:5])

In [None]:
# Names of the genes flagged as highly variable
hvgenes = adata.var.index[adata.var.highly_variable].tolist()
# Keep the scaled matrix as a layer
adata.layers['scaled'] = adata.X.copy()
# PCA (50 components) on the scaled matrix restricted to the highly variable genes
adata.obsm["X_pca"] = sc.pp.pca(adata[:,hvgenes].X, n_comps=50, svd_solver="arpack")

In [None]:
# Cells without a 'seq_protocol' value show up as the string "nan" after the str cast;
# give them the default protocol. The replacement is done on plain strings *before*
# casting to category, because replacing values of a categorical column is deprecated in pandas.
adata.obs['seq_protocol'] = (
    adata.obs['seq_protocol']
    .astype('str')
    .replace("nan", "Illumina-HTP NovaSeq 6000 Paired end sequencing")
    .astype('category')
)

In [None]:
# Cells without a 'kit' value show up as the string "nan" after the str cast;
# give them the default kit. The replacement is done on plain strings *before*
# casting to category, because replacing values of a categorical column is deprecated in pandas.
adata.obs['kit'] = (
    adata.obs['kit']
    .astype("str")
    .replace("nan", "3' v3.1")
    .astype("category")
)

In [None]:
# Cells without a 'libbatch' value show up as the string "nan" after the str cast;
# give them the default batch. The replacement is done on plain strings *before*
# casting to category, because replacing values of a categorical column is deprecated in pandas.
adata.obs['libbatch'] = (
    adata.obs['libbatch']
    .astype("str")
    .replace("nan", "2021")
    .astype("category")
)

In [None]:
# Neighbour graph on the PCA embedding (correlation metric), then UMAP — before any integration
sc.pp.neighbors(adata, use_rep="X_pca", metric='correlation')
sc.tl.umap(adata)

In [None]:
# One UMAP panel per metadata column, saved as 'unintegrated_fulldev.svg' in the figures folder
plot_umaps(adata, ['sampletype', 'phase', 'hospital_id', 'sample', 'libbatch',
                         'sex', 'age', 'ageint','kit', 'seq_protocol', 'C_scANVI'],
           filename='unintegrated_fulldev.svg'
          )

# scVI Integration

In [None]:
# Register the object with scVI: raw counts come from the 'counts' layer, labels from 'C_scANVI'.
# Requires `scvi` to be imported in the running kernel.
scvi.model.SCVI.setup_anndata(adata, #adata_hvg
                              layer="counts", labels_key='C_scANVI',
                              categorical_covariate_keys=["sampletype", "libbatch", "kit", "seq_protocol", "sex"], # effects that are taken into account that we are not necessarily interested in
                              #"age", "type", "phase", - not correcting for to avoid removing variation of interest
                              )

In [None]:
# Build the scVI model with the hyper-parameters given here, then print the registered setup
vae = scvi.model.SCVI(adata, n_hidden = 256, n_latent=14, n_layers=3, 
                      dropout_rate=0.1, dispersion="gene-batch", gene_likelihood='nb')
vae.view_anndata_setup(adata)

In [None]:
%%time
# Train with a 90/10 train/validation split and early stopping on the validation ELBO
# (checked every epoch, patience of 20 epochs).
# The device is selected with `accelerator` only: `use_gpu` is the deprecated way of
# doing the same thing and should not be passed together with `accelerator`.
vae.train(max_epochs = 400, train_size = 0.9, validation_size = 0.1,
          accelerator='gpu',
          check_val_every_n_epoch=1,
          early_stopping=True,
          early_stopping_patience=20,
          early_stopping_monitor="elbo_validation",
         )

In [None]:
# Training and validation ELBO side by side. join() returns a new frame, so the
# DataFrame stored in vae.history is not modified (adding a column to it directly
# would change the model's history and make this cell non-idempotent).
train_test_results = vae.history["elbo_train"].join(vae.history["elbo_validation"])
train_test_results.iloc[10:].plot(logy=True)  # exclude first 10 epochs
plt.show()

In [None]:
# ELBO of the trained model evaluated on the full object
vae.get_elbo(adata)

In [None]:
# Store the scVI latent representation next to the PCA embedding
adata.obsm["X_scVI"] = vae.get_latent_representation()
# use scVI latent space for UMAP generation
sc.pp.neighbors(adata, use_rep="X_scVI", metric='correlation')
sc.tl.umap(adata)

In [None]:
# Same panels as for the unintegrated data, now on the scVI-based UMAP.
# NOTE(review): `cmap` is passed as a keyword; this raises TypeError unless
# plot_umaps accepts and forwards extra keyword arguments.
plot_umaps(adata, ['sampletype', 'phase', 'hospital_id', 'sample', 'libbatch',
                         'sex', 'age', 'ageint','kit', 'seq_protocol', 'C_scANVI'],
           filename='scVIintegrated_fulldev.svg', cmap='gnuplot'
          )

In [None]:
# Save the trained model under the results folder; overwrite=True replaces an existing save
vae.save(os.path.join(RESULTS_FOLDERNAME, "DevelopmentalFull_SampletypeLibbatchKitSeqSex/"), overwrite=True)

In [None]:
# UMAP coloured by label and by kit, with the legend drawn on the data
sc.pl.umap(adata, color=[#"leiden_04scvi", "leiden_06scvi", 
    "C_scANVI", "kit"], legend_loc="on data", frameon=False, legend_fontsize=5)

In [None]:
# Checkpoint: this file is read back at the start of the Harmony section
adata.write(os.path.join(RESULTS_FOLDERNAME, 'full_dev_scVI.h5ad'))

# Harmony (Change env!)
Use seacells_env environment

In [None]:
import harmony
import harmony.core
import harmony.plot

In [None]:
# Reload the scVI checkpoint written at the end of the previous section
adata = sc.read_h5ad(os.path.join(RESULTS_FOLDERNAME, 'full_dev_scVI.h5ad'))
adata.var_names_make_unique()
adata.obs

In [None]:
# Cell counts per age
adata.obs['age'].value_counts()

In [None]:
# Per-cell timepoint labels as plain strings
# NOTE(review): presumably no cell has a missing age here; a missing value would become the string 'nan' — confirm.
tp = adata.obs['age'].astype(str)
# Ages in chronological order; each age is connected to the next one.
timepoint_order = ['6.1w', '6.5w', '7.2w', '8.4w', '9.0w', '9.3w', '12w', '17w', '20w']

# One row per pair of consecutive timepoints, columns 0 (earlier) and 1 (later).
# Built in one go instead of growing the DataFrame row by row.
timepoint_connections = pd.DataFrame(
    list(zip(timepoint_order[:-1], timepoint_order[1:])),
    columns=[0, 1],
)
timepoint_connections

In [None]:
# scVI latent space as a DataFrame indexed by cell name (input for Harmony)
data_df=pd.DataFrame(adata.obsm["X_scVI"],index=adata.obs_names)
data_df

Specifying the number of neighbors defines the level of granularity at which cell relationships are identified. A smaller value such as 10 gives a more local, fine-grained view of cell-to-cell connections, whereas a larger value captures more distant or global similarities.

My data is fairly sparse, so the k value needs to be larger to capture meaningful similarities.

In [None]:
# generate the augmented affinity matrix (aug_aff) and the non-augmented matrix (aff)
# Inputs: the latent-space DataFrame, the per-cell timepoints and the timepoint connections
aug_aff, aff = harmony.core.augmented_affinity_matrix(data_df, tp, timepoint_connections, 
                                                      n_neighbors=20, pc_components=None)

# add the augmented affinity matrix to obsm
adata.obsm['X_aug_aff'] = aug_aff

In [None]:
# computes force directed layout coordinates from the augmented aff matrix
# (the cell names are passed as the second argument)
layout = harmony.plot.force_directed_layout(aug_aff, data_df.index)

In [None]:
# Plot the layout by timepoint and save the current figure as both SVG and PNG
harmony.plot.plot_timepoints(layout, tp)
plt.savefig(os.path.join(FIGURES_FOLDERNAME, 'harmony_plot.svg'), format='svg')
plt.savefig(os.path.join(FIGURES_FOLDERNAME, 'harmony_plot.png'), format='png')

In [None]:
# add layout coordinates to the anndata object
# Stored twice: as returned by Harmony, and as a plain array (.values) under the key used for plotting
adata.obsm['X_force_directed_layout'] = layout
adata.obsm['force_directed_array'] = adata.obsm['X_force_directed_layout'].values
sc.pl.embedding(adata, basis='force_directed_array', color='age',
               frameon=False,
               save='_dev_harmony_byage.png')

In [None]:
# Same layout coloured by the numeric age column (figure not saved)
sc.pl.embedding(adata, basis='force_directed_array', color='ageint',
               frameon=False,
               #save='_dev_harmony_byage.png'
               )

In [None]:
# Keep the current UMAP under a second key; sc.tl.umap is run again further down
adata.obsm['X_umap_orig'] = adata.obsm['X_umap']

In [None]:
# Checkpoint after Harmony (the same file is written again after the Palantir embedding steps)
adata.write(os.path.join(RESULTS_FOLDERNAME, 'developmental_harmony.h5ad'))

## Palantir trajectory detection
Palantir is an algorithm developed by the Pe'er lab to align cells along differentiation trajectories. Palantir models differentiation as a stochastic process where stem cells differentiate to terminally differentiated cells by a series of steps through a low dimensional phenotypic manifold. Palantir effectively captures the continuity in cell states and the stochasticity in cell fate determination.

The first step in Palantir trajectory detection is to project data onto diffusion maps. Harmony augmented affinity matrix is used as the input for identifying diffusion maps. Please see https://github.com/dpeerlab/Palantir for more details on Palantir

In [None]:
import palantir

# Diffusion maps computed from the Harmony augmented affinity matrix stored in obsm
dm_res = palantir.utils.run_diffusion_maps(adata.obsm["X_aug_aff"], knn=40)
#dm_res = palantir.utils.run_diffusion_maps(pd.DataFrame(adata.obsm["X_aug_aff"].toarray(),index=adata.obs_names))
#dm_res = palantir.utils.run_diffusion_maps(pd.DataFrame(adata.obsm["X_scANVI"],index=adata.obs_names),knn=100)

#adata.obsp["T"]=dm_res["T"]
# Keep the eigenvectors as a plain array
adata.obsm["X_diff"]=dm_res['EigenVectors'].values

In [None]:
# Neighbour graph on the diffusion components, then PAGA over the C_scANVI groups
sc.pp.neighbors(adata,n_neighbors=40,use_rep="X_diff")
sc.tl.paga(adata,"C_scANVI")
sc.pl.paga(adata,threshold=.02, node_size_scale=2,
           fontsize=6, fontoutline=1, frameon=False, 
           save='_developmentaldiffPAGA.png'
          )

In [None]:
# Recompute the UMAP, initialised from the PAGA layout (replaces obsm['X_umap'])
sc.tl.umap(adata, spread=.6, init_pos='paga')
sc.pl.umap(adata, color='C_scANVI', frameon=False)

In [None]:
# Same UMAP coloured by categorical and numeric age
sc.pl.umap(adata, color=['age', 'ageint'], frameon=False)

In [None]:
# Multiscale space from the diffusion map result (n_eigs=9), also stored as a plain array in obsm
ms_data = palantir.utils.determine_multiscale_space(dm_res, n_eigs=9)
adata.obsm["X_msdiff"]=ms_data.values
ms_data

In [None]:
%%time
# t-SNE on the multiscale space; the learning rate is set to (number of cells) / 12
sc.tl.tsne(adata,use_rep="X_msdiff",
           perplexity=50,
           learning_rate=adata.shape[0]/12
          )

In [None]:
# t-SNE coloured by label, saved through scanpy's `save=` option
sc.pl.tsne(adata,color="C_scANVI", 
           frameon=False,
           save='_developmental.svg'
          )
#plt.savefig(os.path.join(FIGURES_FOLDERNAME, '20pcw_tendonfibro_msdiff_TSNE.svg'), format='svg')
#plt.savefig(os.path.join(FIGURES_FOLDERNAME, '20pcw_tendonfibro_msdiff_TSNEpng.png'), format='png')
# Image(filename=os.path.join(FIGURES_FOLDERNAME,'msdiff_TSNEpng.png'))

In [None]:
# Same t-SNE with the legend drawn on the data (not saved)
sc.pl.tsne(adata,color="C_scANVI", 
           frameon=False, legend_loc='on data', legend_fontsize=4,
           #save='_developmental.svg'
          )

In [None]:
# Neighbour graph on the multiscale space, then a force-directed graph layout started from the t-SNE coordinates
sc.pp.neighbors(adata, n_neighbors=30, use_rep="X_msdiff")
sc.tl.draw_graph(adata, init_pos="X_tsne", 
                 maxiter=500)

In [None]:
# Graph layout coloured by categorical and numeric age, saved as SVG
sc.pl.draw_graph(adata, color=["age", "ageint"], frameon=False,
                 save='_developmental_age_msdifffa.svg'
                )

In [None]:
# Graph layout coloured by label, saved as SVG
sc.pl.draw_graph(adata,
                 color="C_scANVI", frameon=False,
                 save='_developmental_msdifffa.svg'
                )
#plt.savefig(os.path.join(FIGURES_FOLDERNAME, '20pcw_tendonfibro_msdiff_graph.png'), format='png')
# Image(filename=os.path.join(FIGURES_FOLDERNAME,'msdiff_graph.png'))

In [None]:
# Same graph layout with the legend drawn on the data (not saved)
sc.pl.draw_graph(adata,
                 color="C_scANVI", frameon=False,
                 legend_loc='on data', legend_fontsize=6,
                 #save='_developmental_msdifffa.svg'
                )

In [None]:
# Overwrites the Harmony checkpoint, now including the diffusion, t-SNE and graph embeddings
adata.write(os.path.join(RESULTS_FOLDERNAME, 'developmental_harmony.h5ad'))

# Palantir Pseudotime

In [None]:
# Reload the Harmony/Palantir-embedding checkpoint
adata = sc.read_h5ad(os.path.join(RESULTS_FOLDERNAME, 'developmental_harmony.h5ad'))

In [None]:
# Rebuild the multiscale-space DataFrame (indexed by cell name) from the array stored in obsm
ms_data = pd.DataFrame(adata.obsm['X_msdiff'], index=adata.obs_names)
ms_data

In [None]:
# Components 1 and 2 of the multiscale space, coloured by the listed genes and by label
sc.pl.embedding(adata, basis='msdiff', color=['MKI67', 'DIAPH2', 'TOP2A', 'CENPK', 'CENPP', 'C_scANVI'], 
                components=["1, 2"], ncols=3)

In [None]:
# Components 1 and 2 of the multiscale space, coloured by a second gene list and by numeric age
sc.pl.embedding(adata, basis='msdiff', color=['FGF14', 'SCX', 'MKX', 'KERA', 'TNMD', 'FMOD', 'EGR1', 'ABI3BP', 'ageint'], 
                components=["1, 2"], ncols=3)

In [None]:
# De-duplicate gene names and cell names; the start cell is later looked up by cell name
adata.var_names_make_unique()
adata.obs_names_make_unique()

In [None]:
# Put the log-normalised layer back into X
adata.X = adata.layers['log1p_norm'].copy()

In [None]:
# Freeze the current object (X = log-normalised values) as .raw
adata.raw = adata.copy()

In [None]:
# Quick look at the top-left corner of .raw
print(adata.raw.X[0:10, 0:10])

In [None]:
import numpy as np

# Find the cell with the highest combined expression of the marker genes below;
# it is used as the start cell for Palantir.
# The per-cell values of the three genes are summed and the cell with the largest
# sum is selected. Adding together the positional indices returned by a per-gene
# argmin/argmax would not identify any particular cell and can even point past
# the last cell.
# NOTE(review): a sum does not guarantee that all three genes are expressed in the
# chosen cell — confirm that this is the intended selection criterion.
start_marker_genes = ['FGF14', 'SCX', 'TNMD']
marker_expression = adata[:, start_marker_genes].X
# .sum(axis=1) works for dense and sparse X; asarray/ravel flattens either result to 1-D
combined_expression = np.asarray(marker_expression.sum(axis=1)).ravel()
max_expression_cell_index = int(np.argmax(combined_expression))
cell_id_with_highest_expression = adata.obs_names[max_expression_cell_index]
print("Cell ID with highest gene expression:", cell_id_with_highest_expression)

In [None]:
#gene_list = ['SCX', 'FGF14']
#'CENPK', 'CENPP', 'DIAPH3'
#adding 'NES', 'ITGB1' and markers of senescence 'CDKN1B', 'CDKN1A', 'CD34', 'CDK4'
#sc.tl.score_genes(adata, gene_list, score_name='tnp_score', use_raw=True)
#cell_id_with_highest_expression = adata.obs['tnp_score'].idxmax()
#print("Cell ID with highest gene expression:", cell_id_with_highest_expression)

In [None]:
# Highlight the start cell: every cell gets 0.2, the start cell gets 1
adata.obs['startcell'] = 0.2
#ind = adata.obs.index[adata.obsm['X_msdiff'][:,2].argmax()]
ind = cell_id_with_highest_expression
adata.obs.loc[ind, 'startcell'] = 1
# The original labels are stored as 'C_scANVI_orig' by this notebook; use
# 'C_scANVI_original' only if the loaded object actually carries that column.
orig_label_key = 'C_scANVI_original' if 'C_scANVI_original' in adata.obs else 'C_scANVI_orig'
print(adata.obs[adata.obs.index == ind][[orig_label_key, 'phase', 'age']])
# Use sc.pl.embedding to visualize with color_column
sc.pl.embedding(adata, basis='msdiff', 
                color='startcell', 
                components=["1,2"],
                cmap='Greys', vmin=0, s=50)

In [None]:
# Start cell, cell-cycle phase and original labels on the force-directed graph layout.
# The original labels are stored as 'C_scANVI_orig' by this notebook; use
# 'C_scANVI_original' only if the loaded object actually carries that column.
orig_label_key = 'C_scANVI_original' if 'C_scANVI_original' in adata.obs else 'C_scANVI_orig'
sc.pl.embedding(adata, basis='draw_graph_fa', color=['startcell', 'phase', orig_label_key],
                cmap='Greys', vmin=0, s=20)

In [None]:
# Start cell, cell-cycle phase and original labels on the UMAP.
# The original labels are stored as 'C_scANVI_orig' by this notebook; use
# 'C_scANVI_original' only if the loaded object actually carries that column.
orig_label_key = 'C_scANVI_original' if 'C_scANVI_original' in adata.obs else 'C_scANVI_orig'
sc.pl.embedding(adata, basis='umap', color=['startcell', 'phase', orig_label_key],
                cmap='Greys', vmin=0, s=20)

In [None]:
# Recompute the graph layout, this time started from the first two principal components
sc.pp.neighbors(adata,n_neighbors=30,use_rep="X_msdiff")
adata.obsm["X_pca2d"]=adata.obsm["X_pca"][:,:2]
sc.tl.draw_graph(adata,init_pos='X_pca2d')

In [None]:
# SCX expression on the new graph layout
sc.pl.draw_graph(adata,color="SCX",color_map="RdBu_r")

In [None]:
# Start cell, cell-cycle phase and original labels on the recomputed graph layout.
# The original labels are stored as 'C_scANVI_orig' by this notebook; use
# 'C_scANVI_original' only if the loaded object actually carries that column.
orig_label_key = 'C_scANVI_original' if 'C_scANVI_original' in adata.obs else 'C_scANVI_orig'
sc.pl.embedding(adata, basis='draw_graph_fa', color=['startcell', 'phase', orig_label_key],
                cmap='Greys', vmin=0, s=20)

In [None]:
# Run Palantir on the multiscale space, starting from the selected cell `ind`
pr_res = palantir.core.run_palantir(data=ms_data, early_cell=ind, 
                           knn=100, use_early_cell_as_start=True, 
                           n_jobs=10, num_waypoints=2000)

In [None]:
# Palantir results drawn on the UMAP embedding
palantir.plot.plot_palantir_results(adata, pr_res, s=3, embedding_basis='X_umap')
plt.show()

In [None]:
# Palantir results drawn on the force-directed graph layout
palantir.plot.plot_palantir_results(adata, pr_res, s=3, embedding_basis='X_draw_graph_fa')
plt.show()

In [None]:
# Copy the Palantir outputs onto the object: per-cell values into obs,
# waypoints into uns, branch probabilities into obsm
adata.obs['palantir_pseudotime'] = pr_res.pseudotime
adata.obs['palantir_entropy'] = pr_res.entropy
adata.uns['palantir_waypoints'] = pr_res.waypoints.values
adata.obsm['palantir_fate_probabilities'] = pr_res.branch_probs

In [None]:
# Summary of the object after adding the Palantir results
adata

In [None]:
# Remove helper columns before saving. errors='ignore' keeps the cell re-runnable:
# a column that is already gone (or never existed in the loaded file) is skipped
# instead of raising KeyError.
adata.obs = adata.obs.drop(columns=['outlier', 'startcell', 'mt_outlier'], errors='ignore')

In [None]:
# Checkpoint with the Palantir results
adata.write(os.path.join(RESULTS_FOLDERNAME, 'dev_palantir.h5ad'))

In [None]:
# NOTE(review): this replaces `adata` with 'allages_tendonfibro2.h5ad', a file that is not
# written anywhere in this notebook (the checkpoint written just above is 'dev_palantir.h5ad').
# Presumably it comes from another analysis — confirm which object is meant here.
adata=sc.read_h5ad(os.path.join(RESULTS_FOLDERNAME, 'allages_tendonfibro2.h5ad'))

In [None]:
# Leiden clustering at resolution 0.2, stored as 'fibro_leiden02' and shown on the UMAP.
# Uses the neighbour graph already stored in the loaded object.
sc.tl.leiden(adata, resolution=0.2, key_added='fibro_leiden02')
sc.pl.umap(adata, color=['fibro_leiden02'], frameon=False)

In [None]:
# Inspect the pairwise matrices stored on the object
adata.obsp

In [None]:
# Diffusion maps from the augmented affinity matrix of the loaded object, here with knn=80
dm_res = palantir.utils.run_diffusion_maps(adata.obsm["X_aug_aff"], knn=80)
#dm_res = palantir.utils.run_diffusion_maps(pd.DataFrame(adata.obsm["X_scANVI"],index=adata.obs_names),knn=100)

In [None]:
# Check what X currently holds before imputation
adata.X

In [None]:
# Inspect the diffusion map result
dm_res

In [None]:
# MAGIC imputation based on the diffusion map result; the return value is kept in imp_df
imp_df = palantir.utils.run_magic_imputation(adata, dm_res=dm_res)

In [None]:
# Select the cells belonging to each branch (eps=0); the return value is kept in masks
# NOTE(review): presumably relies on Palantir results already stored in the loaded object — confirm.
masks = palantir.presults.select_branch_cells(adata, eps=0)

In [None]:
# Visualise the branch selection
palantir.plot.plot_branch_selection(adata)
plt.show()

In [None]:
# Distribution of Palantir pseudotime per label
adata.var_names_make_unique()
sc.pl.violin(
    adata,
    keys=["palantir_pseudotime"],
    groupby="C_scANVI",
    rotation=-90,
)

In [None]:
#del adata.varm
# Written as 'allages_tendonfibro.h5ad'; note that the object was loaded from 'allages_tendonfibro2.h5ad'
adata.write(os.path.join(RESULTS_FOLDERNAME, 'allages_tendonfibro.h5ad'))