In [None]:
# Import dependencies
%matplotlib inline
import os
import scanpy as sc
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import anndata as ad

import harmony
import harmony.core
import harmony.plot
import palantir

matplotlib.rcParams['font.family'] = 'sans-serif'

# Initialize random seeds.
# Python's `random` module and NumPy's global RNG are independent generators,
# so both are seeded here; seeding only `random` leaves any step that draws
# from `np.random` unseeded.
import random
import numpy as np
SEED = 111
random.seed(SEED)
np.random.seed(SEED)

# Print date and time:
import datetime
e = datetime.datetime.now()
print("Current date and time = %s" % e)

# set a working directory
# NOTE(review): this is an absolute, machine-specific path; os.chdir raises
# if it does not exist, so the notebook only runs where this mount is present.
wdir = "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/notebooks"
os.chdir( wdir )

# folder structures (relative to the working directory set above)
SCVI_FOLDERNAME = "foetal/results/scVI/"
RESULTS_FOLDERNAME = "foetal/results/Harmony"
FIGURES_FOLDERNAME = "foetal/figures/Harmony"

# exist_ok=True makes directory creation idempotent without a separate
# existence check (and without the check-then-create race).
os.makedirs(RESULTS_FOLDERNAME, exist_ok=True)
os.makedirs(FIGURES_FOLDERNAME, exist_ok=True)

# Set folder for saving figures into (used by scanpy's `save=` argument)
sc.settings.figdir = FIGURES_FOLDERNAME

# Set other settings
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.set_figure_params(dpi=150, fontsize=10, dpi_save=600)

In [None]:
def savesvg(fname: str, fig, folder: str=FIGURES_FOLDERNAME) -> None:
    """
    Write `fig` to `folder`/`fname` as an SVG (vector) image.

    Parameters
    ----------
    fname
        File name of the output image, joined onto `folder`.
    fig
        Object exposing a `savefig(path, format=...)` method.
    folder
        Destination directory; defaults to the notebook's figures folder.
    """
    destination = os.path.join(folder, fname)
    fig.savefig(destination, format='svg')

### Harmony augmented affinity matrix
The following metadata information is necessary for Harmony:

- Timepoint at which each cell was measured
- Connections between timepoints for computation of mutually nearest neighbors

The timepoint at which each cell was measured for this dataset can be determined by string matching, since the information has been added to the barcode name.



In [None]:
adata = sc.read_h5ad(os.path.join(SCVI_FOLDERNAME, 'dev_scANVI.h5ad'))
adata

In [None]:
adata.X=adata.layers['scaled'].copy()

In [None]:
# Compute a 50-component PCA with the ARPACK solver
sc.pp.pca(adata, n_comps=50, svd_solver="arpack")
# Plot the loadings of the first eight principal components
sc.pl.pca_loadings(adata, components='1,2,3,4,5,6,7,8')

In [None]:
sc.pl.pca(adata, components=['1,2', '3,4', '5,6', '7,8'], ncols=2, color='phase')

In [None]:
adata.obs['age'].value_counts()

In [None]:
adata.obs

# DATA TEMPORAL REORGANISATION

For this to work, the AnnData object needs to be reorganised in a chronologically meaningful order, so that the combined count matrices and all associated annotations start with data from the youngest sample (12 pcw, labelled `12w`) and finish with data from the oldest sample (`20w`).

In [None]:
adata.obs['sample'].value_counts()

In [None]:
adata.obs['age'] = adata.obs['age'].astype('category')
adata.obs['age'].values

In [None]:
adata.var_names_make_unique()

In [None]:
# Reorder the cells chronologically: split by sample, order the samples by
# age, then concatenate them back into a single AnnData object.
AGE_ORDER = ['12w', '17w', '20w']
n_cells_before = adata.n_obs

# Split the data by sample into a dictionary of anndata objects
adata_dict = {}
for sample in adata.obs['sample'].unique():
    adata_dict[sample] = adata[adata.obs['sample'] == sample].copy()

# Sort the dictionary by the age of the sample.
# The age is read from the first cell of each sample (the observed value)
# rather than from `cat.categories[0]`, which is only correct if unused
# categories were pruned when the sample was subset.
sorted_adata_dict = {}
for age in AGE_ORDER:
    sorted_adata_dict.update({k: v for k, v in adata_dict.items() if v.obs['age'].iloc[0] == age})

del adata_dict

# A dedicated loop variable is used so the global `adata` is not shadowed.
for key, sample_adata in sorted_adata_dict.items():
    sample_adata.var_names_make_unique()
    sample_adata.obs_names_make_unique()
    sample_adata.raw = sample_adata

sorted_adata_list = list(sorted_adata_dict.values())

adata = ad.concat(sorted_adata_list, join='outer', index_unique=None)

# Samples whose age is not listed in AGE_ORDER never enter sorted_adata_dict,
# so they would be dropped silently; fail loudly instead.
assert adata.n_obs == n_cells_before, (
    "Cells were lost during temporal reorganisation "
    "(%d before, %d after): check that every sample's age is in AGE_ORDER"
    % (n_cells_before, adata.n_obs)
)
adata

In [None]:
del sorted_adata_list

In [None]:
adata.obs

# CALCULATING AUGMENTED AFFINITY MATRIX

In [None]:
# specify the timepoint information for Harmony: one string label per cell,
# taken from the 'age' column of adata.obs
tp = adata.obs['age'].astype(str)

# specify the timepoint connections for the augmented affinity matrix:
# each row links one timepoint to the next (12w -> 17w, 17w -> 20w)
timepoint_connections = pd.DataFrame({'from_timepoint': ['12w', '17w'], 'to_timepoint': ['17w', '20w']})

# the scANVI representation stored in adata.obsm, indexed by cell barcode, is the Harmony input
data_df=pd.DataFrame(adata.obsm["X_scANVI"],index=adata.obs_names)

In [None]:
data_df

In the `harmony.core.augmented_affinity_matrix` function, `aff` is a regular (non-augmented) affinity matrix computed from the nearest-neighbor graph of the input data. It can be useful for downstream analysis or visualization, as it captures the intrinsic relationships between the cells in the original space.

The `aug_aff` output, on the other hand, is the augmented affinity matrix that incorporates the temporal information between the different timepoints. This matrix can be used in downstream analysis methods that take into account the developmental trajectory of the cells, such as RNA velocity or Palantir.

In [None]:
# generate the augmented affinity matrix (aug_aff) and the non-augmented matrix (aff)
# from the scANVI representation, using 20 neighbours; pc_components=None is passed
# because data_df is already a low-dimensional representation (presumably this skips
# Harmony's own PCA step - TODO confirm against the Harmony documentation)
aug_aff, aff = harmony.core.augmented_affinity_matrix(data_df, tp, timepoint_connections, n_neighbors=20, pc_components=None)

# add the augmented affinity matrix to obsm
# NOTE(review): aug_aff appears to be a cell-by-cell matrix (it is later passed to
# force_directed_layout together with the cell index); pairwise matrices conventionally
# live in adata.obsp. It is kept in obsm here because a later cell passes it to
# sc.pp.neighbors via use_rep="X_aug_aff" - confirm this is intended.
adata.obsm['X_aug_aff'] = aug_aff

In [None]:
# computes force directed layout coordinates from the augmented aff matrix
layout = harmony.plot.force_directed_layout(aug_aff, data_df.index)

In [None]:
# makes the plot
harmony.plot.plot_timepoints(layout, tp)

In [None]:
# Add layout coordinates to anndata object
adata.obsm['X_force_directed_layout'] = layout
# Keep a second copy as a plain array (via .values) under the key used as
# `basis` in the embedding plots
adata.obsm['force_directed_array'] = adata.obsm['X_force_directed_layout'].values
# Plot the force-directed layout coloured by age; `save` writes the figure
# into sc.settings.figdir
sc.pl.embedding(adata, basis='force_directed_array', color='age',
               frameon=False,
               save='_dev_harmony_age.png')

In [None]:
#from IPython.display import Image
#Image(filename=os.path.join(FIGURES_FOLDERNAME,'force_directed_array_dev_harmony_age.png'))

In [None]:
# One force-directed-layout plot per timepoint, each restricted to a single
# age group via `groups`. A loop replaces three copy-pasted calls; the saved
# file suffixes are unchanged (_dev_harmony_age_split12w.svg, ...17w.svg, ...20w.svg).
for age_group in ['12w', '17w', '20w']:
    sc.pl.embedding(adata, basis='force_directed_array', color='age',
                    frameon=False, groups=[age_group],
                    save='_dev_harmony_age_split{}.svg'.format(age_group))

In [None]:
sc.pl.embedding(adata, basis='force_directed_array', color='C_scANVI',
               frameon=False,
               save='_dev_harmony_celltype.svg')

In [None]:
%%time
# Build a 20-nearest-neighbour graph using adata.obsm['X_aug_aff'] as the cell
# representation with the correlation metric
sc.pp.neighbors(adata, n_neighbors=20, use_rep="X_aug_aff", metric='correlation')
# Compute the UMAP embedding from that neighbour graph
sc.tl.umap(adata)

In [None]:
def plot_umaps(anndata, parameters: list, filename: str):
    """
    Plot one UMAP per entry of `parameters`, stacked vertically, and save as SVG.

    Parameters
    ----------
    anndata
        Annotated data object passed to `sc.pl.umap`.
    parameters
        Names to colour the UMAP by; one panel is drawn per name, and the
        name is used as the panel title.
    filename
        Output file name, passed to `savesvg`.

    Raises
    ------
    ValueError
        If `parameters` is empty.
    """
    n_plots = len(parameters)
    if n_plots == 0:
        raise ValueError("`parameters` must contain at least one name to colour by")
    # squeeze=False always yields a 2-D array of Axes; without it a single
    # panel comes back as a bare Axes object and indexing it fails.
    fig, axs = plt.subplots(n_plots, 1, figsize=(7, 4*n_plots), squeeze=False)
    for ax, param in zip(axs[:, 0], parameters):
        sc.pl.umap(anndata, color=param, ax=ax, show=False, frameon=False)
        ax.set_title(param)
    plt.tight_layout()
    savesvg(filename, fig)
    plt.show()

In [None]:
plot_umaps(adata, ['sampletype', 'age', 'libbatch', 'sample', 'type', 'phase', 'sex', 'C_scANVI'], 
           filename = 'dev_UMAP_plots_augagg.svg')

In [None]:
adata.write(os.path.join(RESULTS_FOLDERNAME, '{}.h5ad'.format('dev_harmony')))

The augmented affinity matrix aug_aff can be used for a variety of downstream analyses, including:

1) RNA velocity analysis: scVelo is one package that allows you to perform RNA velocity analysis using an augmented affinity matrix. You can calculate the RNA velocity for each cell by estimating the spliced and unspliced transcript counts from the scRNA-seq data and projecting them onto the low-dimensional embedding generated from the augmented affinity matrix.

2) Trajectory inference: CellRank and Palantir are two packages that can use the augmented affinity matrix to infer cell trajectories. These methods allow you to identify branching points and end points, and to visualize the trajectory in a low-dimensional embedding.

3) Cell state transitions: The augmented affinity matrix can be used to visualize cell state transitions across different time points. You can identify cells that change their state across different time points and visualize the trajectory of these cells in the low-dimensional embedding.

4) Differential gene expression analysis: You can use the augmented affinity matrix to perform differential gene expression analysis across different time points. You can identify genes that are differentially expressed between different time points and visualize their expression patterns in the low-dimensional embedding.

Using the augmented affinity matrix to visualize cell state transitions across different time points can provide insights into the dynamics of cell populations over time. Specifically, you can identify cells that change their state across different time points and visualize the trajectory of these cells in the low-dimensional embedding generated from the augmented affinity matrix.

To do this, you can first identify cells that correspond to the same biological sample at different time points. For example, if you have scRNA-seq data from fetal samples at different gestational ages, you can group cells by sample and time point. Next, you can calculate the cell-state scores for each cell using a gene signature that represents the cell state of interest. For example, if you're interested in identifying cells that undergo a mesenchymal-to-epithelial transition (MET), you can use a gene signature that represents the mesenchymal and epithelial states.

Once you have the cell-state scores, you can identify cells that change their state across different time points. For example, you can calculate the difference in cell-state scores between cells from adjacent time points, and identify cells with the largest changes in cell-state scores. You can then visualize the trajectory of these cells in the low-dimensional embedding generated from the augmented affinity matrix. Cells that change their state in a continuous and gradual manner will appear as a smooth trajectory in the embedding, while cells that undergo a more abrupt transition will appear as a discontinuous jump in the trajectory.

Several packages can help identify marker genes and regulators associated with a particular biological process such as MET. One example is SCENIC (Single-Cell Regulatory Network Inference and Clustering), a computational method that uses single-cell RNA sequencing data to infer gene regulatory networks and identify transcription factors that drive gene expression in specific cell types or biological processes. SCENIC can be used to identify putative regulatory factors that drive MET.

## Palantir trajectory detection
Palantir is an algorithm developed by the Pe'er lab to align cells along differentiation trajectories. Palantir models differentiation as a stochastic process where stem cells differentiate to terminally differentiated cells by a series of steps through a low dimensional phenotypic manifold. Palantir effectively captures the continuity in cell states and the stochasticity in cell fate determination.

The first step in Palantir trajectory detection is to project the data onto diffusion maps. The Harmony augmented affinity matrix is used as the input for computing the diffusion maps. Please see https://github.com/dpeerlab/Palantir for more details on Palantir.

In [None]:
import palantir  # already imported in the setup cell; harmless re-import

# Diffusion maps computed from the Harmony augmented affinity matrix
dm_res = palantir.utils.run_diffusion_maps(aug_aff)
# Multiscale space from the diffusion-map result, using 8 eigenvectors
ms_data = palantir.utils.determine_multiscale_space(dm_res, n_eigs=8)
# Re-index by cell barcode (the stray trailing line-continuation backslash
# that used to follow this statement has been removed)
ms_data.index = data_df.index

# Store the "T" entry of the diffusion-map result and the multiscale
# coordinates on the AnnData object
adata.obsp["T"] = dm_res["T"]
adata.obsm["X_msdiff"] = ms_data.values

In [None]:
ms_data

In [None]:
%%time
sc.tl.tsne(adata,use_rep="X_msdiff",perplexity=100,learning_rate=adata.shape[0]/12)

In [None]:
# show=False keeps the figure open after plotting. With the default
# (show=True) the figure is displayed and closed before plt.savefig runs,
# so the saved files would contain an empty canvas.
sc.pl.tsne(adata, color="C_scANVI", show=False)
plt.savefig(os.path.join(FIGURES_FOLDERNAME, 'dev_msdiff_TSNE.svg'), format='svg')
plt.savefig(os.path.join(FIGURES_FOLDERNAME, 'dev_msdiff_TSNEpng.png'), format='png')
plt.show()
# Image(filename=os.path.join(FIGURES_FOLDERNAME,'msdiff_TSNEpng.png'))

In [None]:
# show=False keeps the figure open so that plt.savefig captures the plot
# instead of an empty canvas; the figure is displayed afterwards.
sc.pl.tsne(adata, color="ageint", legend_loc='right margin', cmap='viridis', show=False)
plt.savefig(os.path.join(FIGURES_FOLDERNAME, 'dev_msdiff_TSNEage.png'), format='png')
plt.show()
# Image(filename=os.path.join(FIGURES_FOLDERNAME,'msdiff_TSNEage.png'))

In [None]:
sc.pp.neighbors(adata, n_neighbors=30, use_rep="X_msdiff")
sc.tl.draw_graph(adata, init_pos="X_tsne", maxiter=500)

In [None]:
# scanpy closes the figure after writing it when `save=` is given, so a
# following plt.savefig would write an empty PNG. Plot once with show=False
# and write both formats from the same open figure.
# NOTE: the SVG is now written as 'dev_msdiff_graph.svg' in FIGURES_FOLDERNAME,
# without the basis prefix that scanpy's `save=` prepends to the file name.
sc.pl.draw_graph(adata, color="C_scANVI", show=False)
plt.savefig(os.path.join(FIGURES_FOLDERNAME, 'dev_msdiff_graph.svg'), format='svg')
plt.savefig(os.path.join(FIGURES_FOLDERNAME, 'dev_msdiff_graph.png'), format='png')
plt.show()
# Image(filename=os.path.join(FIGURES_FOLDERNAME,'msdiff_graph.png'))

In [None]:
sc.pl.draw_graph(adata,color=["age"])

In [None]:
sc.pl.draw_graph(adata,color=["ageint"], cmap='viridis')

In [None]:
sc.pl.draw_graph(adata,color=["phase"])

In [None]:
adata

In [None]:
sc.tl.paga(adata,"C_scANVI")
sc.pl.paga(adata,threshold=.05, node_size_scale=1,
          fontsize=4, fontoutline=1, frameon=False, save='_paga.png')

In [None]:
adata.write(os.path.join(RESULTS_FOLDERNAME, 'dev_harmony.h5ad'))

In [None]:
# One metadata row per sample type: the first non-null age / sex / libbatch
# value found in each group.
# observed=True restricts the output to sample types that actually occur;
# without it a categorical key produces all-NaN rows for unused categories
# (and newer pandas versions warn about the changing default).
sample_metadata = (
    adata.obs[['sampletype', 'age', 'sex', 'libbatch']]
    .groupby('sampletype', observed=True)
    .first()
)
sample_metadata.to_csv(os.path.join(RESULTS_FOLDERNAME, 'sample_metadata.csv'), index=True)

In [None]:
adata = sc.read_h5ad(os.path.join(RESULTS_FOLDERNAME, 'dev_harmony.h5ad'))

In [None]:
adata.X=adata.layers['scaled'].copy()
sc.pp.pca(adata, n_comps=50, svd_solver="arpack")
sc.pl.pca_loadings(adata, components='1,2,3,4,5,6,7,8')

In [None]:
adata.obs.columns

In [None]:
sc.pl.pca(adata, components=['1,2', '3,4', '5,6', '7,8'], ncols=2, color='pct_counts_mt')