In [None]:
import pandas as pd
import scvi
import gc
import scanpy as sc
import torch
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
sc.set_figure_params(figsize=(4, 4), frameon=False)
torch.set_float32_matmul_precision("high")
scvi.settings.seed = 0

In [None]:
# Fraction of marker peaks to plot
topFrac = 0.05

# *M. lignano*

In [None]:
peaks = scvi.data.read_h5ad( 'ArchROutputs/Mlig/Mlig.peaks.h5ad' )

peaks

In [None]:
peaks.obs['Injury'] = 'cut'
peaks.obs.loc[peaks.obs.Sample=='uncut','Injury'] = 'uncut'

In [None]:
peakdf = pd.read_csv( 'ArchROutputs/Mlig/Mlig.peaks.csv', index_col=0 )

peakdf.index = peakdf['seqnames'] + ':' + peakdf['start'].astype(str) + '-' + peakdf['end'].astype(str)
peakdf.head()

In [None]:
peaks.var = peakdf

del peakdf
gc.collect()

In [None]:
peaks.X = peaks.X.tocsr()

print(np.max(peaks.X))
print(np.min(peaks.X))

In [None]:
scvi.model.PEAKVI.setup_anndata( peaks, batch_key='Injury' )

In [None]:
pvi = scvi.model.PEAKVI(peaks)
pvi.train()

pvi.save( "scVI_models/MligPeakVI", overwrite=True )

In [None]:
pvi = scvi.model.PEAKVI.load( "scVI_models/MligPeakVI", peaks )

In [None]:
library_size_factor = pvi.get_library_size_factors()
sns.histplot(library_size_factor)
plt.show()

In [None]:
region_factor = pvi.get_region_factors()

sns.histplot(region_factor)
plt.show()

In [None]:
latent = pvi.get_latent_representation()
peaks.obsm["X_PeakVI"] = latent

print(latent.shape)

In [None]:
pd.DataFrame( index=peaks.obs_names, 
              columns=['X'+str(i) for i in range(latent.shape[1])],
              data=latent, ).to_csv( 'scVI_models/Mlig.pvi_latent_dims.csv' )

## Marker peak identification

In [None]:
# Get all cluster labels (sorted so the run order is stable)
uLabels = np.sort(peaks.obs.GroupFigure.unique())
# Ignore this one small population that we can't identify
uLabels = uLabels[~np.isin(uLabels,['???-1'])]

# Get cluster labels per nucleus, aligned with the rows of `peaks`
labels = np.array(peaks.obs.GroupFigure.values)

# Re-seed right before the differential accessibility calls
scvi.settings.seed = 0

# One-vs-rest differential accessibility: each cluster (idx1) against all
# other nuclei (idx2). The per-cluster tables are collected in a list and
# concatenated once at the end instead of growing a DataFrame inside the loop.
daList = []
for l in uLabels:
    print( 'Computing DA for {0}'.format(l) )
    indTarg = (labels==l)
    da = pvi.differential_accessibility( idx1=indTarg, idx2=~indTarg )
    # Tag each row with its cluster and make it a second index level, so the
    # combined table is keyed by (peak, GroupFigure)
    da['GroupFigure'] = l
    da = da.set_index( 'GroupFigure', append=True )
    print( '{0} putatively DA peaks identified'.format(da.is_da_fdr.sum()) )
    daList.append( da )

# pd.concat rejects an empty list, so fall back to an empty frame in that case
daTot = pd.concat( daList, axis=0 ) if daList else pd.DataFrame()

daTot.to_csv( 'scVI_models/Mlig.peakvi_marker_peaks.csv.gz', compression='gzip' )

In [None]:
# Get top most specific markers based on effect_size
# Only peaks flagged as differentially accessible (is_da_fdr) are considered
markers = daTot[daTot.is_da_fdr]
toplot = []
for key, grp in markers.groupby('GroupFigure'):
    # Keep the top `topFrac` share of this cluster's significant peaks
    nTop = int(len(grp.index)*topFrac)
    # Ascending sort, so the most negative effect sizes come first.
    # NOTE(review): presumably these are the peaks most enriched in the target
    # cluster (idx1) -- confirm against the sign convention of effect_size.
    ranked = grp.sort_values('effect_size')
    # The index is (peak, GroupFigure); take only the peak level so that
    # cluster labels never end up mixed into the list of peak names
    toplot.extend( ranked.index.get_level_values(0)[:nTop] )

# De-duplicate, then express the selection in the order of peaks.var_names
toplot = np.unique( toplot )
toplot = peaks.var_names[peaks.var_names.isin(toplot)]
toplot.size

In [None]:
# Average accessibility over each cluster
labels = np.array(peaks.obs.GroupFigure.values)
# One row per cluster (in `uLabels` order), one column per peak
ctMeans = np.zeros((len(uLabels),peaks.n_vars))

for i, l in enumerate(uLabels):
    # Column means are taken on the sparse matrix directly, so the cluster's
    # submatrix is never expanded to a dense array; the 1 x n_vars result is
    # flattened to fill the row
    ctMeans[i,:] = np.asarray( peaks.X[labels==l,:].mean(axis=0) ).ravel()

In [None]:
# Do an initial clustering to make it look nice
# Rows are clusters in `uLabels` order (row_cluster=False keeps that order),
# columns are the selected marker peaks; standard_scale=1 rescales each column.
g = sns.clustermap( ctMeans[:,peaks.var_names.isin(toplot)], 
                    cmap='Reds', standard_scale=1, row_cluster=False )
g.ax_heatmap.set_yticklabels(uLabels, rotation=0)
plt.show()

# Sort them so it looks vaguely block diagonal
# For every peak, the row (cluster) index with the highest mean accessibility ...
argMs = np.argmax( ctMeans, axis=0 )
# ... restricted to the plotted peaks
argMs = argMs[peaks.var_names.isin(toplot)]
# Column order chosen by the hierarchical clustering above
srtInd = np.array(g.dendrogram_col.reordered_ind)
# Top cluster of each column, in dendrogram order
argMs = argMs[srtInd]
# Group columns by their top cluster, keeping the dendrogram order inside each group
srtInd = np.concatenate([srtInd[argMs==i] for i in range(uLabels.size)])

In [None]:
# Final figure: same matrix with columns in the order computed above and no
# re-clustering on either axis; saved as a PDF panel
g = sns.clustermap( ctMeans[:,peaks.var_names.isin(toplot)][:,srtInd],
                    cmap='Reds', standard_scale=1, row_cluster=False, col_cluster=False )
g.ax_heatmap.set_yticklabels(uLabels,rotation=0)
plt.savefig('Plots/FigS2/PanelS2a_Mlig.pdf',format='pdf')
plt.show()

In [None]:
# Release the marker-peak intermediates before the subset analyses
del daTot, da, labels, uLabels, ctMeans

gc.collect()

## Neural only

In [None]:
neuro = peaks[peaks.obs.GroupFigure.str.startswith('Neural'),:].copy()

In [None]:
# tidy up the rest of the data
del latent
del pvi

gc.collect()

In [None]:
# filter to be accessible in at least 30 cells (comes out to ~0.5% of all cells)
sc.pp.filter_genes( neuro, min_cells=30 )

neuro

In [None]:
scvi.model.PEAKVI.setup_anndata( neuro )

In [None]:
pvi = scvi.model.PEAKVI( neuro )
pvi.train()

In [None]:
pvi.save( 'scVI_models/MligPeakVINeural/', overwrite=True )

In [None]:
pvi = scvi.model.PEAKVI.load( 'scVI_models/MligPeakVINeural/', adata=neuro )

In [None]:
library_size_factor = pvi.get_library_size_factors()
sns.histplot(library_size_factor)
plt.show()

In [None]:
region_factor = pvi.get_region_factors()
sns.histplot(region_factor)
plt.show()

In [None]:
latent = pvi.get_latent_representation()
neuro.obsm["X_PeakVI"] = latent

print(latent.shape)

In [None]:
pd.DataFrame( index=neuro.obs_names, data=latent )\
    .to_csv( 'scVI_models/Mlig.pvi_neural_latent_dims.csv' )

In [None]:
del latent
del neuro
del pvi

gc.collect()

## Muscle only

In [None]:
muscle = peaks[peaks.obs.GroupFigure.str.startswith('Muscle'),:].copy()

In [None]:
# tidy up the rest of the data
del latent
del peaks
del pvi

gc.collect()

In [None]:
# filter to be accessible in at least 30 cells (comes out to ~0.5% of all cells)
sc.pp.filter_genes( muscle, min_cells=30 )

neuro

In [None]:
scvi.model.PEAKVI.setup_anndata( muscle )

In [None]:
pvi = scvi.model.PEAKVI( muscle )
pvi.train()

In [None]:
pvi.save( 'scVI_models/MligPeakVIMuscle/', overwrite=True )

In [None]:
pvi = scvi.model.PEAKVI.load( 'scVI_models/MligPeakVIMuscle/', adata=muscle )

In [None]:
library_size_factor = pvi.get_library_size_factors()
sns.histplot(library_size_factor)
plt.show()

In [None]:
region_factor = pvi.get_region_factors()
sns.histplot(region_factor)
plt.show()

In [None]:
latent = pvi.get_latent_representation()
muscle.obsm["X_PeakVI"] = latent

print(latent.shape)

In [None]:
pd.DataFrame( index=muscle.obs_names, data=latent )\
    .to_csv( 'scVI_models/Mlig.peakvi_muscle_latent_dims.csv' )

In [None]:
del latent
del muscle
del pvi

gc.collect()

# *S. mediterranea*

In [None]:
rna = scvi.data.read_h5ad( 'GEXCounts/Smed/Smed.raw_RNA_counts.h5ad' )
rna.X = rna.X.tocsr()

rna

In [None]:
sc.pp.filter_genes( rna, min_cells=10 )

rna

In [None]:
scvi.model.SCVI.setup_anndata( rna, batch_key='Sample' )

In [None]:
svi = scvi.model.SCVI( rna, n_latent=20 )
svi.train( check_val_every_n_epoch=1 )

In [None]:
plt.plot( svi.history['reconstruction_loss_validation'].values )
plt.show()
plt.plot( svi.history['elbo_validation'].values )
plt.show()

In [None]:
svi.save( 'scVI_models/SmedSCVI', overwrite=True )

In [None]:
svi = scvi.model.SCVI.load( 'scVI_models/SmedSCVI/', adata=rna )

In [None]:
latent = svi.get_latent_representation()
rna.obsm["X_PeakVI"] = latent

print(latent.shape)

In [None]:
pd.DataFrame( index=rna.obs_names, 
              columns=['X'+str(i) for i in range(latent.shape[1])],
              data=latent, ).to_csv( 'scVI_models/Smed.scvi_latent_dims.csv' )

## Marker peak identification

For *S. mediterranea*, PeakVI is used only for marker peak identification; the embeddings come from scVI, which gave much nicer results than PeakVI for this species.

In [None]:
peaks = scvi.data.read_h5ad( 'ArchROutputs/Smed/Smed.peaks.h5ad' )

peaks

In [None]:
peakdf = pd.read_csv( 'ArchROutputs/Smed/Smed.peaks.csv', index_col=0 )

peakdf.index = peakdf['seqnames'] + ':' + peakdf['start'].astype(str) + '-' + peakdf['end'].astype(str)
peakdf.head()

In [None]:
peaks.var = peakdf

In [None]:
peaks.X = peaks.X.tocsr()

print(np.max(peaks.X))
print(np.min(peaks.X))

In [None]:
scvi.model.PEAKVI.setup_anndata(peaks,batch_key='Sample')

In [None]:
pvi = scvi.model.PEAKVI(peaks)
pvi.train()

pvi.save("scVI_models/SmedPeakVI", overwrite=True)

In [None]:
pvi = scvi.model.PEAKVI.load("scVI_models/SmedPeakVI", peaks)

In [None]:
# Get all cluster labels (sorted so the run order is stable)
uLabels = np.sort(peaks.obs.GroupFigure.unique())

# Get cluster labels per nucleus, aligned with the rows of `peaks`
labels = np.array(peaks.obs.GroupFigure.values)

# Re-seed right before the differential accessibility calls
scvi.settings.seed = 0

# One-vs-rest differential accessibility: each cluster (idx1) against all
# other nuclei (idx2). The per-cluster tables are collected in a list and
# concatenated once at the end instead of growing a DataFrame inside the loop.
daList = []
for l in uLabels:
    print( 'Computing DA for {0}'.format(l) )
    indTarg = (labels==l)
    da = pvi.differential_accessibility( idx1=indTarg, idx2=~indTarg )
    # Tag each row with its cluster and make it a second index level, so the
    # combined table is keyed by (peak, GroupFigure)
    da['GroupFigure'] = l
    da = da.set_index( 'GroupFigure', append=True )
    print( '{0} putative DA peaks detected'.format(da.is_da_fdr.sum()) )
    daList.append( da )

# pd.concat rejects an empty list, so fall back to an empty frame in that case
daTot = pd.concat( daList, axis=0 ) if daList else pd.DataFrame()

daTot.to_csv( 'scVI_models/Smed.peakvi_marker_peaks.csv.gz', compression='gzip' )

In [None]:
# Get top most specific markers based on effect_size
# Only peaks flagged as differentially accessible (is_da_fdr) are considered
markers = daTot[daTot.is_da_fdr]
toplot = []
for key, grp in markers.groupby('GroupFigure'):
    # Keep the top `topFrac` share of this cluster's significant peaks
    nTop = int(len(grp.index)*topFrac)
    # Ascending sort, so the most negative effect sizes come first.
    # NOTE(review): presumably these are the peaks most enriched in the target
    # cluster (idx1) -- confirm against the sign convention of effect_size.
    ranked = grp.sort_values('effect_size')
    # The index is (peak, GroupFigure); take only the peak level so that
    # cluster labels never end up mixed into the list of peak names
    toplot.extend( ranked.index.get_level_values(0)[:nTop] )

# De-duplicate, then express the selection in the order of peaks.var_names
toplot = np.unique( toplot )
toplot = peaks.var_names[peaks.var_names.isin(toplot)]
toplot.size

In [None]:
# Average accessibility over each cluster
labels = np.array(peaks.obs.GroupFigure.values)
# One row per cluster (in `uLabels` order), one column per peak
ctMeans = np.zeros((len(uLabels),peaks.n_vars))

for i, l in enumerate(uLabels):
    # Column means are taken on the sparse matrix directly, so the cluster's
    # submatrix is never expanded to a dense array; the 1 x n_vars result is
    # flattened to fill the row
    ctMeans[i,:] = np.asarray( peaks.X[labels==l,:].mean(axis=0) ).ravel()

In [None]:
# Do an initial clustering to make it look nice
# Rows are clusters in `uLabels` order (row_cluster=False keeps that order),
# columns are the selected marker peaks; standard_scale=1 rescales each column.
g = sns.clustermap( ctMeans[:,peaks.var_names.isin(toplot)], 
                    cmap='Reds', standard_scale=1, row_cluster=False )
g.ax_heatmap.set_yticklabels(uLabels, rotation=0)
plt.show()

# Sort them so it looks vaguely block diagonal
# For every peak, the row (cluster) index with the highest mean accessibility ...
argMs = np.argmax( ctMeans, axis=0 )
# ... restricted to the plotted peaks
argMs = argMs[peaks.var_names.isin(toplot)]
# Column order chosen by the hierarchical clustering above
srtInd = np.array(g.dendrogram_col.reordered_ind)
# Top cluster of each column, in dendrogram order
argMs = argMs[srtInd]
# Group columns by their top cluster, keeping the dendrogram order inside each group
srtInd = np.concatenate([srtInd[argMs==i] for i in range(uLabels.size)])

In [None]:
# Final figure: same matrix with columns in the order computed above and no
# re-clustering on either axis; saved as a PDF panel
g = sns.clustermap( ctMeans[:,peaks.var_names.isin(toplot)][:,srtInd],
                    cmap='Reds', standard_scale=1, row_cluster=False, col_cluster=False )
g.ax_heatmap.set_yticklabels(uLabels,rotation=0)
plt.savefig('Plots/FigS2/PanelS2a_Smed.pdf',format='pdf')
plt.show()

In [None]:
# Release the peak data, the PeakVI model and the marker-peak intermediates
del peaks, pvi, daTot, da, labels, uLabels, ctMeans

gc.collect()

## Neural only

In [None]:
neuro = rna[rna.obs.GroupFigure.str.startswith('Neural'),:].copy()

In [None]:
# tidy up the rest of the data
del latent
del svi

gc.collect()

In [None]:
sc.pp.filter_genes( neuro, min_cells=10 )

neuro

In [None]:
scvi.model.SCVI.setup_anndata( neuro, batch_key='Sample' )

In [None]:
svi = scvi.model.SCVI( neuro, n_latent=20 )
svi.train( check_val_every_n_epoch=1 )

In [None]:
plt.plot( svi.history['reconstruction_loss_validation'].values )
plt.show()
plt.plot( svi.history['elbo_validation'].values )
plt.show()

In [None]:
svi.save( 'scVI_models/SmedSCVINeural', overwrite=True )

In [None]:
svi = scvi.model.SCVI.load( 'scVI_models/SmedSCVINeural/', adata=neuro )

In [None]:
latent = svi.get_latent_representation()
neuro.obsm["X_PeakVI"] = latent

print(latent.shape)

In [None]:
pd.DataFrame( index=neuro.obs_names, data=latent )\
    .to_csv( 'scVI_models/Smed.scvi_neural_latent_dims.csv' )

In [None]:
del latent
del neuro
del svi

gc.collect()

## Muscle only

In [None]:
muscle = rna[rna.obs.GroupFigure.str.startswith('Muscle'),:].copy()

In [None]:
# tidy up the rest of the data
del latent
del rna
del svi

gc.collect()

In [None]:
sc.pp.filter_genes( muscle, min_cells=10 )

muscle

In [None]:
scvi.model.SCVI.setup_anndata( muscle, batch_key='Sample' )

In [None]:
svi = scvi.model.SCVI( muscle, n_latent=20 )
svi.train( check_val_every_n_epoch=1 )

In [None]:
plt.plot( svi.history['reconstruction_loss_validation'].values )
plt.show()
plt.plot( svi.history['elbo_validation'].values )
plt.show()

In [None]:
svi.save( 'scVI_models/SmedSCVIMuscle/', overwrite=True )

In [None]:
svi = scvi.model.SCVI.load( 'scVI_models/SmedSCVIMuscle/', adata=muscle )

In [None]:
latent = svi.get_latent_representation()
muscle.obsm["X_PeakVI"] = latent

print(latent.shape)

In [None]:
pd.DataFrame( index=muscle.obs_names, data=latent )\
    .to_csv( 'scVI_models/Smed.scvi_muscle_latent_dims.csv' )

In [None]:
del latent
del muscle
del svi

gc.collect()

# *S. mansoni*

In [None]:
peaks = scvi.data.read_h5ad( 'ArchROutputs/Sman/Sman.peaks.h5ad' )

peaks

In [None]:
peakdf = pd.read_csv( 'ArchROutputs/Sman/Sman.peaks.csv', index_col=0 )

peakdf.index = peakdf['seqnames'] + ':' + peakdf['start'].astype(str) + '-' + peakdf['end'].astype(str)
peakdf.head()

In [None]:
peaks.var = peakdf

del peakdf
gc.collect()

In [None]:
peaks.X = peaks.X.tocsr()

print(np.max(peaks.X))
print(np.min(peaks.X))

In [None]:
scvi.model.PEAKVI.setup_anndata(peaks,batch_key='Sample')

In [None]:
pvi = scvi.model.PEAKVI(peaks)
pvi.train()

pvi.save("scVI_models/SmanPeakVI/", overwrite=True)

In [None]:
pvi = scvi.model.PEAKVI.load("scVI_models/SmanPeakVI/", peaks)

In [None]:
library_size_factor = pvi.get_library_size_factors()
sns.histplot(library_size_factor)
plt.show()

In [None]:
region_factor = pvi.get_region_factors()
sns.histplot(region_factor)
plt.show()

In [None]:
latent = pvi.get_latent_representation()
peaks.obsm["X_PeakVI"] = latent

print(latent.shape)

In [None]:
pd.DataFrame( index=peaks.obs_names, 
              columns=['X'+str(i) for i in range(latent.shape[1])],
              data=latent ).to_csv( 'scVI_models/Sman.pvi_latent_dims.csv' )

## Marker peak identification

In [None]:
# Get all cluster labels (sorted so the run order is stable)
uLabels = np.sort(peaks.obs.GroupFigure.unique())

# Get cluster labels per nucleus, aligned with the rows of `peaks`
labels = np.array(peaks.obs.GroupFigure.values)

# Re-seed right before the differential accessibility calls
scvi.settings.seed = 0

# One-vs-rest differential accessibility: each cluster (idx1) against all
# other nuclei (idx2). The per-cluster tables are collected in a list and
# concatenated once at the end instead of growing a DataFrame inside the loop.
daList = []
for l in uLabels:
    print( 'Computing DA for {0}'.format(l) )
    indTarg = (labels==l)
    da = pvi.differential_accessibility( idx1=indTarg, idx2=~indTarg )
    # Tag each row with its cluster and make it a second index level, so the
    # combined table is keyed by (peak, GroupFigure)
    da['GroupFigure'] = l
    da = da.set_index( 'GroupFigure', append=True )
    print( '{0} putative DA peaks detected'.format(da.is_da_fdr.sum()) )
    daList.append( da )

# pd.concat rejects an empty list, so fall back to an empty frame in that case
daTot = pd.concat( daList, axis=0 ) if daList else pd.DataFrame()

daTot.to_csv( 'scVI_models/Sman.peakvi_marker_peaks.csv.gz', compression='gzip' )

In [None]:
# Get top most specific markers based on effect_size
# Only peaks flagged as differentially accessible (is_da_fdr) are considered
markers = daTot[daTot.is_da_fdr]
toplot = []
for key, grp in markers.groupby('GroupFigure'):
    # Keep the top `topFrac` share of this cluster's significant peaks
    nTop = int(len(grp.index)*topFrac)
    # Ascending sort, so the most negative effect sizes come first.
    # NOTE(review): presumably these are the peaks most enriched in the target
    # cluster (idx1) -- confirm against the sign convention of effect_size.
    ranked = grp.sort_values('effect_size')
    # The index is (peak, GroupFigure); take only the peak level so that
    # cluster labels never end up mixed into the list of peak names
    toplot.extend( ranked.index.get_level_values(0)[:nTop] )

# De-duplicate, then express the selection in the order of peaks.var_names
toplot = np.unique( toplot )
toplot = peaks.var_names[peaks.var_names.isin(toplot)]
toplot.size

In [None]:
# Average accessibility over each cluster
labels = np.array(peaks.obs.GroupFigure.values)
# One row per cluster (in `uLabels` order), one column per peak
ctMeans = np.zeros((len(uLabels),peaks.n_vars))

for i, l in enumerate(uLabels):
    # Column means are taken on the sparse matrix directly, so the cluster's
    # submatrix is never expanded to a dense array; the 1 x n_vars result is
    # flattened to fill the row
    ctMeans[i,:] = np.asarray( peaks.X[labels==l,:].mean(axis=0) ).ravel()

In [None]:
# Do an initial clustering to make it look nice
# Rows are clusters in `uLabels` order (row_cluster=False keeps that order),
# columns are the selected marker peaks; standard_scale=1 rescales each column.
g = sns.clustermap( ctMeans[:,peaks.var_names.isin(toplot)], 
                    cmap='Reds', standard_scale=1, row_cluster=False )
g.ax_heatmap.set_yticklabels(uLabels, rotation=0)
plt.show()

# Sort them so it looks vaguely block diagonal
# For every peak, the row (cluster) index with the highest mean accessibility ...
argMs = np.argmax( ctMeans, axis=0 )
# ... restricted to the plotted peaks
argMs = argMs[peaks.var_names.isin(toplot)]
# Column order chosen by the hierarchical clustering above
srtInd = np.array(g.dendrogram_col.reordered_ind)
# Top cluster of each column, in dendrogram order
argMs = argMs[srtInd]
# Group columns by their top cluster, keeping the dendrogram order inside each group
srtInd = np.concatenate([srtInd[argMs==i] for i in range(uLabels.size)])

In [None]:
# Final figure: same matrix with columns in the order computed above and no
# re-clustering on either axis; saved as a PDF panel
g = sns.clustermap( ctMeans[:,peaks.var_names.isin(toplot)][:,srtInd],
                    cmap='Reds', standard_scale=1, row_cluster=False, col_cluster=False )
g.ax_heatmap.set_yticklabels(uLabels,rotation=0)
plt.savefig('Plots/FigS2/PanelS2a_Sman.pdf',format='pdf')
plt.show()

In [None]:
# Release the marker-peak intermediates before the subset analyses
del daTot, da, labels, uLabels, ctMeans

gc.collect()

## Neural only

In [None]:
neuro = peaks[peaks.obs.GroupFigure.str.startswith('Neural'),:].copy()

In [None]:
del latent
del pvi
gc.collect()

In [None]:
sc.pp.filter_genes( neuro, min_cells=10 )

neuro

In [None]:
scvi.model.PEAKVI.setup_anndata( neuro )

In [None]:
pvi = scvi.model.PEAKVI( neuro )
pvi.train( check_val_every_n_epoch=1 )

In [None]:
plt.plot( pvi.history['reconstruction_loss_validation'].values )
plt.show()

plt.plot( pvi.history['elbo_validation'].values )
plt.show()

In [None]:
pvi.save( 'scVI_models/SmanPeakVINeural/', overwrite=True )

In [None]:
pvi = scvi.model.PEAKVI.load( 'scVI_models/SmanPeakVINeural/', adata=neuro )

In [None]:
library_size_factor = pvi.get_library_size_factors()
sns.histplot(library_size_factor)
plt.show()

In [None]:
region_factor = pvi.get_region_factors()
sns.histplot(region_factor)
plt.show()

In [None]:
latent = pvi.get_latent_representation()
neuro.obsm["X_PeakVI"] = latent

print(latent.shape)

In [None]:
pd.DataFrame( index=neuro.obs_names, data=latent )\
    .to_csv( 'scVI_models/Sman.pvi_neural_latent_dims.csv' )

In [None]:
del latent
del neuro
del pvi

gc.collect()

## Muscle only

In [None]:
muscle = peaks[peaks.obs.GroupFigure.str.startswith('Muscle'),:].copy()

In [None]:
del latent
del peaks
del pvi
gc.collect()

In [None]:
sc.pp.filter_genes( muscle, min_cells=10 )

neuro

In [None]:
scvi.model.PEAKVI.setup_anndata( muscle )

In [None]:
pvi = scvi.model.PEAKVI( muscle )
pvi.train( check_val_every_n_epoch=1 )

In [None]:
plt.plot( pvi.history['reconstruction_loss_validation'].values )
plt.show()

plt.plot( pvi.history['elbo_validation'].values )
plt.show()

In [None]:
pvi.save( 'scVI_models/SmanPeakVIMuscle/', overwrite=True )

In [None]:
pvi = scvi.model.PEAKVI.load( 'scVI_models/SmanPeakVIMuscle/', adata=muscle )

In [None]:
library_size_factor = pvi.get_library_size_factors()
sns.histplot(library_size_factor)
plt.show()

In [None]:
region_factor = pvi.get_region_factors()
sns.histplot(region_factor)
plt.show()

In [None]:
latent = pvi.get_latent_representation()
muscle.obsm["X_PeakVI"] = latent

print(latent.shape)

In [None]:
pd.DataFrame( index=muscle.obs_names, data=latent )\
    .to_csv( 'scVI_models/Sman.peakvi_muscle_latent_dims.csv' )

In [None]:
del latent
del muscle
del pvi

gc.collect()