In [None]:
import os
from re import search
import pandas as pd
import numpy as np

import scanpy as sc
import bbknn

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

from scipy.stats import zscore
from scipy import sparse, stats

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

import gc

import igraph as ig

import warnings

matplotlib.rcParams.update({'font.size': 12})
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
warnings.filterwarnings('ignore')

In [None]:
# Fraction of observations used for training; passed as the trainRatio
# argument to every checkLogisticPredictability call in this notebook
trainRatio = 0.7

In [None]:
# Check how well we can predict a set of labels from a given predictor matrix
# @param predictors: (n_observations,n_features) matrix of predictors,
#   as either a numpy array or scipy.sparse.csr_matrix
# @param labels: (n_observations,) vector of discrete labels per observation
#   as a numpy array (other array-likes are converted with np.asarray)
# @param labelOrder: optional iterable (list, tuple, numpy array, ...)
#   specifying the unique label values in labels, ordered as they should be
#   in the output. Otherwise, the sorted unique label values are used
# @param trainRatio: fraction of observations to use in training the models
# @param minGrpSize: group size threshold. Labels with minGrpSize or fewer
#   observations are not modelled and are reported with a Matthews
#   Correlation of 0. Independently of this threshold, a label is also
#   reported as 0 when it (or the rest of the data) has fewer than 2
#   observations, since a stratified train/test split is then impossible
# @return (n_unique_labels,) vector of Matthews correlation between true and
#   predicted labels for the trained one-vs-rest logistic regression models
#   on the test datasets for each unique label in labels. If labelOrder is
#   specified, these values correspond to the unique labels provided there,
#   otherwise, they correspond to the sorted unique values of labels
def checkLogisticPredictability( predictors, labels, labelOrder=None, trainRatio=0.7, minGrpSize=1 ):
    labels = np.asarray(labels)
    # Determine the order if not provided (np.unique returns sorted values)
    if labelOrder is None:
        labelOrder = np.unique(labels)
    else:
        # Accept any iterable, not only numpy arrays, as documented above
        labelOrder = np.asarray(list(labelOrder))
    nL = labelOrder.size
    mc = np.zeros(nL)
    # Set up one-vs-rest comparisons for each unique label
    for i in tqdm(range(nL)):
        Y = labels==labelOrder[i]
        nPos = int(Y.sum())
        nNeg = int(Y.size) - nPos
        # Skip groups that are too small to model; mc[i] stays at 0.
        # Either class having fewer than 2 members cannot be stratified
        # across the train and test sets, so those are skipped as well
        if nPos <= minGrpSize or nPos < 2 or nNeg < 2:
            continue
        # Split the data, trying to retain similar ratios of
        # positive and negative observations in the train
        # and test sets
        Xtr, Xte, Ytr, Yte = train_test_split( predictors, Y,
                                               train_size=trainRatio,
                                               random_state=0,
                                               stratify=Y )
        # train and test the model
        model = LogisticRegression( C=1e5, random_state=0, n_jobs=6 )
        model = model.fit( Xtr, Ytr )
        mc[i] = metrics.matthews_corrcoef( Yte, model.predict( Xte ) )
    return mc

# *M. lignano* subclustering

In [None]:
# Load the M. lignano peak matrix and the per-peak annotation table
adataMP = sc.read_h5ad( 'ArchROutputs/Mlig/Mlig.peaks.h5ad' )
chipseekr = pd.read_csv( 'Metadata/Mlig.chipseekr_annots.csv' )
# Index the annotations by 'seqnames:start-end' so they can be looked up
# with the peak names (assumes adataMP.var_names use the same format)
chipseekr.index = chipseekr.seqnames + ':' + \
                  chipseekr.start.astype(str) + '-' + \
                  chipseekr.end.astype(str)
# This cell originally requested the column 'trainscriptId', which looks like
# a typo of ChIPseeker's 'transcriptId'. Prefer the correctly spelled column
# and only fall back to the original spelling if the CSV really uses it
txCol = 'transcriptId' if 'transcriptId' in chipseekr.columns else 'trainscriptId'
adataMP.var['NearGene'] = chipseekr.loc[adataMP.var_names,txCol]
adataMP.var['Type'] = chipseekr.loc[adataMP.var_names,'annotation']
adataMP.var['GeneDist'] = chipseekr.loc[adataMP.var_names,'distanceToTSS']

# Keep only the cells whose GroupFigure annotation starts with 'Muscle', then
# attach the precomputed latent dimensions and SEACell assignments, both
# reordered to match the retained cells
adataMP = adataMP[adataMP.obs.GroupFigure.str.startswith('Muscle'),:].copy()
adataMP.obsm['X_PVI'] = pd.read_csv( 'scVI_models/Mlig.peakvi_muscle_latent_dims.csv',
                                     index_col=0 ).loc[adataMP.obs_names,:].values
adataMP.obs['SEACell'] = pd.read_csv( 'SEACellsOutput/Mlig.all_SEACell_assignments.csv',
                                      index_col=0 ).loc[adataMP.obs_names,'SEACell']

adataMP

In [None]:
# Drop features (peaks in this matrix) detected in fewer than 10 of the
# retained cells; filter_genes modifies adataMP in place
sc.pp.filter_genes( adataMP, min_cells=10 )

# Display the filtered object summary
adataMP

In [None]:
# Batch-correct the latent dimensions across samples with Harmony
sc.external.pp.harmony_integrate( adataMP, key='Sample', basis='X_PVI', adjusted_basis='X_Harmony' )
# Build the neighbour graph on the Harmony-corrected embedding so that the
# UMAP and leiden steps below actually use the corrected representation
sc.pp.neighbors( adataMP, use_rep='X_Harmony' )

In [None]:
# Compute the UMAP embedding (sc.tl.umap works from the neighbour graph
# stored in adataMP) and plot it coloured by the GroupFigure annotation
sc.tl.umap( adataMP, min_dist=0.05, spread=1.5 )
fig = plt.figure(figsize=(8,6))
sc.pl.umap( adataMP, color='GroupFigure', size=50, legend_loc='right margin', ax=fig.gca() )
plt.show()

In [None]:
# Run leiden clustering on the muscle data alone
# The resolution parameter was tuned until we got
# decent predictability, shown below
sc.tl.leiden( adataMP, resolution=0.3 )
# Plot the resulting clusters on the UMAP
fig = plt.figure(figsize=(8,6))
sc.pl.umap( adataMP, color='leiden', size=50, legend_loc='right margin', ax=fig.gca() )
plt.show()

In [None]:
# One-vs-rest predictability of each leiden cluster from the peak matrix;
# the output is one Matthews correlation per cluster, in sorted cluster order
checkLogisticPredictability( adataMP.X, adataMP.obs.leiden.astype(int).values, trainRatio=trainRatio )

In [None]:
# Save the per-cell SEACell and leiden cluster assignments (indexed by cell name)
adataMP.obs[['SEACell','leiden']].to_csv( 'Metadata/Mlig.muscle_subcluster_annotations.csv' )

In [None]:
# Release the M. lignano object before the next dataset is loaded
del adataMP

# Force a garbage collection pass; its return value is shown as the cell output
gc.collect()

# *S. mediterranea* subclustering

In [None]:
# Read in the RNA data and transfer annotations
adataPG = sc.read_h5ad( 'GEXCounts/Smed/Smed.raw_RNA_counts.h5ad' )
adataPG.obs = sc.read_h5ad( 'ArchROutputs/Smed/Smed.peaks.h5ad' )\
                .obs.loc[adataPG.obs_names,:]
sc.pp.normalize_per_cell( adataPG )
sc.pp.log1p( adataPG, base=2 )

adataPG = adataPG[adataPG.obs.GroupFigure.str.startswith('Muscle'),:].copy()
adataPG.obsm['X_PVI'] = pd.read_csv( 'scVI_models/Smed.scvi_muscle_latent_dims.csv',
                                     index_col=0 ).loc[adataPG.obs_names,:].values
adataPG.obs['SEACell'] = pd.read_csv( 'SEACellsOutput/Smed.all_SEACell_assignments.csv', 
                                      index_col=0 ).loc[adataPG.obs_names,'SEACell']

adataPG

In [43]:
# Batch-correct the latent dimensions across samples with Harmony.
# The object loaded for this species is adataPG; the name adataPP used here
# previously is not defined anywhere in this notebook
sc.external.pp.harmony_integrate( adataPG, key='Sample', basis='X_PVI', adjusted_basis='X_Harmony' )
# Build the neighbour graph on the Harmony-corrected embedding so that the
# UMAP and leiden steps below actually use the corrected representation
sc.pp.neighbors( adataPG, use_rep='X_Harmony' )

In [None]:
# Compute the UMAP embedding of the S. mediterranea muscle cells and plot it
# coloured by the GroupFigure annotation. This operates on adataPG, the object
# loaded and clustered in this section (adataPP is not defined in this notebook)
sc.tl.umap( adataPG, min_dist=0.05, spread=1.5 )
fig = plt.figure(figsize=(8,6))
sc.pl.umap( adataPG, color='GroupFigure', size=50, legend_loc='right margin', ax=fig.gca() )
plt.show()

In [None]:
# Run leiden clustering on the S. mediterranea muscle cells and plot the
# resulting clusters on the UMAP
sc.tl.leiden( adataPG, resolution=0.8 )
fig = plt.figure(figsize=(8,6))
sc.pl.umap( adataPG, color='leiden', size=50, legend_loc='right margin', ax=fig.gca() )
plt.show()

In [None]:
# One-vs-rest predictability of each leiden cluster from the normalized,
# log-transformed RNA matrix; one Matthews correlation per cluster
checkLogisticPredictability( adataPG.X, adataPG.obs.leiden.astype(int).values, trainRatio=trainRatio )

In [None]:
# Save the per-cell SEACell and leiden cluster assignments (indexed by cell name)
adataPG.obs[['SEACell','leiden']].to_csv( 'Metadata/Smed.muscle_subcluster_annotations.csv' )

In [None]:
# Release the S. mediterranea object before the next dataset is loaded
del adataPG

# Force a garbage collection pass; its return value is shown as the cell output
gc.collect()

# *S. mansoni* subclustering

In [None]:
# Read in the RNA data and transfer annotations
adataSG = sc.read_h5ad( 'GEXCounts/Sman/Sman.raw_rna_counts.h5ad' )
adataSG.var_names = [ g.replace('-','_') for g in adataSG.var_names ]
# Reading in the ATAC data too since it's a bit higher quality
adataSP = sc.read_h5ad( 'ArchROutputs/Sman/Sman.peaks.h5ad' )[adataSG.obs_names,:].copy()
adataSG.obs = adataSP.obs.copy()
sc.pp.normalize_per_cell( adataSG )
sc.pp.log1p( adataSG, base=2 )

adataSG = adataSG[adataSG.obs.GroupFigure.str.startswith('Muscle'),:].copy()
adataSG.obsm['X_PVI'] = pd.read_csv( 'scVI_models/Sman.peakvi_muscle_latent_dims.csv',
                                     index_col=0 ).loc[adataSG.obs_names,:].values
adataSG.obs['SEACell'] = pd.read_csv( 'SEACellsOutput/Sman.all_SEACell_assignments.csv', 
                                      index_col=0 ).loc[adataSG.obs_names,'SEACell']

adataSG

In [None]:
# Batch-correct the latent dimensions across samples with Harmony
sc.external.pp.harmony_integrate( adataSG, key='Sample', basis='X_PVI', adjusted_basis='X_Harmony' )
# Build the neighbour graph on the Harmony-corrected embedding so that the
# UMAP and leiden steps below actually use the corrected representation
sc.pp.neighbors( adataSG, use_rep='X_Harmony' )

In [None]:
# Compute the UMAP embedding (sc.tl.umap works from the neighbour graph
# stored in adataSG) and plot it coloured by the GroupFigure annotation
sc.tl.umap( adataSG, min_dist=0.05, spread=1.5 )
fig = plt.figure(figsize=(8,6))
sc.pl.umap( adataSG, color='GroupFigure', size=50, legend_loc='right margin', ax=fig.gca() )
plt.show()

In [None]:
# Run leiden clustering on the S. mansoni muscle cells and plot the
# resulting clusters on the UMAP
sc.tl.leiden( adataSG, resolution=0.45 )
fig = plt.figure(figsize=(8,6))
sc.pl.umap( adataSG, color='leiden', size=50, legend_loc='right margin', ax=fig.gca() )
plt.show()

In [None]:
# Using the peak accessibilities as predictors here instead of the
# RNA expression since this is what the embedding was based on,
# and the data quality is a bit higher than the RNA
# adataSP was subset to the RNA cells before the muscle filter, so it has to
# be restricted (and ordered) to the muscle cells in adataSG here; otherwise
# the predictor rows would not match the cluster labels
checkLogisticPredictability( adataSP[adataSG.obs_names,:].X, adataSG.obs.leiden.astype(int).values, trainRatio=trainRatio )

In [None]:
# Save the per-cell SEACell and leiden cluster assignments (indexed by cell name)
adataSG.obs[['SEACell','leiden']].to_csv( 'Metadata/Sman.muscle_subcluster_annotations.csv' )