**!!! This notebook must be run from within the SAMap docker container !!!**

Conceptually, nothing in this analysis needs to depend on SAM; it is simply where we started working on it, so everything is kept there for consistency.

In [None]:
# Install packages not present in SAMap by default
# NOTE(review): neither install is version-pinned, so the environment may drift
# between container rebuilds -- consider pinning once known-good versions are confirmed
! pip install logomaker
# This should fix an anndata compatibility issue
! pip install --upgrade scanpy

In [None]:
import gc
import os
import re
from multiprocessing import Pool

import anndata
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import statsmodels.api as sm
from matplotlib.backends.backend_pdf import PdfPages
from numba import njit, jit, prange
from samalg import SAM
from scipy import sparse, stats
from scipy.cluster.hierarchy import fcluster, linkage, dendrogram
from scipy.spatial.distance import squareform, pdist
from scipy.special import softmax
from scipy.stats import zscore
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, matthews_corrcoef
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Global plot styling: 12 pt base font, and 'retina' format for inline figures
matplotlib.rcParams.update({'font.size': 12})
%config InlineBackend.figure_format = 'retina'

# Utility functions

In [None]:
# Bootstrap confidence interval for a standard deviation estimate
# @param X: array of observations; each bootstrap draws X.size values
#           from it with np.random.choice
# @param nBS: number of bootstrap resamples
# @param alpha: two-sided significance level of the interval
# @return tuple of ( alpha/2 quantile, mean, 1-alpha/2 quantile ) of the
#         bootstrapped standard deviations
# NOTE(review): the resampling runs inside numba-compiled parallel code, which
# presumably keeps its own RNG state, so np.random.seed() called from regular
# Python may not make these draws reproducible -- confirm against the numba docs
@njit(parallel=True)
def bsStd( X, nBS=500, alpha=0.01 ):
    sds = np.zeros(nBS)
    for i in prange(nBS):
        sds[i] = np.random.choice(X,X.size).std()
    return ( np.quantile( sds, alpha/2 ),
             sds.mean(),
             np.quantile( sds, 1- alpha/2 ) )

# Quick check on standard-normal draws (the first call also triggers compilation)
bsStd( np.random.normal( size=500 ) )

In [None]:
import logomaker

def readMEME( fname ):
    # Parse the motifs of a MEME-format file into position frequency matrices
    # @param fname: path of the MEME file
    # @return tuple of ( dict mapping motif name -> float array built from the
    #                      first four columns of its matrix rows,
    #                    array of motif names in the order they were stored )
    matrices = {}
    order = []
    rows = []
    current = ''

    def _flush():
        # Store the rows gathered so far under the current motif name
        if rows:
            order.append(current)
            matrices[current] = np.array(rows).astype('float')
            rows.clear()

    with open( fname, 'r' ) as handle:
        for raw in handle:
            text = raw.strip()
            if not text:
                # A blank line ends any matrix in progress
                _flush()
            elif text.startswith('MOTIF'):
                current = text.split()[1]
            elif text[0].isdigit():
                # Matrix row: keep the first four columns
                rows.append( text.split()[:4] )
            else:
                # Any other line also ends a matrix in progress
                _flush()
        # The file may end without a trailing blank line
        _flush()
    return matrices, np.array(order)

# Add a pseudocount to every entry of a PFM, then renormalize each row to sum to 1
# @param pfm: (N,4) array of per-position base frequencies or counts
# @param pseudo: pseudocount; a zero entry ends up at exactly this fraction
# @return (N,4) array of row-normalized frequencies
@njit
def addPFMPseudoCount( pfm, pseudo ):
    rowTotals = pfm.sum(1)
    offset = (pseudo*rowTotals/(1-4*pseudo)).reshape(-1,1)
    shifted = pfm + offset
    return shifted / shifted.sum(1).reshape(-1,1)

addPFMPseudoCount( np.array([[1,0,0,0]]), 0.001 )

# Per-position, per-base term p*log2(p/background) of a PFM
# @param pfm: (N,4) array of base frequencies, columns ordered A, C, G, T
# @param bkgGC: background GC fraction; C and G each get bkgGC/2,
#               A and T each get 0.5 - bkgGC/2
# @return (N,4) array of the per-base terms
@jit(forceobj=True)
def relativeInfo( pfm, bkgGC=0.5 ):
    gcFreq = bkgGC / 2
    atFreq = 0.5 - gcFreq
    logBackground = np.log2(np.array([[atFreq,gcFreq,gcFreq,atFreq]]))
    # log2(0) = -inf is replaced by a large finite number so that 0*log2(0) -> 0
    logPfm = np.nan_to_num(np.log2(pfm))
    # Adding 0 turns any -0.0 entries into +0.0
    return (pfm*(logPfm-logBackground)) + 0

relativeInfo( np.array([[1,0,0,0],[0.25,0.25,0.25,0.25],[0.1,0.1,0.1,0.7]]) )

# Generate a logomaker Logo object from a PFM
# @param pfm: N x 4 numpy array, columns ordered A, C, G, T
# @param pseudo: pseudocount added to avoid log(0)
# @param bkgGC: background GC fraction passed on to relativeInfo
# @return the styled logomaker.Logo
def makeLogoFromPFM( pfm, pseudo=0.001, bkgGC=0.5 ):
    smoothed = addPFMPseudoCount(pfm,pseudo)
    heights = pd.DataFrame( data=relativeInfo( smoothed, bkgGC ),
                            columns=['A','C','G','T'] )
    logo = logomaker.Logo( heights )
    logo.ax.set_yticks([0,1,2])
    # Hide every spine first, then switch the left and bottom ones back on
    logo.style_spines(visible=False)
    logo.style_spines(spines=['left', 'bottom'], visible=True)
    return logo

# Load every motif PFM from the annotated motif-cluster MEME file
# and show how many motif names were read
pfms, motifNames = readMEME( 'motif_clustering_data/all_motif_clusters.annot.meme' )
motifNames.size

In [None]:
# Colormap with quadratically scaled Reds:
# 'Reds' is sampled at the squares of 100 evenly spaced positions in [0,1]
redsPositions = np.linspace(0,1,100)**(2)
custom_cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
                  'custom', plt.get_cmap('Reds')(redsPositions) )

# Load in and process the data

## *M. lignano*

In [None]:
# Load the Mlig SEACells deviation matrix
adataM = sc.read_h5ad( 'ChromVARDeviations/Mlig.SEACells_devs.h5ad' )

In [None]:
# Tag each motif with its origin: names starting with 'WC_' are
# labelled MODISCO, everything else JASPAR
adataM.var['Source'] = 'JASPAR'
adataM.var.loc[adataM.var_names.str.startswith('WC_'),'Source'] = 'MODISCO'
adataM.var

Process and save a SAM object

In [None]:
# Run SAM directly on the deviation matrix: gene filtering and
# normalization are switched off in the preprocessing call
samM = SAM( counts=adataM[:,:] )
samM.preprocess_data( filter_genes=False, sum_norm=None, norm=None, min_expression=-np.inf )
# NOTE(review): batch_key='Sample' presumably corrects for per-sample batch effects -- confirm against samalg
samM.run( weight_mode='rms', batch_key='Sample', k=5 )

In [None]:
# Cache the processed Mlig SAM object on disk
samM.save( 'ChromVARDeviations/Mlig.SEACells_devs_SAM.pkl' )

In [None]:
# Reload the cached Mlig SAM object from disk
samM = SAM()
samM.load( 'ChromVARDeviations/Mlig.SEACells_devs_SAM.pkl' )

In [None]:
# Mlig UMAP colored by sample
fig = plt.figure( figsize=(8,6) )
sc.pl.umap( samM.adata, ax=fig.gca(), color='Sample', size=50 )
plt.show()

In [None]:
# Mlig UMAP colored by the GroupFigure cluster annotation
fig = plt.figure( figsize=(8,6) )
sc.pl.umap( samM.adata, ax=fig.gca(), color='GroupFigure', size=50, legend_loc='right margin' )
plt.show()

In [None]:
# Dispersion ranking over the SAM nearest-neighbor graph
# NOTE(review): what save_avgs=True stores on the object is not visible here -- confirm against samalg
samM.dispersion_ranking_NN(save_avgs=True)

## *S. mediterranea*

In [None]:
# Load the Smed SEACells deviation matrix
adataP = sc.read_h5ad( 'ChromVARDeviations/Smed.SEACells_devs.h5ad' )

In [None]:
# Tag each motif with its origin: names starting with 'WC_' are
# labelled MODISCO, everything else JASPAR
adataP.var['Source'] = 'JASPAR'
adataP.var.loc[adataP.var_names.str.startswith('WC_'),'Source'] = 'MODISCO'
adataP.var

Process and save a SAM object

In [None]:
# Run SAM directly on the deviation matrix: gene filtering and
# normalization are switched off in the preprocessing call
samP = SAM( counts=adataP[:,:] )
samP.preprocess_data( filter_genes=False, sum_norm=None, norm=None, min_expression=-np.inf )
# NOTE(review): batch_key='Sample' presumably corrects for per-sample batch effects -- confirm against samalg
samP.run( weight_mode='rms', batch_key='Sample', k=5 )

In [None]:
# Cache the processed Smed SAM object on disk
samP.save( 'ChromVARDeviations/Smed.SEACells_devs_SAM.pkl' )

In [None]:
# Reload the cached Smed SAM object from disk
samP = SAM()
samP.load( 'ChromVARDeviations/Smed.SEACells_devs_SAM.pkl' )

In [None]:
# Smed UMAP colored by sample
fig = plt.figure( figsize=(8,6) )
sc.pl.umap( samP.adata, ax=fig.gca(), color='Sample', size=50 )
plt.show()

In [None]:
# Smed UMAP colored by the GroupFigure cluster annotation
fig = plt.figure( figsize=(8,6) )
sc.pl.umap( samP.adata, ax=fig.gca(), color='GroupFigure', size=50, legend_loc='right margin' )
plt.show()

In [None]:
# Dispersion ranking over the SAM nearest-neighbor graph
# NOTE(review): what save_avgs=True stores on the object is not visible here -- confirm against samalg
samP.dispersion_ranking_NN(save_avgs=True)

## *S. mansoni*

In [None]:
# Load the Sman SEACells deviation matrix
adataS = sc.read_h5ad( 'ChromVARDeviations/Sman.SEACells_devs.h5ad' )

In [None]:
# Tag each motif with its origin: names starting with 'WC_' are
# labelled MODISCO, everything else JASPAR
adataS.var['Source'] = 'JASPAR'
adataS.var.loc[adataS.var_names.str.startswith('WC_'),'Source'] = 'MODISCO'
adataS.var

Process and save a SAM object

In [None]:
# Run SAM directly on the deviation matrix: gene filtering and
# normalization are switched off in the preprocessing call
samS = SAM( counts=adataS[:,:] )
samS.preprocess_data( filter_genes=False, sum_norm=None, norm=None, min_expression=-np.inf )
# NOTE(review): batch_key='Sample' presumably corrects for per-sample batch effects -- confirm against samalg
samS.run( weight_mode='rms', batch_key='Sample', k=5 )

In [None]:
# Cache the processed Sman SAM object on disk
samS.save( 'ChromVARDeviations/Sman.SEACells_devs_SAM.pkl' )

In [None]:
# Reload the cached Sman SAM object from disk
samS = SAM()
samS.load( 'ChromVARDeviations/Sman.SEACells_devs_SAM.pkl' )

In [None]:
# Sman UMAP colored by sample
fig = plt.figure( figsize=(8,6) )
sc.pl.umap( samS.adata, ax=fig.gca(), color='Sample', size=50 )
plt.show()

In [None]:
# Sman UMAP colored by the GroupFigure cluster annotation
fig = plt.figure( figsize=(8,6) )
sc.pl.umap( samS.adata, ax=fig.gca(), color='GroupFigure', size=50, legend_loc='right margin' )
plt.show()

In [None]:
# Dispersion ranking over the SAM nearest-neighbor graph
# NOTE(review): what save_avgs=True stores on the object is not visible here -- confirm against samalg
samS.dispersion_ranking_NN(save_avgs=True)

# Check some basic stats

In [None]:
# Comparing SAM weights between species:
# one column of per-motif weights per species, joined on the motif index
speciesWeights = { 'weight_Mlig': samM.adata.var.weights,
                   'weight_Smed': samP.adata.var.weights,
                   'weight_Sman': samS.adata.var.weights }
weightDF = pd.concat( [w.to_frame() for w in speciesWeights.values()], axis=1 )
weightDF.columns = list(speciesWeights)
sns.pairplot( weightDF )
plt.show()

## Which motifs are highly variable

In [None]:
# Bootstrap the standard deviation of every motif's deviations across Mlig cells
# Rows of sdM follow the order returned by bsStd:
# lower CI bound, bootstrap mean, upper CI bound
sdM = np.zeros((3,samM.adata.n_vars))
# NOTE(review): bsStd draws inside numba-compiled code, which presumably is not
# affected by this NumPy seed -- confirm against the numba docs
np.random.seed(0)

# Densify the matrix once, rather than once per motif inside the loop
devsM = samM.adata.X.A
for i in tqdm(range(samM.adata.n_vars)):
    sdM[0,i], sdM[1,i], sdM[2,i] = bsStd( devsM[:,i].flatten() )

# Number of motifs whose lower CI bound exceeds 1
print( (sdM[0,:] > 1).sum() ) # 1171

In [None]:
# Bootstrap the standard deviation of every motif's deviations across Smed cells
# Rows of sdP follow the order returned by bsStd:
# lower CI bound, bootstrap mean, upper CI bound
sdP = np.zeros((3,samP.adata.n_vars))
# NOTE(review): bsStd draws inside numba-compiled code, which presumably is not
# affected by this NumPy seed -- confirm against the numba docs
np.random.seed(0)

# Densify the matrix once, rather than once per motif inside the loop
devsP = samP.adata.X.A
for i in tqdm(range(samP.adata.n_vars)):
    sdP[0,i], sdP[1,i], sdP[2,i] = bsStd( devsP[:,i].flatten() )

# Number of motifs whose lower CI bound exceeds 1
print( (sdP[0,:] > 1).sum() ) # 1152

In [None]:
# Bootstrap the standard deviation of every motif's deviations across Sman cells
# Rows of sdS follow the order returned by bsStd:
# lower CI bound, bootstrap mean, upper CI bound
sdS = np.zeros((3,samS.adata.n_vars))
# NOTE(review): bsStd draws inside numba-compiled code, which presumably is not
# affected by this NumPy seed -- confirm against the numba docs
np.random.seed(0)

# Densify the matrix once, rather than once per motif inside the loop
devsS = samS.adata.X.A
for i in tqdm(range(samS.adata.n_vars)):
    sdS[0,i], sdS[1,i], sdS[2,i] = bsStd( devsS[:,i].flatten() )

# Number of motifs whose lower CI bound exceeds 1
print( (sdS[0,:] > 1).sum() ) # 1120

In [None]:
# Motifs whose bootstrapped mean standard deviation (row 1) exceeds 1 in all three species
# NOTE(review): the per-species counts printed in the preceding cells threshold the
# lower CI bound (row 0), whereas this mask uses the bootstrap mean -- confirm which is intended
highlyVariable = (sdM[1,:] > 1) & (sdS[1,:] > 1) & (sdP[1,:] > 1)
highlyVariable.sum() # 1096

## Adding cell type family annotations

In [None]:
# Order to show the different families in
tOrder = np.array(['Epidermal','Germline','Intestine','Muscle','Neoblast',
                   'Neural','Protonephridia','Cathepsin','Parenchymal'])

# Get unique clusters for each species
uLabelsM = np.sort(samM.adata.obs.GroupFigure.unique())
# The '???-1' Mlig cluster is left out of the cluster list
uLabelsM = uLabelsM[~np.isin(uLabelsM,['???-1'])]
uLabelsS = np.sort(samS.adata.obs.GroupFigure.unique())
uLabelsP = np.sort(samP.adata.obs.GroupFigure.unique())

# Build a matrix of size (n_families,n_clusters) for
# mapping clusters to their families
# @param uLabels: array of cluster names (defines the columns)
# @param familyClusters: dict of family name -> list of member cluster names
# @return indicator matrix with 1 where the column's cluster belongs to the row's family;
#         clusters not listed under any family keep an all-zero column
def _buildFamilyMap( uLabels, familyClusters ):
    tMap = np.zeros((len(tOrder),len(uLabels)))
    for family, clusters in familyClusters.items():
        tMap[tOrder==family,np.isin(uLabels,clusters)] = 1
    return tMap

tMapM = _buildFamilyMap( uLabelsM, {
    'Epidermal': ['Epidermal-1','Epidermal-2','Epidermal Progenitors'],
    'Germline': ['GSC','Female Germline','Male Germline'],
    'Intestine': ['Intestine-1','Intestine-2'],
    'Muscle': ['Muscle'],
    'Neoblast': ['Neoblast'],
    'Neural': ['Neural-1','Neural-2','Neural-3','Neural-4',
               'Neural-5','Neural Progenitors'],
    'Protonephridia': ['Protonephridia'],
    'Cathepsin': ['Cathepsin'],
    'Parenchymal': ['Parenchymal-1','Parenchymal-2','Parenchymal-3',
                    'Parenchymal-4','Parenchymal-5'],
} )

# No Parenchymal members are listed for Sman, so that row stays all zero
tMapS = _buildFamilyMap( uLabelsS, {
    'Epidermal': ['Tegument-1','Tegument-2','Tegument Progenitors'],
    'Germline': ['GSC'],
    'Intestine': ['Intestine'],
    'Muscle': ['Muscle-1','Muscle-2','Muscle-3','Muscle Progenitors'],
    'Neoblast': ['Neoblast'],
    'Neural': ['Neural-1','Neural-2','Neural-3','Neural-4','Neural-5',
               'Neural-6','Neural Progenitors'],
    'Protonephridia': ['Protonephridia'],
    'Cathepsin': ['Cathepsin'],
} )

tMapP = _buildFamilyMap( uLabelsP, {
    'Epidermal': ['Epidermal-1','Epidermal-2'],
    'Germline': ['GSC','GSC progeny/diff germline'],
    'Intestine': ['Intestine-1','Intestine-2'],
    'Muscle': ['Muscle-1','Muscle-2','Muscle-3'],
    'Neoblast': ['Neoblast-1','Neoblast-2'],
    'Neural': ['Neural-1','Neural-2','Neural-3','Neural Progenitors'],
    'Protonephridia': ['Protonephridia'],
    'Cathepsin': ['Cathepsin'],
    'Parenchymal': ['Parenchymal'],
} )

# Random deviations plots

In [None]:
# Deviations for the germline bZIP motif

# One frame per species: cells labelled GSC vs Background, with the motif's
# deviations scaled to unit standard deviation within each species
perSpecies = []
for species, sam in (('Mlig',samM),('Smed',samP),('Sman',samS)):
    tmp = sam.adata.obs.GroupFigure.to_frame()
    tmp['GroupFigure'] = tmp.GroupFigure.astype('str')
    # Collapse every label containing 'GSC' to 'GSC', then the rest to 'Background'
    tmp.loc[tmp.GroupFigure.str.contains('GSC'),'GroupFigure'] = 'GSC'
    tmp.loc[~tmp.GroupFigure.str.contains('GSC'),'GroupFigure'] = 'Background'
    tmp['Acc'] = (sam.adata[:,'JC_0117|FOSL1(0.043)'].X.A)
    # Scale deviations to have the same standard deviations
    tmp.loc[:,'Acc'] = tmp.Acc / tmp.Acc.std()
    tmp['Species'] = species
    perSpecies.append( tmp )

df = pd.concat( perSpecies, axis=0 ).reset_index(drop=True)

# Identify points that are really far outside the
# interquartile range and plot these as outliers
# to help with smoothing the violins a bit
df['Outlier'] = False
qrScale = 1.5

for _, grp in df.groupby(['Species','GroupFigure']):
    lo, hi = np.quantile( grp.Acc, [0.25,0.75] )
    fence = qrScale*(hi - lo)
    df.loc[grp.index,'Outlier'] = (grp.Acc<lo-fence) | (grp.Acc>hi+fence)

# Violins for the inliers, diamond markers for the outliers, on the same axes
shared = dict( x='Species', y='Acc', hue='GroupFigure',
               hue_order=['GSC','Background'], order=['Mlig','Smed','Sman'] )
g = sns.catplot( data=df[~df.Outlier], kind='violin', aspect=1, legend=False, **shared )
sns.stripplot( data=df[df.Outlier], dodge=True, marker='D',
               ax=g.ax, jitter=0, size=2.5, **shared )
sns.despine( left=True, bottom=True )

# NOTE(review): the x axis shows species, yet it is labelled 'Cell type' -- confirm the intended label
g.ax.set_xlabel( 'Cell type' )
g.ax.set_ylabel( 'Motif Deviations' )
g.ax.axhline(0)
plt.savefig( 'Plots/EDFig5/PanelED5b.svg', format='svg' )
plt.show()

In [None]:
# Deviations for the NR4A-like motif in different schisto cell types

# One frame per motif: Sman cells labelled Muscle / Vitellocytes / Background
perMotif = []
for motifLabel, motifName in (('Canonical','JC_0057|NR4A1(0.322)'),
                              ('Novel','WC_1094|NR4A1(0.838)')):
    tmp = samS.adata.obs.GroupFigure.to_frame()
    tmp['GroupFigure'] = tmp.GroupFigure.astype('str')
    # Relabel in sequence: muscle clusters, vitellocytes, then everything else
    tmp.loc[tmp.GroupFigure.str.contains('Muscle'),'GroupFigure'] = 'Muscle'
    tmp.loc[tmp.GroupFigure.str.contains('Vitellocytes'),'GroupFigure'] = 'Vitellocytes'
    tmp.loc[~tmp.GroupFigure.str.contains('Muscle') &\
            ~tmp.GroupFigure.str.contains('Vitellocytes'),'GroupFigure'] = 'Background'
    tmp['Acc'] = (samS.adata.X[:,samS.adata.var_names==motifName].A)
    tmp['Motif'] = motifLabel
    tmp['Species'] = 'Sman'
    perMotif.append( tmp )

df = pd.concat( perMotif, axis=0 ).reset_index(drop=True)

# Same deal with identifying outliers
df['Outlier'] = False
qrScale = 1.5

for _, grp in df.groupby(['Motif','GroupFigure']):
    lo, hi = np.quantile( grp.Acc, [0.25,0.75] )
    fence = qrScale*(hi - lo)
    df.loc[grp.index,'Outlier'] = (grp.Acc<lo-fence) | (grp.Acc>hi+fence)

# Violins for the inliers, diamond markers for the outliers, on the same axes
shared = dict( x='GroupFigure', y='Acc', hue='Motif',
               hue_order=['Canonical','Novel'],
               order=['Muscle','Vitellocytes','Background'] )
g = sns.catplot( data=df[~df.Outlier], kind='violin', aspect=1, legend=False, **shared )
sns.stripplot( data=df[df.Outlier], dodge=True, marker='D',
               ax=g.ax, jitter=0, size=2.5, **shared )
sns.despine( left=True, bottom=True )

g.ax.set_xlabel( 'Tissue' )
g.ax.set_ylabel( 'Motif Deviations' )
g.ax.axhline(0)
plt.savefig( 'Plots/EDFig5/PanelED5d.svg', format='svg' )
plt.show()

# Comparing motif set predictive power

In [None]:
# Fit a classifier on one train/test split and score it per label
# with the Matthews correlation (one-vs-rest)
# @param inTup: tuple of ( initialized LogisticRegression model,
#                          predictor array used for training,
#                          training labels,
#                          predictor array used for testing,
#                          testing labels ), or None
# @return array of Matthews correlations, one per sorted unique label found in
#         the training and testing labels; labels absent from the test set
#         keep a value of 0. Returns -1 when inTup is None
def _trainCalcMatthews( inTup ):
    if inTup is None:
        return -1
    model, Xtr, Ytr, Xte, Yte = inTup
    # Train on the training split, predict the held-out split
    pred = model.fit( Xtr, Ytr ).predict( Xte )
    # Every label seen in either split gets a slot in the output
    uLabels = np.sort(np.unique(np.concatenate((Ytr,Yte))))
    M = np.zeros(uLabels.size)
    for i, label in enumerate(uLabels):
        if label in Yte:
            # Binarize truth and prediction against this label
            M[i] = matthews_corrcoef( Yte == label, pred == label )
    return M

# Train logistic regression models for one-vs-rest predictions
# of discrete labels over K unique train-test splits,
# return Matthews correlations for true and predicted labels
# corresponding to each split
# @param data: (N_observations,N_features) predictors matrix
# @param labels: (N_observations,) vector of discrete labels
# @param nSplit: integer number of train-test splits to try
# @param testRatio: fraction of the data to hold out for testing
# @param nProc: number of worker processes to run in parallel
# @param seed: seed for NumPy's global random state, set before the splits are drawn
# @param downsampleFeatures: number of randomly chosen features to feed into
#                            the models (ignored when None or larger than
#                            the number of available features)
# @return tuple of ( unique labels from labels,
#                    (nSplit,n_labels) array of Matthews correlation values,
#                    per-label Matthews correlations of a model trained and
#                      tested on the whole dataset; a value of 1 means the
#                      label is linearly separable within the provided
#                      feature space )
def calcKSplitsMatthewsPerClass( data, labels, nSplit=5, testRatio=0.2, nProc=1,
                                 seed=0, downsampleFeatures=None ):
    uLabels = np.sort(np.unique(labels))

    # First, check whether data are linearly separable
    # Do this by training and testing on the whole dataset
    print( 'Calculating linear separability' )
    L = _trainCalcMatthews( ( LogisticRegression( penalty='l2', C=1e5, n_jobs=nProc ),
                              data, labels, data, labels ) )

    # Next, do train/test splits to get an idea of consistency
    np.random.seed( seed )
    inputs = []
    print( 'Preparing splits' )
    for i in tqdm( range(nSplit) ):
        X = data
        # Downsample the features, if desired
        if downsampleFeatures is not None and \
           downsampleFeatures <= data.shape[1]:
            dsInd = np.random.choice( data.shape[1], downsampleFeatures, replace=False )
            X = X[:,dsInd]
        # Pick a random, label-stratified train-test split
        Xtr, Xte, Ytr, Yte = train_test_split( X, labels,
                                               test_size=testRatio,
                                               stratify=labels )
        # Prepare the input tuple for this train-test split
        inputs.append( ( LogisticRegression( penalty='l2', C=1e5 ),
                         Xtr, Ytr, Xte, Yte ) )
    print( 'Calculating per-class consistency' )
    # Do the training and calculate performance metrics in parallel
    # (Pool comes from multiprocessing, imported with the other dependencies)
    with Pool(processes=nProc) as p:
        M = np.array(p.map(_trainCalcMatthews, inputs))
    print( 'Done' )

    return ( uLabels, M, L )

In [None]:
# Settings shared by all of the predictive-power runs below:
# number of train-test splits and fraction of cells held out for testing
nSplit = 10
testRatio = 0.3
# Long-format results table; each run appends its rows to this frame
matthewsDF = pd.DataFrame( columns=['Species','CellType','Tissue','Features','Matthews'] )

matthewsDF

## Looking at the complete motif set

In [None]:
# Run for Mlig (all motifs)
cts, M, L = calcKSplitsMatthewsPerClass( samM.adata.X.A, samM.adata.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )
print( 'Non linear-separable celltypes:' )
notSeparable = L < 1
print( list( zip( cts[notSeparable], L[notSeparable] ) ) )

# Flatten cell-type-major so each cell type's nSplit scores line up with its repeated label
M = M.T.flatten()
sp = ['Mlig']*M.size
ct = np.concatenate([[c]*nSplit for c in cts])
t = ['']*M.size
f = ['MotifDevs']*M.size

newRows = pd.DataFrame( {'Species': sp, 'CellType': ct, 'Tissue': t,
                         'Features': f, 'Matthews': M} )
matthewsDF = pd.concat( (matthewsDF,newRows), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Run for Smed (all motifs)
cts, M, L = calcKSplitsMatthewsPerClass( samP.adata.X.A, samP.adata.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )
print( 'Non linear-separable celltypes:' )
notSeparable = L < 1
print( list( zip( cts[notSeparable], L[notSeparable] ) ) )

# Flatten cell-type-major so each cell type's nSplit scores line up with its repeated label
M = M.T.flatten()
sp = ['Smed']*M.size
ct = np.concatenate([[c]*nSplit for c in cts])
t = ['']*M.size
f = ['MotifDevs']*M.size

newRows = pd.DataFrame( {'Species': sp, 'CellType': ct, 'Tissue': t,
                         'Features': f, 'Matthews': M} )
matthewsDF = pd.concat( (matthewsDF,newRows), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Run for Sman (all motifs)
cts, M, L = calcKSplitsMatthewsPerClass( samS.adata.X.A, samS.adata.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )
print( 'Non linear-separable celltypes:' )
notSeparable = L < 1
print( list( zip( cts[notSeparable], L[notSeparable] ) ) )

# Flatten cell-type-major so each cell type's nSplit scores line up with its repeated label
M = M.T.flatten()
sp = ['Sman']*M.size
ct = np.concatenate([[c]*nSplit for c in cts])
t = ['']*M.size
f = ['MotifDevs']*M.size

newRows = pd.DataFrame( {'Species': sp, 'CellType': ct, 'Tissue': t,
                         'Features': f, 'Matthews': M} )
matthewsDF = pd.concat( (matthewsDF,newRows), axis=0 ).reset_index(drop=True)
matthewsDF

## Repeat with the JC motifs only

In [None]:
# Run for Mlig (JASPAR motifs only)
cts, M, L = calcKSplitsMatthewsPerClass( samM.adata[:,samM.adata.var.Source=='JASPAR'].X.A,
                                         samM.adata.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )
print( 'Non linear-separable celltypes:' )
notSeparable = L < 1
print( list( zip( cts[notSeparable], L[notSeparable] ) ) )

# Flatten cell-type-major so each cell type's nSplit scores line up with its repeated label
M = M.T.flatten()
sp = ['Mlig']*M.size
ct = np.concatenate([[c]*nSplit for c in cts])
t = ['']*M.size
f = ['MotifDevsJASPAR']*M.size

newRows = pd.DataFrame( {'Species': sp, 'CellType': ct, 'Tissue': t,
                         'Features': f, 'Matthews': M} )
matthewsDF = pd.concat( (matthewsDF,newRows), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Run for Smed (JASPAR motifs only)
cts, M, L = calcKSplitsMatthewsPerClass( samP.adata[:,samP.adata.var.Source=='JASPAR'].X.A,
                                         samP.adata.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )
print( 'Non linear-separable celltypes:' )
notSeparable = L < 1
print( list( zip( cts[notSeparable], L[notSeparable] ) ) )

# Flatten cell-type-major so each cell type's nSplit scores line up with its repeated label
M = M.T.flatten()
sp = ['Smed']*M.size
ct = np.concatenate([[c]*nSplit for c in cts])
t = ['']*M.size
f = ['MotifDevsJASPAR']*M.size

newRows = pd.DataFrame( {'Species': sp, 'CellType': ct, 'Tissue': t,
                         'Features': f, 'Matthews': M} )
matthewsDF = pd.concat( (matthewsDF,newRows), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Run for Sman (JASPAR motifs only)
cts, M, L = calcKSplitsMatthewsPerClass( samS.adata[:,samS.adata.var.Source=='JASPAR'].X.A,
                                         samS.adata.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )
print( 'Non linear-separable celltypes:' )
notSeparable = L < 1
print( list( zip( cts[notSeparable], L[notSeparable] ) ) )

# Flatten cell-type-major so each cell type's nSplit scores line up with its repeated label
M = M.T.flatten()
sp = ['Sman']*M.size
ct = np.concatenate([[c]*nSplit for c in cts])
t = ['']*M.size
f = ['MotifDevsJASPAR']*M.size

newRows = pd.DataFrame( {'Species': sp, 'CellType': ct, 'Tissue': t,
                         'Features': f, 'Matthews': M} )
matthewsDF = pd.concat( (matthewsDF,newRows), axis=0 ).reset_index(drop=True)
matthewsDF

## Repeat with the WC motifs only

In [None]:
# Run for Mlig (MODISCO motifs only)
cts, M, L = calcKSplitsMatthewsPerClass( samM.adata[:,samM.adata.var.Source=='MODISCO'].X.A,
                                         samM.adata.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )
print( 'Non linear-separable celltypes:' )
notSeparable = L < 1
print( list( zip( cts[notSeparable], L[notSeparable] ) ) )

# Flatten cell-type-major so each cell type's nSplit scores line up with its repeated label
M = M.T.flatten()
sp = ['Mlig']*M.size
ct = np.concatenate([[c]*nSplit for c in cts])
t = ['']*M.size
f = ['MotifDevsMODISCO']*M.size

newRows = pd.DataFrame( {'Species': sp, 'CellType': ct, 'Tissue': t,
                         'Features': f, 'Matthews': M} )
matthewsDF = pd.concat( (matthewsDF,newRows), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Run for Smed (MODISCO motifs only)
cts, M, L = calcKSplitsMatthewsPerClass( samP.adata[:,samP.adata.var.Source=='MODISCO'].X.A,
                                         samP.adata.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )
print( 'Non linear-separable celltypes:' )
notSeparable = L < 1
print( list( zip( cts[notSeparable], L[notSeparable] ) ) )

# Flatten cell-type-major so each cell type's nSplit scores line up with its repeated label
M = M.T.flatten()
sp = ['Smed']*M.size
ct = np.concatenate([[c]*nSplit for c in cts])
t = ['']*M.size
f = ['MotifDevsMODISCO']*M.size

newRows = pd.DataFrame( {'Species': sp, 'CellType': ct, 'Tissue': t,
                         'Features': f, 'Matthews': M} )
matthewsDF = pd.concat( (matthewsDF,newRows), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Run for Sman (MODISCO motifs only)
cts, M, L = calcKSplitsMatthewsPerClass( samS.adata[:,samS.adata.var.Source=='MODISCO'].X.A,
                                         samS.adata.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )
print( 'Non linear-separable celltypes:' )
notSeparable = L < 1
print( list( zip( cts[notSeparable], L[notSeparable] ) ) )

# Flatten cell-type-major so each cell type's nSplit scores line up with its repeated label
M = M.T.flatten()
sp = ['Sman']*M.size
ct = np.concatenate([[c]*nSplit for c in cts])
t = ['']*M.size
f = ['MotifDevsMODISCO']*M.size

newRows = pd.DataFrame( {'Species': sp, 'CellType': ct, 'Tissue': t,
                         'Features': f, 'Matthews': M} )
matthewsDF = pd.concat( (matthewsDF,newRows), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Compare the three motif feature sets: one panel per species,
# mean Matthews correlation per cell type with standard-deviation error bars
featureSets = ['MotifDevs','MotifDevsJASPAR','MotifDevsMODISCO']
plotDF = matthewsDF[matthewsDF.Features.isin(featureSets)]
g = sns.catplot( data=plotDF,
                 col='Species', x='Matthews', y='CellType', hue='Features',
                 sharey=False, sharex=True, kind='point', aspect=0.4, height=10,
                 errorbar='sd', linestyle='none', dodge=False )
# The x axis is shared, so it is configured through the first panel
firstAx = g.axes[0,0]
firstAx.set_xlim(-0.1,1.1)
firstAx.set_xticks([0,0.5,1])
plt.savefig( 'Plots/EDFig5/PanelED5g.svg', format='svg' )
plt.show()

# Marker motif comparison

In [None]:
# Thresholds for the marker-motif calls in the cells below
# Minimum per-cluster median deviation for a motif to count as 'high'
minResponse = 1
# NOTE(review): not referenced by the marker cells visible here -- confirm where it is used
nBS = 5000
# Cutoff applied to the Benjamini-Hochberg adjusted p-values
alpha = 0.1

## Identify markers per-cluster

In [None]:
# Per-cluster marker motifs for Mlig
X = samM.adata.X.A
labels = np.array(samM.adata.obs.GroupFigure.values)
nL = len(uLabelsM)
# Median deviations per cluster
groupedM = np.zeros( (nL,X.shape[1]) )
# Adjusted p-values of the one-sided Mann-Whitney U tests (cluster vs. all other cells)
pM = np.zeros_like( groupedM )
# Difference between 'high' and 'low' group accessibility
deltasM = np.zeros( X.shape[1] )

# Loop through unique cluster labels
for i, l in tqdm(enumerate(uLabelsM)):
    inCluster = labels == l
    fg = X[inCluster,:]
    bg = X[~inCluster,:]
    groupedM[i,:] = np.median(fg,axis=0)
    # Run the DA tests, store the adjusted p-values
    res = stats.mannwhitneyu( fg, bg, axis=0, alternative='greater' )
    pM[i,:] = sm.stats.multipletests( res.pvalue, alpha=alpha,
                                      method='fdr_bh' )[1]

# Binarized cluster accessibility: significant AND above the response threshold
binnedM = ((pM < alpha)&(groupedM > minResponse)).astype(int)

# Calculate differences in accessibility for
# the 'high' and 'low' groups
for i in tqdm(range(X.shape[1])):
    x = X[:,i].flatten()
    # Cells belonging to any cluster in which this motif is called 'high'
    on = np.isin(labels,uLabelsM[binnedM[:,i].flatten()==1])
    if on.sum() > 0:
        deltasM[i] = np.median(x[on]) - np.median(x[~on])

# Sorted deltas, then the 20 motifs with the largest deltas
plt.plot( np.sort( deltasM )[::-1] )
plt.show()
samM.adata.var_names[np.argsort(deltasM)[::-1]][:20]

In [None]:
# Same deal for Smed: per-cluster marker motifs
X = samP.adata.X.A
labels = np.array(samP.adata.obs.GroupFigure.values)
nL = len(uLabelsP)
# Median deviations per cluster
groupedP = np.zeros( (nL,X.shape[1]) )
# Adjusted p-values of the one-sided Mann-Whitney U tests (cluster vs. all other cells)
pP = np.zeros_like( groupedP )
# Difference between 'high' and 'low' group accessibility
deltasP = np.zeros( X.shape[1] )

# Loop through unique cluster labels
for i, l in tqdm(enumerate(uLabelsP)):

    fg = X[labels==l,:]
    groupedP[i,:] = np.median(fg,axis=0)

    bg = X[labels!=l,:]
    # Run the DA tests, store the adjusted p-values
    res = stats.mannwhitneyu( fg, bg, axis=0, alternative='greater' )
    _, pP[i,:], _, _ = sm.stats.multipletests( res.pvalue, alpha=alpha,
                                               method='fdr_bh' )

# Binarized cluster accessibility: significant AND above the response threshold
binnedP = ((pP < alpha)&(groupedP > minResponse)).astype(int)

# Calculate differences in accessibility for
# the 'high' and 'low' groups
for i in tqdm(range(X.shape[1])):
    x = X[:,i].flatten()
    # Cells belonging to any cluster in which this motif is called 'high'
    on = np.isin(labels,uLabelsP[binnedP[:,i].flatten()==1])
    if on.sum() > 0:
        # Median of 'high' minus median of 'low', matching the Mlig calculation
        # (this previously subtracted the mean of the 'low' group)
        deltasP[i] = np.median(x[on]) - np.median(x[~on])

# Sorted deltas, then the 20 motifs with the largest deltas
plt.plot( np.sort( deltasP )[::-1] )
plt.show()
samP.adata.var_names[np.argsort(deltasP)[::-1]][:20]

In [None]:
# Same deal for Sman: per-cluster marker motifs
X = samS.adata.X.A
labels = np.array(samS.adata.obs.GroupFigure.values)
nL = len(uLabelsS)
# Median deviations per cluster
groupedS = np.zeros( (nL,X.shape[1]) )
# Adjusted p-values of the one-sided Mann-Whitney U tests (cluster vs. all other cells)
pS = np.zeros_like( groupedS )
# Difference between 'high' and 'low' group accessibility
deltasS = np.zeros( X.shape[1] )

# Loop through unique cluster labels
for i, l in tqdm(enumerate(uLabelsS)):

    fg = X[labels==l,:]
    groupedS[i,:] = np.median(fg,axis=0)

    bg = X[labels!=l,:]
    # Run the DA tests, store the adjusted p-values
    res = stats.mannwhitneyu( fg, bg, axis=0, alternative='greater' )
    _, pS[i,:], _, _ = sm.stats.multipletests( res.pvalue, alpha=alpha,
                                               method='fdr_bh' )

# Binarized cluster accessibility: significant AND above the response threshold
binnedS = ((pS < alpha)&(groupedS > minResponse)).astype(int)

# Calculate differences in accessibility for
# the 'high' and 'low' groups
for i in tqdm(range(X.shape[1])):
    x = X[:,i].flatten()
    # Cells belonging to any cluster in which this motif is called 'high'
    on = np.isin(labels,uLabelsS[binnedS[:,i].flatten()==1])
    if on.sum() > 0:
        # Median of 'high' minus median of 'low', matching the Mlig calculation
        # (this previously subtracted the mean of the 'low' group)
        deltasS[i] = np.median(x[on]) - np.median(x[~on])

# Sorted deltas, then the 20 motifs with the largest deltas
plt.plot( np.sort( deltasS )[::-1] )
plt.show()
samS.adata.var_names[np.argsort(deltasS)[::-1]][:20]

In [None]:
# Save the Mlig binarized calls, with cluster labels as obs
# and motif names as var
(
    anndata.AnnData(
        X=binnedM,
        obs=pd.DataFrame( index=uLabelsM ),
        var=pd.DataFrame( index=samM.adata.var_names ),
    )
    .write_h5ad( 'ChromVARDeviations/Mlig.cluster_binarized_devs.h5ad' )
)

In [None]:
# Save the Smed binarized calls, with cluster labels as obs
# and motif names as var
(
    anndata.AnnData(
        X=binnedP,
        obs=pd.DataFrame( index=uLabelsP ),
        var=pd.DataFrame( index=samP.adata.var_names ),
    )
    .write_h5ad( 'ChromVARDeviations/Smed.cluster_binarized_devs.h5ad' )
)

In [None]:
# Save the Sman binarized calls, with cluster labels as obs
# and motif names as var
(
    anndata.AnnData(
        X=binnedS,
        obs=pd.DataFrame( index=uLabelsS ),
        var=pd.DataFrame( index=samS.adata.var_names ),
    )
    .write_h5ad( 'ChromVARDeviations/Sman.cluster_binarized_devs.h5ad' )
)

## Aggregating over families

In [None]:
# Propagating the binarization
# Take the or over each cell type family
# to get the aggregate binarized accessibility
# (tMap* @ binned* counts the 'on' clusters per family; >0 turns that into an OR)
tBinnedM = tMapM @ binnedM
tBinnedM = (tBinnedM>0).astype(int)

tBinnedP = tMapP @ binnedP
tBinnedP = (tBinnedP>0).astype(int)

tBinnedS = tMapS @ binnedS
tBinnedS = (tBinnedS>0).astype(int)

# Per-motif deltas, one column per species, in the same species order
# as tBinned below (Mlig, Sman, Smed). The third column previously used
# `tmp`, which is not defined by this cell; deltasP is the Smed vector.
deltas = np.concatenate((deltasM[:,None],deltasS[:,None],deltasP[:,None]),axis=1)

# Concatenate the three species
tBinned = np.concatenate((tBinnedM[:,:,None],tBinnedS[:,:,None],tBinnedP[:,:,None]),axis=2)
# Number of motifs that show up in at least one
# of the same families across all three 
print( ((tBinned.sum(2)==3).any(0)).sum() ) # 534

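Here we take, for each family and motif, the average of the per-cluster group values (`grouped*`) over the clusters in that family that are considered 'high' for the motif. If no cluster in the family is 'high', we average over all of the family's clusters instead.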

In [None]:
# Family-level values for Mlig: for each family (row of tMapM) and motif,
# average groupedM over the member clusters where the motif is 'on';
# when no member cluster is 'on', average over all member clusters.
tMeansM = np.zeros((len(tOrder),samM.adata.n_vars))
on = binnedM

for i in range(len(tOrder)):
    ind = tMapM[i,:].flatten()
    # Families with no clusters in this species keep a row of zeros
    if ind.sum() == 0:
        continue
    onT = on[ind==1,:]
    # Hoisted: computed once per family instead of once per motif
    anyOn = onT.any(0)
    for j in range(samM.adata.n_vars):
        if not anyOn[j]:
            tMeansM[i,j] = groupedM[(ind==1),j].mean()
            continue
        tMeansM[i,j] = groupedM[(ind==1)&on[:,j].astype(bool).flatten(),j].mean()

In [None]:
# Family-level values for Smed: for each family (row of tMapP) and motif,
# average groupedP over the member clusters where the motif is 'on';
# when no member cluster is 'on', average over all member clusters.
tMeansP = np.zeros((len(tOrder),samP.adata.n_vars))
on = binnedP

for i in range(len(tOrder)):
    ind = tMapP[i,:].flatten()
    # Families with no clusters in this species keep a row of zeros
    if ind.sum() == 0:
        continue
    onT = on[ind==1,:]
    # Hoisted: computed once per family instead of once per motif
    anyOn = onT.any(0)
    for j in range(samP.adata.n_vars):
        if not anyOn[j]:
            tMeansP[i,j] = groupedP[(ind==1),j].mean()
            continue
        tMeansP[i,j] = groupedP[(ind==1)&on[:,j].astype(bool).flatten(),j].mean()

In [None]:
# Family-level values for Sman: for each family (row of tMapS) and motif,
# average groupedS over the member clusters where the motif is 'on';
# when no member cluster is 'on', average over all member clusters.
tMeansS = np.zeros((len(tOrder),samS.adata.n_vars))
on = binnedS

for i in range(len(tOrder)):
    ind = tMapS[i,:].flatten()
    # Families with no clusters in this species keep a row of zeros
    if ind.sum() == 0:
        continue
    onT = on[ind==1,:]
    # Hoisted: computed once per family instead of once per motif
    anyOn = onT.any(0)
    for j in range(samS.adata.n_vars):
        if not anyOn[j]:
            tMeansS[i,j] = groupedS[(ind==1),j].mean()
            continue
        tMeansS[i,j] = groupedS[(ind==1)&on[:,j].astype(bool).flatten(),j].mean()

Combine these family-aggregated data into a dataframe that's a bit easier to manipulate. Here we also do the min-max scaling per species to make it easier to compare between species

In [None]:
# Families excluded from the cross-species table
dropTissues = ['Germline','Parenchymal']
keepT = ~np.isin(tOrder,dropTissues)

# --- Mlig ---
tmp = tMeansM.copy()[keepT,:]
# NOTE(review): the minimum is taken over ALL families (including the dropped
# ones) while the maximum is taken over the kept ones only, so the scaled
# minimum is not necessarily 0 -- confirm this is intended
tmp = tmp - tMeansM.min(0)
maxes = tmp.max(0)
maxes[maxes==0] = 1   # avoid dividing by zero for motifs with zero range
tmp = tmp / maxes
aggTMeans = pd.DataFrame( data=tmp, index=tOrder[keepT], columns=samM.adata.var_names )
aggTMeans = aggTMeans.rename_axis( index='Tissue' )
aggTMeans['Species'] = 'Mlig'
aggTMeans = aggTMeans.set_index('Species',append=True)

# --- Smed ---
tmp = tMeansP.copy()[keepT,:]
tmp = tmp - tMeansP.min(0)
maxes = tmp.max(0)
maxes[maxes==0] = 1
tmp = tmp / maxes
# Label the columns with this species' own motif names, as done for Mlig and
# Sman (previously samM's names were used here). The row-wise concat below
# aligns on column names, so the three species must share them anyway.
tmp = pd.DataFrame( data=tmp, index=tOrder[keepT], columns=samP.adata.var_names )
tmp = tmp.rename_axis( index='Tissue' )
tmp['Species'] = 'Smed'
tmp = tmp.set_index('Species',append=True)
aggTMeans = pd.concat( (aggTMeans,tmp), axis=0 )

# --- Sman ---
tmp = tMeansS.copy()[keepT,:]
tmp = tmp - tMeansS.min(0)
maxes = tmp.max(0)
maxes[maxes==0] = 1
tmp = tmp / maxes
tmp = pd.DataFrame( data=tmp, index=tOrder[keepT], columns=samS.adata.var_names )
tmp = tmp.rename_axis( index='Tissue' )
tmp['Species'] = 'Sman'
tmp = tmp.set_index('Species',append=True)
aggTMeans = pd.concat( (aggTMeans,tmp), axis=0 )

# Group the rows by family while keeping the species order within each family
aggTMeans = aggTMeans.sort_index(level='Tissue',sort_remaining=False)
# Show only the motifs that are 'on' in the same family in all three species
aggTMeans.iloc[:,(tBinned.sum(2)==3).any(0)] # 534

In [None]:
# Save the family-aggregated table (rows indexed by Tissue and Species,
# one column per motif) for use outside the notebook
aggTMeans.to_csv( 'ChromVARDeviations/family_avg_motif_devs_minmax.csv' )

In [None]:
# Find markers of the same family in all 3 species
markerMask = (tBinned.sum(2)==3)
# Motifs that are a conserved marker of at least one family
markerCols = markerMask.any(0)
# Do an unconstrained clustering on these initially, 
# just to try to get it to look a little cleaner
g = sns.clustermap( aggTMeans.iloc[:,markerCols], 
                    row_cluster=False, center=0, cmap='Reds', robust=True )
plt.close('all')
# Sort the data based on the clustering
dInd = np.array(g.dendrogram_col.reordered_ind)
# Reorder so we get blocks of families and their assigned motifs.
# Each motif is assigned to the family whose minimum across species is largest.
# dInd holds column POSITIONS, so select with .iloc: plain [] on a
# label-indexed Series only works through deprecated positional fallback.
srtT = aggTMeans.groupby('Tissue').min().idxmax(0)[markerCols].iloc[dInd]
srtInd = np.concatenate( [dInd[(srtT==t).values] for t in tOrder] )
# Plot again with this order
sns.clustermap( aggTMeans.iloc[:,markerCols].iloc[:,srtInd], 
                row_cluster=False, col_cluster=False, 
                cmap=custom_cmap, robust=False, figsize=(12,12) )
plt.savefig( 'Plots/Fig2/Panel2e.svg', format='svg' )
plt.show()

In [None]:
# Number of motifs per family that are 'on' in all three species
[ (t, n) for t, n in zip( tOrder, (tBinned.sum(2)==3).sum(1) ) ]

In [None]:
# Per family: motifs 'on' in all three species divided by
# motifs 'on' in at least one species
# These values are plotted in panel 2f
nConserved = (tBinned.sum(2)==3).sum(1)
nAnySpecies = tBinned.any(2).sum(1)
consRates = nConserved / nAnySpecies
list(zip(tOrder,consRates))

In [None]:
# Horizontal bar chart of the conservation rate per family (panel 2f)
df = pd.DataFrame( {'Rate': consRates}, index=tOrder )

fig, ax = plt.subplots(figsize=(3, 4))
df.plot.barh( legend=False, ax=ax, width=0.5 )
ax.set_yticklabels( df.index, rotation=0, ha='right' )

plt.tight_layout()
plt.savefig("Plots/Fig2/Panel2f.svg",format='svg')
plt.show()

In [None]:
# Fraction of all motifs whose Source is 'MODISCO' (the WC motifs)
nWC = (samM.adata.var.Source=='MODISCO').sum()
nWC / samM.adata.n_vars
# 0.6619385342789598

In [None]:
# For each family, the share of its conserved motifs that are WC
isConserved = tBinned.sum(2)==3
isWC = (samM.adata.var.Source.values=='MODISCO')[None,:]
wcFracs = (isConserved&isWC).sum(1) / isConserved.sum(1)
list(zip(tOrder,wcFracs))

In [None]:
# Horizontal bar chart of the WC fraction among conserved motifs
# per family (panel 2g)
fig, ax = plt.subplots(figsize=(3, 4))

df = pd.DataFrame( index=tOrder, data=wcFracs[:,None], columns=['Fraction'] )
df.plot(kind='barh', legend=False, ax=ax, width=0.5)

# Reference line: fraction of ALL motifs that are WC (~0.662). Computed from
# the data rather than hand-copied so it cannot drift if the motif set changes.
constant_value = (samM.adata.var.Source=='MODISCO').sum() / samM.adata.n_vars
ax.axvline(constant_value, color='black', linestyle='--', linewidth=1)

ax.set_yticklabels(df.index, rotation=0, ha='right')

plt.tight_layout()
plt.savefig("Plots/Fig2/Panel2g.svg",format='svg')
plt.show()

In [None]:
# Build one long table of the conserved motifs of every family, ranked
# within each family by the minimum scaled accessibility across species
consMask = tBinned.sum(2) == 3   # hoisted: the same for every family
perTissue = []
# .min() with defaults: the previous positional 0 was being passed as numeric_only
for t, row in aggTMeans.groupby('Tissue').min().iterrows():
    # Motifs 'on' for this family in all three species
    mask = consMask[tOrder==t,:].flatten()
    cons = row[mask].to_frame().reset_index(drop=False)
    cons.index.name = None
    cons.columns = [ 'Motif', 'MinScaledAcc' ]
    cons['Tissue'] = t
    cons = cons.sort_values('MinScaledAcc',ascending=False).reset_index(drop=True)
    perTissue.append( cons )

# Concatenate once instead of growing the frame inside the loop
outDf = pd.concat( perTissue, axis=0 ) if len(perTissue) > 0 else pd.DataFrame()
outDf = outDf.reset_index(drop=True)
# This is essentially Supplementary Table 3
outDf.to_csv( 'ChromVARDeviations/all_family_conserved_motifs.csv' )
outDf

In [None]:
# Number of motifs conserved (on in all three species)
# in both the Neural and the Muscle family
consAll = tBinned.sum(2)==3
consNeural = consAll[tOrder=='Neural',:].flatten()
consMuscle = consAll[tOrder=='Muscle',:].flatten()
samM.adata.var_names[consNeural&consMuscle].size
# 54

## Comparing cluster-average accessibilities

In [None]:
# For each species, z-score the data 
# and average over each cluster.
# The z-scored matrix does not depend on the cluster, so it is computed once
# per species rather than once per cluster.
labels = samM.adata.obs.GroupFigure.values
zAll = zscore(samM.adata.X.A,axis=0)
Zm = np.zeros( (uLabelsM.size,samM.adata.n_vars))
for i, label in enumerate(uLabelsM):
    Zm[i,:] = zAll[labels==label,:].mean(0)
    
labels = samS.adata.obs.GroupFigure.values
zAll = zscore(samS.adata.X.A,axis=0)
Zs = np.zeros( (uLabelsS.size,samS.adata.n_vars))
for i, label in enumerate(uLabelsS):
    Zs[i,:] = zAll[labels==label,:].mean(0)
    
labels = samP.adata.obs.GroupFigure.values
zAll = zscore(samP.adata.X.A,axis=0)
Zp = np.zeros( (uLabelsP.size,samP.adata.n_vars))
for i, label in enumerate(uLabelsP):
    Zp[i,:] = zAll[labels==label,:].mean(0)

# Release the last dense z-scored matrix
del zAll
    
# Rows are stacked in the order Mlig, Sman, Smed
stackedMeans = np.concatenate((Zm,Zs,Zp), axis=0)

# Cluster-by-cluster correlation of the mean z-score profiles
crossCorrs = np.corrcoef( stackedMeans )
# Average it out since numpy seems to
# have some numerical imprecision
crossCorrs = ( crossCorrs + crossCorrs.T ) / 2
np.fill_diagonal( crossCorrs, 1 )
crossCorrs.shape

In [None]:
# Average-linkage hierarchical clustering on 1 - correlation
Y = squareform( 1-crossCorrs )
Z = linkage( Y, method='average' )
# Swap the two children of a few merges so that
# the groups are more obvious in the plot
for branch in (-2, -7, -14, -16):
    Z[branch,[0,1]] = Z[branch,[1,0]]
g = sns.clustermap( crossCorrs, row_linkage=Z, col_linkage=Z, center=0 )
# Prefix every cluster label with its species, in the same
# order the rows were stacked
speciesLabels = ( ('Ml',uLabelsM), ('Sm',uLabelsS), ('Pl',uLabelsP) )
allLabels = np.concatenate( [ ['{0}_{1}'.format(prefix,l) for l in labs]
                              for prefix, labs in speciesLabels ] )
tickPos = np.arange(crossCorrs.shape[0])+0.5
g.ax_heatmap.set_yticks( tickPos )
g.ax_heatmap.set_yticklabels( allLabels[g.dendrogram_row.reordered_ind], rotation=0 ) 
plt.savefig( 'Plots/Fig3/Panel3a.svg', format='svg' )
plt.show()

## Checking *S. mansoni* accessibility of parenchymal motifs

In [None]:
# Motifs that are 'on' in at least one of the five Mlig parenchymal
# clusters and also 'on' in the Smed 'Parenchymal' cluster
parenLabelsM = [ 'Parenchymal-{0}'.format(k) for k in range(1,6) ]
onInMlig = binnedM[np.isin(uLabelsM,parenLabelsM),:].any(0)
onInSmed = binnedP[uLabelsP=='Parenchymal',:].flatten()==1
parenMask = onInMlig & onInSmed
print( parenMask.sum() ) # 126
for m in samM.adata.var_names[parenMask]:
    print( m ) # See Supplementary Table 4

In [None]:
# For every motif, the ';'-joined list of families
# in which it is 'on' in Sman
outStrs = np.array( [ ';'.join( tOrder[tBinnedS[:,i].flatten()==1] )
                      for i in range( len(samS.adata.var_names) ) ] )
# Print the entries for the parenchymal motifs
# to copy into Supplementary Table 4
for o in outStrs[parenMask]:
    print( o )

In [None]:
# Family x motif mask of markers that are 'on' in all 3 species
markerMask = tBinned.sum(axis=2) == 3

In [None]:
# NOTE(review): index 2 on tBinned's last axis is Smed (the stacking order is
# Mlig, Sman, Smed) and markerMask is not used again in this cell -- confirm
# whether Sman (index 1) was intended
markerMask = tBinned[:,parenMask,2].reshape((tOrder.size,-1)) == 1
# Restrict to the parenchymal motifs AND to the Sman rows. The row selection
# must start from toPlot; restarting from aggTMeans discards the motif filter.
toPlot = aggTMeans.iloc[:,parenMask]
toPlot = toPlot.loc[toPlot.index.get_level_values('Species')=='Sman',:]

# Do an unconstrained clustering on these initially, 
# just to try to get it to look a little cleaner
g = sns.clustermap( toPlot, row_cluster=False, center=0, cmap='Reds', robust=True )
plt.close('all')

# Sort the data based on the clustering
dInd = np.array(g.dendrogram_col.reordered_ind)
# Reorder so we get blocks of families and their assigned motifs.
# Rows are indexed by (Tissue, Species), so idxmax would return tuples that
# never equal the family names in tOrder; drop the Species level first.
# dInd holds column positions, hence .iloc.
srtT = toPlot.droplevel('Species').idxmax(0).iloc[dInd]
srtInd = np.concatenate( [dInd[(srtT==t).values] for t in tOrder] )
# Plot again with this order
sns.clustermap( toPlot.iloc[:,srtInd], 
                row_cluster=False, col_cluster=False, 
                cmap=custom_cmap, robust=False, figsize=(12,12) )
plt.savefig( 'Plots/EDFig6/PanelED6a.svg', format='svg' )
plt.show()

## Finding markers for neurons vs muscles

In [None]:
# Mlig clusters that belong to the Neural or Muscle family
isNMFamily = np.isin(tOrder,['Neural','Muscle'])
nmLabels = uLabelsM[tMapM[isNMFamily,:].any(0)]

# Keep only the cells from those clusters
labels = np.array(samM.adata.obs.GroupFigure.values)
nmMask = np.isin(labels,nmLabels)
X = samM.adata.X.A[nmMask,:]
labels = labels[nmMask]
nL = len(nmLabels)

grouped = np.zeros( (nL,X.shape[1]) )
p = np.zeros_like( grouped )

for i, l in tqdm(enumerate(nmLabels)):
    inCluster = labels==l
    fg = X[inCluster,:]
    bg = X[~inCluster,:]
    # Per-motif median within the cluster
    grouped[i,:] = np.median(fg,axis=0)
    # One-sided test against the remaining neural/muscle cells only,
    # keeping just the corrected p-values
    res = stats.mannwhitneyu( fg, bg, axis=0, alternative='greater' )
    p[i,:] = sm.stats.multipletests( res.pvalue, alpha=alpha,
                                     method='fdr_bh' )[1]

binned = ((p < alpha)&(grouped > minResponse)).astype(int)
nmBinnedM = pd.DataFrame( data=binned, index=nmLabels, columns=samM.adata.var_names )
# Clusters whose label starts with 'Muscle' are Muscle, all others Neural
isMuscle = nmBinnedM.index.str.startswith('Muscle')
nmBinnedM['Tissue'] = np.where( isMuscle, 'Muscle', 'Neural' )
nmBinnedM = nmBinnedM.set_index( 'Tissue', append=True )
nmBinnedM

In [None]:
# Smed clusters that belong to the Neural or Muscle family
isNMFamily = np.isin(tOrder,['Neural','Muscle'])
nmLabels = uLabelsP[tMapP[isNMFamily,:].any(0)]

# Keep only the cells from those clusters
labels = np.array(samP.adata.obs.GroupFigure.values)
nmMask = np.isin(labels,nmLabels)
X = samP.adata.X.A[nmMask,:]
labels = labels[nmMask]
nL = len(nmLabels)

grouped = np.zeros( (nL,X.shape[1]) )
p = np.zeros_like( grouped )

for i, l in tqdm(enumerate(nmLabels)):
    inCluster = labels==l
    fg = X[inCluster,:]
    bg = X[~inCluster,:]
    # Per-motif median within the cluster
    grouped[i,:] = np.median(fg,axis=0)
    # One-sided test against the remaining neural/muscle cells only,
    # keeping just the corrected p-values
    res = stats.mannwhitneyu( fg, bg, axis=0, alternative='greater' )
    p[i,:] = sm.stats.multipletests( res.pvalue, alpha=alpha,
                                     method='fdr_bh' )[1]

binned = ((p < alpha)&(grouped > minResponse)).astype(int)
nmBinnedP = pd.DataFrame( data=binned, index=nmLabels, columns=samP.adata.var_names )
# Clusters whose label starts with 'Muscle' are Muscle, all others Neural
isMuscle = nmBinnedP.index.str.startswith('Muscle')
nmBinnedP['Tissue'] = np.where( isMuscle, 'Muscle', 'Neural' )
nmBinnedP = nmBinnedP.set_index( 'Tissue', append=True )
nmBinnedP

In [None]:
# Sman clusters that belong to the Neural or Muscle family
isNMFamily = np.isin(tOrder,['Neural','Muscle'])
nmLabels = uLabelsS[tMapS[isNMFamily,:].any(0)]

# Keep only the cells from those clusters
labels = np.array(samS.adata.obs.GroupFigure.values)
nmMask = np.isin(labels,nmLabels)
X = samS.adata.X.A[nmMask,:]
labels = labels[nmMask]
nL = len(nmLabels)

grouped = np.zeros( (nL,X.shape[1]) )
p = np.zeros_like( grouped )

for i, l in tqdm(enumerate(nmLabels)):
    inCluster = labels==l
    fg = X[inCluster,:]
    bg = X[~inCluster,:]
    # Per-motif median within the cluster
    grouped[i,:] = np.median(fg,axis=0)
    # One-sided test against the remaining neural/muscle cells only,
    # keeping just the corrected p-values
    res = stats.mannwhitneyu( fg, bg, axis=0, alternative='greater' )
    p[i,:] = sm.stats.multipletests( res.pvalue, alpha=alpha,
                                     method='fdr_bh' )[1]

binned = ((p < alpha)&(grouped > minResponse)).astype(int)
nmBinnedS = pd.DataFrame( data=binned, index=nmLabels, columns=samS.adata.var_names )
# Clusters whose label starts with 'Muscle' are Muscle, all others Neural
isMuscle = nmBinnedS.index.str.startswith('Muscle')
nmBinnedS['Tissue'] = np.where( isMuscle, 'Muscle', 'Neural' )
nmBinnedS = nmBinnedS.set_index( 'Tissue', append=True )
nmBinnedS

In [None]:
# Per species: is each motif 'on' in any cluster of each tissue?
anyOnM = nmBinnedM.groupby('Tissue').any()
anyOnP = nmBinnedP.groupby('Tissue').any()
anyOnS = nmBinnedS.groupby('Tissue').any()

# Keep motifs that are high for exactly one of the two tissues
# (muscles OR neurons) in every species
singleSided = (anyOnM.sum(0)==1) & (anyOnP.sum(0)==1) & (anyOnS.sum(0)==1)
# Motifs that hit the same tissue in all three species
neuralDefining = anyOnM.loc['Neural',:] & anyOnP.loc['Neural',:] & anyOnS.loc['Neural',:]
muscleDefining = anyOnM.loc['Muscle',:] & anyOnP.loc['Muscle',:] & anyOnS.loc['Muscle',:]

# Motifs consistently biased towards one of the two tissues
# but not both, and towards the same tissue in all three species
consNMDef = singleSided & (neuralDefining|muscleDefining)
consNMDef.sum() # 127

In [None]:
# Number of conserved tissue-defining motifs per tissue (from the Mlig calls)
anyOnByTissue = nmBinnedM.groupby('Tissue').any()
anyOnByTissue.loc[:,consNMDef].sum(1)
# Muscle: 12
# Neural: 115

In [None]:
# List the conserved neural markers for Supplementary Table 5
neuralOnM = nmBinnedM.groupby('Tissue').any().loc['Neural',:]
nmBinnedM.columns[neuralOnM&consNMDef]

In [None]:
# Print the conserved muscle markers for Supplementary Table 5
nmBinnedM.columns[nmBinnedM.groupby('Tissue').any().loc['Muscle',:]&consNMDef]