In [None]:
import os
from re import search
import pandas as pd
import numpy as np

import scanpy as sc

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from sklearn.metrics import confusion_matrix

from scipy.special import softmax
from scipy.stats import zscore
from scipy import sparse

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from multiprocessing import Pool

from anndata import AnnData

from numba import njit, prange

from tqdm import tqdm

import pickle
import gc

import warnings

# Use a 12 pt base font size for every matplotlib figure in this notebook
matplotlib.rcParams.update({'font.size': 12})
# Ask the inline backend for 'retina' figure output and draw figures inline
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
# NOTE(review): this silences *all* warnings for the rest of the notebook,
# including deprecation notices raised by the libraries used below
warnings.filterwarnings('ignore')

# Peak set predictivity analysis

## Load in the data

In [2]:
# Load the Mlig SEACell-level ATAC peak matrix
mligPeaks = sc.read_h5ad( 'SEACellsOutput/Mlig.SEACells_ATAC.h5ad' )

# Peak names are parsed as "<chr>:<start>-<stop>"; store the pieces as var columns
peakChr, peakStart, peakStop = [], [], []
for peakName in mligPeaks.var_names:
    fields = peakName.split(':')
    bounds = fields[1].split('-')
    peakChr.append( fields[0] )
    peakStart.append( int(bounds[0]) )
    peakStop.append( int(bounds[1]) )
mligPeaks.var['Chr'] = peakChr
mligPeaks.var['Start'] = peakStart
mligPeaks.var['Stop'] = peakStop

# Key the annotation table by the same "<chr>:<start>-<stop>" string
chipseekr = pd.read_csv( 'Metadata/Mlig.chipseekr_annots.csv' )
chipseekr.index = ( chipseekr['seqnames'] + ':'
                    + chipseekr['start'].astype(str) + '-'
                    + chipseekr['end'].astype(str) )
overlap = chipseekr.index.intersection( mligPeaks.var_names )

# Every peak starts with an empty annotation; peaks present in the
# annotation table get their annotation / transcript ID copied over
for varCol, annotCol in ( ('PeakType','annotation'), ('NearestGene','transcriptId') ):
    mligPeaks.var[varCol] = ''
    mligPeaks.var.loc[overlap,varCol] = chipseekr.loc[overlap,annotCol]

# The annotation table is no longer needed; release its memory
del chipseekr
gc.collect()

mligPeaks

AnnData object with n_obs × n_vars = 502 × 236016
    obs: 'n_counts', 'GroupFigure', 'TissueSAM', 'Sample', 'SEACell'
    var: 'GC_bin', 'counts_bin', 'n_cells', 'Chr', 'Start', 'Stop', 'PeakType', 'NearestGene'
    uns: 'GroupFigure_colors', 'Sample_colors', 'log1p', 'neighbors', 'umap'
    obsm: 'X_svd', 'X_umap'
    layers: 'OpenPeaks', 'raw'
    obsp: 'connectivities', 'distances'

In [3]:
# Load the Smed SEACell-level ATAC peak matrix
smedPeaks = sc.read_h5ad( 'SEACellsOutput/Smed.SEACells_ATAC.h5ad' )

# Peak names are parsed as "<chr>:<start>-<stop>"; store the pieces as var columns
peakChr, peakStart, peakStop = [], [], []
for peakName in smedPeaks.var_names:
    fields = peakName.split(':')
    bounds = fields[1].split('-')
    peakChr.append( fields[0] )
    peakStart.append( int(bounds[0]) )
    peakStop.append( int(bounds[1]) )
smedPeaks.var['Chr'] = peakChr
smedPeaks.var['Start'] = peakStart
smedPeaks.var['Stop'] = peakStop

# Key the annotation table by the same "<chr>:<start>-<stop>" string
chipseekr = pd.read_csv( 'Metadata/Smed.chipseekr_annots.csv' )
chipseekr.index = ( chipseekr['seqnames'] + ':'
                    + chipseekr['start'].astype(str) + '-'
                    + chipseekr['end'].astype(str) )
overlap = chipseekr.index.intersection( smedPeaks.var_names )

# Every peak starts with an empty annotation; peaks present in the
# annotation table get their annotation / transcript ID copied over
for varCol, annotCol in ( ('PeakType','annotation'), ('NearestGene','transcriptId') ):
    smedPeaks.var[varCol] = ''
    smedPeaks.var.loc[overlap,varCol] = chipseekr.loc[overlap,annotCol]

# The annotation table is no longer needed; release its memory
del chipseekr
gc.collect()

smedPeaks

AnnData object with n_obs × n_vars = 277 × 317488
    obs: 'n_counts', 'GroupFigure', 'TissueSAM', 'Sample', 'SEACell'
    var: 'GC_bin', 'counts_bin', 'n_cells', 'Chr', 'Start', 'Stop', 'PeakType', 'NearestGene'
    uns: 'GroupFigure_colors', 'Sample_colors', 'log1p', 'neighbors', 'umap'
    obsm: 'X_svd', 'X_umap'
    layers: 'OpenPeaks', 'raw'
    obsp: 'connectivities', 'distances'

In [4]:
# Load the Sman SEACell-level ATAC peak matrix
smanPeaks = sc.read_h5ad( 'SEACellsOutput/Sman.SEACells_ATAC.h5ad' )

# Peak names are parsed as "<chr>:<start>-<stop>"; store the pieces as var columns
peakChr, peakStart, peakStop = [], [], []
for peakName in smanPeaks.var_names:
    fields = peakName.split(':')
    bounds = fields[1].split('-')
    peakChr.append( fields[0] )
    peakStart.append( int(bounds[0]) )
    peakStop.append( int(bounds[1]) )
smanPeaks.var['Chr'] = peakChr
smanPeaks.var['Start'] = peakStart
smanPeaks.var['Stop'] = peakStop

# Key the annotation table by the same "<chr>:<start>-<stop>" string
chipseekr = pd.read_csv( 'Metadata/Sman.chipseekr_annots.csv' )
chipseekr.index = ( chipseekr['seqnames'] + ':'
                    + chipseekr['start'].astype(str) + '-'
                    + chipseekr['end'].astype(str) )
overlap = chipseekr.index.intersection( smanPeaks.var_names )

# Every peak starts with an empty annotation; peaks present in the
# annotation table get their annotation / transcript ID copied over
for varCol, annotCol in ( ('PeakType','annotation'), ('NearestGene','transcriptId') ):
    smanPeaks.var[varCol] = ''
    smanPeaks.var.loc[overlap,varCol] = chipseekr.loc[overlap,annotCol]

# The annotation table is no longer needed; release its memory
del chipseekr
gc.collect()

smanPeaks

AnnData object with n_obs × n_vars = 435 × 162349
    obs: 'n_counts', 'GroupFigure', 'TissueSAM', 'Sample', 'SEACell'
    var: 'GC_bin', 'counts_bin', 'n_cells', 'Chr', 'Start', 'Stop', 'PeakType', 'NearestGene'
    uns: 'GroupFigure_colors', 'Sample_colors', 'log1p', 'neighbors', 'umap'
    obsm: 'X_svd', 'X_umap'
    layers: 'OpenPeaks', 'raw'
    obsp: 'connectivities', 'distances'

## Train the classifiers

In [None]:
# Train a logistic regression model and
# return matthews correlations values 
# for predictions of each unique label
# @param inTup: tuple of ( initialized LogisticRegression model,
#                          predictor array used for training,
#                          training labels,
#                          predictor array used for testing,
#                          testing labels )
def _trainCalcMatthews( inTup ):
    if inTup is None:
        return -1
    # unpack the input
    model, Xtr, Ytr, Xte, Yte = inTup
    # Fit and test the model
    model = model.fit( Xtr, Ytr )
    pred = model.predict( Xte )
    # Get all unique labels
    uLabels = np.sort(np.unique(np.concatenate((Ytr,Yte))))
    # Calculate Matthews correlation for each
    M = np.zeros(uLabels.size)
    for i, label in enumerate(uLabels):
        if label not in Yte:
            continue
        y = Yte == label
        yhat = pred == label
        M[i] = matthews_corrcoef( y, yhat )
    # return array of Matthews correlations per label
    return M

# Train logistic regression models on discrete labels over nSplit random,
# stratified train-test splits and score each label one-vs-rest with the
# Matthews correlation between true and predicted labels
# @param data: (N_observations,N_features) predictors matrix
# @param labels: (N_observations,) vector of discrete labels
# @param nSplit: integer number of train-test splits to try
# @param testRatio: fraction of the data to hold out for testing
# @param nProc: number of worker processes used to fit the splits in parallel
#               (also passed as n_jobs to the full-data model)
# @param seed: random seed for numpy's global RNG, which drives both the
#              feature downsampling and the train-test splits
# @param downsampleFeatures: number of randomly chosen features to feed into
#                            each split's model; ignored when None or larger
#                            than the number of available features
# @return tuple of ( sorted unique labels from labels,
#                    (nSplit,N_labels) array of Matthews correlations,
#                      one row per split,
#                    per-label Matthews correlations of a model trained AND
#                      tested on the full dataset, used as a check of linear
#                      separability within the provided feature space )
def calcKSplitsMatthewsPerClass( data, labels, nSplit=5, testRatio=0.2, nProc=1, 
                                 seed=0, downsampleFeatures=None ):
    # np.unique already returns the labels sorted
    uLabels = np.unique( labels )
    
    # First, check whether data are linearly separable
    # Do this by training and testing on the whole dataset
    print( 'Calculating linear separability' )
    L = _trainCalcMatthews( ( LogisticRegression( penalty='l2', C=1e5, n_jobs=nProc ),
                              data, labels, data, labels ) )
    
    # Next, do train/test splits to get an idea of consistency
    np.random.seed( seed )
    doDownsample = downsampleFeatures is not None and \
                   downsampleFeatures <= data.shape[1]
    inputs = []
    print( 'Preparing splits' )
    for _ in tqdm( range(nSplit) ):
        # No defensive copy of `data` is needed: the column indexing and
        # train_test_split below both return new arrays
        X = data
        # Downsample the features, if desired
        if doDownsample:
            dsInd = np.random.choice( data.shape[1], downsampleFeatures, replace=False )
            X = data[:,dsInd]
        # Pick a random, label-stratified train-test split
        Xtr, Xte, Ytr, Yte = train_test_split( X, labels, 
                                               test_size=testRatio,
                                               stratify=labels )
        # Prepare the input tuple for this train-test split
        inputs.append( ( LogisticRegression( penalty='l2', C=1e5 ),
                         Xtr, Ytr, Xte, Yte ) )
    print( 'Calculating per-class consistency' )
    # Do the training and calculate performance metrics in parallel
    with Pool(processes=nProc) as p:
        M = np.array( p.map( _trainCalcMatthews, inputs ) )
    print( 'Done' )
    
    return ( uLabels, M, L )

In [11]:
# Number of random train-test splits per feature set, and held-out fraction
nSplit = 10
testRatio = 0.3
# Long-format results table, one row per (species, cell type, feature set, split).
# The empty frame is given explicit dtypes so that appending to it with
# pd.concat keeps 'Matthews' numeric instead of relying on pandas' deprecated
# rule of ignoring empty frames when inferring the result dtypes.
matthewsDF = pd.DataFrame( { 'Species': pd.Series( dtype='object' ),
                             'CellType': pd.Series( dtype='object' ),
                             'Tissue': pd.Series( dtype='object' ),
                             'Features': pd.Series( dtype='object' ),
                             'Matthews': pd.Series( dtype='float64' ) } )

matthewsDF

Unnamed: 0,Species,CellType,Tissue,Features,Matthews


Train on each peak set for Mlig: all peaks, promoter peaks only, non-promoter peaks, and distal intergenic peaks.

In [None]:
# Mlig, all peaks.
# Densify with .toarray(): the sparse-matrix `.A` shorthand was removed in SciPy 1.14
# (assumes .X is a SciPy sparse matrix, as the original `.A` access implies).
cts, M, L = calcKSplitsMatthewsPerClass( mligPeaks.X.toarray(), mligPeaks.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )

# Build long-format rows: M is (split, cell type), so flatten it cell type
# by cell type and repeat each cell-type name once per split to match
sp = ['Mlig']*M.size
ct = np.repeat( cts, nSplit )
t = ['']*M.size
f = ['Peaks']*M.size
m = M.T.ravel()

matthewsDF = pd.concat( (matthewsDF,pd.DataFrame( {'Species': sp,
                                                   'CellType': ct,
                                                   'Tissue': t,
                                                   'Features': f,
                                                   'Matthews': m} )), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Mlig, peaks whose PeakType is 'Promoter'.
# Densify with .toarray(): the sparse-matrix `.A` shorthand was removed in SciPy 1.14
# (assumes .X is a SciPy sparse matrix, as the original `.A` access implies).
cts, M, L = calcKSplitsMatthewsPerClass( mligPeaks[:,mligPeaks.var.PeakType=='Promoter'].X.toarray(), 
                                         mligPeaks.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )

# Build long-format rows: M is (split, cell type), so flatten it cell type
# by cell type and repeat each cell-type name once per split to match
sp = ['Mlig']*M.size
ct = np.repeat( cts, nSplit )
t = ['']*M.size
f = ['Promoters']*M.size
m = M.T.ravel()

matthewsDF = pd.concat( (matthewsDF,pd.DataFrame( {'Species': sp,
                                                   'CellType': ct,
                                                   'Tissue': t,
                                                   'Features': f,
                                                   'Matthews': m} )), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Mlig, every peak whose PeakType is not 'Promoter'.
# Densify with .toarray(): the sparse-matrix `.A` shorthand was removed in SciPy 1.14
# (assumes .X is a SciPy sparse matrix, as the original `.A` access implies).
cts, M, L = calcKSplitsMatthewsPerClass( mligPeaks[:,mligPeaks.var.PeakType!='Promoter'].X.toarray(), 
                                         mligPeaks.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )

# Build long-format rows: M is (split, cell type), so flatten it cell type
# by cell type and repeat each cell-type name once per split to match
sp = ['Mlig']*M.size
ct = np.repeat( cts, nSplit )
t = ['']*M.size
f = ['NonPromoters']*M.size
m = M.T.ravel()

matthewsDF = pd.concat( (matthewsDF,pd.DataFrame( {'Species': sp,
                                                   'CellType': ct,
                                                   'Tissue': t,
                                                   'Features': f,
                                                   'Matthews': m} )), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Mlig, peaks whose PeakType is 'Distal Intergenic'.
# Densify with .toarray(): the sparse-matrix `.A` shorthand was removed in SciPy 1.14
# (assumes .X is a SciPy sparse matrix, as the original `.A` access implies).
cts, M, L = calcKSplitsMatthewsPerClass( mligPeaks[:,mligPeaks.var.PeakType=='Distal Intergenic'].X.toarray(), 
                                         mligPeaks.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )

# Build long-format rows: M is (split, cell type), so flatten it cell type
# by cell type and repeat each cell-type name once per split to match
sp = ['Mlig']*M.size
ct = np.repeat( cts, nSplit )
t = ['']*M.size
f = ['Distal']*M.size
m = M.T.ravel()

matthewsDF = pd.concat( (matthewsDF,pd.DataFrame( {'Species': sp,
                                                   'CellType': ct,
                                                   'Tissue': t,
                                                   'Features': f,
                                                   'Matthews': m} )), axis=0 ).reset_index(drop=True)
matthewsDF

Repeat the same peak-set comparisons (all peaks, promoters, non-promoters, distal intergenic) for Smed.

In [None]:
# Smed, all peaks.
# Densify with .toarray(): the sparse-matrix `.A` shorthand was removed in SciPy 1.14
# (assumes .X is a SciPy sparse matrix, as the original `.A` access implies).
cts, M, L = calcKSplitsMatthewsPerClass( smedPeaks.X.toarray(), smedPeaks.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )

# Build long-format rows: M is (split, cell type), so flatten it cell type
# by cell type and repeat each cell-type name once per split to match
sp = ['Smed']*M.size
ct = np.repeat( cts, nSplit )
t = ['']*M.size
f = ['Peaks']*M.size
m = M.T.ravel()

matthewsDF = pd.concat( (matthewsDF,pd.DataFrame( {'Species': sp,
                                                   'CellType': ct,
                                                   'Tissue': t,
                                                   'Features': f,
                                                   'Matthews': m} )), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Smed, peaks whose PeakType is 'Promoter'.
# Densify with .toarray(): the sparse-matrix `.A` shorthand was removed in SciPy 1.14
# (assumes .X is a SciPy sparse matrix, as the original `.A` access implies).
cts, M, L = calcKSplitsMatthewsPerClass( smedPeaks[:,smedPeaks.var.PeakType=='Promoter'].X.toarray(), 
                                         smedPeaks.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )

# Build long-format rows: M is (split, cell type), so flatten it cell type
# by cell type and repeat each cell-type name once per split to match
sp = ['Smed']*M.size
ct = np.repeat( cts, nSplit )
t = ['']*M.size
f = ['Promoters']*M.size
m = M.T.ravel()

matthewsDF = pd.concat( (matthewsDF,pd.DataFrame( {'Species': sp,
                                                   'CellType': ct,
                                                   'Tissue': t,
                                                   'Features': f,
                                                   'Matthews': m} )), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Smed, every peak whose PeakType is not 'Promoter'.
# Densify with .toarray(): the sparse-matrix `.A` shorthand was removed in SciPy 1.14
# (assumes .X is a SciPy sparse matrix, as the original `.A` access implies).
cts, M, L = calcKSplitsMatthewsPerClass( smedPeaks[:,smedPeaks.var.PeakType!='Promoter'].X.toarray(), 
                                         smedPeaks.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )

# Build long-format rows: M is (split, cell type), so flatten it cell type
# by cell type and repeat each cell-type name once per split to match
sp = ['Smed']*M.size
ct = np.repeat( cts, nSplit )
t = ['']*M.size
f = ['NonPromoters']*M.size
m = M.T.ravel()

matthewsDF = pd.concat( (matthewsDF,pd.DataFrame( {'Species': sp,
                                                   'CellType': ct,
                                                   'Tissue': t,
                                                   'Features': f,
                                                   'Matthews': m} )), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Smed, peaks whose PeakType is 'Distal Intergenic'.
# Densify with .toarray(): the sparse-matrix `.A` shorthand was removed in SciPy 1.14
# (assumes .X is a SciPy sparse matrix, as the original `.A` access implies).
cts, M, L = calcKSplitsMatthewsPerClass( smedPeaks[:,smedPeaks.var.PeakType=='Distal Intergenic'].X.toarray(), 
                                         smedPeaks.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )

# Build long-format rows: M is (split, cell type), so flatten it cell type
# by cell type and repeat each cell-type name once per split to match
sp = ['Smed']*M.size
ct = np.repeat( cts, nSplit )
t = ['']*M.size
f = ['Distal']*M.size
m = M.T.ravel()

matthewsDF = pd.concat( (matthewsDF,pd.DataFrame( {'Species': sp,
                                                   'CellType': ct,
                                                   'Tissue': t,
                                                   'Features': f,
                                                   'Matthews': m} )), axis=0 ).reset_index(drop=True)
matthewsDF

Repeat the same peak-set comparisons (all peaks, promoters, non-promoters, distal intergenic) for Sman.

In [None]:
# Sman, all peaks.
# Densify with .toarray(): the sparse-matrix `.A` shorthand was removed in SciPy 1.14
# (assumes .X is a SciPy sparse matrix, as the original `.A` access implies).
cts, M, L = calcKSplitsMatthewsPerClass( smanPeaks.X.toarray(), smanPeaks.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )

# Build long-format rows: M is (split, cell type), so flatten it cell type
# by cell type and repeat each cell-type name once per split to match
sp = ['Sman']*M.size
ct = np.repeat( cts, nSplit )
t = ['']*M.size
f = ['Peaks']*M.size
m = M.T.ravel()

matthewsDF = pd.concat( (matthewsDF,pd.DataFrame( {'Species': sp,
                                                   'CellType': ct,
                                                   'Tissue': t,
                                                   'Features': f,
                                                   'Matthews': m} )), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Sman, peaks whose PeakType is 'Promoter'.
# Densify with .toarray(): the sparse-matrix `.A` shorthand was removed in SciPy 1.14
# (assumes .X is a SciPy sparse matrix, as the original `.A` access implies).
cts, M, L = calcKSplitsMatthewsPerClass( smanPeaks[:,smanPeaks.var.PeakType=='Promoter'].X.toarray(), 
                                         smanPeaks.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )

# Build long-format rows: M is (split, cell type), so flatten it cell type
# by cell type and repeat each cell-type name once per split to match
sp = ['Sman']*M.size
ct = np.repeat( cts, nSplit )
t = ['']*M.size
f = ['Promoters']*M.size
m = M.T.ravel()

matthewsDF = pd.concat( (matthewsDF,pd.DataFrame( {'Species': sp,
                                                   'CellType': ct,
                                                   'Tissue': t,
                                                   'Features': f,
                                                   'Matthews': m} )), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Sman, every peak whose PeakType is not 'Promoter'.
# Densify with .toarray(): the sparse-matrix `.A` shorthand was removed in SciPy 1.14
# (assumes .X is a SciPy sparse matrix, as the original `.A` access implies).
cts, M, L = calcKSplitsMatthewsPerClass( smanPeaks[:,smanPeaks.var.PeakType!='Promoter'].X.toarray(), 
                                         smanPeaks.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )

# Build long-format rows: M is (split, cell type), so flatten it cell type
# by cell type and repeat each cell-type name once per split to match
sp = ['Sman']*M.size
ct = np.repeat( cts, nSplit )
t = ['']*M.size
f = ['NonPromoters']*M.size
m = M.T.ravel()

matthewsDF = pd.concat( (matthewsDF,pd.DataFrame( {'Species': sp,
                                                   'CellType': ct,
                                                   'Tissue': t,
                                                   'Features': f,
                                                   'Matthews': m} )), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Sman, peaks whose PeakType is 'Distal Intergenic'.
# Densify with .toarray(): the sparse-matrix `.A` shorthand was removed in SciPy 1.14
# (assumes .X is a SciPy sparse matrix, as the original `.A` access implies).
cts, M, L = calcKSplitsMatthewsPerClass( smanPeaks[:,smanPeaks.var.PeakType=='Distal Intergenic'].X.toarray(), 
                                         smanPeaks.obs.GroupFigure.values,
                                         nSplit=nSplit, testRatio=testRatio, nProc=8 )
print( 'Linear separability:' )
print( list( zip( cts, L ) ) )

# Build long-format rows: M is (split, cell type), so flatten it cell type
# by cell type and repeat each cell-type name once per split to match
sp = ['Sman']*M.size
ct = np.repeat( cts, nSplit )
t = ['']*M.size
f = ['Distal']*M.size
m = M.T.ravel()

matthewsDF = pd.concat( (matthewsDF,pd.DataFrame( {'Species': sp,
                                                   'CellType': ct,
                                                   'Tissue': t,
                                                   'Features': f,
                                                   'Matthews': m} )), axis=0 ).reset_index(drop=True)
matthewsDF

In [None]:
# Point plot of the per-split Matthews correlations (marker = mean, bar = SD):
# one panel per species, one row per cell type, one color per feature set
g = sns.catplot( data=matthewsDF, kind='point',
                 x='Matthews', y='CellType', hue='Features', col='Species',
                 sharex=True, sharey=False, height=10, aspect=0.4,
                 errorbar='sd', linestyle='none', dodge=False )
# The x axis is shared, so configuring the first panel is enough
leftAx = g.axes[0][0]
leftAx.set_xlim( -0.1, 1.1 )
leftAx.set_xticks( [0, 0.5, 1] )
plt.savefig( 'Plots/EDFig4/PanelED4b.svg', format='svg' )
plt.show()

# Per-cluster sample composition

In [None]:
# Per-cluster sample composition for Mlig: fraction of each cluster's cells
# that comes from each sample, drawn as stacked horizontal bars
dfM = pd.read_csv( 'Metadata/Mlig.final_cluster_annots.csv', index_col=0 )
# Exclude cells labelled '???-1'
dfM = dfM.loc[dfM.GroupFigure!='???-1',:]
# Count cells per (cluster, sample), then normalise each cluster's row to sum to 1
dfM = pd.crosstab( dfM.GroupFigure, dfM.Sample )
dfM = dfM.div( dfM.sum(axis=1), axis=0 )
plotOrder = ['Cathepsin', 'Intestine-1', 'Intestine-2', 'GSC', 'Female Germline', 'Male Germline', 'Neoblast', 'Neural Progenitors', 'Neural-1', 'Neural-2', 'Neural-3',
             'Neural-4', 'Neural-5', 'Muscle', 'Epidermal Progenitors', 'Epidermal-1', 'Epidermal-2', 'Parenchymal-1', 'Parenchymal-2', 'Parenchymal-3', 'Parenchymal-4',
             'Parenchymal-5', 'Protonephridia', 'Anchor Cells']

dfM = dfM.reindex(plotOrder).reset_index()
# Keep `ax` bound to the Axes (not to the Legend that .legend() returns)
ax = dfM.plot.barh(x='GroupFigure', stacked=True, figsize=(6,12))
ax.legend(bbox_to_anchor=(1.0,1.0))
# Flip the y axis so the first entry of plotOrder is drawn at the top
ax.invert_yaxis()
plt.savefig('Plots/EDFig2/PanelED2b_Mlig.svg',format = 'svg')

In [None]:
# Per-cluster sample composition for Smed: fraction of each cluster's cells
# that comes from each sample, drawn as stacked horizontal bars
dfP = pd.read_csv( 'Metadata/Smed.final_cluster_annots.csv', index_col=0 )
# Count cells per (cluster, sample), then normalise each cluster's row to sum to 1.
# The row totals must come from this species' own table (dfP), not from dfM.
dfP = pd.crosstab( dfP.GroupFigure, dfP.Sample )
dfP = dfP.div( dfP.sum(axis=1), axis=0 )
plotOrder = ['Cathepsin', 'Intestine-1', 'Intestine-2', 'Ophis', 'GSC', 'GSC progeny/diff germline', 'Neoblast-1', 'Neoblast-2', 'Neural Progenitors', 
             'Neural-1', 'Neural-2', 'Neural-3', 'Muscle-1', 'Muscle-2', 'Muscle-3', 'Epidermal-1', 'Epidermal-2', 'Pharynx', 'Parenchymal', 
             'Protonephridia']

dfP = dfP.reindex(plotOrder).reset_index()
# Keep `ax` bound to the Axes (not to the Legend that .legend() returns)
ax = dfP.plot.barh(x='GroupFigure', stacked=True, figsize=(6,12))
ax.legend(bbox_to_anchor=(1.0,1.0))
# Flip the y axis so the first entry of plotOrder is drawn at the top
ax.invert_yaxis()
plt.savefig('Plots/EDFig2/PanelED2b_Smed.svg',format = 'svg')

In [None]:
# Per-cluster sample composition for Sman: fraction of each cluster's cells
# that comes from each sample, drawn as stacked horizontal bars
dfS = pd.read_csv( 'Metadata/Sman.final_cluster_annots.csv', index_col=0 )
# Count cells per (cluster, sample), then normalise each cluster's row to sum to 1.
# Both crosstab inputs must come from this species' own table (dfS), not from dfM.
dfS = pd.crosstab( dfS.GroupFigure, dfS.Sample )
dfS = dfS.div( dfS.sum(axis=1), axis=0 )
plotOrder = ['Cathepsin', 'Intestine', 'S1', 'GSC', 'Neoblast', 'Neural Progenitors', 'Neural-1', 'Neural-2', 'Neural-3', 'Neural-4', 
             'Neural-5', 'Neural-6', 'Muscle Progenitors', 'Muscle-1', 'Muscle-2', 'Muscle-3', 'Tegument Progenitors', 'Tegument-1', 
             'Tegument-2', 'Vitellocytes', 'Esophageal Gland', 'Protonephridia']

dfS = dfS.reindex(plotOrder).reset_index()
# Keep `ax` bound to the Axes (not to the Legend that .legend() returns)
ax = dfS.plot.barh(x='GroupFigure', stacked=True, figsize=(6,12))
ax.legend(bbox_to_anchor=(1.0,1.0))
# Flip the y axis so the first entry of plotOrder is drawn at the top
ax.invert_yaxis()
plt.savefig('Plots/EDFig2/PanelED2b_Sman.svg',format = 'svg')