In [1]:
import anndata
import pickle
import pandas as pd
import numpy as np
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors

from copy import copy
from matplotlib import rc_context
from scipy.stats import norm
from scipy import sparse
from scipy import stats
from sklearn.neighbors import KernelDensity
from ete3 import Tree
from typing import Tuple

In [2]:
# Quiet scanpy logging and send every scanpy-saved figure to the pseudotime output folder
sc.settings.verbosity = 0
sc.settings.figdir = '/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/2_pseudotime/'

# Colors used when plotting cell states (passed to scanpy as a palette further down).
# NOTE(review): pickle.load executes arbitrary code - only point this at a trusted file.
clusterColorsFile = "/Genomics/chanlab/mchan/Adriano/TLS/TLS_TLSCL/20211102_clusterColorsTLSCL.p"
with open(clusterColorsFile, 'rb') as fp:
    colorDict = pickle.load(fp)

# Multiseq barcodes belonging to the TLS and TLSCL samples
TLS_barcodes = ['Bar{}'.format(number) for number in (1, 2, 4, 5, 7, 8, 10, 11, 13, 16, 19, 22)]
TLSCL_barcodes = ['Bar{}'.format(number) for number in (3, 6, 9, 12, 14, 15, 20, 21, 23, 24)]

# Read in the per-cell state tables, indexed by cell barcode
cell_state_dir = '/Genomics/chanlab/blaw/TLS/LineageTracer/scRNAseq/'
TLS1_cell_state_table = pd.read_csv(cell_state_dir + 'TLS_120h_1_cellBC_cellState.tsv', sep='\t').set_index('cellBC')
TLS2_cell_state_table = pd.read_csv(cell_state_dir + 'TLS_120h_2_cellBC_cellState.tsv', sep='\t').set_index('cellBC')
TLS_M_cell_state_table = pd.read_csv(cell_state_dir + 'TLS_TLSCL_cellBC_cellState.tsv', sep='\t').set_index('cellBC')

In [3]:
# Load the integrated object
TLS_integrated = sc.read_h5ad('/Genomics/chanlab/blaw/TLS/raw_data/scRNA/Integrated/TLS_TLSCL_1_2_Time_integrated.h5ad')

# Load the multiseq barcodes metadata
multiseq_barcodes = pd.read_csv('/Genomics/chanlab/blaw/TLS/metadata/multiseq_barcodes.txt', sep='\t')

# Add the multiseq group numbers and whether the sample is a TLS or TLSCL (in the multiseq experiment)
# The TLS replicates from TLS1, TLS2, TIME ones are counted as NA in both of these categories to separate them from the multiseq

# Barcode sequence -> group number as a string. The first three characters of the ID are
# stripped before the int conversion (presumably a 'Bar' prefix, as in TLS_barcodes - confirm).
multiseq_barcode_dict = {
    sequence: str(int(barcode_id[3:]))
    for sequence, barcode_id in zip(multiseq_barcodes['Sequence'], multiseq_barcodes['ID'])
}

# Third column of the metadata table -> 'TLS' / 'TLSCL'.
# The column is selected explicitly by position with .iloc: the previous row[1][2] relied on
# integer keys falling back to positions on a string-labelled Series, which pandas has deprecated.
# NOTE(review): the third column is presumably 'Sequence', since this dict is mapped over
# 'MultiSeqBCseq' below - confirm and switch to the column label if so.
barcode_keys = multiseq_barcodes.iloc[:, 2]
is_TLSCL = multiseq_barcodes['TLS ID'].str.startswith('TLSCL')
TLS_TLSCL_dict = {barcode_key: 'TLS' for barcode_key in barcode_keys}
TLS_TLSCL_dict.update({barcode_key: 'TLSCL' for barcode_key in barcode_keys[is_TLSCL]})

# Doublet / Negative calls are carried through unchanged
multiseq_barcode_dict['Doublet'] = 'Doublet'
multiseq_barcode_dict['Negative'] = 'Negative'
TLS_TLSCL_dict['Doublet'] = 'Doublet'
TLS_TLSCL_dict['Negative'] = 'Negative'

# Merge the multiseq group metadata with the entire integrated dataset
TLS_integrated.obs['MultiSeqGroup'] = TLS_integrated.obs['MultiSeqBCseq'].map(multiseq_barcode_dict)
TLS_integrated.obs['TLSCL'] = TLS_integrated.obs['MultiSeqBCseq'].map(TLS_TLSCL_dict)

# Load the monocle position values
pseudotime = pd.read_csv('/Genomics/chanlab/blaw/TLS/raw_data/monocle_results/TLS_TLSCL_1_2_Time_integrated_Monocle_Pos.txt', sep = '\t', index_col = 0)

# Rename the monocle value column 'x' to 'pseudotime'
pseudotime.rename(columns = {'x': 'pseudotime'}, inplace = True)

# Merge the pseudotime values into the integrated object (inner join on the cell index).
# NOTE(review): re-running this cell without reloading TLS_integrated merges 'pseudotime' a second time.
TLS_integrated.obs = pd.merge(TLS_integrated.obs, pseudotime, left_index = True, right_index = True)

# Recenter the pseudotime value to put the NMP median in the center since both trajectories originate from NMPs
NMP_median = np.median(TLS_integrated.obs[TLS_integrated.obs['cell_state'] == 'NMPs']['pseudotime'].values)
TLS_integrated.obs['NT-Somite_Traj'] = TLS_integrated.obs['pseudotime'] - NMP_median

# Look at pseudotime over the full TLS dataset

In [4]:
# Keep only cells whose state lies on the NT - Somite pseudotime axis (NMPs and the neural / somitic
# states listed here) and whose recentered pseudotime is below 100
NMP_state_list = ['NMPs', 'NeuralTube1', 'NeuralTube2', 'Somite', 'Somite0', 'Somite-1', 'SomiteDermo', 'SomiteSclero', 'aPSM', 'pPSM']

in_trajectory_state = TLS_integrated.obs['cell_state'].isin(NMP_state_list)
below_pseudotime_cutoff = TLS_integrated.obs['NT-Somite_Traj'] < 100

NMP_states = TLS_integrated[in_trajectory_state & below_pseudotime_cutoff].copy()

In [5]:
# UMAP of the trajectory subset, colored once by recentered pseudotime and once by cell state.
# show = False with save = ... writes the figure through scanpy (into sc.settings.figdir) instead of displaying it.
with rc_context({'figure.figsize': (5, 5)}):
    sc.pl.umap(NMP_states, color=['NT-Somite_Traj', 'cell_state'], palette = colorDict, title='NT - Somite Trajectory', save = 'NMP_states', show = False)

... storing 'orig.ident' as categorical
... storing 'integrated_snn_res.0.5' as categorical
... storing 'seurat_clusters' as categorical
... storing 'BC' as categorical
... storing 'Phase' as categorical
... storing 'old.ident' as categorical
... storing 'integrated_snn_res.0.8' as categorical
... storing 'cluster' as categorical
... storing 'cell_state' as categorical
... storing 'predicted.id' as categorical
... storing 'cellBC' as categorical
... storing 'MultiSeqBCseq' as categorical
... storing 'MultiSeqGroup' as categorical
... storing 'TLSCL' as categorical


In [6]:
# Shorthands for the columns used to split the trajectory subset
trajectory_values = NMP_states.obs['NT-Somite_Traj']
sample_ident = NMP_states.obs['orig.ident']
multiseq_condition = NMP_states.obs['TLSCL']

# Pseudotime values of every cell in the subset
all_pseudotime_values = trajectory_values

# Pseudotime values of the timecourse samples
TLS_96_pseudotime_values = trajectory_values[sample_ident == 'mGast_96h']
TLS_108_pseudotime_values = trajectory_values[sample_ident == 'mGast_108h']
TLS_120_pseudotime_values = trajectory_values[sample_ident == 'mGast_120h']

# Pseudotime values of TLS1 and TLS2
TLS1_pseudotime_values = trajectory_values[sample_ident == 'TLS_120h_rep1']
TLS2_pseudotime_values = trajectory_values[sample_ident == 'TLS_120h_rep2']

# Pseudotime values of the multiseq TLS and TLSCL cells
TLS_m_pseudotime_values = trajectory_values[multiseq_condition == 'TLS']
TLSCL_m_pseudotime_values = trajectory_values[multiseq_condition == 'TLSCL']

# One entry per panel: the (values, color) curves to draw and the legend labels in drawing order
density_panels = [
    ([(all_pseudotime_values, 'black')],
     ['TLS_integrated']),
    ([(TLS_96_pseudotime_values, 'red'), (TLS_108_pseudotime_values, 'green'), (TLS_120_pseudotime_values, 'blue')],
     ['TLS_96h', 'TLS_108h', 'TLS_120h']),
    ([(TLS1_pseudotime_values, 'darkgreen'), (TLS2_pseudotime_values, 'pink')],
     ['TLS1', 'TLS2']),
    ([(TLS_m_pseudotime_values, 'darkgreen'), (TLSCL_m_pseudotime_values, 'pink')],
     ['TLS_Replicates', 'TLSCL_Replicates']),
]

# Plot the density results, filling the 2 x 2 grid row by row
fig, axs = plt.subplots(2, 2, figsize = (10, 5))

for panel_ax, (curves, legend_labels) in zip(axs.flat, density_panels):
    for curve_values, curve_color in curves:
        sns.kdeplot(curve_values, color = curve_color, ax = panel_ax)
    panel_ax.legend(legend_labels)
    # Same axis limits on every panel so the densities are directly comparable
    panel_ax.set_xlim(-60, 30)
    panel_ax.set_ylim(0, 0.125)

plt.tight_layout()
plt.savefig('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/2_pseudotime/pseudotime_density.pdf', dpi = 300)
#plt.show()
plt.close()

# Look at NMPs over pseudotime

In [7]:
# All cells labelled 'NMPs' in the full integrated object (taken from TLS_integrated, so the
# pseudotime < 100 cutoff used for NMP_states is not applied here)
NMPs = TLS_integrated[TLS_integrated.obs['cell_state'] == 'NMPs'].copy()

In [8]:
# NMP specific density plots
NMP_trajectory_values = NMPs.obs['NT-Somite_Traj']
NMP_sample_ident = NMPs.obs['orig.ident']
NMP_multiseq_condition = NMPs.obs['TLSCL']

all_NMP_pseudotime_values = NMP_trajectory_values

# Timecourse samples
TLS_96_NMP_pseudotime_values = NMP_trajectory_values[NMP_sample_ident == 'mGast_96h']
TLS_108_NMP_pseudotime_values = NMP_trajectory_values[NMP_sample_ident == 'mGast_108h']
TLS_120_NMP_pseudotime_values = NMP_trajectory_values[NMP_sample_ident == 'mGast_120h']

# TLS1 and TLS2
TLS1_NMP_pseudotime_values = NMP_trajectory_values[NMP_sample_ident == 'TLS_120h_rep1']
TLS2_NMP_pseudotime_values = NMP_trajectory_values[NMP_sample_ident == 'TLS_120h_rep2']

# Multiseq TLS and TLSCL
TLS_m_NMP_pseudotime_values = NMP_trajectory_values[NMP_multiseq_condition == 'TLS']
TLSCL_m_NMP_pseudotime_values = NMP_trajectory_values[NMP_multiseq_condition == 'TLSCL']

# Curves (values, color) and legend labels for each panel, in drawing order
NMP_density_panels = [
    ([(all_NMP_pseudotime_values, 'black')],
     ['TLS_integrated']),
    ([(TLS_96_NMP_pseudotime_values, 'red'), (TLS_108_NMP_pseudotime_values, 'green'), (TLS_120_NMP_pseudotime_values, 'blue')],
     ['TLS_96h', 'TLS_108h', 'TLS_120h']),
    ([(TLS1_NMP_pseudotime_values, 'darkgreen'), (TLS2_NMP_pseudotime_values, 'pink')],
     ['TLS1', 'TLS2']),
    ([(TLS_m_NMP_pseudotime_values, 'darkgreen'), (TLSCL_m_NMP_pseudotime_values, 'pink')],
     ['TLS_Replicates', 'TLSCL_Replicates']),
]

fig, axs = plt.subplots(2, 2, figsize = (10, 5))

for panel_ax, (curves, legend_labels) in zip(axs.flat, NMP_density_panels):
    for curve_values, curve_color in curves:
        sns.kdeplot(curve_values, color = curve_color, ax = panel_ax)
    panel_ax.legend(legend_labels)
    # Shared, narrower limits: NMPs sit around zero on the recentered axis
    panel_ax.set_xlim(-20, 20)
    panel_ax.set_ylim(0, 0.5)

plt.tight_layout()
plt.savefig('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/2_pseudotime/pseudotime_NMP_density.pdf', dpi = 300)
#plt.show()
plt.close()

# Classify NMPs based on their lineage neighbors

In [9]:
# Locations of the TLS1 and TLS2 lineage trees
TLS1_loc = '/Genomics/chanlab/blaw/TLS/data/AM-DNA-097/lineage/2_lineage_reconstruction/AM-DNA-097_hybrid_newick_noMutationlessEdges_Labeled.nwk'
TLS2_loc = '/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/lineage/2_lineage_reconstruction/AM-DNA-098_hybrid_newick_noMutationlessEdges_Labeled.nwk'

# Open both trees
t1 = Tree(TLS1_loc, format=1)
t2 = Tree(TLS2_loc, format=1)

# Names of every leaf that appears in TLS1 and TLS2
TLS1_leaves = [leaf.name for leaf in t1.get_leaves()]
TLS2_leaves = [leaf.name for leaf in t2.get_leaves()]

# Leaves of the multiseq trees, pooled per condition (TLS first, then TLSCL)
TLS_m_leaves = []
TLSCL_leaves = []

for barcode_group, pooled_leaves in ((TLS_barcodes, TLS_m_leaves), (TLSCL_barcodes, TLSCL_leaves)):
    for barcode in barcode_group:
        # Each barcode has its own reconstructed tree
        treeFile = "/Genomics/chanlab/blaw/TLS/data/AM-DNA-258/lineage/3_lineage_reconstruction/{}/{}_newick_noMutationlessEdges_Labeled.nwk".format(barcode, barcode)
        t = Tree(treeFile, format = 1)
        pooled_leaves.extend(leaf.name for leaf in t.get_leaves())

# Prefix the leaf names with their dataset so that they can be used to index the integrated object
tree_cells = []
for dataset_prefix, leaf_names in (('TLS_120h_rep1_', TLS1_leaves),
                                   ('TLS_120h_rep2_', TLS2_leaves),
                                   ('TLS_TLSCL_', TLS_m_leaves),
                                   ('TLS_TLSCL_', TLSCL_leaves)):
    tree_cells += [dataset_prefix + leaf_name for leaf_name in leaf_names]

In [10]:
# Make a list of all the NMPs in the integrated object
NMP_index = list(NMPs.obs.index)

# Membership is tested once per tree cell, so look names up in a set instead of scanning the list
NMP_index_lookup = set(NMP_index)

# Make a list of all the NMPs that appear on a tree (same order as tree_cells)
NMP_tree_cells = [cell for cell in tree_cells if cell in NMP_index_lookup]

# Copy the subset of the NMP object restricted to cells that are on a tree
tree_NMPs = NMPs[NMPs.obs.index.isin(NMP_tree_cells)].copy()

In [11]:
# Check whether NMPs that made it onto a lineage tree are distributed differently over the pseudotime
# than all NMPs from the lineage-traced datasets
tree_NMPs_values = list(tree_NMPs.obs['NT-Somite_Traj'])
all_NMPs_values = list(NMPs.obs[NMPs.obs['orig.ident'].isin(['TLS_120h_rep1', 'TLS_120h_rep2', 'TLS_TLSCL'])]['NT-Somite_Traj'])

fig, ax = plt.subplots(1, 1, figsize = (5, 2.5))

# Draw on the explicit axes (like the other plotting cells) and attach each label to its own curve,
# so the legend cannot be mismatched if one of the curves is not drawn
sns.kdeplot(all_NMPs_values, color = 'black', label = 'All NMPs', ax = ax)
sns.kdeplot(tree_NMPs_values, color = 'red', label = 'Tree NMPs', ax = ax)
ax.legend()
ax.set_title('Tree NMPs Distribution')
ax.set_xlim(-20, 20)
ax.set_ylim(0, 0.5)
plt.savefig('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/2_pseudotime/pseudotime_Tree_NMP_density.pdf', dpi = 300)
#plt.show()
plt.close()

In [12]:
# Add columns in the tree_NMPs table to classify the NMPs.
# The numeric columns are created as floats because fractions / ratios are written into them below
# (writing floats into integer columns is an incompatible-dtype assignment in recent pandas).
tree_NMPs.obs['NMP Type'] = ''
tree_NMPs.obs['frac_Somitic'] = 0.0
tree_NMPs.obs['frac_Neural'] = 0.0
tree_NMPs.obs['Somite_Neural_ratio'] = 0.0

# Cell states grouped by the lineage they indicate
SOMITIC_STATES = ['pPSM', 'aPSM', 'Somite', 'Somite0', 'Somite1', 'Somite-1', 'SomiteSclero', 'SomiteDermo']
NEURAL_STATES = ['NeuralTube1', 'NeuralTube2']
# NOTE(review): this list used 'PCGLC' while the UMAP cell further down filters on 'PCGLCs';
# both spellings are accepted here until the label used in the cell state tables is confirmed.
DROP_STATES = ['Endothelial', 'Endoderm', 'PCGLC', 'PCGLCs']

# Set of lineage groups seen among the siblings -> NMP class; anything else is 'Dropped'
NMP_TYPE_BY_GROUPS = {
    frozenset(['Neural', 'NMPs']): 'NMPs-Neural',
    frozenset(['Somitic', 'NMPs']): 'NMPs-Somitic',
    frozenset(['Neural', 'Somitic', 'NMPs']): 'NMPs-Bipotent',
    frozenset(['NMPs']): 'NMPs-Renewing',
}


def classify_NMP_siblings(sibling_states):
    """Classify an NMP from the cell states of the leaves sharing its parent node.

    Parameters
    ----------
    sibling_states : list of str
        Cell state of every leaf child of the parent node, including the NMP itself.

    Returns
    -------
    tuple
        (NMP type, fraction of somitic siblings, fraction of neural siblings,
        (somitic + 1) / (neural + 1) ratio).

    States that belong to none of the groups above do not influence the NMP type,
    but they still count towards the denominator of the fractions.
    """
    groups = set()
    somitic_count = 0
    neural_count = 0
    for state in sibling_states:
        if state in SOMITIC_STATES:
            groups.add('Somitic')
            somitic_count += 1
        elif state in NEURAL_STATES:
            groups.add('Neural')
            neural_count += 1
        elif state == 'NMPs':
            groups.add('NMPs')
        elif state in DROP_STATES:
            groups.add('Drop')

    NMP_type = NMP_TYPE_BY_GROUPS.get(frozenset(groups), 'Dropped')
    frac_somitic = somitic_count / len(sibling_states)
    frac_neural = neural_count / len(sibling_states)
    # Pseudocount of 1 on both sides keeps the ratio defined when there are no neural siblings
    somite_neural_ratio = (somitic_count + 1) / (neural_count + 1)
    return NMP_type, frac_somitic, frac_neural, somite_neural_ratio


def annotate_tree_NMPs(tree, cell_state_table, prefix):
    """Write the sibling-based classification of every NMP leaf of `tree` into tree_NMPs.obs.

    Parameters
    ----------
    tree : ete3 Tree
        Lineage tree whose leaf names are cell barcodes.
    cell_state_table : pandas.DataFrame
        Table indexed by cell barcode with a 'cell_state' column.
    prefix : str
        Dataset prefix that turns a leaf name into a tree_NMPs.obs index label.

    Returns
    -------
    int
        Number of NMP leaves skipped because they are not rows of tree_NMPs.obs.
    """
    skipped = 0
    for node in tree.traverse():
        # Only internal nodes can be the parent of a leaf
        if node.is_leaf():
            continue

        # Siblings are the leaves that are direct children of this node (sibling subtrees are ignored)
        sibling_leaves = [child.name for child in node.children if child.is_leaf()]
        sibling_states = [cell_state_table.loc[name, 'cell_state'] for name in sibling_leaves]
        if 'NMPs' not in sibling_states:
            continue

        # Every NMP under this parent shares the same siblings, so classify once per parent
        NMP_type, frac_somitic, frac_neural, somite_neural_ratio = classify_NMP_siblings(sibling_states)

        for name, state in zip(sibling_leaves, sibling_states):
            if state != 'NMPs':
                continue
            obs_name = prefix + name
            # Assigning through .loc with an unknown label would append a new row to obs,
            # so leaves that are not in tree_NMPs are skipped instead
            if obs_name not in tree_NMPs.obs.index:
                skipped += 1
                continue
            tree_NMPs.obs.loc[obs_name, 'frac_Somitic'] = frac_somitic
            tree_NMPs.obs.loc[obs_name, 'frac_Neural'] = frac_neural
            tree_NMPs.obs.loc[obs_name, 'Somite_Neural_ratio'] = somite_neural_ratio
            tree_NMPs.obs.loc[obs_name, 'NMP Type'] = NMP_type
    return skipped


# Populate the new columns from the TLS1 and TLS2 trees
for tree, cell_state_table, prefix in ((t1, TLS1_cell_state_table, 'TLS_120h_rep1_'),
                                       (t2, TLS2_cell_state_table, 'TLS_120h_rep2_')):
    n_skipped = annotate_tree_NMPs(tree, cell_state_table, prefix)
    if n_skipped:
        print('{}: skipped {} NMP leaves that are not in tree_NMPs'.format(prefix, n_skipped))
                    

In [13]:
# Classify the NMPs of every multiseq (TLS and TLSCL) tree from the cell states of their sibling leaves
multiseq_somitic_states = ['pPSM', 'aPSM', 'Somite', 'Somite0', 'Somite1', 'Somite-1', 'SomiteSclero', 'SomiteDermo']
multiseq_neural_states = ['NeuralTube1', 'NeuralTube2']
# NOTE(review): this list used 'PCGLC' while the UMAP cell further down filters on 'PCGLCs';
# both spellings are accepted here until the label used in the cell state table is confirmed.
multiseq_drop_states = ['Endoderm', 'Endothelial', 'PCGLC', 'PCGLCs']

# Set of lineage groups seen among the siblings -> NMP class; anything else is 'Dropped'
multiseq_type_by_groups = {
    frozenset(['Neural', 'NMPs']): 'NMPs-Neural',
    frozenset(['Somitic', 'NMPs']): 'NMPs-Somitic',
    frozenset(['Neural', 'Somitic', 'NMPs']): 'NMPs-Bipotent',
    frozenset(['NMPs']): 'NMPs-Renewing',
}

for barcode in TLS_barcodes + TLSCL_barcodes:
    treeFile = "/Genomics/chanlab/blaw/TLS/data/AM-DNA-258/lineage/3_lineage_reconstruction/{}/{}_newick_noMutationlessEdges_Labeled.nwk".format(barcode, barcode)
    t = Tree(treeFile, format = 1)

    for node in t.traverse():
        # only do analysis on non leaves
        if node.is_leaf():
            continue

        # Siblings are the leaves that are direct children of this node (sibling subtrees are ignored)
        leaves = [subnode.name for subnode in node.children if subnode.is_leaf()]
        leaf_states = [TLS_M_cell_state_table.loc[leaf, 'cell_state'] for leaf in leaves]
        if 'NMPs' not in leaf_states:
            continue

        # Every NMP under this parent shares the same siblings, so classify once per parent.
        # States outside the groups above do not affect the type but still count in len(leaves).
        states = set()
        neural_count = 0
        somitic_count = 0
        for state in leaf_states:
            if state in multiseq_somitic_states:
                states.add('Somitic')
                somitic_count += 1
            elif state in multiseq_neural_states:
                states.add('Neural')
                neural_count += 1
            elif state == 'NMPs':
                states.add('NMPs')
            elif state in multiseq_drop_states:
                states.add('Drop')

        NMP_type = multiseq_type_by_groups.get(frozenset(states), 'Dropped')
        frac_somitic = somitic_count / len(leaves)
        frac_neural = neural_count / len(leaves)
        # Pseudocount of 1 on both sides keeps the ratio defined when there are no neural siblings
        somite_neural_ratio = (somitic_count + 1) / (neural_count + 1)

        for leaf, state in zip(leaves, leaf_states):
            if state != 'NMPs':
                continue
            obs_name = 'TLS_TLSCL_' + leaf
            # Assigning through .loc with an unknown label would append a new row to obs,
            # so leaves that are not in tree_NMPs are skipped instead
            if obs_name not in tree_NMPs.obs.index:
                continue
            tree_NMPs.obs.loc[obs_name, 'frac_Somitic'] = frac_somitic
            tree_NMPs.obs.loc[obs_name, 'frac_Neural'] = frac_neural
            tree_NMPs.obs.loc[obs_name, 'Somite_Neural_ratio'] = somite_neural_ratio
            tree_NMPs.obs.loc[obs_name, 'NMP Type'] = NMP_type

In [14]:
# NMP specific density plots after filtering the NMPs that share siblings that contain endoderm, endothelial, or PGC
# classifying these NMPs using only sibling cells (not taking into account sibling nodes)
fig, ax = plt.subplots(2, 2, figsize = (20, 10))
order = ['NMPs-Bipotent', 'NMPs-Somitic', 'NMPs-Neural', 'NMPs-Renewing']
# Fix one color per NMP type (the default cycle, in `order`) so a type keeps its color on every
# panel even when another type has no cells on that panel
type_colors = dict(zip(order, ['C0', 'C1', 'C2', 'C3']))
fig.suptitle('Classifying NMPs using only sibling cells') # or plt.suptitle('Main title')

NMP_obs = tree_NMPs.obs
is_multiseq = NMP_obs['orig.ident'] == 'TLS_TLSCL'

# (axes, title, row mask) for each panel
type_panels = [
    (ax[0, 0], 'TLS1', NMP_obs['orig.ident'] == 'TLS_120h_rep1'),
    (ax[0, 1], 'TLS2', NMP_obs['orig.ident'] == 'TLS_120h_rep2'),
    (ax[1, 0], 'TLS MultiSeq', is_multiseq & (NMP_obs['TLSCL'] == 'TLS')),
    (ax[1, 1], 'TLSCL MultiSeq', is_multiseq & (NMP_obs['TLSCL'] == 'TLSCL')),
]

for panel_ax, panel_title, panel_mask in type_panels:
    temp = NMP_obs[panel_mask]
    for i in order:
        type_values = temp[(temp['NMP Type'] == i)]['NT-Somite_Traj']
        # Skip empty groups on every panel, and attach the label to the curve itself:
        # a positional legend list shifts onto the wrong curves whenever a density is not drawn
        if len(type_values) > 0:
            sns.kdeplot(type_values, ax = panel_ax, color = type_colors[i], label = i + ' n = ' + str(len(type_values)))
    panel_ax.legend()
    panel_ax.set_title(panel_title)
    panel_ax.set_xlim(-10, 8)
    panel_ax.set_ylim(0, 0.5)

plt.tight_layout()
plt.savefig('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/2_pseudotime/pseudotime_NMP_type_density.pdf', dpi = 300)
#plt.show()
plt.close()



# Plot NMP types over the pseudotime umap

In [15]:
# Plot each dataset's tree NMPs over the UMAP of the whole integrated object
temp = tree_NMPs.copy()

fig, axs = plt.subplots(1, 4, figsize=(20, 5))

# (orig.ident, multiseq condition) per panel; None means the dataset is not split by condition
umap_panels = [('TLS_120h_rep1', None),
               ('TLS_120h_rep2', None),
               ('TLS_TLSCL', 'TLS'),
               ('TLS_TLSCL', 'TLSCL')]

for ax, (ident, condition) in zip(axs, umap_panels):
    # Every cell of the integrated object as a gray background
    ax.plot(TLS_integrated.obsm['X_umap'][:,0], TLS_integrated.obsm['X_umap'][:,1], '.', color= "lightgray", alpha = 0.8, ms = 5, rasterized = True, markeredgewidth = 0.0)

    panel_mask = temp.obs['orig.ident'] == ident
    panel_title = ident + ' NMPs'
    if condition is not None:
        panel_mask = panel_mask & (temp.obs['TLSCL'] == condition)
        # The two multiseq panels share an orig.ident, so name the condition in the title
        # (both panels were previously titled 'TLS_TLSCL NMPs')
        panel_title = ident + ' (' + condition + ') NMPs'
    temp_ident = temp[panel_mask].copy()

    ax.plot(temp_ident.obsm['X_umap'][:,0], temp_ident.obsm['X_umap'][:,1], '.', alpha = 1, ms = 5, markeredgewidth = 0.0)
    ax.set_title(panel_title)
    ax.axis([-15, 11, -15, 15])
    ax.set_yticks([])
    ax.set_xticks([])

plt.savefig('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/2_pseudotime/umap_NMP_types.pdf', dpi = 300)
#plt.show()
plt.close()

In [16]:
# Marker color of each NMP class
colors = dict([
    ('NMPs-Renewing', 'turquoise'),
    ('NMPs-Neural', 'green'),
    ('NMPs-Somitic', 'magenta'),
    ('NMPs-Bipotent', 'blue'),
    ('Dropped', 'gray'),
])

# Tree NMPs of the multiseq TLS samples only
from_multiseq = tree_NMPs.obs['orig.ident'] == 'TLS_TLSCL'
from_TLS = tree_NMPs.obs['TLSCL'] == 'TLS'
temp = tree_NMPs[from_multiseq & from_TLS].copy()

fig, ax = plt.subplots(figsize=(5, 5))

# Gray background: every cell of the integrated object
background_umap = TLS_integrated.obsm['X_umap']
ax.plot(background_umap[:,0], background_umap[:,1], '.', color= "lightgray", alpha = 0.8, ms = 5, rasterized = True, markeredgewidth = 0.0)

# Later classes are drawn on top of earlier ones, so 'Dropped' goes first
NMP_draw_order = ['Dropped', 'NMPs-Renewing', 'NMPs-Neural', 'NMPs-Bipotent', 'NMPs-Somitic']
for NMP in NMP_draw_order:
    NMP_temp = temp[temp.obs['NMP Type'] == NMP].copy()
    NMP_umap = NMP_temp.obsm['X_umap']
    ax.plot(
        NMP_umap[:,0], NMP_umap[:,1], '.',
        alpha = 1,
        color = colors[NMP],
        ms = 7.5,
        markeredgecolor = 'black',
        markeredgewidth = 0.25,
        label = NMP,
    )

ax.set_title('NMP Types')
ax.legend()
ax.axis([-15, 11, -15, 15])
ax.set_yticks([])
ax.set_xticks([])

plt.savefig('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/2_pseudotime/umap_combined_TLS-M_NMP_types.pdf', dpi = 300)
#plt.show()
plt.close()

In [17]:
# Tree NMPs of the multiseq TLS samples, drawn on top of the pseudotime-colored UMAP
temp = tree_NMPs[(tree_NMPs.obs['orig.ident'] == 'TLS_TLSCL') & (tree_NMPs.obs['TLSCL'] == 'TLS')].copy()

fig, ax = plt.subplots(1, 1, figsize=(6, 5))

# Gray background: every cell of the integrated object
integrated_umap = TLS_integrated.obsm['X_umap']
ax.scatter(integrated_umap[:,0], integrated_umap[:,1], color= "lightgray", alpha = 0.5, s = 5)

# Cells colored by pseudotime: everything except the states excluded below.
# The mask is computed once instead of rebuilding it (and re-subsetting the object) for x, y and z.
# NOTE(review): this cell filtered on 'PCGLCs' while the NMP classification cells use 'PCGLC';
# both spellings are excluded here until the label used in cell_state is confirmed.
excluded_states = ['Endoderm', 'Unknown', 'Endothelial', 'PCGLCs', 'PCGLC']
on_pseudotime = ~TLS_integrated.obs['cell_state'].isin(excluded_states)

x = integrated_umap[on_pseudotime.values, 0]
y = integrated_umap[on_pseudotime.values, 1]
z = TLS_integrated.obs.loc[on_pseudotime, 'NT-Somite_Traj']
points = ax.scatter(x, y, c = z, cmap = 'viridis', alpha = 0.5, s = 5)

# Later classes are drawn on top of earlier ones, so 'Dropped' goes first
for NMP in ['Dropped', 'NMPs-Renewing', 'NMPs-Neural', 'NMPs-Bipotent', 'NMPs-Somitic']:
    NMP_temp = temp[temp.obs['NMP Type'] == NMP].copy()
    ax.plot(NMP_temp.obsm['X_umap'][:,0], NMP_temp.obsm['X_umap'][:,1], '.', color = colors[NMP], alpha = 0.8, markersize = 7.5, markeredgecolor = 'black', markeredgewidth = 0.5, label = NMP)

ax.set_title('NMPs on a Tree')
ax.legend()
ax.axis([-15, 11, -15, 15])
ax.set_yticks([])
ax.set_xticks([])
# Colorbar for the pseudotime-colored scatter
fig.colorbar(points)

plt.savefig('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/2_pseudotime/umap_combined_TLS-M_NMP_types_over_pseudotime.pdf', dpi = 300)
#plt.show()
plt.close()

# Investigating monocle modules across the pseudotime

- From monocle, I have created a dataframe of gene modules that vary significantly across the pseudotime. This table can be found here:

'/Genomics/chanlab/blaw/TLS/raw_data/monocle_results/TLS_TLSCL_1_2_Time_Integrated_monocle_full_subset_gene_modules_high_res_genes_above_0.1morans.txt'

In [18]:
# Load the monocle gene-module table; the first column of the file becomes the index.
# Later cells read its 'module' and 'id' columns.
module_genes_df = pd.read_csv('/Genomics/chanlab/blaw/TLS/raw_data/monocle_results/TLS_TLSCL_1_2_Time_Integrated_monocle_full_subset_gene_modules_high_res_genes_above_0.1morans.txt', index_col = 0)

In [19]:
# Every distinct module label present in the monocle table
modules = module_genes_df['module'].unique()

# Score each module with scanpy's score_genes; the result is stored per cell
# under the name 'module_<label>_score'
for temp_module in modules:
    belongs_to_module = module_genes_df['module'] == temp_module
    gene_ids = module_genes_df.loc[belongs_to_module, 'id']
    # the control set is sized to match the number of genes in the module
    sc.tl.score_genes(
        NMP_states,
        gene_ids,
        ctrl_size = len(gene_ids),
        score_name = f'module_{temp_module}_score',
        use_raw = False,
    )

  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df

  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df

  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  for cut in np.unique(obs_cut.loc[gene_list]):
  df

In [20]:
# One UMAP panel per module score, for modules 1 through 28 in numeric order
module_score_keys = ['module_{}_score'.format(module_number) for module_number in range(1, 29)]

with rc_context({'figure.figsize': (5, 5)}):
    sc.pl.umap(NMP_states, color = module_score_keys, save = '_module_scores', show = False)

In [21]:
# investigate module scores vs NMP types in TLS M
temp = NMP_states[(NMP_states.obs['orig.ident'] == 'TLS_TLSCL') & (NMP_states.obs['TLSCL'] == 'TLS')].copy()

# Cells that are absent from tree_NMPs keep this placeholder label
temp.obs['NMP Type'] = 'Non-Tree'

# Copy the tree-derived label for every cell present in both objects with one
# vectorised assignment instead of a per-cell Python loop.
# Assumes cell barcodes are unique in both indexes - TODO confirm.
on_tree = temp.obs.index.isin(tree_NMPs.obs.index)
tree_labels = tree_NMPs.obs.loc[temp.obs.index[on_tree], 'NMP Type']
# astype(object) so the values are written as plain labels whatever dtype the source column has
temp.obs.loc[on_tree, 'NMP Type'] = tree_labels.astype(object).to_numpy()

  df_sub[k].cat.remove_unused_categories(inplace=True)


In [22]:
# Box plots of selected module scores across the somitic and neural NMP types,
# together with the aPSM / pPSM and NeuralTube1 / NeuralTube2 cell states.
cell_states = ['aPSM', 'pPSM', 'NMPs-Somitic', 'NMPs-Neural', 'NeuralTube1', 'NeuralTube2']

for module in [14, 7, 1, 4]:
    score_column = 'module_{}_score'.format(module)

    data = []
    for i in cell_states:
        # NMP subtypes are looked up in 'NMP Type'; every other state is read from 'cell_state'
        group_column = 'NMP Type' if i.startswith('NMPs') else 'cell_state'
        data.append(temp.obs[temp.obs[group_column] == i][score_column])

    fig, ax = plt.subplots(figsize = (10, 5))

    ax.boxplot(data, labels = cell_states)
    ax.set_title('Module {} Score - TLS M'.format(module))
    # The boxes are vertical, so the module score is on the y axis (it was previously set as the x label)
    ax.set_ylabel('Module Score')
    fig.tight_layout()
    fig.savefig('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/2_pseudotime/module_{}_boxplots_NMP_Types_with_bookends.pdf'.format(module), dpi = 300)
    #plt.show()
    plt.close(fig)

In [23]:
# plot modules for just NMP types
cell_states = ['NMPs-Renewing', 'NMPs-Somitic', 'NMPs-Neural', 'NMPs-Bipotent']

for module in [14, 7, 1, 4]:
    score_column = 'module_{}_score'.format(module)

    # Every entry of cell_states is an NMP type, so all groups are read from 'NMP Type'
    # (the former 'cell_state' fallback branch could never run for this list)
    data = [temp.obs[temp.obs['NMP Type'] == i][score_column] for i in cell_states]

    fig, ax = plt.subplots(figsize = (10, 5))

    ax.boxplot(data, labels = cell_states)
    ax.set_title('Module {} Score - TLS M'.format(module))
    # The boxes are vertical, so the module score is on the y axis (it was previously set as the x label)
    ax.set_ylabel('Module Score')
    fig.tight_layout()
    fig.savefig('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/2_pseudotime/module_{}_boxplots_NMP_Types.pdf'.format(module), dpi = 300)
    #plt.show()
    plt.close(fig)

# Correlate the module score with the ratio of somite / neural

In [24]:
# Look at only TLSM cells that are on the tree
test_tree_cells = temp.obs.index[temp.obs.index.isin(tree_NMPs.obs.index)].tolist()

temp = temp[temp.obs.index.isin(test_tree_cells)].copy()

# Copy the per-cell tree statistics across in one vectorised lookup per column.
# This keeps the dtype of the source columns rather than filling ''-initialised
# (object dtype) columns one cell at a time.
# Assumes cell barcodes are unique in tree_NMPs.obs - TODO confirm.
for tree_column in ['frac_Neural', 'frac_Somitic', 'Somite_Neural_ratio']:
    temp.obs[tree_column] = tree_NMPs.obs.loc[temp.obs.index, tree_column].to_numpy()

  df_sub[k].cat.remove_unused_categories(inplace=True)


In [25]:
# test the correlation between the somite neural ratio and module score in TLS M tree NMPs
# dtype = float keeps every column numeric instead of the object dtype of an empty frame
module_corr = pd.DataFrame(index = modules, columns = ['slope', 'statistic', 'pval', 'adj_pval'], dtype = float)

# The response variable is the same for every module, so extract it once
ratio_values = temp.obs['Somite_Neural_ratio'].tolist()

for temp_module in module_corr.index:
    # regress the somite / neural ratio (y) on the module score (x)
    fit = stats.linregress(temp.obs['module_{}_score'.format(temp_module)].tolist(), ratio_values)

    module_corr.loc[temp_module, 'statistic'] = fit.rvalue
    module_corr.loc[temp_module, 'pval'] = fit.pvalue
    module_corr.loc[temp_module, 'slope'] = fit.slope

# adjust the pval using multiple hypothesis testing (Bonferroni: multiply by the number of
# modules tested); the result is capped at 1 because a p-value cannot exceed 1
module_corr['adj_pval'] = (module_corr['pval'] * len(module_corr.index)).clip(upper = 1)

module_corr.to_csv('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/2_pseudotime/TreeNMPs_TLSM_module_correlations_full_pseudotime_high_res_genes_above_0.1morans.txt', sep = '\t')

In [26]:
# Bar chart of the regression slope of every module, ordered from lowest to highest slope
slopes_sorted = module_corr.sort_values(by = 'slope')['slope']

# module labels are turned into strings so the x axis is categorical
bars = [str(module_label) for module_label in slopes_sorted.index]
heights = slopes_sorted.tolist()

fig, ax = plt.subplots(figsize = (10, 5))
ax.bar(x = bars, height = heights)
ax.set_xlabel('module number')
ax.set_ylabel('Slope')
ax.set_title('Module Correlation Slopes')
fig.savefig('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/2_pseudotime/module_slope_full_pseudotime_high_res_genes_above_0.1morans.pdf', dpi = 300)
#plt.show()
plt.close()

# Investigating gene expression of NMPs over the pseudotime

- In this section I will extract the average expression of NMPs across the pseudotime for genes identified in the module analysis

- I will focus on the -10 to 8 range in the pseudotime, removing non-NMPs from the dataset

- I will average the expression of all cells in 1 step bins over the pseudotime

- Remove the 96h dataset and remove the TLSCL dataset

In [27]:
# Select the NMPs used for the pseudotime expression analysis. Each criterion is
# named separately and all of them must hold for a cell to be kept.
nmp_obs = NMPs.obs

# datasets retained for this analysis
from_kept_dataset = nmp_obs['orig.ident'].isin(['mGast_120h', 'mGast_108h', 'TLS_120h_rep1', 'TLS_120h_rep2', 'TLS_TLSCL'])
# pseudotime window, inclusive at both ends
in_pseudotime_window = nmp_obs['NT-Somite_Traj'].between(-10, 8)
# drop cells labelled TLSCL
not_TLSCL = nmp_obs['TLSCL'] != 'TLSCL'
# drop cells whose MultiSeqGroup is 'Doublet' or 'Negative'
assigned_multiseq = (nmp_obs['MultiSeqGroup'] != 'Doublet') & (nmp_obs['MultiSeqGroup'] != 'Negative')

Mature_NMPs = NMPs[from_kept_dataset & in_pseudotime_window & not_TLSCL & assigned_multiseq].copy()

In [29]:
# Cell barcodes ordered by increasing pseudotime
ordered_cells = list(Mature_NMPs.obs.sort_values(by = 'NT-Somite_Traj').index)

# split the cells into 1 step bins along the pseudotime
# keys are '<lower>_<upper>' strings ('-10_-9' ... '7_8'); values are lists of cell barcodes
itr_dict = {}

pseudotime = Mature_NMPs.obs['NT-Somite_Traj']
first_edge, last_edge = -10, 8

for lower in range(first_edge, last_edge):
    upper = lower + 1
    if upper == last_edge:
        # Mature_NMPs keeps cells with pseudotime <= 8, so the final bin is closed on the
        # right; otherwise a cell sitting exactly on the upper edge would fall in no bin
        in_bin = (pseudotime >= lower) & (pseudotime <= upper)
    else:
        # all other bins are half-open: [lower, upper)
        in_bin = (pseudotime >= lower) & (pseudotime < upper)

    # the label gets its own name so the AnnData called `temp` is not overwritten by a string
    bin_label = str(lower) + '_' + str(upper)
    itr_dict[bin_label] = list(Mature_NMPs.obs[in_bin].index)

with open('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/2_pseudotime/NMPs_pseudotime_-10-8_1_step_bins.pickle', 'wb') as handle:
    pickle.dump(itr_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [30]:
# Calculate the avg expression of each of the 1 step bins
# rows: pseudotime bins (keys of itr_dict); columns: the 'features' entries of Mature_NMPs.var
avg_itr_expr_df = pd.DataFrame(columns = Mature_NMPs.var['features'], index = itr_dict.keys())

for bin_label, bin_cells in itr_dict.items():
    in_bin = Mature_NMPs.obs.index.isin(bin_cells)
    bin_adata = Mature_NMPs[in_bin].copy()

    # densify the bin's expression matrix and average over its cells (axis 0)
    avg_itr_expr_df.loc[bin_label] = bin_adata.X.todense().mean(axis = 0)

avg_itr_expr_df.to_csv('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/2_pseudotime/NMPs_pseudotime_-10-8_1_step_avg_expr.txt', sep = '\t')

# Plot the expression of these bins for NMPs

In [31]:
# Reload the per-bin average expression table from the tab-separated file written above;
# the first column (the bin labels) becomes the index.
avg_itr_expr_df = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/2_pseudotime/NMPs_pseudotime_-10-8_1_step_avg_expr.txt', sep = '\t', index_col = 0)

In [32]:
# calculate the zscores of genes
# each column (gene) is standardised across the rows (pseudotime bins)
mean_vals = avg_itr_expr_df.mean(axis = 0)
std_vals = avg_itr_expr_df.std(axis = 0)

# NaN results (e.g. 0 / 0 when a gene has zero spread across the bins) are replaced by 0
z_itr_expr_df = (
    avg_itr_expr_df
    .sub(mean_vals, axis = 'columns')
    .div(std_vals, axis = 'columns')
    .fillna(0)
)

In [35]:
# Heatmap of the binned z-scores for the genes of modules 14, 4 and 7 (in that order);
# module 1 is left out of this figure.
module_gene_lists = {
    module_number: module_genes_df.loc[module_genes_df['module'] == module_number, 'id'].tolist()
    for module_number in (14, 4, 7)
}
genes_14 = module_gene_lists[14]
genes_4 = module_gene_lists[4]
genes_7 = module_gene_lists[7]
genes_of_interest = genes_14 + genes_4 + genes_7

# keep only the selected genes, in module order
temp = z_itr_expr_df.loc[:, genes_of_interest].copy()

# Fixed colour limits (rather than the data min / max), with the colour scale centred on zero
vmin, vcenter, vmax = -2, 0, 3
normalize = mcolors.TwoSlopeNorm(vmin = vmin, vcenter = vcenter, vmax = vmax)
colormap = cm.RdYlBu_r

# Clustering is switched off on both axes, so genes and bins keep their given order
g = sns.clustermap(
    temp.T,
    figsize = (10, 60),
    row_cluster = False,
    col_cluster = False,
    norm = normalize,
    cmap = colormap,
)
plt.subplots_adjust(bottom = 0.3, right = 0.7)
plt.savefig('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/2_pseudotime/module_14-4-7_NMPs_pseudotime_high_res_genes_above_0.1morans.pdf', dpi = 300)
#plt.show()
plt.close()