In [42]:
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import pickle
import itertools

from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from ete3 import Tree
from typing import Tuple

In [43]:
# Colour codes for the cell states, stored as a pickled dict.
# NOTE(review): pickle.load can execute arbitrary code, so only point this at a trusted file.
clusterColorsFile = "/Genomics/chanlab/mchan/Adriano/TLS/TLS_TLSCL/20211102_clusterColorsTLSCL.p"
with open(clusterColorsFile, mode = 'rb') as fp:
    colorDict = pickle.load(fp)

# Colour codes for the progenitor classes (insertion order doubles as the canonical progenitor order)
progenitor_colorDict = {
    'Extended Progenitor': 'Black',
    'Pluripotent Progenitor': 'Orange',
    'Bipotent Progenitor': 'MediumBlue',
    'Neural Progenitor': 'DarkGreen',
    'Somitic Progenitor': 'Purple',
    'PGCLC Progenitor': 'Red',
    'Endoderm Progenitor': 'Gold',
    'Dropped': 'Gray',
}

# Cell state annotation for every cellBC (tab-separated)
total_cell_state_table = pd.read_csv(
    '/Genomics/chanlab/blaw/TLS/metadata/TLS_Explant_Total_cellBC_cellState.tst',
    sep = '\t',
)

# Helpful lists
barcodes = ['Bar1', 'Bar2', 'Bar3', 'Bar4', 'Bar5', 'Bar6']
# The first three barcodes are the TLS replicates, the last three the TLSCL replicates
TLS_barcodes = barcodes[:3]
TLSCL_barcodes = barcodes[3:]
# Same names, in the same order, as the keys of progenitor_colorDict
progenitor_list = list(progenitor_colorDict)
colors = [
    '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
    '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
    '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1',
    '#000075', '#808080', '#ffffff', '#000000',
]

# Helpful functions

In [44]:
def maxDepth(node):
    '''
    Measure how many levels of descendants hang below a node.

    Input:
        - a node in an ete tree (anything exposing is_leaf() and .children)
    returns:
        - 0 for a leaf, otherwise 1 + the largest maxDepth among the node's children
    '''
    if node.is_leaf():
        return 0

    # The deepest child decides the depth of this node
    return 1 + max(maxDepth(child) for child in node.children)

In [45]:
def countNMP_Somite_Neural_other(node, cell_state_table):
    '''
    Tally the annotated cell states of the leaves below a node.

    Input:
        - a node in an ete tree
        - a table of cell states for each cellBC (uses the 'cellBC' and 'cell_state' columns)
    return:
        - A tuple of counts in the order
          (NMP, somitic, neural, PGC, endoderm, endothelial, unknown).
          Leaves missing from the table, and states outside these categories, are not counted.
    '''
    # Map every recognized cell state onto the category it is counted under
    category_of = {'NMPs': 'NMP',
                   'Endothelial': 'endothelial',
                   'Endoderm': 'endoderm',
                   'PCGLC': 'pgc',
                   'Unknown': 'unknown'}
    for somitic_state in ('pPSM', 'aPSM', 'Somite', 'Somite0', 'Somite1', 'Somite-1', 'SomiteSclero', 'SomiteDermo'):
        category_of[somitic_state] = 'somitic'
    for neural_state in ('NeuralTube1', 'NeuralTube2'):
        category_of[neural_state] = 'neural'

    tallies = dict.fromkeys(['NMP', 'somitic', 'neural', 'pgc', 'endoderm', 'endothelial', 'unknown'], 0)

    # One table row per annotated cell that is a leaf of this node
    leaf_names = [leaf.name for leaf in node.get_leaves()]
    in_node = cell_state_table['cellBC'].isin(leaf_names)
    for state in cell_state_table.loc[in_node, 'cell_state'].to_list():
        category = category_of.get(state)
        if category is not None:
            tallies[category] += 1

    return (tallies['NMP'], tallies['somitic'], tallies['neural'], tallies['pgc'],
            tallies['endoderm'], tallies['endothelial'], tallies['unknown'])

In [46]:
def countTimepoint(node):
    '''
    Count the leaves of a node that come from each explant timepoint.

    Input:
        - a node in an ete tree. Assumes that the leaf names contain the explant timepoint information:
          120h leaves start with Tracer_Explant, 144h leaves start with Tracer_Outgrowth
          (e.g. Tracer_Outgrowth_1 or Tracer_Outgrowth_2)
    return:
        - A tuple (number of 120h cells, number of 144h cells) for the node.
          Leaves with neither prefix are not counted.
    '''
    leaf_names = [leaf.name for leaf in node.get_leaves()]

    # The timepoint is read off the prefix of each leaf name
    count_120 = sum(1 for name in leaf_names if name.startswith('Tracer_Explant'))
    count_144 = sum(1 for name in leaf_names
                    if not name.startswith('Tracer_Explant') and name.startswith('Tracer_Outgrowth'))

    return (count_120, count_144)

In [47]:
def getProgenitorType(node, cell_state_table):
    '''
    Classify a node as a progenitor type from the cell states of its leaves.

    input:
        - a tree node to test
        - a table that contains the annotated cell state for each cellBC
    output:
        - the name of the progenitor class, or 'Dropped' when the set of states fits no class

    Classes (by the exact set of grouped states found under the node):
    - Extended progenitors: PGCs, Endoderm, Somitic, Neural
    - Pluripotent progenitors: Endoderm, Somitic, Neural
    - Bipotent progenitors: Somitic, Neural
    - Endoderm progenitors: Endoderm only
    - PGCLC progenitors: PGCLC only
    - Somitic progenitors: somitic only
    - Neural progenitors: neural only

    Neural class is made from NeuralTube1 and NeuralTube2
    Somite class is pPSM, aPSM, Somite-1, Somite0, Somite, Somite1, SomiteSclero, SomiteDermo

    NMPs and Unassigned / Unknown cells are ignored: their presence or absence never changes the class.

    Endothelial is tolerated (+/-) in extended and pluripotent progenitors only;
    any other class that also contains Endothelial cells is 'Dropped'.

    Every combination not listed above (e.g. Endoderm with only Somitic, or PGC without
    Endoderm, Somitic and Neural) is 'Dropped'.
    '''
    somitic_states = ('pPSM', 'aPSM', 'Somite', 'Somite0', 'Somite1', 'Somite-1', 'SomiteSclero', 'SomiteDermo')
    neural_states = ('NeuralTube1', 'NeuralTube2')
    kept_as_is = ('PCGLC', 'Endoderm', 'Endothelial')

    # Exact state set -> progenitor class; the Endothelial variants are listed explicitly
    class_of = {
        frozenset(['PCGLC', 'Endoderm', 'Somitic', 'Neural']): 'Extended Progenitor',
        frozenset(['PCGLC', 'Endoderm', 'Somitic', 'Neural', 'Endothelial']): 'Extended Progenitor',
        frozenset(['Endoderm', 'Somitic', 'Neural']): 'Pluripotent Progenitor',
        frozenset(['Endoderm', 'Somitic', 'Neural', 'Endothelial']): 'Pluripotent Progenitor',
        frozenset(['Somitic', 'Neural']): 'Bipotent Progenitor',
        frozenset(['Endoderm']): 'Endoderm Progenitor',
        frozenset(['PCGLC']): 'PGCLC Progenitor',
        frozenset(['Somitic']): 'Somitic Progenitor',
        frozenset(['Neural']): 'Neural Progenitor',
    }

    # Cell states of the annotated cells that are leaves of this node
    leaf_names = [leaf.name for leaf in node.get_leaves()]
    in_node = cell_state_table['cellBC'].isin(leaf_names)

    # Collapse the individual states into the grouped categories, ignoring everything else
    observed = set()
    for state in cell_state_table.loc[in_node, 'cell_state'].to_list():
        if state in somitic_states:
            observed.add('Somitic')
        elif state in neural_states:
            observed.add('Neural')
        elif state in kept_as_is:
            observed.add(state)

    return class_of.get(frozenset(observed), 'Dropped')

In [48]:
def getProgenitorType_FC(node, cell_state_table):
    '''
    Label a node with the full combination of cell states found among its leaves.

    input:
        - a tree node to test
        - a table that contains the annotated cell state for each cellBC
    output:
        - the states present under the node, joined with '_' in the fixed order
          PCGLC, Endoderm, Endothelial, NMPs, Somitic, Neural
          (e.g. 'NMPs_Somitic'), or 'Dropped' when none of the 6 states is present

    The 6 recorded states are:
    - PGCLC
    - Endoderm
    - Endothelial
    - NMPs
    - Somitic
    - Neural

    Neural class is made from NeuralTube1 and NeuralTube2
    Somite class is pPSM, aPSM, Somite-1, Somite0, Somite, Somite1, SomiteSclero, SomiteDermo

    Unassigned / Unknown cells are not looked at for this classification (+/-)
    '''
    # Canonical order of the states inside a label
    ordered_states = ('PCGLC', 'Endoderm', 'Endothelial', 'NMPs', 'Somitic', 'Neural')
    somitic_states = ('pPSM', 'aPSM', 'Somite', 'Somite0', 'Somite1', 'Somite-1', 'SomiteSclero', 'SomiteDermo')
    neural_states = ('NeuralTube1', 'NeuralTube2')
    kept_as_is = ('PCGLC', 'Endoderm', 'Endothelial', 'NMPs')

    # Cell states of the annotated cells that are leaves of this node
    leaf_names = [leaf.name for leaf in node.get_leaves()]
    in_node = cell_state_table['cellBC'].isin(leaf_names)

    # Collapse the individual states into the 6 recorded categories
    observed = set()
    for state in cell_state_table.loc[in_node, 'cell_state'].to_list():
        if state in somitic_states:
            observed.add('Somitic')
        elif state in neural_states:
            observed.add('Neural')
        elif state in kept_as_is:
            observed.add(state)

    # A node without any of the recorded states has no label
    if not observed:
        return 'Dropped'

    # Every non-empty subset of the 6 states has exactly one label: its members in canonical order
    return '_'.join(state for state in ordered_states if state in observed)

# Create a node table across all experiments

I will be doing the progenitor analysis using only hybrid trees

In [8]:
method = 'hybrid'
node_columns = ['barcode', 'TLS', 'method', 'node_name', 'node_size', 'clone_name', 'clone_size', 'clone_time',
                'max_clone_depth', 'dist_to_root', 'norm_dist_to_clone', 'frac_NMP', 'frac_somitic', 'frac_neural',
                'frac_PGC', 'frac_endoderm', 'frac_endothelial', 'frac_unknown', 'frac_120', 'frac_144',
                'progenitor_type', 'progenitor_type_FC', 'node_time']
node_info = pd.DataFrame(columns = node_columns)

for barcode in barcodes:
    # Load tree
    treeFile = '/Genomics/chanlab/blaw/TLS/data/explant/lineage/3_lineage_reconstruction/{}/120_144/hybrid/{}_120_144_hybrid_newick_noMutationlessEdges_Labeled.nwk'.format(barcode, barcode)
    t = Tree(treeFile, format = 1)

    # Add an empty row for every internal node of the tree (except node0) to the node_info dataframe
    node_names = ['{}_{}'.format(barcode, node.name) for node in t.traverse()
                  if node.name != 'node0' and not node.is_leaf()]

    temp_node_info = pd.DataFrame(index = node_names, columns = node_columns)
    node_info = pd.concat((node_info, temp_node_info))

    # Fail loudly instead of silently reusing the label of the previous barcode
    if barcode in TLS_barcodes:
        TLS = 'TLS'
    elif barcode in TLSCL_barcodes:
        TLS = 'TLSCL'
    else:
        raise ValueError('{} is in neither TLS_barcodes nor TLSCL_barcodes'.format(barcode))

    # fill in node information, don't keep the root nodes since we know that these structures started from multiple cells
    for clone in t.children:
        # Values shared by every node of the clone are computed once per clone
        clone_name = '{}_{}'.format(barcode, clone.name)
        clone_size = len(clone.get_leaves())
        clone_max_depth = maxDepth(clone)
        clone_120, clone_144 = countTimepoint(clone)
        if clone_120 != 0 and clone_144 != 0:
            clone_time = '120_144'
        elif clone_120 != 0:
            clone_time = '120'
        elif clone_144 != 0:
            clone_time = '144'

        for node in clone.traverse():
            if node.is_leaf():
                continue

            ID = '{}_{}'.format(barcode, node.name)

            NMP_count, somitic_count, neural_count, pgc_count, endoderm_count, endothelial_count, unknown_count = countNMP_Somite_Neural_other(node, total_cell_state_table)
            count_120, count_144 = countTimepoint(node)

            # NOTE: both totals are assumed to be non-zero, i.e. every internal node has at least one
            # leaf with a counted cell state and at least one leaf with a recognized timepoint prefix;
            # otherwise the divisions below raise ZeroDivisionError
            total_time = count_120 + count_144
            total = somitic_count + NMP_count + neural_count + pgc_count + endoderm_count + endothelial_count + unknown_count

            frac_120 = count_120 / total_time
            frac_144 = count_144 / total_time

            if frac_120 != 0 and frac_144 != 0:
                timepoint = '120_144'
            elif frac_120 != 0:
                timepoint = '120'
            elif frac_144 != 0:
                timepoint = '144'

            # Record node information to the large table, one column at a time
            record = {
                'barcode': barcode,
                'TLS': TLS,
                'method': method,
                'node_name': ID,
                'node_size': len(node.get_leaves()),
                'clone_name': clone_name,
                'clone_size': clone_size,
                'clone_time': clone_time,
                'max_clone_depth': clone_max_depth,
                'dist_to_root': t.get_distance(t, node),
                # distance from the top of the clone, scaled by the depth of the clone
                'norm_dist_to_clone': t.get_distance(clone, node) / clone_max_depth,
                'frac_NMP': NMP_count / total,
                'frac_somitic': somitic_count / total,
                'frac_neural': neural_count / total,
                'frac_PGC': pgc_count / total,
                'frac_endoderm': endoderm_count / total,
                'frac_endothelial': endothelial_count / total,
                'frac_unknown': unknown_count / total,
                'frac_120': frac_120,
                'frac_144': frac_144,
                'progenitor_type': getProgenitorType(node, total_cell_state_table),
                'progenitor_type_FC': getProgenitorType_FC(node, total_cell_state_table),
                'node_time': timepoint,
            }
            for column, value in record.items():
                node_info.loc[ID, column] = value

# add a column to record if a node would be trimmed via the time or progenitor state
progenitor trimming:
- a node would be trimmed if the node is unipotent and its parent is unipotent

time trimming:
- a node would be trimmed if the node is a singular time (all extant cells are 120 or 144) and its parent node is as well

In [49]:
# parent_dict: node ID -> ID of its parent node (internal, non-root nodes only)
parent_dict = {}
# branches: ID of the last internal node of a branch -> list of the node IDs along that branch
branches = {}

for barcode in barcodes:
    treeFile = '/Genomics/chanlab/blaw/TLS/data/explant/lineage/3_lineage_reconstruction/{}/120_144/hybrid/{}_120_144_hybrid_newick_noMutationlessEdges_Labeled.nwk'.format(barcode, barcode)
    t = Tree(treeFile, format = 1)

    for node in t.traverse():
        if node.is_root() or node.is_leaf():
            continue

        node_id = '{}_{}'.format(barcode, node.name)
        parent_dict[node_id] = '{}_{}'.format(barcode, node.up.name)

        # a node with at least one leaf child is the end of a branch
        # (so some branches are subsets of other branches)
        if any(child.is_leaf() for child in node.children):
            branches[node_id] = []

# fill in the branch information for every key
for end, branch_nodes in branches.items():
    # the key is the last step in the branch
    bar = end.split('_')[0]
    root_id = '{}_node0'.format(bar)

    # walk up towards the root; the root itself is not recorded, branches start at clones
    walked_up = []
    current = end
    while current != root_id:
        top = parent_dict[current]
        walked_up.append(current)
        current = top

    # the walk went bottom-up, the branch is stored top-down
    branch_nodes.extend(reversed(walked_up))

In [10]:
# Trimming flags default to 'No' for every node
node_info['time_trim'] = 'No'
node_info['progenitor_trim'] = 'No'

unipotent_types = ['Somitic Progenitor', 'Neural Progenitor', 'Endoderm Progenitor', 'PGCLC Progenitor']
single_timepoints = ['120', '144']

# Only nodes below the clone level are classified, based on their parent node.
# Clones are never trimmed because they are the first node in the branch.
below_clone_level = node_info[node_info['dist_to_root'] > 1].index
for node in below_clone_level:
    node_name = node_info.loc[node, 'node_name']
    parent_name = parent_dict[node_name]

    # look the parent row up once and read both of its annotations from it
    parent_rows = node_info[node_info['node_name'] == parent_name]
    parent_progenitor = parent_rows['progenitor_type'].to_list()[0]
    parent_time = parent_rows['node_time'].to_list()[0]

    # a unipotent parent marks the node for progenitor trimming
    if parent_progenitor in unipotent_types:
        node_info.loc[node, 'progenitor_trim'] = 'Yes'
    # a parent from a single timepoint marks the node for time trimming
    if parent_time in single_timepoints:
        node_info.loc[node, 'time_trim'] = 'Yes'

In [11]:
# Save the node table (tab-separated, without the index column)
node_info.to_csv('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/TLS_explant_hybrid_node_info.txt', sep = '\t', index = False)

# Save the parent and branch lookups as pickles.
# The paths contain no format placeholders, so they are written as plain strings.
with open('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/TLS_explant_hybrid_parent_dict.pickle', 'wb') as handle:
    pickle.dump(parent_dict, handle, protocol = pickle.HIGHEST_PROTOCOL)
with open('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/TLS_explant_hybrid_branches_dict.pickle', 'wb') as handle:
    pickle.dump(branches, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [50]:
# load node info table
# clone_time and node_time hold the labels '120', '144' and '120_144'. They are read explicitly as
# strings so that the comparisons against these labels below do not depend on read_csv type inference.
node_info = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/TLS_explant_hybrid_node_info.txt',
                        sep = '\t', dtype = {'clone_time': str, 'node_time': str})

# Count how many clones come from each timepoint

- For these counts, we will not count clones that contain only a single cell, because our sampling is not sufficient to capture whether they had interspersed potential

In [13]:
clone_counts_without_scClones = pd.DataFrame(index = barcodes, columns = ['gen_1', 'gen_2', 'interspersed'])

# Output column -> clone_time label it counts
clone_time_of_column = {'gen_1': '120', 'gen_2': '144', 'interspersed': '120_144'}

# Fill in the # of unique clones per barcode per clone timepoint
for barcode in barcodes:
    barcode_nodes = node_info[node_info['barcode'] == barcode]
    for column, time_label in clone_time_of_column.items():
        clone_names = barcode_nodes[barcode_nodes['clone_time'] == time_label]['clone_name']
        clone_counts_without_scClones.loc[barcode, column] = len(clone_names.unique())

clone_counts_without_scClones.to_csv('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/Explant_clone_counts_without_scClones.txt', sep = '\t')

# Remove nodes that belong to clones from only 1 timepoint

In [51]:
# Keep only the nodes whose clone is labelled '120_144' (cells from both timepoints);
# .copy() makes the filtered table independent of node_info
node_info_filtered = node_info[node_info['clone_time'] == '120_144'].copy()

In [15]:
# Save the shared-clone node table (tab-separated, without the index column)
node_info_filtered.to_csv('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/TLS_explant_hybrid_node_info_shared_clones.txt', sep = '\t', index=False)

# Plotting progenitor dynamics

In [20]:
# Distribution of the normalized node depth for each node timepoint (TLS nodes of shared clones only)
fig, ax = plt.subplots(figsize = (5, 3))

temp = node_info_filtered[node_info_filtered['TLS'] == 'TLS'].copy()

# One list of depths per violin, in the order Gen1, Gen2, Combined
data = [temp[temp['node_time'] == node_time]['norm_dist_to_clone'].tolist()
        for node_time in ('120', '144', '120_144')]

sns.violinplot(data = data, ax = ax, scale = 'width', cut = 0)
ax.set_xticklabels(['Gen1', 'Gen2', 'Combined'])
ax.set(title = 'TLS Node Depths', ylabel = 'Norm Dist to Clone', ylim = (0, 1))
plt.tight_layout()
plt.savefig('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/TLS_node_depths.pdf', dpi = 300)
#plt.show()
plt.close()

In [11]:
# 1500 uniform random values between 0 and 1, plotted as an extra violin so that the
# count-scaled violins of all TLS panels share the same reference
random_vals = [np.random.uniform(0, 1) for _ in range(1500)]

# Violin plots of progenitor node depths, one panel per node timepoint
temp_node_info = node_info_filtered[(node_info_filtered['TLS'] == 'TLS')].copy()
fig, ax = plt.subplots(3, 1, figsize = (15, 15))

plotted_progenitors = ['Extended Progenitor', 'Pluripotent Progenitor', 'Bipotent Progenitor', 'Neural Progenitor', 'Somitic Progenitor']
panel_titles = {'120': 'Gen 1', '144': 'Gen 2'}

for temp_ax, time in zip(ax, ['120_144', '120', '144']):
    # anything that is not a single timepoint is an interspersed node
    time_label = panel_titles.get(time, 'Interspersed')

    # depths of the nodes of each progenitor type at this timepoint, then the random reference
    at_this_time = temp_node_info['node_time'] == time
    data = [temp_node_info[(temp_node_info['progenitor_type'] == progenitor) & at_this_time]['norm_dist_to_clone']
            for progenitor in plotted_progenitors]
    data.append(random_vals)

    sns.violinplot(data = data, scale = 'count', ax = temp_ax, cut = 0)
    temp_ax.set_ylim(0, 1)
    temp_ax.set_xticklabels(plotted_progenitors + ['1500_random'])
    temp_ax.set_ylabel('Norm Dist')
    temp_ax.set_title('{}_{}'.format(time_label, 'TLS'))

plt.tight_layout()
plt.savefig('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/TLS_Progenitor_Types_by_Timepoint.pdf', dpi = 300)
#plt.show()
plt.close()

In [53]:
# 750 uniform random values between 0 and 1, plotted as an extra violin so that the
# count-scaled violins of all TLSCL panels share the same reference
random_vals = [np.random.uniform(0, 1) for _ in range(750)]

# Violin plots of progenitor node depths, one panel per node timepoint
temp_node_info = node_info_filtered[(node_info_filtered['TLS'] == 'TLSCL')].copy()
fig, ax = plt.subplots(3, 1, figsize = (15, 15))

plotted_progenitors = ['Extended Progenitor', 'Pluripotent Progenitor', 'Bipotent Progenitor', 'Neural Progenitor', 'Somitic Progenitor']
panel_titles = {'120': 'Gen 1', '144': 'Gen 2'}

for temp_ax, time in zip(ax, ['120_144', '120', '144']):
    # anything that is not a single timepoint is an interspersed node
    time_label = panel_titles.get(time, 'Interspersed')

    # depths of the nodes of each progenitor type at this timepoint, then the random reference
    at_this_time = temp_node_info['node_time'] == time
    data = [temp_node_info[(temp_node_info['progenitor_type'] == progenitor) & at_this_time]['norm_dist_to_clone']
            for progenitor in plotted_progenitors]
    data.append(random_vals)

    sns.violinplot(data = data, scale = 'count', ax = temp_ax, cut = 0)
    temp_ax.set_ylim(0, 1)
    temp_ax.set_xticklabels(plotted_progenitors + ['750_Scale'])
    temp_ax.set_ylabel('Norm Dist')
    temp_ax.set_title('{}_{}'.format(time_label, 'TLSCL'))

plt.tight_layout()
#plt.savefig('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/TLSCL_Progenitor_Types_by_Timepoint.pdf', dpi = 300)
#plt.show()
plt.close()

# Create singular timepoint subtrees
I am interested in looking at the subtrees that occur after an interspersed node becomes entirely Gen 1 or Gen 2

From these trees, I can investigate how the potential of nodes changes once they restrict to a timepoint

This does not necessarily mean that this potential occurs at this timepoint, because there is a spatial component to this experiment

In [54]:
# sub_trees: first single-timepoint node below an interspersed node -> list of the internal nodes of its subtree
sub_trees = {}
# sub_trees_depths: same key -> max depth of that subtree
sub_trees_depths = {}

for node_ID in node_info_filtered.index:
    node_name = node_info_filtered.loc[node_ID, 'node_name']
    parent_name = parent_dict[node_name]

    # nodes that have both 120 and 144 cells cannot start a subtree
    if node_info_filtered.loc[node_ID, 'node_time'] == '120_144':
        continue

    # a single-timepoint node whose parent has both 120 and 144 cells is the start of a subtree
    parent_time = node_info_filtered[node_info_filtered['node_name'] == parent_name]['node_time'].tolist()[0]
    if parent_time == '120_144':
        sub_trees[node_name] = []

# table of the single-timepoint nodes only
node_info_subtrees = node_info_filtered[node_info_filtered['node_time'] != '120_144'].copy()

for barcode in barcodes:
    treeFile = '/Genomics/chanlab/blaw/TLS/data/explant/lineage/3_lineage_reconstruction/{}/120_144/hybrid/{}_120_144_hybrid_newick_noMutationlessEdges_Labeled.nwk'.format(barcode, barcode)
    t = Tree(treeFile, format = 1)

    for node in t.traverse():
        name = '{}_{}'.format(barcode, node.name)
        if name not in sub_trees:
            continue

        sub_trees_depths[name] = maxDepth(node)

        # record every internal node of the subtree, the subtree root included
        sub_trees[name].extend('{}_{}'.format(barcode, subnode.name)
                               for subnode in node.traverse() if not subnode.is_leaf())

node_info_subtrees.to_csv('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/TLS_explant_hybrid_node_info_subtrees.txt', sep = '\t')

In [55]:
# Number of single-timepoint nodes of each progenitor type, per timepoint (all barcodes pooled)
progenitors_120 = {}
progenitors_144 = {}

for progenitor in node_info_subtrees['progenitor_type'].unique():
    of_this_type = node_info_subtrees[node_info_subtrees['progenitor_type'] == progenitor]
    progenitors_120[progenitor] = len(of_this_type[of_this_type['node_time'] == '120'])
    progenitors_144[progenitor] = len(of_this_type[of_this_type['node_time'] == '144'])

# One row per barcode and timepoint, e.g. 'Bar1_120', 'Bar1_144', ...
rows = [bar + suffix for bar in barcodes for suffix in ('_120', '_144')]
subtree_progenitors = pd.DataFrame(index = rows, columns = progenitor_list + ['timepoint'])

for i in subtree_progenitors.index:
    # the row label encodes '<barcode>_<timepoint>'
    barcode, time = i.split('_')[:2]
    in_group = node_info_subtrees[(node_info_subtrees['barcode'] == barcode) & (node_info_subtrees['node_time'] == time)]

    # node count per progenitor type, followed by the timepoint
    barcode_list = [len(in_group[in_group['progenitor_type'] == progenitor]) for progenitor in progenitor_list]
    barcode_list.append(time)
    subtree_progenitors.loc[i] = barcode_list

# Same table with each row divided by its total node count
progenitor_counts = subtree_progenitors.drop(columns = ['timepoint'])
subtree_progenitors_frac = progenitor_counts.div(progenitor_counts.sum(axis=1), axis=0)
subtree_progenitors_frac['timepoint'] = subtree_progenitors['timepoint']

In [56]:
# record counts per subtree
subtrees_df = pd.DataFrame(index = list(sub_trees.keys()), columns = progenitor_list + ['barcode', 'TLS', 'timepoint', 'size', 'parent'])

for subtree in sub_trees.keys():
    barcode = subtree.split('_')[0]
    subnodes = sub_trees[subtree]

    # select the rows of this subtree once and reuse them for every lookup below
    subtree_nodes = node_info_subtrees[node_info_subtrees['node_name'].isin(subnodes)]

    # the TLS label and timepoint are read from the first row of the subtree
    TLS = subtree_nodes['TLS'].tolist()[0]
    Timepoint = subtree_nodes['node_time'].tolist()[0]
    size = len(subnodes)

    # node count per progenitor type, followed by the subtree annotations
    subtree_info = [len(subtree_nodes[subtree_nodes['progenitor_type'] == progenitor]) for progenitor in progenitor_list]
    subtree_info.extend([barcode, TLS, Timepoint, size, parent_dict[subtree]])

    subtrees_df.loc[subtree] = subtree_info

# 'Bar3' -> 3
subtrees_df['barcode'] = [int(i[3:]) for i in subtrees_df['barcode']]

for col in progenitor_list:
    subtrees_df[col] = pd.to_numeric(subtrees_df[col])

# Fraction of each progenitor type within a subtree. The whole block of columns is replaced at once,
# so no float is written into an integer column row by row.
subtrees_frac = subtrees_df.copy()
progenitor_counts = subtrees_df[progenitor_list]
subtrees_frac[progenitor_list] = progenitor_counts.div(progenitor_counts.sum(axis = 1), axis = 0)

subtrees_df.to_csv('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/subtree_progenitor_counts.txt', sep = '\t')
subtrees_frac.to_csv('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/subtree_progenitor_fracs.txt', sep = '\t')

In [60]:
# plot the fraction of all subtree nodes per progenitor type, one PDF page per progenitor type
# (the context manager closes the PDF even if one of the plots fails)
with PdfPages('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/subtree_progenitor_fractions.pdf') as pp:
    for progenitor in progenitor_list:
        fig, ax = plt.subplots(1, 1, figsize = (15, 5))

        # one bar per row of the fraction table
        x = list(subtree_progenitors_frac.index)
        y = [subtree_progenitors_frac.loc[index, progenitor] for index in x]

        ax.bar(x = x, height = y)
        ax.set_title('{} Fraction of Nodes'.format(progenitor))
        ax.set_ylim(0, 1)
        ax.set_ylabel('Fraction of Subtree Nodes')
        fig.tight_layout()
        pp.savefig(fig)
        #plt.show()
        plt.close(fig)

In [59]:
# plot the distribution of subtree progenitor fractions, one PDF page per progenitor type
# (the context manager closes the PDF even if one of the plots fails)
with PdfPages('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/subtree_progenitor_fraction_distributions.pdf') as pp:
    for progenitor in progenitor_list:
        fig, ax = plt.subplots(1, 1, figsize = (15, 5))
        # split violins: one half per timepoint for every barcode
        sns.violinplot(data = subtrees_frac, x = 'barcode', y = progenitor, split = True, hue = 'timepoint', cut = 0, ax = ax, scale = 'width')

        fig.tight_layout()
        pp.savefig(fig)
        #plt.show()
        plt.close(fig)

# Plot TLS vs TLSCL mean fraction of progenitor nodes

In [39]:
# Mean and standard deviation of the bipotent fraction over the three replicates of each group
xlabels = ['TLS - Gen1', 'TLS - Gen2', 'TLSCL - Gen1', ' TLSCL - Gen2']
replicate_groups = [['Bar1_120', 'Bar2_120', 'Bar3_120'],
                    ['Bar1_144', 'Bar2_144', 'Bar3_144'],
                    ['Bar4_120', 'Bar5_120', 'Bar6_120'],
                    ['Bar4_144', 'Bar5_144', 'Bar6_144']]

data = []
error = []
for replicate_rows in replicate_groups:
    fractions = subtree_progenitors_frac.loc[replicate_rows]['Bipotent Progenitor']
    data.append(fractions.mean())
    error.append(fractions.std())

plt.bar(x = xlabels, height = data, yerr = error)
plt.ylabel('Fraction Bipotent')
plt.savefig('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/subtree_bipotent_frac.pdf', dpi = 300)
#plt.show()
plt.close()

In [40]:
# Mean and standard deviation of the somitic fraction over the three replicates of each group
xlabels = ['TLS - Gen1', 'TLS - Gen2', 'TLSCL - Gen1', ' TLSCL - Gen2']
replicate_groups = [['Bar1_120', 'Bar2_120', 'Bar3_120'],
                    ['Bar1_144', 'Bar2_144', 'Bar3_144'],
                    ['Bar4_120', 'Bar5_120', 'Bar6_120'],
                    ['Bar4_144', 'Bar5_144', 'Bar6_144']]

data = []
error = []
for replicate_rows in replicate_groups:
    fractions = subtree_progenitors_frac.loc[replicate_rows]['Somitic Progenitor']
    data.append(fractions.mean())
    error.append(fractions.std())

plt.bar(x = xlabels, height = data, yerr = error)
plt.ylabel('Fraction Somitic')
plt.savefig('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/subtree_somitic_frac.pdf', dpi = 300)
#plt.show()
plt.close()

In [41]:
# Bar plot of the 'Neural Progenitor' fraction for each condition/generation group.
# Bar height is the mean over the three rows of subtree_progenitors_frac listed for
# the group; the error bar is their standard deviation (pandas default, ddof=1).
# Row labels ending in _120 are plotted as Gen1 and those ending in _144 as Gen2.
replicate_groups = {
    'TLS - Gen1': ['Bar1_120', 'Bar2_120', 'Bar3_120'],
    'TLS - Gen2': ['Bar1_144', 'Bar2_144', 'Bar3_144'],
    'TLSCL - Gen1': ['Bar4_120', 'Bar5_120', 'Bar6_120'],
    # the original label carried a stray leading space (' TLSCL - Gen2')
    'TLSCL - Gen2': ['Bar4_144', 'Bar5_144', 'Bar6_144'],
}

xlabels = list(replicate_groups)
data = []
error = []
for replicate_rows in replicate_groups.values():
    # slice once per group and reuse it for both statistics
    replicate_fracs = subtree_progenitors_frac.loc[replicate_rows, 'Neural Progenitor']
    data.append(replicate_fracs.mean())
    error.append(replicate_fracs.std())

# explicit figure/axes so the bars can never land on a figure left open by another cell
fig, ax = plt.subplots()
ax.bar(x = xlabels, height = data, yerr = error)
ax.set_ylabel('Fraction Neural')
fig.savefig('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/subtree_neural_frac.pdf', dpi = 300)
plt.close(fig)

# Plot stacked bar plots of the progenitor nodes in the subtrees
- These plots will be for TLS and TLSCL

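- I will show the weighted average of the 3 replicates (the fractions are computed separately for each replicate and then averaged across the 3 replicates)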

- For this I will count all nodes, splitting them into 3 categories: gen1 nodes (`120`), gen2 nodes (`144`), and interspersed nodes (`120_144`)

In [91]:
# Average, over the TLS replicates, of the per-replicate fraction of each progenitor type.
#   rows '120' / '144' : fractions of the summed progenitor columns of subtrees_df at that timepoint
#   row  '120_144'     : fractions of node_info_filtered nodes whose node_time is '120_144'
# Every replicate contributes its own fractions with equal weight.
# NOTE(review): the original comment says the subtrees_df counts use only the top node
# of each subtree - confirm against the cell that builds subtrees_df.
# Initialised with 0.0 (not 0) so adding fractions never has to upcast integer columns.
TLS_subtree_weighted_fracs = pd.DataFrame(0.0, index = ['120', '120_144', '144'], columns = progenitor_list)

for barcode in TLS_barcodes:
    # subtrees_df is filtered by the integer barcode number, node_info_filtered by the 'BarN' string
    barcode_subtrees = subtrees_df[subtrees_df['barcode'] == int(barcode[-1])]

    # collect the fractions of progenitor types in gen1 ('120') and gen2 ('144')
    for timepoint in ['120', '144']:
        progenitor_counts = barcode_subtrees.loc[barcode_subtrees['timepoint'] == timepoint, progenitor_list].sum()
        total_count = progenitor_counts.sum()
        # A replicate with no counts at this timepoint adds nothing. The original reset the
        # running sum to 0 here (discarding earlier replicates) and only tested the '120'
        # total, so an empty '144' total produced NaN.
        if total_count > 0:
            TLS_subtree_weighted_fracs.loc[timepoint] += progenitor_counts / total_count

    # add the 120_144 (interspersed) fractions for this TLS replicate
    interspersed_nodes = node_info_filtered[(node_info_filtered['barcode'] == barcode) & (node_info_filtered['node_time'] == '120_144')]
    # reindex keeps only the progenitor types in progenitor_list and fills absent ones with 0
    interspersed_counts = interspersed_nodes['progenitor_type'].value_counts().reindex(progenitor_list, fill_value = 0)
    interspersed_total = interspersed_counts.sum()
    # guard against a replicate without interspersed nodes (was a ZeroDivisionError)
    if interspersed_total > 0:
        TLS_subtree_weighted_fracs.loc['120_144'] += interspersed_counts / interspersed_total

# divide by the number of replicates (3); replicates skipped above still count in the divisor
TLS_subtree_weighted_fracs = TLS_subtree_weighted_fracs.div(len(TLS_barcodes))

In [92]:
# Average, over the TLSCL replicates, of the per-replicate fraction of each progenitor type.
#   rows '120' / '144' : fractions of the summed progenitor columns of subtrees_df at that timepoint
#   row  '120_144'     : fractions of node_info_filtered nodes whose node_time is '120_144'
# Every replicate contributes its own fractions with equal weight.
# NOTE(review): the original comment says the subtrees_df counts use only the top node
# of each subtree - confirm against the cell that builds subtrees_df.
# Initialised with 0.0 (not 0) so adding fractions never has to upcast integer columns.
TLSCL_subtree_weighted_fracs = pd.DataFrame(0.0, index = ['120', '120_144', '144'], columns = progenitor_list)

for barcode in TLSCL_barcodes:
    # subtrees_df is filtered by the integer barcode number, node_info_filtered by the 'BarN' string
    barcode_subtrees = subtrees_df[subtrees_df['barcode'] == int(barcode[-1])]

    # collect the fractions of progenitor types in gen1 ('120') and gen2 ('144')
    for timepoint in ['120', '144']:
        progenitor_counts = barcode_subtrees.loc[barcode_subtrees['timepoint'] == timepoint, progenitor_list].sum()
        total_count = progenitor_counts.sum()
        # A replicate with no counts at this timepoint adds nothing. The original reset the
        # running sum to 0 here (discarding earlier replicates) and only tested the '120'
        # total, so an empty '144' total produced NaN.
        if total_count > 0:
            TLSCL_subtree_weighted_fracs.loc[timepoint] += progenitor_counts / total_count

    # add the 120_144 (interspersed) fractions for this TLSCL replicate
    interspersed_nodes = node_info_filtered[(node_info_filtered['barcode'] == barcode) & (node_info_filtered['node_time'] == '120_144')]
    # reindex keeps only the progenitor types in progenitor_list and fills absent ones with 0
    interspersed_counts = interspersed_nodes['progenitor_type'].value_counts().reindex(progenitor_list, fill_value = 0)
    interspersed_total = interspersed_counts.sum()
    # guard against a replicate without interspersed nodes (was a ZeroDivisionError)
    if interspersed_total > 0:
        TLSCL_subtree_weighted_fracs.loc['120_144'] += interspersed_counts / interspersed_total

# divide by the number of replicates (3); replicates skipped above still count in the divisor
TLSCL_subtree_weighted_fracs = TLSCL_subtree_weighted_fracs.div(len(TLSCL_barcodes))

In [94]:
# Side-by-side stacked bar charts of the averaged progenitor fractions:
# TLS in the left panel (no legend), TLSCL in the right panel (with the legend).
fig, ax = plt.subplots(1, 2, figsize = (15, 6))
ax1, ax2 = ax[0], ax[1]

# (axes, fractions table, panel title, draw legend?)
panel_specs = [
    (ax1, TLS_subtree_weighted_fracs, 'TLS Subtree Weighted Avg Fractions', False),
    (ax2, TLSCL_subtree_weighted_fracs, 'TLSCL Subtree Weighted Avg Fractions', True),
]

for panel_ax, panel_fracs, panel_title, with_legend in panel_specs:
    panel_fracs.plot.bar(stacked = True, color = progenitor_colorDict, ax = panel_ax, legend = with_legend)
    # the table rows are ordered '120', '120_144', '144'
    panel_ax.set_xticklabels(['Gen 1', 'All Interspersed', 'Gen 2'])
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel('Fraction of Nodes')

# re-create the right-hand legend anchored outside the axes
ax2.legend(bbox_to_anchor=(1.1, 1.05))

plt.tight_layout()
fig.savefig('/Genomics/chanlab/blaw/TLS/data/explant/progenitor_analysis/subtree_weighted_progenitor_fractions.pdf', dpi = 300)
plt.close(fig)