# Info

- In this script, I investigated the variability in ternary plots that occurs when different tree reconstructions are created from the same tree

- To do this, I used the 30 different random seeds for each of the TLS M and TLSCL barcodes to investigate how these subtly different trees compare

- I will compare the ternary distances between seeds of the same tree (intra) and random comparisons of seeds from separate trees (inter)

In [1]:
import cassiopeia as cas
import pandas as pd
import numpy as np
import pickle
import os
from ete3 import Tree
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import random

In [28]:
# Per-cell annotation table; later cells read its 'cellBC' and 'cell_state' columns
cell_state_table = pd.read_csv(
    '/Genomics/chanlab/blaw/TLS/metadata/TLS_TLSCL_cellBC_cellState.tsv',
    sep='\t',
)

# Pickled colour lookup (presumably cluster -> colour; not used in the cells shown here).
# pickle runs arbitrary code while loading, so only open files from a trusted source.
clusterColorsFile = "/Genomics/chanlab/mchan/Adriano/TLS/TLS_TLSCL/20211102_clusterColorsTLSCL.p"
with open(clusterColorsFile, 'rb') as fp:
    colorDict = pickle.load(fp)

# Every barcode with seed trees: Bar1 ... Bar24, with Bar17 absent
barcodes = ['Bar{}'.format(number) for number in range(1, 25) if number != 17]

# Barcodes split by structure type (Bar8 and Bar18 appear in neither list)
TLS_barcodes = [
    'Bar1', 'Bar2', 'Bar4', 'Bar5', 'Bar7', 'Bar10',
    'Bar11', 'Bar13', 'Bar16', 'Bar19', 'Bar22',
]
TLSCL_barcodes = [
    'Bar3', 'Bar6', 'Bar9', 'Bar12', 'Bar14',
    'Bar15', 'Bar20', 'Bar21', 'Bar23', 'Bar24',
]

# The 30 random seeds used for the repeated tree reconstructions
Seeds = [
    3882, 721488, 2299, 31525, 1493, 845228,
    7051, 622697, 7966, 602613, 2513, 695825,
    9902, 941621, 4622, 766561, 1228, 597307,
    6152, 927685, 6064, 356182, 3252, 732589,
    44, 539499, 8802, 989417, 222, 107820,
]

# Process all the datasets:
When performing ILP, Cassiopeia creates a set of proposed solutions. Ideally I would test X of these proposed trees. Unfortunately, I am having trouble accessing that set, so I will use the 30 random-seed trees that were made previously to set up the pipeline and investigate initial information. These random seeds affect the path that Gurobi takes when solving the tree problem, creating slight variations in the tree.

In [3]:
# Label internal nodes
def nameInteriorNodes(nwkFile,outnwkFile):
    '''
    Give every non-leaf node of a tree a sequential name and write the tree back out.

    Input:
        - nwkFile: newick input handed to ete3's Tree (read with format=1)
        - outnwkFile: path the relabeled tree is written to (written with format=8)
    returns:
        - None. Non-leaf nodes are renamed "node0", "node1", ... in the order
          that Tree.traverse() visits them; leaf names are left untouched.
    '''
    tree = Tree(nwkFile, format=1)

    # The generator is consumed lazily, so nodes are renamed while the traversal runs
    interior_nodes = (candidate for candidate in tree.traverse() if not candidate.is_leaf())
    for label_id, interior in enumerate(interior_nodes):
        interior.name = "node{}".format(label_id)

    tree.write(format=8, outfile=outnwkFile)

In [4]:
def add_node0(nwkFile, outnwkFile):
    '''
    Copy the first line of a newick file, inserting the root label "node0"
    directly before the terminating ';'.

    Input:
        - nwkFile: path of the newick file to read (only its first line is used)
        - outnwkFile: path of the file to write
    returns:
        - None. outnwkFile contains the newick string followed by 'node0;'
    '''
    # Context managers close both files even if reading or writing fails
    with open(nwkFile, 'r') as infile:
        newick = infile.readline().rstrip()

    # Remove the terminating ';' explicitly. Slicing off the last character
    # unconditionally removed only the newline when the line ended in ";\n",
    # which produced "...;node0;".
    if newick.endswith(';'):
        newick = newick[:-1]

    with open(outnwkFile, 'w') as outfile:
        outfile.write(newick + 'node0;')

In [24]:
# create labeled nwk files for each of the seed trees
seed_tree_root = '/Genomics/chanlab/blaw/TLS/sandbox/AM-DNA-258/ilp_seed_study/trees/'
labeled_tree_root = '/Genomics/chanlab/blaw/TLS/data/ternary_dists/tree_seed_variability/'

for barcode in barcodes:
    folder_loc = seed_tree_root + '{}/newick/'.format(barcode)
    newick_out_dir = labeled_tree_root + '{}/newick_files/'.format(barcode)

    # List the inputs first so a missing input folder fails before anything is created
    tree_files = os.listdir(folder_loc)

    # One call creates <barcode>/ and <barcode>/newick_files/ (plus any missing parents)
    # and does nothing when they already exist, replacing the per-file exists/mkdir checks
    os.makedirs(newick_out_dir, exist_ok=True)

    for file in tree_files:
        # The seed is taken from the third '_'-separated field of the file name
        seed = file.split('_')[2]

        # Intermediate file: interior nodes labeled, root label still missing
        rootless_file = newick_out_dir + '{}_seed_{}_node0_missing.txt'.format(barcode, seed)
        # Final file: same tree with the root labeled 'node0'
        labeled_file = newick_out_dir + '{}_seed_{}_newick_noMutationlessEdges_Labeled.nwk'.format(barcode, seed)

        nameInteriorNodes(folder_loc + file, rootless_file)
        add_node0(rootless_file, labeled_file)

# Create a node info table for all 30 seed trees from the same barcode

In [5]:
def maxDepth(node):
    '''
    Input:
        - a node in an ete tree
    returns:
        - The number of edges on the longest path from the node down to any
          leaf below it (0 when the node itself is a leaf)
    '''
    if not node.is_leaf():
        # One edge to reach the children, plus the deepest subtree among them
        deepest_child = max(maxDepth(child) for child in node.children)
        return deepest_child + 1

    return 0

In [6]:
def countNMP_Somite_Neural(node, cell_state_table):
    '''
    Input:
        - a node in an ete tree
        - a table with 'cellBC' and 'cell_state' columns. Leaves of the node are
          matched to rows through 'cellBC'
    return:
        - A tuple (NMP count, somitic count, neural count) over the table rows
          whose cellBC is a leaf of the node. Rows with any other cell state
          are not counted.
    '''
    # Map each counted cell state onto the lineage it is grouped under
    lineage_of_state = {'NMPs': 'NMP'}
    for somitic_state in ['pPSM', 'aPSM', 'Somite', 'Somite0', 'Somite1', 'Somite-1', 'SomiteSclero', 'SomiteDermo']:
        lineage_of_state[somitic_state] = 'somitic'
    for neural_state in ['NeuralTube1', 'NeuralTube2']:
        lineage_of_state[neural_state] = 'neural'

    leaf_names = [leaf.name for leaf in node.get_leaves()]
    in_node = cell_state_table['cellBC'].isin(leaf_names)
    leaf_states = cell_state_table.loc[in_node, 'cell_state']

    tally = {'NMP': 0, 'somitic': 0, 'neural': 0}
    for state in leaf_states:
        lineage = lineage_of_state.get(state)
        if lineage is not None:
            tally[lineage] += 1

    return (tally['NMP'], tally['somitic'], tally['neural'])

In [7]:
node_columns = ['Barcode', 'Seed', 'Node', 'Node Size', 'Clone', 'Clone Size', 'Max Clone Depth', 'Dist to Clone', 'Percent NMP', 
                'Percent Somitic', 'Percent Neural']

# A node is discarded when any leaf below it has one of these cell states.
# The filter used to test only the spelling 'PCGLC'; the usual abbreviation is
# 'PGCLC', so both are checked here - TODO confirm which spelling the cell state table uses.
excluded_states = ['Endoderm', 'PCGLC', 'PGCLC', 'Unknown', 'Endothelial']

# Nodes with fewer leaves than this are discarded as well
min_node_size = 4

seed_variability_dir = '/Genomics/chanlab/blaw/TLS/data/ternary_dists/tree_seed_variability/'

for barcode in barcodes:
    # Row labels and row contents for every interior node of every seed tree of this barcode.
    # Rows are collected in plain lists and turned into a DataFrame once, instead of
    # growing a DataFrame with pd.concat and filling it one cell at a time.
    row_names = []
    row_values = []

    # Store the node information of nodes to filter
    bad_nodes = []
    small_nodes = []

    for seed in Seeds:
        treeFile = seed_variability_dir + "{}/newick_files/{}_seed_{}_newick_noMutationlessEdges_Labeled.nwk".format(barcode, barcode, seed)
        t = Tree(treeFile, format = 1)
        row_prefix = '{}_seed_{}_'.format(barcode, seed)

        # Row order follows a traversal of the whole tree (root 'node0' and leaves excluded).
        # Keeping this order matters: the greedy matching in the distance calculation
        # breaks ties by position.
        node_names = [row_prefix + node.name for node in t.traverse()
                      if node.name != 'node0' and not node.is_leaf()]

        # fill in node information, don't keep roots
        records = {}
        for clone in t.children:
            clone_max_depth = maxDepth(clone)
            clone_size = len(clone.get_leaves())

            for node in clone.traverse():
                if node.is_leaf():
                    continue

                row_name = row_prefix + node.name
                leaves = [leaf.name for leaf in node.get_leaves()]
                cell_types = cell_state_table[cell_state_table['cellBC'].isin(leaves)]['cell_state'].to_list()

                # Record nodes that need to be removed: nodes containing an excluded cell state,
                # otherwise nodes with fewer than min_node_size leaves
                if any(state in cell_types for state in excluded_states):
                    bad_nodes.append(row_name)
                elif len(leaves) < min_node_size:
                    small_nodes.append(row_name)

                NMP_count, somitic_count, neural_count = countNMP_Somite_Neural(node, cell_state_table)

                total = somitic_count + NMP_count + neural_count
                if total > 0:
                    NMP_frac = NMP_count / total
                    somitic_frac = somitic_count / total
                    neural_frac = neural_count / total
                else:
                    # No NMP, somitic or neural leaves under this node
                    NMP_frac = 0
                    somitic_frac = 0
                    neural_frac = 0

                records[row_name] = {
                    'Barcode': barcode,
                    'Seed': seed,
                    'Node': node.name,
                    'Node Size': len(leaves),
                    'Clone': clone.name,
                    'Clone Size': clone_size,
                    'Max Clone Depth': clone_max_depth,
                    'Dist to Clone': t.get_distance(clone, node),
                    'Percent NMP': NMP_frac,
                    'Percent Somitic': somitic_frac,
                    'Percent Neural': neural_frac,
                }

        # Emit rows in tree-traversal order. A name without a record becomes an empty row,
        # and a record missing from node_names is appended after the traversal rows,
        # which is how cell-by-cell .loc assignment laid the table out.
        listed_names = set(node_names)
        extra_names = [name for name in records if name not in listed_names]
        for row_name in node_names + extra_names:
            row_names.append(row_name)
            row_values.append(records.get(row_name, {}))

    # dtype=object keeps the stored Python ints/floats exactly as they were computed
    node_info = pd.DataFrame(row_values, index = row_names, columns = node_columns, dtype = object)
    node_info = node_info.drop(index = bad_nodes + small_nodes)

    # Calculate the normalized Depth of each node
    node_info['Normalized Dist'] = node_info['Dist to Clone'] / node_info['Max Clone Depth']

    # Classify nodes by progenitor type and % extant NMPs
    fractions = node_info[['Percent NMP', 'Percent Somitic', 'Percent Neural']].astype(float)
    makes_somitic = fractions['Percent Somitic'] > 0
    makes_neural = fractions['Percent Neural'] > 0
    no_somitic = fractions['Percent Somitic'] == 0
    no_neural = fractions['Percent Neural'] == 0

    node_info['Progenitor Type'] = '-'
    node_info['Progenitor Pool'] = '-'

    # A node is bipotent if it produces both neural and somitic cells
    # Bipotent and transient bipotent (bipotent without NMPs) are both labeled as bipotent
    node_info.loc[makes_neural & makes_somitic, 'Progenitor Type'] = 'Bipotent'
    # A node is committed towards the somitic lineage if it produces somites but not neural cells
    node_info.loc[makes_somitic & no_neural, 'Progenitor Type'] = 'Somitic Committed'
    # A node is committed towards the neural lineage if it produces neural but not somitic cells
    node_info.loc[no_somitic & makes_neural, 'Progenitor Type'] = 'Neural Committed'
    # A node is proliferating if it does not produce somitic or neural cells
    node_info.loc[no_somitic & no_neural, 'Progenitor Type'] = 'Proliferating'

    # A node is proliferating if it produces exclusively NMPs
    only_nmp = fractions['Percent NMP'] == 1
    node_info.loc[only_nmp, 'Progenitor Pool'] = 'Proliferating'
    # A node is differentiating if it produces some NMPs (but not only NMPs)
    node_info.loc[(fractions['Percent NMP'] > 0) & ~only_nmp, 'Progenitor Pool'] = 'Differentiating'
    # A node is exhausted if it produces 0 NMPs
    node_info.loc[fractions['Percent NMP'] == 0, 'Progenitor Pool'] = 'Exhausted'

    # Save the node table for nodes >= 4
    node_info.to_csv(seed_variability_dir + '{}/{}_node_information_filtered.txt'.format(barcode, barcode), sep = '\t')

# Calculate the distance between seeds of the same barcode

Ternary Distance:
- Identify the 2 nodes that have the smallest distance between them from tree A and tree B.
- Remove both nodes from the dataset
- Repeat until A or B runs out of nodes
- The total ternary distance is calculated as the sum of the distances between the set of minimum distance node pairs
- Add a penalty for any remaining nodes

In [8]:
def calcDist(node1, node2, node_info):
    '''
    input:
        node1 - name of the first node, assumed to be an index label in node_info
        node2 - name of the second node, assumed to be an index label in node_info
        node_info - a node info table with 'Percent NMP', 'Percent Neural' and
                    'Percent Somitic' columns

    returns:
        The euclidean distance between the 2 nodes in the space of those 3 fractions
    '''
    squared_sum = 0
    # Accumulate in a fixed column order with a plain loop so the floating point
    # result does not depend on how a library sum would order the additions
    for fraction_col in ('Percent NMP', 'Percent Neural', 'Percent Somitic'):
        delta = node_info.loc[node1, fraction_col] - node_info.loc[node2, fraction_col]
        squared_sum = squared_sum + delta**2

    return np.sqrt(squared_sum)

In [9]:
def calcTernaryDist(tree1_name, tree2_name, node_info, meta_col = 'Seed'):
    '''
    Greedy ternary distance between the node sets of two trees.

    The closest remaining pair of nodes (one from each tree) is matched and removed,
    repeatedly, until the smaller tree runs out of nodes. Every node of the larger
    tree that is left unmatched adds 1 to the penalty.

    input:
        tree1_name - value of meta_col that selects the nodes of tree1
        tree2_name - value of meta_col that selects the nodes of tree2
        node_info - a node_info table that contains all nodes of both trees, the
                    meta_col column and the 'Percent NMP', 'Percent Neural' and
                    'Percent Somitic' columns (assumed to hold no missing values)
        meta_col - A column name in node_info that can be used to split tree1 and tree2
    returns:
        a tuple (dist, pen_dist):
            dist - summed distance of the matched pairs divided by the number of matched pairs
            pen_dist - dist plus the number of unmatched nodes
        Both values are NaN when either tree has no nodes, because there is no
        matched pair to average over.
    '''
    fraction_cols = ['Percent NMP', 'Percent Neural', 'Percent Somitic']
    tree1_fracs = node_info.loc[node_info[meta_col] == tree1_name, fraction_cols].to_numpy(dtype = float)
    tree2_fracs = node_info.loc[node_info[meta_col] == tree2_name, fraction_cols].to_numpy(dtype = float)

    n_tree1 = len(tree1_fracs)
    n_tree2 = len(tree2_fracs)
    min_nodes = min(n_tree1, n_tree2)

    # Without this guard the normalisation below divides by zero
    if min_nodes == 0:
        return (np.nan, np.nan)

    # Pairwise euclidean distances: rows are tree1 nodes, columns are tree2 nodes
    deltas = tree1_fracs[:, np.newaxis, :] - tree2_fracs[np.newaxis, :, :]
    pairwise = np.sqrt(deltas[..., 0]**2 + deltas[..., 1]**2 + deltas[..., 2]**2)

    total_dist = 0.0
    for _ in range(min_nodes):
        # argmin over the transposed matrix scans column by column, so ties go to the
        # earliest tree2 node and then the earliest tree1 node
        flat_position = np.argmin(pairwise.T)
        col, row = divmod(flat_position, n_tree1)
        total_dist += pairwise[row, col]

        # Matched nodes leave the pool; infinity keeps them from being picked again
        pairwise[row, :] = np.inf
        pairwise[:, col] = np.inf

    # One penalty point for each node of the larger tree that found no partner
    pen = abs(n_tree1 - n_tree2)
    mean_dist = total_dist / min_nodes

    return (mean_dist, mean_dist + pen)

# Calculate the ternary distances between different seeds of the same barcode

- to compare barcodes across structures, we normalize the ternary distance by the # of nodes in the tree, making the maximum distance between 2 trees 1

In [16]:
# Build one row per (barcode, seed pair); only seeds of the same barcode are paired (intra)
pairs = list(itertools.combinations(Seeds, 2))

# Row labels have the form <barcode>_<seed1>_<seed2>
barcode_tree_pairs = [
    '{}_{}_{}'.format(barcode, first_seed, second_seed)
    for barcode in barcodes
    for first_seed, second_seed in pairs
]

multiseq_all_dists_df = pd.DataFrame(index = barcode_tree_pairs, columns = ['Barcode', 'Seed1', 'Seed2', 'Dist', 'Pen_Dist'])

# Recover the metadata columns from the row labels; the seeds are stored as strings
label_parts = [row_label.split('_') for row_label in multiseq_all_dists_df.index]
multiseq_all_dists_df['Barcode'] = [parts[0] for parts in label_parts]
multiseq_all_dists_df['Seed1'] = [parts[1] for parts in label_parts]
multiseq_all_dists_df['Seed2'] = [parts[2] for parts in label_parts]

In [18]:
# Fill in 'Dist' and 'Pen_Dist' for every seed pair, one barcode at a time
node_table_template = '/Genomics/chanlab/blaw/TLS/data/ternary_dists/tree_seed_variability/{}/{}_node_information_filtered.txt'

for barcode in multiseq_all_dists_df['Barcode'].unique():
    # All seed trees of a barcode share one filtered node table
    temp_node_info = pd.read_csv(node_table_template.format(barcode, barcode), sep = '\t', index_col = 0)

    rows_of_barcode = multiseq_all_dists_df.loc[multiseq_all_dists_df['Barcode'] == barcode].index
    for pair_label in rows_of_barcode:
        # Seeds may be stored as strings, while the node table holds them as integers
        first_seed = int(multiseq_all_dists_df.loc[pair_label, 'Seed1'])
        second_seed = int(multiseq_all_dists_df.loc[pair_label, 'Seed2'])

        pair_result = calcTernaryDist(first_seed, second_seed, temp_node_info)

        multiseq_all_dists_df.loc[pair_label, 'Dist'] = pair_result[0]
        multiseq_all_dists_df.loc[pair_label, 'Pen_Dist'] = pair_result[1]

In [19]:
# Save the pairwise intra-barcode distances as a tab-separated table so the
# cells below can start from this file instead of recomputing the distances
multiseq_all_dists_df.to_csv('/Genomics/chanlab/blaw/TLS/data/ternary_dists/tree_seed_variability/multiseq_pairwise_Seeds_in_same_Bar_dists.txt', sep = '\t')

In [10]:
# Reload the saved pairwise distances; the first column (<barcode>_<seed1>_<seed2>) becomes the index
multiseq_all_dists_df = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/ternary_dists/tree_seed_variability/multiseq_pairwise_Seeds_in_same_Bar_dists.txt', sep = '\t', index_col = 0)

In [11]:
# Row labels look like <barcode>_<seed1>_<seed2>; split each one once
label_parts = [row_label.split('_') for row_label in multiseq_all_dists_df.index]

# Unique "<seed1>_<seed2>" pairs and unique barcodes found in the distance table.
# Note: this rebinds `barcodes` to a set, whose iteration order is not fixed between runs.
seed_comparisons = {parts[1] + '_' + parts[2] for parts in label_parts}
barcodes = {parts[0] for parts in label_parts}

In [12]:
# Empty seed-pair x barcode matrices: one for the plain distance, one for the
# distance with the unmatched-node penalty. They are filled in by the next cell.
barcode_dists_df = pd.DataFrame(index = seed_comparisons, columns = barcodes)
barcode_pen_dists_df = pd.DataFrame(index = seed_comparisons, columns = barcodes)

In [13]:
# Copy each long-format distance into its (seed pair, barcode) cell of the two matrices
for seed_pair, barcode_col in itertools.product(barcode_dists_df.index, barcode_dists_df.columns):
    # Rebuild the long-table row label: <barcode>_<seed1>_<seed2>
    long_label = barcode_col + '_' + seed_pair

    barcode_dists_df.loc[seed_pair, barcode_col] = multiseq_all_dists_df.loc[long_label, 'Dist']
    barcode_pen_dists_df.loc[seed_pair, barcode_col] = multiseq_all_dists_df.loc[long_label, 'Pen_Dist']

In [14]:
# Number of filtered nodes per barcode, measured on the tree built with seed 222
node_sizes = {}
node_table_template = '/Genomics/chanlab/blaw/TLS/data/ternary_dists/tree_seed_variability/{}/{}_node_information_filtered.txt'

for barcode in multiseq_all_dists_df['Barcode'].unique():
    barcode_node_table = pd.read_csv(node_table_template.format(barcode, barcode), sep = '\t', index_col = 0)

    from_reference_seed = barcode_node_table['Seed'] == 222
    node_sizes[barcode] = len(barcode_node_table[from_reference_seed])

In [15]:
# Divide every barcode column of the plain distance matrix by that barcode's node count
# (node_sizes holds the count for the seed-222 tree). barcode_pen_dists_df is left unscaled.
# NOTE(review): calcTernaryDist already returns the summed distance divided by the number of
# matched node pairs, so 'Dist' may be normalised twice here - confirm this is intended.
for col in barcode_dists_df.columns:
    barcode_dists_df[col] = barcode_dists_df[col] / node_sizes[col]

In [16]:
# Leaf count of each barcode's labeled reference tree
barcode_sizes = {}
reference_tree_template = "/Genomics/chanlab/blaw/TLS/data/AM-DNA-258/lineage/3_lineage_reconstruction/{}/{}_newick_noMutationlessEdges_Labeled.nwk"

for barcode in barcodes:
    reference_tree = Tree(reference_tree_template.format(barcode, barcode), format = 1)
    barcode_sizes[barcode] = len(reference_tree.get_leaves())

In [17]:
# (barcode, leaf count) pairs ordered from the smallest tree to the largest
sorted_barcode_sizes = sorted(barcode_sizes.items(), key = lambda size_entry: size_entry[1])

In [18]:
# Barcode names in order of increasing tree size, used to order the box plots
sorted_barcode_names = [size_entry[0] for size_entry in sorted_barcode_sizes]

In [23]:
# Box plot of the intra-barcode distances, one box per barcode, ordered by tree size
fig, ax = plt.subplots(figsize = (20, 5))
sns.boxplot(data = barcode_dists_df, order = sorted_barcode_names, ax = ax)
ax.set(ylabel = 'Normalized Ternary Distance', title = 'Intra Seed Distances')
# The figure is closed without being shown or saved
plt.close(fig)

In [25]:
# Box plot of the penalised intra-barcode distances, one box per barcode, ordered by tree size
fig, ax = plt.subplots(figsize = (20, 5))
sns.boxplot(data = barcode_pen_dists_df, order = sorted_barcode_names, ax = ax)
ax.set(ylabel = 'Normalized Distance + Penalty', title = 'Distances With Penalty')
# The figure is closed without being shown or saved
plt.close(fig)

# Randomly sample 2 TLS trees from different barcodes to see how the distances between different trees looks

In [29]:
dists = []
pen_dists = []

# Draw as many random inter-tree comparisons as there are intra-barcode seed pairs
# (30 seeds -> 435 pairs), so the results line up with the rows of the distance matrices
n_random_pairs = len(Seeds) * (len(Seeds) - 1) // 2

node_table_template = '/Genomics/chanlab/blaw/TLS/data/ternary_dists/tree_seed_variability/{}/{}_node_information_filtered.txt'

# Each barcode's node table is read from disk once and reused across iterations
tls_node_tables = {}

for i in range(n_random_pairs):
    # Two different TLS barcodes, then one random seed tree from each
    barcode1, barcode2 = random.sample(TLS_barcodes, 2)

    for sampled_barcode in (barcode1, barcode2):
        if sampled_barcode not in tls_node_tables:
            tls_node_tables[sampled_barcode] = pd.read_csv(node_table_template.format(sampled_barcode, sampled_barcode), sep = '\t', index_col = 0)

    temp_bar1_node_info = tls_node_tables[barcode1]
    temp_bar2_node_info = tls_node_tables[barcode2]

    barcode1_seed = random.choice(Seeds)
    barcode2_seed = random.choice(Seeds)

    # Boolean selection returns new frames, so the cached tables are never modified
    temp_node_info = pd.concat([temp_bar1_node_info[temp_bar1_node_info['Seed'] == barcode1_seed], temp_bar2_node_info[temp_bar2_node_info['Seed'] == barcode2_seed]])

    # The two trees are told apart by barcode, since both may use the same seed
    dist, pen_dist = calcTernaryDist(barcode1, barcode2, temp_node_info, meta_col = 'Barcode')

    dists.append(dist)
    pen_dists.append(pen_dist)

In [30]:
TLSCL_dists = []
TLSCL_pen_dists = []

# Draw as many random inter-tree comparisons as there are intra-barcode seed pairs
# (30 seeds -> 435 pairs), so the results line up with the rows of the distance matrices
n_random_pairs = len(Seeds) * (len(Seeds) - 1) // 2

node_table_template = '/Genomics/chanlab/blaw/TLS/data/ternary_dists/tree_seed_variability/{}/{}_node_information_filtered.txt'

# Each barcode's node table is read from disk once and reused across iterations
tlscl_node_tables = {}

for i in range(n_random_pairs):
    # Two different TLSCL barcodes, then one random seed tree from each
    barcode1, barcode2 = random.sample(TLSCL_barcodes, 2)

    for sampled_barcode in (barcode1, barcode2):
        if sampled_barcode not in tlscl_node_tables:
            tlscl_node_tables[sampled_barcode] = pd.read_csv(node_table_template.format(sampled_barcode, sampled_barcode), sep = '\t', index_col = 0)

    temp_bar1_node_info = tlscl_node_tables[barcode1]
    temp_bar2_node_info = tlscl_node_tables[barcode2]

    barcode1_seed = random.choice(Seeds)
    barcode2_seed = random.choice(Seeds)

    # Boolean selection returns new frames, so the cached tables are never modified
    temp_node_info = pd.concat([temp_bar1_node_info[temp_bar1_node_info['Seed'] == barcode1_seed], temp_bar2_node_info[temp_bar2_node_info['Seed'] == barcode2_seed]])

    # The two trees are told apart by barcode, since both may use the same seed
    TLSCL_dist, TLSCL_pen_dist = calcTernaryDist(barcode1, barcode2, temp_node_info, meta_col = 'Barcode')

    TLSCL_dists.append(TLSCL_dist)
    TLSCL_pen_dists.append(TLSCL_pen_dist)

In [31]:
# Append the random inter-tree distances as two extra columns next to the per-barcode columns.
# The lists are assigned by position, so each must have exactly as many entries as the
# frames have rows; the seed-pair row labels carry no meaning for these two columns.
barcode_dists_df['TLS_Trees'] = dists
barcode_pen_dists_df['TLS_Trees'] = pen_dists
barcode_dists_df['TLSCL_Trees'] = TLSCL_dists
barcode_pen_dists_df['TLSCL_Trees'] = TLSCL_pen_dists

In [34]:
# Intra-barcode distances per barcode, followed by the two random inter-tree columns
box_order = sorted_barcode_names + ['TLS_Trees', 'TLSCL_Trees']

fig, ax = plt.subplots(figsize = (25, 5))
sns.boxplot(data = barcode_dists_df, order = box_order, ax = ax)
ax.set(ylabel = 'Normalized Ternary Distance', title = 'Distances')
# Values above 0.5 are cut off by the fixed y-range
ax.set_ylim(0, 0.5)
fig.savefig('/Genomics/chanlab/blaw/TLS/data/ternary_dists/tree_seed_variability/results/tree_seed_variability_normalized_dist.pdf', dpi = 300)
plt.close(fig)

In [35]:
# Penalised distances per barcode, followed by the two random inter-tree columns
box_order = sorted_barcode_names + ['TLS_Trees', 'TLSCL_Trees']

fig, ax = plt.subplots(figsize = (25, 5))
sns.boxplot(data = barcode_pen_dists_df, order = box_order, ax = ax)
ax.set(ylabel = 'Normalized Distance + Penalty', title = 'Distances With Penalty')
fig.savefig('/Genomics/chanlab/blaw/TLS/data/ternary_dists/tree_seed_variability/results/tree_seed_variability_normalized_dist_penalty.pdf', dpi = 300)
plt.close(fig)