In [2]:
import cassiopeia as cas
import pandas as pd
import numpy as np
import pickle
import os
from ete3 import Tree
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import random
from random import randrange
import itertools
from scipy import stats

In [3]:
# Load the pickled colour dictionary; its keys are used further down as the cell-state names.
# NOTE(review): pickle.load executes arbitrary code, so only point this at a trusted file.
clusterColorsFile = "/Genomics/chanlab/mchan/Adriano/TLS/TLS_TLSCL/20211102_clusterColorsTLSCL.p"
with open(clusterColorsFile, mode = 'rb') as fp:
    colorDict = pickle.load(fp)

# Load the table of cell states (tab separated; read below through its 'cellBC' and 'cell_state' columns)
cell_state_table = pd.read_csv(
    '/Genomics/chanlab/blaw/TLS/metadata/TLS_120h_1_cellBC_cellState.tsv',
    sep = '\t',
)

# Subsample TLS1 into 200-cell allele tables

In [3]:
# Filtered TLS1 allele table (tab separated); the downsampling cell below draws cells from its 'cellBC' column
TLS1_allele_table = pd.read_csv(
    '/Genomics/chanlab/blaw/TLS/data/AM-DNA-097/lineage/2_lineage_reconstruction/allele_table_filtered.txt',
    sep = '\t',
)

In [60]:
# NOTE: this cell is intentionally disabled. The code is wrapped in a triple-quoted string, so
# running the cell only evaluates a string literal and writes nothing to disk.
# When enabled, it draws 30 random subsets of 200 unique cellBCs (without replacement) from
# TLS1_allele_table, creates the TLS1_to_200 output directories on first use, and writes one
# allele table per subset into allele_tables/.
# NOTE(review): np.random.choice is called without a seed here, so re-enabling the cell will
# produce a different set of downsampled tables than the ones already on disk.
'''
# don't rerun this unless you want to create new downsampled allele tables
for i in range(30):
    temp_allele_cells = np.random.choice(TLS1_allele_table['cellBC'].unique(), size = 200, replace = False)
    temp_allele = TLS1_allele_table[TLS1_allele_table['cellBC'].isin(temp_allele_cells)].copy()

    if not os.path.exists('/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/{}_to_200/'.format('TLS1')):
        os.mkdir('/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/{}_to_200/'.format('TLS1'))
        os.mkdir('/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/{}_to_200/newick/'.format('TLS1'))
        os.mkdir('/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/{}_to_200/allele_tables/'.format('TLS1'))
        os.mkdir('/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/{}_to_200/trees/'.format('TLS1'))
        os.mkdir('/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/{}_to_200/trees/logs/'.format('TLS1'))

    temp_allele.to_csv('/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/{}_to_200/allele_tables/{}_{}_to_200_downsample_allele_table.txt'.format('TLS1', i, 'TLS1'), sep = '\t')
    
'''

# Reconstruct trees from these allele tables using the following Python script

- 'TLS1_to_200_downsampling_Lineage_Reconstruction.py'

# Process the tree files

In [4]:
# Label internal nodes
def nameInteriorNodes(nwkFile,outnwkFile):
    '''
    Input:
        - nwkFile: path of a newick file, parsed with ete3 format 1
        - outnwkFile: path the relabeled tree is written to (ete3 format 8)

    Every non-leaf node is renamed "node<k>", where k counts up from 0 in the
    order the nodes are visited by t.traverse(). Leaf names are left untouched.
    '''
    tree = Tree(nwkFile, format=1)

    # Number only the internal nodes, in traversal order
    internal_nodes = (n for n in tree.traverse() if not n.is_leaf())
    for labelID, internal in enumerate(internal_nodes):
        internal.name = "node{}".format(labelID)

    tree.write(outfile=outnwkFile, format=8)

In [5]:
def add_node0(nwkFile, outnwkFile):
    '''
    Insert the root label "node0" in front of the terminating ';' of a newick string.

    Input:
        - nwkFile: path of a newick file; only its first line is read
        - outnwkFile: path the relabeled newick string is written to (no trailing newline)

    Trailing whitespace (e.g. a final newline) is ignored and at most one trailing ';'
    is removed before "node0;" is appended, so the result is the same whether or not
    the input line ends with a newline.
    '''
    # Read first and close the input before the output is opened, so a failed read
    # cannot leave an open handle or a truncated output file behind.
    with open(nwkFile, 'r') as infile:
        newick = infile.readline().rstrip()

    # Remove the terminator so the root label lands before it, not after it
    if newick.endswith(';'):
        newick = newick[:-1]

    with open(outnwkFile, 'w') as outfile:
        outfile.write(newick + 'node0;')

In [6]:
# For each of the 30 downsampled trees: name the interior nodes, then add the root label
# in a second pass (the intermediate file is the "node0_missing" one).
for barcode in ['TLS1']:
    for i in range(30):
        hybrid_newick = '/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/{}_to_200/trees/{}_{}_to_200_downsampling_hybrid_newick_noMutationlessEdges.txt'.format(barcode, i, barcode)
        node0_missing_newick = '/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/{}_to_200/trees/{}_{}_to_200_downsampling_node0_missing.txt'.format(barcode, i, barcode)
        labeled_newick = '/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/{}_to_200/newick/{}_{}_to_200_downsampling_newick_noMutationlessEdges_Labeled.nwk'.format(barcode, i, barcode)

        nameInteriorNodes(hybrid_newick, node0_missing_newick)
        add_node0(node0_missing_newick, labeled_newick)

# Make a node table that has all the replicates

In [6]:
def maxDepth(node):
    '''
    Input:
        - a node in an ete tree
    returns:
        - The number of edges on the longest path from the node down to one of its
          leaves (0 when the node itself is a leaf)
    '''
    if node.is_leaf():
        return 0

    # One edge to reach the deepest child, plus that child's own depth
    return 1 + max(maxDepth(child) for child in node.children)

In [7]:
def countNMP_Somite_Neural(node, cell_state_table):
    '''
    Input:
        - a node in an ete tree
        - a table of cell states for each cellBC ('cellBC' and 'cell_state' columns)
    return:
        - A tuple (NMP, somitic, neural) with the number of table rows whose cellBC is a
          leaf of the node and whose cell_state falls in the respective group.
          Rows with any other cell_state are not counted.
    '''
    # Cell states pooled into each of the three groups
    somitic_states = ['pPSM', 'aPSM', 'Somite', 'Somite0', 'Somite1', 'Somite-1', 'SomiteSclero', 'SomiteDermo']
    neural_states = ['NeuralTube1', 'NeuralTube2']
    nmp_states = ['NMPs']

    leaf_names = [leaf.name for leaf in node.get_leaves()]
    under_node = cell_state_table['cellBC'].isin(leaf_names)
    states = cell_state_table.loc[under_node, 'cell_state']

    NMP_count = int(states.isin(nmp_states).sum())
    somitic_count = int(states.isin(somitic_states).sum())
    neural_count = int(states.isin(neural_states).sum())

    return (NMP_count, somitic_count, neural_count)

In [8]:
node_columns = ['Sample_ID', 'Barcode', 'Node', 'Node Size', 'Clone', 'Clone Size', 'Max Clone Depth', 'Dist to Clone', 'Percent NMP', 
                'Percent Somitic', 'Percent Neural']

# A node is discarded when any of its leaves has one of these cell states.
# Both 'PCGLC' (the spelling this cell has always checked) and 'PGCLC' (the spelling used in the
# comment inside countNMP_Somite_Neural) are listed so the filter works with either label.
# NOTE(review): confirm which spelling the cell_state column actually uses.
excluded_states = {'Endoderm', 'PCGLC', 'PGCLC', 'Unknown', 'Endothelial'}

# Nodes with fewer leaves than this are discarded
min_node_leaves = 4

# Store the node information of nodes to filter
bad_nodes = []
small_nodes = []

# Rows are collected here and the DataFrame is built once at the end; growing a DataFrame with
# pd.concat / .loc inside the loop re-copies the whole table for every tree and every cell written.
node_ids = []
node_records = []

for barcode in ['TLS1']:
    for i in range(30):
        treeFile = '/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/{}_to_200/newick/{}_{}_to_200_downsampling_newick_noMutationlessEdges_Labeled.nwk'.format(barcode, i, barcode)
        t = Tree(treeFile, format = 1)

        # Map every internal node name to its clone (the child of the root it descends from) and
        # that clone's summary values. Internal node names are assumed unique within a tree
        # (nameInteriorNodes assigns "node<k>"). The root belongs to no clone, so it is never kept.
        clone_lookup = {}
        for clone in t.children:
            clone_summary = (clone, maxDepth(clone), len(clone.get_leaves()))
            for member in clone.traverse():
                if not member.is_leaf():
                    clone_lookup[member.name] = clone_summary

        # Walk the whole tree in t.traverse() order so the rows keep the order they have always had
        for node in t.traverse():
            if node.is_leaf() or node.name not in clone_lookup:
                continue

            node_id = '{}_{}_{}'.format(i, barcode, node.name)
            clone, clone_max_depth, clone_size = clone_lookup[node.name]

            leaves = [leaf.name for leaf in node.get_leaves()]
            cell_types = cell_state_table[cell_state_table['cellBC'].isin(leaves)]['cell_state'].to_list()

            # Record nodes that are removed: any excluded cell type among the leaves, or too few leaves
            if excluded_states.intersection(cell_types):
                bad_nodes.append(node_id)
                continue
            if len(leaves) < min_node_leaves:
                small_nodes.append(node_id)
                continue

            NMP_count, somitic_count, neural_count = countNMP_Somite_Neural(node, cell_state_table)

            total = somitic_count + NMP_count + neural_count
            if total > 0:
                NMP_frac = NMP_count / total
                somitic_frac = somitic_count / total
                neural_frac = neural_count / total
            else:
                # No NMP, somitic or neural leaves under this node
                NMP_frac = 0
                somitic_frac = 0
                neural_frac = 0

            node_ids.append(node_id)
            node_records.append({
                'Sample_ID': i,
                'Barcode': barcode,
                'Node': node.name,
                'Node Size': len(leaves),
                'Clone': clone.name,
                'Clone Size': clone_size,
                'Max Clone Depth': clone_max_depth,
                'Dist to Clone': t.get_distance(clone, node),
                'Percent NMP': NMP_frac,
                'Percent Somitic': somitic_frac,
                'Percent Neural': neural_frac,
            })

# dtype=object keeps the stored values as the plain Python numbers computed above, matching the
# table this cell produced when it was filled cell-by-cell (downstream arithmetic and the CSV
# written later therefore see the same values).
node_info = pd.DataFrame(node_records, index = node_ids, columns = node_columns, dtype = object)

In [9]:
# Calculate the normalized Depth of each node
node_info['Normalized Dist'] = node_info['Dist to Clone'] / node_info['Max Clone Depth']

# Per-node flags used by the progenitor type rules
makes_neural = (node_info['Percent Neural'] > 0).to_numpy()
makes_somitic = (node_info['Percent Somitic'] > 0).to_numpy()
no_neural = (node_info['Percent Neural'] == 0).to_numpy()
no_somitic = (node_info['Percent Somitic'] == 0).to_numpy()

# Progenitor type, first matching rule wins ('-' when none matches):
#   Bipotent           - both neural and somitic cells (with or without NMPs)
#   Somitic Committed  - somitic cells but no neural cells
#   Neural Committed   - neural cells but no somitic cells
#   Proliferating      - neither somitic nor neural cells
node_info['Progenitor Type'] = np.select(
    [makes_neural & makes_somitic,
     makes_somitic & no_neural,
     no_somitic & makes_neural,
     no_somitic & no_neural],
    ['Bipotent', 'Somitic Committed', 'Neural Committed', 'Proliferating'],
    default = '-',
)

# Progenitor pool, by the fraction of NMPs under the node ('-' when none matches):
#   Proliferating   - exclusively NMPs
#   Differentiating - some NMPs
#   Exhausted       - no NMPs
nmp_fraction = node_info['Percent NMP']
node_info['Progenitor Pool'] = np.select(
    [(nmp_fraction == 1).to_numpy(),
     (nmp_fraction > 0).to_numpy(),
     (nmp_fraction == 0).to_numpy()],
    ['Proliferating', 'Differentiating', 'Exhausted'],
    default = '-',
)

In [10]:
# Save the filtered, classified node table (tab separated, index included)
node_info.to_csv(
    '/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/TLS1_to_200_node_information_filtered.txt',
    sep = '\t',
)

# Calculate the distance between every pair of trees


In [11]:
def calcDist(node1, node2, node_info):
    '''
    input:
        node1 - a name of the first node, assumed to be an index in node_info
        node2 - a name of the second node, assumed to be an index in node_info
        node_info - a node info table with 'Percent NMP', 'Percent Neural' and 'Percent Somitic' columns

    returns:
        The euclidean distance between the 2 nodes in the space of those 3 fractions
    '''
    squared_diffs = []
    for col in ('Percent NMP', 'Percent Neural', 'Percent Somitic'):
        delta = node_info.loc[node1, col] - node_info.loc[node2, col]
        squared_diffs.append(delta ** 2)

    return np.sqrt(squared_diffs[0] + squared_diffs[1] + squared_diffs[2])

In [12]:
def calcTernaryDist(tree1_name, tree2_name, node_info, meta_col = 'Sample_ID'):
    '''
    input:
        tree1_name - value of meta_col that selects all nodes of tree1 in node_info
        tree2_name - value of meta_col that selects all nodes of tree2 in node_info
        node_info - a node_info table that contains all nodes in tree1 and tree2, the column from meta_col and the fractions of NMP, Neural, and Somitic.
        meta_col - A column name in node_info that can be used to split tree1 and tree2
    returns:
        a tuple (dist, pen_dist):
            dist - nodes of the two trees are paired greedily: the closest remaining pair (by calcDist)
                   is matched and removed, until the smaller tree has no nodes left. dist is the
                   summed distance of the matched pairs divided by the number of pairs.
            pen_dist - dist plus 1 for every node of the larger tree that was left unmatched.
    raises:
        ZeroDivisionError if either tree has no nodes in node_info (there is nothing to pair)
    '''
    tree1_nodes = node_info[node_info[meta_col] == tree1_name].index
    tree2_nodes = node_info[node_info[meta_col] == tree2_name].index

    min_nodes = min(len(tree1_nodes), len(tree2_nodes))
    if min_nodes == 0:
        raise ZeroDivisionError(
            'calcTernaryDist needs at least one node per tree: {!r} has {} and {!r} has {}'.format(
                tree1_name, len(tree1_nodes), tree2_name, len(tree2_nodes)))

    # One penalty point per node that cannot be paired because the other tree is smaller
    pen = max(len(tree1_nodes), len(tree2_nodes)) - min_nodes

    # Rows are tree1 nodes, columns are tree2 nodes. Built as floats once, so no per-iteration
    # numeric conversion is needed during the matching below.
    remaining = pd.DataFrame(
        [[calcDist(i, j, node_info) for j in tree2_nodes] for i in tree1_nodes],
        index = tree1_nodes, columns = tree2_nodes, dtype = float)

    total_dist = 0
    for _ in range(min_nodes):
        # Ties resolve to the first column holding the overall minimum, then the first row in it
        min_col = remaining.min(axis = 0).idxmin()
        min_row = remaining[min_col].idxmin()

        total_dist += remaining.loc[min_row, min_col]

        # Each node can be matched only once
        remaining = remaining.drop(index = min_row, columns = min_col)

    return (total_dist / min_nodes, (total_dist / min_nodes) + pen)

In [14]:
# Every unordered pair of downsampled trees, and the "<id1>_<id2>" label of each comparison
pairwise_trees = list(itertools.combinations(node_info['Sample_ID'].unique(), 2))
comparison_list = ['{}_{}'.format(x, y) for x, y in pairwise_trees]

# One (still empty) row per comparison
TLS1_dists_df = pd.DataFrame(
    index = comparison_list,
    columns = ['ID_1', 'ID_2', 'Dist', 'Pen_Dist', 'Comp_Dist'],
)

In [15]:
# Fill in the ternary distances; comparison_list holds the "<id1>_<id2>" row label of each pair
# in the same order as pairwise_trees. 'Comp_Dist' is left blank here and filled in later.
for name, (ID_1, ID_2) in zip(comparison_list, pairwise_trees):
    dist, pen_dist = calcTernaryDist(int(ID_1), int(ID_2), node_info)
    TLS1_dists_df.loc[name] = [ID_1, ID_2, dist, pen_dist, '']

# Plot the results

In [31]:
# Box plot of the normalized ternary distance over all pairs of downsampled TLS1 trees
fig, ax = plt.subplots()
ax.boxplot(TLS1_dists_df['Dist'], labels = ['Intra TLS1'])
ax.set_ylim(0, 0.8)
ax.set_ylabel('Normalized Ternary Dist')
fig.savefig('/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/results/TLS1_ternary_dist.pdf', dpi = 300)
#plt.show()
plt.close(fig)

# Find composition distance

In [19]:
def aitchisonDist (x, y):
    '''
    Input:
        x - a list or array of composition values in sample1
        y - a list or array of composition values in sample2

        Assumes that all 0 values have been removed (logs are taken, so every value must be > 0)
        and that the composition values in the two inputs are connected by index
    return:
        the Aitchison distance between the 2 vectors: the euclidean distance between their
        centered log-ratio transforms, clr(v)_i = ln(v_i / g(v)), where g(v) is the GEOMETRIC
        mean of v
    raises:
        ValueError if x and y are empty or do not have the same length
    '''
    # dtype=float also converts object arrays (e.g. the .values of an object-dtype row)
    x_arr = np.asarray(x, dtype = float)
    y_arr = np.asarray(y, dtype = float)

    if x_arr.size == 0 or x_arr.shape != y_arr.shape:
        raise ValueError(
            'x and y must be non-empty and of equal length, got shapes {} and {}'.format(
                x_arr.shape, y_arr.shape))

    log_x = np.log(x_arr)
    log_y = np.log(y_arr)

    # ln(v_i / g(v)) = ln(v_i) - mean(ln(v)): centering the logs on their mean divides by the
    # geometric mean. (Dividing by the arithmetic mean instead does not give the Aitchison distance.)
    clr_x = log_x - log_x.mean()
    clr_y = log_y - log_y.mean()

    return np.sqrt(np.sum((clr_x - clr_y) ** 2))

In [20]:
# One row per downsampled tree, one column per key of colorDict (the cell-state names)
TLS_compositions = pd.DataFrame(index = node_info['Sample_ID'].unique(), columns = colorDict.keys())

# Copy of the cell state table indexed by cellBC, for direct per-leaf lookups
temp_cell_state_table = cell_state_table.copy().set_index('cellBC')

barcode = 'TLS1'
for index in TLS_compositions.index:
    ID = index

    treeFile = '/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/{}_to_200/newick/{}_{}_to_200_downsampling_newick_noMutationlessEdges_Labeled.nwk'.format(barcode, ID, barcode)
    t = Tree(treeFile, format = 1)
    leaves = [leaf.name for leaf in t.get_leaves()]

    # Start every state at 0 so states that are absent from this tree are still written
    cell_counts = dict.fromkeys(colorDict.keys(), 0)
    for leaf in leaves:
        cell_counts[temp_cell_state_table.loc[leaf, 'cell_state']] += 1

    for cell_state, count in cell_counts.items():
        TLS_compositions.loc[index, cell_state] = count

In [21]:
# Compress all the cell types down to just 3 categories: NMPs stay as they are, the
# NeuralTube states are summed into 'Neural' and the PSM / Somite states into 'Somitic'
neural_states = ['NeuralTube1', 'NeuralTube2']
somitic_states = ['pPSM', 'aPSM', 'Somite-1', 'Somite0', 'Somite', 'SomiteSclero', 'SomiteDermo']

temp_TLS_compositions = pd.DataFrame(index = TLS_compositions.index, columns = ['NMPs', 'Neural', 'Somitic'])

for i in temp_TLS_compositions.index:
    sample_counts = TLS_compositions.loc[i]

    temp_TLS_compositions.loc[i, 'NMPs'] = sample_counts['NMPs']
    temp_TLS_compositions.loc[i, 'Neural'] = sample_counts[neural_states].sum()
    temp_TLS_compositions.loc[i, 'Somitic'] = sample_counts[somitic_states].sum()

In [22]:
# Compositional distance between the two trees of every comparison.
# The small pseudocount keeps zero counts out of the logarithms taken by aitchisonDist.
pseudocount = 0.0001

for i, id_pair in TLS1_dists_df[['ID_1', 'ID_2']].iterrows():
    ID_1, ID_2 = int(id_pair['ID_1']), int(id_pair['ID_2'])

    x_vector = temp_TLS_compositions.loc[ID_1].values + pseudocount
    y_vector = temp_TLS_compositions.loc[ID_2].values + pseudocount

    TLS1_dists_df.loc[i, 'Comp_Dist'] = aitchisonDist(x_vector, y_vector)

In [23]:
# Save the pairwise distance table (tab separated, comparison labels as the index)
TLS1_dists_df.to_csv(
    '/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/TLS1_to_200/TLS1_to_200_intra_structure_dists.txt',
    sep = '\t',
)

In [4]:
# Reload the saved pairwise distance table (first column is the comparison label index),
# so the plots below can be made without recomputing the distances
TLS1_dists_df = pd.read_csv(
    '/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/TLS1_to_200/TLS1_to_200_intra_structure_dists.txt',
    sep = '\t',
    index_col = 0,
)

In [6]:
# Box plot of the compositional distance over all pairs of downsampled TLS1 trees
data = [TLS1_dists_df['Comp_Dist']]

fig, ax = plt.subplots()
ax.boxplot(data, labels = ['TLS1'])
ax.set_title('Intra Compositional Distances')
ax.set_ylabel('Aitchison Distance')
ax.set_ylim(0, 2)
fig.savefig('/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/results/TLS1_composition_dist.pdf', dpi = 300)
#plt.show()
plt.close(fig)

In [7]:
def abline(slope, intercept, color):
    """Draw the dashed line y = intercept + slope * x on the current axes.

    The line spans the x-limits the current axes have at the time of the call;
    color is a matplotlib format-string colour code (e.g. 'r').
    """
    x_span = np.array(plt.gca().get_xlim())
    plt.plot(x_span, intercept + slope * x_span, '--{}'.format(color))

In [9]:
# Scatter of ternary distance against compositional distance, with a least-squares fit line
fig, ax = plt.subplots(figsize = (5, 5))

labels = []

comp_dist = TLS1_dists_df['Comp_Dist']
ternary_dist = TLS1_dists_df['Dist']

ax.plot(comp_dist, ternary_dist, '.r', alpha = 0.05)
slope_temp, intercept_temp, r, p, se = stats.linregress(comp_dist.tolist(), ternary_dist.tolist())
# Drawn before the axis limits are set below, so the line spans the autoscaled x-range of the data
abline(slope_temp, intercept_temp, 'r')

ax.legend(labels = ['TLS1 Intra', 'TLS1 Intra Line'], bbox_to_anchor=(1, 0.55))
ax.set_title('Ternary Dist vs Compositional Dist')
ax.set_xlabel('Comp Dist')
ax.set_ylabel('Ternary Dist')
ax.set_xlim(0, 2)
ax.set_ylim(0, 1)
fig.savefig('/Genomics/chanlab/blaw/TLS/data/ternary_dists/downsampling_to_200/TLS1_to_200/TLS1_ternary_vs_comp_dist.pdf', dpi = 300)
#plt.show()
plt.close(fig)