In [1]:
import itertools
import os
import pickle

import cassiopeia as cas
import numpy as np
import pandas as pd
import scanpy as sc
import scipy as sp
import seaborn as sns
from ete3 import Tree
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.pyplot import rc_context

In [2]:
# Cluster colour lookup, stored as a pickle.
# NOTE(review): pickle.load executes arbitrary code from the file -- only point this at trusted files.
clusterColorsFile = "/Genomics/chanlab/mchan/Adriano/TLS/TLS_TLSCL/20211102_clusterColorsTLSCL.p"
with open(clusterColorsFile, 'rb') as color_handle:
    colorDict = pickle.load(color_handle)

# Cell state table (tab separated); the functions below read its 'cellBC',
# 'cell_state' and 'timepoint' columns
cell_state_table = pd.read_csv(
    '/Genomics/chanlab/blaw/TLS/metadata/TLS_Explant_Total_cellBC_cellState.tsv',
    sep = '\t',
)

# Barcode identifiers; the analysis cells loop over these
barcodes = [
    'Bar1',
    'Bar2',
    'Bar3',
    'Bar4',
    'Bar5',
    'Bar6',
]

# Labels that getProgenitorType can return ('Dropped' = no category matched)
progenitor_list = [
    'Extended Progenitor',
    'Pluripotent Progenitor',
    'Bipotent Progenitor',
    'Neural Progenitor',
    'Somitic Progenitor',
    'PGCLC Progenitor',
    'Endoderm Progenitor',
    'Dropped',
]

# Helpful functions

In [3]:
def maxDepth(node):
    '''
    Height of the subtree hanging below a node.

    Input:
        - node: a node in an ete tree (only is_leaf() and .children are used)
    returns:
        - the number of edges on the longest path from this node down to a leaf;
          a leaf itself has depth 0
    '''
    if node.is_leaf():
        return 0

    # one step down to the children, plus the deepest of the child subtrees
    return 1 + max([maxDepth(child) for child in node.children])

In [4]:
def getProgenitorType(leaves, cell_state_table):
    '''
    Classify a node by the set of (grouped) cell states found among its extant cells.

    input:
        - leaves: a list of cellBCs that are the extant cells of a node
        - cell_state_table: a table with a 'cellBC' column and a 'cell_state' column
    output:
        - one of the labels below, or 'Dropped' when the observed states match none of them

    Grouping applied before classification:
        - Somitic = pPSM, aPSM, Somite-1, Somite0, Somite, Somite1, SomiteSclero, SomiteDermo
        - Neural  = NeuralTube1, NeuralTube2
        - PCGLC, Endoderm and Endothelial are kept as they are
        - every other state (NMPs, Unassigned / Unknown, ...) is ignored, so its presence
          or absence never changes the label

    Labels (the set of grouped states must match exactly):
        - Extended Progenitor:    PCGLC + Endoderm + Somitic + Neural (Endothelial allowed +/-)
        - Pluripotent Progenitor: Endoderm + Somitic + Neural         (Endothelial allowed +/-)
        - Bipotent Progenitor:    Somitic + Neural
        - Endoderm Progenitor:    Endoderm only
        - PGCLC Progenitor:       PCGLC only
        - Somitic Progenitor:     Somitic only
        - Neural Progenitor:      Neural only

    Any other combination is 'Dropped' (for example Endoderm with only one of Somitic / Neural,
    PCGLC without all three other lineages, Endothelial in any non Extended / Pluripotent set,
    or a node with no classifiable cells at all).

    The exclusive-NMP ("self renewing NMP") label is not produced by this function.

    NOTE(review): the state is matched as 'PCGLC' (not 'PGCLC'); presumably this is the spelling
    used in the 'cell_state' column -- confirm against the table.
    '''
    # raw cell state -> grouped lineage; states missing from this map are ignored
    lineage_of = {}
    for somitic_state in ['pPSM', 'aPSM', 'Somite', 'Somite0', 'Somite1', 'Somite-1', 'SomiteSclero', 'SomiteDermo']:
        lineage_of[somitic_state] = 'Somitic'
    for neural_state in ['NeuralTube1', 'NeuralTube2']:
        lineage_of[neural_state] = 'Neural'
    for kept_state in ['PCGLC', 'Endoderm', 'Endothelial']:
        lineage_of[kept_state] = kept_state

    # exact lineage set -> label (the Endothelial variants share the label of their base set)
    label_of = {
        frozenset(['PCGLC', 'Endoderm', 'Somitic', 'Neural']): 'Extended Progenitor',
        frozenset(['PCGLC', 'Endoderm', 'Somitic', 'Neural', 'Endothelial']): 'Extended Progenitor',
        frozenset(['Endoderm', 'Somitic', 'Neural']): 'Pluripotent Progenitor',
        frozenset(['Endoderm', 'Somitic', 'Neural', 'Endothelial']): 'Pluripotent Progenitor',
        frozenset(['Somitic', 'Neural']): 'Bipotent Progenitor',
        frozenset(['Endoderm']): 'Endoderm Progenitor',
        frozenset(['PCGLC']): 'PGCLC Progenitor',
        frozenset(['Somitic']): 'Somitic Progenitor',
        frozenset(['Neural']): 'Neural Progenitor',
    }

    # cell states of the cells that belong to this node
    in_node = cell_state_table['cellBC'].isin(leaves)
    observed_states = cell_state_table[in_node]['cell_state'].to_list()

    observed_lineages = frozenset(
        lineage_of[state] for state in observed_states if state in lineage_of
    )

    return label_of.get(observed_lineages, 'Dropped')

In [5]:
def getProgenitorType_FC(leaves, cell_state_table):
    '''
    "Full combination" classification: label a node with every grouped state seen in its cells.

    input:
        - leaves: a list of cellBCs that are the extant cells of a node
        - cell_state_table: a table with a 'cellBC' column and a 'cell_state' column
    output:
        - the observed states joined with '_' in this fixed order:
              PCGLC, Endoderm, Endothelial, NMPs, Somitic, Neural
          e.g. 'Somitic', 'NMPs_Neural' or 'Endoderm_Somitic_Neural'
        - 'Dropped' when none of the six states is observed

    Neural class is made from NeuralTube1 and NeuralTube2
    Somite class is pPSM, aPSM, Somite-1, Somite0, Somite, Somite1, SomiteSclero, SomiteDermo

    Any other cell state (Unassigned / Unknown, ...) is ignored for this classification (+/-)
    '''
    # the order fixes how multi-state labels are spelled; it is the order in which
    # itertools.combinations over this list would emit the members of each combination
    ordered_states = ['PCGLC', 'Endoderm', 'Endothelial', 'NMPs', 'Somitic', 'Neural']

    somitic_states = ['pPSM', 'aPSM', 'Somite', 'Somite0', 'Somite1', 'Somite-1', 'SomiteSclero', 'SomiteDermo']
    neural_states = ['NeuralTube1', 'NeuralTube2']
    kept_states = ['PCGLC', 'Endoderm', 'Endothelial', 'NMPs']

    # cell states of the cells that belong to this node
    in_node = cell_state_table['cellBC'].isin(leaves)
    observed_states = cell_state_table[in_node]['cell_state'].to_list()

    # collapse the raw states into the six tracked categories
    present = set()
    for state in observed_states:
        if state in somitic_states:
            present.add('Somitic')
        elif state in neural_states:
            present.add('Neural')
        elif state in kept_states:
            present.add(state)

    # nothing classifiable in this node
    if not present:
        return 'Dropped'

    return '_'.join([state for state in ordered_states if state in present])

In [6]:
def getTimeType(leaves, cell_state_table):
    '''
    Classify a node by the timepoints of its extant cells.

    input:
        - leaves: a list of cellBCs that are the extant cells of a node
        - cell_state_table: a table with a 'cellBC' column and a 'timepoint' column
    output:
        - '120'     when every cell of the node is from 120h
        - '144'     when every cell of the node is from 144h
        - '120_144' when both timepoints are present
        - 'Dropped' for anything else (no matching cells, or a timepoint whose str() is
          neither '120' nor '144')

    Timepoints are compared through str(), so the integers 120 / 144 match, whereas a
    float such as 120.0 (str -> '120.0') does not.
    '''
    label_of = {
        frozenset(['120']): '120',
        frozenset(['144']): '144',
        frozenset(['120', '144']): '120_144',
    }

    # timepoints, as strings, of the cells that belong to this node
    in_node = cell_state_table['cellBC'].isin(leaves)
    observed = frozenset(str(value) for value in cell_state_table[in_node]['timepoint'].to_list())

    return label_of.get(observed, 'Dropped')

In [7]:
def countTimepoint(leaves, cell_state_table):
    '''
    Count how many of the given cells come from each timepoint.

    Input:
        - leaves: a list of cellBCs; each one must be present in the 'cellBC' column
          of the cell state table (a missing cellBC raises a KeyError)
        - cell_state_table: a table with a 'cellBC' column and a 'timepoint' column
    return:
        - A tuple (number of 120h cells, number of 144h cells); cells with any other
          timepoint are not counted
    '''
    # look-up of timepoint by cellBC (set_index returns a new table, the input is untouched)
    timepoint_of = cell_state_table.set_index('cellBC')['timepoint']

    n_120, n_144 = 0, 0
    for cell in leaves:
        observed = timepoint_of.loc[cell]
        if observed == 120:
            n_120 = n_120 + 1
        elif observed == 144:
            n_144 = n_144 + 1

    return (n_120, n_144)

In [10]:
def shuffledBG (file_name, empty_node_info, cell_state_table, node_leaf_dict, dropped_leaf_dict, output_dir, FC = False, numShuffles = 500, seed = None):
    '''
    Shuffled background for node progenitor types: permute the 'cell_state' labels across
    the cells, re-classify every node, and record how often / how deep each progenitor type
    occurs in each iteration.

    input:
        file_name - an experiment name for saving the files
        empty_node_info - a table that contains all the nodes in the tree as indexes, a 'Progenitor Type'
            column to be populated and a 'Normalized Dist' column holding the normalized depth of each node.
            The table itself is not modified (each iteration works on a copy)
        cell_state_table - a table that contains the assigned cell state for each cell in the tree
            ('cellBC' and 'cell_state' columns). Any nodes that are added back after the trimming (keeping
            the top node of a nest set) should be added into these columns. This table should ONLY contain
            cells that are present on the tree / node table so that the shuffling is accurate
        node_leaf_dict - a dictionary that saves a list of leaves that are extant cells for a given node (key)
        dropped_leaf_dict - a dictionary that saves the cells that are removed from the tree (via trimming) as
            keys and the new node values as values to be looked up in the cell state table
        output_dir - a file path to an existing directory to save the files (with or without a trailing separator)
        FC - a boolean to use the full combination of progenitor states or the smaller subset (default to False)
        numShuffles - the number of iterations to do the shuffled BG (default to 500)
        seed - optional integer seed to make the shuffles reproducible. The default (None) keeps using the
            global numpy random state
    output:
        Nothing is returned. If output_dir does not exist a message is printed and nothing is computed.
        Otherwise three files are written to output_dir:
        <file_name>_shuffledBG_Counts.txt - csv of df_bgCounts, the counts for each progenitor state in each
            iteration of the shuffledBG
        <file_name>_shuffledBG_Means.txt - csv of df_bgVals, the mean and median of normalized depth for each
            progenitor state for each itr (0 when the state was not observed in that itr)
        <file_name>_shuffledBG_Distributions.pickle - pickle of bgDists_dict, a dictionary that saves, for each
            progenitor state, a list with one array of normalized depths per itr in which the state was observed
    '''
    # check if the output_dir is real
    if not os.path.exists(output_dir):
        print('output_dir path does not exists')
        return

    itrList = ['itr' + str(i) for i in range(numShuffles)]

    # pick the classifier and the list of labels that are tallied
    if FC:
        states = ['PCGLC', 'Endoderm', 'Endothelial', 'NMPs', 'Somitic', 'Neural']
        # single states followed by every combination of 2..6 states, spelled like getProgenitorType_FC labels.
        # NOTE: 'Dropped' is not part of this list, so it is not tallied in FC mode
        progenitor_types = list(states)
        for size in range(2, len(states) + 1):
            for combination in itertools.combinations(states, size):
                progenitor_types.append('_'.join(combination))
        classify = getProgenitorType_FC
    else:
        progenitor_types = ['Extended Progenitor', 'Pluripotent Progenitor', 'Bipotent Progenitor', 'Neural Progenitor', 'Somitic Progenitor', 'PGCLC Progenitor', 'Endoderm Progenitor', 'Dropped']
        classify = getProgenitorType

    indexList = [progenitor + '_' + stat for progenitor in progenitor_types for stat in ['Mean', 'Median']]

    # initialize the 3 variables to save the results
    df_bgCounts = pd.DataFrame(index = progenitor_types, columns = itrList)
    df_bgVals = pd.DataFrame(index = indexList, columns = itrList)
    bgDists_dict = {}

    # The cells behind each node do not change between iterations, so resolve them once:
    # a trimmed leaf is replaced by the entry it maps to in dropped_leaf_dict
    leaves_by_node = {}
    for node in empty_node_info.index:
        leaves_by_node[node] = [dropped_leaf_dict.get(leaf, leaf) for leaf in node_leaf_dict[node]]

    # seed = None -> pandas falls back to the global numpy random state (previous behaviour)
    random_state = None if seed is None else np.random.RandomState(seed)

    for itr in range(numShuffles):
        itr_label = 'itr{}'.format(itr)

        # Assign a temp node_info table with progenitor types not classified
        node_info_itr = empty_node_info.copy()

        # Randomly shuffle the cell_state annotations in a cell state table
        shuffled_cell_state_table = cell_state_table.copy()
        shuffled_cell_state_table['cell_state'] = shuffled_cell_state_table['cell_state'].sample(frac = 1, random_state = random_state).values

        # Classify every node with the shuffled table. The whole column is assigned at once:
        # the previous df.loc[node]['Progenitor Type'] = ... wrote into a temporary row object,
        # so the classification could be lost without any error
        node_info_itr['Progenitor Type'] = [classify(leaves_by_node[node], shuffled_cell_state_table) for node in node_info_itr.index]

        for progenitor in progenitor_types:
            # check if this progenitor has not been added to the dict yet
            bgDists_dict.setdefault(progenitor, [])

            matching_nodes = node_info_itr[node_info_itr['Progenitor Type'] == progenitor]
            n_nodes = len(matching_nodes)
            df_bgCounts.loc[progenitor, itr_label] = n_nodes

            if n_nodes > 0:
                depths = matching_nodes['Normalized Dist']
                df_bgVals.loc[progenitor + '_Mean', itr_label] = depths.mean()
                df_bgVals.loc[progenitor + '_Median', itr_label] = depths.median()
                bgDists_dict[progenitor].append(depths)
            else:
                # assign the mean and median to be 0 if the node type was not observed in this iteration
                df_bgVals.loc[progenitor + '_Mean', itr_label] = 0
                df_bgVals.loc[progenitor + '_Median', itr_label] = 0

    # Save the 3 objects (os.path.join so that output_dir works with or without a trailing separator)
    with open(os.path.join(output_dir, '{}_shuffledBG_Distributions.pickle'.format(file_name)), 'wb') as handle:
        pickle.dump(bgDists_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    df_bgVals.to_csv(os.path.join(output_dir, '{}_shuffledBG_Means.txt'.format(file_name)))

    df_bgCounts.to_csv(os.path.join(output_dir, '{}_shuffledBG_Counts.txt'.format(file_name)))

In [45]:
def shuffledBG_time (file_name, tree, empty_node_info, cell_state_table, node_leaf_dict, dropped_leaf_dict, output_dir, single_cell_clones = True, numShuffles = 500, seed = None):
    '''
    Shuffled background for timepoints: permute the 'timepoint' labels across the cells,
    re-classify every node with getTimeType, and record node / clone statistics per iteration.

    input:
        file_name - an experiment name string for saving the files
        tree - an ete tree for the tree. Assumed to have all the same nodes and leaves in the node info and
            cell state table. Only used when single_cell_clones is True (its children are the clones)
        empty_node_info - a table that contains all the nodes in the tree as indexes, a 'node_time' column to be
            populated, and the 'dist_to_clone', 'clone_size', 'max_clone_depth' and 'norm_dist_to_clone'
            columns. The table itself is not modified (each iteration works on a copy)
        cell_state_table - a table that contains the timepoint for each cell in the tree ('cellBC' and
            'timepoint' columns). Any nodes that are added back after the trimming (keeping the top node of a
            nest set) should be added into these columns. This table should ONLY contain cells that are
            present on the tree / node table so that the shuffling is accurate
        node_leaf_dict - a dictionary that saves a list of leaves that are extant cells for a given node (key)
        dropped_leaf_dict - a dictionary that saves the cells that are removed from the tree (via trimming) as
            keys and the new node values as values to be looked up in the cell state table
        output_dir - a file path to an existing directory to save the files (with or without a trailing separator)
        single_cell_clones - a boolean to determine if single cell clones should be counted. Default is yes
        numShuffles - the number of iterations to do the shuffled BG (default to 500)
        seed - optional integer seed to make the shuffles reproducible. The default (None) keeps using the
            global numpy random state
    output:
        Nothing is returned. If output_dir does not exist a message is printed and nothing is computed.
        Otherwise the following are written to output_dir ('_without_scClones' is added to the first four
        file names when single_cell_clones is False):
        df_bgCounts - a dataframe of the counts of nodes for each time state in each iteration (csv)
        df_bgClones - a dataframe that counts the # of clones in each timepoint / iteration (csv)
        clone_sizes - a dictionary that saves the sizes (# of cells) of each clone in each timepoint / iteration
        clone_depths - a dictionary that saves the max clone depth for each clone in each timepoint / iteration
        node_depths - a dictionary that saves the depths of all nodes in each timepoint / iteration
    '''
    # check if the output_dir is real
    if not os.path.exists(output_dir):
        print('output_dir path does not exists')
        return

    itrList = ['itr' + str(i) for i in range(numShuffles)]
    time_types = ['120', '144', '120_144']

    # records the # of nodes in each timepoint for each iteration
    df_bgCounts = pd.DataFrame(index = time_types, columns = itrList)
    # records the # of clones in each timepoint for each iteration
    df_bgClones = pd.DataFrame(index = time_types, columns = itrList)

    # records the distribution of sizes and depths for clones / nodes in each timepoint in each iteration
    node_depths = {time: [] for time in time_types}
    clone_sizes = {time: [] for time in time_types}
    clone_depths = {time: [] for time in time_types}

    # The cells behind each node do not change between iterations, so resolve them once:
    # a trimmed leaf is replaced by the entry it maps to in dropped_leaf_dict
    leaves_by_node = {}
    for node in empty_node_info.index:
        leaves_by_node[node] = [dropped_leaf_dict.get(leaf, leaf) for leaf in node_leaf_dict[node]]

    # seed = None -> pandas falls back to the global numpy random state (previous behaviour)
    random_state = None if seed is None else np.random.RandomState(seed)

    for itr in range(numShuffles):
        itr_label = 'itr{}'.format(itr)

        # Assign a temp node_info table with time types not classified
        node_info_itr = empty_node_info.copy()

        # Randomly shuffle the timepoint annotations in the cell state table
        shuffled_cell_state_table = cell_state_table.copy()
        shuffled_cell_state_table['timepoint'] = shuffled_cell_state_table['timepoint'].sample(frac = 1, random_state = random_state).values

        # Classify every node with the shuffled table. The whole column is assigned at once:
        # the previous df.loc[node]['node_time'] = ... wrote into a temporary row object,
        # so the classification could be lost without any error
        node_info_itr['node_time'] = [getTimeType(leaves_by_node[node], shuffled_cell_state_table) for node in node_info_itr.index]

        if single_cell_clones:
            # get a list of clones from the tree that was passed in (not from a notebook global)
            clones = [clone.name for clone in tree.children]
        else:
            # get a list of clones from the node table. This removes single cell 'clones' since they don't have nodes
            clones = node_info_itr[node_info_itr['dist_to_clone'] == 0].index

        # capture the distribution of clones in each timepoint for this iteration
        temp_clone_sizes = {time: [] for time in time_types}
        temp_clone_depths = {time: [] for time in time_types}

        for clone in clones:
            # NOTE(review): a name starting with 'T' is taken to be a cellBC, i.e. a single cell clone,
            # and anything else a node name -- confirm against the naming used in the trees
            if clone.startswith('T'):
                cell_time = shuffled_cell_state_table[shuffled_cell_state_table['cellBC'] == clone]['timepoint'].tolist()[0]

                if cell_time == 120:
                    time_key = '120'
                elif cell_time == 144:
                    time_key = '144'
                else:
                    # a single cell with any other timepoint is not counted
                    continue

                temp_clone_sizes[time_key].append(1)
                temp_clone_depths[time_key].append(0)

            # if the clone is a node, then pull its timepoint from the node info table
            else:
                node_time = node_info_itr.loc[clone, 'node_time']
                # anything that is not purely 120 or purely 144 is counted as 120_144
                time_key = node_time if node_time in ('120', '144') else '120_144'

                temp_clone_sizes[time_key].append(node_info_itr.loc[clone, 'clone_size'])
                temp_clone_depths[time_key].append(node_info_itr.loc[clone, 'max_clone_depth'])

        for time in time_types:
            # every counted clone contributed exactly one size entry
            df_bgClones.loc[time, itr_label] = len(temp_clone_sizes[time])

            nodes_at_time = node_info_itr[node_info_itr['node_time'] == time]
            df_bgCounts.loc[time, itr_label] = len(nodes_at_time)
            node_depths[time].append(nodes_at_time['norm_dist_to_clone'])

            clone_sizes[time].append(temp_clone_sizes[time])
            clone_depths[time].append(temp_clone_depths[time])

    suffix = '' if single_cell_clones else '_without_scClones'

    df_bgCounts.to_csv(os.path.join(output_dir, '{}_shuffledBG_Counts{}.txt'.format(file_name, suffix)))
    df_bgClones.to_csv(os.path.join(output_dir, '{}_shuffledBG_Clones{}.txt'.format(file_name, suffix)))

    # NOTE(review): node_depths uses the same file name in both modes, so a run with
    # single_cell_clones = False overwrites the file of a run with True (kept for compatibility)
    pickle_outputs = [('{}_clone_sizes{}.pkl'.format(file_name, suffix), clone_sizes),
                      ('{}_clone_depths{}.pkl'.format(file_name, suffix), clone_depths),
                      ('{}_node_depths.pkl'.format(file_name), node_depths)]
    for pickle_name, payload in pickle_outputs:
        # 'with' closes (and flushes) every file; they were previously left open
        with open(os.path.join(output_dir, pickle_name), 'wb') as handle:
            pickle.dump(payload, handle)

# Run the time shuffled BG for the hybrid combined trees

In [46]:
# For every barcode: load the hybrid 120h/144h tree, build the node tables, save the
# observed node classifications, and run the timepoint shuffled BG with and without
# single cell clones
node_info_columns = ['clone', 'dist_to_clone', 'max_clone_depth', 'clone_size', 'node_time', 'progenitor_type', 'norm_dist_to_clone', 'frac_120']

for barcode in barcodes:
    method = 'hybrid'
    time = '120_144'

    treeFile = '/Genomics/chanlab/blaw/TLS/data/explant/lineage/3_lineage_reconstruction/{}/{}/{}/{}_{}_{}_newick_noMutationlessEdges_Labeled.nwk'.format(barcode, time, method, barcode, time, method)
    t = Tree(treeFile, format = 1)
    total_leaves = [leaf.name for leaf in t.get_leaves()]

    # keep only the cells that are on this tree
    temp_cell_state_table = cell_state_table[cell_state_table['cellBC'].isin(total_leaves)]

    # every internal node except the one named 'node0' gets a row in the node table
    internal_nodes = [node for node in t.traverse() if not node.is_leaf() and node.name != 'node0']
    nodes = [node.name for node in internal_nodes]

    empty_node_info = pd.DataFrame(index = nodes, columns = node_info_columns)

    # fill the clone level information: each child of the tree root is one clone
    for clone in t.children:
        clone_depth = maxDepth(clone)
        clone_size = len(clone.get_leaves())

        for node in clone.traverse():
            if node.is_leaf():
                continue

            dist_to_clone = t.get_distance(clone, node)
            node_values = {'clone': clone.name,
                           'dist_to_clone': dist_to_clone,
                           'max_clone_depth': clone_depth,
                           'clone_size': clone_size,
                           'norm_dist_to_clone': dist_to_clone / clone_depth}

            for column, value in node_values.items():
                empty_node_info.loc[node.name, column] = value

    # the observed (unshuffled) classification of every node
    actual_node_info = empty_node_info.copy()

    for node in internal_nodes:
        leaves = [leaf.name for leaf in node.get_leaves()]
        actual_node_info.loc[node.name, 'progenitor_type'] = getProgenitorType(leaves, temp_cell_state_table)
        actual_node_info.loc[node.name, 'node_time'] = getTimeType(leaves, temp_cell_state_table)

    # the top node of each clone is labelled 'Clone' instead of a progenitor type
    for clone in t.children:
        if clone.is_leaf():
            continue
        actual_node_info.loc[clone.name, 'progenitor_type'] = 'Clone'

    actual_node_info.to_csv('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/{}/{}_{}_{}_actual_node_info.txt'.format(barcode, barcode, time, method), sep = '\t')

    # node name -> names of the internal nodes below it (not including leaves)
    node_subnode_dict = {}
    # node name -> names of the leaves below it
    node_leaf_dict = {}

    for node in t.traverse():
        if node.is_leaf():
            continue

        node_leaf_dict[node.name] = [leaf.name for leaf in node.get_leaves()]
        node_subnode_dict[node.name] = [subnode.name for subnode in node.traverse() if not subnode.is_leaf() and subnode != node]

    # first with single cell clones counted, then without
    for include_single_cells in (True, False):
        shuffledBG_time(file_name = '{}_{}_{}'.format(barcode, time, method),
                        tree = t,
                        empty_node_info = empty_node_info,
                        cell_state_table = temp_cell_state_table,
                        node_leaf_dict = node_leaf_dict,
                        dropped_leaf_dict = {},
                        output_dir = '/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/{}/'.format(barcode),
                        single_cell_clones = include_single_cells,
                        numShuffles = 500)

# Plot the shuffled BG for clone and node times

In [11]:
# show the shuffled BG for nodes of each timepoint:
# violins = node counts per shuffle iteration, blue points = counts observed in the real tree
fig, ax = plt.subplots(2, 3, figsize = (15, 10))
for panel, barcode in enumerate(barcodes):
    method = 'hybrid'

    # node counts (not clone counts) per time type and iteration
    df_bgNodeCounts = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/{}/{}_120_144_{}_shuffledBG_Counts.txt'.format(barcode, barcode, method), index_col = 0)
    # node_time is forced to str: a table that only holds 120 / 144 labels would otherwise be
    # parsed as integers and every comparison with '120' / '144' below would silently be False
    actual_node_info = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/{}/{}_120_144_{}_actual_node_info.txt'.format(barcode, barcode, method), sep = '\t', index_col = 0, dtype = {'node_time': str})

    # Get the actual values from the above node table
    nodes_120 = len(actual_node_info[actual_node_info['node_time'] == '120'])
    nodes_144 = len(actual_node_info[actual_node_info['node_time'] == '144'])
    nodes_120_144 = len(actual_node_info[actual_node_info['node_time'] == '120_144'])

    # panels are filled row by row in the 2 x 3 grid
    temp_ax = ax.flat[panel]

    # NOTE(review): newer seaborn releases rename `scale` to `density_norm` -- check the installed version
    sns.violinplot(data = df_bgNodeCounts.T, scale = 'count', ax = temp_ax)
    sns.swarmplot(x = df_bgNodeCounts.index, y = [nodes_120, nodes_144, nodes_120_144], color = 'Blue', ax = temp_ax)
    temp_ax.set_ylabel('Node Counts')
    temp_ax.set_title('{}'.format(barcode))
plt.tight_layout()
plt.savefig('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/plots/node_timepoints.pdf', dpi = 300)
#plt.show()
plt.close()

In [13]:
# plot the shuffled BG using shared clone counts not including single cell clones:
# violins = clone counts per shuffle iteration, blue points = counts observed in the real tree
fig, ax = plt.subplots(2, 3, figsize = (15, 10))
for panel, barcode in enumerate(barcodes):
    method = 'hybrid'

    df_bgClones = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/{}/{}_120_144_{}_shuffledBG_Clones_without_scClones.txt'.format(barcode, barcode, method), index_col = 0)
    # node_time is forced to str: a table that only holds 120 / 144 labels would otherwise be
    # parsed as integers and every comparison with '120' / '144' below would silently be False
    actual_node_info = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/{}/{}_120_144_{}_actual_node_info.txt'.format(barcode, barcode, method), sep = '\t', index_col = 0, dtype = {'node_time': str})

    # Get the actual values from the above node table: the top node of a clone has dist_to_clone == 0
    is_clone_root = actual_node_info['dist_to_clone'] == 0
    count_120 = len(actual_node_info[is_clone_root & (actual_node_info['node_time'] == '120')])
    count_144 = len(actual_node_info[is_clone_root & (actual_node_info['node_time'] == '144')])
    count_120_144 = len(actual_node_info[is_clone_root & (actual_node_info['node_time'] == '120_144')])

    # panels are filled row by row in the 2 x 3 grid
    temp_ax = ax.flat[panel]

    # NOTE(review): newer seaborn releases rename `scale` to `density_norm` -- check the installed version
    sns.violinplot(data = df_bgClones.T, scale = 'count', ax = temp_ax)
    sns.swarmplot(x = df_bgClones.index, y = [count_120, count_144, count_120_144], color = 'Blue', ax = temp_ax)
    temp_ax.set_ylabel('Count')
    temp_ax.set_title('{} - Without scClones'.format(barcode))
plt.tight_layout()
plt.savefig('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/plots/clone_timepoints.pdf', dpi = 300)
#plt.show()
plt.close()

# Plot the size and max depths of clones of each timepoint

In [15]:
# Make a pdf with the clone size distributions, one page per barcode and one panel
# per time category: shuffled background ECDFs in light blue, observed clones in black.
# The output goes to clone_sizes.pdf so that it is not overwritten by the clone depth
# cell, which writes clone_depths.pdf.
# The with-block guarantees the pdf is finalised even if a barcode fails part-way.
with PdfPages('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/plots/clone_sizes.pdf') as pp:
    for barcode in barcodes:
        actual_node_info = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/{}/{}_120_144_hybrid_actual_node_info.txt'.format(barcode, barcode), sep = '\t', index_col = 0)
        with open('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/{}/{}_120_144_hybrid_clone_sizes_without_scClones.pkl'.format(barcode, barcode), 'rb') as f:
            clone_sizes = pickle.load(f)

        # One panel per key of clone_sizes (the figure has room for three)
        fig, ax = plt.subplots(1, 3, figsize = (15, 5))

        # The 'clone' column names the clone each node belongs to; looking those names
        # up in the index gives one row per clone.
        clones = actual_node_info['clone'].unique()
        clone_rows = actual_node_info.loc[clones]

        for count, time in enumerate(clone_sizes.keys()):
            temp_ax = ax[count]

            # Shuffled background: one faint ECDF per non-empty entry
            for i in clone_sizes[time]:
                if len(i) > 0:
                    sns.ecdfplot(i, ax = temp_ax, color = 'lightblue', alpha = 0.1)

            # Observed sizes of the clones assigned to this time category
            temp_clone_sizes = clone_rows.loc[clone_rows['node_time'] == time, 'clone_size'].tolist()
            sns.ecdfplot(temp_clone_sizes, ax = temp_ax, color = 'Black')
            temp_ax.set_title('{} - {} - Without scClones'.format(barcode, time))
            temp_ax.set_xlabel('Clone Size (Leaves)')

        plt.tight_layout()
        pp.savefig(fig)
        plt.close(fig)

In [17]:
# Make a pdf with the clone depth distributions, one page per barcode and one panel
# per time category: background ECDFs in light blue, observed max clone depths in black.
# The with-block guarantees the pdf is finalised even if a barcode fails part-way.
with PdfPages('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/plots/clone_depths.pdf') as pp:
    for barcode in barcodes:
        actual_node_info = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/{}/{}_120_144_hybrid_actual_node_info.txt'.format(barcode, barcode), sep = '\t', index_col = 0)
        # NOTE(review): the background is loaded from the clone *sizes* pickle while the
        # observed values plotted below are max_clone_depth -- confirm whether a
        # depth-specific pickle should be loaded here instead.
        with open('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/{}/{}_120_144_hybrid_clone_sizes_without_scClones.pkl'.format(barcode, barcode), 'rb') as f:
            clone_sizes = pickle.load(f)

        # One panel per key of clone_sizes (the figure has room for three)
        fig, ax = plt.subplots(1, 3, figsize = (15, 5))

        # The 'clone' column names the clone each node belongs to; looking those names
        # up in the index gives one row per clone.
        clones = actual_node_info['clone'].unique()
        clone_rows = actual_node_info.loc[clones]

        for count, time in enumerate(clone_sizes.keys()):
            temp_ax = ax[count]

            # Shuffled background: one faint ECDF per non-empty entry
            for i in clone_sizes[time]:
                if len(i) > 0:
                    sns.ecdfplot(i, ax = temp_ax, color = 'lightblue', alpha = 0.1)

            # Observed max depth of the clones assigned to this time category
            temp_clone_depths = clone_rows.loc[clone_rows['node_time'] == time, 'max_clone_depth'].tolist()
            sns.ecdfplot(temp_clone_depths, ax = temp_ax, color = 'Black')
            temp_ax.set_title('{}_{}_Depth without scClones'.format(barcode, time))
            temp_ax.set_xlabel('Max Clone Depth')

        plt.tight_layout()
        pp.savefig(fig)
        plt.close(fig)

In [20]:
# Make a pdf with the node depth distributions, one page per barcode and one panel
# per time category: shuffled background ECDFs in light blue, observed normalised
# node-to-clone distances in black.
# The with-block guarantees the pdf is finalised even if a barcode fails part-way.
with PdfPages('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/plots/node_depths.pdf') as pp:
    for barcode in barcodes:
        actual_node_info = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/{}/{}_120_144_hybrid_actual_node_info.txt'.format(barcode, barcode), sep = '\t', index_col = 0)
        with open('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/{}/{}_120_144_hybrid_node_depths.pkl'.format(barcode, barcode), 'rb') as f:
            node_depths = pickle.load(f)

        # One panel per key of node_depths (the figure has room for three)
        fig, ax = plt.subplots(1, 3, figsize = (15, 5))

        for count, time in enumerate(node_depths.keys()):
            temp_ax = ax[count]

            # Shuffled background: one faint ECDF per non-empty entry
            for i in node_depths[time]:
                if len(i) > 0:
                    sns.ecdfplot(i, ax = temp_ax, color = 'lightblue', alpha = 0.1)

            # Observed data: normalised distance to the clone root for every node
            # assigned to this time category
            temp_node_depths = actual_node_info.loc[actual_node_info['node_time'] == time, 'norm_dist_to_clone']
            sns.ecdfplot(temp_node_depths, ax = temp_ax, color = 'Black')
            temp_ax.set_title('{}_{}_Depth'.format(barcode, time))

        plt.tight_layout()
        pp.savefig(fig)
        plt.close(fig)

# Run the same shuffled BG on clone and node timepoints for the subsampled trees
- The subsampled trees are located in /Genomics/chanlab/blaw/TLS/data/explant/subsampling/

In [47]:
# Perform shuffled background on time assignments for the subsampled trees.
# For every barcode this cell:
#   1. loads the subsampled tree and restricts the cell state table to its leaves,
#   2. builds a per-node table (clone membership, distance to clone, clone depth/size),
#   3. annotates the observed progenitor type and time category of every node and saves it,
#   4. runs shuffledBG_time twice (with and without single cell clones).
for barcode in barcodes:
    # NOTE(review): method and time are assigned but not used anywhere else in this cell.
    method = 'hybrid'
    time = '120_144'
    
    treeFile = '/Genomics/chanlab/blaw/TLS/data/explant/subsampling/{}/{}_subsampling_hybrid_newick_noMutationlessEdges_Labeled.nwk'.format(barcode, barcode)
    # format = 1 is the ete3 newick flavour that reads internal node names
    t = Tree(treeFile, format = 1)
    total_leaves = [leaf.name for leaf in t.get_leaves()]

    # Keep only the cells that are leaves of this tree
    temp_cell_state_table = cell_state_table[cell_state_table['cellBC'].isin(total_leaves)]

    # Collect the names of all internal nodes except 'node0'
    # (presumably the root of the tree -- TODO confirm)
    nodes = []
    for node in t.traverse():
        if not node.is_leaf() and node.name != 'node0':
                nodes.append(node.name)

    # Empty node table, one row per internal node. 'node_time' and 'progenitor_type'
    # are filled further down on a copy; 'frac_120' is not filled in this cell.
    empty_node_info = pd.DataFrame(index = nodes, columns = ['clone', 'dist_to_clone', 'max_clone_depth', 'clone_size', 'node_time', 'progenitor_type', 'norm_dist_to_clone', 'frac_120'])

    # Fill the structural node information. Every direct child of the tree root is
    # treated as one clone.
    for clone in t.children:
        clone_depth = maxDepth(clone)
        clone_size = len(clone.get_leaves())

        # clone.traverse() includes the clone node itself (dist_to_clone == 0)
        for node in clone.traverse():
            if not node.is_leaf():
                dist_to_clone = t.get_distance(clone, node)

                empty_node_info.loc[node.name, 'clone'] = clone.name
                empty_node_info.loc[node.name, 'dist_to_clone'] = dist_to_clone
                empty_node_info.loc[node.name, 'max_clone_depth'] = clone_depth
                empty_node_info.loc[node.name, 'clone_size'] = clone_size
                # clone_depth cannot be 0 here: maxDepth only returns 0 for a leaf, and a
                # leaf clone has no internal nodes, so this line is never reached for it
                empty_node_info.loc[node.name, 'norm_dist_to_clone'] = dist_to_clone / clone_depth

    # The observed table starts as a copy so empty_node_info stays free of
    # state/time labels when it is passed to shuffledBG_time below
    actual_node_info = empty_node_info.copy()

    # Label every internal node (except 'node0') from the cell states of its leaves.
    # getTimeType is defined outside this section -- its exact rules are not shown here.
    for node in t.traverse():
        if not node.is_leaf() and node.name != 'node0':
            leaves = [leaf.name for leaf in node.get_leaves()]
            actual_node_info.loc[node.name, 'progenitor_type'] = getProgenitorType(leaves, temp_cell_state_table)
            actual_node_info.loc[node.name, 'node_time'] = getTimeType(leaves, temp_cell_state_table)

    # Clone roots get the label 'Clone', overwriting the progenitor type assigned above
    for clone in t.children:
        if not clone.is_leaf():
            actual_node_info.loc[clone.name, 'progenitor_type'] = 'Clone'

    actual_node_info.to_csv('/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/subsampled/{}/{}_subsampled_actual_node_info.txt'.format(barcode, barcode), sep = '\t')

    # Make a dictionary that stores the subnodes (not including leaves) for each node in the tree
    # NOTE(review): node_subnode_dict is built here but not passed to shuffledBG_time below
    node_subnode_dict = {}
    # Make a dictionary that stores the leaves for each node in the tree
    node_leaf_dict = {}

    # Iter through all the nodes in t and populate two dictionaries for non-leaf nodes
    # (unlike the node table above, 'node0' is not excluded here)
    for node in t.traverse():
        if not node.is_leaf():
            node_leaf_dict[node.name] = [leaf.name for leaf in node.get_leaves()]

            children = []

            # All internal descendants of this node, excluding the node itself
            for subnode in node.traverse():
                if not subnode.is_leaf() and subnode != node:
                    children.append(subnode.name)

            node_subnode_dict[node.name] = children

    # Run the shuffled background twice with identical inputs, once with
    # single_cell_clones = True and once with False.
    # NOTE(review): shuffledBG_time is defined outside this section; presumably it writes
    # its results under output_dir using file_name as a prefix -- confirm. No random seed
    # is set in this cell, so whether the shuffles are reproducible depends on shuffledBG_time.
    shuffledBG_time(file_name = '{}_subsampled'.format(barcode),
                    tree = t,
                    empty_node_info = empty_node_info,
                    cell_state_table = temp_cell_state_table,
                    node_leaf_dict = node_leaf_dict,
                    dropped_leaf_dict = {},
                    output_dir = '/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/subsampled/{}/'.format(barcode),
                    single_cell_clones = True,
                    numShuffles = 500) 

    shuffledBG_time(file_name = '{}_subsampled'.format(barcode),
                    tree = t,
                    empty_node_info = empty_node_info,
                    cell_state_table = temp_cell_state_table,
                    node_leaf_dict = node_leaf_dict,
                    dropped_leaf_dict = {},
                    output_dir = '/Genomics/chanlab/blaw/TLS/data/explant/shuffledBG/subsampled/{}/'.format(barcode),
                    single_cell_clones = False,
                    numShuffles = 500)  