In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import cassiopeia as cas
import seaborn as sns
import pickle
import itertools
from matplotlib.pyplot import rc_context
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from ete3 import Tree
from typing import Tuple
import os

In [2]:
# Load a pickled colour dictionary (presumably cluster -> colour, judging by the file name;
# it is not used by any cell visible in this part of the notebook).
# NOTE(review): pickle.load can execute arbitrary code - only load pickles from a trusted source.
clusterColorsFile = "/Genomics/chanlab/mchan/Adriano/TLS/TLS_TLSCL/20211102_clusterColorsTLSCL.p"
with open(clusterColorsFile,'rb') as fp:
    colorDict = pickle.load(fp)

# Load the allele table for TLS2 (sample AM-DNA-098); the tree and cell state table are loaded below
TLS2_allele_table = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/lineage/2_lineage_reconstruction/allele_table.txt', sep='\t')

# Load the TLS2 tree (newick file for AM-DNA-098); `t` is the root used by every later cell
TLS2_loc = '/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/lineage/2_lineage_reconstruction/AM-DNA-098_hybrid_newick_noMutationlessEdges_Labeled.nwk'
t = Tree(TLS2_loc, format=1)

# Load the cell state table (later cells read its 'cellBC' and 'cell_state' columns)
cell_state_table = pd.read_csv('/Genomics/chanlab/blaw/TLS/LineageTracer/scRNAseq/TLS_120h_2_cellBC_cellState.tsv', sep='\t')

# Helpful Functions

In [31]:
def maxDepth(node):
    '''
    Compute the height of the subtree rooted at a node.

    Input:
        - node: a node in an ete tree (only is_leaf() and .children are used)
    returns:
        - 0 for a leaf, otherwise the number of edges on the longest path
          from this node down to any leaf below it
    '''
    if node.is_leaf():
        return 0

    # The deepest child subtree decides the depth of this node
    return 1 + max(maxDepth(child) for child in node.children)

In [32]:
def getProgenitorType(leaves, cell_state_table):
    '''
    Classify a node into one of the reduced set of progenitor types.

    input:
        - leaves: a list of cellBCs that are the extant cells under a node
        - cell_state_table: a table with a 'cellBC' column and a 'cell_state' column
    output:
        - the progenitor type of the node, or 'Dropped' if no rule matches

    Cell states are first collapsed into groups:
    - Somitic: pPSM, aPSM, Somite-1, Somite0, Somite, Somite1, SomiteSclero, SomiteDermo
    - Neural: NeuralTube1, NeuralTube2
    - PCGLC, Endoderm and Endothelial are kept as they are
      (the PGCLC state is spelled 'PCGLC' in the labels this function matches)
    - every other state (NMPs, unassigned / unknown, ...) is ignored, so its
      presence or absence never changes the classification

    The set of groups found under the node must then match exactly:
    - Extended Progenitor: PCGLC + Endoderm + Somitic + Neural (Endothelial optional)
    - Pluripotent Progenitor: Endoderm + Somitic + Neural (Endothelial optional)
    - Bipotent Progenitor: Somitic + Neural
    - Endoderm Progenitor: Endoderm only
    - PGCLC Progenitor: PCGLC only
    - Somitic Progenitor: Somitic only
    - Neural Progenitor: Neural only

    Any other combination (for example Endoderm with only one of Somitic / Neural,
    Endothelial together with a narrower type, or no recognised state at all)
    returns 'Dropped'.
    '''
    somitic_states = {'pPSM', 'aPSM', 'Somite', 'Somite0', 'Somite1', 'Somite-1', 'SomiteSclero', 'SomiteDermo'}
    neural_states = {'NeuralTube1', 'NeuralTube2'}
    kept_states = {'PCGLC', 'Endoderm', 'Endothelial'}

    # Exact group combination -> progenitor type; Endothelial is only tolerated
    # in the two broadest types, hence their second entry
    outcomes = {
        frozenset(['PCGLC', 'Endoderm', 'Somitic', 'Neural']): 'Extended Progenitor',
        frozenset(['PCGLC', 'Endoderm', 'Somitic', 'Neural', 'Endothelial']): 'Extended Progenitor',
        frozenset(['Endoderm', 'Somitic', 'Neural']): 'Pluripotent Progenitor',
        frozenset(['Endoderm', 'Somitic', 'Neural', 'Endothelial']): 'Pluripotent Progenitor',
        frozenset(['Somitic', 'Neural']): 'Bipotent Progenitor',
        frozenset(['Endoderm']): 'Endoderm Progenitor',
        frozenset(['PCGLC']): 'PGCLC Progenitor',
        frozenset(['Somitic']): 'Somitic Progenitor',
        frozenset(['Neural']): 'Neural Progenitor',
    }

    # Cell states of the cells that sit under this node
    under_node = cell_state_table['cellBC'].isin(leaves)
    observed_groups = set()
    for cell_state in cell_state_table.loc[under_node, 'cell_state'].to_list():
        if cell_state in somitic_states:
            observed_groups.add('Somitic')
        elif cell_state in neural_states:
            observed_groups.add('Neural')
        elif cell_state in kept_states:
            observed_groups.add(cell_state)

    return outcomes.get(frozenset(observed_groups), 'Dropped')

In [33]:
def getProgenitorType_FC(leaves, cell_state_table):
    '''
    Classify a node by the full combination of cell state groups found under it.

    input:
        - leaves: a list of cellBCs that are the extant cells under a node
        - cell_state_table: a table with a 'cellBC' column and a 'cell_state' column
    output:
        - the groups present under the node, joined with '_' in the fixed order
          PCGLC, Endoderm, Endothelial, NMPs, Somitic, Neural
          (for example 'Somitic', 'Somitic_Neural', 'Endoderm_NMPs_Somitic_Neural')
        - 'Dropped' if none of the 6 groups is present

    Cell states are collapsed into groups before classification:
    - Somitic: pPSM, aPSM, Somite-1, Somite0, Somite, Somite1, SomiteSclero, SomiteDermo
    - Neural: NeuralTube1, NeuralTube2
    - PCGLC, Endoderm, Endothelial and NMPs are kept as they are
      (the PGCLC state is spelled 'PCGLC' in the labels this function matches)

    Unassigned / Unknown cells (any state outside the lists above) are ignored.
    '''
    # Canonical order of the groups; it fixes the order of the parts in the label
    states = ['PCGLC', 'Endoderm', 'Endothelial', 'NMPs', 'Somitic', 'Neural']
    somitic_states = ('pPSM', 'aPSM', 'Somite', 'Somite0', 'Somite1', 'Somite-1', 'SomiteSclero', 'SomiteDermo')
    neural_states = ('NeuralTube1', 'NeuralTube2')

    # Cell states of the cells that sit under this node
    cell_types = cell_state_table.loc[cell_state_table['cellBC'].isin(leaves), 'cell_state'].to_list()

    # Group the cell states into neural and somite categories
    state_set = set()
    for state in cell_types:
        if state in somitic_states:
            state_set.add('Somitic')
        elif state in neural_states:
            state_set.add('Neural')
        elif state in ('PCGLC', 'Endoderm', 'Endothelial', 'NMPs'):
            state_set.add(state)

    # Every non-empty subset of `states` is a valid progenitor type whose label is
    # its members joined in canonical order, so the label can be built directly
    # instead of enumerating all 63 combinations on every call.
    present = [state for state in states if state in state_set]
    if not present:
        return 'Dropped'
    return '_'.join(present)

In [34]:
def shuffledBG (file_name, empty_node_info, cell_state_table, node_leaf_dict, dropped_leaf_dict, output_dir, FC = False, numShuffles = 500, seed = None):
    '''
    Build a shuffled background: repeatedly permute the cell state annotations,
    re-classify every node, and record how many nodes of each progenitor type
    appear and how deep they sit in their clone.

    input:
        file_name - an experiment name for saving the files
        empty_node_info - a table that contains all the nodes in the tree as indexes, a 'Progenitor Type' column to be
            populated, and a 'Normalized Dist' column. It is copied for every iteration and never modified
        cell_state_table - a table that contains the assigned cell state for each cell in the tree ('cellBC' and
            'cell_state' columns). Any nodes that are added back after the trimming (keeping the top node of a nested
            set) should be added into these columns. This table should ONLY contain cells that are present on the
            tree / node table so that the shuffling is accurate
        node_leaf_dict - a dictionary that saves a list of leaves that are extant cells for a given node (key)
        dropped_leaf_dict - a dictionary that saves the cells that are removed from the tree (via trimming) as keys and the
            new node values as values to be looked up in the cell state table
        output_dir - a file path to an existing directory to save the files (a trailing '/' is optional)
        FC - a boolean to use the full combination of progenitor states or the smaller subset (default to False)
        numShuffles - the number of iterations to do the shuffled BG (default to 500)
        seed - optional integer seed for the shuffling so a run can be reproduced (default None = not reproducible)
    output:
        Nothing is returned. If output_dir does not exist a message is printed and nothing is computed.
        Otherwise three files are written to output_dir:
        df_bgCounts - a dataframe of the counts for each progenitor state in each iteration of the shuffledBG
            ('<file_name>_shuffledBG_Counts.txt', csv)
        df_bgVals - a dataframe of the mean and median of normalized depth for each progenitor state for each itr;
            both are recorded as 0 when the state is not observed in an iteration
            ('<file_name>_shuffledBG_Means.txt', csv)
        bgDist_dict - a dictionary that saves a list of arrays of normalized depth for each progenitor state for each itr
            in which the state was observed ('<file_name>_shuffledBG_Distributions.pickle')
    '''
    # check if the output_dir is real
    if not os.path.exists(output_dir):
        print('output_dir path does not exists')
        return

    itrList = ['itr' + str(i) for i in range(numShuffles)]

    # pick the progenitor types and the classifier for the full set or the reduced set
    if FC:
        states = ['PCGLC', 'Endoderm', 'Endothelial', 'NMPs', 'Somitic', 'Neural']
        progenitor_types = list(states)
        for size in range(2, len(states) + 1):
            for combination in itertools.combinations(states, size):
                progenitor_types.append('_'.join(combination))
        classify = getProgenitorType_FC
    else:
        progenitor_types = ['Extended Progenitor', 'Pluripotent Progenitor', 'Bipotent Progenitor', 'Neural Progenitor', 'Somitic Progenitor', 'PGCLC Progenitor', 'Endoderm Progenitor', 'Dropped']
        classify = getProgenitorType

    indexList = [progenitor + '_' + stat for progenitor in progenitor_types for stat in ['Mean', 'Median']]

    # initialize the 3 variables to save the results
    df_bgCounts = pd.DataFrame(index = progenitor_types, columns = itrList)
    df_bgVals = pd.DataFrame(index = indexList, columns = itrList)
    bgDists_dict = {}

    # The leaves of a node (with trimmed cells swapped for their replacement) do not
    # depend on the shuffle, so resolve them once instead of once per iteration
    resolved_leaves = {}
    for node in empty_node_info.index:
        resolved_leaves[node] = [dropped_leaf_dict.get(leaf, leaf) for leaf in node_leaf_dict[node]]

    # One generator for the whole run: every iteration draws a new permutation,
    # and the full sequence is reproducible when a seed is given
    rng = np.random.RandomState(seed)

    for itr in range(numShuffles):
        itr_col = 'itr{}'.format(itr)

        # Assign a temp node_info table with progenitor types not classified
        node_info_itr = empty_node_info.copy()

        # Randomly shuffle the cell_state annotations in a cell state table
        shuffled_cell_state_table = cell_state_table.copy()
        shuffled_cell_state_table['cell_state'] = shuffled_cell_state_table['cell_state'].sample(frac = 1, random_state = rng).values

        # fill the node_info_itr table with node classifications using the shuffled cell state table.
        # A single .loc[row, column] call is required here: the chained form
        # .loc[row][column] = value may write to a temporary copy and be lost.
        for node in node_info_itr.index:
            node_info_itr.loc[node, 'Progenitor Type'] = classify(resolved_leaves[node], shuffled_cell_state_table)

        for progenitor in progenitor_types:
            depths = node_info_itr.loc[node_info_itr['Progenitor Type'] == progenitor, 'Normalized Dist']
            df_bgCounts.loc[progenitor, itr_col] = len(depths)

            # make sure every progenitor has an entry even if it is never observed
            bgDists_dict.setdefault(progenitor, [])

            if len(depths) > 0:
                df_bgVals.loc[progenitor + '_Mean', itr_col] = depths.mean()
                df_bgVals.loc[progenitor + '_Median', itr_col] = depths.median()
                bgDists_dict[progenitor].append(depths)
            else:
                # assign the mean and median to be 0 if the node type was not observed in this iteration
                df_bgVals.loc[progenitor + '_Mean', itr_col] = 0
                df_bgVals.loc[progenitor + '_Median', itr_col] = 0

    # Save the 3 objects (os.path.join so output_dir works with or without a trailing '/')
    with open(os.path.join(output_dir, '{}_shuffledBG_Distributions.pickle'.format(file_name)), 'wb') as handle:
        pickle.dump(bgDists_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    df_bgVals.to_csv(os.path.join(output_dir, '{}_shuffledBG_Means.txt'.format(file_name)))

    df_bgCounts.to_csv(os.path.join(output_dir, '{}_shuffledBG_Counts.txt'.format(file_name)))

# Make an empty node info table
- This table will have the clone information, max clone depth, and normalized dist from clone for each node
- Progenitor type will be added during the shuffledBG function for each iteration

In [35]:
# Collect the names of all internal (non-leaf) nodes in TLS2.
# The node named 'node0' is skipped: it is the root, and this structure is known
# to start from multiple cells, so the root is not a real progenitor.
nodes = [node.name for node in t.traverse() if not node.is_leaf() and node.name != 'node0']

empty_node_info = pd.DataFrame(index = nodes, columns = ['Clone', 'Dist to Clone', 'Clone Depth', 'Progenitor Type', 'Normalized Dist'])

# Fill in topology information, one clone (direct child of the root) at a time
for clone in t.children:
    clone_depth = maxDepth(clone)

    for node in clone.traverse():
        if node.is_leaf():
            continue

        # NOTE(review): get_distance is called with its default arguments, while
        # clone_depth counts edges - confirm both are in the same units for this tree
        dist_to_clone = t.get_distance(clone, node)

        # A clone of depth 1 has no internal nodes below its top node, so the
        # normalization denominator would be 0; such nodes get a distance of 0
        if clone_depth == 1:
            normalized_dist = 0
        else:
            normalized_dist = dist_to_clone / (clone_depth - 1)

        empty_node_info.loc[node.name, 'Clone'] = clone.name
        empty_node_info.loc[node.name, 'Dist to Clone'] = dist_to_clone
        empty_node_info.loc[node.name, 'Clone Depth'] = clone_depth
        empty_node_info.loc[node.name, 'Normalized Dist'] = normalized_dist

# Restrict the cell state table to the cells that are leaves of the tree
leaves = [leaf.name for leaf in t.get_leaves()]
in_tree = cell_state_table['cellBC'].isin(leaves)
tree_cell_state_table = cell_state_table[in_tree].copy()

# Label Progenitor Nodes in the Structure

In [36]:
# Mark every non-leaf child of the root as a clone in the empty node table
for clone in t.children:
    if clone.is_leaf():
        continue
    empty_node_info.loc[clone.name, 'Progenitor Type'] = 'Clone'

In [37]:
# node name -> names of all internal nodes strictly below it (leaves and the node itself excluded)
node_subnode_dict = {}
# node name -> names of all leaves below it
node_leaf_dict = {}

# Walk every internal node of t (root included) and fill both dictionaries
for node in t.traverse():
    if node.is_leaf():
        continue

    node_leaf_dict[node.name] = [leaf.name for leaf in node.get_leaves()]

    children = [subnode.name for subnode in node.traverse()
                if not subnode.is_leaf() and subnode != node]
    node_subnode_dict[node.name] = children

In [38]:
node_info = empty_node_info.copy()

# Classify every internal node below the root with the reduced progenitor set.
# The top node of each clone is classified too, replacing its 'Clone' label.
for clone in t.children:
    for node in clone.traverse():
        if node.is_leaf():
            continue
        leaves = [leaf.name for leaf in node.get_leaves()]
        progenitor_type = getProgenitorType(leaves, cell_state_table)
        node_info.loc[node.name, 'Progenitor Type'] = progenitor_type

In [39]:
progenitor_types = ['Extended Progenitor', 'Pluripotent Progenitor', 'Bipotent Progenitor', 'Neural Progenitor', 'Somitic Progenitor', 'PGCLC Progenitor', 'Endoderm Progenitor', 'Dropped']

# Print each progenitor type followed by the number of nodes classified as it
for progenitor in progenitor_types:
    n_nodes = int((node_info['Progenitor Type'] == progenitor).sum())
    print(progenitor, n_nodes, sep = '\n')

Extended Progenitor
27
Pluripotent Progenitor
64
Bipotent Progenitor
187
Neural Progenitor
82
Somitic Progenitor
353
PGCLC Progenitor
32
Endoderm Progenitor
10
Dropped
268


# Shuffled Background for Progenitor Type Counts
For every shuffled background in ~500 repeats:
- shuffle the cell states around the tree
- classify the nodes
- count the progenitor types
- record the progenitor types

This can be done by shuffling the cell state annotations that are drawn from the cell_state_table

In [12]:
# Run the shuffled background on the untrimmed tree with the reduced progenitor set (FC = False).
# dropped_leaf_dict is empty because no cells have been trimmed from the tree at this point.
# The Counts / Means / Distributions files are written to output_dir, which must already exist.
# NOTE: the shuffle is random, so the written files differ from run to run.
shuffledBG(file_name = 'AM-DNA-098',
           empty_node_info = empty_node_info,
           cell_state_table = tree_cell_state_table,
           node_leaf_dict = node_leaf_dict,
           dropped_leaf_dict = {},
           output_dir = '/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_small_set/',
           FC = False,
           numShuffles = 500)

# Plot the results

In [13]:
# NOTE(review): pickle.load can execute arbitrary code - only load files written by this notebook
with open('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_small_set/AM-DNA-098_shuffledBG_Distributions.pickle', 'rb') as f:
    bgDists_dict = pickle.load(f)

# One ECDF panel per progenitor type (untrimmed tree, reduced progenitor set), laid out row by row
fig, ax = plt.subplots(2, 4, figsize = (16, 8))
axes = [ax[row, col] for row in range(2) for col in range(4)]

for count, progenitor in enumerate(bgDists_dict.keys()):
    temp_ax = axes[count]

    # Shuffled background: one faint curve per iteration in which the type was observed
    for i in bgDists_dict[progenitor]:
        sns.ecdfplot(i, ax=temp_ax, color = 'lightblue', alpha = 0.1)

    # Actual data drawn on top of the background
    observed_depths = node_info[node_info['Progenitor Type'] == progenitor]['Normalized Dist']
    sns.ecdfplot(observed_depths, ax = temp_ax, color = 'Black')
    temp_ax.set_title(progenitor)

plt.tight_layout()
plt.savefig('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_small_set/AM-DNA-098_node_depths.pdf', dpi = 300)
#plt.show()
plt.close()

In [14]:
# Violin plot of the shuffled-background node counts per progenitor type,
# with a dot marking the count observed on the real tree
df_bgCounts = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_small_set/AM-DNA-098_shuffledBG_Counts.txt', index_col = 0)

# Observed number of nodes of each type, in the same order as progenitor_types
actual_values = [int((node_info['Progenitor Type'] == progenitor).sum()) for progenitor in progenitor_types]

fig, ax = plt.subplots(figsize = (20, 5))
sns.violinplot(data = df_bgCounts.T, scale = 'width', ax = ax)
sns.swarmplot(x = progenitor_types, y = actual_values, color = 'Blue', ax = ax)
ax.set_ylabel('Count')
plt.savefig('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_small_set/AM-DNA-098_Progenitor_Counts.pdf', dpi = 300)
#plt.show()
plt.close()

  plot_data = [np.asarray(s, float) for k, s in iter_data]


In [28]:
colors = {'Extended Progenitor': 'black', 'Pluripotent Progenitor': 'Orange', 'Bipotent Progenitor': 'Blue',
         'Neural Progenitor': 'Green', 'Somitic Progenitor': 'Red'}

# Progenitor types drawn on the combined plot, in legend order
plotted_progenitors = ['Extended Progenitor', 'Pluripotent Progenitor', 'Bipotent Progenitor', 'Neural Progenitor', 'Somitic Progenitor']

# Single panel with the observed depth ECDF of each plotted type (untrimmed tree, reduced set)
fig, ax = plt.subplots(1, 1, figsize = (6, 4))

for progenitor in plotted_progenitors:
    observed_depths = node_info[node_info['Progenitor Type'] == progenitor]['Normalized Dist']
    sns.ecdfplot(observed_depths, color = colors[progenitor], ax = ax)

ax.legend(plotted_progenitors)
ax.set_title('TLS2')
plt.tight_layout()
plt.savefig('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_small_set/AM-DNA-098_node_depths_combined.pdf', dpi = 300)
#plt.show()
plt.close()

In [15]:
df_bgCounts = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_small_set/AM-DNA-098_shuffledBG_Counts.txt', index_col = 0)
df_bgVals = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_small_set/AM-DNA-098_shuffledBG_Means.txt', index_col = 0)
# NOTE(review): pickle.load can execute arbitrary code - only load files written by this notebook
with open('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_small_set/AM-DNA-098_shuffledBG_Distributions.pickle', 'rb') as f:
    bgDists_dict = pickle.load(f)

# Make a table of actual count, mean BG count, std BG count, zscore, pval, actual dist mean, BG dist mean, BG dist std, zscore, pval
values = ['Actual_Count', 'Mean_BG_Count', 'Std_BG_Count', 'zscore_Count', 'pval_L_Count', 'pval_U_Count', 'Actual_Dist', 'Mean_BG_Dist', 'Std_BG_Dist', 'zscore_Dist', 'pval_L_Dist', 'pval_U_Dist']

progenitor_stats = pd.DataFrame(index = list(bgDists_dict.keys()), columns = values)

for progenitor in progenitor_stats.index:
    is_progenitor = node_info['Progenitor Type'] == progenitor

    # ---- node counts: observed value against the shuffled background ----
    bgCounts = df_bgCounts.loc[progenitor]
    # The number of shuffles is read from the table instead of being assumed to be 500,
    # so the p-values stay correct if shuffledBG was run with a different numShuffles
    numCountShuffles = len(bgCounts)

    actualCount = int(is_progenitor.sum())
    meanCount = bgCounts.mean()
    stdCount = bgCounts.std()
    zScore = (actualCount - meanCount) / stdCount

    # Empirical one-sided p-values; ties with the observed value count towards both sides.
    # pval_lower: fraction of shuffles with a count <= the observed count
    # pval_upper: fraction of shuffles with a count >= the observed count
    pval_lower = (numCountShuffles - int((bgCounts > actualCount).sum())) / numCountShuffles
    pval_upper = (numCountShuffles - int((bgCounts < actualCount).sum())) / numCountShuffles

    progenitor_stats.loc[progenitor, 'Actual_Count'] = actualCount
    progenitor_stats.loc[progenitor, 'Mean_BG_Count'] = meanCount
    progenitor_stats.loc[progenitor, 'Std_BG_Count'] = stdCount
    progenitor_stats.loc[progenitor, 'zscore_Count'] = zScore
    progenitor_stats.loc[progenitor, 'pval_L_Count'] = pval_lower
    progenitor_stats.loc[progenitor, 'pval_U_Count'] = pval_upper

    # ---- mean normalized depth: observed value against the shuffled background ----
    # NOTE(review): shuffledBG stores a mean of 0 for iterations in which the type was
    # not observed, so those zeros are part of this background distribution
    bgMeans = df_bgVals.loc[progenitor + '_Mean']
    numDistShuffles = len(bgMeans)

    # actualDist is NaN when no node of this type exists on the real tree; every
    # comparison with NaN is False, so both p-values come out as 1 in that case
    actualDist = node_info[is_progenitor]['Normalized Dist'].mean()
    meanDist = bgMeans.mean()
    stdDist = bgMeans.std()
    zScoreDist = (actualDist - meanDist) / stdDist

    pval_lower_Dist = (numDistShuffles - int((bgMeans > actualDist).sum())) / numDistShuffles
    pval_upper_Dist = (numDistShuffles - int((bgMeans < actualDist).sum())) / numDistShuffles

    progenitor_stats.loc[progenitor, 'Actual_Dist'] = actualDist
    progenitor_stats.loc[progenitor, 'Mean_BG_Dist'] = meanDist
    progenitor_stats.loc[progenitor, 'Std_BG_Dist'] = stdDist
    progenitor_stats.loc[progenitor, 'zscore_Dist'] = zScoreDist
    progenitor_stats.loc[progenitor, 'pval_L_Dist'] = pval_lower_Dist
    progenitor_stats.loc[progenitor, 'pval_U_Dist'] = pval_upper_Dist

progenitor_stats.to_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_small_set/AM-DNA-098_Progenitor_Stats.txt', sep = '\t')

# ShuffledBG using the full set of progenitor nodes
- Given the 6 cell states (PGCLC, Endoderm, Endothelial, NMPs, Neural, Somitic), I will classify each node by the exact combination of these states found among its leaves, so every non-empty combination of the 6 states is its own progenitor type

In [40]:
# start from the empty node info table
node_info_FC = empty_node_info.copy()

# For each node that is not a clone, record the progenitor type
for clone in t.children:
    for node in clone.traverse():
        if not node.is_leaf():
            leaves = [leaf.name for leaf in node.get_leaves()]
            node_info_FC.loc[node.name]['Progenitor Type'] = getProgenitorType_FC(leaves, cell_state_table)

In [41]:
# make a dictionary where the keys are each possible to record the actual values
states = ['PCGLC', 'Endoderm', 'Endothelial', 'NMPs', 'Somitic', 'Neural']
total_combinations = []
for i in range(5):
    for j in itertools.combinations(states, i+2):
        total_combinations.append(j)

progenitor_type_dict = {'PCGLC': set(['PCGLC']),
                   'Endoderm': set(['Endoderm']),
                   'Endothelial': set(['Endothelial']),
                   'NMPs': set(['NMPs']),
                   'Somitic': set(['Somitic']),
                   'Neural': set(['Neural'])}
for i in total_combinations:
    label = '_'.join(i)
    progenitor_type_dict[label] = set(i)

In [42]:
# Print each observed full-combination progenitor type followed by its node count
for progenitor in node_info_FC['Progenitor Type'].unique():
    n_nodes = int((node_info_FC['Progenitor Type'] == progenitor).sum())
    print(progenitor, n_nodes, sep = '\n')

Endothelial_Somitic_Neural
38
PCGLC_Endoderm_Endothelial_Somitic_Neural
11
PCGLC_Endoderm_Endothelial_NMPs_Somitic_Neural
5
Endoderm_NMPs_Somitic_Neural
6
PCGLC_Endothelial_Somitic_Neural
11
Endoderm_Endothelial_NMPs_Somitic_Neural
8
Endoderm_Somitic_Neural
38
Endoderm_Endothelial_Somitic_Neural
12
Somitic_Neural
179
PCGLC_Endoderm_NMPs_Somitic_Neural
1
PCGLC_Endoderm_Somitic_Neural
10
PCGLC_NMPs_Somitic_Neural
3
PCGLC_Endoderm_Somitic
9
PCGLC
32
Endothelial_Somitic
79
PCGLC_Neural
11
PCGLC_Somitic
10
Somitic
349
Neural
78
PCGLC_Endoderm_Neural
7
Endoderm_Endothelial_Somitic
11
PCGLC_Somitic_Neural
8
Endoderm_Somitic
52
Endoderm_Neural
9
NMPs_Somitic
4
NMPs_Somitic_Neural
8
Endoderm_Endothelial_NMPs_Somitic
3
Endoderm
10
Endoderm_NMPs_Neural
2
PCGLC_Endothelial_NMPs_Somitic_Neural
1
PCGLC_NMPs_Somitic
2
PCGLC_Endoderm
1
PCGLC_Endothelial_Somitic
3
Endothelial_NMPs_Somitic_Neural
1
Endothelial_Neural
2
NMPs_Neural
4
Endothelial_NMPs_Somitic
1
Endothelial
4


In [19]:
# Run the shuffled background on the untrimmed tree with the full combination of states (FC = True).
# dropped_leaf_dict is empty because no cells have been trimmed from the tree at this point.
# The Counts / Means / Distributions files are written to output_dir, which must already exist.
# NOTE: the shuffle is random, so the written files differ from run to run.
shuffledBG(file_name = 'AM-DNA-098_full_set',
           empty_node_info = empty_node_info,
           cell_state_table = tree_cell_state_table,
           node_leaf_dict = node_leaf_dict,
           dropped_leaf_dict = {},
           output_dir = '/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_full_set/',
           FC = True,
           numShuffles = 500)

# Plot all the progenitor combinations

In [20]:
# Graph the countsfrom each distribution
df_bgCounts = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_full_set/AM-DNA-098_full_set_shuffledBG_Counts.txt', index_col = 0)

pp = PdfPages('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_full_set/AM-DNA-098_Node_Counts_FC.pdf')
index_ranges = [(0, 10), (10, 19), (19, 28), (28, 37), (37, 46), (46, 55), (55, 63)]
count = 1
for i, j in index_ranges:
    test = df_bgCounts.index[i: j]
    fig, ax = plt.subplots(figsize = (15, 10))
    sns.violinplot(data = df_bgCounts.T[test], ax = ax, scale = 'width', color = 'lightblue')
    actual_values = []
    for progenitor in test:
        actual_values.append(len(node_info_FC[node_info_FC['Progenitor Type'] == progenitor]['Clone Depth']))
    sns.swarmplot(x = test, y = actual_values, color = 'black', ax = ax)
    plt.xticks(rotation=45)
    plt.ylabel('Counts')
    #plt.ylim(0, 225)
    plt.tight_layout()
    pp.savefig()
    #plt.show()
    plt.close()
    count += 1
pp.close()

  plot_data = [np.asarray(s, float) for k, s in iter_data]
  plot_data = [np.asarray(s, float) for k, s in iter_data]
  plot_data = [np.asarray(s, float) for k, s in iter_data]
  plot_data = [np.asarray(s, float) for k, s in iter_data]
  plot_data = [np.asarray(s, float) for k, s in iter_data]
  plot_data = [np.asarray(s, float) for k, s in iter_data]
  plot_data = [np.asarray(s, float) for k, s in iter_data]


In [21]:
# Plot all the depth distributions of each progenitor type
with open('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_full_set/AM-DNA-098_full_set_shuffledBG_Distributions.pickle', 'rb') as f:
    bgDists_dict = pickle.load(f)
    
pp = PdfPages('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_full_set/AM-DNA-098_Node_Depths_FC.pdf')
    
# Graph the cdf plots from the dictionary
count = 0
for test in bgDists_dict.keys():
    if count == 0:
        fig, ax = plt.subplots(1, 4, figsize = (16, 4))
        
    temp_ax = ax[count]    

    for i in bgDists_dict[test]:
        sns.ecdfplot(i, ax=temp_ax, color = 'lightblue', alpha = 0.1)

    if test in node_info_FC['Progenitor Type'].unique():
        sns.ecdfplot(node_info_FC[node_info_FC['Progenitor Type'] == test]['Normalized Dist'], color = 'black', ax = temp_ax)
        
    temp_ax.set_title(test)
    
    if count == 3:
        plt.tight_layout()
        pp.savefig()
        #plt.show()
        plt.close()
        count = 0
    else:
        count += 1
if count != 0:
    plt.tight_layout()
    pp.savefig()
    plt.close()
pp.close()

In [22]:
df_bgCounts = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_full_set/AM-DNA-098_full_set_shuffledBG_Counts.txt', index_col = 0)
df_bgVals = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_full_set/AM-DNA-098_full_set_shuffledBG_Means.txt', index_col = 0)
# NOTE(review): pickle.load can execute arbitrary code - only load files written by this notebook
with open('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_full_set/AM-DNA-098_full_set_shuffledBG_Distributions.pickle', 'rb') as f:
    bgDists_dict = pickle.load(f)

# Make a table of actual count, mean BG count, std BG count, zscore, pval, actual dist mean, BG dist mean, BG dist std, zscore, pval
values = ['Actual_Count', 'Mean_BG_Count', 'Std_BG_Count', 'zscore_Count', 'pval_L_Count', 'pval_U_Count', 'Actual_Dist', 'Mean_BG_Dist', 'Std_BG_Dist', 'zscore_Dist', 'pval_L_Dist', 'pval_U_Dist']

progenitor_stats_FC = pd.DataFrame(index = list(bgDists_dict.keys()), columns = values)

for progenitor in progenitor_stats_FC.index:
    is_progenitor = node_info_FC['Progenitor Type'] == progenitor

    # ---- node counts: observed value against the shuffled background ----
    bgCounts = df_bgCounts.loc[progenitor]
    # The number of shuffles is read from the table instead of being assumed to be 500,
    # so the p-values stay correct if shuffledBG was run with a different numShuffles
    numCountShuffles = len(bgCounts)

    actualCount = int(is_progenitor.sum())
    meanCount = bgCounts.mean()
    stdCount = bgCounts.std()
    zScore = (actualCount - meanCount) / stdCount

    # Empirical one-sided p-values; ties with the observed value count towards both sides.
    # pval_lower: fraction of shuffles with a count <= the observed count
    # pval_upper: fraction of shuffles with a count >= the observed count
    pval_lower = (numCountShuffles - int((bgCounts > actualCount).sum())) / numCountShuffles
    pval_upper = (numCountShuffles - int((bgCounts < actualCount).sum())) / numCountShuffles

    progenitor_stats_FC.loc[progenitor, 'Actual_Count'] = actualCount
    progenitor_stats_FC.loc[progenitor, 'Mean_BG_Count'] = meanCount
    progenitor_stats_FC.loc[progenitor, 'Std_BG_Count'] = stdCount
    progenitor_stats_FC.loc[progenitor, 'zscore_Count'] = zScore
    progenitor_stats_FC.loc[progenitor, 'pval_L_Count'] = pval_lower
    progenitor_stats_FC.loc[progenitor, 'pval_U_Count'] = pval_upper

    # ---- mean normalized depth: observed value against the shuffled background ----
    # NOTE(review): shuffledBG stores a mean of 0 for iterations in which the type was
    # not observed, so those zeros are part of this background distribution
    bgMeans = df_bgVals.loc[progenitor + '_Mean']
    numDistShuffles = len(bgMeans)

    # actualDist is NaN when no node of this type exists on the real tree; every
    # comparison with NaN is False, so both p-values come out as 1 in that case
    actualDist = node_info_FC[is_progenitor]['Normalized Dist'].mean()
    meanDist = bgMeans.mean()
    stdDist = bgMeans.std()
    zScoreDist = (actualDist - meanDist) / stdDist

    pval_lower_Dist = (numDistShuffles - int((bgMeans > actualDist).sum())) / numDistShuffles
    pval_upper_Dist = (numDistShuffles - int((bgMeans < actualDist).sum())) / numDistShuffles

    progenitor_stats_FC.loc[progenitor, 'Actual_Dist'] = actualDist
    progenitor_stats_FC.loc[progenitor, 'Mean_BG_Dist'] = meanDist
    progenitor_stats_FC.loc[progenitor, 'Std_BG_Dist'] = stdDist
    progenitor_stats_FC.loc[progenitor, 'zscore_Dist'] = zScoreDist
    progenitor_stats_FC.loc[progenitor, 'pval_L_Dist'] = pval_lower_Dist
    progenitor_stats_FC.loc[progenitor, 'pval_U_Dist'] = pval_upper_Dist

progenitor_stats_FC.to_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/untrimmed_full_set/AM-DNA-098_Progenitor_Stats_FC.txt')

# Trim the tree and Perform shuffledBG
Trim nodes that sit inside nested regions of a single progenitor type:
- If a branch stops differentiating along a path, collapse that whole branch down to one state.
- For example, if a somite-committed node has 10 somite-committed nodes under it, trim that branch down to just the top node.
- Remove all the cells that were in the trimmed region and replace them with a single entry carrying that state (Somite, for example).
- After trimming, calculate the actual values on the trimmed tree and run the shuffledBG using the reduced cell states and tree.

The tree is trimmed using the full set of progenitor types (all node types); the trimmed tree is then analyzed with either the small set or the full set.

In [23]:
# Work on a copy of the empty node table so the filtering below does not touch the original
temp_node_info = empty_node_info.copy()

# Label every node with its progenitor type under the full set of progenitor types,
# computed from the leaves recorded for that node
temp_node_info['Progenitor Type'] = [
    getProgenitorType_FC(node_leaf_dict[node_name], cell_state_table)
    for node_name in temp_node_info.index
]

In [24]:
# Flag the nodes whose subtree can be collapsed.
# A node is flagged 'Yes' when it is a singular cell type progenitor, it has at least one subnode,
# and every one of its subnodes has the same progenitor type. Singular progenitors that only
# have leaves under them keep the default 'No'.
single_fate_types = ['PCGLC', 'Neural', 'Somitic', 'Endoderm', 'Endothelial', 'NMPs']
temp_node_info['Trimmed'] = 'No'

for node in temp_node_info.index:
    node_type = temp_node_info.loc[node, 'Progenitor Type']
    needs_trimming = False

    # An empty subnode list means the node only has leaves beneath it
    if node_type in single_fate_types and node_subnode_dict[node] != []:
        # The list is built in full before all() runs, so every subnode is looked up
        needs_trimming = all([
            temp_node_info.loc[subnode, 'Progenitor Type'] == node_type
            for subnode in node_subnode_dict[node]
        ])

    temp_node_info.loc[node, 'Trimmed'] = 'Yes' if needs_trimming else 'No'

# Everything underneath a flagged node is removed. A flagged node that sits under another
# flagged node is itself one of that node's subnodes, so only the top flagged node survives.
trimmed_nodes = set()
trimmed_leaves = set()

flagged_nodes = temp_node_info[temp_node_info['Trimmed'] == 'Yes'].index
for node in flagged_nodes:
    trimmed_nodes.update(node_subnode_dict[node])
    trimmed_leaves.update(node_leaf_dict[node])

# Build the trimmed node info table and the matching empty node table without the removed nodes
trimmed_node_info = temp_node_info.drop(index = list(trimmed_nodes)).copy()
empty_trimmed_node_info = empty_node_info.drop(index = list(trimmed_nodes)).copy()

# Calculate the actual progenitor values in the trimmed table

In [25]:
# For every progenitor type left after trimming, report how many nodes carry that type
# in the untrimmed table and in the trimmed table
for progenitor in trimmed_node_info['Progenitor Type'].unique():
    untrimmed_matches = node_info_FC[node_info_FC['Progenitor Type'] == progenitor]
    trimmed_matches = trimmed_node_info[trimmed_node_info['Progenitor Type'] == progenitor]

    print(progenitor)
    print('\tUntrimmed: ', len(untrimmed_matches['Clone Depth']))
    print('\ttrimmed: ', len(trimmed_matches['Clone Depth']))

Endothelial_Somitic_Neural
	Untrimmed:  38
	trimmed:  38
PCGLC_Endoderm_Endothelial_Somitic_Neural
	Untrimmed:  11
	trimmed:  11
PCGLC_Endoderm_Endothelial_NMPs_Somitic_Neural
	Untrimmed:  5
	trimmed:  5
Endoderm_NMPs_Somitic_Neural
	Untrimmed:  6
	trimmed:  6
PCGLC_Endothelial_Somitic_Neural
	Untrimmed:  11
	trimmed:  11
Endoderm_Endothelial_NMPs_Somitic_Neural
	Untrimmed:  8
	trimmed:  8
Endoderm_Somitic_Neural
	Untrimmed:  38
	trimmed:  38
Endoderm_Endothelial_Somitic_Neural
	Untrimmed:  12
	trimmed:  12
Somitic_Neural
	Untrimmed:  179
	trimmed:  179
PCGLC_Endoderm_NMPs_Somitic_Neural
	Untrimmed:  1
	trimmed:  1
PCGLC_Endoderm_Somitic_Neural
	Untrimmed:  10
	trimmed:  10
PCGLC_NMPs_Somitic_Neural
	Untrimmed:  3
	trimmed:  3
PCGLC_Endoderm_Somitic
	Untrimmed:  9
	trimmed:  9
PCGLC
	Untrimmed:  32
	trimmed:  16
Endothelial_Somitic
	Untrimmed:  79
	trimmed:  79
PCGLC_Neural
	Untrimmed:  11
	trimmed:  11
PCGLC_Somitic
	Untrimmed:  10
	trimmed:  10
Somitic
	Untrimmed:  349
	trimmed:  191

# ShuffledBG on the trimmed small set

In [29]:
# Reduce the cell state table to the cells that were not trimmed away; this reduced table
# is the one handed to the shuffled background
leaves_to_keep = [
    cell for cell in tree_cell_state_table['cellBC'] if cell not in trimmed_leaves
]

is_kept = tree_cell_state_table['cellBC'].isin(leaves_to_keep)
trimmed_tree_cell_state_table = tree_cell_state_table[is_kept].copy()

In [30]:
# The nodes at the top of each trimmed region (still in the trimmed node table, flagged 'Yes')
# stand in for the cells removed beneath them, so each one gets its own row in the cell state table.

# These nodes are all singular progenitor types; map each type to the cell state label
# written into the table
temp_dict = {
    'Neural': 'NeuralTube1',
    'Somitic': 'Somite',
    'PCGLC': 'PCGLC',
    'Endoderm': 'Endoderm',
    'Endothelial': 'Endothelial',
    'NMPs': 'NMPs',
}

# This adds back in ~66 nodes for the cells that were removed.
# NOTE(review): the row is written as three positional values, so the table is assumed to have
# exactly three columns with the cell state first and the cell barcode last - confirm.
collapsed_rows = trimmed_node_info[trimmed_node_info['Trimmed'] == 'Yes']
for node, progenitor_type in collapsed_rows['Progenitor Type'].items():
    trimmed_tree_cell_state_table.loc[node] = [temp_dict[progenitor_type], 'None', node]

In [31]:
# Map every dropped leaf to the node in the trimmed cell state table that now represents it.
# Each trimmed leaf starts out unassigned (empty string).
dropped_leaf_dict = dict.fromkeys(trimmed_leaves, "")

# Walk the remaining 'Yes' nodes and record each one as the owner of the leaves beneath it
for node in trimmed_node_info[trimmed_node_info['Trimmed'] == 'Yes'].index:
    for leaf in node_leaf_dict[node]:
        # Every leaf under a trimmed node should be in the trimmed leaf set; this lookup
        # raises a KeyError if that is not true
        previous_owner = dropped_leaf_dict[leaf]

        if previous_owner != '' and previous_owner != node:
            # The leaf already points to a different node (error in the code somewhere);
            # report it and stop processing this node's leaves
            print('error')
            print(previous_owner)
            break

        # Point the leaf at the top node
        dropped_leaf_dict[leaf] = node

In [32]:
# Run the shuffled background on the trimmed tree for the small set (FC = False),
# using the trimmed node table, the trimmed cell state table and the dropped-leaf mapping
trimmed_small_set_dir = '/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/trimmed_small_set/'

shuffledBG(
    file_name = 'AM-DNA-098_trimmed',
    output_dir = trimmed_small_set_dir,
    numShuffles = 500,
    FC = False,
    empty_node_info = empty_trimmed_node_info,
    cell_state_table = trimmed_tree_cell_state_table,
    node_leaf_dict = node_leaf_dict,
    dropped_leaf_dict = dropped_leaf_dict,
)

In [33]:
# Display order of the small-set progenitor classes used by the plots below
order = ['Extended Progenitor', 'Pluripotent Progenitor', 'Bipotent Progenitor', 'Neural Progenitor', 'Somitic Progenitor', 'PGCLC Progenitor', 'Endoderm Progenitor', 'Dropped']

# Start from the trimmed node table and re-classify every node with the small-set scheme
temp = trimmed_node_info.copy()

# Get the actual values from the above node table
for node in temp.index:
    # Leaves that were trimmed away are replaced by the node that now represents them;
    # every other leaf is used as-is
    leaves = [dropped_leaf_dict.get(leaf, leaf) for leaf in node_leaf_dict[node]]

    # Assign with a single .loc[row, column] call. The chained form temp.loc[node][column] = ...
    # can write to a temporary copy of the row and leave `temp` unchanged.
    temp.loc[node, 'Progenitor Type'] = getProgenitorType(leaves, trimmed_tree_cell_state_table)

In [35]:
# Background counts from the trimmed small-set shuffles (one row per progenitor class)
df_bgCounts = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/trimmed_small_set/AM-DNA-098_trimmed_shuffledBG_Counts.txt', index_col = 0)

# Observed number of nodes for each progenitor class, in display order
actual_values = [
    len(temp[temp['Progenitor Type'] == progenitor]['Clone'])
    for progenitor in order
]

# Violin per progenitor class for the background, with the observed count overlaid as a point.
# NOTE(review): points and violins are matched by position, which assumes the rows of the
# counts table follow the same order as `order` - confirm.
fig, ax = plt.subplots(figsize = (20, 5))
sns.violinplot(data = df_bgCounts.T, scale = 'width', ax = ax)
sns.swarmplot(x = order, y = actual_values, color = 'Blue', ax = ax)
ax.set_ylabel('Count')
fig.savefig('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/trimmed_small_set/AM-DNA-098_trimmed_Progenitor_Counts.pdf', dpi = 300)
plt.close(fig)

In [36]:
# Background distributions from the trimmed small-set shuffles, keyed by progenitor class
with open('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/trimmed_small_set/AM-DNA-098_trimmed_shuffledBG_Distributions.pickle', 'rb') as f:
    bgDists_dict = pickle.load(f)

# One ECDF panel per progenitor class in the trimmed small-set dataset (2 x 4 grid, row by row)
fig, ax = plt.subplots(2, 4, figsize = (16, 8))
axes = [ax[row, col] for row in range(2) for col in range(4)]

for panel_index, progenitor in enumerate(bgDists_dict.keys()):
    temp_ax = axes[panel_index]

    # Faint background curves: one ECDF per shuffled distribution
    for shuffled_dist in bgDists_dict[progenitor]:
        sns.ecdfplot(shuffled_dist, ax = temp_ax, color = 'lightblue', alpha = 0.1)

    # Observed data drawn on top of the background curves
    sns.ecdfplot(temp[temp['Progenitor Type'] == progenitor]['Normalized Dist'], ax = temp_ax, color = 'Black')
    temp_ax.set_title(progenitor)

fig.tight_layout()
fig.savefig('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/trimmed_small_set/AM-DNA-098_trimmed_Progenitor_cdfs.pdf', dpi = 300)
plt.close(fig)

In [37]:
# Load the shuffled background results for the trimmed small set
df_bgCounts = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/trimmed_small_set/AM-DNA-098_trimmed_shuffledBG_Counts.txt', index_col = 0)
df_bgVals = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/trimmed_small_set/AM-DNA-098_trimmed_shuffledBG_Means.txt', index_col = 0)
with open('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/trimmed_small_set/AM-DNA-098_trimmed_shuffledBG_Distributions.pickle', 'rb') as f:
    bgDists_dict = pickle.load(f)

# Make a table of actual count, mean BG count, std BG count, zscore, pval, actual dist mean, BG dist mean, BG dist std, zscore, pval
values = ['Actual_Count', 'Mean_BG_Count', 'Std_BG_Count', 'zscore_Count', 'pval_L_Count', 'pval_U_Count', 'Actual_Dist', 'Mean_BG_Dist', 'Std_BG_Dist', 'zscore_Dist', 'pval_L_Dist', 'pval_U_Dist']

progenitor_stats_trim = pd.DataFrame(index = list(bgDists_dict.keys()), columns = values)

for progenitor in progenitor_stats_trim.index:
    nodes_of_type = temp[temp['Progenitor Type'] == progenitor]

    # The same statistics are recorded twice: once for the node count ('Count' columns) and
    # once for the mean normalized distance ('Dist' columns), each against its own background row
    measures = [
        ('Count', len(nodes_of_type['Clone Depth']), df_bgCounts.loc[progenitor]),
        ('Dist', nodes_of_type['Normalized Dist'].mean(), df_bgVals.loc[progenitor + '_Mean']),
    ]

    for suffix, actual, background in measures:
        # The number of shuffles is taken from the background row itself, so the p-values stay
        # correct if shuffledBG is run with a different numShuffles
        num_shuffles = len(background)
        num_above = int((background > actual).sum())
        num_below = int((background < actual).sum())

        bg_mean = background.mean()
        bg_std = background.std()

        # Empirical one-sided p-values. Entries that are neither above nor below the actual
        # value (ties, and any NaN entries) count toward both tails.
        if num_shuffles:
            pval_lower = (num_shuffles - num_above) / num_shuffles
            pval_upper = (num_shuffles - num_below) / num_shuffles
        else:
            pval_lower = float('nan')
            pval_upper = float('nan')

        progenitor_stats_trim.loc[progenitor, 'Actual_' + suffix] = actual
        progenitor_stats_trim.loc[progenitor, 'Mean_BG_' + suffix] = bg_mean
        progenitor_stats_trim.loc[progenitor, 'Std_BG_' + suffix] = bg_std
        progenitor_stats_trim.loc[progenitor, 'zscore_' + suffix] = (actual - bg_mean) / bg_std
        progenitor_stats_trim.loc[progenitor, 'pval_L_' + suffix] = pval_lower
        progenitor_stats_trim.loc[progenitor, 'pval_U_' + suffix] = pval_upper

progenitor_stats_trim.to_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/trimmed_small_set/AM-DNA-098_trimmed_Progenitor_Stats.txt', sep = '\t')

# ShuffledBG on the trimmed nodes using the full set

In [39]:
# Reduce the cell state table to the cells that were not trimmed away; this copy is the one
# used for the full-set shuffled background
leaves_to_keep = [
    cell for cell in tree_cell_state_table['cellBC'] if cell not in trimmed_leaves
]

is_kept_FC = tree_cell_state_table['cellBC'].isin(leaves_to_keep)
trimmed_tree_cell_state_table_FC = tree_cell_state_table[is_kept_FC].copy()

In [40]:
# The nodes at the top of each trimmed region (still in the trimmed node table, flagged 'Yes')
# stand in for the cells removed beneath them, so each one gets its own row in the FC cell state table.

# Each of these nodes is a progenitor of a single cell type; map that type to the cell state
# label written into the table
temp_dict = {
    'Neural': 'NeuralTube1',
    'Somitic': 'Somite',
    'PCGLC': 'PCGLC',
    'Endoderm': 'Endoderm',
    'Endothelial': 'Endothelial',
    'NMPs': 'NMPs',
}

# This adds back in ~66 nodes for the cells that were removed.
# NOTE(review): the row is written as three positional values, so the table is assumed to have
# exactly three columns with the cell state first and the cell barcode last - confirm.
collapsed_rows_FC = trimmed_node_info[trimmed_node_info['Trimmed'] == 'Yes']
for node, progenitor_type in collapsed_rows_FC['Progenitor Type'].items():
    trimmed_tree_cell_state_table_FC.loc[node] = [temp_dict[progenitor_type], 'None', node]

In [41]:
# Map every dropped leaf to the node in the trimmed cell state table that now represents it.
# Each trimmed leaf starts out unassigned (empty string).
dropped_leaf_dict_FC = dict.fromkeys(trimmed_leaves, "")

# Walk the remaining 'Yes' nodes and record each one as the owner of the leaves beneath it
for node in trimmed_node_info[trimmed_node_info['Trimmed'] == 'Yes'].index:
    for leaf in node_leaf_dict[node]:
        # Every leaf under a trimmed node should be in the trimmed leaf set; this lookup
        # raises a KeyError if that is not true
        previous_owner = dropped_leaf_dict_FC[leaf]

        if previous_owner != '' and previous_owner != node:
            # The leaf already points to a different node (error in the code somewhere);
            # report it and stop processing this node's leaves
            print('error')
            print(previous_owner)
            break

        # Point the leaf at the top node
        dropped_leaf_dict_FC[leaf] = node

In [42]:
# Run the shuffled background on the trimmed tree for the full set (FC = True).
# The FC cell state table is paired with the FC dropped-leaf mapping so the call uses the
# objects that were built together for this analysis.
shuffledBG(file_name = 'AM-DNA-098_trimmed_FC',
           empty_node_info = empty_trimmed_node_info,
           cell_state_table = trimmed_tree_cell_state_table_FC,
           node_leaf_dict = node_leaf_dict,
           dropped_leaf_dict = dropped_leaf_dict_FC,
           output_dir = '/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/trimmed_full_set/',
           FC = True,
           numShuffles = 500)

In [43]:
# Graph the counts from each background distribution, a few progenitor types per PDF page
df_bgCounts = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/trimmed_full_set/AM-DNA-098_trimmed_FC_shuffledBG_Counts.txt', index_col = 0)

# Row ranges of the counts table that go on each page
index_ranges = [(0, 10), (10, 19), (19, 28), (28, 37), (37, 46), (46, 55), (55, 63)]

# The context manager closes the PDF even if one of the pages fails to draw
with PdfPages('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/trimmed_full_set/AM-DNA-098_trimmed_Node_Counts_FC.pdf') as pp:
    for i, j in index_ranges:
        test = df_bgCounts.index[i: j]

        # Nothing to draw if the counts table has fewer rows than this range expects
        if len(test) == 0:
            continue

        fig, ax = plt.subplots(figsize = (15, 10))
        sns.violinplot(data = df_bgCounts.T[test], ax = ax, scale = 'width', color = 'lightblue')

        # The background comes from the trimmed tree, so the observed counts are taken from the
        # trimmed node table as well (the untrimmed table would overstate the collapsed types)
        actual_values = [
            len(trimmed_node_info[trimmed_node_info['Progenitor Type'] == progenitor]['Clone Depth'])
            for progenitor in test
        ]
        sns.swarmplot(x = test, y = actual_values, color = 'black', ax = ax)

        plt.xticks(rotation=45)
        plt.ylabel('Counts')
        plt.tight_layout()
        pp.savefig(fig)
        plt.close(fig)

In [44]:
# Load the shuffled background results for the trimmed full set
df_bgCounts = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/trimmed_full_set/AM-DNA-098_trimmed_FC_shuffledBG_Counts.txt', index_col = 0)
df_bgVals = pd.read_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/trimmed_full_set/AM-DNA-098_trimmed_FC_shuffledBG_Means.txt', index_col = 0)
with open('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/trimmed_full_set/AM-DNA-098_trimmed_FC_shuffledBG_Distributions.pickle', 'rb') as f:
    bgDists_dict = pickle.load(f)

# Make a table of actual count, mean BG count, std BG count, zscore, pval, actual dist mean, BG dist mean, BG dist std, zscore, pval
values = ['Actual_Count', 'Mean_BG_Count', 'Std_BG_Count', 'zscore_Count', 'pval_L_Count', 'pval_U_Count', 'Actual_Dist', 'Mean_BG_Dist', 'Std_BG_Dist', 'zscore_Dist', 'pval_L_Dist', 'pval_U_Dist']

progenitor_stats_trim_FC = pd.DataFrame(index = list(bgDists_dict.keys()), columns = values)

for progenitor in progenitor_stats_trim_FC.index:
    nodes_of_type = trimmed_node_info[trimmed_node_info['Progenitor Type'] == progenitor]

    # The same statistics are recorded twice: once for the node count ('Count' columns) and
    # once for the mean normalized distance ('Dist' columns), each against its own background row
    measures = [
        ('Count', len(nodes_of_type['Clone Depth']), df_bgCounts.loc[progenitor]),
        ('Dist', nodes_of_type['Normalized Dist'].mean(), df_bgVals.loc[progenitor + '_Mean']),
    ]

    for suffix, actual, background in measures:
        # The number of shuffles is taken from the background row itself, so the p-values stay
        # correct if shuffledBG is run with a different numShuffles
        num_shuffles = len(background)
        num_above = int((background > actual).sum())
        num_below = int((background < actual).sum())

        bg_mean = background.mean()
        bg_std = background.std()

        # Empirical one-sided p-values. Entries that are neither above nor below the actual
        # value (ties, and any NaN entries) count toward both tails.
        if num_shuffles:
            pval_lower = (num_shuffles - num_above) / num_shuffles
            pval_upper = (num_shuffles - num_below) / num_shuffles
        else:
            pval_lower = float('nan')
            pval_upper = float('nan')

        progenitor_stats_trim_FC.loc[progenitor, 'Actual_' + suffix] = actual
        progenitor_stats_trim_FC.loc[progenitor, 'Mean_BG_' + suffix] = bg_mean
        progenitor_stats_trim_FC.loc[progenitor, 'Std_BG_' + suffix] = bg_std
        progenitor_stats_trim_FC.loc[progenitor, 'zscore_' + suffix] = (actual - bg_mean) / bg_std
        progenitor_stats_trim_FC.loc[progenitor, 'pval_L_' + suffix] = pval_lower
        progenitor_stats_trim_FC.loc[progenitor, 'pval_U_' + suffix] = pval_upper

progenitor_stats_trim_FC.to_csv('/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/shuffledBG/trimmed_full_set/AM-DNA-098_trimmed_FC_Progenitor_Stats.txt', sep = '\t')