In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import scanpy as sc
import cassiopeia as cas
import seaborn as sns
import pickle

from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from itertools import combinations
from itertools import product
from ete3 import Tree
from typing import Tuple

In [5]:
# Load the pickled cluster colour mapping.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
clusterColorsFile = "/Genomics/chanlab/mchan/Adriano/TLS/TLS_TLSCL/20211102_clusterColorsTLSCL.p"
with open(clusterColorsFile, 'rb') as color_handle:
    colorDict = pickle.load(color_handle)

# Barcode labels 'Bar1' .. 'Bar6' that every later cell iterates over.
barcodes = ['Bar{}'.format(index) for index in range(1, 7)]

# Fill out metadata for trees

In [4]:
def maxDepth(node):
    '''
    Compute the height of the subtree rooted at a node.

    Input:
        - a node in an ete tree (anything exposing is_leaf() and children)
    returns:
        - The number of edges on the longest path from that node down to
          any leaf below it; 0 when the node is itself a leaf
    '''
    if node.is_leaf():
        return 0

    # An internal node sits one edge above its deepest child subtree.
    return 1 + max(maxDepth(child) for child in node.children)

In [6]:
# Row labels: one '<barcode>_<timepoint>_<method>' ID per reconstructed tree,
# ordered by barcode, then timepoint, then method.
timepoints = ['120', '144', '120_144']
methods = ['hybrid', 'greedy', 'neighbor']
tree_combo = ['_'.join(combo) for combo in product(barcodes, timepoints, methods)]

# Empty summary table; every cell starts as NaN and is filled in per tree later.
metadata_columns = [
    'barcode',
    'timepoint',
    'method',
    'n_of_cells',
    'n_of_clones',
    'max_clone_depth',
    'avg_clone_depth',
    'avg_clone_size',
    'tree_likelihood',
    'n_of_unique_alleles',
    'n_of_unique_indels',
]
explant_metadata = pd.DataFrame(index = tree_combo, columns = metadata_columns)

In [10]:
# Fill out the metadata table: one row per (barcode, timepoint, method) tree.
# For each tree this records cell/clone counts, clone depth and size summaries,
# the Cassiopeia continuous-time likelihood, and allele/indel diversity.
lineage_dir = '/Genomics/chanlab/blaw/TLS/data/explant/lineage/3_lineage_reconstruction'

for barcode in barcodes:
    for time in ['120', '144', '120_144']:
        sample = '{}_{}'.format(barcode, time)
        sample_dir = '{}/{}/{}'.format(lineage_dir, barcode, time)

        # The allele table lives at the sample level (it does not depend on the
        # reconstruction method), so read and summarise it once per sample.
        temp_allele_table_file = '{}/{}_allele_table.txt'.format(sample_dir, sample)
        temp_allele_table = pd.read_csv(temp_allele_table_file, index_col = 0)

        unique_alleles = set(temp_allele_table['allele'])
        # Indels are pooled across the r1, r2 and r3 columns.
        # NOTE(review): a missing value (NaN) in these columns would be counted
        # as one extra "indel" - confirm whether that is intended.
        unique_indels = set()
        for indel_column in ['r1', 'r2', 'r3']:
            unique_indels.update(temp_allele_table[indel_column])

        n_of_unique_alleles = len(unique_alleles)
        n_of_unique_indels = len(unique_indels)

        for method in ['greedy', 'neighbor', 'hybrid']:
            ID = '{}_{}'.format(sample, method)
            method_dir = '{}/{}'.format(sample_dir, method)

            nwkFile = '{}/{}_newick_noMutationlessEdges_Labeled.nwk'.format(method_dir, ID)
            characterFile = '{}/{}_character_matrix.txt'.format(method_dir, sample)
            prior_file = '{}/{}_priors.pickle'.format(method_dir, sample)

            t = Tree(nwkFile, format=1)
            character_matrix = pd.read_csv(characterFile, sep='\t', index_col = 0)
            # NOTE: pickle.load can execute arbitrary code; only load trusted files.
            with open(prior_file, 'rb') as f:
                priors = pickle.load(f)

            tree = cas.data.CassiopeiaTree(character_matrix=character_matrix, priors=priors, tree = t)
            # Set explicitly before the likelihood call below.
            tree.parameters['stochastic_missing_probability'] = 0.1

            # Each direct child of the root is treated as one clone.
            clone_sizes = [len(clone.get_leaves()) for clone in t.children]
            clone_depths = [maxDepth(clone) for clone in t.children]

            row_values = {
                'barcode': barcode,
                'timepoint': time,
                'method': method,
                'n_of_cells': len(t.get_leaves()),
                # NOTE(review): this is the depth measured from the root, i.e.
                # max(clone_depths) + 1 - confirm that is the intended metric.
                'max_clone_depth': maxDepth(t),
                'n_of_clones': len(t.children),
                'avg_clone_depth': np.mean(clone_depths),
                'avg_clone_size': np.mean(clone_sizes),
                # Written to the declared 'tree_likelihood' column so the value
                # lands in the table's existing column, not a new stray one.
                'tree_likelihood': cas.tools.calculate_likelihood_continuous(tree),
                'n_of_unique_alleles': n_of_unique_alleles,
                'n_of_unique_indels': n_of_unique_indels,
            }
            for column, value in row_values.items():
                explant_metadata.loc[ID, column] = value

In [11]:
# Persist the per-tree summary table as a tab-separated text file.
explant_metadata_out = '/Genomics/chanlab/blaw/TLS/data/explant/lineage/lineage_stats/explant_metadata.txt'
explant_metadata.to_csv(explant_metadata_out, sep = '\t')