In [131]:
import ete3
import pandas as pd
from pprint import pprint
import numpy as np

In [17]:
def annotate_lineages(tree, gnmdf, col):
    """Attach the value of ``col`` to every leaf of ``tree`` as a node feature.

    Each leaf's species (first element of ``leaf.get_species()``) is matched
    against the ``'Proteome'`` column of ``gnmdf``; the value found in ``col``
    on the first matching row is stored on the leaf via ``leaf.add_feature``.

    Parameters
    ----------
    tree : object exposing ``get_leaves()`` (e.g. an ete3 tree)
    gnmdf : pandas.DataFrame with a ``'Proteome'`` column and a ``col`` column
    col : str
        Column of ``gnmdf`` to copy; also used as the feature name on the leaf.

    Returns
    -------
    int
        Always ``0``; the tree is modified in place.

    Raises
    ------
    IndexError
        If a leaf's species has no row in ``gnmdf``.
    """
    # Build the species -> value table once rather than re-scanning the whole
    # DataFrame with a boolean mask for every leaf.
    lookup = dict()
    for proteome, value in zip(gnmdf['Proteome'], gnmdf[col]):
        # setdefault keeps the first matching row, as the original lookup did.
        lookup.setdefault(proteome, value)

    for leaf in tree.get_leaves():
        sp = list(leaf.get_species())[0]
        if sp not in lookup:
            # IndexError is kept (the original failed with it on an empty
            # match) so existing handlers still work, but the message now says
            # which species is missing.
            raise IndexError(
                "species %r has no row in the 'Proteome' column; "
                "cannot annotate %r" % (sp, col))
        leaf.add_feature(col, lookup[sp])

    return 0

In [151]:
def clade_norm(tree, tree_id, cladedf, col):
    """Find, per group, the largest single-group clade of ``tree``.

    The leaves are first annotated with ``col`` through ``annotate_lineages``.
    Every node is then visited with ``tree.traverse()``; a node qualifies when
    all of its leaves carry the same ``col`` value, it has more than one leaf,
    and it does not span every leaf of the tree. For each distinct non-missing
    value of ``col`` the qualifying node with the most leaves is kept (the
    first one met in traversal order on ties) and passed through
    ``norm_factor``.

    Parameters
    ----------
    tree : tree object exposing ``traverse()``, ``get_leaves()`` and
        ``get_leaf_names()``
    tree_id : identifier stored under the ``'tree'`` key of every record
    cladedf : pandas.DataFrame handed to ``annotate_lineages``
    col : str
        Column / leaf feature that defines the groups.

    Returns
    -------
    list of dict
        One record per group with keys ``'tree'``, ``'node'``, ``'seq_no'``
        and ``col``, plus whatever ``norm_factor`` adds. Groups appear in the
        order they are first met during traversal. The list is empty when no
        node qualifies.
    """
    annotate_lineages(tree, cladedf, col)
    total_leaves = len(tree.get_leaf_names())

    # group value -> best record so far; a plain dict keeps first-seen order
    # and naturally yields an empty result when nothing qualifies (building an
    # empty DataFrame and indexing it by ``col`` used to raise KeyError).
    best_by_group = dict()
    for st in tree.traverse():
        leaves = st.get_leaves()

        # Skip single tips and the node that spans the whole tree.
        if len(leaves) < 2 or len(leaves) == total_leaves:
            continue

        feat_list = [getattr(leaf, col) for leaf in leaves]
        if len(set(feat_list)) != 1:
            continue  # leaves below this node belong to more than one group

        group = feat_list[0]
        if pd.isna(group):
            continue  # leaves without a group assignment are not reported

        current = best_by_group.get(group)
        # Strict '>' keeps the first maximal clade found in traversal order.
        if current is None or len(leaves) > current['seq_no']:
            mphy = dict()
            mphy['tree'] = tree_id
            mphy['node'] = st
            mphy['seq_no'] = len(leaves)
            mphy[col] = group
            best_by_group[group] = mphy

    return [norm_factor(nodedict) for nodedict in best_by_group.values()]

In [152]:
def get_species(node):
    """Return the species part of a leaf name.

    Names containing an underscore yield their second underscore-separated
    field (``'Phy000CWI8_YEAST'`` -> ``'YEAST'``); any other name is returned
    unchanged.
    """
    if '_' not in node:
        return node
    fields = node.split("_")
    return fields[1]

In [153]:
def norm_factor(nodedict):
    """Add the median node-to-leaf distance to a clade record.

    ``nodedict['node']`` must expose ``get_leaves()`` and ``get_distance()``.
    The median of the distances from that node to each of its leaves is stored
    under ``'mrca_to_tip_median'``; the same dict is updated in place and
    returned.
    """
    mrca = nodedict['node']
    tip_distances = [mrca.get_distance(tip) for tip in mrca.get_leaves()]
    nodedict['mrca_to_tip_median'] = np.median(tip_distances)
    return nodedict

In [154]:
# Parse the Newick file into an ete3 PhyloTree; get_species derives each
# leaf's species from its name.
tree = ete3.PhyloTree('../data/0005_sptree.nwk', sp_naming_function=get_species)

In [155]:
# Load the per-proteome metadata table. Its 'Proteome' column is what
# annotate_lineages matches leaf species against, and 'Normalising group' is
# the column passed to clade_norm further down.
gnmdf = pd.read_csv('../data/0005_norm_groups.csv')
# Show the available column labels.
gnmdf.columns

Index(['Proteome', 'TaxaID', 'Date', 'Longest', 'Source', 'Species Name',
       'Normalising group'],
      dtype='object')

In [158]:
# Collect the per-group clade records of every tree listed in the table.
# Each row is tab-separated: field 0 is used as the tree id and field 3 is
# handed to PhyloTree as the tree description.
cladesl = list()
# The context manager closes the file handle even if a row fails to parse.
with open('../data/0005_19.txt', 'r') as treefile:
    for line in treefile:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing one)
        fields = line.split('\t')  # split once, reuse both fields
        # NOTE: this rebinds the module-level name `tree` on every row.
        tree = ete3.PhyloTree(fields[3], sp_naming_function=get_species)
        cladesl += clade_norm(tree, fields[0], gnmdf, 'Normalising group')

In [187]:
# One row per clade record, indexed by the id of the tree it came from.
cladedf = pd.DataFrame(cladesl).set_index('tree')
# 'mrca_to_tip_median' values of the records whose normalising group is 'B'.
normfactor = cladedf.loc[cladedf['Normalising group'] == 'B',
                         'mrca_to_tip_median']

tree
Phy000CWI8_YEAST    0.170794
Phy000CXDB_YEAST    0.257514
Phy000CWKW_YEAST    0.438460
Phy000CY3M_YEAST    0.271640
Phy000CXWU_YEAST    0.429059
                      ...   
Phy000CYSF_YEAST    0.023830
Phy000CYMZ_YEAST    0.215596
Phy000CZP0_YEAST    0.821982
Phy000CXUN_YEAST    0.803219
Phy000CXY3_YEAST    0.070789
Name: mrca_to_tip_median, Length: 208, dtype: float64

[{'Normalising group': 'B',
  'mrca_to_tip_median': 0.170794,
  'node': PhyloTree node '' (0x7f18eebadc4),
  'seq_no': 12,
  'tree': 'Phy000CWI8_YEAST'},
 {'Normalising group': 'A',
  'mrca_to_tip_median': 0.13262100000000002,
  'node': PhyloTree node '' (0x7f18eebad8b),
  'seq_no': 9,
  'tree': 'Phy000CWI8_YEAST'},
 {'Normalising group': 'B',
  'mrca_to_tip_median': 0.2575145,
  'node': PhyloTree node '' (0x7f18eebb63d),
  'seq_no': 10,
  'tree': 'Phy000CXDB_YEAST'},
 {'Normalising group': 'A',
  'mrca_to_tip_median': 0.3078785,
  'node': PhyloTree node '' (0x7f18eebb625),
  'seq_no': 4,
  'tree': 'Phy000CXDB_YEAST'},
 {'Normalising group': 'B',
  'mrca_to_tip_median': 0.43846,
  'node': PhyloTree node '' (0x7f18eebc90a),
  'seq_no': 121,
  'tree': 'Phy000CWKW_YEAST'},
 {'Normalising group': 'A',
  'mrca_to_tip_median': 0.36852999999999997,
  'node': PhyloTree node '' (0x7f18eebb679),
  'seq_no': 24,
  'tree': 'Phy000CWKW_YEAST'},
 {'Normalising group': 'B',
  'mrca_to_tip_median': 0.