In [68]:
#load the data

import csv
import pandas as pd
import numpy as np
import scipy as sp 
import math
# Read the three comma-separated input files; none of them has a header row.
with open('table.dat', 'r') as table_file:
    table = pd.read_csv(table_file, sep=',', header=None)

with open('msa.dat', 'r') as msa_file:
    msa = pd.read_csv(msa_file, sep=',', header=None)

with open('branchlength.dat', 'r') as brlen_file:
    brlen = pd.read_csv(brlen_file, sep=',', header=None)

# Each alignment row is read as one "<id> <sequence>" string: split it so that
# column 0 holds the numeric id and column 1 holds the sequence.
id_and_sequence = msa[0].str.split(' ', expand=True)
msa[[0, 1]] = id_and_sequence
msa[0] = pd.to_numeric(msa[0])

# 4x4 rate matrix: every off-diagonal rate is 0.1875 and each row sums to zero.
Q = np.array([
    [-0.5625, 0.1875, 0.1875, 0.1875],
    [0.1875, -0.5625, 0.1875, 0.1875],
    [0.1875, 0.1875, -0.5625, 0.1875],
    [0.1875, 0.1875, 0.1875, -0.5625],
])

# Show the loaded tables, in the same order as before.
for frame in (table, msa, brlen):
    print(frame)

   0  1
0  9  1
1  9  2
2  8  9
3  8  3
4  7  4
5  7  5
6  6  7
7  6  8
   0                               1
0  1  AGATCAAGATCAAGATCAAGATCAAGATCA
1  2  AGCTCAAGCTCAAGCTCAAGCTCAAGCTCA
2  3  CGCTATCGCTATCGCTATCGCTATCGCTAT
3  4  CGTTACCGTTACCGTTACCGTTACCGTTAC
4  5  CGCTACCGCTACCGCTACCGCTACCGCTAC
     0    1     2     3    4     5     6     7
0  0.1  0.4  0.01  0.04  0.2  0.08  0.12  0.14


In [70]:
def create_node_list(tab, ms, brl):
    """Build the Node objects of a rooted tree from the three input tables.

    Parameters
    ----------
    tab : pandas.DataFrame
        Edge table with one row per branch: column 0 is the parent node
        number, column 1 the child node number.
    ms : pandas.DataFrame
        Alignment table: column 0 is the node number of a sequenced node,
        column 1 its sequence string.
    brl : pandas.DataFrame
        Branch lengths. Row 0 is consumed left to right, one value per row
        of `tab` (the i-th value belongs to the child in the i-th row).

    Returns
    -------
    list of Node
        One non-root node per row of `tab`, in row order, followed by the
        root (the first parent that never appears as a child), if any.

    Raises
    ------
    IndexError
        If a child has no sibling, i.e. its parent has no other child.
    StopIteration
        If `brl` holds fewer branch lengths than `tab` has rows.
    """
    children = tab[1].tolist()
    parents = tab[0].tolist()

    # Look sequences up by node number. This avoids relying on the alignment
    # rows being sorted by id, and avoids `x in series`, which tests the
    # index labels of a Series rather than its values.
    sequences = dict(zip(ms[0].tolist(), ms[1].tolist()))

    def sequence_of(number):
        # Per-site character array for sequenced nodes, None otherwise.
        text = sequences.get(number)
        return None if text is None else np.array(list(text))

    # parent -> children in table order, so each sibling lookup is cheap.
    offspring = {}
    for child, parent in zip(children, parents):
        offspring.setdefault(parent, []).append(child)

    node_list = []
    branches = iter(brl.iloc[0])
    for itself, parent in zip(children, parents):
        others = [child for child in offspring[parent] if child != itself]
        if not others:
            raise IndexError(
                "node {} has no sibling under parent {}".format(itself, parent)
            )
        # As before, the sibling is the first other child of the same parent.
        sibling = others[0]
        branch = next(branches)
        node_list.append(
            Node(itself, parent, sibling, sequence_of(itself), branch, False)
        )

    # The root is the first parent that is never listed as a child; it has
    # no parent, no sibling and no branch length.
    child_set = set(children)
    for candidate in parents:
        if candidate not in child_set:
            node_list.append(
                Node(candidate, None, None, sequence_of(candidate), None, True)
            )
            break
    return node_list

def find_node_by_number(node_list, number):
    """Return the first node in `node_list` whose `number` equals `number`.

    Returns None when no node matches (including an empty list).
    """
    matches = (candidate for candidate in node_list if candidate.number == number)
    return next(matches, None)

def find_node_by_sigling(node_list, number):
    """Return the sibling of the node numbered `number`.

    The node is located with `find_node_by_number`; its `sibling` attribute
    is then resolved to a node of `node_list` the same way.

    Returns None when `number` is not in `node_list`, or when the stored
    sibling number (e.g. None for the root) matches no node.
    """
    # Resolve the target once instead of once per loop iteration.
    target = find_node_by_number(node_list, number)
    if target is None:
        return None
    return find_node_by_number(node_list, target.sibling)
def find_node_by_parent(node_list, number):
    """Return the parent of the node numbered `number`.

    The node is located with `find_node_by_number`; its `parent` attribute
    is then resolved to a node of `node_list` the same way.

    Returns None when `number` is not in `node_list`, or when the stored
    parent number (e.g. None for the root) matches no node.
    """
    # Resolve the target once instead of once per loop iteration.
    target = find_node_by_number(node_list, number)
    if target is None:
        return None
    return find_node_by_number(node_list, target.parent)
def find_root_node(node_list):
    """Return the first node whose `root` attribute compares equal to True.

    Returns None when no node is flagged as root.
    """
    flagged = (candidate for candidate in node_list if candidate.root == True)
    return next(flagged, None)

In [75]:
# Define the Node and Tree classes

class Node:
    """One node of a rooted tree, with the data needed for a likelihood pass.

    Attributes set by the constructor:
        number  -- identifier of this node
        parent  -- identifier of the parent node (None for the root)
        sibling -- identifier of the sibling node (None for the root)
        seq     -- sequence as an iterable of single characters, or None
        length  -- length of the branch above this node, or None
        root    -- True for the root node, False otherwise

    Attributes set later:
        matrix  -- set by add_probability_matrix
        prob    -- set by add_seq_probability
    """

    # One-hot encoding of each base; the column order is A, T, C, G.
    _BASE_PROB = {
        "A": (1, 0, 0, 0),
        "T": (0, 1, 0, 0),
        "C": (0, 0, 1, 0),
        "G": (0, 0, 0, 1),
    }

    def __init__(self, number, parent, sibling, seq, length, root):
        self.number = number
        self.parent = parent
        self.sibling = sibling
        self.seq = seq
        self.length = length
        self.root = root

    def add_probability_matrix(self, matrix):
        """Set self.matrix to expm(self.length * matrix).

        self.matrix is set to None when the node has no branch length.
        """
        # Import the submodule explicitly: a bare `import scipy` does not
        # load `scipy.linalg` on every SciPy release.
        from scipy.linalg import expm

        length = getattr(self, 'length', None)
        if length is None:
            self.matrix = None
        else:
            self.matrix = expm(length * matrix)

    def add_seq_probability(self):
        """Set self.prob to the one-hot encoding of self.seq.

        The result has one row per site and four columns (A, T, C, G).
        self.prob is set to None when the node has no sequence.

        Raises KeyError for a character other than A, T, C or G.
        """
        seq = getattr(self, 'seq', None)
        if seq is None:
            self.prob = None
        else:
            rows = [self._BASE_PROB[base] for base in seq]
            # reshape keeps the (n_sites, 4) shape even for an empty sequence.
            self.prob = np.array(rows).reshape(-1, 4)

class Tree:
    """A rooted tree, held as a flat list of nodes, with a likelihood pass.

    `position` is the list of node objects. Each node must expose `number`,
    `parent`, `sibling`, `root`, `prob` and `matrix`, plus the methods
    `add_probability_matrix(matrix)` and `add_seq_probability()`.
    """

    def __init__(self, position):
        self.position = position

    def _root_node(self):
        # First node flagged as root, or None when there is none.
        return next((node for node in self.position if node.root), None)

    def fill_probability_matrix(self, matrix):
        """Initialise every node: its branch matrix and its per-site vectors.

        Calling this again resets the per-site vectors of all nodes, so the
        likelihood can be recomputed with a different `matrix`.
        """
        for node in self.position:
            node.add_probability_matrix(matrix)
            node.add_seq_probability()

    def fill_probability_sequence(self):
        """Propagate per-site vectors from the children up to the root.

        A parent's vector is the element-wise product of its two children's
        vectors, each first multiplied by that child's branch matrix. Passes
        over the node list are repeated until the root has a vector.

        Raises
        ------
        ValueError
            If the tree has no root, or if a full pass fills in no new node
            (the tree is malformed and the root can never be reached).
        """
        # Index the nodes once; the first node wins on duplicate numbers.
        by_number = {}
        for node in self.position:
            by_number.setdefault(node.number, node)

        root_node = self._root_node()
        if root_node is None:
            raise ValueError("tree has no root node")

        while root_node.prob is None:
            progressed = False
            for node in self.position:
                if node.root or node.prob is None:
                    continue
                sib = by_number.get(node.sibling)
                parent = by_number.get(node.parent)
                if sib is None or parent is None:
                    continue
                if sib.prob is None or parent.prob is not None:
                    continue
                # NOTE(review): `prob @ matrix` sums over the first index of
                # the branch matrix. For a symmetric rate matrix (such as the
                # Q defined in this notebook) the orientation makes no
                # difference; confirm the intended row/column convention
                # before using a non-symmetric matrix.
                vec1 = np.matmul(node.prob, node.matrix)
                vec2 = np.matmul(sib.prob, sib.matrix)
                parent.prob = vec1 * vec2
                progressed = True
            if not progressed:
                # Without this check a malformed tree would loop forever.
                raise ValueError(
                    "cannot propagate probabilities to the root: "
                    "the tree is malformed"
                )

    def tree_log_likelihood(self, matrix, root_freqs=None):
        """Return the log-likelihood of the tree under rate matrix `matrix`.

        Parameters
        ----------
        matrix : array-like, shape (4, 4)
            Rate matrix passed to every node's `add_probability_matrix`.
        root_freqs : array-like of 4 floats, optional
            State frequencies at the root, in the same state order as the
            nodes' per-site vectors. Defaults to 0.25 for every state.

        Returns
        -------
        float
            Sum over sites of the log of the per-site likelihood.
        """
        self.fill_probability_matrix(matrix)
        self.fill_probability_sequence()
        root_node = self._root_node()
        if root_freqs is None:
            freqs = np.array([0.25, 0.25, 0.25, 0.25])
        else:
            freqs = np.asarray(root_freqs, dtype=float)
        # Flatten once; the original re-flattened the array for every site.
        site_likelihoods = np.matmul(root_node.prob, freqs).flatten()
        log_likelihood = sum([math.log(value) for value in site_likelihoods])
        return log_likelihood
        

In [76]:
# Build the node list from the loaded tables, wrap it in a Tree and display
# the log-likelihood under the rate matrix Q as the cell output.
nodes_test = create_node_list(table, msa, brlen)
tree_test = Tree(nodes_test)
tree_test.tree_log_likelihood(Q)


-188.20007684036113