In [1]:
import numpy as np
import matplotlib.pyplot as plt
from Bio import SeqIO
from Bio import Phylo
from featureUtils import BED_file_to_features, feature_to_seq
from Bio.Phylo.PhyloXML import Phyloxml
from hierarchical_clustering import hierarchical_clustering, build_phylogeny, get_matrices, save_phylogeny, save_matrices, load_phylogeny, load_matrices, get_clustering_matrices
from matrix_utils import load_matrix_from_triu

In [2]:
# Index every record of the reference FASTA by its sequence id.
# If two records share an id, the later one replaces the earlier one.
references = dict(
    (record.id, record)
    for record in SeqIO.parse("./data/monomers/chr2_centromere_plus2Mb.fasta", "fasta")
)

In [3]:
# Load the monomer annotations from the BED file via the project helper.
# NOTE(review): presumably yields one feature object per BED record — confirm in featureUtils.
monomers_as_features = BED_file_to_features(
    "./data/monomers/chr2_StringDecMon2extr_noMorgante.bed"
)

In [4]:
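# Optional subsampling for quick test runs: uncomment ONE of the two lines below,
# together with the commented-out slicing of `monomers_as_features` and of
# `dist_matrix` further down, so that both are truncated to the same size.
#sample_size = len(monomers_as_features)
#sample_size = 10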

In [5]:
# Optional: keep only the first `sample_size` monomers (requires `sample_size` to be set).
#monomers_as_features = monomers_as_features[0:sample_size]

In [6]:
# Resolve each monomer feature against the reference records, keeping the
# order of `monomers_as_features`.
monomers_as_seqs = list(
    feature_to_seq(monomer, references)
    for monomer in monomers_as_features
)

In [7]:
# Load the previously saved distance matrix through the project helper.
# NOTE(review): judging by the helper's name, the file stores only the upper
# triangle and the helper rebuilds the full matrix — confirm in matrix_utils.
dist_matrix = load_matrix_from_triu(
    'data/dist_matrix_triu.npy'
)

In [9]:
# Display the loaded matrix as the cell output (sanity check of values and dtype).
dist_matrix

array([[ 0, 14, 10, ..., 42, 48, 65],
       [14,  0,  8, ..., 47, 49, 65],
       [10,  8,  0, ..., 45, 47, 68],
       ...,
       [42, 47, 45, ...,  0, 26, 69],
       [48, 49, 47, ..., 26,  0, 70],
       [65, 65, 68, ..., 69, 70,  0]], dtype=uint8)

In [10]:
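# Optional: restrict the distance matrix to the first `sample_size` rows and columns
# (requires `sample_size` to be set; use together with the matching slice of
# `monomers_as_features`).
#dist_matrix = dist_matrix[0:sample_size,0:sample_size]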

In [11]:
# Cluster the monomers from their pairwise distances, then persist the result.
# NOTE(review): the displayed matrix is uint8; confirm hierarchical_clustering
# does not accumulate distances in that dtype (uint8 arithmetic wraps at 255).
phylogeny = hierarchical_clustering(
    dist_matrix=dist_matrix,
)
save_phylogeny(
    filename='data/phylo',
    phylogeny=phylogeny,
)

In [12]:
# Wrap the clustering result in a PhyloXML document holding a single phylogeny
# built from the simple phylogeny and the monomer features.
phyloXml = Phyloxml(
    attributes=None,
    phylogenies=[
        build_phylogeny(
            simple_phylogeny=phylogeny,
            features=monomers_as_features,
        )
    ],
)
# Write the document to disk; as the last expression of the cell, the return
# value of Phylo.write is shown as the cell output.
Phylo.write(
    phyloXml,
    'data/phylogeny.xml',
    format='phyloxml',
)

1