In [1]:
from Bio import Entrez, SeqIO, AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator
import numpy as np
import heapq

In [2]:
# Read a single alignment from 'aligned_phages.fasta', parsed as FASTA.
# NOTE(review): presumably a pre-computed multiple sequence alignment of phage
# genomes (inferred from the file name only) -- confirm the file's provenance.
alignment = AlignIO.read('aligned_phages.fasta', 'fasta')

In [3]:
# Build the distance calculator, selecting the model named 'identity'.
calculator = DistanceCalculator('identity')

In [4]:
# Compute the distance matrix for the loaded alignment. The result is indexed
# numerically by top_distances(), and its .names attribute is used later to map
# those indices back to sequence identifiers.
distance_matrix = calculator.get_distance(alignment)

In [8]:
def top_distances(matrix, k=1):
    """Return the ``k`` largest entries found below the diagonal of ``matrix``.

    Parameters
    ----------
    matrix : array-like
        Square pairwise distance matrix, i.e. anything ``np.array`` converts to
        a 2-D array. Only entries with row index ``i`` greater than column
        index ``j`` are read, so the diagonal is skipped and each unordered
        pair is visited once.
    k : int, optional
        Maximum number of pairs to keep (default 1). A value of zero or less
        yields an empty list.

    Returns
    -------
    list of tuple
        ``(distance, i, j)`` tuples with ``i > j``. The list is a min-heap
        (smallest retained distance first); it is NOT fully sorted. It holds
        fewer than ``k`` tuples when the matrix has fewer than ``k`` pairs.
    """
    # Guard: with k <= 0 nothing is ever pushed, so peeking at minheap[0]
    # below would raise IndexError on the empty heap.
    if k <= 0:
        return []

    array = np.array(matrix)

    # Min-heap of the k largest distances seen so far; the root is the
    # smallest of them and therefore the next candidate for eviction.
    minheap = []

    for i in range(len(array)):
        for j in range(i):  # strictly below the diagonal
            d = array[i, j]  # Distance between samples i and j
            if len(minheap) < k:
                # Heap not full yet: keep every pair.
                heapq.heappush(minheap, (d, i, j))
            elif d > minheap[0][0]:
                # Strictly larger than the smallest kept distance: swap it in.
                # Ties with the root are left in place.
                heapq.heappushpop(minheap, (d, i, j))
    return minheap

In [9]:
# Keep the five most distant pairs. k is passed explicitly because the default
# (k=1) returns a single tuple, whereas the recorded output of this cell and the
# index/accession lists derived from it are built from five pairs.
heap = top_distances(distance_matrix, k=5)
heap

[(0.5515128080537097, 6, 0),
 (0.6397764835510782, 22, 19),
 (0.6692045524538219, 22, 18),
 (0.680633593167213, 22, 0),
 (0.6730678419644263, 22, 17)]

In [10]:
# Collect every matrix index that appears in at least one of the retained
# (distance, i, j) tuples, dropping duplicates.
unique_indices = {
    index
    for _distance, first, second in heap
    for index in (first, second)
}

In [13]:
unique_indices

{0, 6, 17, 18, 19, 22}

In [11]:
# Translate each selected matrix index into the name stored at that position
# in the distance matrix, following the set's iteration order.
accn = [distance_matrix.names[position] for position in unique_indices]

In [12]:
accn

['NC_070951.1',
 'NC_071021.1',
 'NC_048804.1',
 'NC_048802.1',
 'NC_048755.1',
 'NC_029000.1']