In [2]:
from Bio import Entrez, SeqIO, AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator
import numpy as np
import heapq

In [3]:
# Read a single multiple-sequence alignment from 'mafft_file.fasta', parsed as FASTA.
# NOTE(review): the file name suggests MAFFT output — confirm the provenance.
alignment = AlignIO.read('mafft_file.fasta', 'fasta')

In [4]:
# Build the distance calculator, selecting the 'identity' model by name.
calculator = DistanceCalculator('identity')

In [5]:
# Compute the pairwise distance matrix for the sequences in the alignment.
distance_matrix = calculator.get_distance(alignment)

In [7]:
# Show the matrix; it prints as a lower triangle with one row per sequence name.
print(distance_matrix)

NC_070951.1 0.000000
NC_048804.1 0.397833    0.000000
NC_048802.1 0.389826    0.052290    0.000000
NC_048755.1 0.321344    0.113941    0.114557    0.000000
NC_070868.1 0.480995    0.469771    0.466722    0.457127    0.000000
NC_070867.1 0.480588    0.468673    0.465590    0.457836    0.026112    0.000000
NC_049463.1 0.479363    0.468844    0.465853    0.456402    0.035717    0.041476    0.000000
NC_070858.1 0.479938    0.475794    0.472837    0.457035    0.169697    0.168760    0.169482    0.000000
NC_070847.1 0.476286    0.475102    0.472033    0.455625    0.168828    0.168691    0.168520    0.124032    0.000000
NC_042082.1 0.493457    0.480981    0.478661    0.458859    0.289244    0.288292    0.288652    0.286961    0.285702    0.000000
NC_071018.1 0.507593    0.490607    0.488776    0.481419    0.252192    0.251004    0.251548    0.245834    0.248472    0.304711    0.000000
NC_071017.1 0.505218    0.492314    0.490706    0.482377    0.252647    0.251035    0.251648    0.247346    0

In [64]:
def top_distances(matrix, k=5):
    """Return the k largest pairwise distances found in a square matrix.

    Only the strict lower triangle (j < i) is scanned, so every unordered
    pair is visited once and the diagonal is ignored.

    Parameters
    ----------
    matrix : array-like
        Anything ``np.array`` converts to a 2-D array indexable as [i, j].
    k : int, default 5
        Maximum number of pairs to keep. Values <= 0 yield an empty list.

    Returns
    -------
    list of (distance, i, j) tuples
        At most k entries, left in heapq (min-heap) order: element 0 holds
        the smallest of the kept distances, but the list is NOT sorted.
    """
    # Without this guard the first comparison would index an empty heap.
    if k <= 0:
        return []

    array = np.array(matrix)

    minheap = []  # Min-heap whose root is the smallest distance kept so far

    for i in range(len(array)):
        for j in range(i):
            d = array[i, j]  # Distance between samples i and j
            if len(minheap) < k:
                # Still filling up: keep every pair until k are stored
                heapq.heappush(minheap, (d, i, j))
            elif d > minheap[0][0]:
                # Strictly larger than the current root: swap the root out
                heapq.heappushpop(minheap, (d, i, j))
    return minheap

In [85]:
# Keep the 5 (default k) largest pairwise distances as (distance, i, j) tuples.
# The list comes back in heap order, so it is not sorted by distance.
heap = top_distances(distance_matrix)
heap

[(0.5515128080537097, 16, 0),
 (0.6397764835510782, 14, 3),
 (0.680633593167213, 14, 0),
 (0.6692045524538219, 14, 2),
 (0.6730678419644263, 14, 1)]

In [86]:
# Gather every matrix index that takes part in at least one selected pair.
unique_indices = set()
for _, i, j in heap:
    unique_indices |= {i, j}

{0, 1, 2, 3, 14, 16}

In [100]:
# Translate matrix indices into sequence names. Iterating in sorted order keeps
# the list deterministic instead of relying on set iteration order.
accn = [distance_matrix.names[i] for i in sorted(unique_indices)]

In [101]:
# Display the sequence names selected for lookup.
accn

['NC_070951.1',
 'NC_048804.1',
 'NC_048802.1',
 'NC_048755.1',
 'NC_029000.1',
 'NC_071021.1']

In [97]:
# Contact address for Bio.Entrez; presumably attached to the NCBI requests below.
# NOTE(review): a personal address is hardcoded here — consider loading it from
# an environment variable before sharing or committing this notebook.
Entrez.email = 'alekey039@hotmail.com'

In [128]:
# Fetch the GenBank record for each selected accession and collect the
# organism name taken from the first feature of every record.
phagelist = []
# seq_start/seq_stop of 1 request a single base per record, which keeps the
# download small while the record's features are still parsed below.
handle = Entrez.efetch(db="nucleotide", id=accn, rettype="gb",
                    retmode="text", seq_start = 1, seq_stop = 1)
try:
    source = SeqIO.parse(handle, 'gb')
    for item in source:
        # NOTE(review): assumes features[0] carries the 'organism' qualifier
        # (normally the 'source' feature) — confirm for all records.
        organism = item.features[0].qualifiers['organism'][0]
        phagelist.append(organism)
finally:
    # Release the connection even when parsing or the lookup above fails.
    handle.close()

In [144]:
# Print the collected organism names, one per line.
print("\n".join(phagelist))

Stenotrophomonas maltophilia phage vB_SmaM_Ps15
Stenotrophomonas phage Mendera
Stenotrophomonas phage Moby
Stenotrophomonas phage YB07
Stenotrophomonas phage IME13
Stenotrophomonas phage BUCT627
