In [6]:
import json
from Bio import Entrez, SeqIO, AlignIO
from sklearn.neighbors import NearestNeighbors
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import json
import re
import subprocess

In [35]:
# Contact e-mail set on Biopython's Entrez module, which this notebook uses for efetch.
# NOTE(review): a personal address is hard-coded here — consider reading it from an
# environment variable before sharing the notebook.
Entrez.email = 'alekey039@hotmail.com'

In [36]:
# Parse output.json into `data`.
with open('output.json', 'r') as json_file:
    data = json.load(json_file)

In [37]:
# Pull the two sequences out of the loaded JSON; they are zipped pairwise later on.
species, titles = data['species'], data['titles']

In [38]:
# Map each species to the distinct protein titles paired with it, in first-seen order.
species_dict = {}
for sp, protein in zip(species, titles):
    known_proteins = species_dict.setdefault(sp, [])
    if protein not in known_proteins:
        known_proteins.append(protein)

In [39]:
# Sorted, de-duplicated row labels (species) and column labels (protein titles).
# The two lists are independent of each other, so their orders need not correspond.
unique_sp = sorted(set(species))
unique_titles = sorted(set(titles))

# Empty species x title table; its cells are filled in afterwards.
df = pd.DataFrame(index=unique_sp, columns=unique_titles)

In [40]:
# Fill the table one species (row) at a time: 1 where the column's protein title is
# recorded for that species in species_dict, 0 otherwise.
for sp in df.index:
    # Fall back to an empty list, not False: Index.isin() only accepts list-like
    # values, so a species missing from species_dict would otherwise raise TypeError.
    df.loc[sp] = df.columns.isin(species_dict.get(sp, [])).astype(int)

In [41]:
# Display the species x protein-title presence/absence table (rendering is truncated).
df

Unnamed: 0,"2,3-diketo-L-gulonate TRAP transporter substrate-binding protein YiaO","2,3-diketo-L-gulonate-binding periplasmic protein","2,3-diketo-L-gulonate-binding periplasmic protein YiaO (2,3-DKG-binding protein) (Extracytoplasmic solute receptor protein yiaO)","2,3-diketo-L-gulonate-binding periplasmic protein YiaO precursor",2-methylaconitate cis-trans-isomerase PrpF/tripartite-type tricarboxylate transporter receptor subunit TctC,3'-5' exonuclease,3-hydroxyacyl-CoA dehydrogenase,4-(cytidine 5'-diphospho)-2-C-methyl-D-erythritol kinase,4-amino-4-deoxy-L-arabinose-phosphoundecaprenol flippase subunit ArnF,5'-nucleotidase C-terminal domain-containing protein,...,zeta toxin family protein,zinc ABC transporter substrate-binding protein,zinc piracy TonB-dependent receptor ZnuD,zinc ribbon domain-containing protein,zinc-binding dehydrogenase,zinc-dependent alcohol dehydrogenase family protein,zinc-regulated TonB-dependent outer membrane receptor,zinc-regulated TonB-dependent outer membrane receptor domain protein,zinc-regulated outer membrane receptor,zonular occludens toxin domain-containing protein
Achromobacter,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Achromobacter animicus,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Achromobacter xylosoxidans,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acinetobacter,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
Acinetobacter baumannii,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vibrio vulnificus,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Weissella,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Yersinia enterocolitica,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Yersinia pestis,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
# Presence/absence table as a NumPy array: rows follow df's index (unique_sp),
# columns follow df's columns (unique_titles).
features_array = df.to_numpy()
features_array.shape

(200, 2550)

In [43]:
# Pick one species (row 174) as the query for the nearest-neighbour search.
test_idx = 174
test = unique_sp[test_idx]
# Indexing with a list keeps the selected row as a 2-D array with a single row.
test_features = features_array[[test_idx]]
# Sanity check: the matrix should have one row per species label.
print(len(features_array), len(unique_sp))

200 200


In [44]:
# Fit on every species profile (query row included) and look up the 5 closest rows
# under the Hamming metric.
nn_model = NearestNeighbors(n_neighbors=5, metric='hamming').fit(features_array)
tdistances, tindices = nn_model.kneighbors(test_features, n_neighbors=5)

In [45]:
# Species label at the query index.
unique_sp[174]

'Stenotrophomonas'

In [46]:
# Neighbour row indices and matching distances for the single query row.
tindices[0], tdistances[0]

(array([174, 175,  61,  59,  34]),
 array([0.        , 0.01843137, 0.0772549 , 0.08039216, 0.08117647]))

In [47]:
# Translate the neighbour row indices back into species names.
for neighbour_idx in tindices[0]:
    print(unique_sp[neighbour_idx])

Stenotrophomonas
Stenotrophomonas maltophilia
Delftia tsuruhatensis
Delftia
Burkholderia gladioli


In [48]:
# One distance per line, in the same order as the neighbour indices.
for neighbour_dist in tdistances[0]:
    print(neighbour_dist)

0.0
0.01843137254901961
0.07725490196078431
0.0803921568627451
0.0811764705882353


In [49]:
# Drop the query species (row 174) from both the matrix and the label list, so the
# two stay aligned and the query itself can no longer be returned as a neighbour.
held_out = 174
modified_array = np.delete(features_array, held_out, axis=0)
modified_labels = unique_sp[:held_out] + unique_sp[held_out + 1:]
print(len(modified_array), len(modified_labels))

199 199


In [50]:
# Refit without the query row, then ask for the single closest remaining profile
# (n_neighbors=1 on the query overrides the constructor's 5).
nn_model = NearestNeighbors(n_neighbors=5, metric='hamming').fit(modified_array)
distances, indices = nn_model.kneighbors(test_features, n_neighbors=1)

In [51]:
# Look the query up in unique_sp: it was removed from modified_labels.
unique_sp[174]

'Stenotrophomonas'

In [52]:
# Nearest row index and distance; the index refers to modified_array / modified_labels
# (query row removed), not to unique_sp.
indices[0], distances[0]

(array([174]), array([0.01843137]))

In [53]:
# Names of the nearest species, looked up in modified_labels (query row removed).
similar = [modified_labels[idx] for idx in indices[0]]
for name in similar:
    print(name)

Stenotrophomonas maltophilia


In [54]:
# Distance to each selected neighbour, one per line.
for neighbour_dist in distances[0]:
    print(neighbour_dist)

0.01843137254901961


In [3]:
# Parse phagedicts.json into `phageinfo`.
with open('phagedicts.json', 'r') as phage_file:
    phageinfo = json.load(phage_file)

In [4]:
# Inspect the loaded records (dicts with 'phage', 'id', 'strain' and 'host' keys).
# NOTE(review): this displays the whole list — a slice such as phageinfo[:5] would
# keep the saved output small.
phageinfo

[{'phage': 'Aeromonas phage PVN03',
  'id': '2506662159',
  'strain': 'Aeromonas hydrophila 4.3T',
  'host': 'Aeromonas hydrophila'},
 {'phage': 'Aeromonas phage pAh6.2TG',
  'id': '2506662093',
  'strain': 'Aeromonas hydrophila BT09',
  'host': 'Aeromonas hydrophila'},
 {'phage': 'Aeromonas phage BUCT696',
  'id': '2506662019',
  'strain': 'Aeromonas veronii',
  'host': 'Aeromonas veronii'},
 {'phage': 'Vibrio phage vB_VpaM_VPs20',
  'id': '2506661979',
  'strain': 'Vibrio parahaemolyticus',
  'host': 'Vibrio parahaemolyticus'},
 {'phage': 'Escherichia phage vB_EcoM_DE15',
  'id': '2506661895',
  'strain': 'Escherichia coli',
  'host': 'Escherichia coli'},
 {'phage': 'Escherichia phage vB_EcoM_SA91KD',
  'id': '2506661816',
  'strain': 'Escherichia coli O156:H25',
  'host': 'Escherichia coli'},
 {'phage': 'Escherichia phage vB_EcoM_Bp10',
  'id': '2506661743',
  'strain': 'Escherichia sp.',
  'host': 'Escherichia'},
 {'phage': 'Escherichia phage vB_EcoP_Bp7',
  'id': '2506661678',
  '

In [62]:
# For every similar host, collect the ids of the phages recorded for it, and keep
# the matching phage names in one flat list (both in phageinfo order).
cocktail = {host: [] for host in similar}
cocktail_names = []
for record in phageinfo:
    if record['host'] in cocktail:
        cocktail[record['host']].append(record['id'])
        cocktail_names.append(record['phage'])

print(cocktail)

{'Stenotrophomonas maltophilia': ['2438095501', '2473935512', '2473935015', '2473934935', '2467891406', '2438268677', '2438107029', '2438106930', '2438106827', '2438106726', '2438106623', '2438082133', '2438082043', '2438081212', '2438080142', '2431125873', '1864378377', '1859662848', '1859662496', '1859656088', '1631942219', '971766223', '971764472', '589891988', '414090084', '213163898', '1694995567', '514361131', '363542401', '70727678']}


In [57]:
# Retrieve genomes
# Fetch the FASTA records for every phage id in the cocktail, one request per host,
# and write all of them to phages.fasta.
seqs = []
for bact, ids in cocktail.items():
    if not ids:
        # No phage ids recorded for this host: skip instead of sending an empty request.
        continue
    handle = Entrez.efetch(db='nucleotide', id=ids, rettype='fasta', retmode='text')
    try:
        seqs.extend(SeqIO.parse(handle, 'fasta'))
    finally:
        # Close the handle even if parsing fails part-way through.
        handle.close()

with open('phages.fasta', 'w') as fasta_out:
    SeqIO.write(seqs, fasta_out, 'fasta')

In [60]:
def align_sequences(input_file, output_file):
    """Align the sequences in ``input_file`` with MAFFT and save the result.

    Runs ``mafft --auto <input_file>`` and writes the command's standard output
    to ``output_file``, replacing any existing file. MAFFT's standard error is
    left attached to the caller's stderr.

    Args:
        input_file: Path of the sequence file handed to MAFFT.
        output_file: Path the command's standard output is written to.

    Raises:
        FileNotFoundError: If the ``mafft`` executable cannot be found.
        subprocess.CalledProcessError: If MAFFT exits with a non-zero status.
    """
    # Pass the arguments as a list and skip the shell, so paths containing spaces
    # or shell metacharacters are taken literally instead of being interpreted.
    with open(output_file, 'w') as aligned:
        subprocess.run(['mafft', '--auto', str(input_file)], stdout=aligned, check=True)

In [61]:
# Align the downloaded genomes; MAFFT's progress messages appear in the cell output.
align_sequences('phages.fasta', 'aligned_phages.fasta')

nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8176 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..

There are 98 ambiguous characters.
    1 / 30
done.

Constructing a UPGMA tree (efffree=0) ... 
   20 / 30
done.

Progressive alignment 1/2... 
STEP     1 / 29 
len1=162327, len2=5819, Switching to the memsave mode
STEP    10 / 29 mDP 00001 / 00001DP 00001 / 00001DP 00001 / 00001DP 00001 / 00001DP 00001 / 00001DP 00001 / 00001DP 00001 / 00716DP 00002 / 00716DP 00003 / 00716DP 00004 / 00716DP 00005 / 00716DP 00006 / 00716DP 00007 / 00716DP 00008 / 00716DP 00009 / 00716DP 00010 / 00716DP 00011 / 00716DP 00012 / 00716DP 00013 / 00716DP 00014 / 00716DP 00015 / 00716DP 00016 / 00716DP 00017 / 00716DP 00018 / 00716DP 00019 / 00716DP 00020 / 00716DP 00021 / 00716DP 00022 / 00716DP 00023 / 00716DP 00024 / 00716DP 00025 / 00716DP 00026 / 00716DP 00027 / 00716DP 00028 / 00716DP 00029 / 00716DP

STEP    11 / 29 mDP 00001 / 00609DP 00002 / 00609DP 00003 / 00609DP 00004 / 00609DP 00005 / 00609DP 00006 / 00609DP 00007 / 00609DP 00008 / 00609DP 00009 / 00609DP 00010 / 00609DP 00011 / 00609DP 00012 / 00609DP 00013 / 00609DP 00014 / 00609DP 00015 / 00609DP 00016 / 00609DP 00017 / 00609DP 00018 / 00609DP 00019 / 00609DP 00020 / 00609DP 00021 / 00609DP 00022 / 00609DP 00023 / 00609DP 00024 / 00609DP 00025 / 00609DP 00026 / 00609DP 00027 / 00609DP 00028 / 00609DP 00029 / 00609DP 00030 / 00609DP 00031 / 00609DP 00032 / 00609DP 00033 / 00609DP 00034 / 00609DP 00035 / 00609DP 00036 / 00609DP 00037 / 00609DP 00038 / 00609DP 00039 / 00609DP 00040 / 00609DP 00041 / 00609DP 00042 / 00609DP 00043 / 00609DP 00044 / 00609DP 00045 / 00609DP 00046 / 00609DP 00047 / 00609DP 00048 / 00609DP 00049 / 00609DP 00050 / 00609DP 00051 / 00609DP 00052 / 00609DP 00053 / 00609DP 00054 / 00609DP 00055 / 00609DP 00056 / 00609DP 00057 / 00609DP 00058 / 00609DP 00059 / 00609DP 00060 / 00609DP 00061 / 00609DP 0006

STEP    13 / 29 mDP 00001 / 00257DP 00002 / 00257DP 00003 / 00257DP 00004 / 00257DP 00005 / 00257DP 00006 / 00257DP 00007 / 00257DP 00008 / 00257DP 00009 / 00257DP 00010 / 00257DP 00011 / 00257DP 00012 / 00257DP 00013 / 00257DP 00014 / 00257DP 00015 / 00257DP 00016 / 00257DP 00017 / 00257DP 00018 / 00257DP 00019 / 00257DP 00020 / 00257DP 00021 / 00257DP 00022 / 00257DP 00023 / 00257DP 00024 / 00257DP 00025 / 00257DP 00026 / 00257DP 00027 / 00257DP 00028 / 00257DP 00029 / 00257DP 00030 / 00257DP 00031 / 00257DP 00032 / 00257DP 00033 / 00257DP 00034 / 00257DP 00035 / 00257DP 00036 / 00257DP 00037 / 00257DP 00038 / 00257DP 00039 / 00257DP 00040 / 00257DP 00041 / 00257DP 00042 / 00257DP 00043 / 00257DP 00044 / 00257DP 00045 / 00257DP 00046 / 00257DP 00047 / 00257DP 00048 / 00257DP 00049 / 00257DP 00050 / 00257DP 00051 / 00257DP 00052 / 00257DP 00053 / 00257DP 00054 / 00257DP 00055 / 00257DP 00056 / 00257DP 00057 / 00257DP 00058 / 00257DP 00059 / 00257DP 00060 / 00257DP 00061 / 00257DP 0006

STEP     3 / 29 mDP 00002 / 00257DP 00003 / 00257DP 00004 / 00257DP 00005 / 00257DP 00006 / 00257DP 00007 / 00257DP 00008 / 00257DP 00009 / 00257DP 00010 / 00257DP 00011 / 00257DP 00012 / 00257DP 00013 / 00257DP 00014 / 00257DP 00015 / 00257DP 00016 / 00257DP 00017 / 00257DP 00018 / 00257DP 00019 / 00257DP 00020 / 00257DP 00021 / 00257DP 00022 / 00257DP 00023 / 00257DP 00024 / 00257DP 00025 / 00257DP 00026 / 00257DP 00027 / 00257DP 00028 / 00257DP 00029 / 00257DP 00030 / 00257DP 00031 / 00257DP 00032 / 00257DP 00033 / 00257DP 00034 / 00257DP 00035 / 00257DP 00036 / 00257DP 00037 / 00257DP 00038 / 00257DP 00039 / 00257DP 00040 / 00257DP 00041 / 00257DP 00042 / 00257DP 00043 / 00257DP 00044 / 00257DP 00045 / 00257DP 00046 / 00257DP 00047 / 00257DP 00048 / 00257DP 00049 / 00257DP 00050 / 00257DP 00051 / 00257DP 00052 / 00257DP 00053 / 00257DP 00054 / 00257DP 00055 / 00257DP 00056 / 00257DP 00057 / 00257DP 00058 / 00257DP 00059 / 00257DP 00060 / 00257DP 00061 / 00257DP 00062 / 00257DP 0006

DP 00001 / 00870 DP 00002 / 00870 DP 00003 / 00870 DP 00004 / 00870 DP 00005 / 00870 DP 00006 / 00870 DP 00007 / 00870 DP 00008 / 00870 DP 00009 / 00870 DP 00010 / 00870 DP 00011 / 00870 DP 00012 / 00870 DP 00013 / 00870 DP 00014 / 00870 DP 00015 / 00870 DP 00016 / 00870 DP 00017 / 00870 DP 00018 / 00870 DP 00019 / 00870 DP 00020 / 00870 DP 00021 / 00870 DP 00022 / 00870 DP 00023 / 00870 DP 00024 / 00870 DP 00025 / 00870 DP 00026 / 00870 DP 00027 / 00870 DP 00028 / 00870 DP 00029 / 00870 DP 00030 / 008

STEP     4 / 29 mP 00365 / 00870DP 00366 / 00870DP 00367 / 00870DP 00368 / 00870DP 00369 / 00870DP 00370 / 00870DP 00371 / 00870DP 00372 / 00870DP 00373 / 00870DP 00374 / 00870DP 00375 / 00870DP 00376 / 00870DP 00377 / 00870DP 00378 / 00870DP 00379 / 00870DP 00380 / 00870DP 00381 / 00870DP 00382 / 00870DP 00383 / 00870DP 00384 / 00870DP 00385 / 00870DP 00386 / 00870DP 00387 / 00870DP 00388 / 00870DP 00389 / 00870DP 00390 / 00870DP 00391 / 00870DP 00392 / 00870DP 00393 / 00870DP 00394 / 00870DP 00395 / 00870DP 00396 / 00870DP 00397 / 00870DP 00398 / 00870DP 00399 / 00870DP 00400 / 00870DP 00401 / 00870DP 00402 / 00870DP 00403 / 00870DP 00404 / 00870DP 00405 / 00870DP 00406 / 00870DP 00407 / 00870DP 00408 / 00870DP 00409 / 00870DP 00410 / 00870DP 00411 / 00870DP 00412 / 00870DP 00413 / 00870DP 00414 / 00870DP 00415 / 00870DP 00416 / 00870DP 00417 / 00870DP 00418 / 00870DP 00419 / 00870DP 00420 / 00870DP 00421 / 00870DP 00422 / 00870DP 00423 / 00870DP 00424 / 00870DP 00425 / 00870DP 00426

STEP     5 / 29 mP 00338 / 00806DP 00339 / 00806DP 00340 / 00806DP 00341 / 00806DP 00342 / 00806DP 00343 / 00806DP 00344 / 00806DP 00345 / 00806DP 00346 / 00806DP 00347 / 00806DP 00348 / 00806DP 00349 / 00806DP 00350 / 00806DP 00351 / 00806DP 00352 / 00806DP 00353 / 00806DP 00354 / 00806DP 00355 / 00806DP 00356 / 00806DP 00357 / 00806DP 00358 / 00806DP 00359 / 00806DP 00360 / 00806DP 00361 / 00806DP 00362 / 00806DP 00363 / 00806DP 00364 / 00806DP 00365 / 00806DP 00366 / 00806DP 00367 / 00806DP 00368 / 00806DP 00369 / 00806DP 00370 / 00806DP 00371 / 00806DP 00372 / 00806DP 00373 / 00806DP 00374 / 00806DP 00375 / 00806DP 00376 / 00806DP 00377 / 00806DP 00378 / 00806DP 00379 / 00806DP 00380 / 00806DP 00381 / 00806DP 00382 / 00806DP 00383 / 00806DP 00384 / 00806DP 00385 / 00806DP 00386 / 00806DP 00387 / 00806DP 00388 / 00806DP 00389 / 00806DP 00390 / 00806DP 00391 / 00806DP 00392 / 00806DP 00393 / 00806DP 00394 / 00806DP 00395 / 00806DP 00396 / 00806DP 00397 / 00806DP 00398 / 00806DP 00399

DP 00061 / 00333 DP 00062 / 00333 DP 00063 / 00333 DP 00064 / 00333 DP 00065 / 00333 DP 00066 / 00333 DP 00067 / 00333 DP 00068 / 00333 DP 00069 / 00333 DP 00070 / 00333 DP 00071 / 00333 DP 00072 / 00333 DP 00073 / 00333 DP 00074 / 00333 DP 00075 / 00333 DP 00076 / 00333 DP 00077 / 00333 DP 00078 / 00333 DP 00079 / 00333 DP 00080 / 00333 DP 00081 / 00333 DP 00082 / 00333 DP 00083 / 00333 DP 00084 / 00333 DP 00085 / 00333 DP 00086 / 00333 DP 00087 / 00333 DP 00088 / 00333 DP 00089 / 00333 DP 00090 / 003

STEP     6 / 29 mDP 00001 / 00277DP 00002 / 00277DP 00003 / 00277DP 00004 / 00277DP 00005 / 00277DP 00006 / 00277DP 00007 / 00277DP 00008 / 00277DP 00009 / 00277DP 00010 / 00277DP 00011 / 00277DP 00012 / 00277DP 00013 / 00277DP 00014 / 00277DP 00015 / 00277DP 00016 / 00277DP 00017 / 00277DP 00018 / 00277DP 00019 / 00277DP 00020 / 00277DP 00021 / 00277DP 00022 / 00277DP 00023 / 00277DP 00024 / 00277DP 00025 / 00277DP 00026 / 00277DP 00027 / 00277DP 00028 / 00277DP 00029 / 00277DP 00030 / 00277DP 00031 / 00277DP 00032 / 00277DP 00033 / 00277DP 00034 / 00277DP 00035 / 00277DP 00036 / 00277DP 00037 / 00277DP 00038 / 00277DP 00039 / 00277DP 00040 / 00277DP 00041 / 00277DP 00042 / 00277DP 00043 / 00277DP 00044 / 00277DP 00045 / 00277DP 00046 / 00277DP 00047 / 00277DP 00048 / 00277DP 00049 / 00277DP 00050 / 00277DP 00051 / 00277DP 00052 / 00277DP 00053 / 00277DP 00054 / 00277DP 00055 / 00277DP 00056 / 00277DP 00057 / 00277DP 00058 / 00277DP 00059 / 00277DP 00060 / 00277DP 00061 / 00277DP 0006

In [None]:
# Load the aligned FASTA file written by align_sequences.
alignment = AlignIO.read('aligned_phages.fasta', 'fasta')

In [73]:
# Map each phage id to its name so the selected ids can be reported by name.
id_to_phage = {record['id']: record['phage'] for record in phageinfo}

# Positions, within each host's id list, of the phages to report. Kept as an ordered
# tuple under its own name: a set has no guaranteed iteration order, and reusing the
# name `indices` would overwrite the neighbour indices returned by kneighbors and
# break the earlier cells on re-run.
selected_positions = (0, 1, 2, 3, 14, 16)
for bact, ids in cocktail.items():
    for pos in selected_positions:
        if pos >= len(ids):
            # This host has fewer phages than the requested position.
            continue
        target_id = ids[pos]
        # NOTE (original author): "wrong phages here". Presumably the positions were
        # taken from the order of the fetched or aligned records, which may not match
        # the order of `ids` — TODO confirm.
        print(id_to_phage.get(target_id))

Stenotrophomonas maltophilia phage vB_SmaM_Ps15
Stenotrophomonas phage vB_SmaS-AXL_1
Stenotrophomonas phage vB_Sm_QDWS359
Stenotrophomonas phage A1432
Stenotrophomonas phage Pokken
Stenotrophomonas phage Philippe


In [35]:
# tail_regex = re.compile(r'tail.*protein')
# fiberdict = {}

# for bact, ids in cocktail.items():
    
#     tailseqs = []
#     handle = Entrez.efetch(db='nucleotide', id = ids, rettype = 'gb', retmode = 'text')
    
#     for record in SeqIO.parse(handle, 'genbank'):
#         for feature in record.features:
#             if feature.type == 'CDS':
#                 protein = feature.qualifiers.get('product', '')[0]
#                 # Use the regex expression to look for 'tail' and 'protein' in the name
#                 if tail_regex.search(protein):
#                     tailseqs.append(feature.qualifiers.get('translation', '')[0])
                    
#                 # Look for all proteins with 'tail' in them
# #                 if 'tail' in protein:
# #                     print(protein)
# #                 Retrieve the phage name as well
# #             if feature.type == 'source':
# #                 phage = feature.qualifiers.get('organism', '')
# #                 print(phage)
#     fiberdict[bact] = tailseqs
#     handle.close()