In [1]:
from Bio import Entrez,SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
import pandas as pd
import numpy as np
import time
import json
from sklearn.neighbors import NearestNeighbors
from sklearn import model_selection
from sklearn.metrics import accuracy_score

In [2]:
# Contact address configured on the Entrez client before any request is made.
# NOTE(review): a personal email is hardcoded here; consider reading it from
# an environment variable before the notebook is shared or committed.
Entrez.email = 'alekey039@hotmail.com'

In [3]:
def retrieve_ids(max, db, query):
  """Collect every record ID matching an Entrez search, page by page.

  Args:
    max: Page size, passed to ``Entrez.esearch`` as ``retmax``; also the step
      by which ``retstart`` advances. The name shadows the builtin inside
      this function only; it is kept so existing keyword callers still work.
    db: Name of the Entrez database to search.
    query: Entrez search term.

  Returns:
    A list with the concatenated ``IdList`` of every page, in search order.
    Paging stops at the first page whose ``IdList`` is empty.

  A failed request is retried indefinitely with exponential backoff. The
  delay is capped so a long outage cannot escalate into hour-long sleeps.
  """
  max_sleep = 60  # upper bound, in seconds, for the retry delay
  ids = []
  start = 0
  sleep_time = 1

  while True:
    try:
      handle = Entrez.esearch(db = db, retmax = max, retstart = start, term = query)
      try:
        rec = Entrez.read(handle)
      finally:
        # Close the handle even when parsing fails, so retries do not leak it.
        handle.close()
      sleep_time = 1

    except Exception as error:
      print('Search failed, trying again in', sleep_time,'seconds:', error)
      time.sleep(sleep_time)
      sleep_time = min(sleep_time * 2, max_sleep)
      continue

    page = rec['IdList']
    if len(page) == 0:
      break

    ids += page
    start += max
  return ids


In [4]:
def retrieve_summary(ids, max):
  """Fetch the title and accession of each ID from the 'ipg' database.

  Args:
    ids: List of IDs to summarise.
    max: Number of IDs sent per ``Entrez.esummary`` request (also passed as
      ``retmax``). Must be positive. The name shadows the builtin inside
      this function only; it is kept for existing callers.

  Returns:
    A ``(titles, acc)`` tuple of two parallel lists built from the 'Title'
    and 'Accession' fields of every returned document summary.

  A failed request is retried indefinitely with exponential backoff; the
  delay is capped so a long outage cannot escalate into hour-long sleeps.
  """
  max_sleep = 60  # upper bound, in seconds, for the retry delay
  titles = []
  acc = []
  sleep_time = 1

  # range() stepping replaces the manual counter and rejects a zero step
  # instead of looping forever.
  for start in range(0, len(ids), max):
    idsfrag = ids[start:start + max]

    while True:
      try:
        handle = Entrez.esummary(db='ipg', id = idsfrag, retmax = max)
        try:
          ipgsum = Entrez.read(handle)
        finally:
          # Close the handle even when parsing fails, so retries do not leak it.
          handle.close()
        sleep_time = 1
        break

      except Exception as error:
        print('Error retrieving data, trying again in', sleep_time,'seconds:', error)
        time.sleep(sleep_time)
        sleep_time = min(sleep_time * 2, max_sleep)

    for entry in ipgsum['DocumentSummarySet']['DocumentSummary']:
      titles.append(entry['Title'])
      acc.append(entry['Accession'])

  return titles, acc

In [5]:
#Naming unnamed proteins
def fix_unnamed(titles):
  """Replace each empty title with 'unnamed protein v<N>', N counting from 1.

  The list is modified in place and the same list object is returned.
  """
  counter = 0
  for pos, title in enumerate(titles):
    if title != '':
      continue
    counter += 1
    titles[pos] = 'unnamed protein v' + str(counter)
  return titles

In [6]:
#Download the pathogenic bacteria list from Bartlett et al. (2022)
#Store it in a dataframe: only columns F:G of the sheet 'Tab 6 Full List'
#are read (the next cell accesses them as 'genus' and 'species')
url = 'https://github.com/padpadpadpad/bartlett_et_al_2022_human_pathogens/raw/master/data/bacteria_human_pathogens.xlsx'
bdf = pd.read_excel(url, sheet_name='Tab 6 Full List', usecols="F:G", skiprows=0)

In [7]:
#Convert the dataframe to a list of names
#Join the genus and species columns with a single space ("Genus species")
#NOTE(review): a row with a missing genus or species would presumably yield
#a non-string entry here - confirm the sheet has no blanks

pblist = list(bdf['genus'] + ' ' + bdf['species'])

In [8]:
# Found stray characters in some of the names
# Used .replace to remove them (the result is a new list; pblist is unchanged)
# NOTE(review): '¬†' looks like a mis-decoded non-breaking space - confirm
# against the spreadsheet that this literal matches what pandas actually reads

clean_pathogen_list = [species.replace('¬†','') for species in pblist]

In [10]:
# Page size for the Entrez search. Named `maxm` (as in the receptor cells)
# rather than `max`, so the builtin max() is not shadowed for every later cell.
maxm = 100
db = 'nucleotide'
query = 'Viruses[ORGN] AND phage[All fields] AND srcdb_refseq[PROP] \
NOT wgs[PROP] NOT cellular organisms[ORGN] NOT AC_000001:AC_999999[PACC]'

#Execute the function with these parameters
phageids = retrieve_ids(maxm, db, query)

In [11]:
#Input: IDs of phages
#Output: List of bacterial hosts
#seq_start and seq_stop parameters retrieve the first feature only (source)
#In the source feature, there is information about the host

def _host_species(strain):
  """Reduce a host description to 'Genus species', or to the genus alone.

  The genus alone is returned when the description has a single word or when
  its second word contains 'sp.' or 'spp.' (species not determined).
  """
  words = strain.split(" ", 2)
  if len(words) == 1 or "sp." in words[1] or "spp." in words[1]:
    return words[0]
  return words[0] + " " + words[1]


def phageid_to_host(phageids):
  """Look up the host recorded in the GenBank entry of each phage ID.

  For every ID the record is fetched from the 'nucleotide' database with
  ``seq_start = seq_stop = 1`` and the qualifiers of its first feature are
  read. The host is taken from the 'host' qualifier, falling back to
  'lab_host'.

  Args:
    phageids: Iterable of nucleotide IDs.

  Returns:
    A list of dicts with the keys 'phage' (organism name), 'id', 'acc'
    (record id), 'strain' (host qualifier as written) and 'host' (the strain
    reduced by ``_host_species``). IDs whose record has neither host
    qualifier, cannot be fetched, or cannot be parsed are left out.

  A failed fetch is retried for the SAME ID with exponential backoff, up to
  ``max_attempts`` times. Previously the loop moved on to the next ID after
  the first failure, silently dropping that phage.
  """
  max_attempts = 5  # fetch attempts per ID before it is skipped
  max_sleep = 60    # upper bound, in seconds, for the retry delay
  phageinfo = []

  for phage_id in phageids:
    source = None
    sleep_time = 1

    for attempt in range(1, max_attempts + 1):
      try:
        handle = Entrez.efetch(db="nucleotide", id=phage_id, rettype="gb",
                               retmode="text", seq_start = 1, seq_stop = 1)
        try:
          source = SeqIO.read(handle, 'gb')
        finally:
          # Close the handle even when parsing fails, so retries do not leak it.
          handle.close()
        break

      except Exception as error:
        if attempt == max_attempts:
          print('Error fetching data, giving up on', phage_id, 'after',
                max_attempts, 'attempts:', error)
        else:
          print('Error fetching data, trying again in', sleep_time,'seconds:', error)
          time.sleep(sleep_time)
          sleep_time = min(sleep_time * 2, max_sleep)

    if source is None:
      continue

    # Problems with the record's content are not transient, so they are
    # reported and skipped rather than retried.
    try:
      qual = source.features[0].qualifiers
      strain = qual.get('host', qual.get('lab_host', None))
      if strain is None:
        continue
      strain = strain[0]
      phage_dict = {
        'phage': qual['organism'][0],
        'id': phage_id,
        'acc': source.id,
        'strain': strain,
        'host': _host_species(strain),
      }
    except (IndexError, KeyError) as error:
      print('Skipping', phage_id, '- record could not be parsed:', error)
      continue

    phageinfo.append(phage_dict)

  return phageinfo

In [None]:
# Build the list of phage/host dictionaries; records are fetched from Entrez
# one ID at a time, so this cell is slow for a long ID list.
phageinfo = phageid_to_host(phageids)

In [None]:
def select_hosts(phinfo, patlist):
  """Keep the phages whose host appears in a list of pathogen names.

  Args:
    phinfo: List of phage dicts; each must have a 'host' key holding either
      'Genus species' or a genus alone.
    patlist: List of pathogen names written as 'Genus species'.

  Returns:
    A new list with the dicts from ``phinfo`` whose host equals a pathogen
    name or the genus of one, in their original order.

  Names are compared as whole names, not as substrings of the joined list,
  so fragments such as 'bacterium' or an empty host no longer match.
  """
  species_names = set()
  genera = set()
  for name in patlist:
    if not isinstance(name, str):
      continue  # e.g. a missing value in the source table
    words = name.split()
    if not words:
      continue
    genera.add(words[0])
    species_names.add(' '.join(words))

  pathost = []
  for phage in phinfo:
    host = phage['host']
    if host in species_names or host in genera:
      pathost.append(phage)

  # The original returned `filtered_phagedict`, a name never defined here.
  return pathost

In [18]:
# Save the dictionary of phages with pathogenic hosts to phagedicts.json
# might have to modify this and reduce it to unique phages
# NOTE(review): `filtered_phagedict` is not assigned in any visible cell;
# presumably it should hold the result of select_hosts(...) - confirm and add
# that call before this cell.

with open('phagedicts.json', 'w') as f:
    json.dump(filtered_phagedict, f)

NameError: name 'pathost' is not defined

In [None]:
#List of unique pathogen hosts, in order of first appearance
#The placeholder host 'bacterium' is left out
#(`not 'bacterium'` was a syntax error; the comparison needs `!=`)

uniquepat = list(dict.fromkeys(
  phage['host'] for phage in pathost if phage['host'] != 'bacterium'
))

In [32]:
# Text file holding one host name per line
file_path = 'uniquepat.txt'

# Load the names, removing the newline and surrounding whitespace of each line
upat = []
with open(file_path, 'r') as file:
    upat.extend(raw_line.strip() for raw_line in file)

# Show the loaded list to verify its contents
print(upat)


['Aeromonas hydrophila', 'Aeromonas veronii', 'Vibrio parahaemolyticus', 'Escherichia coli', 'Escherichia', 'Yersinia pseudotuberculosis', 'Salmonella enterica', 'Vibrio cholerae', 'Gordonia terrae', 'Gordonia rubripertincta', 'Klebsiella pneumoniae', 'Listeria monocytogenes', 'Pseudomonas aeruginosa', 'Enterococcus faecalis', 'Bacillus pumilus', 'Bacillus cereus', 'Staphylococcus aureus', 'Bacillus thuringiensis', 'Staphylococcus xylosus', 'Serratia marcescens', 'Pseudomonas', 'Vibrio alginolyticus', 'Vibrio anguillarum', 'Proteus mirabilis', 'Pseudomonas fluorescens', 'Arthrobacter', 'Rhizobium', 'Bifidobacterium dentium', 'Curtobacterium', 'Rothia dentocariosa', 'Microbacterium paraoxydans', 'Mycobacterium smegmatis', 'Rhodococcus erythropolis', 'Streptomyces', 'Providencia stuartii', 'Stenotrophomonas maltophilia', 'Salmonella', 'Salmonella enteritidis', 'Bacillus anthracis', 'Shigella boydii', 'Bacillus subtilis', 'Sphingomonas', 'Hafnia paralvei', 'Pseudomonas putida', 'Klebsiell

In [44]:
def receptors(maxm,db,query):
  """Run the lookup pipeline for one query and return titles with sequences.

  The query is searched with retrieve_ids, the hits are summarised with
  retrieve_summary, the accessions are passed to fetch_sequences, and empty
  titles are given placeholder names by fix_unnamed.

  NOTE(review): fetch_sequences is not defined in the visible cells - confirm
  it exists before running this.

  Returns:
    A (titles, aaseqs) tuple: the titles after fix_unnamed, and whatever
    fetch_sequences returned for the accessions.
  """
  found_ids = retrieve_ids(maxm,db,query)
  titles, accessions = retrieve_summary(found_ids,maxm)
  sequences = fetch_sequences(accessions)

  return fix_unnamed(titles), sequences


In [43]:
# maxm = 200
# db = 'ipg'
# alltitles = []
# allseqs = []
# species = [] 

# for pathogen in upat:
#     query = str(pathogen)+'[ORGN] AND receptor[All fields]'
#     print(pathogen)
#     titles, aaseqs = receptors(maxm,db,query)
#     alltitles += titles
#     allseqs += aaseqs
#     species += [pathogen] * len(titles)  # one label per title; `+= pathogen` would append the string's single characters
#     print(len(titles))
    

  #add species list (to link it to the phage)
  #run this locally

