In [4]:
def retrieve_pathogenic_list():
    #Download pathogenic bacteria list from Barlett et al.
    #Store it in a dataframe
    url = 'https://github.com/padpadpadpad/bartlett_et_al_2022_human_pathogens/raw/master/data/bacteria_human_pathogens.xlsx'
    bdf = pd.read_excel(url, sheet_name='Tab 6 Full List', usecols="F:G", skiprows=0)
    
    #Convert dataframe to list
    #Combine the genus and species column
    pblist = list(bdf['genus'] + ' ' + bdf['species'])
    
    #Removing random characters
    clean_patlist = [species.replace('¬†','') for species in pblist]
    return clean_patlist

In [None]:
max = 100
db = 'nucleotide'
query = 'Viruses[ORGN] AND phage[All fields] AND srcdb_refseq[PROP] \
NOT wgs[PROP] NOT cellular organisms[ORGN] NOT AC_000001:AC_999999[PACC]'

#Execute the function with these parameters
phageids = retrieve_ids(max, db, query)

In [None]:
#Input: IDs of phages
#Output: List of bacterial hosts
#seq_start and seq_stop parameters retrieve the first feature only (source)
#In the source feature, there is information about the host

def phageid_to_host(phageids):
  phageinfo = []
  sleep_time = 1

  for id in phageids:
    phage_dict = {}
    try:
      handle = Entrez.efetch(db="nucleotide", id=id, rettype="gb",
                            retmode="text", seq_start = 1, seq_stop = 1)
      source = SeqIO.read(handle, 'gb')
      handle.close()

      features = source.features[0]
      qual = features.qualifiers

      strain = qual.get('host', qual.get('lab_host', None))

      if strain != None:
        strain = strain[0]
        phage_dict['phage'] = qual['organism'][0]
        phage_dict['id'] = id
        phage_dict['acc'] = source.id
        phage_dict['strain'] = strain

        split = strain.split(" ", 2)

        if len(split) > 1 and ("sp." in split[1] or "spp." in split[1]):
          species = split[0]
        elif len(split) > 1:
          species = split[0] + " " + split[1]
        else:
          species = split[0]
        phage_dict['host'] = species

        phageinfo.append(phage_dict)

      sleep_time = 1

    except Exception as error:
      print('Error fetching data, trying again in', sleep_time,'seconds:', error)
      time.sleep(sleep_time)
      sleep_time *= 2
      continue

  return phageinfo

In [None]:
def select_hosts(phinfo, patlist):
  pathost = []
  patstring = ' '.join(patlist)

  for phage in phinfo:
    if phage['host'] in patstring:
      pathost.append(phage)

  return pathost

In [None]:
# Download the dictionary of phages with pathogenic hosts

with open('phagedicts.json', 'w') as f:
    json.dump(pathost, f)

In [None]:
#List of unique pathogen hosts

uniquepat = []
for phage in pathost:
  if phage['host'] not in uniquepat and phage['host'] != 'bacterium':
    uniquepat.append(phage['host'])

In [None]:
file_path = 'uniquepat.txt'

# Create an empty list to store the lines
upat = []

# Open the file and read each line
with open(file_path, 'r') as file:
    for line in file:
        # Strip newline characters and add to the list
        upat.append(line.strip())

# Print the list to verify the contents
# print(upat)

In [None]:
#Retrieve phage IDs with previously defined function
phageids = retrieve_ids(max, db, query)

#Retrieve host information using phage IDs
phageinfo = phageid_to_host(phageids)

#Create list of dictionaries for phages with pathogen hosts
pathost = select_hosts(phageinfo, clean_pathogen_list)

#Create list of unique pathogen hosts
uniquepat = []
for phage in pathost:
    if phage['host'] not in uniquepat:
        uniquepat.append(phage['host'])