In [5]:
from Bio import Entrez,SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
import pandas as pd
import numpy as np
import time
import json
from sklearn.neighbors import NearestNeighbors

In [6]:
# Contact address handed to Biopython's Entrez module; NCBI's usage policy
# asks E-utilities clients to identify themselves with an email address.
Entrez.email = 'alekey039@hotmail.com'

In [7]:
def retrieve_ids(query, db, maxrec = 50):
  """Fetch every ID matching a query from an NCBI database.

  Results are requested page by page until the server returns an empty
  page. A failed request is retried indefinitely: the wait starts at one
  second, doubles after every consecutive failure, and is reset to one
  second after any successful request.

  Args:
    query (str): A string used to query the database. The format
      should match the specific requirements of the database.
    db (str): Database from which records are retrieved
    maxrec (int, optional): The number of records to request for each batch

  Returns:
    list: A list of IDs retrieved, in the order the server returned them
  """

  ids = [] # Accumulated IDs across all pages
  start = 0 # Offset of the next page to request
  sleep_time = 1 # Current back-off delay (seconds) after an error

  while True:
    try:
      # Requesting batch of IDs from the database
      handle = Entrez.esearch(db = db, retmax = maxrec, retstart = start, term = query)
      try:
        rec = Entrez.read(handle)
      finally:
        # Close the handle even when parsing fails, so that repeated
        # retries do not pile up open connections.
        handle.close()
      sleep_time = 1 # Reset sleep time after successful request

    except Exception as error:
      # Retry the same page after an exponentially growing pause
      print('Search failed, trying again in', sleep_time,'seconds:', error)
      time.sleep(sleep_time)
      sleep_time *= 2
      continue

    batch = rec['IdList']

    # An empty page means the result set is exhausted
    if not batch:
      break

    # Advance by the number of IDs actually received: if the server caps
    # the page size below maxrec, stepping by maxrec would skip records.
    start += len(batch)
    ids += batch

  return ids

In [8]:
def retrieve_titles(ids, db = 'ipg', maxrec = 50):
  """Retrieve protein titles for given IDs from an NCBI database
    ('Identical Protein Groups' by default).

  The IDs are summarised in batches of ``maxrec``. A failed batch is retried
  indefinitely: the wait starts at one second, doubles after every
  consecutive failure, and is reset to one second after any success.

  Args:
    ids (list): A list of protein IDs for which to retrieve the name
    db (str, optional): Database from which records are retrieved.
    maxrec (int, optional): The number of records to retrieve for each batch

  Returns:
    list: The 'Title' field of every document summary returned, in the
      order the summaries were received. Only titles are collected;
      accession numbers are not returned.
  """

  titles = [] # Accumulated titles across all batches
  start = 0 # Start index of the current batch within ids
  sleep_time = 1 # Current back-off delay (seconds) after an error

  while start < len(ids):
    idsfrag = ids[start:start + maxrec] # Get a fragment of IDs for a batch
    retrieval = False # Indicates successful retrieval

    while not retrieval:
      try:
        # Retrieve batch of summaries from the database
        handle = Entrez.esummary(db = db, id = idsfrag, retmax = maxrec)
        try:
          ipgsum = Entrez.read(handle)
        finally:
          # Close the handle even when parsing fails, so that repeated
          # retries do not pile up open connections.
          handle.close()
        retrieval = True
        sleep_time = 1 # Reset sleep time after successful request

      except Exception as error:
        print('Error retrieving data, trying again in', sleep_time,'seconds:', error)
        time.sleep(sleep_time)
        sleep_time *= 2

    # Extract the title of every summary in this batch
    for entry in ipgsum['DocumentSummarySet']['DocumentSummary']:
      titles.append(entry['Title'])

    start += maxrec # Update start index for next batch

  return titles


def fix_unnamed(titles):
  """Give every blank title a numbered placeholder name.

  Each entry equal to the empty string is overwritten with
  'unnamed protein v<k>', where k counts the blank entries from 1 in list
  order. The list passed in is modified in place.

  Args:
    titles (list): A list of protein titles

  Returns:
    list: The same list object, after the replacements
  """

  # Locate the blank entries first, then number them in order of appearance
  blank_positions = [pos for pos, name in enumerate(titles) if name == '']
  for serial, pos in enumerate(blank_positions, start = 1):
    titles[pos] = 'unnamed protein v' + str(serial)
  return titles

In [9]:
# Load the JSON content of 'phagedicts.json' into the global `pathost`.
# NOTE(review): `pathost` is not referenced by any cell visible here;
# presumably it is consumed elsewhere in the notebook. Confirm before removing.
with open('phagedicts.json', 'r') as f:
    pathost = json.load(f)

In [10]:
def receptors(query, recs = 50):
    """Return the protein titles matching a query in the 'ipg' database.

    Looks up the matching IDs with retrieve_ids, fetches their titles with
    retrieve_titles, and replaces empty titles through fix_unnamed.

    Args:
        query (str): Search term passed to the 'ipg' database
        recs (int, optional): Number of records requested per batch

    Returns:
        list: Protein titles, with blank titles replaced by placeholders
    """
    ipg = 'ipg'
    matched_ids = retrieve_ids(query, db = ipg, maxrec = recs)
    return fix_unnamed(retrieve_titles(matched_ids, db = ipg, maxrec = recs))

In [11]:
def read_data(file):
  '''
  Reads data from a JSON file.

  Args:
    file (str): The path of the file to be read
  Returns:
    dict or None: The data parsed from the file (a dictionary when the
      file holds a JSON object), or None if the file does not exist,
      cannot be opened, or does not contain valid JSON.
  '''
  try:
    with open(file, 'r') as f:
      # Return the parsed content; without this return the function
      # would always yield None and saved data could never be reloaded.
      return json.load(f)
  # OSError covers missing/unreadable files; ValueError covers malformed
  # JSON (json.JSONDecodeError) and undecodable bytes (UnicodeDecodeError).
  # Anything else, e.g. KeyboardInterrupt, is allowed to propagate.
  except (OSError, ValueError):
    return None

def store_data(data, file):
  '''Write data to a file as JSON, replacing whatever the file held.

  Any exception raised while opening or writing is caught and reported
  with a printed message instead of being propagated.

  Args:
    data (dict): The data to be saved.
    file (str): The path of the file where
      data will be saved.

  '''
  try:
    # Mode 'w' truncates the target, so earlier content is discarded
    with open(file, mode = 'w') as sink:
      json.dump(data, sink)
  except Exception as problem:
    print('Error writing to file:', problem)

In [12]:
def read_pathogens(file):
  '''Create a list of unique pathogenic species from a text file.
  Each line must contain an individual pathogen name.

  Surrounding whitespace is stripped from every line. Blank lines are
  skipped and repeated names are kept only once, at the position of
  their first occurrence.

  Args:
    file (str): File path of the text file with the pathogen names.
  Returns:
    list: A list of unique pathogenic species, in file order.
  '''
  with open(file, 'r') as f:
    names = (line.strip() for line in f)
    # dict.fromkeys drops duplicates while preserving first-seen order;
    # the `if name` filter removes blank lines, which would otherwise
    # become empty pathogen names.
    return list(dict.fromkeys(name for name in names if name))

In [13]:
def query_pathogens(upat, maxrec, output_file):
  '''Query receptor protein titles for each pathogen and collect them
  in a dictionary, saving progress to disk along the way.

  Data already present in output_file is loaded first, and pathogens
  found there are skipped, so an interrupted run can be resumed.

  Args:
    upat (list): List of pathogenic species to query
    maxrec (int): Maximum number of records to retrieve for each batch.
    output_file (str): File path where the data will be saved
  Returns:
    dict: A dictionary mapping each pathogen name to the list of
      receptor protein titles retrieved for it. The caller is
      responsible for the final save: titles gathered since the last
      checkpoint are not written by this function.
  '''

  # Start from previously saved results, if any (useful after an interruption)
  saved_data = read_data(output_file)
  data = saved_data if saved_data is not None else {}

  # Number of titles collected since the last checkpoint
  element_counter = 0

  # Iterate over each pathogen and query for receptor proteins
  for pathogen in upat:

    # Skip pathogen if already processed
    if pathogen in data:
      continue

    query = pathogen + '[ORGN] AND receptor[All fields]'
    # Retrieve the receptor protein titles for the current pathogen
    titles = receptors(query, maxrec)
    data[pathogen] = titles

    element_counter += len(titles)
    # Checkpoint once 10 000 or more new titles have accumulated
    if element_counter >= 10000:
      store_data(data, output_file)
      # Reset the same counter that is tested above; otherwise every
      # later iteration would rewrite the whole file.
      element_counter = 0

  # Return the dictionary with data on receptor proteins
  return data

In [54]:
# Driver cell: read the pathogen names from 'uniquepat.txt', query receptor
# titles requesting 200 records per batch, and write the result to
# 'receptor_data.json'.
upat = read_pathogens('uniquepat.txt')
output_file = 'receptor_data.json'
data = query_pathogens(upat, 200, output_file)
# query_pathogens only saves at intervals, so persist the final state here.
store_data(data, output_file) # Save after previous function is done