<a href="https://colab.research.google.com/github/Alebraco/phagetool/blob/main/Phage_tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install bio
import time

import numpy as np
import pandas as pd
from Bio import Entrez,SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation

Collecting bio
  Downloading bio-1.5.9-py3-none-any.whl (276 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.4/276.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting biopython>=1.80 (from bio)
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting mygene (from bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting gprofiler-official (from bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl (9.3 kB)
Collecting biothings-client>=0.2.6 (from mygene->bio)
  Downloading biothings_client-0.3.0-py2.py3-none-any.whl (29 kB)
Installing collected packages: biopython, gprofiler-official, biothings-client, mygene, bio
Successfully installed bio-1.5.9 biopython-1.81 biothings-client-0.3.0 gprofiler-official-1.0.0 mygene-3.2.2


In [3]:
# Contact e-mail for the Entrez calls below; set once, before any request is made.
# NOTE(review): a personal address is hard-coded here — consider loading it from configuration.
Entrez.email = 'alekey039@hotmail.com'

In [4]:
# Ask Entrez (einfo) for its general information record and print it.
handle = Entrez.einfo()
try:
  info = Entrez.read(handle)
finally:
  # Release the connection even if parsing the response fails.
  handle.close()
print(info)

{'DbList': ['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'proteinclusters', 'pcassay', 'protfam', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']}


In [5]:
def retrieve_ids(max, query):
  """Collect every 'ipg' identifier matching an Entrez search term.

  The search is paged: each esearch request asks for up to `max`
  identifiers starting at the current offset, and paging stops at the first
  page whose 'IdList' comes back empty.

  Args:
    max: Page size passed to esearch as `retmax`; also the offset step.
    query: Search term passed to esearch as `term`.

  Returns:
    A list with the identifiers of all pages, in the order they were returned.

  Note:
    A failed request is reported and retried after a one-second pause, with
    no upper bound on the number of attempts.
  """
  ids = []
  start = 0
  while True:
    try:
      handle = Entrez.esearch(db='ipg', retmax=max, retstart=start, term=query)
      try:
        rec = Entrez.read(handle)
      finally:
        # Close the connection even when parsing fails, so that retries do
        # not accumulate open handles.
        handle.close()

    except Exception as error:
      print('Search failed, trying again in 1 second:', error)
      time.sleep(1)
      continue

    page = rec['IdList']
    if len(page) == 0:
      break

    ids += page
    start += max
  return ids


In [6]:
#Search parameters (this cell is a candidate for wrapping in a function)
#NOTE(review): the name `max` hides Python's built-in max() for the rest of the notebook.
max = 40
strain = 'Escherichia coli K-12'

#Restrict to the organism and to records mentioning "receptor" in any field
query = f'{strain}[ORGN] AND receptor[All fields]'
print(query)

#Run the paged search with these parameters
ids = retrieve_ids(max, query)

Escherichia coli K-12[ORGN] AND receptor[All fields]


In [7]:
import time
def retrieve_summary(ids, max):
  """Fetch the title and accession of each 'ipg' record listed in `ids`.

  The identifiers are sent to esummary in consecutive slices of `max`
  items. A failed request is reported and retried after a one-second pause,
  with no upper bound on the number of attempts.

  Args:
    ids: Sequence of identifiers to summarise.
    max: Number of identifiers per esummary request (also sent as `retmax`).

  Returns:
    A `(titles, acc)` pair of lists holding the 'Title' and 'Accession' of
    every document summary, in the order they were returned.

  Raises:
    ValueError: If `ids` is non-empty and `max` is not positive, because the
      batching loop could then never advance.
  """
  if len(ids) > 0 and max <= 0:
    raise ValueError('max must be a positive batch size')

  titles = []
  acc = []
  start = 0
  while start < len(ids):
    idsfrag = ids[start:start + max]

    # Keep asking for this slice until one request is read successfully.
    while True:
      try:
        handle = Entrez.esummary(db='ipg', id=idsfrag, retmax=max)
        try:
          ipgsum = Entrez.read(handle)
        finally:
          # Close the connection even when parsing fails, so that retries
          # do not accumulate open handles.
          handle.close()
        break
      except Exception as error:
        print('Error retrieving data, trying again in 1 second:', error)
        time.sleep(1)

    for entry in ipgsum['DocumentSummarySet']['DocumentSummary']:
      titles.append(entry['Title'])
      acc.append(entry['Accession'])

    start += max
  return titles, acc

In [8]:
#Batch size for the summary requests
max = 40
#Fetch the title and accession of every retrieved id
titles, acc = retrieve_summary(ids=ids, max=max)

In [9]:
#Retrieving AA sequences
#All accessions are requested from the 'protein' database in one efetch call;
#a failed request or parse is reported and retried after a one-second pause.
#NOTE(review): later cells pair `output` with `titles` by position — confirm
#that efetch returns exactly one record per accession, in the same order.
success = False
while not success:
  try:
    handle = Entrez.efetch(db = 'protein', id = acc, rettype = 'gb', retmode = 'text')
    try:
      output = list(SeqIO.parse(handle, 'gb'))
    finally:
      #Close the connection even when parsing fails, so retries do not leak handles
      handle.close()
    success = True
  except Exception as error:
    print('Error fetching data, trying again in 1 second:', error)
    time.sleep(1)


In [10]:
#Reading sequences and adding them to a list
#One plain-string sequence per fetched record, in record order
aaseqs = [str(entry.seq) for entry in output]

In [11]:
#Naming unnamed proteins
#Every empty title is replaced in place by a numbered placeholder (v1, v2, ...)
unnamed_seen = 0
for position, title in enumerate(titles):
  if title != '':
    continue
  unnamed_seen += 1
  titles[position] = 'unnamed protein v' + str(unnamed_seen)


In [19]:
#Version 1
#Unique protein names, in order of first appearance
#Amino-acid sequences are not considered: entries are compared by name only

#dict.fromkeys de-duplicates in a single pass while keeping first-seen order
titles_unique = list(dict.fromkeys(titles))

In [13]:
#Version 1 dictionary
#If multiple AA sequences for a protein, add them to a unique key
#Unique keys, multiple values

# protein_dictionary = {}
# for key, value in zip(titles, aaseqs):
#   if key not in protein_dictionary:
#     protein_dictionary[key] = [value]
#   else:
#     protein_dictionary[key].append(value)


In [33]:
#Version 2 dictionary
#Repeated protein names get a ' vN' suffix (N = occurrence number, starting at 2);
#the first occurrence keeps the plain name
#Comparison will be by AA seq

titles_version = []
key_counter = {}

for name in titles:
  occurrence = key_counter.get(name, 0) + 1
  key_counter[name] = occurrence
  titles_version.append(name if occurrence == 1 else str(name) + ' v' + str(occurrence))

#Pair each versioned title with the sequence at the same position
protein_dictionary_v2 = dict(zip(titles_version, aaseqs))


In [34]:
#Creating DataFrame of Version 1 dictionary
#One row per unique protein name; the strain column flags names found in `titles`
df = pd.DataFrame(titles_unique, columns=['Protein Name'])
df = df.assign(**{strain: df['Protein Name'].isin(titles)})

df

Unnamed: 0,Protein Name,Escherichia coli K-12
0,ferrichrome porin FhuA,True
1,fimbrial adhesin EcpD,True
2,biofilm formation regulator BssR,True
3,catecholate siderophore receptor CirA,True
4,bacteriophage adsorption protein NfrA,True
...,...,...
74,Hypothetical protein,True
75,tonB-dependent receptor yncD,True
76,ferrienterobactin receptor precursor,True
77,phage lambda receptor protein,True


In [36]:
#Creating DataFrame of Version 2 dictionary
#Each stored sequence is checked against the sequences fetched for this query;
#the strain column holds the resulting True/False and the raw sequence is dropped

df2 = pd.DataFrame(list(protein_dictionary_v2.items()), columns=['Protein Name','AAseq'])
df2 = df2.assign(**{strain: df2['AAseq'].isin(aaseqs)}).drop(columns='AAseq')

df2

Unnamed: 0,Protein Name,Escherichia coli K-12
0,ferrichrome porin FhuA,True
1,fimbrial adhesin EcpD,True
2,biofilm formation regulator BssR,True
3,catecholate siderophore receptor CirA,True
4,bacteriophage adsorption protein NfrA,True
...,...,...
365,TonB-dependent receptor plug domain-containing...,True
366,TonB-dependent receptor v23,True
367,ferrienterobactin receptor precursor,True
368,phage lambda receptor protein,True
