<a href="https://colab.research.google.com/github/Alebraco/phagetool/blob/main/Phage_tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [99]:
!pip install bio
import os

import numpy as np
import pandas as pd
from Bio import Entrez,SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation



In [100]:
# Contact address that Biopython attaches to the E-utilities requests.
# Set the ENTREZ_EMAIL environment variable to use your own address;
# otherwise the default below is used.
Entrez.email = os.environ.get('ENTREZ_EMAIL', 'alekey039@hotmail.com')

In [101]:
# Query EInfo for the 'ipg' database and print the parsed response
# (database metadata and the list of searchable fields).
handle = Entrez.einfo(db='ipg')
info = Entrez.read(handle)
# Release the connection once the response has been parsed.
handle.close()
print(info)

{'DbInfo': {'DbName': 'ipg', 'MenuName': 'Identical Protein Groups', 'Description': 'Identical Protein Groups DB', 'DbBuild': 'Build230910-1713.1', 'Count': '640396107', 'LastUpdate': '2023/09/17 14:55', 'FieldList': [{'Name': 'ALL', 'FullName': 'All Fields', 'Description': 'All terms from all searchable fields', 'TermCount': '12187684305', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'N', 'Hierarchy': 'N', 'IsHidden': 'N'}, {'Name': 'UID', 'FullName': 'UID', 'Description': 'Unique number assigned to each sequence', 'TermCount': '0', 'IsDate': 'N', 'IsNumerical': 'Y', 'SingleToken': 'Y', 'Hierarchy': 'N', 'IsHidden': 'Y'}, {'Name': 'FILT', 'FullName': 'Filter', 'Description': 'Limits the records', 'TermCount': '44', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'Y', 'Hierarchy': 'N', 'IsHidden': 'N'}, {'Name': 'WORD', 'FullName': 'Text Word', 'Description': 'Free text associated with record', 'TermCount': '9390372', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'N', 'Hiera

In [102]:
def retrieve_ids(max, query):
  """Collect the IDs of all 'ipg' records matching ``query``, page by page.

  Args:
    max: Page size. Sent as ``retmax`` on every ESearch request and used as
      the ``retstart`` increment between requests. (The name shadows the
      built-in ``max`` inside this function; it is kept so existing keyword
      callers keep working.)
    query: Search term, passed to ESearch as ``term``.

  Returns:
    A list with the ``IdList`` entries of every page, in the order returned.
  """
  ids = []
  start = 0
  while True:
    handle = Entrez.esearch(db='ipg', retmax=max, retstart=start, term=query)
    try:
      rec = Entrez.read(handle)
    finally:
      # Close the handle even when parsing the response raises.
      handle.close()

    page = rec['IdList']
    if len(page) == 0:
      # An empty page means the previous request already returned the last IDs.
      break

    # Extend in place instead of rebuilding the whole list on every page.
    ids.extend(page)
    start += max
  return ids


In [103]:
# Page size for the search requests.
# NOTE: this global shadows Python's built-in max() for the rest of the notebook.
max = 40
strain = 'Escherichia coli K-12'
# Limit the search to the chosen organism and to records mentioning "receptor"
query = f'{strain}[ORGN] AND receptor[All fields]'
print(query)

# Run the paged search with the parameters defined above
ids = retrieve_ids(max, query)

Escherichia coli K-12[ORGN] AND receptor[All fields]


In [104]:
def retrieve_summary(ids, max):
  """Fetch the 'ipg' document summaries of ``ids`` in batches of ``max``.

  Args:
    ids: Sequence of record IDs to summarise.
    max: Number of IDs sent per ESummary request; also passed as ``retmax``.
      Must be at least 1. (The name shadows the built-in ``max`` inside this
      function; it is kept so existing keyword callers keep working.)

  Returns:
    A ``(titles, acc)`` tuple of two lists holding the ``Title`` and
    ``Accession`` fields of every returned summary, in the order returned.

  Raises:
    ValueError: If ``max`` is smaller than 1; the batching loop could not
      advance through ``ids`` with such a step.
  """
  if max < 1:
    raise ValueError('max must be a positive integer')

  titles = []
  acc = []
  for start in range(0, len(ids), max):
    idsfrag = ids[start:start + max]
    handle = Entrez.esummary(db='ipg', id=idsfrag, retmax=max)
    try:
      ipgsum = Entrez.read(handle)
    finally:
      # Close the handle even when parsing the response raises.
      handle.close()

    for entry in ipgsum['DocumentSummarySet']['DocumentSummary']:
      titles.append(entry['Title'])
      acc.append(entry['Accession'])
  return titles, acc

In [105]:
# Batch size for the summary requests
max = 40
# Collect the title and accession of every ID returned by the search
titles, acc = retrieve_summary(ids=ids, max=max)

In [106]:
# Request the summaries of all IDs with one ESummary call.
# Note: `titles` and `acc` are assigned again here, replacing any earlier values.
handle = Entrez.esummary(db="ipg", id=ids)
ipgsum = Entrez.read(handle)
handle.close()

# Walk the summaries once, collecting both fields side by side
titles, acc = [], []
for entry in ipgsum['DocumentSummarySet']['DocumentSummary']:
  titles.append(entry['Title'])
  acc.append(entry['Accession'])


In [107]:
# Download the GenBank-format protein records for the accessions in `acc`
# and parse them into a list
handle = Entrez.efetch(db='protein', id=acc, rettype='gb', retmode='text')
records = SeqIO.parse(handle, 'gb')
# The parser is consumed here, while the handle is still open
output = list(records)
handle.close()

In [108]:
# Sequence of every parsed record as a plain string, in the same order as `output`
aaseqs = [str(entry.seq) for entry in output]

In [109]:
# Replace every empty title with a numbered placeholder name
c = 1
for i, title in enumerate(titles):
  if title != '':
    continue
  titles[i] = f'unnamed protein v{c}'
  c += 1


In [110]:
# Version 1 dictionary
# Groups the AA sequences by protein name: each name appears once as a key
# and maps to the list of all sequences found under that name

protein_dictionary = {}
for key, value in zip(titles, aaseqs):
  # setdefault creates the list the first time a name is seen
  protein_dictionary.setdefault(key, []).append(value)


In [111]:
# Version 2 dictionary
# Repeated protein names get a ' vN' suffix (N = 2, 3, ...) from their second
# occurrence on, so each sequence can be stored under its own key

titles_version = []
key_counter = {}

for key in titles:
  # Number of times this name has been seen so far, including this one
  key_counter[key] = key_counter.get(key, 0) + 1
  if key_counter[key] == 1:
    key_title = key
  else:
    key_title = str(key) + ' v' + str(key_counter[key])
  titles_version.append(key_title)

# Pair every versioned name with its sequence; this is the dictionary the
# Version 2 DataFrame cell reads.
# NOTE(review): built from the current query only - if it is meant to
# accumulate entries over several queries, update an existing dict instead.
protein_dictionary_v2 = dict(zip(titles_version, aaseqs))


In [112]:
# DataFrame for the Version 1 dictionary: one row per distinct protein name,
# plus a boolean column named after the strain telling whether that name
# occurs in `titles`
df = (
  pd.DataFrame(list(protein_dictionary), columns=['Protein Name'])
  .assign(**{strain: lambda frame: frame['Protein Name'].isin(titles)})
)

df

Unnamed: 0,Protein Name,Escherichia coli K-12
0,TonB-dependent receptor plug domain-containing...,True
1,MFS transporter family glucose-6-phosphate rec...,True
2,ferric-rhodotorulic acid/ferric-coprogen recep...,True
3,ferric aerobactin receptor IutA,True
4,long-chain fatty acid transporter FadL,True
...,...,...
74,Hypothetical protein,True
75,tonB-dependent receptor yncD,True
76,ferrienterobactin receptor precursor,True
77,phage lambda receptor protein,True


In [113]:
# DataFrame for the Version 2 dictionary.
# Every dictionary value (an AA sequence; the dictionary may accumulate over
# several queries - TODO confirm) is looked up in `aaseqs`, the sequences of
# the current query. The True/False result goes into a column named after the
# strain, and the sequence column itself is left out of the final table.

df2 = (
  pd.DataFrame(list(protein_dictionary_v2.items()), columns=['Protein Name', 'AAseq'])
  .assign(**{strain: lambda frame: frame['AAseq'].isin(aaseqs)})
  .drop(columns='AAseq')
)

df2

Unnamed: 0,Protein Name,Escherichia coli K-12
0,TonB-dependent receptor plug domain-containing...,True
1,TonB-dependent receptor plug domain-containing...,True
2,MFS transporter family glucose-6-phosphate rec...,True
3,TonB-dependent receptor plug domain-containing...,True
4,ferric-rhodotorulic acid/ferric-coprogen recep...,True
...,...,...
365,TonB-dependent receptor plug domain-containing...,True
366,TonB-dependent receptor v23,True
367,ferrienterobactin receptor precursor,True
368,phage lambda receptor protein,True
