In [2]:
from Bio import SeqIO
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.PDB import PDBList
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *
import requests
import json
import time
import random
import re
true = True
false = False

In [2]:
data = requests.get("https://data.rcsb.org/rest/v1/core/entry/4HHB")
data.status_code

200

In [3]:
my_query = {
  "query": {
    "type": "terminal",
    "service": "sequence",
    "parameters": {
      "evalue_cutoff": 1,
      "identity_cutoff": 0.9,
      "sequence_type": "protein",
      "value": "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLPARTVETRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMNCKCVIS"
    }
  },
  "request_options": {
    "scoring_strategy": "sequence"
  },
  "return_type": "entry"
}

my_query = json.dumps(my_query)
data = requests.get(f"https://search.rcsb.org/rcsbsearch/v2/query?json={my_query}")
if data.status_code == 200:
    results = data.json()
    df = pd.DataFrame.from_dict(results)
    df

In [4]:
seqList = []
with open('data/aechaeoglobus_fulgidus.faa') as protfile: #Generates a list of each protein sequence
  for record in SeqIO.parse(protfile,"fasta"):
    sequence = str(record.seq)
    seqList.append(sequence)

dfdict = {
    'sequence' : [],
    'length' : [],
    'IEP' : [],
    'MW' : [],
    'charge at pH 7' : [],
}

loadBar = widgets.IntProgress(min=0, max=len(seqList), description='0%', bar_style='info', orientation='horizontal') #Displays loading bar for convenience
display(loadBar)

for sequence in seqList[:1]: #Checks each protein of RCSB API for sequence matches
    my_query = {
      "query": {
        "type": "terminal",
        "service": "sequence",
        "parameters": {
          "evalue_cutoff": 1,
          "identity_cutoff": 0.9,
          "sequence_type": "protein",
          "value": sequence
        }
      },
      "request_options": {
        "scoring_strategy": "sequence"
      },
      "return_type": "entry"
    }
    my_query = json.dumps(my_query)
    APIdata = requests.get(f"https://search.rcsb.org/rcsbsearch/v2/query?json={my_query}")
    if APIdata.status_code == 200: #Adds each protein with sequence matches to a dataframe
        length = len(sequence)
        dfdict['sequence'].append(sequence)
        dfdict['length'].append(length)
        protparams = PA(sequence)
        dfdict['IEP'].append(protparams.isoelectric_point())
        dfdict['MW'].append(protparams.molecular_weight())
        dfdict['charge at pH 7'].append(protparams.charge_at_pH(7))
  
    #Iterates loading bar
    loadBar.value += 1
    loadBar.description = f'Done:{100*loadBar.value/len(seqList):0.2f}%'

loadBar.description = f'Search Complete'
df = pd.DataFrame.from_dict(dfdict)

IntProgress(value=0, bar_style='info', description='0%', max=2407)

In [5]:
df2 = df.copy()
df2['IEP'] = df2['IEP'].map(lambda x: f"{x:.02f}")
df2['charge at pH 7'] = df2['charge at pH 7'].map(lambda x: f"{x:.02f}")
df2['MW'] = df2['MW'].map(lambda x: f"{x:.0f}")
df2

Unnamed: 0,sequence,length,IEP,MW,charge at pH 7


In [6]:

my_query = {
  "query": {
    "type": "terminal",
    "service": "sequence",
    "parameters": {
      "evalue_cutoff": 1,
      "identity_cutoff": 0.9,
      "sequence_type": "protein",
      "value": df.iloc[random.randint(0,212)]["sequence"]
    }
  },
  "request_options": {
    "scoring_strategy": "sequence"
  },
  "return_type": "entry"
}
my_query = json.dumps(my_query)
APIcheck = requests.get(f"https://search.rcsb.org/rcsbsearch/v2/query?json={my_query}")
protdata = APIcheck.json()
print(protdata)
print(protdata['result_set'][0]['identifier'])

IndexError: single positional indexer is out-of-bounds

In [None]:
my_query2 = {"query":{"type":"group","nodes":[{"type":"terminal","service":"text","parameters":{"attribute":"rcsb_entity_source_organism.taxonomy_lineage.name","negation":false,"operator":"contains_phrase","value":"Pyrococcus abyssi"}},{"type":"terminal","service":"text","parameters":{"attribute":"entity_poly.rcsb_entity_polymer_type","value":"Protein","operator":"exact_match"}}],"logical_operator":"and","label":"text"},"return_type":"polymer_entity","request_options":{"return_all_hits":true,"results_verbosity":"minimal","results_content_type":["experimental"],"sort":[{"sort_by":"score","direction":"desc"}],"scoring_strategy":"combined"}}
my_query2 = json.dumps(my_query2)
APIdata2 = requests.get(f"https://search.rcsb.org/rcsbsearch/v2/query?json={my_query2}")
protdata = APIdata2.json()

identifiers = []
for entry in protdata['result_set']:
    identifiers.append(entry['identifier'])



In [None]:
pdblist = PDBList()

In [3]:
taxid = input("Enter Taxonomy ID")
url = f'https://rest.uniprot.org/uniprotkb/stream?compressed=false&format=fasta&query=%28organism_id%3A{taxid}%29%20AND%20%28reviewed%3Atrue%29'
all_fastas = requests.get(url).text

fasta_list = re.split(r'\n(?=>)', all_fastas)

Enter Taxonomy ID 562


In [5]:
def str_to_SeqRecord(string):
    string = string.replace(">","")
    Name = ID = string.split(" ")[0]
    Description = string.split("\n")[0]
    Sequence = Seq(string.replace(Description,"").replace("\n",""))
    return SeqRecord(Sequence, id = ID, name = Name, description = Description)

In [6]:
records = []
for rec in fasta_list:
    records.append(str_to_SeqRecord(rec))

SeqIO.write(records,os.path.join("data","API imports",input("Input file name")),"fasta")

Input file name E_coli.faa


710