In [27]:
from Bio import SeqIO
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.PDB import PDBList
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *
import requests
import json
import time
import random
import re
true = True
false = False

In [3]:
data = requests.get("https://data.rcsb.org/rest/v1/core/entry/4HHB")
data.status_code

200

In [4]:
my_query = {
  "query": {
    "type": "terminal",
    "service": "sequence",
    "parameters": {
      "evalue_cutoff": 1,
      "identity_cutoff": 0.9,
      "sequence_type": "protein",
      "value": "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLPARTVETRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMNCKCVIS"
    }
  },
  "request_options": {
    "scoring_strategy": "sequence"
  },
  "return_type": "entry"
}

my_query = json.dumps(my_query)
data = requests.get(f"https://search.rcsb.org/rcsbsearch/v2/query?json={my_query}")
if data.status_code == 200:
    results = data.json()
    df = pd.DataFrame.from_dict(results)
    df

In [56]:
seqList = []
with open('data/aechaeoglobus_fulgidus.faa') as protfile: #Generates a list of each protein sequence
  for record in SeqIO.parse(protfile,"fasta"):
    sequence = str(record.seq)
    seqList.append(sequence)

dfdict = {
    'sequence' : [],
    'length' : [],
    'IEP' : [],
    'MW' : [],
    'charge at pH 7' : [],
}

loadBar = widgets.IntProgress(min=0, max=len(seqList), description='0%', bar_style='info', orientation='horizontal') #Displays loading bar for convenience
display(loadBar)

for sequence in seqList[:100]: #Checks each protein of RCSB API for sequence matches
    my_query = {
      "query": {
        "type": "terminal",
        "service": "sequence",
        "parameters": {
          "evalue_cutoff": 1,
          "identity_cutoff": 0.9,
          "sequence_type": "protein",
          "value": sequence
        }
      },
      "request_options": {
        "scoring_strategy": "sequence"
      },
      "return_type": "entry"
    }
    my_query = json.dumps(my_query)
    APIdata = requests.get(f"https://search.rcsb.org/rcsbsearch/v2/query?json={my_query}")
    if APIdata.status_code == 200: #Adds each protein with sequence matches to a dataframe
        length = len(sequence)
        dfdict['sequence'].append(sequence)
        dfdict['length'].append(length)
        protparams = PA(sequence)
        dfdict['IEP'].append(protparams.isoelectric_point())
        dfdict['MW'].append(protparams.molecular_weight())
        dfdict['charge at pH 7'].append(protparams.charge_at_pH(7))
  
    #Iterates loading bar
    loadBar.value += 1
    loadBar.description = f'Done:{100*loadBar.value/len(seqList):0.2f}%'

loadBar.description = f'Search Complete'
df = pd.DataFrame.from_dict(dfdict)

IntProgress(value=0, bar_style='info', description='0%', max=2407)

Unnamed: 0,sequence,length,IEP,MW,charge at pH 7
0,MFKRETKDFINIDPLQTGGKLTEEARQALLEWGDGYSVCDFCTTGR...,371,7.140535,41929.7438,0.3787
1,MELPSFIFQAQENLVERPWGGEWIALLKGFRQSGIGESWEFSAHTS...,299,5.300145,33495.6303,-9.905084
2,MTPVGMDRKSLSLLILIVLLGLCIRLQNFGEIFDSRIYYYGYDPYY...,593,9.034035,68171.2664,11.98862
3,MKICVFHDYFGAIGGGEKVALTISKLFNADVITTDVDAVPEEFRNK...,363,9.15227,42108.6242,13.874878
4,MEKRQFMKMKEKLKRACFEFAVSNRYLYNLAKRILDSSPKLQKIKE...,473,8.2022,55346.1547,3.3051
5,MKFACRAITRGRAEGEALVTKEYISFLGGIDKETGIVKEDCEIKGE...,132,5.682728,14189.2879,-1.481504
6,MDYFRLAEKFLREMHAKYMKRVSRPGNTPRPWFDFSEERLLSRLFE...,93,4.961669,11459.907,-5.386259
7,MELKYKIGFPSLYYPKISLADRIDAAAEKFGEKTAIISAEPKFPSE...,542,5.296621,60995.6118,-15.642612
8,MFLKVRAEKRLGNFRLNVDFEMGRDYCVLLGPTGAGKSVFLELIAG...,240,9.480414,27026.4138,5.894878
9,MRLLFSALLALLSSIILLFVLLPVAATVTLQLFNFDEFLKAASDPA...,261,7.970436,27832.9487,0.77981


In [64]:
df2 = df.copy()
df2['IEP'] = df2['IEP'].map(lambda x: f"{x:.02f}")
df2['charge at pH 7'] = df2['charge at pH 7'].map(lambda x: f"{x:.02f}")
df2['MW'] = df2['MW'].map(lambda x: f"{x:.0f}")
df2

Unnamed: 0,sequence,length,IEP,MW,charge at pH 7
0,MFKRETKDFINIDPLQTGGKLTEEARQALLEWGDGYSVCDFCTTGR...,371,7.14,41930,0.38
1,MELPSFIFQAQENLVERPWGGEWIALLKGFRQSGIGESWEFSAHTS...,299,5.3,33496,-9.91
2,MTPVGMDRKSLSLLILIVLLGLCIRLQNFGEIFDSRIYYYGYDPYY...,593,9.03,68171,11.99
3,MKICVFHDYFGAIGGGEKVALTISKLFNADVITTDVDAVPEEFRNK...,363,9.15,42109,13.87
4,MEKRQFMKMKEKLKRACFEFAVSNRYLYNLAKRILDSSPKLQKIKE...,473,8.2,55346,3.31
5,MKFACRAITRGRAEGEALVTKEYISFLGGIDKETGIVKEDCEIKGE...,132,5.68,14189,-1.48
6,MDYFRLAEKFLREMHAKYMKRVSRPGNTPRPWFDFSEERLLSRLFE...,93,4.96,11460,-5.39
7,MELKYKIGFPSLYYPKISLADRIDAAAEKFGEKTAIISAEPKFPSE...,542,5.3,60996,-15.64
8,MFLKVRAEKRLGNFRLNVDFEMGRDYCVLLGPTGAGKSVFLELIAG...,240,9.48,27026,5.89
9,MRLLFSALLALLSSIILLFVLLPVAATVTLQLFNFDEFLKAASDPA...,261,7.97,27833,0.78


In [96]:

my_query = {
  "query": {
    "type": "terminal",
    "service": "sequence",
    "parameters": {
      "evalue_cutoff": 1,
      "identity_cutoff": 0.9,
      "sequence_type": "protein",
      "value": df.iloc[random.randint(0,212)]["sequence"]
    }
  },
  "request_options": {
    "scoring_strategy": "sequence"
  },
  "return_type": "entry"
}
my_query = json.dumps(my_query)
APIcheck = requests.get(f"https://search.rcsb.org/rcsbsearch/v2/query?json={my_query}")
protdata = APIcheck.json()
print(protdata)
print(protdata['result_set'][0]['identifier'])

{'query_id': 'd2459c26-c367-4b0a-8c02-10e89721fd9d', 'result_type': 'entry', 'total_count': 1, 'result_set': [{'identifier': '3DF7', 'score': 1.0}]}
3DF7


In [19]:
my_query2 = {"query":{"type":"group","nodes":[{"type":"terminal","service":"text","parameters":{"attribute":"rcsb_entity_source_organism.taxonomy_lineage.name","negation":false,"operator":"contains_phrase","value":"Pyrococcus abyssi"}},{"type":"terminal","service":"text","parameters":{"attribute":"entity_poly.rcsb_entity_polymer_type","value":"Protein","operator":"exact_match"}}],"logical_operator":"and","label":"text"},"return_type":"polymer_entity","request_options":{"return_all_hits":true,"results_verbosity":"minimal","results_content_type":["experimental"],"sort":[{"sort_by":"score","direction":"desc"}],"scoring_strategy":"combined"}}
my_query2 = json.dumps(my_query2)
APIdata2 = requests.get(f"https://search.rcsb.org/rcsbsearch/v2/query?json={my_query2}")
protdata = APIdata2.json()

identifiers = []
for entry in protdata['result_set']:
    identifiers.append(entry['identifier'])



In [18]:
pdblist = PDBList()

In [66]:
taxid = input("Enter Taxonomy ID")
url = f'https://rest.uniprot.org/uniprotkb/stream?compressed=false&format=fasta&query=%28organism_id%3A{taxid}%29%20AND%20%28reviewed%3Atrue%29'
all_fastas = requests.get(url).text

fasta_list = re.split(r'\n(?=>)', all_fastas)
fasta_list

Enter Taxonomy ID 10181


['>sp|A0A0P6JG37|ASAH1_HETGA Acid ceramidase OS=Heterocephalus glaber OX=10181 GN=ASAH1 PE=1 SV=1\nMLGRSRLTFVLLAAAVTCAEAQHAPPWTEDCRKSTYPPSGPTYRGPVPWYTINLDLPPYK\nRWHELMVDKGPMLKIIVNSFKNMVNTFVPSGKVMQMVDQKLPDLLGQFSGPYEEEMKGIA\nDVTEIPLGEIISFNIFYELFTMCTSIITEDKKGHLLHVRNMDFGIFLGWNINNNTWVITE\nELKPLTVNLDFQRNSKTVFKATSFAGYVGMLTGFKPGQFSLTLNERFSMNGGYLGLLEWI\nLGKKDASWIGFITRSVLENATSYEEAKNILAKTKLLAPAYFILGGNQSGEGCVITRERKD\nSLDIYELDPKQGRWYVVQTNYDRWKNPLFLDDRRTPAQTCLKRTTQENLSFATLYDILST\nKPVLNKLTVFTALMDVTKNHYEAYLRDCPDPCVGW',
 '>sp|G5AY81|HYAS2_HETGA Hyaluronan synthase 2 OS=Heterocephalus glaber OX=10181 GN=Has2 PE=2 SV=1\nMHCERFLCILRIIGTTLFGVSLLLGITAAYIVGYQFIQTDNYYFSFGLYGAFLASHLIIQ\nSLFAFLEHRKMKKSLETPIKLNKTVALCIAAYQEDPDYLRKCLQSVKRLTYPGIKVVMVI\nDGNSDDDLYMMDIFSEVMGRDKSATYIWKNNFHEKGPGETDESHKESSQHVTQLVLSSKS\nVCIMQKWGGKREVMYTAFRALGRSVDYVQVCDSDTMLDPASSVEMVKVLEEDPMVGGVGG\nDVQILNKYDSWISFLSSVRYWMAFNIERACQSYFGCVQCISGPLGMYRNSLLHEFVEDWY\nSQEFMGNQCSFGDDRHLTNRVLSLGYATKYTARSKCLTETPIEYLRWLNQQTRWSKSYFR\nEWLYNAMWFHKHHLWMTYE

In [67]:
def str_to_SeqRecord(string):
    string = string.replace(">","")
    Name = ID = string.split(" ")[0]
    Description = string.split("\n")[0]
    Sequence = Seq(string.replace(Description,"").replace("\n",""))
    return SeqRecord(Sequence, id = ID, name = Name, description = Description)

In [68]:
records = []
for rec in fasta_list:
    records.append(str_to_SeqRecord(rec))

SeqIO.write(records,os.path.join("data","API imports",input("Input file name")),"fasta")

Input file name Heterocephalus_glaber.faa


6

In [65]:
loadBar = widgets.IntProgress(min=0, max=len(records), description='0%', bar_style='info', orientation='horizontal')
loadCount = 0
display(loadBar)
for rec in records:
    if rec.id == "sp|A0A087X1C5|CP2D7_HUMAN":
        print(rec)
        print(loadCount)
    loadCount += 1
    if loadCount % 10 == 0:
        loadBar.value += 100
        loadBar.description = f'Done:{100*loadBar.value/len(records):0.2f}%'

IntProgress(value=0, bar_style='info', description='0%', max=20422)

ID: sp|A0A087X1C5|CP2D7_HUMAN
Name: sp|A0A087X1C5|CP2D7_HUMAN
Description: sp|A0A087X1C5|CP2D7_HUMAN Putative cytochrome P450 2D7 OS=Homo sapiens OX=9606 GN=CYP2D7 PE=5 SV=1
Number of features: 0
Seq('MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNLLHVDFQNT...VPR')
0


20422