In [2]:
from Bio import SeqIO
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.PDB import PDBList
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *
import requests
import json
import time
import random
# Lower-case aliases for Python's booleans, so that JSON-style `true` / `false`
# tokens written inside a Python dict literal evaluate to real bool values.
true = True
false = False

In [44]:
# Smoke test of the RCSB data API: fetch the core record for entry 4HHB.
# A timeout is set so a stalled connection raises instead of blocking the
# kernel indefinitely (requests applies no timeout by default).
data = requests.get("https://data.rcsb.org/rest/v1/core/entry/4HHB", timeout=30)
# Last expression of the cell: the HTTP status code is shown as the output.
data.status_code

200

In [45]:
# Sequence-similarity search against the RCSB search API for one hard-coded
# protein sequence, returning matching PDB entries.
my_query = {
  "query": {
    "type": "terminal",
    "service": "sequence",
    "parameters": {
      "evalue_cutoff": 1,
      "identity_cutoff": 0.9,
      "sequence_type": "protein",
      "value": "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLPARTVETRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMNCKCVIS"
    }
  },
  "request_options": {
    "scoring_strategy": "sequence"
  },
  "return_type": "entry"
}

# The API takes the whole query as a JSON string in the `json` URL parameter.
my_query = json.dumps(my_query)

# Pass the JSON through `params=` so requests percent-encodes it; splicing it
# into the URL by hand leaves characters such as '&', '#' or '+' unescaped.
# The timeout stops a stalled connection from hanging the cell forever.
data = requests.get(
    "https://search.rcsb.org/rcsbsearch/v2/query",
    params={"json": my_query},
    timeout=60,
)

# Only a 200 response is parsed; any other status leaves `results`/`df` untouched.
if data.status_code == 200:
    results = data.json()
    df = pd.DataFrame.from_dict(results)
    # A bare `df` nested inside the `if` is never rendered by the notebook
    # (only the last top-level expression of a cell is), so display explicitly.
    display(df)

In [61]:
# Read every record of the FASTA file and keep each protein sequence as a plain string.
with open('data/aechaeoglobus_fulgidus.faa') as protfile:
    seqList = [str(record.seq) for record in SeqIO.parse(protfile, "fasta")]

# Column accumulators. They are filled in lock-step inside the loop below, so
# index i of every list describes the same protein.
dfdict = defaultdict(list)  # holds the 'sequence' column; converted to a DataFrame at the end
datalen = []                # sequence length
protpI = []                 # isoelectric point from ProteinAnalysis
protMW = []                 # molecular weight from ProteinAnalysis
prot_charge_at_pH = []      # net charge at pH 7 from ProteinAnalysis

# Progress bar widget: one tick per sequence queried.
loadBar = widgets.IntProgress(min=0, max=len(seqList), description=f'0/{len(seqList)}', bar_style='info', orientation='horizontal')
display(loadBar)

search_url = "https://search.rcsb.org/rcsbsearch/v2/query"

# One Session for the whole run reuses the underlying connection instead of
# opening a new one for each of the (potentially thousands of) requests.
with requests.Session() as session:
    for sequence in seqList:
        # Same sequence-similarity query for every protein; only "value" changes.
        my_query = json.dumps({
          "query": {
            "type": "terminal",
            "service": "sequence",
            "parameters": {
              "evalue_cutoff": 1,
              "identity_cutoff": 0.9,
              "sequence_type": "protein",
              "value": sequence
            }
          },
          "request_options": {
            "scoring_strategy": "sequence"
          },
          "return_type": "entry"
        })
        # `params=` lets requests percent-encode the JSON; the timeout turns a
        # stalled connection into an exception rather than an endless hang.
        APIdata = session.get(search_url, params={"json": my_query}, timeout=60)

        # A 200 response is treated as "this sequence has matches"; only those
        # proteins are recorded, along with their computed properties.
        if APIdata.status_code == 200:
            dfdict['sequence'].append(sequence)
            datalen.append(len(sequence))
            protparams = PA(sequence)
            protpI.append(protparams.isoelectric_point())
            protMW.append(protparams.molecular_weight())
            prot_charge_at_pH.append(protparams.charge_at_pH(7))

        # Advance the progress bar whether or not the sequence matched.
        loadBar.value += 1
        loadBar.description = f'Done:{loadBar.value}/{len(seqList)}'

loadBar.description = 'Search Complete'

# Assemble the result table: one row per protein that returned a 200 response.
df = pd.DataFrame.from_dict(dfdict)
df['length'] = datalen
df['IEP'] = protpI
df['MW'] = protMW
df['charge at pH 7'] = prot_charge_at_pH

# Cell output: a length-sorted view. `df` itself keeps its original row order.
df.sort_values(by=["length"])

IntProgress(value=0, bar_style='info', description='Done:0/2407', max=2407)

Unnamed: 0,sequence,length,IEP,MW,charge at pH 7
99,MPKIIEAVYENGVFKPLQKVDLREGERE,28,5.232848,3288.7701,-1.483251
115,MHSRFVKVKCPDCEHEQVIFDHPSTIVKCIICGRTVAEPTGGKGNI...,58,5.851370,6513.4814,-2.258278
100,MPKIIEAVYENGVFKPLQKVDLREGEKVKIIAGNLVERLRKYRVKV...,59,9.452177,6887.0168,2.514339
191,MPKIIEAVYENGVFKPLQKVDLKEGERVKIKLELKVEPIDLGEPVS...,61,5.411379,6981.1366,-1.480999
101,MPKIIEAIYENGVFKPLQKVDLKEGERVRVVVSEVVAKTRGLLKGC...,61,4.784786,6920.0319,-4.481723
...,...,...,...,...,...
126,MCMDRIEKLIKKVSKPARLSVERCRLYTESMKQTEGEPMIIRQAKA...,776,5.155490,87165.5748,-21.743197
48,MVKDTYISSASKTPPMERTVRVTGMTCAMCVKSIETAVGSLEGVEE...,804,5.604234,86430.8507,-10.700651
41,MQNAESWFKKYWHLSVLVIAALISVKLRILNPWNSVFTWTVRLGGN...,868,8.335391,98252.0528,4.018070
195,MTLDEEYLDITFLTENGFVRKRCPKCGKHFWTADPEREICGDPPCE...,906,5.283264,102535.1316,-30.690231


In [84]:
# Show the current contents of `df` as this cell's output.
df

Unnamed: 0,sequence,length,IEP,MW,charge at pH 7
0,MFKRETKDFINIDPLQTGGKLTEEARQALLEWGDGYSVCDFCTTGR...,371,7.140535,41929.7438,0.378700
1,MELPSFIFQAQENLVERPWGGEWIALLKGFRQSGIGESWEFSAHTS...,299,5.300145,33495.6303,-9.905084
2,MTPVGMDRKSLSLLILIVLLGLCIRLQNFGEIFDSRIYYYGYDPYY...,593,9.034035,68171.2664,11.988620
3,MKICVFHDYFGAIGGGEKVALTISKLFNADVITTDVDAVPEEFRNK...,363,9.152270,42108.6242,13.874878
4,MEKRQFMKMKEKLKRACFEFAVSNRYLYNLAKRILDSSPKLQKIKE...,473,8.202200,55346.1547,3.305100
...,...,...,...,...,...
207,MRAAVVYKTDGHVKRIEEALKRLEVEVELFNQPSEELENFDFIVSV...,249,6.249072,27868.0936,-1.304459
208,MRSLRANGYNFRQDTRHYVYVVSQSNNWQQGRHLIDDPGKFYFDPL...,506,4.752160,55503.6214,-20.924925
209,MQKRVTDEEIKERLGKIKSRIAVMSGKGGVGKSTVTALLAVHYARQ...,254,6.022000,27795.0284,-2.211083
210,MAKALEQPFDVANIPGPKMATLLEKGKPVANMIKKAKRPLLIVGPD...,175,5.813117,19701.7361,-3.128491


'MPKIIEAVYENGVFKPLQKVDLREGERE'

In [96]:

# Spot-check: re-run the sequence search for one randomly chosen row of `df`
# and print the raw response plus the identifier of its first hit.

# Draw the row position from the actual frame length. The previous hard-coded
# `random.randint(0, 212)` is inclusive on both ends and could exceed the last
# valid position, raising IndexError in `df.iloc`.
random_row = random.randrange(len(df))

my_query = {
  "query": {
    "type": "terminal",
    "service": "sequence",
    "parameters": {
      "evalue_cutoff": 1,
      "identity_cutoff": 0.9,
      "sequence_type": "protein",
      "value": df.iloc[random_row]["sequence"]
    }
  },
  "request_options": {
    "scoring_strategy": "sequence"
  },
  "return_type": "entry"
}
my_query = json.dumps(my_query)

# `params=` percent-encodes the JSON; the timeout prevents an indefinite hang.
APIcheck = requests.get(
    "https://search.rcsb.org/rcsbsearch/v2/query",
    params={"json": my_query},
    timeout=60,
)

# Parse the body only on a 200 response, as the other search cells do;
# otherwise report the status instead of failing inside `.json()`.
if APIcheck.status_code == 200:
    protdata = APIcheck.json()
    print(protdata)
    print(protdata['result_set'][0]['identifier'])
else:
    print(f"No search results returned (HTTP {APIcheck.status_code})")

{'query_id': 'd2459c26-c367-4b0a-8c02-10e89721fd9d', 'result_type': 'entry', 'total_count': 1, 'result_set': [{'identifier': '3DF7', 'score': 1.0}]}
3DF7


In [19]:
# Text search: every protein polymer entity whose source-organism lineage
# contains the phrase "Pyrococcus abyssi". Python's own True/False are used so
# the literal does not depend on notebook-level `true`/`false` aliases; key
# order is unchanged, so the serialised JSON is the same as before.
my_query2 = {
    "query": {
        "type": "group",
        "nodes": [
            {
                "type": "terminal",
                "service": "text",
                "parameters": {
                    "attribute": "rcsb_entity_source_organism.taxonomy_lineage.name",
                    "negation": False,
                    "operator": "contains_phrase",
                    "value": "Pyrococcus abyssi",
                },
            },
            {
                "type": "terminal",
                "service": "text",
                "parameters": {
                    "attribute": "entity_poly.rcsb_entity_polymer_type",
                    "value": "Protein",
                    "operator": "exact_match",
                },
            },
        ],
        "logical_operator": "and",
        "label": "text",
    },
    "return_type": "polymer_entity",
    "request_options": {
        "return_all_hits": True,
        "results_verbosity": "minimal",
        "results_content_type": ["experimental"],
        "sort": [{"sort_by": "score", "direction": "desc"}],
        "scoring_strategy": "combined",
    },
}
my_query2 = json.dumps(my_query2)

# `params=` percent-encodes the JSON (the query value contains a space);
# the timeout keeps a stalled connection from blocking the kernel.
APIdata2 = requests.get(
    "https://search.rcsb.org/rcsbsearch/v2/query",
    params={"json": my_query2},
    timeout=60,
)
# Surface HTTP 4xx/5xx responses as an explicit error.
APIdata2.raise_for_status()

# An empty response body is treated as "no hits" rather than being passed to
# the JSON decoder, which would raise on it.
protdata = APIdata2.json() if APIdata2.content else {}

# Collect the identifier of every returned hit.
identifiers = [entry['identifier'] for entry in protdata.get('result_set', [])]



In [18]:
# Create a Bio.PDB PDBList object with its default settings.
# NOTE(review): presumably used by later cells to fetch structure files for the
# collected identifiers — that code is not visible here; confirm.
pdblist = PDBList()