In [15]:
from Bio import SeqIO
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.PDB import PDBList
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *
import requests
import json
import time
import random
# Alias JSON's lowercase boolean literals to Python booleans so that query
# dictionaries pasted from the RCSB Advanced Search page (which use bare
# `true` / `false`) evaluate as ordinary Python dict literals.
true = True
false = False

In [16]:
# Smoke-test the connection to the PDB by fetching one known entry (1PQ5).
# The timeout keeps this cell from hanging indefinitely if the server is unreachable.
data = requests.get("https://data.rcsb.org/rest/v1/core/entry/1pq5", timeout=30)
data.status_code  # a status code = 200 means the connection is working

200

### Solution
The original search yielded 5711 structures that contain the same catalytic site residue combination as trypsin (PDB ID = 1PQ5). If you change

```   
      "computational",            
      "experimental"  
```

to 

```   
      "computational"
```

then the yield decreases to 2105 structures (based on RCSB PDB contents as of 30 May 2023), reflecting the number of computed structure models (CSMs) that contain a similar active site as trypsin. 


### Search for structures of a particular protein

Users often provide the name of a protein and ask how to find its structure in the [RCSB PDB](https://rcsb.org). There are many approaches that can be used, including a UniProt ID search, a sequence search, a sequence similarity search, and/or a structural similarity search, depending on how much information the user has to begin with.  

#### Example #1: Proteins that share 90% sequence identity with a given structure

In this example, we query the RCSB PDB for proteins that share at least 90% sequence identity with this sequence from a RAS protein. The protein structure in the PDB is [4Q21](https://www.rcsb.org/structure/4Q21) and it has the following sequence (the same sequence that is submitted in the query below): MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS

The code shown below was obtained from the [Advanced Search Interface](https://www.rcsb.org/search/advanced) of the RCSB PDB website. The advanced query for sequence similarity looks like this:

![The advanced query for sequence similarity](images/Adv_search_query_builder_seq_similarity.png "The Advanced Query for Sequence Similarity")

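To complete this search properly, you must also tell the search to Return Polymer Entities, then click the Search button. As of 31 May 2023, this search yielded 491 hits that share at least 90% sequence identity with PDB entry 4Q21. Note that the code below sets `"return_type": "entry"`, so it counts whole PDB entries rather than polymer entities and may therefore report a different total. 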

Once the search has been executed, the formal query is listed on the top of the Advanced Search Query page. 

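It's possible to run this exact search from Python: the query is written in JSON (JavaScript Object Notation), the same format shown on the Advanced Search Query page, and is sent to the RCSB PDB Search API. 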



In [26]:
# Sequence search: ask the RCSB PDB Search API for entries that match the
# query protein sequence, using an identity cutoff of 0.9 and an E-value cutoff of 1.
my_query = {
  "query": {
    "type": "terminal",
    "service": "sequence",
    "parameters": {
      "evalue_cutoff": 1,
      "identity_cutoff": 0.9,
      "sequence_type": "protein",
      "value": "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS"
    }
  },
  "request_options": {
    "scoring_strategy": "sequence"
  },
  "return_type": "entry"
}

my_query = json.dumps(my_query)
# Pass the JSON through `params` so requests URL-encodes it properly (characters
# such as '&', '#' or '+' inside a value would otherwise corrupt the URL), and
# set a timeout so the cell cannot hang forever.
data = requests.get(
    "https://search.rcsb.org/rcsbsearch/v2/query",
    params={"json": my_query},
    timeout=60,
)
data.raise_for_status()  # surface HTTP errors instead of silently displaying a stale `df`
if data.status_code == 200:
    results = data.json()
    df = pd.DataFrame.from_dict(results)
else:
    # Any other success status has no usable body (the Search API answers
    # 204 No Content when nothing matches), so show an empty table.
    results = {}
    df = pd.DataFrame()
df

Unnamed: 0,query_id,result_type,total_count,result_set
0,109c0bba-a801-4dde-8ef1-3e97e21db448,entry,449,"{'identifier': '4Q21', 'score': 1.0}"
1,109c0bba-a801-4dde-8ef1-3e97e21db448,entry,449,"{'identifier': '5X9S', 'score': 0.695121951219..."
2,109c0bba-a801-4dde-8ef1-3e97e21db448,entry,449,"{'identifier': '1AA9', 'score': 0.475609756097..."
3,109c0bba-a801-4dde-8ef1-3e97e21db448,entry,449,"{'identifier': '1IOZ', 'score': 0.475609756097..."
4,109c0bba-a801-4dde-8ef1-3e97e21db448,entry,449,"{'identifier': '1Q21', 'score': 0.475609756097..."
5,109c0bba-a801-4dde-8ef1-3e97e21db448,entry,449,"{'identifier': '6Q21', 'score': 0.475609756097..."
6,109c0bba-a801-4dde-8ef1-3e97e21db448,entry,449,"{'identifier': '7VV9', 'score': 0.463414634146..."
7,109c0bba-a801-4dde-8ef1-3e97e21db448,entry,449,"{'identifier': '2Q21', 'score': 0.439024390243..."
8,109c0bba-a801-4dde-8ef1-3e97e21db448,entry,449,"{'identifier': '6AMB', 'score': 0.414634146341..."
9,109c0bba-a801-4dde-8ef1-3e97e21db448,entry,449,"{'identifier': '6KYH', 'score': 0.390243902439..."


#### Example #2: search by Uniprot ID

In this example, we query the RCSB PDB for proteins with the UniProt ID for this sequence from a RAS protein: P01112. The results are identical to those obtained from an Advanced Search on the RCSB PDB site.

In [32]:
# UniProt accession search: two text conditions combined with AND — the
# reference-sequence accession must be in the given list, and the reference
# database name must be exactly "UniProt".
my_Uniprot_query = {
  "query": {
    "type": "group",
    "logical_operator": "and",
    "nodes": [
      {
        "type": "terminal",
        "service": "text",
        "parameters": {
          "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
          "operator": "in",
          "negation": false,
          "value": [
            "P01112"       # here is the UniProt ID for this search. You could replace this with another UniProt ID of your choice.
          ]
        }
      },
      {
        "type": "terminal",
        "service": "text",
        "parameters": {
          "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_name",
          "operator": "exact_match",
          "value": "UniProt",
          "negation": false
        }
      }
    ],
    "label": "nested-attribute"
  },
  "return_type": "entry",
  "request_options": {
    "paginate": {
      "start": 0,
      "rows": 25
    },
    "results_content_type": [
      "experimental"
    ],
    "sort": [
      {
        "sort_by": "score",
        "direction": "desc"
      }
    ],
    "scoring_strategy": "combined"
  }
}

my_Uniprot_query = json.dumps(my_Uniprot_query)
# Pass the JSON through `params` so requests URL-encodes it properly, and set a
# timeout so the cell cannot hang forever.
data = requests.get(
    "https://search.rcsb.org/rcsbsearch/v2/query",
    params={"json": my_Uniprot_query},
    timeout=60,
)
data.raise_for_status()  # surface HTTP errors instead of silently displaying a stale `df`
if data.status_code == 200:
    results = data.json()
    df = pd.DataFrame.from_dict(results)
else:
    # Any other success status has no usable body (the Search API answers
    # 204 No Content when nothing matches), so show an empty table.
    results = {}
    df = pd.DataFrame()
df

Unnamed: 0,query_id,result_type,total_count,result_set
0,3255443f-c82e-4691-a3ac-55cbac73a515,entry,209,"{'identifier': '121P', 'score': 1.0}"
1,3255443f-c82e-4691-a3ac-55cbac73a515,entry,209,"{'identifier': '1AA9', 'score': 1.0}"
2,3255443f-c82e-4691-a3ac-55cbac73a515,entry,209,"{'identifier': '1AGP', 'score': 1.0}"
3,3255443f-c82e-4691-a3ac-55cbac73a515,entry,209,"{'identifier': '1BKD', 'score': 1.0}"
4,3255443f-c82e-4691-a3ac-55cbac73a515,entry,209,"{'identifier': '1CLU', 'score': 1.0}"
5,3255443f-c82e-4691-a3ac-55cbac73a515,entry,209,"{'identifier': '1CRP', 'score': 1.0}"
6,3255443f-c82e-4691-a3ac-55cbac73a515,entry,209,"{'identifier': '1CRQ', 'score': 1.0}"
7,3255443f-c82e-4691-a3ac-55cbac73a515,entry,209,"{'identifier': '1CRR', 'score': 1.0}"
8,3255443f-c82e-4691-a3ac-55cbac73a515,entry,209,"{'identifier': '1CTQ', 'score': 1.0}"
9,3255443f-c82e-4691-a3ac-55cbac73a515,entry,209,"{'identifier': '1GNP', 'score': 1.0}"


#### Example #3: Structural Similarity Search

In this example, a structure search is based on the structure of lysozyme (PDB ID = 9LYZ). 

In [36]:
# Structure-similarity search: compare assemblies in the archive against
# assembly 1 of PDB entry 9LYZ using the "strict_shape_match" operator.
my_structure_query = {
  "query": {
    "type": "terminal",
    "service": "structure",
    "parameters": {
      "operator": "strict_shape_match",
      "target_search_space": "assembly",
      "value": {
        "entry_id": "9LYZ",
        "assembly_id": "1"
      }
    }
  },
  "return_type": "assembly",
  "request_options": {
    "paginate": {
      "start": 0,
      "rows": 25
    },
    "results_content_type": [
      "experimental"
    ],
    "sort": [
      {
        "sort_by": "score",
        "direction": "desc"
      }
    ],
    "scoring_strategy": "combined"
  }
}

my_structure_query = json.dumps(my_structure_query)
# Pass the JSON through `params` so requests URL-encodes it properly, and set a
# timeout so the cell cannot hang forever.
data = requests.get(
    "https://search.rcsb.org/rcsbsearch/v2/query",
    params={"json": my_structure_query},
    timeout=60,
)
data.raise_for_status()  # surface HTTP errors instead of silently displaying a stale `df`
if data.status_code == 200:
    results = data.json()
    df = pd.DataFrame.from_dict(results)
else:
    # Any other success status has no usable body (the Search API answers
    # 204 No Content when nothing matches), so show an empty table.
    results = {}
    df = pd.DataFrame()
df

Unnamed: 0,query_id,result_type,total_count,result_set
0,b95360fe-66f5-4249-9c9c-6c1caaf020f8,assembly,1443,"{'identifier': '6LYZ-1', 'score': 1.0}"
1,b95360fe-66f5-4249-9c9c-6c1caaf020f8,assembly,1443,"{'identifier': '9LYZ-1', 'score': 1.0}"
2,b95360fe-66f5-4249-9c9c-6c1caaf020f8,assembly,1443,"{'identifier': '3LYZ-1', 'score': 0.8606413418..."
3,b95360fe-66f5-4249-9c9c-6c1caaf020f8,assembly,1443,"{'identifier': '8LYZ-1', 'score': 0.8403579397..."
4,b95360fe-66f5-4249-9c9c-6c1caaf020f8,assembly,1443,"{'identifier': '2LYZ-1', 'score': 0.8366206429..."
5,b95360fe-66f5-4249-9c9c-6c1caaf020f8,assembly,1443,"{'identifier': '2LZH-1', 'score': 0.8213374156..."
6,b95360fe-66f5-4249-9c9c-6c1caaf020f8,assembly,1443,"{'identifier': '1LZH-2', 'score': 0.8058715681..."
7,b95360fe-66f5-4249-9c9c-6c1caaf020f8,assembly,1443,"{'identifier': '1LZH-1', 'score': 0.7991286568..."
8,b95360fe-66f5-4249-9c9c-6c1caaf020f8,assembly,1443,"{'identifier': '4NFV-1', 'score': 0.7954033405..."
9,b95360fe-66f5-4249-9c9c-6c1caaf020f8,assembly,1443,"{'identifier': '6QY1-1', 'score': 0.7927515000..."


### Example 4: Searching for active site residues
The query below is designed to search for proteins that contain a set of active site residues that align with the designated residues in the model. The code in this window is json, taken from the Advanced Search window of the RCSB PDB web site.

If you look in the json code, you can see comments that explain the significance of critical positions in the json. You could also manually change the json code to do something slightly different. For example, try running the code the first time and see how many structures are selected. Then modify the code to yield only computed structure models. 

In [18]:
# Structure-motif search: find assemblies containing three residues arranged
# like the selected residues (41, 84 and 180 of chain A) in PDB entry 1PQ5.
my_query = {
  "query": {
    "type": "terminal",
    "service": "strucmotif",
    "parameters": {
      "value": {
        "residue_ids": [
          {
            "label_asym_id": "A",  # chain A
            "struct_oper_id": "1",
            "label_seq_id": 41     # residue 41. In the case of 1PQ5, this is His41, but the json does not include the residue
          },
          {
            "label_asym_id": "A",
            "struct_oper_id": "1",
            "label_seq_id": 84     # residue 84. This is Asp84
          },
          {
            "label_asym_id": "A",
            "struct_oper_id": "1",
            "label_seq_id": 180    # Ser180
          }
        ],
        "entry_id": "1PQ5"         # the PDB id
      },
      "rmsd_cutoff": 2,            # the RMSD cutoff for an alignment
      "atom_pairing_scheme": "ALL",
      "exchanges": [
        {
          "residue_id": {
            "label_asym_id": "A",
            "struct_oper_id": "1",
            "label_seq_id": 41
          },
          "allowed": [
            "HIS"                   # includes options for residues. In this case it is only HIS, but it might be Glu/Asp in another case
          ]
        },
        {
          "residue_id": {
            "label_asym_id": "A",
            "struct_oper_id": "1",
            "label_seq_id": 84
          },
          "allowed": [
            "ASP"
          ]
        },
        {
          "residue_id": {
            "label_asym_id": "A",
            "struct_oper_id": "1",
            "label_seq_id": 180
          },
          "allowed": [
            "SER"
          ]
        }
      ]
    }
  },
  "return_type": "assembly",
  "request_options": {
    "results_content_type": [
      "computational",            # This yields the computed structure models that align well (RMSD < 2.0 Angstroms)
      "experimental"             # This yields experimentally determined structure models that align well
    ],
    "paginate": {
      "start": 0,
      "rows": 25
    },
    "sort": [
      {
        "sort_by": "score",
        "direction": "desc"
      }
    ],
    "scoring_strategy": "combined"
  }
}

my_query = json.dumps(my_query)
# Pass the JSON through `params` so requests URL-encodes it properly, and set a
# timeout so the cell cannot hang forever.
data = requests.get(
    "https://search.rcsb.org/rcsbsearch/v2/query",
    params={"json": my_query},
    timeout=60,
)
data.raise_for_status()  # surface HTTP errors instead of silently displaying a stale `df`
if data.status_code == 200:
    results = data.json()
    df = pd.DataFrame.from_dict(results)
else:
    # Any other success status has no usable body (the Search API answers
    # 204 No Content when nothing matches), so show an empty table.
    results = {}
    df = pd.DataFrame()

df

Unnamed: 0,query_id,result_type,total_count,result_set
0,d5238e8f-fd35-49e5-8546-380c1cfbd879,assembly,5711,"{'identifier': '1PQ5-1', 'score': 1.0}"
1,d5238e8f-fd35-49e5-8546-380c1cfbd879,assembly,5711,"{'identifier': '1XVO-1', 'score': 0.97}"
2,d5238e8f-fd35-49e5-8546-380c1cfbd879,assembly,5711,"{'identifier': '1PQ7-1', 'score': 0.965}"
3,d5238e8f-fd35-49e5-8546-380c1cfbd879,assembly,5711,"{'identifier': '1GDN-1', 'score': 0.96}"
4,d5238e8f-fd35-49e5-8546-380c1cfbd879,assembly,5711,"{'identifier': '1PQ8-1', 'score': 0.96}"
5,d5238e8f-fd35-49e5-8546-380c1cfbd879,assembly,5711,"{'identifier': '1FN8-1', 'score': 0.955}"
6,d5238e8f-fd35-49e5-8546-380c1cfbd879,assembly,5711,"{'identifier': '1GDQ-1', 'score': 0.955}"
7,d5238e8f-fd35-49e5-8546-380c1cfbd879,assembly,5711,"{'identifier': '1GDU-1', 'score': 0.955}"
8,d5238e8f-fd35-49e5-8546-380c1cfbd879,assembly,5711,"{'identifier': '1XVM-1', 'score': 0.955}"
9,d5238e8f-fd35-49e5-8546-380c1cfbd879,assembly,5711,"{'identifier': '1FY4-1', 'score': 0.95}"


#### Example #5: Combining searches

The search question this time is, "What computed structure models based on bacterial sequences share the active site residues of trypsin?"


### Search for structures from a specific organism that are in the RCSB PDB

In [7]:
# Read every protein sequence from the FASTA file as a plain string.
with open('data/aechaeoglobus_fulgidus.faa') as protfile:
    seqList = [str(record.seq) for record in SeqIO.parse(protfile, "fasta")]

matched_records = []   # one dict per sequence that has at least one match in the RCSB PDB
failed_sequences = []  # sequences whose request errored, so their match status is unknown

loadBar = widgets.IntProgress(min=0, max=len(seqList), description=f'0/{len(seqList)}', bar_style='info', orientation='horizontal') #Displays loading bar for convenience
display(loadBar)

# One Session reuses the HTTPS connection across the (potentially thousands of) requests.
with requests.Session() as session:
    for sequence in seqList: #Checks each protein against the RCSB API for sequence matches
        my_query = {
          "query": {
            "type": "terminal",
            "service": "sequence",
            "parameters": {
              "evalue_cutoff": 1,
              "identity_cutoff": 0.9,
              "sequence_type": "protein",
              "value": sequence
            }
          },
          "request_options": {
            "scoring_strategy": "sequence"
          },
          "return_type": "entry"
        }
        my_query = json.dumps(my_query)
        try:
            # `params` URL-encodes the JSON; the timeout stops one stalled
            # request from freezing the whole loop.
            APIdata = session.get(
                "https://search.rcsb.org/rcsbsearch/v2/query",
                params={"json": my_query},
                timeout=60,
            )
        except requests.RequestException:
            failed_sequences.append(sequence)
        else:
            if APIdata.status_code == 200: #Keeps each protein that has sequence matches
                protparams = PA(sequence)
                matched_records.append({
                    'sequence': sequence,
                    'length': len(sequence),
                    'IEP': protparams.isoelectric_point(),
                    'MW': protparams.molecular_weight(),
                    'charge at pH 7': protparams.charge_at_pH(7),
                })
            elif APIdata.status_code != 204:
                # 204 means "no hits"; anything else (e.g. rate limiting or a
                # server error) is a failed search, not a negative result.
                failed_sequences.append(sequence)

        #Iterates loading bar
        loadBar.value += 1
        loadBar.description = f'Done:{loadBar.value}/{len(seqList)}'

loadBar.description = 'Search Complete'
if failed_sequences:
    print(f"Warning: {len(failed_sequences)} of {len(seqList)} searches failed and were skipped")

# Build the frame in one step; naming the columns keeps them present (and in
# this order) even when no sequence matched.
df = pd.DataFrame(matched_records, columns=['sequence', 'length', 'IEP', 'MW', 'charge at pH 7'])

df.sort_values(by=["length"])

IntProgress(value=0, bar_style='info', description='0/2407', max=2407)

Unnamed: 0,sequence,length,IEP,MW,charge at pH 7
99,MPKIIEAVYENGVFKPLQKVDLREGERE,28,5.232848,3288.7701,-1.483251
115,MHSRFVKVKCPDCEHEQVIFDHPSTIVKCIICGRTVAEPTGGKGNI...,58,5.851370,6513.4814,-2.258278
100,MPKIIEAVYENGVFKPLQKVDLREGEKVKIIAGNLVERLRKYRVKV...,59,9.452177,6887.0168,2.514339
191,MPKIIEAVYENGVFKPLQKVDLKEGERVKIKLELKVEPIDLGEPVS...,61,5.411379,6981.1366,-1.480999
101,MPKIIEAIYENGVFKPLQKVDLKEGERVRVVVSEVVAKTRGLLKGC...,61,4.784786,6920.0319,-4.481723
...,...,...,...,...,...
126,MCMDRIEKLIKKVSKPARLSVERCRLYTESMKQTEGEPMIIRQAKA...,776,5.155490,87165.5748,-21.743197
48,MVKDTYISSASKTPPMERTVRVTGMTCAMCVKSIETAVGSLEGVEE...,804,5.604234,86430.8507,-10.700651
41,MQNAESWFKKYWHLSVLVIAALISVKLRILNPWNSVFTWTVRLGGN...,868,8.335391,98252.0528,4.018070
195,MTLDEEYLDITFLTENGFVRKRCPKCGKHFWTADPEREICGDPPCE...,906,5.283264,102535.1316,-30.690231


In [84]:
# Display whatever DataFrame is currently bound to `df`, in its stored row
# order (no sorting is applied in this cell).
df

Unnamed: 0,sequence,length,IEP,MW,charge at pH 7
0,MFKRETKDFINIDPLQTGGKLTEEARQALLEWGDGYSVCDFCTTGR...,371,7.140535,41929.7438,0.378700
1,MELPSFIFQAQENLVERPWGGEWIALLKGFRQSGIGESWEFSAHTS...,299,5.300145,33495.6303,-9.905084
2,MTPVGMDRKSLSLLILIVLLGLCIRLQNFGEIFDSRIYYYGYDPYY...,593,9.034035,68171.2664,11.988620
3,MKICVFHDYFGAIGGGEKVALTISKLFNADVITTDVDAVPEEFRNK...,363,9.152270,42108.6242,13.874878
4,MEKRQFMKMKEKLKRACFEFAVSNRYLYNLAKRILDSSPKLQKIKE...,473,8.202200,55346.1547,3.305100
...,...,...,...,...,...
207,MRAAVVYKTDGHVKRIEEALKRLEVEVELFNQPSEELENFDFIVSV...,249,6.249072,27868.0936,-1.304459
208,MRSLRANGYNFRQDTRHYVYVVSQSNNWQQGRHLIDDPGKFYFDPL...,506,4.752160,55503.6214,-20.924925
209,MQKRVTDEEIKERLGKIKSRIAVMSGKGGVGKSTVTALLAVHYARQ...,254,6.022000,27795.0284,-2.211083
210,MAKALEQPFDVANIPGPKMATLLEKGKPVANMIKKAKRPLLIVGPD...,175,5.813117,19701.7361,-3.128491


In [96]:

# Spot check: re-run the sequence search for one randomly chosen row of `df`
# and print the raw response plus the identifier of the top hit.
# randrange(len(df)) always yields a valid positional index, whereas a
# hard-coded inclusive upper bound can point past the last row.
row_position = random.randrange(len(df))
my_query = {
  "query": {
    "type": "terminal",
    "service": "sequence",
    "parameters": {
      "evalue_cutoff": 1,
      "identity_cutoff": 0.9,
      "sequence_type": "protein",
      "value": df.iloc[row_position]["sequence"]
    }
  },
  "request_options": {
    "scoring_strategy": "sequence"
  },
  "return_type": "entry"
}
my_query = json.dumps(my_query)
# `params` URL-encodes the JSON; the timeout prevents an indefinite hang.
APIcheck = requests.get(
    "https://search.rcsb.org/rcsbsearch/v2/query",
    params={"json": my_query},
    timeout=60,
)
APIcheck.raise_for_status()
if APIcheck.status_code == 200:
    protdata = APIcheck.json()
    print(protdata)
    print(protdata['result_set'][0]['identifier'])
else:
    # A success status without a body (204 No Content) means the search found
    # nothing; calling .json() on it would raise.
    protdata = {'result_set': []}
    print(f"No matching PDB entries for row {row_position}")

{'query_id': 'd2459c26-c367-4b0a-8c02-10e89721fd9d', 'result_type': 'entry', 'total_count': 1, 'result_set': [{'identifier': '3DF7', 'score': 1.0}]}
3DF7


In [19]:
# Text search: all experimental protein polymer entities whose source-organism
# taxonomy lineage contains the phrase "Pyrococcus abyssi". `return_all_hits`
# asks for the complete result set rather than a single page.
my_query2 = {
  "query": {
    "type": "group",
    "nodes": [
      {
        "type": "terminal",
        "service": "text",
        "parameters": {
          "attribute": "rcsb_entity_source_organism.taxonomy_lineage.name",
          "negation": false,
          "operator": "contains_phrase",
          "value": "Pyrococcus abyssi"
        }
      },
      {
        "type": "terminal",
        "service": "text",
        "parameters": {
          "attribute": "entity_poly.rcsb_entity_polymer_type",
          "value": "Protein",
          "operator": "exact_match"
        }
      }
    ],
    "logical_operator": "and",
    "label": "text"
  },
  "return_type": "polymer_entity",
  "request_options": {
    "return_all_hits": true,
    "results_verbosity": "minimal",
    "results_content_type": ["experimental"],
    "sort": [{"sort_by": "score", "direction": "desc"}],
    "scoring_strategy": "combined"
  }
}
my_query2 = json.dumps(my_query2)
# `params` URL-encodes the JSON (the organism name contains a space); the
# timeout prevents an indefinite hang.
APIdata2 = requests.get(
    "https://search.rcsb.org/rcsbsearch/v2/query",
    params={"json": my_query2},
    timeout=60,
)
APIdata2.raise_for_status()
# A success status without a body (204 No Content) means no hits; treat it as
# an empty result set instead of failing inside .json().
protdata = APIdata2.json() if APIdata2.status_code == 200 else {'result_set': []}

# Collect just the identifier of every hit.
identifiers = [entry['identifier'] for entry in protdata['result_set']]



In [21]:
# Create a Bio.PDB PDBList object with default settings.
# NOTE(review): presumably used by later cells to download structure files for
# the collected identifiers — confirm against the cells that follow.
pdblist = PDBList()
