# Extract protein sequences from the Protein Data Bank (PDB) at the National Center for Biotechnology Information (NCBI)

In [4]:
#Import modules
from pathlib import Path

from Bio.PDB import PDBParser, PDBList
#help(PDBList)

In [6]:
#Fetch the 7BYR structure from the PDB database in PDB file format and save it in the "dir" folder.
#The call returns the path of the local file, which is shown as the cell output.
pdbl = PDBList()
pdbl.retrieve_pdb_file(pdb_code="7BYR", pdir="dir", file_format="pdb")

Downloading PDB structure '7BYR'...


'dir\\pdb7byr.ent'

In [8]:
#Parse the downloaded file into a structure object labelled "7BYR".
parser = PDBParser()
structure = parser.get_structure(id="7BYR", file="dir/pdb7byr.ent")
#Note: chains A, B and C seem to contain a discontinuity.



In [10]:
#How many chains does the structure contain?
#Print the identifier of every chain in model 0, one line per chain.
for chain in structure[0]:
    print(f"chainid: {chain.id}")

chainid: A
chainid: B
chainid: C
chainid: H
chainid: L
chainid: D
chainid: E
chainid: F
chainid: G
chainid: I
chainid: J


In [12]:
#What is the resolution of this structure?
#The value is read from the parsed header and displayed as the cell output.
resolution = structure.header["resolution"]
resolution

3.84

In [14]:
#Which keywords are associated with this structure?
#They are read from the parsed header; the displayed value is a single comma-separated string.
keywords = structure.header["keywords"]
keywords

'sars-cov-2, antigen, rbd, neutralizing antibody, viral protein'

In [16]:
### Prosite ###
#import modules
from Bio import ExPASy
from Bio.ExPASy import Prosite
#help(Prosite)

In [18]:
#Fetch the raw Prosite entry PS51442 from ExPASy and parse it into a record.
handle = ExPASy.get_prosite_raw("PS51442")
try:
    record = Prosite.read(handle)
finally:
    #Always close the network handle, even if parsing the entry fails.
    handle.close()

In [20]:
#Show the description of the Prosite record that is currently loaded in `record`.
print(record.description)

Coronavirus main protease (M-pro) domain profile.


In [26]:
#Show at most the first three entries of the record's pdb_structs list.
print(record.pdb_structs[:3])

[]


In [77]:
#Fetch the raw Prosite entry PS00001 from ExPASy, parse it and show its pattern.
#NOTE: this rebinds the `handle` and `record` names that were used for PS51442 above.
handle = ExPASy.get_prosite_raw("PS00001")
try:
    record = Prosite.read(handle)
finally:
    #Always close the network handle, even if parsing the entry fails.
    handle.close()
print(record.pattern)

N-{P}-[ST]-{P}.


In [28]:
### ScanProsite ###
#Import necessary modules 
from Bio.ExPASy import ScanProsite
from Bio import SeqIO

In [32]:
#Load the protein sequence that will be scanned against Prosite.
#The FASTA file is located relative to the current user's home directory
#(Documents/Repository/prot_seq.fasta) instead of through a hard-coded,
#user-specific absolute path, so the notebook does not depend on one account name.
prot_seq_path = Path.home() / "Documents" / "Repository" / "prot_seq.fasta"
#SeqIO.read expects the file to contain exactly one record.
prot_record = SeqIO.read(prot_seq_path, format="fasta")
#Display the length of the sequence.
len(prot_record.seq)

147

In [34]:
#Initiate a scan of the protein sequence against the Prosite database to identify any matching domains/motifs.
#The mirror is given without a trailing slash: ScanProsite.scan appends "/cgi-bin/..."
#to it, so a trailing slash would put a double slash into the request URL.
handle = ScanProsite.scan(seq=prot_record.seq, mirror="https://prosite.expasy.org")
try:
    result = ScanProsite.read(handle)
finally:
    #Always release the network connection, even if parsing the response fails.
    handle.close()

In [35]:
#Show the n_match value reported for the scan.
result.n_match

1

In [36]:
#Show the first match: the matched motif and its location in the protein sequence.
#sequence_ac: accession code for the sequence that was scanned (USERSEQ1 here).
#signature_ac: accession code for the Prosite pattern that was matched (PS01033 here).
#start/stop: location of the match in the protein sequence.
#score: confidence in the match; level: classification level (both are shown as strings in the output).
result[0]

{'sequence_ac': 'USERSEQ1',
 'start': 3,
 'stop': 147,
 'signature_ac': 'PS01033',
 'score': '43.983',
 'level': '0'}