# Proteomics

# Libraries to Import

Be sure to activate the "biopython" conda environment.

In [1]:
# One-time setup, kept commented out so it is not re-run on every execution:
# uncomment to enable the nglview notebook extension.
# !jupyter-nbextension enable nglview --py --sys-prefix

In [2]:
from Bio.PDB import *
import nglview as nv
import ipywidgets

import warnings
warnings.filterwarnings('ignore')



# Using the PDB (Protein Data Bank) File Format

In [3]:
# Parser for the PDB file format.
parser = PDBParser()

In [4]:
# Parse the PDB file into a Structure object; "PHA-L" is the label passed as
# the structure's id.
structure = parser.get_structure("PHA-L", "Data/1FAT.pdb")

In [5]:
# Wrap the parsed structure in an NGL widget; the bare name on the last line
# makes the notebook render it.
view = nv.show_biopython(structure)
view

NGLWidget()

# Using the CIF (Crystallographic Information File) File Format

In [6]:
# Rebinds `parser`: from this cell on it reads mmCIF files rather than PDB files.
parser = MMCIFParser()

In [7]:
# Rebinds `structure` to the object parsed from the mmCIF file; every later
# cell that uses `structure` works on this object, not the PDB-parsed one.
# NOTE(review): this file is read from "fa/" whereas the PDB file was read
# from "Data/" — confirm both locations are intended.
structure = parser.get_structure("PHA-L", "fa/1fat.cif")

In [8]:
# Same visualisation as for the PDB file, now for the mmCIF-parsed structure.
view = nv.show_biopython(structure)
view

NGLWidget()

# Diving into the Header Information

In [9]:
# Load the mmCIF file into a dictionary-like object keyed by mmCIF item name
# (e.g. '_entry.id', '_citation.title').
mmcif_dict = MMCIF2Dict.MMCIF2Dict("fa/1fat.cif")

In [10]:
# List every item name available in the mmCIF dictionary.
list(mmcif_dict)

['data_',
 '_entry.id',
 '_audit_conform.dict_name',
 '_audit_conform.dict_version',
 '_audit_conform.dict_location',
 '_database_2.database_id',
 '_database_2.database_code',
 '_pdbx_database_status.status_code',
 '_pdbx_database_status.entry_id',
 '_pdbx_database_status.recvd_initial_deposition_date',
 '_pdbx_database_status.deposit_site',
 '_pdbx_database_status.process_site',
 '_pdbx_database_status.status_code_sf',
 '_pdbx_database_status.status_code_mr',
 '_pdbx_database_status.SG_entry',
 '_pdbx_database_status.pdb_format_compatible',
 '_pdbx_database_status.status_code_cs',
 '_pdbx_database_status.status_code_nmr_data',
 '_pdbx_database_status.methods_development_category',
 '_audit_author.name',
 '_audit_author.pdbx_ordinal',
 '_citation.id',
 '_citation.title',
 '_citation.journal_abbrev',
 '_citation.journal_volume',
 '_citation.page_first',
 '_citation.page_last',
 '_citation.year',
 '_citation.journal_id_ASTM',
 '_citation.country',
 '_citation.journal_id_ISSN',
 '_citatio

**What’s the overall layout of a Structure object?**

The Structure object follows the so-called SMCRA (Structure/Model/Chain/Residue/Atom) architecture:

- A structure consists of models
- A model consists of chains
- A chain consists of residues
- A residue consists of atoms

**Accessing Residue Sequence**

In [11]:
# Iterate over all residues in a model
# Walk every model in the structure and print each of its residues, one per line.
for model in structure:
    for residue in model.get_residues():
        print(residue)

<Residue SER het=  resseq=1 icode= >
<Residue ASN het=  resseq=2 icode= >
<Residue ASP het=  resseq=3 icode= >
<Residue ILE het=  resseq=4 icode= >
<Residue TYR het=  resseq=5 icode= >
<Residue PHE het=  resseq=6 icode= >
<Residue ASN het=  resseq=7 icode= >
<Residue PHE het=  resseq=8 icode= >
<Residue GLN het=  resseq=9 icode= >
<Residue ARG het=  resseq=10 icode= >
<Residue PHE het=  resseq=11 icode= >
<Residue ASN het=  resseq=12 icode= >
<Residue GLU het=  resseq=13 icode= >
<Residue THR het=  resseq=14 icode= >
<Residue ASN het=  resseq=15 icode= >
<Residue LEU het=  resseq=16 icode= >
<Residue ILE het=  resseq=17 icode= >
<Residue LEU het=  resseq=18 icode= >
<Residue GLN het=  resseq=19 icode= >
<Residue ARG het=  resseq=20 icode= >
<Residue ASP het=  resseq=21 icode= >
<Residue ALA het=  resseq=22 icode= >
<Residue SER het=  resseq=23 icode= >
<Residue VAL het=  resseq=24 icode= >
<Residue SER het=  resseq=25 icode= >
<Residue SER het=  resseq=26 icode= >
<Residue SER het=  re

In [12]:
# structure.get_residues() returns a one-shot generator. Materialise it into a
# list so that cells using `residues` can be re-run (or run out of order)
# without silently iterating over an already-exhausted generator.
residues = list(structure.get_residues())

In [13]:
# Collect everything `residues` yields into a list so the notebook displays it.
list(residues)

[<Residue SER het=  resseq=1 icode= >,
 <Residue ASN het=  resseq=2 icode= >,
 <Residue ASP het=  resseq=3 icode= >,
 <Residue ILE het=  resseq=4 icode= >,
 <Residue TYR het=  resseq=5 icode= >,
 <Residue PHE het=  resseq=6 icode= >,
 <Residue ASN het=  resseq=7 icode= >,
 <Residue PHE het=  resseq=8 icode= >,
 <Residue GLN het=  resseq=9 icode= >,
 <Residue ARG het=  resseq=10 icode= >,
 <Residue PHE het=  resseq=11 icode= >,
 <Residue ASN het=  resseq=12 icode= >,
 <Residue GLU het=  resseq=13 icode= >,
 <Residue THR het=  resseq=14 icode= >,
 <Residue ASN het=  resseq=15 icode= >,
 <Residue LEU het=  resseq=16 icode= >,
 <Residue ILE het=  resseq=17 icode= >,
 <Residue LEU het=  resseq=18 icode= >,
 <Residue GLN het=  resseq=19 icode= >,
 <Residue ARG het=  resseq=20 icode= >,
 <Residue ASP het=  resseq=21 icode= >,
 <Residue ALA het=  resseq=22 icode= >,
 <Residue SER het=  resseq=23 icode= >,
 <Residue VAL het=  resseq=24 icode= >,
 <Residue SER het=  resseq=25 icode= >,
 <Residue

In [14]:
# Alternative route to the same residues: unfold the structure down to the
# residue level ("R"), which gives back a list.
res_list = Selection.unfold_entities(structure, "R")

In [15]:
# Display the residue list built by Selection.unfold_entities.
res_list

[<Residue SER het=  resseq=1 icode= >,
 <Residue ASN het=  resseq=2 icode= >,
 <Residue ASP het=  resseq=3 icode= >,
 <Residue ILE het=  resseq=4 icode= >,
 <Residue TYR het=  resseq=5 icode= >,
 <Residue PHE het=  resseq=6 icode= >,
 <Residue ASN het=  resseq=7 icode= >,
 <Residue PHE het=  resseq=8 icode= >,
 <Residue GLN het=  resseq=9 icode= >,
 <Residue ARG het=  resseq=10 icode= >,
 <Residue PHE het=  resseq=11 icode= >,
 <Residue ASN het=  resseq=12 icode= >,
 <Residue GLU het=  resseq=13 icode= >,
 <Residue THR het=  resseq=14 icode= >,
 <Residue ASN het=  resseq=15 icode= >,
 <Residue LEU het=  resseq=16 icode= >,
 <Residue ILE het=  resseq=17 icode= >,
 <Residue LEU het=  resseq=18 icode= >,
 <Residue GLN het=  resseq=19 icode= >,
 <Residue ARG het=  resseq=20 icode= >,
 <Residue ASP het=  resseq=21 icode= >,
 <Residue ALA het=  resseq=22 icode= >,
 <Residue SER het=  resseq=23 icode= >,
 <Residue VAL het=  resseq=24 icode= >,
 <Residue SER het=  resseq=25 icode= >,
 <Residue

In [16]:
# Using CA-CA
ppb = CaPPBuilder()
counter = 1
for pp in ppb.build_peptides(structure):
    print("Sequence: ", counter)
    print(pp.get_sequence())
    counter += 1

Sequence:  1
SNDIYFNFQRFNETNLILQRDASVSSSGQLRLTNLN
Sequence:  2
NGEPRVGSLGRAFYSAPIQIWDNTTGTVASFATSFTFNIQVPNNAGPADGLAFALVPVGSQPKDKGGFLGLFDGSNSNFHTVAVEFDTLYNKDWDPTERHIGIDVNSIRSIKTTRWDFVNGENAEVLITYDSSTNLLVASLVYPSQKTSFIVSDTVDLKSVLPEWVSVGFSATTGINKGNVETNDVLSWSFASKLS
Sequence:  3
SNDIYFNFQRFNETNLILQRDASVSSSGQLRLTNLNGNGEPRVGSLGRAFYSAPIQIWDNTTGTVASFATSFTFNIQVPNNAGPADGLAFALVPVGSQPKDKGGFLGLFDGSNSNFHTVAVEFDTLYNKDWDPTERHIGIDVNSIRSIKTTRWDFVNGENAEVLITYDSSTNLLVASLVYPSQKTSFIVSDTVDLKSVLPEWVSVGFSATTGINKGNVETNDVLSWSFASKLS
Sequence:  4
SNDIYFNFQRFNETNLILQRDASVSSSGQLRLTNLN
Sequence:  5
NGEPRVGSLGRAFYSAPIQIWDNTTGTVASFATSFTFNIQVPNNAGPADGLAFALVPVGSQPKDKGGFLGLFDGSNSNFHTVAVEFDTLYNKDWDPTERHIGIDVNSIRSIKTTRWDFVNGENAEVLITYDSSTNLLVASLVYPSQKTSFIVSDTVDLKSVLPEWVSVGFSATTGINKGNVETNDVLSWSFASKLS
Sequence:  6
SNDIYFNFQRFNETNLILQRDASVSSSGQLRLTNL
Sequence:  7
NGEPRVGSLGRAFYSAPIQIWDNTTGTVASFATSFTFNIQVPNNAGPADGLAFALVPVGSQPKDKGGFLGLFDGSNSNFHTVAVEFDTLYNKDWDPTERHIGIDVNSIRSIKTTRWDFVNGENAEVLITYDSSTNLLVASLVYPSQKTSFIVSDTVDLKSVLPEWVSVGFS

In [17]:
# Map a label ("Sequence 1", "Sequence 2", ...) to each polypeptide's sequence.
# Reuses the `ppb` builder created in the previous cell; enumerate() supplies
# the 1-based numbering instead of a manually incremented counter.
seqs = {}
for counter, pp in enumerate(ppb.build_peptides(structure), start=1):
    seqs["Sequence {}".format(counter)] = pp.get_sequence()

In [18]:
# Display the label -> sequence mapping.
seqs

{'Sequence 1': Seq('SNDIYFNFQRFNETNLILQRDASVSSSGQLRLTNLN'),
 'Sequence 2': Seq('NGEPRVGSLGRAFYSAPIQIWDNTTGTVASFATSFTFNIQVPNNAGPADGLAFA...KLS'),
 'Sequence 3': Seq('SNDIYFNFQRFNETNLILQRDASVSSSGQLRLTNLNGNGEPRVGSLGRAFYSAP...KLS'),
 'Sequence 4': Seq('SNDIYFNFQRFNETNLILQRDASVSSSGQLRLTNLN'),
 'Sequence 5': Seq('NGEPRVGSLGRAFYSAPIQIWDNTTGTVASFATSFTFNIQVPNNAGPADGLAFA...KLS'),
 'Sequence 6': Seq('SNDIYFNFQRFNETNLILQRDASVSSSGQLRLTNL'),
 'Sequence 7': Seq('NGEPRVGSLGRAFYSAPIQIWDNTTGTVASFATSFTFNIQVPNNAGPADGLAFA...KLS')}

In [21]:
# Pick the first polypeptide fragment; the analyses below all run on it.
seq1 = seqs['Sequence 1']

In [20]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [24]:
# seq1 is a Seq object (see the `seqs` output), so it is converted to a plain
# string before being handed to ProteinAnalysis.
analysed_seq = ProteinAnalysis(str(seq1))

In [26]:
# Molecular weight of the selected fragment.
analysed_seq.molecular_weight()

4176.516699999999

In [27]:
# GRAVY (grand average of hydropathy) of the fragment.
analysed_seq.gravy()

-0.561111111111111

In [28]:
# Number of occurrences of each amino acid (one-letter code) in the fragment.
analysed_seq.count_amino_acids()

{'A': 1,
 'C': 0,
 'D': 2,
 'E': 1,
 'F': 3,
 'G': 1,
 'H': 0,
 'I': 2,
 'K': 0,
 'L': 5,
 'M': 0,
 'N': 6,
 'P': 0,
 'Q': 3,
 'R': 3,
 'S': 5,
 'T': 2,
 'V': 1,
 'W': 0,
 'Y': 1}

In [29]:
# Amino-acid composition of the fragment. Despite the method name, the values
# shown below are fractions of the sequence length (0-1), not percentages
# out of 100.
analysed_seq.get_amino_acids_percent()

{'A': 0.027777777777777776,
 'C': 0.0,
 'D': 0.05555555555555555,
 'E': 0.027777777777777776,
 'F': 0.08333333333333333,
 'G': 0.027777777777777776,
 'H': 0.0,
 'I': 0.05555555555555555,
 'K': 0.0,
 'L': 0.1388888888888889,
 'M': 0.0,
 'N': 0.16666666666666666,
 'P': 0.0,
 'Q': 0.08333333333333333,
 'R': 0.08333333333333333,
 'S': 0.1388888888888889,
 'T': 0.05555555555555555,
 'V': 0.027777777777777776,
 'W': 0.0,
 'Y': 0.027777777777777776}

In [30]:
# Secondary-structure fractions for the fragment; the tuple is ordered
# (helix, turn, sheet).
analysed_seq.secondary_structure_fraction() # helix, turn, sheet

(0.3333333333333333, 0.3333333333333333, 0.19444444444444445)

The amino-acid scales used below are defined in Biopython's [`ProtParamData.py`](https://github.com/biopython/biopython/blob/master/Bio/SeqUtils/ProtParamData.py#L6); each scale is referred to by a short name:

- Kyte & Doolittle index of hydrophobicity --> kd
- Normalized flexibility parameters (B-values), average --> flex
- Hydrophilicity --> hw
- Surface accessibility --> em
- Janin Interior to surface transfer energy scale --> ja

In [40]:
# Amino-acid scales keyed by one-letter residue code. Each dictionary can be
# passed as `param_dict` to ProteinAnalysis.protein_scale().

# Kyte & Doolittle index of hydrophobicity
kd = dict(
    A=1.8, R=-4.5, N=-3.5, D=-3.5,
    C=2.5, Q=-3.5, E=-3.5, G=-0.4,
    H=-3.2, I=4.5, L=3.8, K=-3.9,
    M=1.9, F=2.8, P=-1.6, S=-0.8,
    T=-0.7, W=-0.9, Y=-1.3, V=4.2,
)

# Normalized flexibility parameters (B-values), average
flex = dict(
    A=0.984, C=0.906, E=1.094, D=1.068,
    G=1.031, F=0.915, I=0.927, H=0.950,
    K=1.102, M=0.952, L=0.935, N=1.048,
    Q=1.037, P=1.049, S=1.046, R=1.008,
    T=0.997, W=0.904, V=0.931, Y=0.929,
)

# Hydrophilicity
hw = dict(
    A=-0.5, R=3.0, N=0.2, D=3.0,
    C=-1.0, Q=0.2, E=3.0, G=0.0,
    H=-0.5, I=-1.8, L=-1.8, K=3.0,
    M=-1.3, F=-2.5, P=0.0, S=0.3,
    T=-0.4, W=-3.4, Y=-2.3, V=-1.5,
)

# Surface accessibility
em = dict(
    A=0.815, R=1.475, N=1.296, D=1.283,
    C=0.394, Q=1.348, E=1.445, G=0.714,
    H=1.180, I=0.603, L=0.603, K=1.545,
    M=0.714, F=0.695, P=1.236, S=1.115,
    T=1.184, W=0.808, Y=1.089, V=0.606,
)

# Janin interior-to-surface transfer energy scale
ja = dict(
    A=0.28, R=-1.14, N=-0.55, D=-0.52,
    C=0.97, Q=-0.69, E=-1.01, G=0.43,
    H=-0.31, I=0.60, L=0.60, K=-1.62,
    M=0.43, F=0.46, P=-0.42, S=-0.19,
    T=-0.32, W=0.29, Y=-0.15, V=0.60,
)

In [44]:
# Sliding-window profile of the hydrophilicity scale (`hw`) along the fragment,
# using a window of 7 residues. The result has one value per window position,
# which is why the 36-residue fragment gives the 30 values shown below.
analysed_seq.protein_scale(window=7, param_dict=hw)

[-0.4142857142857142,
 -0.8142857142857142,
 -0.8142857142857142,
 -0.8142857142857142,
 -0.9142857142857144,
 -0.557142857142857,
 0.22857142857142865,
 0.14285714285714293,
 0.5285714285714286,
 0.24285714285714288,
 -0.4428571428571429,
 -0.3428571428571429,
 -0.3428571428571429,
 -0.3428571428571429,
 0.14285714285714288,
 0.04285714285714287,
 0.3428571428571429,
 0.38571428571428573,
 0.6857142857142857,
 0.7,
 0.3142857142857142,
 -0.1142857142857143,
 -0.01428571428571429,
 -0.31428571428571433,
 0.32857142857142857,
 0.028571428571428543,
 -0.07142857142857142,
 -0.08571428571428573,
 -0.3428571428571429,
 -0.34285714285714286]