In [1]:
import numpy as np
import pandas as pd
import os
import requests

In [3]:
modpep_df = pd.read_csv("uaa_data/modpep_ids.txt", sep="\s+", header=0)
modpep_df.head()

Unnamed: 0,PDBID
0,1aycP
1,1f1wB
2,1f8aC
3,1fhrP
4,1g1hB


In [4]:
data = []
pdbids = modpep_df['PDBID']


for pdb in pdbids:
    url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb[:4]}"
    r = requests.get(url)
    if r.status_code == 200:
        data.append(r.json())

print(len(data))

501


In [5]:
data[0]

{'audit_author': [{'name': 'Lee, C.-H.', 'pdbx_ordinal': 1},
  {'name': 'Kuriyan, J.', 'pdbx_ordinal': 2}],
 'cell': {'angle_alpha': 90.0,
  'angle_beta': 90.0,
  'angle_gamma': 90.0,
  'length_a': 62.2,
  'length_b': 62.2,
  'length_c': 74.7,
  'zpdb': 8},
 'citation': [{'country': 'UK',
   'id': 'primary',
   'journal_abbrev': 'Structure',
   'journal_id_astm': 'STRUE6',
   'journal_id_csd': '2005',
   'journal_id_issn': '0969-2126',
   'journal_volume': '2',
   'page_first': '423',
   'page_last': '438',
   'pdbx_database_id_doi': '10.1016/S0969-2126(00)00044-7',
   'pdbx_database_id_pub_med': 7521735,
   'rcsb_authors': ['Lee, C.H.',
    'Kominos, D.',
    'Jacques, S.',
    'Margolis, B.',
    'Schlessinger, J.',
    'Shoelson, S.E.',
    'Kuriyan, J.'],
   'rcsb_is_primary': 'Y',
   'rcsb_journal_abbrev': 'Structure',
   'title': 'Crystal structures of peptide complexes of the amino-terminal SH2 domain of the Syp tyrosine phosphatase.',
   'year': 1994},
  {'country': 'US',
   'i

In [24]:
for key in data[0].keys():
    print(key, type(data[0][key]))

audit_author <class 'list'>
cell <class 'dict'>
citation <class 'list'>
database2 <class 'list'>
diffrn <class 'list'>
diffrn_radiation <class 'list'>
entry <class 'dict'>
exptl <class 'list'>
exptl_crystal <class 'list'>
pdbx_audit_revision_category <class 'list'>
pdbx_audit_revision_details <class 'list'>
pdbx_audit_revision_group <class 'list'>
pdbx_audit_revision_history <class 'list'>
pdbx_audit_revision_item <class 'list'>
pdbx_database_status <class 'dict'>
pdbx_vrpt_summary <class 'dict'>
pdbx_vrpt_summary_diffraction <class 'list'>
pdbx_vrpt_summary_geometry <class 'list'>
rcsb_accession_info <class 'dict'>
rcsb_entry_container_identifiers <class 'dict'>
rcsb_entry_info <class 'dict'>
rcsb_primary_citation <class 'dict'>
refine <class 'list'>
refine_hist <class 'list'>
refine_ls_restr <class 'list'>
software <class 'list'>
struct <class 'dict'>
struct_keywords <class 'dict'>
symmetry <class 'dict'>
rcsb_id <class 'str'>


## Extract individual UAAs from ModPep Peptides (Example Pipeline)

In [28]:
!pip install rcsb-api biopython


Collecting biopython
  Downloading biopython-1.86-cp311-cp311-macosx_10_9_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp311-cp311-macosx_10_9_x86_64.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m42.9 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86


In [29]:
from rcsbapi.data import DataQuery


def get_polymer_entities(pdbid: str):
    dq = DataQuery(
        input_type="entries",
        input_ids=[pdbid[:-1]],
        return_data_list=[
          'rcsb_entry_container_identifiers'
        ]
    )

    data = dq.exec()

    polys = data['data']['entries']

    return data #[ent["rcsb_entry_container_identifiers"]['entity_ids'] for ent in polys]



# example:
#d = get_modified_residues("1aycP")['data']['entries'][0]["polymer_entities"][0]['entity_poly'].keys()#["polymer_entities.entity_poly"]

e = get_polymer_entities("1AYCP")
print(e)



{'data': {'entries': [{'rcsb_id': '1AYC', 'rcsb_entry_container_identifiers': {'water_entity_ids': None, 'entry_id': '1AYC', 'assembly_ids': ['1'], 'related_emdb_ids': None, 'pubmed_id': 7521735, 'model_ids': [1], 'emdb_ids': None, 'rcsb_id': '1AYC', 'entity_ids': ['1', '2', '3'], 'non_polymer_entity_ids': None, 'branched_entity_ids': None, 'polymer_entity_ids': ['1', '2']}}]}}


In [18]:
modpep_df['data'] = modpep_df['PDBID'].apply(get_polymer_entities)

modpep_df.head()

Unnamed: 0,PDBID,data
0,1aycP,"[[1, 2, 3]]"
1,1f1wB,"[[1, 2, 3]]"
2,1f8aC,"[[1, 2, 3]]"
3,1fhrP,"[[1, 2]]"
4,1g1hB,"[[1, 2, 3]]"


In [38]:
import io
import urllib.request

import pandas as pd
from Bio.PDB import MMCIFParser, Polypeptide

standard_aa = set(Polypeptide.standard_aa_names)  # {'ALA', 'ARG', ..., 'VAL'}
print(standard_aa)

def split_entry_chain(pdbid_with_chain: str):
    """
    '1aycP' -> ('1AYC', 'P')
    If there is no chain letter, returns (entry, None).
    """
    pdbid_with_chain = pdbid_with_chain.strip()
    entry = pdbid_with_chain[:4].upper()
    chain = pdbid_with_chain[4:] or None
    return entry, chain

def get_unnatural_residues(pdbid_with_chain: str):
    entry, chain = split_entry_chain(pdbid_with_chain)
    print(entry, chain)

    url = f"https://files.rcsb.org/download/{entry}.cif"
    text = urllib.request.urlopen(url).read().decode()

    parser = MMCIFParser(QUIET=True)
    struct = parser.get_structure(entry, io.StringIO(text))

    print(struct)
    

    unnats = []
    i = 0
    for model in struct:
        print(i)
        print(model)
        i+=1

        for ch in model:
            if chain and ch.id != chain:
                continue
            for res in ch:
                hetflag, resseq, icode = res.id

                # skip waters/solvent, ions, etc.
                if hetflag.strip():  # HETATM residues have non-empty hetflag
                    continue
                print('we got here fam')

                resname = res.get_resname().strip()
                print(resname)

                if resname not in standard_aa:
                    unnats.append(
                        {
                            "entry": entry,
                            "chain": ch.id,
                            "resseq": resseq,
                            "icode": icode.strip() or None,
                            "resname": resname,
                        }
                    )
    return unnats

# Apply to your MODPEP dataframe
all_rows = []
for pdbid in modpep_df["PDBID"]:
    row = get_unnatural_residues(pdbid)
    all_rows.extend(row)

unnat_df = pd.DataFrame(all_rows)


{'PRO', 'ASP', 'ARG', 'CYS', 'VAL', 'ALA', 'HIS', 'MET', 'ILE', 'GLU', 'LYS', 'TYR', 'PHE', 'LEU', 'GLY', 'ASN', 'GLN', 'TRP', 'SER', 'THR'}
1AYC P
<Structure id=1AYC>
0
<Model id=0>
we got here fam
GLY
we got here fam
GLY
we got here fam
MET
we got here fam
ASP
we got here fam
MET
we got here fam
SER
1F1W B
<Structure id=1F1W>
0
<Model id=0>
we got here fam
SER
we got here fam
VAL
we got here fam
ASN
we got here fam
VAL
we got here fam
GLN
we got here fam
ASN
1F8A C
<Structure id=1F8A>
0
<Model id=0>
we got here fam
TYR
we got here fam
PRO
we got here fam
THR
we got here fam
PRO
we got here fam
SER
1FHR P
<Structure id=1FHR>
0
<Model id=0>
we got here fam
GLU
we got here fam
ASP
we got here fam
ILE
we got here fam
TYR
we got here fam
LEU
we got here fam
ASP
1
<Model id=1>
we got here fam
GLU
we got here fam
ASP
we got here fam
ILE
we got here fam
TYR
we got here fam
LEU
we got here fam
ASP
2
<Model id=2>
we got here fam
GLU
we got here fam
ASP
we got here fam
ILE
we got here fam
TYR
w

KeyboardInterrupt: 