# Analyse biochemical properties of isopeptide bond PDB/AF2 datasets

In [1]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
from Bio import SeqIO

In [2]:
def get_sequence(row, pdb = True, flank = 20)->str:
    """
    Return the sequence window around the three isopeptide bond residues.

    The window runs from ``flank`` residues before the lowest-numbered of
    the three residues to ``flank`` residues after the highest-numbered one
    (both ends inclusive) and is clipped to the bounds of the sequence.

    Args:
        row: Row (e.g. a pandas Series) providing "structure_path" and the
            three residue position columns.
        pdb: If True, read the positions from the "Position 1/2/3" columns;
            otherwise read them from "r1_af", "r2_af" and "r3_af".
        flank: Number of residues kept on each side of the bond residues.

    Returns:
        The selected part of the first chain's sequence, as a string.

    Raises:
        ValueError: If no sequence record can be read from the structure.
    """
    if pdb:
        position_columns = (
            "Position 1\r\n(Bond 1)",
            "Position 2\r\n(catalytic)",
            "Position 3\r\n(Bond 2)",
        )
    else:
        position_columns = ("r1_af", "r2_af", "r3_af")
    positions = [row[column] for column in position_columns]

    structure_path = row["structure_path"]
    # Only the first record is used, so consume just that one instead of
    # building a list of every chain; the handle is closed explicitly
    # because the iterator is not run to exhaustion.
    with open(structure_path) as handle:
        record = next(SeqIO.parse(handle, "pdb-atom"), None)
    if record is None:
        raise ValueError(f"No sequence record found in {structure_path}")

    # Convert residue numbers into 0-based offsets within the record by
    # subtracting the residue number at which the record starts.
    pdb_start = record.annotations["start"]
    window_start = min(positions) - pdb_start - flank
    # Slice ends are exclusive: +1 keeps the residue `flank` positions after
    # the last bond residue, making the window symmetric.
    window_end = max(positions) - pdb_start + flank + 1
    # Clamp both ends so a negative index cannot wrap around to the end of
    # the sequence.
    window_start = max(window_start, 0)
    window_end = max(window_end, 0)
    return str(record.seq)[window_start:window_end]

In [3]:
# Load the .env file located one directory above the working directory,
# then look up the three dataset locations. os.getenv yields None for a
# variable that is not set.
load_dotenv("../.env")
PDB_BIOCHEM, AF2_BIOCHEM, AF2_TEMPLATES_BIOCHEM = map(
    os.getenv,
    ("PDB_BIOCHEM", "AF2_BIOCHEM", "AF2_TEMPLATES_BIOCHEM"),
)

In [4]:
# Read the three CSV tables named by the environment-derived paths.
pdb_df, af_df, af_templates_df = (
    pd.read_csv(csv_path)
    for csv_path in (PDB_BIOCHEM, AF2_BIOCHEM, AF2_TEMPLATES_BIOCHEM)
)

# Attach the isopeptide sequence window to every row. Only the PDB table is
# processed with pdb=True; both AF2 tables use pdb=False.
for frame, use_pdb_columns in (
    (pdb_df, True),
    (af_df, False),
    (af_templates_df, False),
):
    frame["isopep_sequence"] = frame.apply(
        get_sequence, pdb=use_pdb_columns, axis=1
    )



## Aromatic residues

In [5]:
# Hydrogens are not considered, so approximate values are adopted: the
# distance cutoff is the maximum pi-H distance plus one C-H distance.
max_pi_h_distance = 4.3
c_h_distance = 1.09
dist_cutoff = max_pi_h_distance + c_h_distance
# Lower and upper angle bounds, in that order.
angle_cutoffs = [80, 100]

In [6]:
# Keep one row per distinct isopeptide sequence window from the PDB table.
test_df = pdb_df.drop_duplicates("isopep_sequence").copy()

# Every row starts as capped; the flag is cleared only where the distance
# exceeds the cutoff AND the angle lies outside the [lower, upper] bounds.
# NOTE(review): rows failing just one of the two criteria stay True, and
# rows with missing values should also stay True since NaN comparisons
# evaluate False - confirm that this is the intended definition.
beyond_cutoff = test_df["distance_to_aro"] > dist_cutoff
angle_below = test_df["angle_with_aro"] < angle_cutoffs[0]
angle_above = test_df["angle_with_aro"] > angle_cutoffs[1]
test_df["Aromatic cap"] = True
test_df.loc[beyond_cutoff & (angle_below | angle_above), "Aromatic cap"] = False

# Proportion of each (isopeptide type, aromatic cap) pair among rows that
# are bonded, not flagged "Bad rotamer", and not of type "Mutant".
selected = (
    test_df["Is bonded"]
    & ~test_df["Bad rotamer"]
    & (test_df["Isopeptide type"] != "Mutant")
)
test_df[selected].value_counts(["Isopeptide type", "Aromatic cap"], normalize=True)

Isopeptide type  Aromatic cap
CnaB-like        False           0.350993
CnaA-like        True            0.317881
CnaB-like        True            0.251656
CnaA-like        False           0.079470
Name: proportion, dtype: float64

In [7]:
# Keep one row per distinct isopeptide sequence window from the AF2 table.
test_df = af_df.drop_duplicates("isopep_sequence").copy()

# Every row starts as capped; the flag is cleared only where the distance
# exceeds the cutoff AND the angle lies outside the [lower, upper] bounds.
# NOTE(review): rows failing just one of the two criteria stay True, and
# rows with missing values should also stay True since NaN comparisons
# evaluate False - confirm that this is the intended definition.
beyond_cutoff = test_df["distance_to_aro"] > dist_cutoff
angle_below = test_df["angle_with_aro"] < angle_cutoffs[0]
angle_above = test_df["angle_with_aro"] > angle_cutoffs[1]
test_df["Aromatic cap"] = True
test_df.loc[beyond_cutoff & (angle_below | angle_above), "Aromatic cap"] = False

# Proportion of each (isopeptide type, aromatic cap) pair among rows that
# are bonded, not flagged "Bad rotamer", and not of type "Mutant".
selected = (
    test_df["Is bonded"]
    & ~test_df["Bad rotamer"]
    & (test_df["Isopeptide type"] != "Mutant")
)
test_df[selected].value_counts(["Isopeptide type", "Aromatic cap"], normalize=True)

Isopeptide type  Aromatic cap
CnaB-like        False           0.397351
CnaA-like        True            0.238411
CnaB-like        True            0.205298
CnaA-like        False           0.158940
Name: proportion, dtype: float64