# Analyse results Isopeptor capability to predict isopeptide bonds in AF2 structures 

In [None]:
import pandas as pd
import os
import shutil
from isopeptor.isopeptide import Isopeptide
from isopeptor.jess_wrapper import run_jess

In [2]:
from dotenv import load_dotenv
load_dotenv("../.env")

TABLE = os.getenv("TABLE")
AF2_TEMPLATES_BIOCHEM = os.getenv("AF2_TEMPLATES_BIOCHEM")

af_df = pd.read_csv(AF2_TEMPLATES_BIOCHEM)
af_df = af_df.rename(columns={"r1_af":"r1_bond",
                        "r2_af":"r_cat",
                        "r3_af":"r2_bond"})
af_df["protein_name"] = af_df["PDB code"] + "_" + af_df["Chain"]
af_df["true_positive"] = True
af_df.loc[~af_df["Is bonded"], "true_positive"] = False

## With templates

In [3]:
path = "/nfs/research/agb/research/francesco/projects/20240212_isopeptideBonds_v1/20240529_findWithJess_v1/analysis/20240903_runAF2_v1/output/AF2_templates"
tmp_dir = "../tmp/template_af2"
os.makedirs(tmp_dir, exist_ok=True)
for prot in os.listdir(path):
    if prot == "sequences":
        continue
    src = os.path.join(path, prot, "ranked_0.pdb")
    dst = os.path.join(tmp_dir, f"{prot}.pdb")
    shutil.copyfile(src, dst)


In [4]:
i = Isopeptide(tmp_dir)
i.predict()
# Do not remove redundancy: this will be done manually to exclude 
# template-match pairs with sequence identity > 30%
#i.jess_hits = run_jess(i.structure_files, i.distance)
#i._load_hits()
#i._calc_rasa()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [8]:
headers = [
    "protein_name", "probability", "chain", "r1_bond", "r_cat", "r2_bond",
    "r1_bond_name", "r_cat_name", "r2_bond_name", "bond_type",
    "rmsd", "r_asa", "template"
    ]
outlist = []
for bond in i.isopeptide_bonds:
    row = [
            bond.protein_name, bond.probability, bond.chain, bond.r1_bond, 
            bond.r_cat, bond.r2_bond, 
            bond.r1_bond_name, bond.r_cat_name, bond.r2_bond_name, bond.bond_type,
            bond.rmsd, bond.r_asa, bond.template
        ]
    outlist.append(row)

res_df = pd.DataFrame(outlist, columns=headers)
test_df = pd.merge(res_df, af_df[["protein_name", "r1_bond", "r_cat", "r2_bond", "true_positive"]], how="outer")\
    .query('true_positive==True')
test_df["predicted"] = True
test_df.loc[test_df["probability"]<.65, "predicted"] = False
test_df.value_counts("predicted", normalize=True).round(3)


predicted
True     0.938
False    0.062
Name: proportion, dtype: float64

## Without templates

In [16]:
path = "/nfs/research/agb/research/francesco/projects/20240212_isopeptideBonds_v1/20240529_findWithJess_v1/analysis/20240903_runAF2_v1/output/AF2"
tmp_dir = "../tmp/af2"
os.makedirs(tmp_dir, exist_ok=True)
for prot in os.listdir(path):
    if prot == "sequences":
        continue
    src = os.path.join(path, prot, "ranked_0.pdb")
    dst = os.path.join(tmp_dir, f"{prot}.pdb")
    shutil.copyfile(src, dst)


In [17]:
i = Isopeptide(tmp_dir)
i.predict()

In [20]:
headers = [
    "protein_name", "probability", "chain", "r1_bond", "r_cat", "r2_bond",
    "r1_bond_name", "r_cat_name", "r2_bond_name", "bond_type",
    "rmsd", "r_asa", "template"
    ]
outlist = []
for bond in i.isopeptide_bonds:
    row = [
            bond.protein_name, bond.probability, bond.chain, bond.r1_bond, 
            bond.r_cat, bond.r2_bond, 
            bond.r1_bond_name, bond.r_cat_name, bond.r2_bond_name, bond.bond_type,
            bond.rmsd, bond.r_asa, bond.template
        ]
    outlist.append(row)

res_df = pd.DataFrame(outlist, columns=headers)
test_df = pd.merge(res_df, af_df[["protein_name", "r1_bond", "r_cat", "r2_bond", "true_positive"]], how="outer")\
    .query('true_positive==True')
test_df["predicted"] = True
test_df.loc[test_df["probability"]<.5, "predicted"] = False
test_df.value_counts("predicted", normalize=True).round(2)


predicted
True     0.97
False    0.03
Name: proportion, dtype: float64

In [21]:
len(test_df)

194