In [1]:
from uuid import uuid4
from pdb2sql import pdb2sql
import numpy as np
from typing import List
from scipy.spatial import distance_matrix
from deeprankcore.molstruct.structure import Chain, PDBStructure
from deeprankcore.molstruct.atom import Atom
from deeprankcore.molstruct.pair import AtomicContact, ResidueContact
from deeprankcore.molstruct.variant import SingleResidueVariant
from deeprankcore.utils.graph import Edge, Graph
from deeprankcore.utils.buildgraph import get_structure
from deeprankcore.features.contact import add_features, _intra_partners
from deeprankcore.domain.aminoacidlist import alanine
from deeprankcore.domain import edgestorage as Efeat
from tests.features.test_contact import _get_atom



In [2]:
def _abs_distance(positions, atom1: List[int], atom2: List[int]):
    pos1, pos2 = positions[atom1], positions[atom2]
    sum_of_squares = 0
    for i in range(3):
        sum_of_squares += (pos1[i]-pos2[i])**2
    return np.sqrt(sum_of_squares)


In [105]:
data = '1A0Z'

# The 1ATN entry is read from a "_1w" file; every other entry uses the plain
# "<id>.pdb" file name.
pdb_path = f"../data/pdb/{data}/{data}_1w.pdb" if data == '1ATN' else f"../data/pdb/{data}/{data}.pdb"
pdb = pdb2sql(pdb_path)

try:
    structure = get_structure(pdb, data)
finally:
    # Close the pdb2sql object whether or not building the structure succeeded.
    pdb._close() # pylint: disable=protected-access

# Pairwise distances between all atoms of the structure (full square matrix).
atoms = structure.get_atoms()
positions = np.array([current_atom.position for current_atom in atoms])
distances = distance_matrix(positions, positions)

print(len(atoms))



4386


In [19]:
# For 1ak4.pdb
#
# Sanity checks of `_intra_partners(distances, 3)` on hand-picked atom pairs.
# They only run when `data` is exactly '1ak4'; any other value just reports
# which dataset is loaded.

if data != '1ak4':
    print('other dataset tested:', data)

else:

    count_atoms = len(atoms)
    intra_matrix = _intra_partners(distances, 3)
    assert intra_matrix.shape == (count_atoms, count_atoms)

    chain_C = structure.get_chain('C')
    chain_D = structure.get_chain('D')

    def _atom_index(chain, residue_number, atom_name):
        """Return the position in `atoms` of the requested atom."""
        return atoms.index(_get_atom(chain, residue_number, atom_name))

    index_C_phe60_CE1 = _atom_index(chain_C, 60, "CE1")
    index_C_trp121_CZ2 = _atom_index(chain_C, 121, "CZ2")
    index_C_asn102_O = _atom_index(chain_C, 102, "O")
    index_D_leu111_CG = _atom_index(chain_D, 111, "CG")
    index_D_pro93_CA = _atom_index(chain_D, 93, "CA")
    index_D_pro93_CB = _atom_index(chain_D, 93, "CB")
    index_D_pro93_CG = _atom_index(chain_D, 93, "CG")
    index_D_pro93_CD = _atom_index(chain_D, 93, "CD")
    index_D_ala92_CA = _atom_index(chain_D, 92, "CA")
    index_D_ala92_CB = _atom_index(chain_D, 92, "CB")
    index_D_gly89_N = _atom_index(chain_D, 89, "N")

    # Pairs that must be flagged as partners, checked in both directions.
    expected_partners = [
        ('1', index_D_pro93_CA, index_D_pro93_CB),  # one bond away
        ('2', index_D_pro93_CA, index_D_pro93_CG),  # two bonds away
        ('3', index_D_pro93_CA, index_D_ala92_CA),  # three bonds away
    ]
    for label, first, second in expected_partners:
        print(label, _abs_distance(positions, first, second))
        assert intra_matrix[first, second]
        assert intra_matrix[second, first]

    # Pairs that must NOT be flagged as partners.
    expected_non_partners = [
        # four bonds away
        ('4', index_D_pro93_CA, index_D_ala92_CB),
        # in different chain, but hydrogen bonded
        ('Hbond, diff chain', index_D_gly89_N, index_C_asn102_O),
        # close, but not connected
        ('close, but not connected', index_C_trp121_CZ2, index_C_phe60_CE1),
        # far away from each other
        ('far', index_D_leu111_CG, index_D_pro93_CA),
    ]
    for label, first, second in expected_non_partners:
        print(label, _abs_distance(positions, first, second))
        assert not intra_matrix[first, second]


1 1.5269060219934996
2 2.3760523984121265
3 3.8166208876439427
4 4.347795993374118
Hbond, diff chain 3.0299316823981353
close, but not connected 4.094302504700892
far 19.142930261587434


In [107]:
# Atom pairs closer than 2.1 (in the unit of `distances`) are used as the
# 1-2 pairs.
pair_12 = distances < 2.1

# Partner matrices for a maximum separation of 2 and of 3 (second argument of
# `_intra_partners`), computed in that order.
pair_13_max, pair_14_max = (
    _intra_partners(distances, max_separation) for max_separation in (2, 3)
)

# Entries on which the two compared matrices disagree.
pair_13_exact = np.logical_xor(pair_13_max, pair_12)
pair_14_exact = np.logical_xor(pair_13_max, pair_14_max)

# Complements: everything not covered by the respective partner matrix.
beyond_13, beyond_14 = (
    np.logical_not(partners) for partners in (pair_13_max, pair_14_max)
)


In [109]:
# 1-3 pairing
#
# Compare a plain distance cutoff against the reference matrix `pair_13_max`:
# an entry is a "positive" when its distance is below the cutoff, and it is
# "true" when `pair_13_max` agrees.
#
# NOTE(review): the counts run over the full square `distances` matrix, so
# every atom pair is counted twice and self-pairs are included - confirm that
# this is intended.

cutoffs = [2.1, 2.9, 3.6, 4.2, 10]

# The reference totals do not depend on the cutoff, so compute them once.
n_pairs_13 = np.sum(pair_13_max)
n_beyond_13 = np.sum(beyond_13)

for cutoff in cutoffs:
    within_cutoff = distances < cutoff
    beyond_cutoff = np.logical_not(within_cutoff)

    # Confusion-matrix counts that the reported metrics actually need.
    tp = np.sum(np.logical_and(within_cutoff, pair_13_max))
    fp = np.sum(np.logical_and(within_cutoff, beyond_13))
    fn = np.sum(np.logical_and(beyond_cutoff, pair_13_max))

    tp_rate = tp / n_pairs_13
    fp_rate = fp / n_beyond_13

    print(f'1-3 pairing at distance cutoff {cutoff} Angstrom for {data}')

    print(f'true positives: {tp_rate*100:.2f}%')
    print(f'false positives: {fp_rate*100:.2f}%')

    print(f'precision: {tp/(tp+fp):.2f}')
    print(f'f1 score: {2*tp/(2*tp+fp+fn):.2f}')
    # tp + fn equals the number of reference pairs, so this is the same
    # quantity as the true-positive rate above, printed as a fraction.
    print(f'sensitivity: {tp/(tp+fn):.3}')
    print('')


1-3 pairing at distance cutoff 2.1 Angstrom for 1A0Z
true positives: 52.24%
false positives: 0.00%
precision: 1.00
f1 score: 0.69
sensitivity: 0.522

1-3 pairing at distance cutoff 2.9 Angstrom for 1A0Z
true positives: 100.00%
false positives: 0.02%
precision: 0.88
f1 score: 0.93
sensitivity: 1.0

1-3 pairing at distance cutoff 3.6 Angstrom for 1A0Z
true positives: 100.00%
false positives: 0.10%
precision: 0.58
f1 score: 0.74
sensitivity: 1.0

1-3 pairing at distance cutoff 4.2 Angstrom for 1A0Z
true positives: 100.00%
false positives: 0.20%
precision: 0.40
f1 score: 0.57
sensitivity: 1.0

1-3 pairing at distance cutoff 10 Angstrom for 1A0Z
true positives: 100.00%
false positives: 3.22%
precision: 0.04
f1 score: 0.08
sensitivity: 1.0



In [110]:
# 1-4 pairing
#
# Compare a plain distance cutoff against the reference matrix `pair_14_max`:
# an entry is a "positive" when its distance is below the cutoff, and it is
# "true" when `pair_14_max` agrees.
#
# NOTE(review): the counts run over the full square `distances` matrix, so
# every atom pair is counted twice and self-pairs are included - confirm that
# this is intended.

cutoffs = [2.1, 2.9, 3.6, 4.2, 5.0, 5.5, 6.0, 6.3, 10]

# The reference totals do not depend on the cutoff, so compute them once.
n_pairs_14 = np.sum(pair_14_max)
n_beyond_14 = np.sum(beyond_14)

for cutoff in cutoffs:
    within_cutoff = distances < cutoff
    beyond_cutoff = np.logical_not(within_cutoff)

    # Confusion-matrix counts that the reported metrics actually need.
    tp = np.sum(np.logical_and(within_cutoff, pair_14_max))
    fp = np.sum(np.logical_and(within_cutoff, beyond_14))
    fn = np.sum(np.logical_and(beyond_cutoff, pair_14_max))

    tp_rate = tp / n_pairs_14
    fp_rate = fp / n_beyond_14

    print(f'1-4 pairing at distance cutoff {cutoff} Angstrom for {data}')

    print(f'true positives: {tp_rate*100:.2f}%')
    print(f'false positives: {fp_rate*100:.2f}%')

    print(f'precision: {tp/(tp+fp):.2f}')
    print(f'f1 score: {2*tp/(2*tp+fp+fn):.2f}')
    # tp + fn equals the number of reference pairs, so this is the same
    # quantity as the true-positive rate above, printed as a fraction.
    print(f'sensitivity: {tp/(tp+fn):.3}')
    print('')


1-4 pairing at distance cutoff 2.1 Angstrom for 1A0Z
true positives: 34.22%
false positives: 0.00%
precision: 1.00
f1 score: 0.51
sensitivity: 0.342

1-4 pairing at distance cutoff 2.9 Angstrom for 1A0Z
true positives: 72.91%
false positives: 0.00%
precision: 0.98
f1 score: 0.83
sensitivity: 0.729

1-4 pairing at distance cutoff 3.6 Angstrom for 1A0Z
true positives: 88.38%
false positives: 0.05%
precision: 0.79
f1 score: 0.83
sensitivity: 0.884

1-4 pairing at distance cutoff 4.2 Angstrom for 1A0Z
true positives: 100.00%
false positives: 0.13%
precision: 0.61
f1 score: 0.76
sensitivity: 1.0

1-4 pairing at distance cutoff 5.0 Angstrom for 1A0Z
true positives: 100.00%
false positives: 0.37%
precision: 0.36
f1 score: 0.53
sensitivity: 1.0

1-4 pairing at distance cutoff 5.5 Angstrom for 1A0Z
true positives: 100.00%
false positives: 0.53%
precision: 0.28
f1 score: 0.43
sensitivity: 1.0

1-4 pairing at distance cutoff 6.0 Angstrom for 1A0Z
true positives: 100.00%
false positives: 0.72%
pre