# Fingerprint distance

## Imports

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
from pathlib import Path
import sys

import pandas as pd

sys.path.append('../..')
from kinsim_structure.auxiliary import KlifsMoleculeLoader
from kinsim_structure.encoding import Fingerprint, FingerprintGenerator
from kinsim_structure.similarity import FeatureDistances, FingerprintDistance, FeatureDistancesGenerator, FingerprintDistanceGenerator

_ColormakerRegistry()

## IO paths

In [4]:
# Repository root, relative to this notebook's working directory (two levels up).
path_to_kinsim = Path('..', '..')
# Example input data shipped with the repository.
path_to_data = path_to_kinsim.joinpath('examples', 'data')
# NOTE(review): left unset here and not used by any cell shown in this notebook;
# the results directory is defined separately as PATH_TO_RESULTS - consider unifying.
path_to_results = None

In [5]:
# Directory for fingerprint results, located under the repository's examples folder.
# NOTE(review): the lower-case `path_to_results` defined in the previous cell is None;
# this upper-case name is the one that actually holds a path.
PATH_TO_RESULTS = path_to_kinsim.joinpath('examples', 'results', 'fingerprints')

## Load KLIFS metadata

In [6]:
# Load the postprocessed KLIFS metadata table; the first CSV column becomes the index.
klifs_metadata = pd.read_csv(
    path_to_data / 'postprocessed' / 'klifs_metadata_postprocessed.csv',
    index_col=0,
)

In [7]:
# Sanity check on the size of the loaded table: (number of rows, number of columns).
klifs_metadata.shape

(3878, 23)

In [8]:
# Preview the first rows to see which metadata columns are available per structure.
klifs_metadata.head()

Unnamed: 0,metadata_index,kinase,family,groups,pdb_id,chain,alternate_model,species,ligand_orthosteric_name,ligand_orthosteric_pdb_id,...,ac_helix,rmsd1,rmsd2,qualityscore,pocket,resolution,missing_residues,missing_atoms,full_ifp,code
0,2886,AAK1,NAK,Other,4wsq,B,A,Human,K-252A,KSA,...,in,0.777,2.125,8.6,EVLAEGGFAIVFLCALKRMVCKREIQIMRDLSKNIVGYIDSLILMD...,1.95,0,14,0000000000000010000001000000000000000000000000...,HUMAN/AAK1/4wsq_chainB_altA
1,10043,AAK1,NAK,Other,5l4q,A,A,Human,"~{N}-[5-(4-cyanophenyl)-1~{H}-pyrrolo[2,3-b]py...",LKB,...,in,0.78,2.137,9.7,EVLAEGGFAIVFLCALKRMVCKREIQIMRDLSKNIVGYIDSLILMD...,1.97,0,3,0000000000000010000000000000000000000000000000...,HUMAN/AAK1/5l4q_chainA_altA
2,7046,AAK1,NAK,Other,5te0,A,-,Human,methyl (3Z)-3-{[(4-{methyl[(4-methylpiperazin-...,XIN,...,in,0.776,2.12,8.8,EVLAEGGFAIVFLCALKRMVCKREIQIMRDLSKNIVGYIDSLILMD...,1.9,0,12,1000101000000010000001000000000000000000000000...,HUMAN/AAK1/5te0_chainA
3,843,ABL1,Abl,TK,2f4j,A,-,Human,CYCLOPROPANECARBOXYLIC ACID {4-[4-(4-METHYL-PI...,VX6,...,in,0.779,2.128,8.0,HKLGGGQYGEVYEVAVKTLEFLKEAAVMKEIKPNLVQLLGVYIITE...,1.91,0,0,0000000000000010000001000000000000000000000000...,HUMAN/ABL1/2f4j_chainA
4,815,ABL1,Abl,TK,2g1t,A,-,Human,-,-,...,out,0.825,2.154,8.0,HKLGGGQYGEVYEVAVKTLEFLKEAAVMKEIKPNLVQLLGVYIITE...,1.8,0,0,,HUMAN/ABL1/2g1t_chainA


## Fingerprint pair

### Fingerprint

In [9]:
# First fingerprint of the example pair: built from the metadata entry at row
# position 100 (an arbitrary example row).
fp1 = Fingerprint()
fp1.from_metadata_entry(klifs_metadata_entry=klifs_metadata.iloc[100])

In [10]:
# Second fingerprint of the example pair: built from the adjacent metadata entry
# at row position 101.
fp2 = Fingerprint()
fp2.from_metadata_entry(klifs_metadata_entry=klifs_metadata.iloc[101])

In [11]:
# Inspect the normalized fingerprint: the saved output is a dict of tables keyed by
# feature type (e.g. 'physicochemical', 'distances'); NaN marks missing values.
fp2.fingerprint_normalized

{'physicochemical':     size       hbd  hba  charge  aromatic  aliphatic       sco  exposure
 1    1.0  1.000000  0.0     1.0       0.0        0.0  0.502833  0.142857
 2    0.0  0.000000  0.0     0.5       0.0        0.0       NaN  0.190476
 3    0.5  0.000000  0.0     0.5       0.0        1.0  0.334056  0.814815
 4    0.0  0.000000  0.0     0.5       0.0        0.0       NaN  0.423077
 5    0.5  0.333333  0.5     0.5       1.0        0.0       NaN  0.142857
 ..   ...       ...  ...     ...       ...        ...       ...       ...
 81   0.5  0.000000  1.0     0.0       0.0        0.0  0.649444  0.400000
 82   1.0  0.000000  0.0     0.5       1.0        0.0  0.444556  0.613636
 83   0.0  0.000000  0.0     0.5       0.0        0.0       NaN  0.444444
 84   0.5  0.000000  0.0     0.5       0.0        1.0  0.706722  0.612903
 85   0.0  0.000000  0.0     0.5       0.0        1.0       NaN  0.742857
 
 [85 rows x 8 columns],
 'distances':     distance_to_centroid  distance_to_hinge_region  d

### FeatureDistances

In [12]:
# Create the FeatureDistances object; it is populated from a fingerprint pair in the next cell.
feature_distances = FeatureDistances()

In [13]:
# Populate the object from the fingerprint pair (fp1, fp2); results are inspected below.
feature_distances.from_fingerprints(fp1, fp2)

In [14]:
# Molecule codes identifying the two structures that were compared.
feature_distances.molecule_codes

['HUMAN/ALK_6e0r_altA_chainA', 'HUMAN/ALK_6ebw_altA_chainA']

In [15]:
# Name of the distance measure recorded for this comparison.
feature_distances.distance_measure

'euclidean'

In [16]:
# Per-feature results: one row per feature with its feature type, distance,
# bit coverage and bit number.
feature_distances.data

Unnamed: 0,feature_type,feature_name,distance,bit_coverage,bit_number
0,physicochemical,size,0.0,0.94,80
1,physicochemical,hbd,0.0,0.94,80
2,physicochemical,hba,0.0,0.94,80
3,physicochemical,charge,0.0,0.94,80
4,physicochemical,aromatic,0.0,0.94,80
5,physicochemical,aliphatic,0.0,0.94,80
6,physicochemical,sco,0.004035,0.76,65
7,physicochemical,exposure,0.004225,0.94,80
8,distances,distance_to_centroid,0.001501,0.94,80
9,distances,distance_to_hinge_region,0.000939,0.94,80


### FingerprintDistance

In [17]:
# Create the FingerprintDistance object; it is populated from the feature distances in the next cell.
fingerprint_distance = FingerprintDistance()

In [18]:
# Derive the single fingerprint distance for the pair from its per-feature distances.
# NOTE(review): the second argument is presumably the feature weights, with None
# selecting a default weighting - confirm against the method's signature.
fingerprint_distance.from_feature_distances(feature_distances, None)

In [19]:
# Molecule codes of the compared pair, as stored on the fingerprint distance object.
fingerprint_distance.molecule_codes

['HUMAN/ALK_6e0r_altA_chainA', 'HUMAN/ALK_6ebw_altA_chainA']

In [20]:
# Distance measure stored on the fingerprint distance object.
fingerprint_distance.distance_measure

'euclidean'

In [21]:
# Feature weights stored on the object after the call above; in the saved output
# they are all equal even though None was passed for the weights argument.
fingerprint_distance.feature_weights

0     0.066667
1     0.066667
2     0.066667
3     0.066667
4     0.066667
5     0.066667
6     0.066667
7     0.066667
8     0.066667
9     0.066667
10    0.066667
11    0.066667
12    0.066667
13    0.066667
14    0.066667
Name: weight, dtype: float64

In [22]:
# Final result for the pair: the fingerprint distance and its coverage.
fingerprint_distance.data

distance    0.007422
coverage    0.940000
dtype: float64

## Fingerprint pairs in bulk

### FingerprintGenerator

In [23]:
# Create the FingerprintGenerator used to build fingerprints for many structures at once.
fingerprint_generator = FingerprintGenerator()

In [24]:
# Generate fingerprints for the first 20 metadata rows only (a small subset for this example).
fingerprint_generator.from_metadata_entry(klifs_metadata=klifs_metadata.iloc[:20])

2019-11-06 17:08:04.267189
2019-11-06 17:08:16.624308


In [25]:
# Generated fingerprints: a dict mapping each molecule code to its Fingerprint object.
fingerprint_generator.data

{'HUMAN/AAK1_4wsq_altA_chainB': <kinsim_structure.encoding.Fingerprint at 0x7fc1e9a8b630>,
 'HUMAN/AAK1_5l4q_altA_chainA': <kinsim_structure.encoding.Fingerprint at 0x7fc1e9a8a588>,
 'HUMAN/AAK1_5te0_chainA': <kinsim_structure.encoding.Fingerprint at 0x7fc1e9a8b898>,
 'HUMAN/ABL1_2f4j_chainA': <kinsim_structure.encoding.Fingerprint at 0x7fc1e9a88390>,
 'HUMAN/ABL1_2g1t_chainA': <kinsim_structure.encoding.Fingerprint at 0x7fc1e9a8aa90>,
 'HUMAN/ABL1_2g2i_chainA': <kinsim_structure.encoding.Fingerprint at 0x7fc1e9a8a9e8>,
 'HUMAN/ABL1_2gqg_altB_chainA': <kinsim_structure.encoding.Fingerprint at 0x7fc1e9a8a5c0>,
 'HUMAN/ABL1_2hz4_chainB': <kinsim_structure.encoding.Fingerprint at 0x7fc1e9a8d080>,
 'HUMAN/ABL1_2v7a_chainB': <kinsim_structure.encoding.Fingerprint at 0x7fc1e9a8e4a8>,
 'HUMAN/ABL1_4twp_chainB': <kinsim_structure.encoding.Fingerprint at 0x7fc1e9a903c8>,
 'HUMAN/ABL2_2xyn_altA_chainA': <kinsim_structure.encoding.Fingerprint at 0x7fc1e988b400>,
 'HUMAN/ACK_1u46_chainA': <kinsim_

### FeatureDistancesGenerator

In [26]:
# Create the FeatureDistancesGenerator; it is populated from the fingerprint generator in the next cell.
feature_distances_generator = FeatureDistancesGenerator()

In [27]:
# Compute feature distances for pairs of the generated fingerprints.
# NOTE(review): the two timestamps printed by this call look like start and end times - confirm.
feature_distances_generator.from_fingerprint_generator(fingerprint_generator)

2019-11-06 17:08:16.666683
2019-11-06 17:08:18.941420


In [28]:
# Molecule codes of all structures included in the pairwise comparison.
feature_distances_generator.molecule_codes

['HUMAN/AAK1_4wsq_altA_chainB',
 'HUMAN/AAK1_5l4q_altA_chainA',
 'HUMAN/AAK1_5te0_chainA',
 'HUMAN/ABL1_2f4j_chainA',
 'HUMAN/ABL1_2g1t_chainA',
 'HUMAN/ABL1_2g2i_chainA',
 'HUMAN/ABL1_2gqg_altB_chainA',
 'HUMAN/ABL1_2hz4_chainB',
 'HUMAN/ABL1_2v7a_chainB',
 'HUMAN/ABL1_4twp_chainB',
 'HUMAN/ABL2_2xyn_altA_chainA',
 'HUMAN/ACK_1u46_chainA',
 'HUMAN/ACK_1u4d_chainB',
 'HUMAN/ACK_1u54_chainB',
 'HUMAN/ACK_3eqp_chainB',
 'HUMAN/ACK_3eqr_chainA',
 'HUMAN/ACK_4ewh_chainA',
 'HUMAN/ACK_4hzr_altA_chainA',
 'HUMAN/ACK_4hzs_chainC',
 'HUMAN/ACK_4id7_chainA']

In [29]:
# Distinct kinase names represented among the compared structures.
feature_distances_generator.kinase_names

['AAK1', 'ABL1', 'ABL2', 'ACK']

In [30]:
# Distance measure stored on the generator.
feature_distances_generator.distance_measure

'euclidean'

In [31]:
# All pair results: a dict keyed by (molecule_code_1, molecule_code_2) tuples
# whose values are FeatureDistances objects.
feature_distances_generator.data

{('HUMAN/AAK1_4wsq_altA_chainB',
  'HUMAN/AAK1_5l4q_altA_chainA'): <kinsim_structure.similarity.FeatureDistances at 0x7fc1e9ae5358>,
 ('HUMAN/AAK1_4wsq_altA_chainB',
  'HUMAN/AAK1_5te0_chainA'): <kinsim_structure.similarity.FeatureDistances at 0x7fc1e9a883c8>,
 ('HUMAN/AAK1_4wsq_altA_chainB',
  'HUMAN/ABL1_2f4j_chainA'): <kinsim_structure.similarity.FeatureDistances at 0x7fc1e9adf358>,
 ('HUMAN/AAK1_4wsq_altA_chainB',
  'HUMAN/ABL1_2g1t_chainA'): <kinsim_structure.similarity.FeatureDistances at 0x7fc1e9adfeb8>,
 ('HUMAN/AAK1_4wsq_altA_chainB',
  'HUMAN/ABL1_2g2i_chainA'): <kinsim_structure.similarity.FeatureDistances at 0x7fc1e9a86048>,
 ('HUMAN/AAK1_4wsq_altA_chainB',
  'HUMAN/ABL1_2gqg_altB_chainA'): <kinsim_structure.similarity.FeatureDistances at 0x7fc1e9a86358>,
 ('HUMAN/AAK1_4wsq_altA_chainB',
  'HUMAN/ABL1_2hz4_chainB'): <kinsim_structure.similarity.FeatureDistances at 0x7fc1e9a86860>,
 ('HUMAN/AAK1_4wsq_altA_chainB',
  'HUMAN/ABL1_2v7a_chainB'): <kinsim_structure.similarity.Fea

In [32]:
# Look up the FeatureDistances object of one specific pair via its tuple key.
feature_distances_generator.data[('HUMAN/AAK1_4wsq_altA_chainB', 'HUMAN/AAK1_5l4q_altA_chainA')]

<kinsim_structure.similarity.FeatureDistances at 0x7fc1e9ae5358>

### FingerprintDistanceGenerator

In [33]:
# Create the FingerprintDistanceGenerator; it is populated from the feature distances generator in the next cell.
fingerprint_distance_generator = FingerprintDistanceGenerator()

In [34]:
# Derive one fingerprint distance per structure pair from the per-feature distances.
# NOTE(review): the second argument is presumably the feature weights, with None
# selecting a default weighting - confirm against the method's signature.
fingerprint_distance_generator.from_feature_distances_generator(feature_distances_generator, None)

2019-11-06 17:08:19.052710
2019-11-06 17:08:19.422117


In [35]:
# Distance measure stored on the generator.
fingerprint_distance_generator.distance_measure

'euclidean'

In [36]:
# NOTE(review): this cell has no output in the saved notebook, which would be
# consistent with the attribute being None (None was passed as weights above) - confirm.
fingerprint_distance_generator.feature_weights

In [37]:
# Show only the first five molecule codes to keep the output short.
fingerprint_distance_generator.molecule_codes[:5]

['HUMAN/AAK1_4wsq_altA_chainB',
 'HUMAN/AAK1_5l4q_altA_chainA',
 'HUMAN/AAK1_5te0_chainA',
 'HUMAN/ABL1_2f4j_chainA',
 'HUMAN/ABL1_2g1t_chainA']

In [38]:
# Distinct kinase names represented among the compared structures.
fingerprint_distance_generator.kinase_names

['AAK1', 'ABL1', 'ABL2', 'ACK']

In [39]:
# Preview the pairwise results: one row per structure pair with its distance and coverage.
fingerprint_distance_generator.data.head()

Unnamed: 0,molecule_code_1,molecule_code_2,distance,coverage
0,HUMAN/AAK1_4wsq_altA_chainB,HUMAN/AAK1_5l4q_altA_chainA,0.005896,0.989333
1,HUMAN/AAK1_4wsq_altA_chainB,HUMAN/AAK1_5te0_chainA,0.007228,0.988
2,HUMAN/AAK1_4wsq_altA_chainB,HUMAN/ABL1_2f4j_chainA,0.030164,0.985333
3,HUMAN/AAK1_4wsq_altA_chainB,HUMAN/ABL1_2g1t_chainA,0.030657,0.985333
4,HUMAN/AAK1_4wsq_altA_chainB,HUMAN/ABL1_2g2i_chainA,0.034038,0.954667


In [40]:
# Kinase-by-kinase distance matrix derived from the structure pair distances.
# In the saved output only the upper triangle (including the diagonal) is filled.
# NOTE(review): ABL2/ABL2 is NaN, presumably because the subset contains a single
# ABL2 structure and therefore no ABL2 pair - confirm.
fingerprint_distance_generator.get_kinase_distance_matrix()

Unnamed: 0,AAK1,ABL1,ABL2,ACK
AAK1,0.005896,0.028653,0.029712,0.028508
ABL1,,0.005416,0.005073,0.016274
ABL2,,,,0.022821
ACK,,,,0.0033
