# Performance summary over different `kissim` setups

Summarize performance of different `kissim` setups:

- Profiling vs. `kissim` AUCs: How well does `kissim` reflect profiling data?
- Phylogenetic `kissim` tree: How well do selected on- and off-targets cluster together?
- Top `kissim` ranks: How high do selected on- and off-targets rank in `kissim`?

DFG-in conformations only!

In [1]:
from pathlib import Path

import pandas as pd
import seaborn as sns
from Bio import Phylo

from src.paths import PATH_RESULTS



In [2]:
cm = sns.light_palette("blue", as_cmap=True)

In [3]:
HERE = Path(_dh[-1])  # noqa: F821
RESULTS = PATH_RESULTS

## `kissim` setups

In [4]:
DATA_SUBSET = [
    "dfg_in",
]
WEIGHTING_SCHEMES = ["15", "110", "101", "100"]
CLUSTERING_METHODS = ["ward", "average", "weighted"]

In [5]:
ON_OFF_PAIRS = [
    ["EGFR", ["SLK", "LOK", "GAK"]],  # Erlotinib
    ["SLK", ["LOK"]],  # Erlotinib
    ["DRAK2", ["CaMKK2"]],
    ["ABL2", ["AurA"]],  # VX-680/MK-0457
    ["ABL1", ["GAK"]],  # Dasatinib
    ["GAK", ["DAPK3"]],  # Inrebic
    ["AurC", ["KIT"]],  # Inlyta
    ["KIT", ["AMPKa2", "FMS"]],  # JNJ-28312141
    ["ABL1", ["BMPR1B"]],  # PD-173955
]

## Profiling vs. `kissim` AUCs

In [6]:
auc_dfs = {}
mean_df = []
median_df = []
std_df = []
for data_subset in DATA_SUBSET:
    path = RESULTS / data_subset
    auc_df = pd.read_csv(path / "auc.csv")
    auc_dfs[data_subset] = auc_df[["15", "100", "110", "101", "111"]]

    mean = auc_dfs[data_subset].describe().loc["mean", :]
    mean.name = data_subset
    mean_df.append(mean)

    median = auc_dfs[data_subset].describe().loc["50%", :]
    median.name = data_subset
    median_df.append(median)

    std = auc_dfs[data_subset].describe().loc["std", :]
    std.name = data_subset
    std_df.append(std)
mean_df = pd.concat(mean_df, axis=1)
median_df = pd.concat(median_df, axis=1)
std_df = pd.concat(std_df, axis=1)

FileNotFoundError: [Errno 2] No such file or directory: '/home/dominique/Documents/GitHub/kissim_app/src/../results/dfg_in/auc.csv'

### Mean

In [7]:
mean_df.style.background_gradient(cmap=cm, axis=None)

AttributeError: 'list' object has no attribute 'style'

In [8]:
mean_df.style.highlight_max(axis=None, color="yellow")

AttributeError: 'list' object has no attribute 'style'

In [9]:
mean_df.style.highlight_max(axis=0, color="yellow")

AttributeError: 'list' object has no attribute 'style'

In [10]:
mean_df.style.highlight_max(axis=1, color="yellow")

AttributeError: 'list' object has no attribute 'style'

### Median

In [11]:
median_df.style.background_gradient(cmap=cm, axis=None)

AttributeError: 'list' object has no attribute 'style'

In [12]:
median_df.style.highlight_max(axis=None, color="yellow")

AttributeError: 'list' object has no attribute 'style'

In [13]:
median_df.style.highlight_max(axis=1, color="yellow")

AttributeError: 'list' object has no attribute 'style'

### Standard deviation

In [14]:
std_df.style.background_gradient(cmap=cm, axis=None)

AttributeError: 'list' object has no attribute 'style'

In [15]:
std_df.style.highlight_max(axis=None, color="yellow")

AttributeError: 'list' object has no attribute 'style'

In [16]:
std_df.style.highlight_max(axis=0, color="yellow")

AttributeError: 'list' object has no attribute 'style'

In [17]:
std_df.style.highlight_max(axis=1, color="yellow")

AttributeError: 'list' object has no attribute 'style'

## Top `kissim` ranks

In [18]:
def get_ranks(distance_matrix, rank_from, rank_to):
    """
    Get ranks for all kinases w.r.t. to a query kinase.
    """

    ranks = pd.concat(
        [
            distance_matrix[rank_from].sort_values(),
            distance_matrix[rank_from].sort_values().rank(),
        ],
        axis=1,
    )
    ranks.columns = ["distance", "rank"]
    if rank_to is not None:
        ranks = ranks.loc[rank_to, :]
    pair_names = [f"{rank_from}-{rank_to}" for i in rank_to]
    return ranks, pair_names

In [19]:
%%time

results_list = []

for data_subset in DATA_SUBSET:
    for weighting in WEIGHTING_SCHEMES:
        results = []
        columns = []
        results.extend([data_subset, weighting])

        kinase_matrix_path = RESULTS / f"{data_subset}/fingerprint_distances_to_kinase_matrix.csv"
        kinase_matrix = pd.read_csv(kinase_matrix_path, index_col=0)

        for pair in ON_OFF_PAIRS:
            ranks, pair_names = get_ranks(kinase_matrix, pair[0], pair[1])
            results.extend(ranks["rank"].to_list())

        results_list.append(results)

CPU times: user 91.3 ms, sys: 3.7 ms, total: 95 ms
Wall time: 95.7 ms


In [20]:
matrix_ranks_df = pd.DataFrame(
    results_list,
    columns=["subset", "weighting"] + [f"{i[0]}-{j}" for i in ON_OFF_PAIRS for j in i[1]],
)
matrix_ranks_df = matrix_ranks_df.set_index(["subset", "weighting"])
cm = sns.light_palette("blue", as_cmap=True, reverse=True)

In [21]:
matrix_ranks_df.style.applymap(lambda x: "background-color : yellow" if x < 25 else "")

Unnamed: 0_level_0,Unnamed: 1_level_0,EGFR-SLK,EGFR-LOK,EGFR-GAK,SLK-LOK,DRAK2-CaMKK2,ABL2-AurA,ABL1-GAK,GAK-DAPK3,AurC-KIT,KIT-AMPKa2,KIT-FMS,ABL1-BMPR1B
subset,weighting,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
dfg_in,15,44.0,27.0,183.0,2.0,5.0,87.0,126.0,11.0,127.0,74.0,2.0,220.0
dfg_in,110,44.0,27.0,183.0,2.0,5.0,87.0,126.0,11.0,127.0,74.0,2.0,220.0
dfg_in,101,44.0,27.0,183.0,2.0,5.0,87.0,126.0,11.0,127.0,74.0,2.0,220.0
dfg_in,100,44.0,27.0,183.0,2.0,5.0,87.0,126.0,11.0,127.0,74.0,2.0,220.0


In [22]:
matrix_ranks_df.style.background_gradient(cmap=cm, axis=None)

Unnamed: 0_level_0,Unnamed: 1_level_0,EGFR-SLK,EGFR-LOK,EGFR-GAK,SLK-LOK,DRAK2-CaMKK2,ABL2-AurA,ABL1-GAK,GAK-DAPK3,AurC-KIT,KIT-AMPKa2,KIT-FMS,ABL1-BMPR1B
subset,weighting,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
dfg_in,15,44.0,27.0,183.0,2.0,5.0,87.0,126.0,11.0,127.0,74.0,2.0,220.0
dfg_in,110,44.0,27.0,183.0,2.0,5.0,87.0,126.0,11.0,127.0,74.0,2.0,220.0
dfg_in,101,44.0,27.0,183.0,2.0,5.0,87.0,126.0,11.0,127.0,74.0,2.0,220.0
dfg_in,100,44.0,27.0,183.0,2.0,5.0,87.0,126.0,11.0,127.0,74.0,2.0,220.0


## Ranks

In [23]:
from kissim.comparison import FingerprintDistanceGenerator
from src.definitions import COVERAGE_CUTOFF

In [24]:
fingerprint_distances = FingerprintDistanceGenerator.from_csv(
    RESULTS / f"{DATA_SUBSET[0]}/fingerprint_distances.csv"
)

In [25]:
kinase_matrix = fingerprint_distances.kinase_distance_matrix(coverage_min=COVERAGE_CUTOFF)

In [26]:
kinase_matrix[["EGFR"]]

kinase.2,EGFR
kinase.1,Unnamed: 1_level_1
AAK1,0.111434
ABL1,0.063537
ABL2,0.067081
ACK,0.066619
ACTR2,0.121684
...,...
p38a,0.091348
p38b,0.095911
p38d,0.101236
p38g,0.094483


In [27]:
from opencadd.databases.klifs import setup_remote

In [28]:
klifs_session = setup_remote()

kinase_names = kinase_matrix["EGFR"].index.to_list()
kinase_groups = klifs_session.kinases.by_kinase_name(kinase_names, species="Human")[
    ["kinase.klifs_name", "kinase.group"]
]
kinase_groups = kinase_groups.set_index("kinase.klifs_name").squeeze()

kinase_ranks = kinase_matrix["EGFR"]
kinase_ranks.name = "ranks"
kinase_ranks.index.name = "kinase.klifs_name"

kinase_ranks = pd.merge(
    kinase_groups, kinase_ranks, left_index=True, right_index=True
).reset_index()

In [29]:
kinase_ranks.sort_values("ranks").head(50)

Unnamed: 0,kinase.klifs_name,kinase.group,ranks
86,EGFR,TK,0.0
76,ErbB4,TK,0.024878
88,ErbB2,TK,0.039648
89,ErbB3,TK,0.046541
126,SYK,TK,0.053504
92,FGFR4,TK,0.061678
183,BTK,TK,0.062406
166,ABL1,TK,0.063537
115,RET,TK,0.064704
190,IGF1R,TK,0.065141


## Phylogenetic `kissim` tree

In [30]:
import itertools
import numpy as np

In [31]:
def pairs_to_symmetric_matrix(pairs):
    """
    Create symmetric matrix with diagonal of 0.0 from pair combinations.
    """

    pairs1 = pd.DataFrame(pairs)
    pairs2 = pairs1[[1, 0, 2]]
    pairs2.columns = [0, 1, 2]
    pairs_all = pd.concat([pairs1, pairs2])
    matrix = pairs_all.pivot(index=0, columns=1, values=2)
    np.fill_diagonal(matrix.values, 0.0)
    symmetic_matrix = pd.DataFrame(matrix.values, columns=matrix.columns, index=matrix.index)
    return symmetic_matrix

In [32]:
def get_tree_distance_matrix(tree):
    """
    Get a matrix of all-against-all kinase distances in the kinase tree.
    """

    kinases = [clade.name for clade in tree.get_terminals()]
    kinase_pairs = itertools.combinations(kinases, 2)

    kinase_pairs_tree_distances = []

    for kinase1, kinase2 in kinase_pairs:
        kinase_pairs_tree_distances.append([kinase1, kinase2, tree.distance(kinase1, kinase2)])

    tree_distance_matrix = pairs_to_symmetric_matrix(kinase_pairs_tree_distances)

    return tree_distance_matrix

In [33]:
%%time

results_list = []

for data_subset in DATA_SUBSET:
    print(data_subset)
    for weighting in WEIGHTING_SCHEMES:
        for cmethod in CLUSTERING_METHODS:
            results = []
            results.extend([data_subset, weighting, cmethod])

            tree_path = RESULTS / f"{data_subset}/trees/tree_0.8_{weighting}_{cmethod}.tree"
            kissim_tree = Phylo.read(tree_path, "newick")
            tree_distance_matrix = get_tree_distance_matrix(kissim_tree)

            for pair in ON_OFF_PAIRS:
                ranks, pair_names = get_ranks(tree_distance_matrix, pair[0], pair[1])
                results.extend(ranks["rank"].to_list())

            results_list.append(results)

dfg_in
CPU times: user 1min 58s, sys: 18.8 ms, total: 1min 58s
Wall time: 1min 58s


In [34]:
tree_ranks_df = pd.DataFrame(
    results_list,
    columns=["subset", "weighting", "cmethod"]
    + [f"{i[0]}-{j}" for i in ON_OFF_PAIRS for j in i[1]],
)
tree_ranks_df = tree_ranks_df.set_index(["subset", "weighting", "cmethod"])
cm = sns.light_palette("blue", as_cmap=True, reverse=True)

In [35]:
tree_ranks_df.style.applymap(lambda x: "background-color : yellow" if x < 25 else "")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,EGFR-SLK,EGFR-LOK,EGFR-GAK,SLK-LOK,DRAK2-CaMKK2,ABL2-AurA,ABL1-GAK,GAK-DAPK3,AurC-KIT,KIT-AMPKa2,KIT-FMS,ABL1-BMPR1B
subset,weighting,cmethod,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
dfg_in,15,ward,185.0,185.0,128.0,2.0,10.5,112.0,112.0,146.0,206.0,103.0,2.0,73.0
dfg_in,15,average,105.0,105.0,183.0,2.0,12.0,110.5,183.5,155.0,137.0,139.0,2.0,226.0
dfg_in,15,weighted,139.0,139.0,53.0,2.0,11.0,101.0,53.0,64.0,123.5,139.0,2.0,212.0
dfg_in,110,ward,163.5,163.5,163.5,2.0,16.5,112.0,136.5,156.0,215.5,226.5,2.0,123.0
dfg_in,110,average,114.0,114.0,168.5,2.0,12.0,73.5,168.5,127.0,148.0,73.5,2.0,225.0
dfg_in,110,weighted,54.0,54.0,128.0,2.0,18.0,92.5,131.5,39.5,157.5,75.5,2.0,203.5
dfg_in,101,ward,205.5,205.5,112.5,2.0,14.0,151.0,112.5,49.5,206.5,100.0,2.0,234.0
dfg_in,101,average,148.5,148.5,210.0,2.0,2.0,83.5,214.0,78.5,133.5,54.0,2.0,236.0
dfg_in,101,weighted,136.0,136.0,219.0,2.0,2.0,98.0,218.0,27.0,107.0,58.0,2.0,234.0
dfg_in,100,ward,176.5,176.5,55.5,2.0,17.5,241.5,55.5,133.0,242.5,110.5,2.0,176.5


In [36]:
tree_ranks_df.loc[:, "15", :].style.background_gradient(cmap=cm, axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,EGFR-SLK,EGFR-LOK,EGFR-GAK,SLK-LOK,DRAK2-CaMKK2,ABL2-AurA,ABL1-GAK,GAK-DAPK3,AurC-KIT,KIT-AMPKa2,KIT-FMS,ABL1-BMPR1B
subset,weighting,cmethod,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
dfg_in,15,ward,185.0,185.0,128.0,2.0,10.5,112.0,112.0,146.0,206.0,103.0,2.0,73.0
dfg_in,15,average,105.0,105.0,183.0,2.0,12.0,110.5,183.5,155.0,137.0,139.0,2.0,226.0
dfg_in,15,weighted,139.0,139.0,53.0,2.0,11.0,101.0,53.0,64.0,123.5,139.0,2.0,212.0
