# KLIFS kinase names

Explore different kinase name columns.

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

from opencadd.databases.klifs import setup_remote, setup_local

INFO:opencadd.databases.klifs.api:If you want to see an non-truncated version of the DataFrames in this module, use `pd.set_option('display.max_columns', 50)` in your notebook.


In [3]:
# Show up to 50 columns so the wide KLIFS tables are not truncated.
pd.options.display.max_columns = 50

In [4]:
# Create the remote KLIFS session object used by all remote queries below.
remote = setup_remote()

INFO:opencadd.databases.klifs.api:Set up remote session...
INFO:opencadd.databases.klifs.api:Remote session is ready!


## Kinase details 1 (short version)

In [5]:
# Short kinase listing from the remote session, ordered by kinase ID with a
# fresh 0..n-1 index. The chained calls return a new frame instead of mutating
# the object handed back by the API (no `inplace=True`).
kinases1 = (
    remote.kinases.all_kinases()
    .sort_values("kinase.id")
    .reset_index(drop=True)
)
kinases1.head()

Unnamed: 0,kinase.id,kinase.hgnc_name,kinase.full_name,species.klifs
0,1,AKT1,,Human
1,2,AKT2,,Human
2,3,AKT3,,Human
3,4,CIT,,Human
4,5,DMPK,,Human


## Kinase details 2 (long details)

In [6]:
# Collect every kinase ID from the short listing as a plain list; it is used to
# request the detailed kinase records.
kinase_ids = kinases1["kinase.id"].to_list()
print(f"Number of IDs: {len(kinase_ids)}")

Number of IDs: 1127


In [7]:
# Detailed kinase records for all IDs in `kinase_ids`.
kinases2 = remote.kinases.by_kinase_ids(kinase_ids)
print(f"Number of kinases: {kinases2.shape[0]}")
# Order by kinase ID and renumber the index; rebinding the name (rather than
# using `inplace=True`) leaves the frame returned by the API untouched.
kinases2 = kinases2.sort_values("kinase.id").reset_index(drop=True)
kinases2.head()

Number of kinases: 1127


Unnamed: 0,kinase.id,kinase.klifs_name,kinase.hgnc_name,kinase.family,kinase.group,kinase.class,species.klifs,kinase.full_name,kinase.uniprot,kinase.iuphar,kinase.pocket
0,1,AKT1,AKT1,Akt,AGC,,Human,,P31749,1479,KLLGKGTFGKVILYAMKILHTLTENRVLQNSRPFLTALKYSCFVME...
1,2,AKT2,AKT2,Akt,AGC,,Human,,P31751,1480,KLLGKGTFGKVILYAMKILHTVTESRVLQNTRPFLTALKYACFVME...
2,3,AKT3,AKT3,Akt,AGC,,Human,,Q9Y243,2286,KLLGKGTFGKVILYAMKILHTLTESRVLKNTRPFLTSLKYSCFVME...
3,4,CRIK,CIT,DMPK,AGC,CRIK,Human,,O14578,1509,SLVGCGHFAEVQVYAMKVMFFEEERNILSRSTPWIPQLQYAYLVME...
4,5,DMPK1,DMPK,DMPK,AGC,GEK,Human,,Q09013,1505,KVIGRGAFSEVAVYAMKIMCFREERDVLVNGDRWITQLHFAYLVME...


### How many kinases have differing KLIFS and HGNC names?

In [9]:
# (rows, columns) of the subset whose KLIFS name is not equal to its HGNC name;
# the comparison is done row by row.
kinases2.loc[
    kinases2.apply(lambda row: row["kinase.klifs_name"] != row["kinase.hgnc_name"], axis=1)
].shape

(817, 11)

### Which columns are matched for kinase name?

In [10]:
# Query with 'CRIK', which is the `kinase.klifs_name` of kinase ID 4.
remote.kinases.by_kinase_names(kinase_names='CRIK')

HBox(children=(HTML(value='Processing...'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




Unnamed: 0,kinase.id,kinase.klifs_name,kinase.hgnc_name,kinase.family,kinase.group,kinase.class,species.klifs,kinase.full_name,kinase.uniprot,kinase.iuphar,kinase.pocket
0,4,CRIK,CIT,DMPK,AGC,CRIK,Human,,O14578,1509,SLVGCGHFAEVQVYAMKVMFFEEERNILSRSTPWIPQLQYAYLVME...
1,637,CRIK,Cit,DMPK,AGC,,Mouse,,P49025,0,SLVGCGHFAEVQVYAMKIMFFEEERNILSRSTPWIPQLQYAYLVME...


In [11]:
# Query with 'CIT', which is the `kinase.hgnc_name` of kinase ID 4.
remote.kinases.by_kinase_names(kinase_names='CIT')

HBox(children=(HTML(value='Processing...'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




Unnamed: 0,kinase.id,kinase.klifs_name,kinase.hgnc_name,kinase.family,kinase.group,kinase.class,species.klifs,kinase.full_name,kinase.uniprot,kinase.iuphar,kinase.pocket
0,4,CRIK,CIT,DMPK,AGC,CRIK,Human,,O14578,1509,SLVGCGHFAEVQVYAMKVMFFEEERNILSRSTPWIPQLQYAYLVME...
1,637,CRIK,Cit,DMPK,AGC,,Mouse,,P49025,0,SLVGCGHFAEVQVYAMKIMFFEEERNILSRSTPWIPQLQYAYLVME...


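__Note__: Apparently, the kinase name is matched against both the `kinase.klifs_name` and the `kinase.hgnc_name` column: querying `CRIK` (KLIFS name) and `CIT` (HGNC name) returns the same two kinases.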

### Differing `kinase.klifs_name` and `kinase.hgnc_name`?

In [14]:
# Keep the detailed records whose KLIFS name is not equal to the HGNC name
# (row-by-row comparison), then preview just the two name columns.
diff2 = kinases2.loc[
    kinases2.apply(lambda row: row["kinase.klifs_name"] != row["kinase.hgnc_name"], axis=1)
]
print(f"Number of differing names/HGNC: {diff2.shape[0]}")
diff2.loc[:, ["kinase.klifs_name", "kinase.hgnc_name"]].head()

Number of differing names/HGNC: 817


Unnamed: 0,kinase.klifs_name,kinase.hgnc_name
3,CRIK,CIT
4,DMPK1,DMPK
5,MRCKa,CDC42BPA
6,MRCKb,CDC42BPB
7,DMPK2,CDC42BPG


## Merge details for kinases 1 and 2

In [16]:
# Left-join the short listing onto the detailed listing by kinase ID. Columns
# present in both frames get pandas' default suffixes: `_x` for `kinases1`
# (short listing) and `_y` for `kinases2` (detailed listing).
kinases = kinases1.merge(kinases2, on="kinase.id", how="left")
# Pick the name columns by label instead of by position, so the selection does
# not silently point at other columns if either input gains or reorders columns.
kinases = kinases[
    [
        "kinase.id",
        "kinase.hgnc_name_x",
        "kinase.full_name_x",
        "kinase.klifs_name",
        "kinase.hgnc_name_y",
        "kinase.full_name_y",
    ]
]
kinases

Unnamed: 0,kinase.id,kinase.hgnc_name_x,kinase.full_name_x,kinase.klifs_name,kinase.hgnc_name_y,kinase.full_name_y
0,1,AKT1,,AKT1,AKT1,
1,2,AKT2,,AKT2,AKT2,
2,3,AKT3,,AKT3,AKT3,
3,4,CIT,,CRIK,CIT,
4,5,DMPK,,DMPK1,DMPK,
...,...,...,...,...,...,...
1122,1123,Pip5k1a,,Pip5k1a,Pip5k1a,
1123,1124,Map4k2,,Map4k2,Map4k2,
1124,1125,Pan3,,Pan3,Pan3,
1125,1126,Plk5,,Plk5,Plk5,


In [17]:
# Spot check: the merged row whose HGNC name (short listing) is MAPK14.
kinases.loc[kinases["kinase.hgnc_name_x"].eq("MAPK14")]

Unnamed: 0,kinase.id,kinase.hgnc_name_x,kinase.full_name_x,kinase.klifs_name,kinase.hgnc_name_y,kinase.full_name_y
248,249,MAPK14,,p38a,MAPK14,


In [18]:
# List rows whose KLIFS name equals one of the listed empty/placeholder values.
kinases[kinases["kinase.klifs_name"].isin(["", " ", 0, "0", None])]

Unnamed: 0,kinase.id,kinase.hgnc_name_x,kinase.full_name_x,kinase.klifs_name,kinase.hgnc_name_y,kinase.full_name_y


In [19]:
# Same placeholder check for the HGNC name taken from the short listing (`_x`).
kinases[kinases["kinase.hgnc_name_x"].isin(["", " ", 0, "0", None])]

Unnamed: 0,kinase.id,kinase.hgnc_name_x,kinase.full_name_x,kinase.klifs_name,kinase.hgnc_name_y,kinase.full_name_y


In [20]:
# Same placeholder check for the HGNC name taken from the detailed listing (`_y`).
kinases[kinases["kinase.hgnc_name_y"].isin(["", " ", 0, "0", None])]

Unnamed: 0,kinase.id,kinase.hgnc_name_x,kinase.full_name_x,kinase.klifs_name,kinase.hgnc_name_y,kinase.full_name_y
528,529,A6,,A6,,
529,530,A6r,,A6r,,


### Differing `kinase.full_name`?

In [21]:
# Rows where the full name from the short listing (`_x`) is not equal to the
# one from the detailed listing (`_y`), compared row by row.
diff1 = kinases.loc[
    kinases.apply(lambda row: row["kinase.full_name_x"] != row["kinase.full_name_y"], axis=1)
]
print(f"Number of differing full names: {diff1.shape[0]}")

Number of differing full names: 0


### Differing `kinase.hgnc_name`?

In [22]:
# Rows where the HGNC name from the short listing (`_x`) is not equal to the
# one from the detailed listing (`_y`), compared row by row.
diff3 = kinases.loc[
    kinases.apply(lambda row: row["kinase.hgnc_name_x"] != row["kinase.hgnc_name_y"], axis=1)
]
print(f"Number of differing HGNC names: {diff3.shape[0]}")
diff3.head()

Number of differing HGNC names: 2


Unnamed: 0,kinase.id,kinase.hgnc_name_x,kinase.full_name_x,kinase.klifs_name,kinase.hgnc_name_y,kinase.full_name_y
528,529,A6,,A6,,
529,530,A6r,,A6r,,


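__Note__: For kinases A6 and A6r, the HGNC column returned by `all_kinases` contains the values `A6` and `A6r`, whereas the HGNC column returned by `by_kinase_ids` is empty. Does `all_kinases` fill the HGNC column with non-HGNC entries in these cases?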

## Local kinase details

In [23]:
# Read the KLIFS export and overview tables from local files so they can be
# compared with the remote data.
# NOTE(review): `_LocalDatabaseGenerator` and the `_from_klifs_*` methods are
# underscore-prefixed (private by convention) — confirm whether a public entry
# point should be used instead.
from opencadd.databases.klifs.local import _LocalDatabaseGenerator
local = _LocalDatabaseGenerator()
# Both paths are relative, so they resolve against the current working directory.
klifs_export_path = "data/KLIFS_export.20201020.csv.zip"
klifs_export = local._from_klifs_export_file(klifs_export_path)
klifs_overview_path = "data/overview.20201020.csv.zip"
klifs_overview = local._from_klifs_overview_file(klifs_overview_path)
# Print the (rows, columns) shape of each table.
print(klifs_export.shape, klifs_overview.shape)

(11592, 15) (11592, 27)


In [24]:
# Order the export table by PDB ID, chain and alternate model, and renumber the
# index 0..n-1. Rebinding the name (rather than `inplace=True`) avoids mutating
# the frame created by the loading cell.
klifs_export = klifs_export.sort_values(
    ["structure.pdb", "structure.chain", "structure.alternate_model"],
    ignore_index=True,
)
klifs_export.head()

Unnamed: 0,kinase.names,kinase.family,kinase.group,structure.pdb,structure.chain,structure.alternate_model,species.klifs,ligand.name,ligand.pdb,ligand_allosteric.name,ligand_allosteric.pdb,structure.dfg,structure.ac_helix,kinase.hgnc_name,kinase.klifs_name
0,"[MAPK14, p38a]",MAPK,CMGC,1a9u,A,-,Human,4-[5-(4-FLUORO-PHENYL)-2-(4-METHANESULFINYL-PH...,SB2,-,-,in,out-like,MAPK14,p38a
1,[HCK],Src,TK,1ad5,A,-,Human,PHOSPHOAMINOPHOSPHONIC ACID-ADENYLATE ESTER,ANP,-,-,in,out,HCK,HCK
2,[HCK],Src,TK,1ad5,B,-,Human,PHOSPHOAMINOPHOSPHONIC ACID-ADENYLATE ESTER,ANP,-,-,in,out-like,HCK,HCK
3,[FGFR1],FGFR,TK,1agw,A,A,Human,3-[4-(1-FORMYLPIPERAZIN-4-YL)-BENZYLIDENYL]-2-...,SU2,-,-,in,out-like,FGFR1,FGFR1
4,[FGFR1],FGFR,TK,1agw,A,B,Human,3-[4-(1-FORMYLPIPERAZIN-4-YL)-BENZYLIDENYL]-2-...,SU2,-,-,in,out-like,FGFR1,FGFR1


In [26]:
# (rows, columns) of the export rows whose `kinase.names` entry has length 2.
klifs_export.loc[klifs_export["kinase.names"].apply(len).eq(2)].shape

(4867, 15)

In [27]:
# Order the overview table by PDB ID, chain and alternate model, and renumber
# the index 0..n-1. Rebinding the name (rather than `inplace=True`) avoids
# mutating the frame created by the loading cell.
klifs_overview = klifs_overview.sort_values(
    ["structure.pdb", "structure.chain", "structure.alternate_model"],
    ignore_index=True,
)
klifs_overview.head()

Unnamed: 0,species.klifs,kinase.klifs_name,structure.pdb,structure.alternate_model,structure.chain,ligand.pdb,ligand_allosteric.pdb,structure.rmsd1,structure.rmsd2,structure.qualityscore,structure.pocket,structure.resolution,structure.missing_residues,structure.missing_atoms,interaction.fingerprint,structure.fp_i,structure.fp_ii,structure.bp_i_a,structure.bp_i_b,structure.bp_ii_in,structure.bp_ii_a_in,structure.bp_ii_b_in,structure.bp_ii_out,structure.bp_ii_b,structure.bp_iii,structure.bp_iv,structure.bp_v
0,Human,p38a,1a9u,-,A,SB2,-,0.828,2.186,8.0,SPVGSGAYGSVCAVAVKKLRTYRELRLLKHMKENVIGLLDVYLVTH...,2.5,0,0,0000000000000000000000000000000000000000000000...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Human,HCK,1ad5,-,A,ANP,-,0.816,2.141,9.6,KKLGAGQFGEVWMVAVKTMAFLAEANVMKTLQDKLVKLHAVYIITE...,2.6,0,4,0000000000000010000000000000000000000000000000...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Human,HCK,1ad5,-,B,ANP,-,0.817,2.141,9.6,KKLGAGQFGEVWMVAVKTMAFLAEANVMKTLQDKLVKLHAVYIITE...,2.6,0,4,0000000000000010000001000000000000000000000000...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Human,FGFR1,1agw,A,A,SU2,-,0.831,2.001,7.6,KPLG_____QVVLVAVKMLDLISEMEMMKMIGKNIINLLGAYVIVE...,2.4,5,4,0000000000000010000000000000000000000000000000...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Human,FGFR1,1agw,B,A,SU2,-,0.831,2.001,7.6,KPLG_____QVVLVAVKMLDLISEMEMMKMIGKNIINLLGAYVIVE...,2.4,5,4,0000000000000010000000000000000000000000000000...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Kinase name mismatches in local overview and export files?

In [28]:
# Show export rows whose KLIFS name is not equal to the overview's KLIFS name
# at the same index label.
# NOTE(review): this presumes row i describes the same structure in both
# frames. Both were sorted by the same three structure columns with
# `ignore_index=True` in earlier cells — confirm those keys are unique per row.
klifs_export[klifs_export["kinase.klifs_name"] != klifs_overview["kinase.klifs_name"]]

Unnamed: 0,kinase.names,kinase.family,kinase.group,structure.pdb,structure.chain,structure.alternate_model,species.klifs,ligand.name,ligand.pdb,ligand_allosteric.name,ligand_allosteric.pdb,structure.dfg,structure.ac_helix,kinase.hgnc_name,kinase.klifs_name


### Kinase name mismatches locally and remotely?

In [29]:
# Export rows whose KLIFS name does not occur among the remote KLIFS names.
klifs_export.loc[
    ~klifs_export["kinase.klifs_name"].isin(kinases["kinase.klifs_name"].tolist())
]

Unnamed: 0,kinase.names,kinase.family,kinase.group,structure.pdb,structure.chain,structure.alternate_model,species.klifs,ligand.name,ligand.pdb,ligand_allosteric.name,ligand_allosteric.pdb,structure.dfg,structure.ac_helix,kinase.hgnc_name,kinase.klifs_name


### Kinase HGNC name mismatches locally and remotely?

In [30]:
# Export rows whose HGNC name does not occur among the remote HGNC names
# (short listing, `_x`).
klifs_export.loc[
    ~klifs_export["kinase.hgnc_name"].isin(kinases["kinase.hgnc_name_x"].tolist())
]

Unnamed: 0,kinase.names,kinase.family,kinase.group,structure.pdb,structure.chain,structure.alternate_model,species.klifs,ligand.name,ligand.pdb,ligand_allosteric.name,ligand_allosteric.pdb,structure.dfg,structure.ac_helix,kinase.hgnc_name,kinase.klifs_name
