# Poincare Maps projection on Kinases starting from scratch

In [1]:
import os
import numpy as np
import pandas as pd
#import plotly
#import plotly.express as px
from pplots_new import read_embeddings, plot_embedding, plot_embedding_interactive, rotate, get_colors
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

## Data preparation

The user must provide a path to the input file in .mfasta format and path to the output directory for intermediate file storage:

In [3]:
path_to_PM = "/home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA"
mfasta = path_to_PM+"/examples/globins_try_large/glob_filter4.mfasta" # full path to the input MSA in mfasta format
path_out = path_to_PM+"/examples/globins_try_large_test" # a directory to write resutling files
out_name = "globins_large" # name given to the output files
path_to_figures = "/home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/figures"

All scripts necessary for data preparation are located in scirpts/data_preparation:

In [4]:
path_prep_scripts = path_to_PM+"/scripts/prepare_data"

Data preparation consists in `.mfasta` cleaning according to a gap threshold and translation of each sequence to the PSSM profile:

In [5]:
gapth = "0.9" # threshold for filtering gapped positions
prep_parameters = path_prep_scripts+" "+mfasta+" "+path_out+" "+out_name+" "+gapth # parameters for data preparation
# print(prep_parameters)
# print(path_prep_scripts+"/create_projection.sh " + prep_parameters)

In [6]:
os.system(path_prep_scripts+"/create_projection.sh " + prep_parameters)
print("Output files ready for projection are written to: "+path_out+"/fasta"+gapth)

Output files ready for projection are written to: /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/globins_try_large_test/fasta0.9


## Data projection using Poincaré disk
You can change the parameters of the projection here:

In [7]:
knn = "5"
gamma = "2.00"
sigma = "1.00"
batchs = "4"
cospca = "0"
epochs = "1000"
seed = "0"

Then, the following command creates a projection of encoded sequences to a Poincaré disk:

In [8]:
path_to_build_PM = path_to_PM+"/scripts/build_poincare_map"
pm_command = "python "+path_to_build_PM+"/main.py --input_path "+path_out+"/fasta"+gapth+" --output_path "+path_out+"/projections/ --gamma "+gamma+" --pca "+ cospca+" --epochs "+epochs+" --seed "+seed
print(pm_command)
os.system(pm_command)

python /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/scripts/build_poincare_map/main.py --input_path /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/globins_try_large_test/fasta0.9 --output_path /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/globins_try_large_test/projections/ --gamma 2.00 --pca 0 --epochs 1000 --seed 0


0

Parameters by default are provided in comments. The output files are then written to the following file:

In [9]:
path_embedding = path_out+"/projections/PM"+knn+"sigma="+sigma+"gamma="+gamma+"cosinepca="+cospca+"_seed"+seed+".csv"
print(path_embedding)

/home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/globins_try_large_test/projections/PM5sigma=1.00gamma=2.00cosinepca=0_seed0.csv


## Projection visualization
One can visualieze the resulting projection using any convenient coloring. To do so, the user shoud provide a `.csv` file with each line corresponding to a protein:

In [15]:
path_annotation = path_to_PM+"/examples/globins_try_large/globs-species.csv" # path to annotation file
kinase_df = pd.read_csv(path_annotation, index_col = 0)
kinase_df

Unnamed: 0_level_0,Short_name,Species,Short_species
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Globin C, coelomic",Globin,Caudina arenicola,Caudina
Globin (Fragment),Globin,Stegodyphus mimosarum,Stegodyphus
Uncharacterized protein,Uncharacterized,Strigamia maritima,Strigamia
Uncharacterized protein,Uncharacterized,Scylla olivacea,Scylla
Uncharacterized protein,Uncharacterized,Scylla olivacea,Scylla
...,...,...,...
Uncharacterized protein,Uncharacterized,Roseiflexus sp.,Roseiflexus
Uncharacterized protein,Uncharacterized,Herpetosiphon aurantiacus,Herpetosiphon
Uncharacterized protein,Uncharacterized,Pseudomonas sp.,Pseudomonas
Globin coupled histidine kinase,Globin,Serratia fonticola,Serratia


A user can also create a custom color palette:

In [13]:
# construction of palette 
#kinase_palette = {-1 : "#c7c7c7", "OTHER": "#c7c7c7", "None" :"#c7c7c7", "NA" : "#c7c7c7", "Uncharacterized" : "#c7c7c7", "root": "#000000",
#                 "TYR": "#bd065f", "CMGC": "#d5c203", "TKL": "#997e73","STE": "#80b412", # kinase groups 
#                 "CK1": "#0dbae9", "AGC": "#00bba1", "CAMK":  "#1f6ed4", "NEK": "#8ce4fa", "RGC":"#f59a62"}

In [16]:
df = read_embeddings(path_embedding, path_annotation, withroot=False)

result:                    pm1       pm2                             Name  \
proteins_id                                                        
1           -0.024999  0.535806               Globin C, coelomic   
2           -0.265485  0.458785                Globin (Fragment)   
3           -0.157738  0.380762          Uncharacterized protein   
4           -0.114742  0.350055          Uncharacterized protein   
5           -0.114837  0.349793          Uncharacterized protein   
...               ...       ...                              ...   
3154        -0.816356 -0.560106          Uncharacterized protein   
3155        -0.814966 -0.560571          Uncharacterized protein   
3156        -0.813099 -0.559961          Uncharacterized protein   
3157        -0.818654 -0.558018  Globin coupled histidine kinase   
3158        -0.813948 -0.558028  Globin coupled histidine kinase   

                  Short_name                    Species  Short_species  
proteins_id                      

Here follow several examples of kinase family visualization.

## Color by globin name

In [20]:
# construction of palette 
globin_palette = {-1 : "#c7c7c7", "OTHER": "#c7c7c7", "NA" : "#c7c7c7", "Uncharacterized" : "#c7c7c7", "root": "#000000",
                 "Nematostella": "#ad288b", "Amphimedon": "#fdb7fd", "Micromonas": "#4d9b03", "Saccoglossus":"#b0ffe8", "Trichoplax":"#e9bd6b" } 

globin_others_eukaryota = ["Amphimedon", # porifera 
 "Saccoglossus", # hemichordata
 "Trichoplax", # placozoa
 "Micromonas", # viridiplantae
 "Nematostella" ] # cnidaria  
 
globin_arthropoda = ["Stegodyphus", "Strigamia", "Scylla", "Carcinus",  "Ixodes", "Zootermopsis", "Coptotermes","Oryctes", "Tetranychus", 
                     "Acyrthosiphon", "Nemastomella","Apis","Solenopsis", "Habropoda", "Fopius", "Cerapachys", "Camponotus", "Nasonia", 
                     "Acromyrmex", "Tribolium", "Sarcoptes", "Lutzomyia", "Rhodnius", "Triatoma", "Cherax", "Chironomus", "Polypedilum", 
                     "Pediculus", "Daphnia", "Harpegnathos", "Artemia", "Parartemia", "Lepeophtheirus", "Glossina", "Operophtera"] 
  
globin_mollusca = ["Arion", "Crassostrea", "Lottia", "Anadara", "Spisula", "Barbatia", "Nassarius", "Cerithidea", "Nerita", "Phacoides",
                   "Octopus", "Biomphalaria"] 
globin_annelida = ["Capitella", "Metaphire"]  
globin_chordata = ["Scleropages", "Branchiostoma", "Xenopus", "Oreochromis", "Nothobranchius", "Gasterosteus", "Bos", "Iguana", "Oryzias", 
                   "Anas", "Tetraodon", "Takifugu", "Xiphophorus", "Larimichthys", "Oncorhynchus", "Alligator", "Astyanax", "Latimeria",
                   "Ictalurus", "Chaenocephalus", "Danio", "Cyprinus", "Callorhinchus", "Lepisosteus", "Chelonia", "Poecilia", "Ciona", 
                   "Petromyzon", "Lethenteron", "Cavia", "Scalopus", "Neotoma", "Pelodiscus","Fundulus", "Oryctolagus", "Anolis",
                   "Salmo", "Amazona", "Ornithorhynchus", "Sarcophilus", "Gallus", "Nomascus", "Tupaia"] 
 
globin_echinodermata = ["Caudina", "Strongylocentrotus", "Hemipholis", "Ophiactis"] 
globin_bacteria = ["Gemmatirosa", "Halothiobacillus"] 
globin_nematoda = ["Ancylostoma", "Necator", "Caenorhabditis", "Haemonchus", "Ascaris", "Trichinella", "Globodera", "Strongyloides", "Angiostrongylus"] 
globin_fungi = ["Blastobotrys", "Cyberlindnera"] 


#globin_palette.update(dict.fromkeys(globin_others_eukaryota, "#22ba69")) 
globin_palette.update(dict.fromkeys(globin_arthropoda, "#0b237c"))
globin_palette.update(dict.fromkeys(globin_mollusca, "#512ff8")) #512ff8 401bf9
globin_palette.update(dict.fromkeys(globin_annelida, "#a191f3"))
globin_palette.update(dict.fromkeys(globin_chordata, "#26c9d9"))
globin_palette.update(dict.fromkeys(globin_echinodermata, "#086b75"))
globin_palette.update(dict.fromkeys(globin_bacteria, "#f10000"))
globin_palette.update(dict.fromkeys(globin_nematoda, "#5d78e3"))
globin_palette.update(dict.fromkeys(globin_fungi, "#a0e361"))

In [23]:
trace1 = plot_embedding_interactive(df, 
                                    labels_name = 'Short_name',#'1_Group',#'2_Gene', 
                                    show_text=True, 
                                    color_palette = globin_palette,
                                    title = "Poinicaré Map projection colored by kinase groups", 
                                    fontsize = 10,
                              )
trace1.write_image(path_to_figures+"/Globins_Short_name.pdf")
trace1.show()


Index(['Name', 'Short_name', 'Species', 'Short_species'], dtype='object')


You can highlight several points by using plotly iterface. Provide the corresponding column name in `second_labels_name` and a list of labels to show in `labels_text`:

In [17]:
trace2 = plot_embedding_interactive(df, 
                                    labels_name = '1_Group',#'1_Group',#'2_Gene', 
                                    show_text=True, 
                                    color_palette = kinase_palette,
                                    title = "Poinicaré Map projection colored by kinase groups", 
                                    fontsize = 10,
                                    second_labels_name = "4_Uni_entry", 
                                    #labels_text = ["ST17B_HUMAN", "MYLK2_HUMAN", "KALRN_HUMAN"]
                                    labels_text = ["CLK3_HUMAN", "SRPK3_HUMAN", "HIPK1_HUMAN","CSK22_HUMAN"] # some CMGC kinase
                                    #second_labels_name = "2_Gene",
                                    #labels_text = ["RPS6KA1_1", "RPS6KA2_1", "RPS6KA5_1", "RPS6KB2", "RPS6KA1_2", "RPS6KA2_2",  "RPS6KA3_2",  "RPS6KA5_2"] # some sequences of AGC first domain and CAMK second domain (mentionned in the article) to label
                                    
                              )
trace2.show()

Index(['1_Group', '2_Gene', '3_HGNC', '4_Uni_entry', '5_Uni_acc',
       '6_Domain_begin', '7_Domain_end', '8_Domain_length',
       '9_Largest_insert_length', '10_PDB_validation',
       '11_Conformational_state', '12_Dihedral_state', '13_Group_in_Uni',
       '14_Group_in_Manning', '15_Synonymn', 'evo_distance', 'decile_domain'],
      dtype='object')


You can also compare different colorings by assigning the `labels_name` variable a name of the desired column. Here we compare the kinase group coloring proposed in the article to the Uniprot name of kinase:

In [20]:
trace3 = plot_embedding_interactive(df, 
                                    labels_name = '13_Group_in_Uni',#'1_Group',#'2_Gene', 
                                    show_text=True, 
                                    color_palette = kinase_palette,
                                    title = "PM projection on kinases by kinase groups in Uniprot - KNN 5 gamma 3 batchsize 4 epochs 1000",
                                    fontsize = 10,
                                    )
trace3.write_image(path_to_figures+"/Kinases_by_UniGroup.pdf")
trace3.show()

Index(['1_Group', '2_Gene', '3_HGNC', '4_Uni_entry', '5_Uni_acc',
       '6_Domain_begin', '7_Domain_end', '8_Domain_length',
       '9_Largest_insert_length', '10_PDB_validation',
       '11_Conformational_state', '12_Dihedral_state', '13_Group_in_Uni',
       '14_Group_in_Manning', '15_Synonymn', 'evo_distance', 'decile_domain'],
      dtype='object')


...and to the classification provided in Manning et al. study:

In [22]:
trace4 = plot_embedding_interactive(df, 
                                    labels_name = '14_Group_in_Manning',#'1_Group',#'2_Gene', 
                                    show_text=True, 
                                    color_palette = kinase_palette,
                                    second_labels_name = "2_Gene", 
                                    title = "Poincaré Maps projection by kinases groups according to Manning",
                                    fontsize = 10, 
                                    #labels_text = ["MAP3K7, MAP3K9", "MAP3K10", "MAP3K11", "MAP3K12", "MAP3K13", "MAP3K20", "MAP3K21"] # STE dans TKL
                                    #labels_text = ["AURKA", "AURKB", "AURKC", "CAMKK1", "CAMKK2", "PLK1", "PLK2", "PLK3", "PLK4"] # 10 kinases pourraient être CAMK
                                    
                              )
trace4.write_image(path_to_figures+"/Kinases_by_ManningGroup.pdf")
trace4.show()

Index(['1_Group', '2_Gene', '3_HGNC', '4_Uni_entry', '5_Uni_acc',
       '6_Domain_begin', '7_Domain_end', '8_Domain_length',
       '9_Largest_insert_length', '10_PDB_validation',
       '11_Conformational_state', '12_Dihedral_state', '13_Group_in_Uni',
       '14_Group_in_Manning', '15_Synonymn', 'evo_distance', 'decile_domain'],
      dtype='object')
