# Poincare Maps projection on Globins starting from scratch

In [2]:
import os
import numpy as np
import pandas as pd
#import plotly
#import plotly.express as px
from pplots_new import read_embeddings, plot_embedding, plot_embedding_interactive, rotate, get_colors
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

## Data preparation

The user must provide a path to the input file in .mfasta format and path to the output directory for intermediate file storage:

In [3]:
path_to_PM="/home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA"
mfasta = path_to_PM+"/examples/globins/glob.mfasta" # full path to the input MSA in mfasta format
path_out = path_to_PM+"/examples/globins_test" # a directory to write resutling files
out_name = "globins" # name given to the output files

All scripts necessary for data preparation are located in scirpts/data_preparation:

In [4]:
path_prep_scripts = path_to_PM+"/scripts/prepare_data"
print(path_prep_scripts)

/home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/scripts/prepare_data


Data preparation consists in `.mfasta` cleaning according to a gap threshold and translation of each sequence to the PSSM profile:

In [10]:
gapth = "0.9" # threshold for filtering gapped positions
prep_parameters = path_prep_scripts + " " + mfasta+" "+path_out+" "+out_name+" "+gapth # parameters for data preparation
# print(prep_parameters)
print(path_prep_scripts+"/create_projection.sh " + prep_parameters)
os.system(path_prep_scripts+"/create_projection.sh " + prep_parameters)
print("Output files ready for projection are written to: "+path_out+"/fasta"+gapth)

/home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/scripts/prepare_data/create_projection.sh /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/scripts/prepare_data /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/globins/glob.mfasta /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/globins_test globins 0.9
Output files ready for projection are written to: /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/globins_test/fasta0.9


## Data projection using Poincaré disk
You can change the parameters of the projection here:

In [23]:
knn = 5
gamma = 2
sigma = 1
batchs = 4
epochs = 500

Then, the following command creates a projection of encoded sequences to a Poincaré disk:

In [24]:
path_to_build_PM = path_to_PM+"/scripts/build_poincare_map"
pm_command = "python "+path_to_build_PM+"/main.py --input_path "+path_out+"/fasta"+gapth+" --output_path "+path_out+"/projections/ --epochs " + str(epochs)
print(pm_command)
os.system(pm_command)
# --knn knn --gamma gamma --batchsize 4 --epochs 1000 (--rotate)

python /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/scripts/build_poincare_map/main.py --input_path /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/globins_test/fasta0.9 --output_path /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/globins_test/projections/ --epochs 500


0

Parameters by default are provided in comments. The output files are then written to the following file:

In [13]:
path_embedding = path_out+"/projections/PM%1.0fsigma=%2.2fgamma=%2.2fcosinepca=0_seed0.csv"%(knn,sigma,gamma)
print(path_embedding)

/home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/globins_test/projections/PM5sigma=1.00gamma=2.00cosinepca=0_seed0.csv


## Projection visualization
One can visualieze the resulting projection using any convenient coloring. To do so, the user shoud provide a `.csv` file with each line corresponding to a protein:

In [14]:
path_annotation = path_to_PM+"/visualization/data/globin_colors_new.csv" # path to annotation file
globin_df = pd.read_csv(path_annotation, index_col = 0)
globin_df

Unnamed: 0_level_0,tree1,tree2,tree3,tree4,full_name,short_name,full_species,short_species,evo_distance
proteins_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,5,3,3,2,"Globin C, coelomic",Globin,Caudina arenicola,Caudina,2.101128
2,-1,28,18,10,Globin (Fragment),Globin,Stegodyphus mimosarum,Stegodyphus,1.757280
3,-1,27,18,10,Uncharacterized protein,Uncharacterized,Strigamia maritima,Strigamia,2.041487
4,37,27,18,10,Uncharacterized protein,Uncharacterized,Scylla olivacea,Scylla,2.318169
5,37,27,18,10,Uncharacterized protein,Uncharacterized,Scylla olivacea,Scylla,2.170775
...,...,...,...,...,...,...,...,...,...
248,14,10,9,3,Uncharacterized protein,Uncharacterized,Strongylocentrotus purpuratus,Strongylocentrotus,2.586926
249,-1,10,9,3,Uncharacterized protein,Uncharacterized,Strongylocentrotus purpuratus,Strongylocentrotus,2.519191
250,14,10,9,3,Uncharacterized protein,Uncharacterized,Strongylocentrotus purpuratus,Strongylocentrotus,2.276065
251,14,10,9,3,Uncharacterized protein,Uncharacterized,Strongylocentrotus purpuratus,Strongylocentrotus,2.867471


A user can also create a custom color palette:

In [15]:
# construction of palette 
globin_palette = {-1 : "#c7c7c7", "OTHER": "#c7c7c7", "NA" : "#c7c7c7", "Uncharacterized" : "#c7c7c7", "root": "#000000",
                 "Nematostella": "#ad288b", "Amphimedon": "#fdb7fd", "Micromonas": "#4d9b03", "Saccoglossus":"#b0ffe8", "Trichoplax":"#e9bd6b" } 

globin_others_eukaryota = ["Amphimedon", # porifera 
 "Saccoglossus", # hemichordata
 "Trichoplax", # placozoa
 "Micromonas", # viridiplantae
 "Nematostella" ] # cnidaria  
 
globin_arthropoda = ["Stegodyphus", "Strigamia", "Scylla", "Carcinus",  "Ixodes", "Zootermopsis", "Coptotermes","Oryctes", "Tetranychus", 
                     "Acyrthosiphon", "Nemastomella","Apis","Solenopsis", "Habropoda", "Fopius", "Cerapachys", "Camponotus", "Nasonia", 
                     "Acromyrmex", "Tribolium", "Sarcoptes", "Lutzomyia", "Rhodnius", "Triatoma", "Cherax", "Chironomus", "Polypedilum", 
                     "Pediculus", "Daphnia", "Harpegnathos", "Artemia", "Parartemia", "Lepeophtheirus", "Glossina", "Operophtera"] 
  
globin_mollusca = ["Arion", "Crassostrea", "Lottia", "Anadara", "Spisula", "Barbatia", "Nassarius", "Cerithidea", "Nerita", "Phacoides",
                   "Octopus", "Biomphalaria"] 
globin_annelida = ["Capitella", "Metaphire"]  
globin_chordata = ["Scleropages", "Branchiostoma", "Xenopus", "Oreochromis", "Nothobranchius", "Gasterosteus", "Bos", "Iguana", "Oryzias", 
                   "Anas", "Tetraodon", "Takifugu", "Xiphophorus", "Larimichthys", "Oncorhynchus", "Alligator", "Astyanax", "Latimeria",
                   "Ictalurus", "Chaenocephalus", "Danio", "Cyprinus", "Callorhinchus", "Lepisosteus", "Chelonia", "Poecilia", "Ciona", 
                   "Petromyzon", "Lethenteron", "Cavia", "Scalopus", "Neotoma", "Pelodiscus","Fundulus", "Oryctolagus", "Anolis",
                   "Salmo", "Amazona", "Ornithorhynchus", "Sarcophilus", "Gallus", "Nomascus", "Tupaia"] 
 
globin_echinodermata = ["Caudina", "Strongylocentrotus", "Hemipholis", "Ophiactis"] 
globin_bacteria = ["Gemmatirosa", "Halothiobacillus"] 
globin_nematoda = ["Ancylostoma", "Necator", "Caenorhabditis", "Haemonchus", "Ascaris", "Trichinella", "Globodera", "Strongyloides", "Angiostrongylus"] 
globin_fungi = ["Blastobotrys", "Cyberlindnera"] 


#globin_palette.update(dict.fromkeys(globin_others_eukaryota, "#22ba69")) 
globin_palette.update(dict.fromkeys(globin_arthropoda, "#0b237c"))
globin_palette.update(dict.fromkeys(globin_mollusca, "#512ff8")) #512ff8 401bf9
globin_palette.update(dict.fromkeys(globin_annelida, "#a191f3"))
globin_palette.update(dict.fromkeys(globin_chordata, "#26c9d9"))
globin_palette.update(dict.fromkeys(globin_echinodermata, "#086b75"))
globin_palette.update(dict.fromkeys(globin_bacteria, "#f10000"))
globin_palette.update(dict.fromkeys(globin_nematoda, "#5d78e3"))
globin_palette.update(dict.fromkeys(globin_fungi, "#a0e361"))

In [17]:
df5 = read_embeddings(path_embedding, path_annotation, withroot=False)

result:                    pm1       pm2 tree1 tree2 tree3 tree4  \
proteins_id                                               
1           -0.266668  0.361557     5     3     3     2   
2           -0.120496  0.269804    -1    28    18    10   
3           -0.161223  0.454501    -1    27    18    10   
4           -0.226452  0.602182    37    27    18    10   
5           -0.164180  0.550417    37    27    18    10   
...               ...       ...   ...   ...   ...   ...   
248          0.354382 -0.730680    14    10     9     3   
249          0.399043 -0.751967    -1    10     9     3   
250          0.384139 -0.707005    14    10     9     3   
251          0.442477 -0.715055    14    10     9     3   
252         -0.509467 -0.685960    -1    25    15     8   

                           full_name       short_name  \
proteins_id                                             
1                 Globin C, coelomic           Globin   
2                  Globin (Fragment)           Globi

Here follow several examples of globin family visualization.

# Globins by name

## KNN 5 gamma 2 (batchsize 4 epochs 1000)

In [1]:
trace1 = plot_embedding_interactive(df5, 
                                    labels_name = 'short_name',
                                    show_text=True, 
                                    color_palette = globin_palette, 
                                    title = "PM projection on globins according to the first term of protein name - KNN 5 gamma 2 batchsize 4 epochs 1000", 
                                    fontsize = 11
                              )
trace1.show()

NameError: name 'plot_embedding_interactive' is not defined

In [22]:
trace1 = plot_embedding_interactive(df5, 
                                    labels_name = 'tree3',
                                    show_text=True, 
                                    color_palette = globin_palette,
                                    title = "PM projection on globins according to the tree cut \"tree4\" - KNN 5 gamma 2 batchsize 4 epochs 500",
                                    fontsize = 11
                              )
trace1.show()

Index(['tree1', 'tree2', 'tree3', 'tree4', 'full_name', 'short_name',
       'full_species', 'short_species', 'evo_distance'],
      dtype='object')
