[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/klanita/PoincareMSA/blob/master/PoincareMSA_colab.ipynb)

<img src="https://github.com/klanita/PoincareMSA/blob/master/.github/PoincareMSA_small_logo.png?raw=true" height="100" style="height:100px;margin-left: 0px;">


# Poincaré maps for visualization of large protein families

PoincareMSA builds a projection of a protein multiple sequence alignment (MSA) on a Poincaré disk. The proximity of the points to the disk center corresponds to their hierarchy and correlates with the proximity of the proteins to the root of the phylogenetic tree. Thus, the most central points often correspond to the ancestor proteins, and the proteins located close to the border correspond to the leaves of the phylogenetic tree.

# Notebook initialization

### Load dependencies

In [12]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

#Import visualization functions
from scripts.visualize_projection.pplots_new import read_embeddings, plot_embedding, plot_embedding_interactive, rotate, get_colors
%matplotlib inline

# Create optional variables.
# An empty annotation path means "no annotation file provided"; the data
# preparation cell tests `if not path_annotation` to decide whether a dummy
# annotation file has to be generated.
path_annotation = ""

# Data import

In [13]:
# OPTIONS =================================================
mfasta = "examples/globins/glob.mfasta"
path_annotation = "examples/globins/globin_colors_new.csv"
#==========================================================


def count_fasta_sequences(path):
    """Return the number of records in a FASTA-like file.

    A record is any line whose first character is ">".
    """
    with open(path, "r") as f_in:
        return sum(1 for line in f_in if line.startswith(">"))


def load_annotation(path, expected_rows):
    """Read the annotation .csv and check it has one row per sequence.

    Raises:
        ValueError: if the file cannot be parsed as .csv, or if its number of
            rows differs from `expected_rows`.
    """
    # Only the parsing step is guarded: wrapping the length check in the same
    # handler would turn a row-count mismatch into a misleading
    # "not in .csv format" error.
    try:
        annotation = pd.read_csv(path)
    except Exception as err:
        raise ValueError("Annotation file is not in .csv format.") from err
    if len(annotation) != expected_rows:
        raise ValueError("Annotation file doesn't match the .mfasta file length.")
    return annotation


# Check files
# mfasta
nb_seq = 0
if os.path.isfile(mfasta):
    nb_seq = count_fasta_sequences(mfasta)
    print(f"\nNumber of sequences found: {nb_seq}.")
else:
    print(f"File {mfasta} not found.")

# Annotations
if os.path.isfile(path_annotation):
    df_annotation = load_annotation(path_annotation, nb_seq)
    print("\nAnnotation file correctly loaded.")
    annotation_names = list(df_annotation.columns)
    print(f"{len(annotation_names)} annotations found: {annotation_names}.")
else:
    print(f"File {path_annotation} not found")


Number of sequences found: 252.

Annotation file correctly loaded.
11 annotations found: ['proteins_id', 'tree1', 'tree2', 'tree3', 'tree4', 'full_name', 'short_name', 'full_species', 'short_species', 'evo_distance', 'Color_species'].


# Settings

In [14]:
# OPTIONS =================================================
# Job name: used as the output location for the prepared data and the
# projections ("<out_name>/fasta<gapth>" and "<out_name>/projections/").
out_name = "globins_data"
#----------------------------------------------------------
# Threshold for filtering gapped positions (passed to the data preparation
# script and used in the name of the prepared-data folder).
gapth = 0.9 
#----------------------------------------------------------
# Projection parameters
# knn, gamma, cospca, epochs and seed are forwarded to
# scripts/build_poincare_map/main.py as --knn, --gamma, --pca, --epochs and
# --seed respectively.
# NOTE(review): `sigma` is only used to build the expected name of the
# embedding file and `batchs` is not used by any cell of this notebook;
# neither is passed to main.py, so changing them here presumably has no
# effect on the projection itself - confirm against main.py.
knn = 5
gamma = 2
sigma = 1
cospca = 0
batchs = 4
epochs = 1000
seed = 0
#==========================================================

# Data preparation and data projection using Poincaré disk

In [15]:
# 1. Data preparation
# Data preparation consists in `.mfasta` cleaning according to a gap threshold and translation of each sequence to the PSSM profile.
print("1. Data preparation")
prep_parameters = "scripts/prepare_data" + " " + mfasta + " " + out_name + " " + out_name + " " + str(gapth)
bash_projection = "bash scripts/prepare_data/create_projection.sh " + prep_parameters
!{bash_projection}

print("\n2. Data projection using Poincaré disk")
# 2. Data projection using Poincaré disk
#This step creates a projection of encoded sequences to a Poincaré disk.
bash_pm = "python3 "+ "scripts/build_poincare_map/main.py --input_path " + out_name + "/fasta" + str(gapth) + " --output_path " + out_name + "/projections/ --gamma "+ str(gamma) +" --pca "+ str(cospca) + " --epochs "+ str(epochs) +" --seed "+ str(seed) + " --knn " + str(knn)
!{bash_pm}

print("\n3. Format data for visualization")
# 3. Format data for visualization
#Check that an annotation file was provided. Create a dummy one instead
if not path_annotation:
    df_annotation = pd.DataFrame(np.full(nb_seq, "-", dtype=object), columns=["default"])
    df_annotation.to_csv("dummy_annotation.csv", index=False)
    path_annotation = "dummy_annotation.csv"
    annotation_names = ["default"]

path_embedding = f"{out_name}/projections/PM{knn:1.0f}sigma={sigma:2.2f}gamma={gamma:2.2f}cosinepca={cospca:1.0f}_seed{seed:1.0f}.csv"
df_embedding = read_embeddings(path_embedding, path_annotation, withroot=False)

1. Data preparation
Input file: examples/globins/glob.mfasta
Name of the protein family: glob
80 X aa replaced by gaps in 252 sequences
filter_gaps finished for examples/globins/glob.mfasta
mfasta2fasta finished for globins_data/globins_data.clean0.9.mfasta
80 X aa replaced by gaps in 252 sequences

2. Data projection using Poincaré disk
CUDA: True
252 proteins found in folder globins_data/fasta0.9.
No root detected
Prepare data: tensor construction
Prepare data: successfully terminated
Computing laplacian...
Laplacian computed in 0.04 sec
Computing RFA...
RFA computed in 0.00 sec
Starting training...
loss: 0.35447:  55%|█████████████▊           | 550/1000 [02:59<03:02,  2.47it/s]
Stopped at epoch 550
loss: 0.35447:  55%|█████████████▊           | 550/1000 [02:59<02:26,  3.07it/s]
PM computed in 179.24 sec

loss = 3.545e-01
time = 2.989 min

3. Format data for visualization


# Projection visualization

In [16]:
# Custom palette for the globin example: one hex color per label.
globin_palette = dict(
    Echinodermata="#086b75",
    Arthropoda="#0b237c",
    Mollusca="#512ff8",
    Annelida="#a191f3",
    Chordata="#26c9d9",
    Cnidaria="#ad288b",
    Porifera="#fdb7fd",
    Placozoa="#e9bd6b",
    Bacteria="#f10000",
    Nematoda="#5d78e3",
    Hemichordata="#b0ffe8",
    Fungi="#a0e361",
    Viridiplantae="#4d9b03",
)

In [17]:
# OPTIONS =================================================
title = "PM projection of globins colored according family name"
#----------------------------------------------------------
# Select the coloring from annotation .csv file:
labels_name = "Color_species"
# Select classes to label among the "labels_name" column (comma separated list):
labels_text = ""
show_text = False
#----------------------------------------------------------
# Use a custom color palette:
color_palette = globin_palette
use_custom_palette = True
#==========================================================


# Check projection visualization parameters
# Labels name: an empty name maps to the "default" column when the dummy
# annotation file is in use, to no coloring (None) otherwise; any other name
# must be one of the annotation columns.
if labels_name == "" and path_annotation == "dummy_annotation.csv":
    labels_name = "default"
elif labels_name == "":
    labels_name = None
elif labels_name not in annotation_names:
    raise NameError(f"labels_name {labels_name} is not in the availables annotations.\nAvailables annotations: {annotation_names}")

# Labels text: split the comma separated string into a list of class names.
# An empty selection is normalised to [""] so it is never cast to the dtype
# of the annotation column (casting "" to a numeric dtype would fail).
if labels_text:
    try:
        labels_text = [s.strip() for s in labels_text.split(",")]
    except (AttributeError, TypeError):
        # Best effort: report the problem and keep the value as provided.
        print("Error: label_text field is not a valid list.")
else:
    labels_text = [""]

# Convert labels_text to the dtype of the selected annotation column so the
# requested classes compare equal to the values stored in that column.
if labels_name and labels_name != "default" and labels_text != [""]:
    # Looked up outside the try block so the error message below can always
    # reference it.
    labels_text_dtype = df_annotation[labels_name].dtypes
    try:
        labels_text = list(np.array(labels_text).astype(labels_text_dtype))
    except Exception as err:
        raise TypeError(f'"labels_text" is not compatible with "{labels_name}" data format ({labels_text_dtype}).') from err

if not use_custom_palette:
    color_palette = None

# Plot graph
fig = plot_embedding_interactive(df_embedding,
                                 labels_name = labels_name,
                                 show_text = show_text,
                                 labels_text = labels_text,
                                 color_palette = color_palette,
                                 title = title,
                                 fontsize = 11)
fig.show()

In [18]:
# OPTIONS =================================================
title = "PM projection of globins colored by phylogenetic tree cluster"
#----------------------------------------------------------
# Select the coloring from annotation .csv file:
labels_name = "tree3"
# Select classes to label among the "labels_name" column (comma separated list):
labels_text = ""
show_text = False
#----------------------------------------------------------
# Use a custom color palette:
color_palette = None
use_custom_palette = False
#==========================================================


# Check projection visualization parameters
# Labels name: an empty name maps to the "default" column when the dummy
# annotation file is in use, to no coloring (None) otherwise; any other name
# must be one of the annotation columns.
if labels_name == "" and path_annotation == "dummy_annotation.csv":
    labels_name = "default"
elif labels_name == "":
    labels_name = None
elif labels_name not in annotation_names:
    raise NameError(f"labels_name {labels_name} is not in the availables annotations.\nAvailables annotations: {annotation_names}")

# Labels text: split the comma separated string into a list of class names.
# An empty selection is normalised to [""] and skipped by the dtype
# conversion below.
if labels_text:
    try:
        labels_text = [s.strip() for s in labels_text.split(",")]
    except (AttributeError, TypeError):
        # Best effort: report the problem and keep the value as provided.
        print("Error: label_text field is not a valid list.")
else:
    labels_text = [""]

# Convert labels_text to the dtype of the selected annotation column so the
# requested classes compare equal to the values stored in that column.
if labels_name and labels_name != "default" and labels_text != [""]:
    # Looked up outside the try block so the error message below can always
    # reference it.
    labels_text_dtype = df_annotation[labels_name].dtypes
    try:
        labels_text = list(np.array(labels_text).astype(labels_text_dtype))
    except Exception as err:
        raise TypeError(f'"labels_text" is not compatible with "{labels_name}" data format ({labels_text_dtype}).') from err

if not use_custom_palette:
    color_palette = None

# Plot graph
fig = plot_embedding_interactive(df_embedding,
                                 labels_name = labels_name,
                                 show_text = show_text,
                                 labels_text = labels_text,
                                 color_palette = color_palette,
                                 title = title,
                                 fontsize = 11)
fig.show()

### Save plot to file

In [19]:
# OPTIONS =================================================
output_name = "fig1"
output_format = "png" #Format availables: ["png", "html", "pdf", "svg"]
#==========================================================


# Save the most recently created figure (`fig`): "html" goes through
# `write_html`, every other format through `write_image` with the kaleido
# engine.
output_file = f"{output_name}.{output_format}"
if output_format == "html":
    fig.write_html(output_file)
else:
    fig.write_image(output_file, engine="kaleido")